| |
comp.lang.python |
> > import re > '&(%s);' won't quite work: HTML (and, I assume, SGML, but not XHTML being > Also, this completely ignores non-name entities as also found in XML. (eg It ignores the missing semicolon issue (and note also that IE can cope import htmlentitydefs def unescape_charref(ref): def replace_entities(match): repl = htmlentitydefs.name2codepoint.get(ent[1:-1]) def unescape(data): class UnescapeTests(unittest.TestCase): def test_unescape_charref(self): def test_unescape(self): unittest.main() John
> "Adam Atlas" <a...@atlas.st> wrote in message
> news:1180965792.757685.132580@q75g2000hsh.googlegroups.com...
> > As far as I know, there isn't a standard idiom to do this, but it's
> > still a one-liner. Untested, but I think this should work:
> > from htmlentitydefs import name2codepoint
> > def htmlentitydecode(s):
> > return re.sub('&(%s);' % '|'.join(name2codepoint), lambda m:
> > name2codepoint[m.group(1)], s)
> XML) allows you to skip the semicolon after the entity if it's followed by a
> white space (IIRC). Should this be respected, it looks more like this:
> r'&(%s)([;\s]|$)'
> %x20; for ' ' or so) Maybe some part of the HTMLParser module is useful, I
> wouldn't know. IMHO, these particular batteries aren't too commonly needed.
leave entity references that are not defined in standard library
module htmlentitydefs intact, rather than throwing an exception.
with even a missing space, like "tr&eacutes mal", so you'll see that
in the wild). Probably it could be adapted to handle that (possibly
the presumably-slower htmllib-based recipe on the python.org wiki
already does handle that, not sure).
import re
import unittest
name = ref[2:-1]
base = 10
if name.startswith("x"):
name = name[1:]
base = 16
return unichr(int(name, base))
ent = match.group()
if ent[1] == "#":
return unescape_charref(ent)
if repl is not None:
repl = unichr(repl)
else:
repl = ent
return repl
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
self.assertEqual(unescape_charref(u"&#38;"), u"&")
self.assertEqual(unescape_charref(u"&#x2014;"), u"\N{EM DASH}")
self.assertEqual(unescape_charref(u"&#8212;"), u"\N{EM DASH}")
self.assertEqual(
unescape(u"&amp; &lt; &mdash; &#8212; &#x2014;"),
u"& < %s %s %s" % tuple(u"\N{EM DASH}"*3)
)
self.assertEqual(unescape(u"&a&amp;"), u"&a&")
self.assertEqual(unescape(u"a&amp;"), u"a&")
self.assertEqual(unescape(u"&nonexistent;"), u"&nonexistent;")