apos not in name2codepoint, also decode that

This commit is contained in:
j 2012-04-24 19:00:48 +02:00
parent 6ed4a2d867
commit 5a00be4b37

View file

@ -139,6 +139,8 @@ def decodeHtml(html):
u'me & you and $&%' u'me & you and $&%'
>>> decodeHtml('&#x80;') >>> decodeHtml('&#x80;')
u'\x80' u'\x80'
>>> decodeHtml('Anniversary of Daoud&apos;s Republic')
u"Anniversary of Daoud's Republic"
""" """
if type(html) != unicode: if type(html) != unicode:
html = unicode(html)[:] html = unicode(html)[:]
@ -156,6 +158,8 @@ def decodeHtml(html):
return uchr(int(entity[1:])) return uchr(int(entity[1:]))
elif entity in name2codepoint: elif entity in name2codepoint:
return uchr(name2codepoint[entity]) return uchr(name2codepoint[entity])
elif entity == 'apos':
return "'"
else: else:
return match.group(0) return match.group(0)
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ') return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
@ -211,6 +215,8 @@ def parse_html(html, tags=None, wikilinks=False):
'<b>foo</b>' '<b>foo</b>'
>>> parse_html('<b>foo</b></b>') >>> parse_html('<b>foo</b></b>')
'<b>foo</b>' '<b>foo</b>'
>>> parse_html('Anniversary of Daoud&apos;s Republic')
'Anniversary of Daoud&apos;s Republic'
''' '''
if not tags: if not tags:
tags = [ tags = [