&apos; is not in name2codepoint, so decode it explicitly as well
This commit is contained in:
parent
6ed4a2d867
commit
5a00be4b37
1 changed files with 7 additions and 1 deletions
|
@ -139,6 +139,8 @@ def decodeHtml(html):
|
||||||
u'me & you and $&%'
|
u'me & you and $&%'
|
||||||
>>> decodeHtml('€')
|
>>> decodeHtml('€')
|
||||||
u'€'
|
u'€'
|
||||||
|
>>> decodeHtml('Anniversary of Daoud&apos;s Republic')
|
||||||
|
u"Anniversary of Daoud's Republic"
|
||||||
"""
|
"""
|
||||||
if type(html) != unicode:
|
if type(html) != unicode:
|
||||||
html = unicode(html)[:]
|
html = unicode(html)[:]
|
||||||
|
@ -156,6 +158,8 @@ def decodeHtml(html):
|
||||||
return uchr(int(entity[1:]))
|
return uchr(int(entity[1:]))
|
||||||
elif entity in name2codepoint:
|
elif entity in name2codepoint:
|
||||||
return uchr(name2codepoint[entity])
|
return uchr(name2codepoint[entity])
|
||||||
|
elif entity == 'apos':
|
||||||
|
return "'"
|
||||||
else:
|
else:
|
||||||
return match.group(0)
|
return match.group(0)
|
||||||
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
|
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
|
||||||
|
@ -211,6 +215,8 @@ def parse_html(html, tags=None, wikilinks=False):
|
||||||
'<b>foo</b>'
|
'<b>foo</b>'
|
||||||
>>> parse_html('<b>foo</b></b>')
|
>>> parse_html('<b>foo</b></b>')
|
||||||
'<b>foo</b>'
|
'<b>foo</b>'
|
||||||
|
>>> parse_html('Anniversary of Daoud&apos;s Republic')
|
||||||
|
'Anniversary of Daoud's Republic'
|
||||||
'''
|
'''
|
||||||
if not tags:
|
if not tags:
|
||||||
tags = [
|
tags = [
|
||||||
|
|
Loading…
Reference in a new issue