ox.html: fix sanitizing whitespace-only strings
lxml raises "ParserError: Document is empty" if you ask it to parse a string with no non-whitespace characters. The existing truthiness test squashed the commonest case (the empty string) but not the general case (a whitespace-only string).
This commit is contained in:
parent
533a1a627e
commit
cbcef39ec0
1 changed files with 20 additions and 2 deletions
22
ox/html.py
22
ox/html.py
|
@ -220,6 +220,14 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
||||||
u'<b>foo</b>'
|
u'<b>foo</b>'
|
||||||
>>> sanitize_html('Anniversary of Daoud&apos;s Republic')
|
>>> sanitize_html('Anniversary of Daoud&apos;s Republic')
|
||||||
u"Anniversary of Daoud's Republic"
|
u"Anniversary of Daoud's Republic"
|
||||||
|
>>> sanitize_html('')
|
||||||
|
u''
|
||||||
|
>>> sanitize_html(' ')
|
||||||
|
u' '
|
||||||
|
>>> sanitize_html(u'&nbsp;') # canonicalised to a space: okay, I suppose
|
||||||
|
u' '
|
||||||
|
>>> sanitize_html(u'\u00a0') # also nbsp
|
||||||
|
u' '
|
||||||
'''
|
'''
|
||||||
if not tags:
|
if not tags:
|
||||||
valid_url = '^((https?:\/\/|\/|mailto:).*?)'
|
valid_url = '^((https?:\/\/|\/|mailto:).*?)'
|
||||||
|
@ -406,6 +414,16 @@ def sanitize_fragment(html):
|
||||||
u'<br><br>'
|
u'<br><br>'
|
||||||
>>> sanitize_fragment(u'<a href="javascript:alert()">foo</a>')
|
>>> sanitize_fragment(u'<a href="javascript:alert()">foo</a>')
|
||||||
u'<a href="javascript:alert()">foo</a>'
|
u'<a href="javascript:alert()">foo</a>'
|
||||||
|
>>> sanitize_fragment(u'')
|
||||||
|
u''
|
||||||
|
>>> sanitize_fragment(u' ')
|
||||||
|
u' '
|
||||||
|
>>> sanitize_fragment(u'&nbsp;')
|
||||||
|
u'\\xa0'
|
||||||
|
>>> sanitize_fragment(u'\\u00a0') # nbsp
|
||||||
|
u'\\xa0'
|
||||||
|
>>> sanitize_fragment(u'\\ufeff') # zero-width no-break space
|
||||||
|
u'\\ufeff'
|
||||||
'''
|
'''
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
@ -413,8 +431,8 @@ def sanitize_fragment(html):
|
||||||
import html5lib
|
import html5lib
|
||||||
return html5lib.parseFragment(html).toxml().decode('utf-8')
|
return html5lib.parseFragment(html).toxml().decode('utf-8')
|
||||||
'''
|
'''
|
||||||
if not html:
|
if not html.strip():
|
||||||
return u''
|
return html
|
||||||
import lxml.html
|
import lxml.html
|
||||||
body = lxml.html.document_fromstring(html).find('body')
|
body = lxml.html.document_fromstring(html).find('body')
|
||||||
html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
|
html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
|
||||||
|
|
Loading…
Reference in a new issue