ox.html: fix sanitizing whitespace-only strings

lxml raises:

    ParserError: Document is empty

if you ask it to parse a string that is empty or contains only whitespace.
The existing truthiness test handled the most common case (the empty
string) but not the general case.
This commit is contained in:
Will Thompson 2015-11-24 18:14:03 +00:00
parent 533a1a627e
commit cbcef39ec0

View file

@ -220,6 +220,14 @@ def sanitize_html(html, tags=None, global_attributes=[]):
u'<b>foo</b>'
>>> sanitize_html('Anniversary of Daoud&apos;s Republic')
u"Anniversary of Daoud's Republic"
>>> sanitize_html('')
u''
>>> sanitize_html(' ')
u' '
>>> sanitize_html(u'&nbsp;') # canonicalised to a space: okay, I suppose
u' '
>>> sanitize_html(u'\u00a0') # also nbsp
u' '
'''
if not tags:
valid_url = '^((https?:\/\/|\/|mailto:).*?)'
@ -406,6 +414,16 @@ def sanitize_fragment(html):
u'<br><br>'
>>> sanitize_fragment(u'<a href="javascript:alert()">foo</a>')
u'<a href="javascript:alert()">foo</a>'
>>> sanitize_fragment(u'')
u''
>>> sanitize_fragment(u' ')
u' '
>>> sanitize_fragment(u'&nbsp;')
u'\\xa0'
>>> sanitize_fragment(u'\\u00a0') # nbsp
u'\\xa0'
>>> sanitize_fragment(u'\\ufeff') # zero-width no-break space
u'\\ufeff'
'''
'''
@ -413,8 +431,8 @@ def sanitize_fragment(html):
import html5lib
return html5lib.parseFragment(html).toxml().decode('utf-8')
'''
if not html:
return u''
if not html.strip():
return html
import lxml.html
body = lxml.html.document_fromstring(html).find('body')
html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')