ox.html: fix sanitizing whitespace-only strings
lxml raises "ParserError: Document is empty" if you ask it to parse a string that contains no non-whitespace characters. The existing truthiness test caught the most common case (the empty string) but not the general case (any whitespace-only string).
This commit is contained in:
parent
533a1a627e
commit
cbcef39ec0
1 changed file with 20 additions and 2 deletions
22
ox/html.py
22
ox/html.py
|
@ -220,6 +220,14 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
|||
u'<b>foo</b>'
|
||||
>>> sanitize_html('Anniversary of Daoud&apos;s Republic')
|
||||
u"Anniversary of Daoud's Republic"
|
||||
>>> sanitize_html('')
|
||||
u''
|
||||
>>> sanitize_html(' ')
|
||||
u' '
|
||||
>>> sanitize_html(u'&nbsp;') # canonicalised to a space: okay, I suppose
|
||||
u' '
|
||||
>>> sanitize_html(u'\u00a0') # also nbsp
|
||||
u' '
|
||||
'''
|
||||
if not tags:
|
||||
valid_url = '^((https?:\/\/|\/|mailto:).*?)'
|
||||
|
@ -406,6 +414,16 @@ def sanitize_fragment(html):
|
|||
u'<br><br>'
|
||||
>>> sanitize_fragment(u'<a href="javascript:alert()">foo</a>')
|
||||
u'<a href="javascript:alert()">foo</a>'
|
||||
>>> sanitize_fragment(u'')
|
||||
u''
|
||||
>>> sanitize_fragment(u' ')
|
||||
u' '
|
||||
>>> sanitize_fragment(u'&nbsp;')
|
||||
u'\\xa0'
|
||||
>>> sanitize_fragment(u'\\u00a0') # nbsp
|
||||
u'\\xa0'
|
||||
>>> sanitize_fragment(u'\\ufeff') # zero-width no-break space
|
||||
u'\\ufeff'
|
||||
'''
|
||||
|
||||
'''
|
||||
|
@ -413,8 +431,8 @@ def sanitize_fragment(html):
|
|||
import html5lib
|
||||
return html5lib.parseFragment(html).toxml().decode('utf-8')
|
||||
'''
|
||||
if not html:
|
||||
return u''
|
||||
if not html.strip():
|
||||
return html
|
||||
import lxml.html
|
||||
body = lxml.html.document_fromstring(html).find('body')
|
||||
html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
|
||||
|
|
Loading…
Reference in a new issue