ox.html: fix sanitizing whitespace-only strings

lxml raises "ParserError: Document is empty" when asked to parse a string that contains only whitespace. The existing truthiness test caught the most common case (the empty string) but not the general case (any whitespace-only string).
2015-11-24 18:14:03 +00:00 · 2015-11-24 18:14:03 +00:00 · cbcef39ec0
commit cbcef39ec0
parent 533a1a627e
1 changed file with 20 additions and 2 deletions
--- a/ox/html.py
+++ b/ox/html.py
@@ -220,6 +220,14 @@ def sanitize_html(html, tags=None, global_attributes=[]):
    u'<b>foo</b>'
    >>> sanitize_html('Anniversary of Daoud&apos;s Republic')
    u"Anniversary of Daoud's Republic"
+    >>> sanitize_html('')
+    u''
+    >>> sanitize_html(' ')
+    u' '
+    >>> sanitize_html(u'&nbsp;')  # canonicalised to a space: okay, I suppose
+    u' '
+    >>> sanitize_html(u'\u00a0')  # also nbsp
+    u' '
    '''
    if not tags:
        valid_url = '^((https?:\/\/|\/|mailto:).*?)'
@@ -406,6 +414,16 @@ def sanitize_fragment(html):
    u'<br><br>'
    >>> sanitize_fragment(u'<a href="javascript:alert()">foo</a>')
    u'<a href="javascript:alert()">foo</a>'
+    >>> sanitize_fragment(u'')
+    u''
+    >>> sanitize_fragment(u' ')
+    u' '
+    >>> sanitize_fragment(u'&nbsp;')
+    u'\\xa0'
+    >>> sanitize_fragment(u'\\u00a0')  # nbsp
+    u'\\xa0'
+    >>> sanitize_fragment(u'\\ufeff')  # zero-width no-break space
+    u'\\ufeff'
    '''

    '''
@@ -413,8 +431,8 @@ def sanitize_fragment(html):
    import html5lib
    return html5lib.parseFragment(html).toxml().decode('utf-8')
    '''
-    if not html:
-        return u''
+    if not html.strip():
+        return html
    import lxml.html
    body = lxml.html.document_fromstring(html).find('body')
    html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')