From cbcef39ec01e4b733d0c999fa04b1899cf88b9e7 Mon Sep 17 00:00:00 2001
From: Will Thompson <will@willthompson.co.uk>
Date: Tue, 24 Nov 2015 18:14:03 +0000
Subject: [PATCH 3/3] ox.html: fix sanitizing whitespace-only strings

lxml raises:

    ParserError: Document is empty

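if you ask it to parse a string with no non-whitespace characters. The
existing truthiness test caught the most common case (the empty string)
but not the general case (any whitespace-only string). Check
html.strip() instead, and return such input unchanged rather than
handing it to lxml.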
---
 ox/html.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/ox/html.py b/ox/html.py
index 202a036..642dd55 100644
--- a/ox/html.py
+++ b/ox/html.py
@@ -220,6 +220,14 @@ def sanitize_html(html, tags=None, global_attributes=[]):
     u'<b>foo</b>'
     >>> sanitize_html('Anniversary of Daoud&apos;s Republic')
     u"Anniversary of Daoud's Republic"
+    >>> sanitize_html('')
+    u''
+    >>> sanitize_html(' ')
+    u' '
+    >>> sanitize_html(u'&nbsp;')  # canonicalised to a space: okay, I suppose
+    u' '
+    >>> sanitize_html(u'\u00a0')  # also nbsp
+    u' '
     '''
     if not tags:
         valid_url = '^((https?:\/\/|\/|mailto:).*?)'
@@ -406,6 +414,16 @@ def sanitize_fragment(html):
     u'<br><br>'
     >>> sanitize_fragment(u'<a href="javascript:alert()">foo</a>')
     u'<a href="javascript:alert()">foo</a>'
+    >>> sanitize_fragment(u'')
+    u''
+    >>> sanitize_fragment(u' ')
+    u' '
+    >>> sanitize_fragment(u'&nbsp;')
+    u'\\xa0'
+    >>> sanitize_fragment(u'\\u00a0')  # nbsp
+    u'\\xa0'
+    >>> sanitize_fragment(u'\\ufeff')  # zero-width no-break space
+    u'\\ufeff'
     '''
 
     '''
@@ -413,8 +431,8 @@ def sanitize_fragment(html):
     import html5lib
     return html5lib.parseFragment(html).toxml().decode('utf-8')
     '''
-    if not html:
-        return u''
+    if not html.strip():
+        return html
     import lxml.html
     body = lxml.html.document_fromstring(html).find('body')
     html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
-- 
2.5.0