From cbcef39ec01e4b733d0c999fa04b1899cf88b9e7 Mon Sep 17 00:00:00 2001 From: Will Thompson Date: Tue, 24 Nov 2015 18:14:03 +0000 Subject: [PATCH 3/3] ox.html: fix sanitizing whitespace-only strings lxml raises: ParserError: Document is empty if you ask it to parse a string with no non-whitespace characters. The existing truthiness test squashed the commonest case (empty string) but not the general case. --- ox/html.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/ox/html.py b/ox/html.py index 202a036..642dd55 100644 --- a/ox/html.py +++ b/ox/html.py @@ -220,6 +220,14 @@ def sanitize_html(html, tags=None, global_attributes=[]): u'foo' >>> sanitize_html('Anniversary of Daoud's Republic') u"Anniversary of Daoud's Republic" + >>> sanitize_html('') + u'' + >>> sanitize_html(' ') + u' ' + >>> sanitize_html(u' ') # canonicalised to a space: okay, I suppose + u' ' + >>> sanitize_html(u'\u00a0') # also nbsp + u' ' ''' if not tags: valid_url = '^((https?:\/\/|\/|mailto:).*?)' @@ -406,6 +414,16 @@ def sanitize_fragment(html): u'

' >>> sanitize_fragment(u'foo') u'foo' + >>> sanitize_fragment(u'') + u'' + >>> sanitize_fragment(u' ') + u' ' + >>> sanitize_fragment(u' ') + u'\\xa0' + >>> sanitize_fragment(u'\\u00a0') # nbsp + u'\\xa0' + >>> sanitize_fragment(u'\\ufeff') # zero-width no-break space + u'\\ufeff' ''' ''' @@ -413,8 +431,8 @@ def sanitize_fragment(html): import html5lib return html5lib.parseFragment(html).toxml().decode('utf-8') ''' - if not html: - return u'' + if not html.strip(): + return html import lxml.html body = lxml.html.document_fromstring(html).find('body') html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8') -- 2.5.0