From d03a6b120d63b49eae2af04908645b07fadc910a Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 27 Jul 2023 18:35:33 +0200
Subject: [PATCH] fix sanitize_fragment('\ufeff')

---
 ox/html.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ox/html.py b/ox/html.py
index 5fcc6c1..5286116 100644
--- a/ox/html.py
+++ b/ox/html.py
@@ -440,7 +440,12 @@ def sanitize_fragment(html):
     if not html.strip():
         return html
     import lxml.html
-    body = lxml.html.document_fromstring(html).find('body')
+    try:
+        body = lxml.html.document_fromstring(html).find('body')
+    except lxml.etree.ParserError as e:
+        if e.args and e.args[0] == 'Document is empty':
+            return html
+        raise e
     html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
     if html.startswith('<p>') and html.endswith('</p>'):
         html = html[3:-4]