2016-02-06 09:36:57 +00:00
|
|
|
"""
|
|
|
|
An interface to html5lib that mimics the lxml.html interface.
|
|
|
|
"""
|
|
|
|
import sys
|
|
|
|
import string
|
|
|
|
|
|
|
|
from html5lib import HTMLParser as _HTMLParser
|
|
|
|
from html5lib.treebuilders.etree_lxml import TreeBuilder
|
|
|
|
from lxml import etree
|
2018-12-31 23:25:26 +00:00
|
|
|
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
|
2016-02-06 09:36:57 +00:00
|
|
|
|
|
|
|
# Python 2 / Python 3 compatibility shims.

# The URL helpers live in ``urlparse``/``urllib2`` on Python 2 and in the
# ``urllib`` package on Python 3.
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse

try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen

# Types accepted wherever this module expects "a string".  ``basestring``
# only exists on Python 2; elsewhere fall back to byte and text strings.
try:
    _strings = basestring
except NameError:
    _strings = (bytes, str)
|
|
|
|
|
2018-12-31 23:25:26 +00:00
|
|
|
|
2016-02-06 09:36:57 +00:00
|
|
|
class HTMLParser(_HTMLParser):
    """html5lib's HTML parser, preconfigured to build lxml trees."""

    def __init__(self, strict=False, **kwargs):
        # The lxml tree builder is always used; ``strict`` and any further
        # keyword options are handed to the html5lib parser unchanged.
        _HTMLParser.__init__(
            self,
            strict=strict,
            tree=TreeBuilder,
            **kwargs
        )
|
|
|
|
|
|
|
|
|
|
|
|
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    # The installed html5lib does not export an XHTMLParser; in that case
    # neither ``XHTMLParser`` nor ``xhtml_parser`` is defined here.
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """html5lib's XHTML parser, preconfigured to build lxml trees."""

        def __init__(self, strict=False, **kwargs):
            # Same wiring as HTMLParser: force the lxml tree builder and
            # pass every other option through.
            _XHTMLParser.__init__(
                self,
                strict=strict,
                tree=TreeBuilder,
                **kwargs
            )

    # Module-level XHTML parser instance.
    xhtml_parser = XHTMLParser()
|
|
|
|
|
|
|
|
|
|
|
|
def _find_tag(tree, tag):
    """Look up `tag` in `tree`, with or without the XHTML namespace.

    The bare tag name is tried first via ``tree.find()``.  If that finds
    nothing, the lookup is repeated with the tag qualified by
    ``XHTML_NAMESPACE``.  Returns whatever the last ``find()`` returned,
    i.e. None when neither form matches.
    """
    match = tree.find(tag)
    if match is None:
        match = tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
    return match
|
|
|
|
|
|
|
|
|
2018-12-31 23:25:26 +00:00
|
|
|
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a string holding a whole HTML document and return its root
    element.

    `html` must be a byte string or a Unicode string; any other type
    raises TypeError.  `parser` defaults to the module-level
    `html_parser`.

    For byte strings, html5lib's ``useChardet`` option is set from
    `guess_charset`; when `guess_charset` is left as None it defaults to
    true, so charset guessing is enabled for byte input.

    For Unicode strings `guess_charset` is ignored: the text is already
    decoded, and html5lib does not accept ``useChardet`` for such input.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if isinstance(html, bytes):
        # Only byte input may carry ``useChardet``; html5lib rejects the
        # option when the input is already Unicode, so it is never
        # forwarded in that case (even if the caller set guess_charset).
        options['useChardet'] = (
            True if guess_charset is None else guess_charset)
    return parser.parse(html, **options).getroot()
|
2016-02-06 09:36:57 +00:00
|
|
|
|
|
|
|
|
|
|
|
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If `no_leading_text` is
    true, non-whitespace leading text raises ``etree.ParserError``,
    whitespace-only leading text is dropped, and the result is always a
    list of only elements.

    `html` must be a byte string or a Unicode string; any other type
    raises TypeError.  `parser` defaults to the module-level
    `html_parser`.  The fragment is parsed in the context of a ``div``
    element.

    For byte strings, html5lib's ``useChardet`` option is set from
    `guess_charset`; when `guess_charset` is left as None it defaults to
    false for fragments.  For Unicode strings `guess_charset` is ignored,
    because html5lib does not accept ``useChardet`` for decoded input.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    options = {}
    if isinstance(html, bytes):
        # Only byte input may carry ``useChardet``; html5lib rejects the
        # option when the input is already Unicode, so it is never
        # forwarded in that case (even if the caller set guess_charset).
        options['useChardet'] = (
            False if guess_charset is None else guess_charset)
    children = parser.parseFragment(html, 'div', **options)

    if children and isinstance(children[0], _strings):
        if no_leading_text:
            # Whitespace-only leading text is tolerated and removed;
            # anything else is an error.
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' %
                                        children[0])
            del children[0]
    return children
|
|
|
|
|
|
|
|
|
|
|
|
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a string that holds exactly one HTML element.

    Without `create_parent`, ``etree.ParserError`` is raised when the
    input yields no element, more than one element, or non-whitespace
    text before or after the element.  The returned element has its
    tail cleared.

    With a true `create_parent`, all parsed content is wrapped in a new
    parent element and leading or trailing text is allowed.  The parent
    uses `create_parent` as its tag name when that is a string, and
    ``div`` otherwise.

    `guess_charset` and `parser` are passed on to
    `fragments_fromstring()`.  A non-string `html` raises TypeError.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    wrap = bool(create_parent)
    parts = fragments_fromstring(
        html,
        guess_charset=guess_charset,
        parser=parser,
        no_leading_text=not wrap,
    )

    if not wrap:
        # Strict mode: exactly one element, surrounded by whitespace only.
        if not parts:
            raise etree.ParserError('No elements found')
        if len(parts) > 1:
            raise etree.ParserError('Multiple elements found')
        only = parts[0]
        trailing = only.tail
        if trailing and trailing.strip():
            raise etree.ParserError(
                'Element followed by text: %r' % trailing)
        only.tail = None
        return only

    # Wrapping mode: collect everything under a freshly created parent.
    parent_tag = create_parent if isinstance(create_parent, _strings) else 'div'
    container = Element(parent_tag)
    if parts:
        if isinstance(parts[0], _strings):
            # Leading text becomes the text of the new parent.
            container.text = parts.pop(0)
        container.extend(parts)
    return container
|
|
|
|
|
|
|
|
|
2018-12-31 23:25:26 +00:00
|
|
|
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.  The input is always parsed as a full
    document first; what is returned then depends on what was found:

    * the document root, if the input starts with ``<html`` or a doctype,
      if the parsed ``<head>`` is not empty, or if no ``<head>`` or
      ``<body>`` element can be located;
    * the single child of ``<body>``, if the body holds exactly one
      element and no non-whitespace text around it;
    * otherwise the ``<body>`` element itself, renamed to ``div`` when it
      contains block-level tags and to ``span`` when it does not.

    `guess_charset` and `parser` are passed on to
    `document_fromstring()`.  A non-string `html` raises TypeError.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        start = start.decode('ascii', 'replace')

    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if head is not None and len(head):
        return doc

    body = _find_tag(doc, 'body')

    # Without a <body> (e.g. a frameset document) there is nothing to
    # unwrap, so hand back the document instead of failing on len(None).
    if body is None:
        return doc

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
|
|
|
|
|
|
|
|
|
2018-12-31 23:25:26 +00:00
|
|
|
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    A file or URL connection opened by this function is closed again once
    parsing has finished or failed.  A file-like object supplied by the
    caller is left open; closing it remains the caller's responsibility.
    """
    if parser is None:
        parser = html_parser

    # Only close what we opened ourselves.
    opened_here = False
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often
            # than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        opened_here = True
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        opened_here = True
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument if it detects
    # that the input would produce unicode objects, so the option is only
    # passed when charset guessing is actually requested.
    if guess_charset:
        options['useChardet'] = guess_charset

    try:
        return parser.parse(fp, **options)
    finally:
        if opened_here:
            fp.close()
|
2016-02-06 09:36:57 +00:00
|
|
|
|
|
|
|
|
|
|
|
def _looks_like_url(str):
    """Tell whether `str` should be treated as a URL rather than a path.

    Returns False when ``urlparse()`` finds no scheme.  On Windows
    (``sys.platform == 'win32'``) a one-letter alphabetic scheme is taken
    to be a drive letter of an absolute path and also yields False.
    Anything else with a scheme yields True.
    """
    scheme = urlparse(str)[0]
    if not scheme:
        return False

    # e.g. the "C" in "C:\\page.html" looks like a 'normal' absolute path
    is_drive_letter = (
        sys.platform == 'win32'
        and scheme in string.ascii_letters
        and len(scheme) == 1
    )
    return not is_drive_letter
|
|
|
|
|
|
|
|
|
|
|
|
# Shared default parser instance; the parsing functions in this module fall
# back to it whenever they are called with ``parser=None``.
html_parser = HTMLParser()
|