openmedialibrary_platform_w.../Lib/site-packages/lxml/html/html5parser.py

"""
An interface to html5lib that mimics the lxml.html interface.
"""
import sys
import string

from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
from lxml import etree
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag

# python3 compatibility
try:
    # Python 2: ``basestring`` exists and is used as the "string" check type.
    _strings = basestring
except NameError:
    # Python 3: no ``basestring``; accept both byte strings and text.
    _strings = (bytes, str)
try:
    # Python 2 module layout.
    from urllib2 import urlopen
except ImportError:
    # Python 3 module layout.
    from urllib.request import urlopen
try:
    # Python 2 module layout.
    from urlparse import urlparse
except ImportError:
    # Python 3 module layout.
    from urllib.parse import urlparse


class HTMLParser(_HTMLParser):
    """An html5lib HTML parser with lxml as tree.

    Subclasses html5lib's ``HTMLParser`` and always passes the lxml
    ``TreeBuilder`` as the ``tree`` argument.
    """

    def __init__(self, strict=False, **kwargs):
        # ``tree`` is fixed to the lxml tree builder; ``strict`` and any
        # other keyword arguments are forwarded to html5lib unchanged.
        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)


# XHTML support is optional: the wrapper class and its module-level instance
# are only defined when html5lib provides ``XHTMLParser``.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    # This html5lib has no XHTMLParser; leave the XHTML names undefined.
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            # ``tree`` is fixed to the lxml tree builder; everything else
            # is forwarded to html5lib unchanged.
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)

    # Ready-made XHTML parser instance, created at import time.
    xhtml_parser = XHTMLParser()


def _find_tag(tree, tag):
    """Look up ``tag`` in ``tree`` with ``find``, trying two spellings.

    The bare tag name is tried first.  If that finds nothing, the lookup
    is repeated with the name qualified by ``XHTML_NAMESPACE`` and the
    result of that second ``find`` call is returned as-is.
    """
    match = tree.find(tag)
    if match is None:
        # Fall back to the namespaced form, ``{namespace}tag``.
        match = tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
    return match


def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a string as a whole document and return the tree's root.

    `html` must be a string (bytes or text), otherwise a `TypeError`
    is raised.  `parser` defaults to the module-level `html_parser`.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # Byte input with no explicit choice: turn charset guessing on.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = True

    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects, so the
    # option is only forwarded once a decision has been made.
    if guess_charset is None:
        parse_kwargs = {}
    else:
        parse_kwargs = {'useChardet': guess_charset}

    return active_parser.parse(html, **parse_kwargs).getroot()


def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse a string holding several HTML elements into a list.

    The input is parsed as a fragment inside a ``div`` context.  The
    first item of the returned list may be a string (leading text).
    With ``no_leading_text`` true, non-whitespace leading text raises
    ``etree.ParserError`` and a whitespace-only leading string is
    dropped, so the list contains only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.  For byte input with no explicit choice it
    is switched off.

    Raises ``TypeError`` if ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False

    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    parse_kwargs = {}
    if guess_charset is not None:
        parse_kwargs['useChardet'] = guess_charset

    children = active_parser.parseFragment(html, 'div', **parse_kwargs)

    if children and isinstance(children[0], _strings) and no_leading_text:
        leading = children[0]
        if leading.strip():
            raise etree.ParserError('There is leading text: %r' % leading)
        # Whitespace-only leading text is silently discarded.
        del children[0]
    return children


def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a string that holds exactly one HTML element.

    Without ``create_parent``, ``etree.ParserError`` is raised when no
    element or more than one element is found, or when anything other
    than whitespace precedes or follows the element.  The returned
    element has its tail cleared.

    If 'create_parent' is true (or is a tag name) then a new parent
    element is created (``div`` unless a tag name was given) and the
    parsed content is placed inside it.  In this case, leading or
    trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises ``TypeError`` if ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    wrap = bool(create_parent)

    elements = fragments_fromstring(
        html, no_leading_text=not wrap,
        guess_charset=guess_charset, parser=parser)

    if not wrap:
        # Strict mode: exactly one element, nothing but whitespace after it.
        if not elements:
            raise etree.ParserError('No elements found')
        if len(elements) > 1:
            raise etree.ParserError('Multiple elements found')
        only = elements[0]
        if only.tail and only.tail.strip():
            raise etree.ParserError(
                'Element followed by text: %r' % only.tail)
        only.tail = None
        return only

    # Wrapper mode: any true non-string value means "use a div".
    parent_tag = create_parent if isinstance(create_parent, _strings) else 'div'
    wrapper = Element(parent_tag)
    if elements:
        if isinstance(elements[0], _strings):
            # Leading text becomes the wrapper's own text.
            wrapper.text = elements[0]
            del elements[0]
        wrapper.extend(elements)
    return wrapper


def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.  The input is always parsed as a full
    document first; the result is then narrowed down:

    * input starting with ``<html`` or ``<!doctype`` (ignoring case and
      leading whitespace), or yielding a non-empty head, returns the
      document root;
    * a body with a single child and no surrounding text returns that
      child;
    * otherwise the body itself is returned, retagged as ``div`` or
      ``span`` depending on whether it contains block-level tags.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.

    Raises ``TypeError`` if ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    def _is_blank(text):
        # True for None, empty and whitespace-only text.
        return not text or not text.strip()

    # document starts with doctype or <html>, full document!
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith(('<html', '<!doctype')):
        return doc

    # if the head is not empty we have a full document
    if len(_find_tag(doc, 'head')):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if len(body) == 1 and _is_blank(body.text) and _is_blank(body[-1].tail):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body


def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    A file or URL that this function opens itself is closed again before
    returning, whether or not parsing succeeds.  A file-like object passed
    in by the caller is left open; closing it remains the caller's job.
    """
    if parser is None:
        parser = html_parser

    # Tracks whether ``fp`` was opened here and therefore must be closed here.
    opened_here = False
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        opened_here = True
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        opened_here = True
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset
    try:
        return parser.parse(fp, **options)
    finally:
        # Previously the handle opened above was never closed and leaked.
        if opened_here:
            fp.close()


def _looks_like_url(str):
    """Return True if ``str`` parses with a URL scheme, else False.

    On Windows (``sys.platform == 'win32'``) a scheme that is a single
    ASCII letter is taken to be a drive letter of a 'normal' absolute
    path rather than a URL scheme, so False is returned for it.
    """
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    is_drive_letter = (sys.platform == 'win32'
                       and len(scheme) == 1
                       and scheme in string.ascii_letters)
    return not is_drive_letter


# Default parser instance, created at import time.  The parsing functions in
# this module fall back to it when called with ``parser=None``.
html_parser = HTMLParser()
win32 platform 2016-01-31 14:44:46 +00:00			`"""`
			`An interface to html5lib that mimics the lxml.html interface.`
			`"""`
			`import sys`
			`import string`

			`from html5lib import HTMLParser as _HTMLParser`
			`from html5lib.treebuilders.etree_lxml import TreeBuilder`
			`from lxml import etree`
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag`
win32 platform 2016-01-31 14:44:46 +00:00
			`# python3 compatibility`
			`try:`
			`_strings = basestring`
			`except NameError:`
			`_strings = (bytes, str)`
			`try:`
			`from urllib2 import urlopen`
			`except ImportError:`
			`from urllib.request import urlopen`
			`try:`
			`from urlparse import urlparse`
			`except ImportError:`
			`from urllib.parse import urlparse`

update windows build to Python 3.7 2019-01-20 10:35:31 +00:00
win32 platform 2016-01-31 14:44:46 +00:00			`class HTMLParser(_HTMLParser):`
			`"""An html5lib HTML parser with lxml as tree."""`

			`def __init__(self, strict=False, **kwargs):`
			`_HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)`


			`try:`
			`from html5lib import XHTMLParser as _XHTMLParser`
			`except ImportError:`
			`pass`
			`else:`
			`class XHTMLParser(_XHTMLParser):`
			`"""An html5lib XHTML Parser with lxml as tree."""`

			`def __init__(self, strict=False, **kwargs):`
			`_XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)`

			`xhtml_parser = XHTMLParser()`


			`def _find_tag(tree, tag):`
			`elem = tree.find(tag)`
			`if elem is not None:`
			`return elem`
			`return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))`


update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`def document_fromstring(html, guess_charset=None, parser=None):`
			`"""`
			`Parse a whole document into a string.`

			If `guess_charset` is true, or if the input is not Unicode but a
			byte string, the `chardet` library will perform charset guessing
			`on the string.`
			`"""`
win32 platform 2016-01-31 14:44:46 +00:00			`if not isinstance(html, _strings):`
			`raise TypeError('string required')`

			`if parser is None:`
			`parser = html_parser`

update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`options = {}`
			`if guess_charset is None and isinstance(html, bytes):`
			`# html5lib does not accept useChardet as an argument, if it`
			`# detected the html argument would produce unicode objects.`
			`guess_charset = True`
			`if guess_charset is not None:`
			`options['useChardet'] = guess_charset`
			`return parser.parse(html, **options).getroot()`
win32 platform 2016-01-31 14:44:46 +00:00

			`def fragments_fromstring(html, no_leading_text=False,`
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`guess_charset=None, parser=None):`
win32 platform 2016-01-31 14:44:46 +00:00			`"""Parses several HTML elements, returning a list of elements.`

			`The first item in the list may be a string. If no_leading_text is true,`
			`then it will be an error if there is leading text, and it will always be`
			`a list of only elements.`

update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			If `guess_charset` is true, the `chardet` library will perform charset
			`guessing on the string.`
win32 platform 2016-01-31 14:44:46 +00:00			`"""`
			`if not isinstance(html, _strings):`
			`raise TypeError('string required')`

			`if parser is None:`
			`parser = html_parser`

update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`options = {}`
			`if guess_charset is None and isinstance(html, bytes):`
			`# html5lib does not accept useChardet as an argument, if it`
			`# detected the html argument would produce unicode objects.`
			`guess_charset = False`
			`if guess_charset is not None:`
			`options['useChardet'] = guess_charset`
			`children = parser.parseFragment(html, 'div', **options)`
win32 platform 2016-01-31 14:44:46 +00:00			`if children and isinstance(children[0], _strings):`
			`if no_leading_text:`
			`if children[0].strip():`
			`raise etree.ParserError('There is leading text: %r' %`
			`children[0])`
			`del children[0]`
			`return children`


			`def fragment_fromstring(html, create_parent=False,`
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`guess_charset=None, parser=None):`
win32 platform 2016-01-31 14:44:46 +00:00			`"""Parses a single HTML element; it is an error if there is more than`
			`one element, or if anything but whitespace precedes or follows the`
			`element.`

update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`If 'create_parent' is true (or is a tag name) then a parent node`
win32 platform 2016-01-31 14:44:46 +00:00			`will be created to encapsulate the HTML in a single element. In`
			`this case, leading or trailing text is allowed.`
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00
			If `guess_charset` is true, the `chardet` library will perform charset
			`guessing on the string.`
win32 platform 2016-01-31 14:44:46 +00:00			`"""`
			`if not isinstance(html, _strings):`
			`raise TypeError('string required')`

			`accept_leading_text = bool(create_parent)`

			`elements = fragments_fromstring(`
			`html, guess_charset=guess_charset, parser=parser,`
			`no_leading_text=not accept_leading_text)`

			`if create_parent:`
			`if not isinstance(create_parent, _strings):`
			`create_parent = 'div'`
			`new_root = Element(create_parent)`
			`if elements:`
			`if isinstance(elements[0], _strings):`
			`new_root.text = elements[0]`
			`del elements[0]`
			`new_root.extend(elements)`
			`return new_root`

			`if not elements:`
			`raise etree.ParserError('No elements found')`
			`if len(elements) > 1:`
			`raise etree.ParserError('Multiple elements found')`
			`result = elements[0]`
			`if result.tail and result.tail.strip():`
			`raise etree.ParserError('Element followed by text: %r' % result.tail)`
			`result.tail = None`
			`return result`


update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`def fromstring(html, guess_charset=None, parser=None):`
win32 platform 2016-01-31 14:44:46 +00:00			`"""Parse the html, returning a single element/document.`

			`This tries to minimally parse the chunk of text, without knowing if it`
			`is a fragment or a document.`

update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`'base_url' will set the document's base_url attribute (and the tree's`
			`docinfo.URL)`

			If `guess_charset` is true, or if the input is not Unicode but a
			byte string, the `chardet` library will perform charset guessing
			`on the string.`
win32 platform 2016-01-31 14:44:46 +00:00			`"""`
			`if not isinstance(html, _strings):`
			`raise TypeError('string required')`
			`doc = document_fromstring(html, parser=parser,`
			`guess_charset=guess_charset)`

			`# document starts with doctype or <html>, full document!`
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`start = html[:50]`
			`if isinstance(start, bytes):`
			`# Allow text comparison in python3.`
			`# Decode as ascii, that also covers latin-1 and utf-8 for the`
			`# characters we need.`
			`start = start.decode('ascii', 'replace')`

			`start = start.lstrip().lower()`
win32 platform 2016-01-31 14:44:46 +00:00			`if start.startswith('<html') or start.startswith('<!doctype'):`
			`return doc`

			`head = _find_tag(doc, 'head')`

			`# if the head is not empty we have a full document`
			`if len(head):`
			`return doc`

			`body = _find_tag(doc, 'body')`

			`# The body has just one element, so it was probably a single`
			`# element passed in`
			`if (len(body) == 1 and (not body.text or not body.text.strip())`
			`and (not body[-1].tail or not body[-1].tail.strip())):`
			`return body[0]`

			`# Now we have a body which represents a bunch of tags which have the`
			`# content that was passed in. We will create a fake container, which`
			`# is the body tag, except <body> implies too much structure.`
			`if _contains_block_level_tag(body):`
			`body.tag = 'div'`
			`else:`
			`body.tag = 'span'`
			`return body`


update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`def parse(filename_url_or_file, guess_charset=None, parser=None):`
win32 platform 2016-01-31 14:44:46 +00:00			`"""Parse a filename, URL, or file-like object into an HTML document`
			`tree. Note: this returns a tree, not an element. Use`
			``parse(...).getroot()`` to get the document root.
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00
			If ``guess_charset`` is true, the ``useChardet`` option is passed into
			`html5lib to enable character detection. This option is on by default`
			`when parsing from URLs, off by default when parsing from file(-like)`
			`objects (which tend to return Unicode more often than not), and on by`
			`default when parsing from a file path (which is read in binary mode).`
win32 platform 2016-01-31 14:44:46 +00:00			`"""`
			`if parser is None:`
			`parser = html_parser`
			`if not isinstance(filename_url_or_file, _strings):`
			`fp = filename_url_or_file`
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`if guess_charset is None:`
			`# assume that file-like objects return Unicode more often than bytes`
			`guess_charset = False`
win32 platform 2016-01-31 14:44:46 +00:00			`elif _looks_like_url(filename_url_or_file):`
			`fp = urlopen(filename_url_or_file)`
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`if guess_charset is None:`
			`# assume that URLs return bytes`
			`guess_charset = True`
win32 platform 2016-01-31 14:44:46 +00:00			`else:`
			`fp = open(filename_url_or_file, 'rb')`
update windows build to Python 3.7 2019-01-20 10:35:31 +00:00			`if guess_charset is None:`
			`guess_charset = True`

			`options = {}`
			`# html5lib does not accept useChardet as an argument, if it`
			`# detected the html argument would produce unicode objects.`
			`if guess_charset:`
			`options['useChardet'] = guess_charset`
			`return parser.parse(fp, **options)`
win32 platform 2016-01-31 14:44:46 +00:00

			`def _looks_like_url(str):`
			`scheme = urlparse(str)[0]`
			`if not scheme:`
			`return False`
			`elif (sys.platform == 'win32' and`
			`scheme in string.ascii_letters`
			`and len(scheme) == 1):`
			`# looks like a 'normal' absolute path`
			`return False`
			`else:`
			`return True`


			`html_parser = HTMLParser()`