update windows build to Python 3.7

parent 73105fa71e
commit ddc59ab92d

5761 changed files with 750298 additions and 213405 deletions
Lib/site-packages/lxml/html/__init__.py

@@ -46,7 +46,6 @@ import re
 from functools import partial
 
 try:
-    # while unnecessary, importing from 'collections.abc' is the right way to do it
     from collections.abc import MutableMapping, MutableSet
 except ImportError:
     from collections import MutableMapping, MutableSet
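Note: the try/except kept above is the usual forward-compatible import idiom (the ABCs moved to collections.abc in Python 3.3, and the plain collections alias is deprecated as of 3.7, the version this commit targets). A minimal standalone sketch of the same pattern:

    # Prefer the Python 3 location; fall back for Python 2.
    try:
        from collections.abc import MutableSet
    except ImportError:  # Python 2
        from collections import MutableSet

    print(issubclass(set, MutableSet))  # True under either import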
@@ -239,6 +238,15 @@ class Classes(MutableSet):
 
 class HtmlMixin(object):
 
+    def set(self, key, value=None):
+        """set(self, key, value=None)
+
+        Sets an element attribute. If no value is provided, or if the value is None,
+        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
+        for ``form.set('novalidate')``.
+        """
+        super(HtmlElement, self).set(key, value)
+
     @property
     def classes(self):
         """
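Note: a quick sketch of what the new value-less `set()` enables, assuming this updated copy of lxml is importable:

    from lxml.html import fragment_fromstring, tostring

    form = fragment_fromstring('<form></form>')
    form.set('novalidate')         # value omitted -> boolean attribute
    print(tostring(form))          # b'<form novalidate></form>'
    form.set('action', '/submit')  # ordinary key/value still works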
@ -682,8 +690,9 @@ class HtmlComment(etree.CommentBase, HtmlMixin):
|
|||
|
||||
|
||||
class HtmlElement(etree.ElementBase, HtmlMixin):
|
||||
# Override etree.ElementBase.cssselect, despite the MRO
|
||||
# Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
|
||||
cssselect = HtmlMixin.cssselect
|
||||
set = HtmlMixin.set
|
||||
|
||||
|
||||
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
|
||||
|
|
@@ -762,15 +771,14 @@ def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
 
 def fragments_fromstring(html, no_leading_text=False, base_url=None,
                          parser=None, **kw):
-    """
-    Parses several HTML elements, returning a list of elements.
+    """Parses several HTML elements, returning a list of elements.
 
-    The first item in the list may be a string (though leading
-    whitespace is removed).  If no_leading_text is true, then it will
-    be an error if there is leading text, and it will always be a list
-    of only elements.
+    The first item in the list may be a string.
+    If no_leading_text is true, then it will be an error if there is
+    leading text, and it will always be a list of only elements.
 
-    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
+    base_url will set the document's base_url attribute
+    (and the tree's docinfo.URL).
     """
     if parser is None:
         parser = html_parser
@@ -1010,7 +1018,7 @@ class FormElement(HtmlElement):
         results = []
         for el in self.inputs:
             name = el.name
-            if not name:
+            if not name or 'disabled' in el.attrib:
                 continue
             tag = _nons(el.tag)
             if tag == 'textarea':
@@ -1027,7 +1035,7 @@ class FormElement(HtmlElement):
                     "Unexpected tag: %r" % el)
             if el.checkable and not el.checked:
                 continue
-            if el.type in ('submit', 'image', 'reset'):
+            if el.type in ('submit', 'image', 'reset', 'file'):
                 continue
             value = el.value
             if value is not None:
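Note: taken together, the two `form_values()` changes above keep disabled controls and file inputs out of the submitted pairs, matching what a browser sends. A small sketch, assuming this lxml version:

    from lxml.html import fromstring

    form = fromstring(
        '<form>'
        '<input name="a" value="1">'
        '<input name="b" value="2" disabled>'
        '<input name="up" type="file">'
        '</form>')
    print(form.form_values())  # [('a', '1')] -- 'b' and 'up' are skipped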
@@ -1128,6 +1136,8 @@ def open_http_urllib(method, url, values):
         data = None
     else:
         data = urlencode(values)
+        if not isinstance(data, bytes):
+            data = data.encode('ASCII')
     return urlopen(url, data)
 
 
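Note: the added encode matters because Python 3's `urlopen` rejects `str` POST bodies, and `urlencode` returns text. In isolation:

    from urllib.parse import urlencode

    data = urlencode({'q': 'lxml', 'page': 2})
    if not isinstance(data, bytes):
        data = data.encode('ASCII')  # urlopen(url, data) needs bytes on Python 3
    print(data)                      # b'q=lxml&page=2'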
@ -1312,15 +1322,19 @@ class SelectElement(InputMixin, HtmlElement):
|
|||
"""
|
||||
if self.multiple:
|
||||
return MultipleSelectOptions(self)
|
||||
for el in _options_xpath(self):
|
||||
if el.get('selected') is not None:
|
||||
value = el.get('value')
|
||||
if value is None:
|
||||
value = el.text or ''
|
||||
if value:
|
||||
value = value.strip()
|
||||
return value
|
||||
return None
|
||||
options = _options_xpath(self)
|
||||
|
||||
try:
|
||||
selected_option = next(el for el in reversed(options) if el.get('selected') is not None)
|
||||
except StopIteration:
|
||||
try:
|
||||
selected_option = next(el for el in options if el.get('disabled') is None)
|
||||
except StopIteration:
|
||||
return None
|
||||
value = selected_option.get('value')
|
||||
if value is None:
|
||||
value = (selected_option.text or '').strip()
|
||||
return value
|
||||
|
||||
@value.setter
|
||||
def value(self, value):
|
||||
|
|
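Note: the rewritten getter mirrors browser rules for single selects: the last `selected` option wins, and with none selected the first non-disabled option is used. A sketch, assuming this lxml version:

    from lxml.html import fromstring

    sel = fromstring('<select>'
                     '<option selected>first</option>'
                     '<option value="b" selected>second</option>'
                     '</select>')
    print(sel.value)   # 'b' -- the last selected option wins

    sel = fromstring('<select>'
                     '<option disabled>skip</option>'
                     '<option>pick-me</option>'
                     '</select>')
    print(sel.value)   # 'pick-me' -- first non-disabled fallback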
@@ -1333,13 +1347,10 @@ class SelectElement(InputMixin, HtmlElement):
             return
         checked_option = None
         if value is not None:
             value = value.strip()
         for el in _options_xpath(self):
             opt_value = el.get('value')
             if opt_value is None:
-                opt_value = el.text or ''
-                if opt_value:
-                    opt_value = opt_value.strip()
+                opt_value = (el.text or '').strip()
             if opt_value == value:
                 checked_option = el
                 break
@@ -1370,9 +1381,7 @@ class SelectElement(InputMixin, HtmlElement):
         for el in _options_xpath(self):
             value = el.get('value')
             if value is None:
-                value = el.text or ''
-                if value:
-                    value = value.strip()
+                value = (el.text or '').strip()
             options.append(value)
         return options
 
@@ -1417,18 +1426,14 @@ class MultipleSelectOptions(SetMixin):
             if 'selected' in option.attrib:
                 opt_value = option.get('value')
                 if opt_value is None:
-                    opt_value = option.text or ''
-                    if opt_value:
-                        opt_value = opt_value.strip()
+                    opt_value = (option.text or '').strip()
                 yield opt_value
 
     def add(self, item):
         for option in self.options:
             opt_value = option.get('value')
             if opt_value is None:
-                opt_value = option.text or ''
-                if opt_value:
-                    opt_value = opt_value.strip()
+                opt_value = (option.text or '').strip()
             if opt_value == item:
                 option.set('selected', '')
                 break
@@ -1440,9 +1445,7 @@ class MultipleSelectOptions(SetMixin):
         for option in self.options:
             opt_value = option.get('value')
             if opt_value is None:
-                opt_value = option.text or ''
-                if opt_value:
-                    opt_value = opt_value.strip()
+                opt_value = (option.text or '').strip()
             if opt_value == item:
                 if 'selected' in option.attrib:
                     del option.attrib['selected']
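Note: all of these call sites now share one rule for options without a `value` attribute: fall back to the stripped text, so a whitespace-only option compares as `''` rather than as raw whitespace. Sketch, assuming this lxml version:

    from lxml.html import fromstring

    sel = fromstring('<select multiple>'
                     '<option selected>  spam </option>'
                     '</select>')
    print(list(sel.value))    # ['spam'] -- text fallback, whitespace stripped
    sel.value.remove('spam')  # matching uses the same normalized value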
Binary files (not shown):
  Lib/site-packages/lxml/html/__pycache__/__init__.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/_setmixin.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/builder.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/clean.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/defs.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/diff.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/formfill.cpython-37.pyc (new file)
  (several more binary files changed; names not shown)
Lib/site-packages/lxml/html/_diffcommand.py

@@ -1,8 +1,10 @@
+from __future__ import absolute_import
+
 import optparse
 import sys
 import re
 import os
-from lxml.html.diff import htmldiff
+from .diff import htmldiff
 
 description = """\
 """

@@ -71,6 +73,7 @@ body_end_re = re.compile(
     r"</body.*?>", re.I|re.S)
 
 def split_body(html):
+    pre = post = ''
     match = body_start_re.search(html)
     if match:
         pre = html[:match.end()]
Lib/site-packages/lxml/html/_setmixin.py

@@ -1,4 +1,8 @@
-from collections import MutableSet
+try:
+    from collections.abc import MutableSet
+except ImportError:
+    from collections import MutableSet
 
 
 class SetMixin(MutableSet):
Binary file (not shown):
  Lib/site-packages/lxml/html/clean.cp37-win_amd64.pyd (new file)
Lib/site-packages/lxml/html/clean.py

@@ -1,19 +1,24 @@
+# cython: language_level=2
+
 """A cleanup tool for HTML.
 
 Removes unwanted tags and content.  See the `Cleaner` class for
 details.
 """
 
+from __future__ import absolute_import
+
 import re
 import copy
 try:
     from urlparse import urlsplit
+    from urllib import unquote_plus
 except ImportError:
     # Python 3
-    from urllib.parse import urlsplit
+    from urllib.parse import urlsplit, unquote_plus
 from lxml import etree
 from lxml.html import defs
-from lxml.html import fromstring, tostring, XHTML_NAMESPACE
+from lxml.html import fromstring, XHTML_NAMESPACE
 from lxml.html import xhtml_to_html, _transform_result
 
 try:
@@ -26,11 +31,6 @@ try:
 except NameError:
     # Python 3
     unicode = str
-try:
-    bytes
-except NameError:
-    # Python < 2.6
-    bytes = str
 try:
     basestring
 except NameError:
@@ -95,6 +95,7 @@ _find_external_links = etree.XPath(
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
 
+
 class Cleaner(object):
     """
     Instances cleans the document of each of the possible offending
@@ -112,7 +113,10 @@ class Cleaner(object):
         Removes any comments.
 
     ``style``:
-        Removes any style tags or attributes.
+        Removes any style tags.
+
+    ``inline_style``
+        Removes any style attributes.  Defaults to the value of the ``style`` option.
 
     ``links``:
         Removes any ``<link>`` tags
@@ -191,6 +195,7 @@ class Cleaner(object):
     javascript = True
     comments = True
     style = False
+    inline_style = None
     links = True
     meta = True
     page_structure = True
@@ -207,7 +212,7 @@ class Cleaner(object):
     safe_attrs = defs.safe_attrs
     add_nofollow = False
     host_whitelist = ()
-    whitelist_tags = set(['iframe', 'embed'])
+    whitelist_tags = {'iframe', 'embed'}
 
     def __init__(self, **kw):
         for name, value in kw.items():
@@ -215,6 +220,8 @@ class Cleaner(object):
                 raise TypeError(
                     "Unknown parameter: %s=%r" % (name, value))
             setattr(self, name, value)
+        if self.inline_style is None and 'inline_style' not in kw:
+            self.inline_style = self.style
 
         # Used to lookup the primary URL for a given tag that is up for
         # removal:
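Note: with the default wired up in `__init__`, `style` keeps governing `<style>` tags while the new `inline_style` option (defaulting to `style`) governs style="..." attributes. A sketch of the split, assuming this lxml version:

    from lxml.html.clean import Cleaner

    html = '<div style="color:red"><style>p {}</style>text</div>'
    print(Cleaner(style=True).clean_html(html))
    # -> '<div>text</div>'  (tags and attributes both removed)
    print(Cleaner(style=True, inline_style=False).clean_html(html))
    # -> '<div style="color:red">text</div>'  (only the <style> tag removed)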
@@ -280,9 +287,9 @@ class Cleaner(object):
                             del attrib[aname]
             doc.rewrite_links(self._remove_javascript_link,
                               resolve_base_href=False)
-            if not self.style:
-                # If we're deleting style then we don't have to remove JS links
-                # from styles, otherwise...
+            # If we're deleting style then we don't have to remove JS links
+            # from styles, otherwise...
+            if not self.inline_style:
                 for el in _find_styled_elements(doc):
                     old = el.get('style')
                     new = _css_javascript_re.sub('', old)
@@ -292,6 +299,7 @@ class Cleaner(object):
                         del el.attrib['style']
                     elif new != old:
                         el.set('style', new)
+            if not self.style:
                 for el in list(doc.iter('style')):
                     if el.get('type', '').lower().strip() == 'text/javascript':
                         el.drop_tree()
@@ -314,6 +322,7 @@ class Cleaner(object):
             kill_tags.add(etree.ProcessingInstruction)
         if self.style:
             kill_tags.add('style')
+        if self.inline_style:
             etree.strip_attributes(doc, 'style')
         if self.links:
             kill_tags.add('link')
@@ -473,7 +482,7 @@ class Cleaner(object):
 
     def _remove_javascript_link(self, link):
         # links like "j a v a s c r i p t:" might be interpreted in IE
-        new = _substitute_whitespace('', link)
+        new = _substitute_whitespace('', unquote_plus(link))
         if _is_javascript_scheme(new):
             # FIXME: should this be None to delete?
             return ''
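Note: unquoting before the scheme check closes a bypass where the dangerous scheme hid behind percent-encoding. Roughly, assuming this lxml version:

    from lxml.html.clean import Cleaner

    evil = '<a href="java%0ascript:alert(1)">x</a>'
    # unquote_plus turns 'java%0ascript:...' into 'java\nscript:...'; the
    # whitespace is then stripped, the javascript: scheme is detected, and
    # the href is emptied.
    print(Cleaner(javascript=True).clean_html(evil))  # <a href="">x</a>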
@@ -521,7 +530,7 @@ clean_html = clean.clean_html
 _link_regexes = [
     re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
     # This is conservative, but autolinking can be a bit conservative:
-    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
+    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
 ]
 
 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
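Note: the one-character regex fix is easy to miss: inside a character class, `[a-z0-9_._]` lists `.` twice and never allows `-`, so hyphenated mail domains failed to autolink. A standalone check:

    import re

    fixed = re.compile(
        r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I)
    m = fixed.search('mailto:bob@my-host.example')
    print(m.group('host'))  # 'my-host.example' -- the hyphen now matches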
Lib/site-packages/lxml/html/defs.py

@@ -6,12 +6,6 @@
 # and http://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
 # for html5_tags.
 
-try:
-    frozenset
-except NameError:
-    from sets import Set as frozenset
-
-
 empty_tags = frozenset([
     'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
     'img', 'input', 'isindex', 'link', 'meta', 'param'])
Binary file (not shown):
  Lib/site-packages/lxml/html/diff.cp37-win_amd64.pyd (new file)
Lib/site-packages/lxml/html/diff.py

@@ -1,3 +1,7 @@
+# cython: language_level=3
+
+from __future__ import absolute_import
+
 import difflib
 from lxml import etree
 from lxml.html import fragment_fromstring
@@ -382,7 +386,7 @@ def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
     """
     while 1:
         if not unbalanced_start:
-            # We have totally succeded in finding the position
+            # We have totally succeeded in finding the position
             break
         finding = unbalanced_start[0]
         finding_name = finding.split()[0].strip('<>')
@@ -621,7 +625,7 @@ def fixup_chunks(chunks):
                 % (cur_word, result, chunk, chunks))
             cur_word.post_tags.append(chunk)
         else:
-            assert(0)
+            assert False
 
     if not result:
         return [token('', pre_tags=tag_accum)]
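Note: `assert(0)` happens to work (one parenthesized expression), but the call-like spelling invites the classic `assert(cond, msg)` mistake, where the two-element tuple is always truthy; `assert False` avoids the pattern entirely. In isolation:

    try:
        assert (0, "this never fires")  # non-empty tuple is truthy
    except AssertionError:              # (3.8+ also emits a SyntaxWarning here)
        print("raised")
    else:
        print("silently passed")        # <- what actually happens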
@@ -799,7 +803,6 @@ def _move_el_inside_block(el, tag):
         if _contains_block_level_tag(child):
             break
     else:
-        import sys
         # No block-level tags in any child
         children_tag = etree.Element(tag)
         children_tag.text = el.text
Lib/site-packages/lxml/html/formfill.py

@@ -1,5 +1,5 @@
 from lxml.etree import XPath, ElementBase
-from lxml.html import fromstring, tostring, XHTML_NAMESPACE
+from lxml.html import fromstring, XHTML_NAMESPACE
 from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
 from lxml.html import defs
 import copy
Lib/site-packages/lxml/html/html5parser.py

@@ -1,15 +1,13 @@
 """
 An interface to html5lib that mimics the lxml.html interface.
 """
-
 import sys
 import string
 
 from html5lib import HTMLParser as _HTMLParser
 from html5lib.treebuilders.etree_lxml import TreeBuilder
-
 from lxml import etree
-from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
 
 # python3 compatibility
 try:
@@ -25,6 +23,7 @@ try:
 except ImportError:
     from urllib.parse import urlparse
 
+
 class HTMLParser(_HTMLParser):
     """An html5lib HTML parser with lxml as tree."""
 
@@ -53,28 +52,13 @@ def _find_tag(tree, tag):
     return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
 
 
-def document_fromstring(html, guess_charset=True, parser=None):
-    """Parse a whole document into a string."""
-    if not isinstance(html, _strings):
-        raise TypeError('string required')
-
-    if parser is None:
-        parser = html_parser
-
-    return parser.parse(html, useChardet=guess_charset).getroot()
-
-
-def fragments_fromstring(html, no_leading_text=False,
-                         guess_charset=False, parser=None):
-    """Parses several HTML elements, returning a list of elements.
-
-    The first item in the list may be a string.  If no_leading_text is true,
-    then it will be an error if there is leading text, and it will always be
-    a list of only elements.
-
-    If `guess_charset` is `True` and the text was not unicode but a
-    bytestring, the `chardet` library will perform charset guessing on the
-    string.
+def document_fromstring(html, guess_charset=None, parser=None):
+    """
+    Parse a whole document into a string.
+
+    If `guess_charset` is true, or if the input is not Unicode but a
+    byte string, the `chardet` library will perform charset guessing
+    on the string.
     """
     if not isinstance(html, _strings):
         raise TypeError('string required')
@@ -82,7 +66,41 @@ def fragments_fromstring(html, no_leading_text=False,
     if parser is None:
         parser = html_parser
 
-    children = parser.parseFragment(html, 'div', useChardet=guess_charset)
+    options = {}
+    if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument, if it
+        # detected the html argument would produce unicode objects.
+        guess_charset = True
+    if guess_charset is not None:
+        options['useChardet'] = guess_charset
+    return parser.parse(html, **options).getroot()
+
+
+def fragments_fromstring(html, no_leading_text=False,
+                         guess_charset=None, parser=None):
+    """Parses several HTML elements, returning a list of elements.
+
+    The first item in the list may be a string.  If no_leading_text is true,
+    then it will be an error if there is leading text, and it will always be
+    a list of only elements.
+
+    If `guess_charset` is true, the `chardet` library will perform charset
+    guessing on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+
+    if parser is None:
+        parser = html_parser
+
+    options = {}
+    if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument, if it
+        # detected the html argument would produce unicode objects.
+        guess_charset = False
+    if guess_charset is not None:
+        options['useChardet'] = guess_charset
+    children = parser.parseFragment(html, 'div', **options)
     if children and isinstance(children[0], _strings):
         if no_leading_text:
             if children[0].strip():
@@ -93,14 +111,17 @@
 
 
 def fragment_fromstring(html, create_parent=False,
-                        guess_charset=False, parser=None):
+                        guess_charset=None, parser=None):
     """Parses a single HTML element; it is an error if there is more than
     one element, or if anything but whitespace precedes or follows the
     element.
 
-    If create_parent is true (or is a tag name) then a parent node
+    If 'create_parent' is true (or is a tag name) then a parent node
     will be created to encapsulate the HTML in a single element.  In
     this case, leading or trailing text is allowed.
+
+    If `guess_charset` is true, the `chardet` library will perform charset
+    guessing on the string.
     """
     if not isinstance(html, _strings):
         raise TypeError('string required')
@@ -133,13 +154,18 @@
         return result
 
 
-def fromstring(html, guess_charset=True, parser=None):
+def fromstring(html, guess_charset=None, parser=None):
     """Parse the html, returning a single element/document.
 
     This tries to minimally parse the chunk of text, without knowing if it
     is a fragment or a document.
 
-    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
+    'base_url' will set the document's base_url attribute (and the tree's
+    docinfo.URL)
+
+    If `guess_charset` is true, or if the input is not Unicode but a
+    byte string, the `chardet` library will perform charset guessing
+    on the string.
     """
     if not isinstance(html, _strings):
         raise TypeError('string required')
@@ -147,7 +173,14 @@ def fromstring(html, guess_charset=True, parser=None):
                               guess_charset=guess_charset)
 
     # document starts with doctype or <html>, full document!
-    start = html[:50].lstrip().lower()
+    start = html[:50]
+    if isinstance(start, bytes):
+        # Allow text comparison in python3.
+        # Decode as ascii, that also covers latin-1 and utf-8 for the
+        # characters we need.
+        start = start.decode('ascii', 'replace')
+
+    start = start.lstrip().lower()
     if start.startswith('<html') or start.startswith('<!doctype'):
         return doc
 
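Note: the sniffing change decodes only the 50-character prefix, so the doctype check works for both `bytes` and `str` input. In isolation:

    html = b'  <!DOCTYPE html><html></html>'
    start = html[:50]
    if isinstance(start, bytes):
        # 'replace' keeps the comparison safe for any byte sequence
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    print(start.startswith('<!doctype'))  # True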
@@ -175,20 +208,40 @@ def fromstring(html, guess_charset=True, parser=None):
     return body
 
 
-def parse(filename_url_or_file, guess_charset=True, parser=None):
+def parse(filename_url_or_file, guess_charset=None, parser=None):
     """Parse a filename, URL, or file-like object into an HTML document
     tree.  Note: this returns a tree, not an element.  Use
     ``parse(...).getroot()`` to get the document root.
+
+    If ``guess_charset`` is true, the ``useChardet`` option is passed into
+    html5lib to enable character detection.  This option is on by default
+    when parsing from URLs, off by default when parsing from file(-like)
+    objects (which tend to return Unicode more often than not), and on by
+    default when parsing from a file path (which is read in binary mode).
     """
     if parser is None:
         parser = html_parser
     if not isinstance(filename_url_or_file, _strings):
         fp = filename_url_or_file
+        if guess_charset is None:
+            # assume that file-like objects return Unicode more often than bytes
+            guess_charset = False
     elif _looks_like_url(filename_url_or_file):
         fp = urlopen(filename_url_or_file)
+        if guess_charset is None:
+            # assume that URLs return bytes
+            guess_charset = True
     else:
         fp = open(filename_url_or_file, 'rb')
-    return parser.parse(fp, useChardet=guess_charset)
+        if guess_charset is None:
+            guess_charset = True
+
+    options = {}
+    # html5lib does not accept useChardet as an argument, if it
+    # detected the html argument would produce unicode objects.
+    if guess_charset:
+        options['useChardet'] = guess_charset
+    return parser.parse(fp, **options)
 
 
 def _looks_like_url(str):
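Note: across `document_fromstring`, `fragments_fromstring`, `fragment_fromstring`, `fromstring` and `parse`, `guess_charset` now defaults to `None` ("decide from the input") instead of a hard `True`/`False`, and `useChardet` is only forwarded to html5lib once resolved; byte input turns detection on for the document-level functions while fragment parsing leaves it off. A usage sketch, assuming html5lib (plus chardet for byte input) is installed:

    from lxml.html import html5parser

    # bytes in -> charset guessing defaults on
    doc = html5parser.fromstring(b'<p>caf\xc3\xa9</p>')
    # text in -> nothing to guess; useChardet is not passed to html5lib
    doc = html5parser.fromstring(u'<p>caf\xe9</p>')
    # an explicit value still overrides the per-input default
    doc = html5parser.fromstring(b'<p>hi</p>', guess_charset=False)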
Lib/site-packages/lxml/html/soupparser.py

@@ -9,12 +9,12 @@ from lxml import etree, html
 try:
     from bs4 import (
         BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
-        Declaration, CData, Doctype)
+        Declaration, Doctype)
     _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
 except ImportError:
     from BeautifulSoup import (
         BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
-        Declaration, CData)
+        Declaration)
     _DECLARATION_OR_DOCTYPE = Declaration
 
 
@@ -74,7 +74,7 @@ def _parse(source, beautifulsoup, makeelement, **bsargs):
         bsargs['convertEntities'] = 'html'
     if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):  # bs4
         if 'features' not in bsargs:
-            bsargs['features'] = ['html.parser']  # use Python html parser
+            bsargs['features'] = 'html.parser'  # use Python html parser
     tree = beautifulsoup(source, **bsargs)
     root = _convert_tree(tree, makeelement)
     # from ET: wrap the document in a html root element, if necessary
@@ -129,9 +129,13 @@ def _convert_tree(beautiful_soup_tree, makeelement):
     # may be a soup like '<meta><head><title>Hello</head><body>Hi
     # all<\p>'. In this example roots is a list containing meta, head
     # and body elements.
-    pre_root = beautiful_soup_tree.contents[:first_element_idx]
-    roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
-    post_root = beautiful_soup_tree.contents[last_element_idx+1:]
+    if first_element_idx is None:
+        pre_root = post_root = []
+        roots = beautiful_soup_tree.contents
+    else:
+        pre_root = beautiful_soup_tree.contents[:first_element_idx]
+        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
+        post_root = beautiful_soup_tree.contents[last_element_idx+1:]
 
     # Reorganize so that there is one <html> root...
     if html_root is not None:
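Note: the new `first_element_idx is None` branch covers soups that contain no elements at all (only text or comments), which previously hit `contents[:None]` slicing. Roughly, assuming BeautifulSoup (bs4) is installed:

    from lxml.html import soupparser

    root = soupparser.fromstring('just plain text, no tags')
    print(root.tag)  # the text ends up wrapped under a generated root element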
@@ -255,7 +259,7 @@ def _init_node_converters(makeelement):
 
     @converter(Comment)
     def convert_comment(bs_node, parent):
-        res = etree.Comment(bs_node)
+        res = html.HtmlComment(bs_node)
         if parent is not None:
             parent.append(res)
         return res
@@ -288,7 +292,14 @@ except ImportError:
     from htmlentitydefs import name2codepoint
 
 
-handle_entities = re.compile("&(\w+);").sub
+handle_entities = re.compile(r"&(\w+);").sub
+
+
+try:
+    unichr
+except NameError:
+    # Python 3
+    unichr = chr
 
 
 def unescape(string):
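Note: the raw-string change is part of the same Python 3.7 housekeeping as the rest of the commit: "&(\w+);" contains an escape sequence that plain strings do not define, which Python 3.6+ flags with a DeprecationWarning, while the raw string is warning-free and identical at runtime:

    import re

    handle_entities = re.compile(r"&(\w+);").sub  # raw string: no warning
    fix = lambda m: {'amp': '&'}.get(m.group(1), m.group(0))
    print(handle_entities(fix, 'fish &amp; chips'))  # 'fish & chips'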