update windows build to Python 3.7

parent 73105fa71e
commit ddc59ab92d

5761 changed files with 750298 additions and 213405 deletions
Lib/site-packages/lxml/html/__init__.py

@@ -46,7 +46,6 @@ import re
 from functools import partial
 
 try:
-    # while unnecessary, importing from 'collections.abc' is the right way to do it
     from collections.abc import MutableMapping, MutableSet
 except ImportError:
     from collections import MutableMapping, MutableSet
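Note: the try/except kept above is the usual forward-compatible import idiom (the ABCs moved to collections.abc in Python 3.3, and the plain collections alias is deprecated as of 3.7, the version this commit targets). A minimal standalone sketch of the same pattern:

    # Prefer the Python 3 location; fall back for Python 2.
    try:
        from collections.abc import MutableSet
    except ImportError:  # Python 2
        from collections import MutableSet

    print(issubclass(set, MutableSet))  # True under either import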
@@ -239,6 +238,15 @@ class Classes(MutableSet):
 
 class HtmlMixin(object):
 
+    def set(self, key, value=None):
+        """set(self, key, value=None)
+
+        Sets an element attribute. If no value is provided, or if the value is None,
+        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
+        for ``form.set('novalidate')``.
+        """
+        super(HtmlElement, self).set(key, value)
+
     @property
     def classes(self):
         """
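Note: a quick sketch of what the new value-less `set()` enables, assuming this updated copy of lxml is importable:

    from lxml.html import fragment_fromstring, tostring

    form = fragment_fromstring('<form></form>')
    form.set('novalidate')         # value omitted -> boolean attribute
    print(tostring(form))          # b'<form novalidate></form>'
    form.set('action', '/submit')  # ordinary key/value still works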
@ -682,8 +690,9 @@ class HtmlComment(etree.CommentBase, HtmlMixin):
|
|||
|
||||
|
||||
class HtmlElement(etree.ElementBase, HtmlMixin):
|
||||
# Override etree.ElementBase.cssselect, despite the MRO
|
||||
# Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
|
||||
cssselect = HtmlMixin.cssselect
|
||||
set = HtmlMixin.set
|
||||
|
||||
|
||||
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
|
||||
|
|
@@ -762,15 +771,14 @@ def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
 
 def fragments_fromstring(html, no_leading_text=False, base_url=None,
                          parser=None, **kw):
-    """
-    Parses several HTML elements, returning a list of elements.
+    """Parses several HTML elements, returning a list of elements.
 
-    The first item in the list may be a string (though leading
-    whitespace is removed).  If no_leading_text is true, then it will
-    be an error if there is leading text, and it will always be a list
-    of only elements.
+    The first item in the list may be a string.
+    If no_leading_text is true, then it will be an error if there is
+    leading text, and it will always be a list of only elements.
 
-    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
+    base_url will set the document's base_url attribute
+    (and the tree's docinfo.URL).
     """
     if parser is None:
         parser = html_parser
@@ -1010,7 +1018,7 @@ class FormElement(HtmlElement):
         results = []
         for el in self.inputs:
             name = el.name
-            if not name:
+            if not name or 'disabled' in el.attrib:
                 continue
             tag = _nons(el.tag)
             if tag == 'textarea':
@@ -1027,7 +1035,7 @@ class FormElement(HtmlElement):
                     "Unexpected tag: %r" % el)
             if el.checkable and not el.checked:
                 continue
-            if el.type in ('submit', 'image', 'reset'):
+            if el.type in ('submit', 'image', 'reset', 'file'):
                 continue
             value = el.value
             if value is not None:
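Note: taken together, the two `form_values()` changes above keep disabled controls and file inputs out of the submitted pairs, matching what a browser sends. A small sketch, assuming this lxml version:

    from lxml.html import fromstring

    form = fromstring(
        '<form>'
        '<input name="a" value="1">'
        '<input name="b" value="2" disabled>'
        '<input name="up" type="file">'
        '</form>')
    print(form.form_values())  # [('a', '1')] -- 'b' and 'up' are skipped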
@@ -1128,6 +1136,8 @@ def open_http_urllib(method, url, values):
         data = None
     else:
         data = urlencode(values)
+        if not isinstance(data, bytes):
+            data = data.encode('ASCII')
     return urlopen(url, data)
 
 
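Note: the added encode matters because Python 3's `urlopen` rejects `str` POST bodies, and `urlencode` returns text. In isolation:

    from urllib.parse import urlencode

    data = urlencode({'q': 'lxml', 'page': 2})
    if not isinstance(data, bytes):
        data = data.encode('ASCII')  # urlopen(url, data) needs bytes on Python 3
    print(data)                      # b'q=lxml&page=2'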
@ -1312,15 +1322,19 @@ class SelectElement(InputMixin, HtmlElement):
|
|||
"""
|
||||
if self.multiple:
|
||||
return MultipleSelectOptions(self)
|
||||
for el in _options_xpath(self):
|
||||
if el.get('selected') is not None:
|
||||
value = el.get('value')
|
||||
if value is None:
|
||||
value = el.text or ''
|
||||
if value:
|
||||
value = value.strip()
|
||||
return value
|
||||
return None
|
||||
options = _options_xpath(self)
|
||||
|
||||
try:
|
||||
selected_option = next(el for el in reversed(options) if el.get('selected') is not None)
|
||||
except StopIteration:
|
||||
try:
|
||||
selected_option = next(el for el in options if el.get('disabled') is None)
|
||||
except StopIteration:
|
||||
return None
|
||||
value = selected_option.get('value')
|
||||
if value is None:
|
||||
value = (selected_option.text or '').strip()
|
||||
return value
|
||||
|
||||
@value.setter
|
||||
def value(self, value):
|
||||
|
|
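Note: the rewritten getter mirrors browser rules for single selects: the last `selected` option wins, and with none selected the first non-disabled option is used. A sketch, assuming this lxml version:

    from lxml.html import fromstring

    sel = fromstring('<select>'
                     '<option selected>first</option>'
                     '<option value="b" selected>second</option>'
                     '</select>')
    print(sel.value)   # 'b' -- the last selected option wins

    sel = fromstring('<select>'
                     '<option disabled>skip</option>'
                     '<option>pick-me</option>'
                     '</select>')
    print(sel.value)   # 'pick-me' -- first non-disabled fallback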
@@ -1333,13 +1347,10 @@ class SelectElement(InputMixin, HtmlElement):
             return
         checked_option = None
         if value is not None:
             value = value.strip()
         for el in _options_xpath(self):
             opt_value = el.get('value')
             if opt_value is None:
-                opt_value = el.text or ''
-                if opt_value:
-                    opt_value = opt_value.strip()
+                opt_value = (el.text or '').strip()
             if opt_value == value:
                 checked_option = el
                 break
@@ -1370,9 +1381,7 @@ class SelectElement(InputMixin, HtmlElement):
         for el in _options_xpath(self):
             value = el.get('value')
             if value is None:
-                value = el.text or ''
-                if value:
-                    value = value.strip()
+                value = (el.text or '').strip()
             options.append(value)
         return options
 
@@ -1417,18 +1426,14 @@ class MultipleSelectOptions(SetMixin):
             if 'selected' in option.attrib:
                 opt_value = option.get('value')
                 if opt_value is None:
-                    opt_value = option.text or ''
-                    if opt_value:
-                        opt_value = opt_value.strip()
+                    opt_value = (option.text or '').strip()
                 yield opt_value
 
     def add(self, item):
         for option in self.options:
             opt_value = option.get('value')
             if opt_value is None:
-                opt_value = option.text or ''
-                if opt_value:
-                    opt_value = opt_value.strip()
+                opt_value = (option.text or '').strip()
             if opt_value == item:
                 option.set('selected', '')
                 break
@@ -1440,9 +1445,7 @@ class MultipleSelectOptions(SetMixin):
         for option in self.options:
             opt_value = option.get('value')
             if opt_value is None:
-                opt_value = option.text or ''
-                if opt_value:
-                    opt_value = opt_value.strip()
+                opt_value = (option.text or '').strip()
             if opt_value == item:
                 if 'selected' in option.attrib:
                     del option.attrib['selected']
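Note: all of these call sites now share one rule for options without a `value` attribute: fall back to the stripped text, so a whitespace-only option compares as `''` rather than as raw whitespace. Sketch, assuming this lxml version:

    from lxml.html import fromstring

    sel = fromstring('<select multiple>'
                     '<option selected>  spam </option>'
                     '</select>')
    print(list(sel.value))    # ['spam'] -- text fallback, whitespace stripped
    sel.value.remove('spam')  # matching uses the same normalized value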
Binary files (not shown):
  Lib/site-packages/lxml/html/__pycache__/__init__.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/_setmixin.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/builder.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/clean.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/defs.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/diff.cpython-37.pyc (new file)
  Lib/site-packages/lxml/html/__pycache__/formfill.cpython-37.pyc (new file)
  (several more binary files changed; names not shown)
Lib/site-packages/lxml/html/_diffcommand.py

@@ -1,8 +1,10 @@
+from __future__ import absolute_import
+
 import optparse
 import sys
 import re
 import os
-from lxml.html.diff import htmldiff
+from .diff import htmldiff
 
 description = """\
 """

@@ -71,6 +73,7 @@ body_end_re = re.compile(
     r"</body.*?>", re.I|re.S)
 
 def split_body(html):
+    pre = post = ''
     match = body_start_re.search(html)
     if match:
         pre = html[:match.end()]
Lib/site-packages/lxml/html/_setmixin.py

@@ -1,4 +1,8 @@
-from collections import MutableSet
+try:
+    from collections.abc import MutableSet
+except ImportError:
+    from collections import MutableSet
 
 
 class SetMixin(MutableSet):
Binary file (not shown):
  Lib/site-packages/lxml/html/clean.cp37-win_amd64.pyd (new file)
Lib/site-packages/lxml/html/clean.py

@@ -1,19 +1,24 @@
+# cython: language_level=2
+
 """A cleanup tool for HTML.
 
 Removes unwanted tags and content.  See the `Cleaner` class for
 details.
 """
 
+from __future__ import absolute_import
+
 import re
 import copy
 try:
     from urlparse import urlsplit
+    from urllib import unquote_plus
 except ImportError:
     # Python 3
-    from urllib.parse import urlsplit
+    from urllib.parse import urlsplit, unquote_plus
 from lxml import etree
 from lxml.html import defs
-from lxml.html import fromstring, tostring, XHTML_NAMESPACE
+from lxml.html import fromstring, XHTML_NAMESPACE
 from lxml.html import xhtml_to_html, _transform_result
 
 try:
@@ -26,11 +31,6 @@ try:
 except NameError:
     # Python 3
     unicode = str
-try:
-    bytes
-except NameError:
-    # Python < 2.6
-    bytes = str
 try:
     basestring
 except NameError:
@@ -95,6 +95,7 @@ _find_external_links = etree.XPath(
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
 
+
 class Cleaner(object):
     """
     Instances cleans the document of each of the possible offending
@@ -112,7 +113,10 @@ class Cleaner(object):
         Removes any comments.
 
     ``style``:
-        Removes any style tags or attributes.
+        Removes any style tags.
+
+    ``inline_style``
+        Removes any style attributes.  Defaults to the value of the ``style`` option.
 
     ``links``:
         Removes any ``<link>`` tags
@@ -191,6 +195,7 @@ class Cleaner(object):
     javascript = True
     comments = True
     style = False
+    inline_style = None
     links = True
     meta = True
     page_structure = True
@@ -207,7 +212,7 @@ class Cleaner(object):
     safe_attrs = defs.safe_attrs
     add_nofollow = False
     host_whitelist = ()
-    whitelist_tags = set(['iframe', 'embed'])
+    whitelist_tags = {'iframe', 'embed'}
 
     def __init__(self, **kw):
         for name, value in kw.items():
@@ -215,6 +220,8 @@ class Cleaner(object):
                 raise TypeError(
                     "Unknown parameter: %s=%r" % (name, value))
             setattr(self, name, value)
+        if self.inline_style is None and 'inline_style' not in kw:
+            self.inline_style = self.style
 
         # Used to lookup the primary URL for a given tag that is up for
         # removal:
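Note: with the default wired up in `__init__`, `style` keeps governing `<style>` tags while the new `inline_style` option (defaulting to `style`) governs style="..." attributes. A sketch of the split, assuming this lxml version:

    from lxml.html.clean import Cleaner

    html = '<div style="color:red"><style>p {}</style>text</div>'
    print(Cleaner(style=True).clean_html(html))
    # -> '<div>text</div>'  (tags and attributes both removed)
    print(Cleaner(style=True, inline_style=False).clean_html(html))
    # -> '<div style="color:red">text</div>'  (only the <style> tag removed)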
@@ -280,9 +287,9 @@ class Cleaner(object):
                             del attrib[aname]
             doc.rewrite_links(self._remove_javascript_link,
                               resolve_base_href=False)
-            if not self.style:
-                # If we're deleting style then we don't have to remove JS links
-                # from styles, otherwise...
+            # If we're deleting style then we don't have to remove JS links
+            # from styles, otherwise...
+            if not self.inline_style:
                 for el in _find_styled_elements(doc):
                     old = el.get('style')
                     new = _css_javascript_re.sub('', old)
@@ -292,6 +299,7 @@ class Cleaner(object):
                         del el.attrib['style']
                     elif new != old:
                         el.set('style', new)
+            if not self.style:
                 for el in list(doc.iter('style')):
                     if el.get('type', '').lower().strip() == 'text/javascript':
                         el.drop_tree()
@@ -314,6 +322,7 @@ class Cleaner(object):
             kill_tags.add(etree.ProcessingInstruction)
         if self.style:
             kill_tags.add('style')
+        if self.inline_style:
             etree.strip_attributes(doc, 'style')
         if self.links:
             kill_tags.add('link')
@@ -473,7 +482,7 @@ class Cleaner(object):
 
     def _remove_javascript_link(self, link):
         # links like "j a v a s c r i p t:" might be interpreted in IE
-        new = _substitute_whitespace('', link)
+        new = _substitute_whitespace('', unquote_plus(link))
         if _is_javascript_scheme(new):
             # FIXME: should this be None to delete?
             return ''
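Note: unquoting before the scheme check closes a bypass where the dangerous scheme hid behind percent-encoding. Roughly, assuming this lxml version:

    from lxml.html.clean import Cleaner

    evil = '<a href="java%0ascript:alert(1)">x</a>'
    # unquote_plus turns 'java%0ascript:...' into 'java\nscript:...'; the
    # whitespace is then stripped, the javascript: scheme is detected, and
    # the href is emptied.
    print(Cleaner(javascript=True).clean_html(evil))  # <a href="">x</a>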
@@ -521,7 +530,7 @@ clean_html = clean.clean_html
 _link_regexes = [
     re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
     # This is conservative, but autolinking can be a bit conservative:
-    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
+    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
 ]
 
 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
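Note: the one-character regex fix is easy to miss: inside a character class, `[a-z0-9_._]` lists `.` twice and never allows `-`, so hyphenated mail domains failed to autolink. A standalone check:

    import re

    fixed = re.compile(
        r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I)
    m = fixed.search('mailto:bob@my-host.example')
    print(m.group('host'))  # 'my-host.example' -- the hyphen now matches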
Lib/site-packages/lxml/html/defs.py

@@ -6,12 +6,6 @@
 # and http://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
 # for html5_tags.
 
-try:
-    frozenset
-except NameError:
-    from sets import Set as frozenset
-
-
 empty_tags = frozenset([
     'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
     'img', 'input', 'isindex', 'link', 'meta', 'param'])
Binary file (not shown):
  Lib/site-packages/lxml/html/diff.cp37-win_amd64.pyd (new file)
Lib/site-packages/lxml/html/diff.py

@@ -1,3 +1,7 @@
+# cython: language_level=3
+
+from __future__ import absolute_import
+
 import difflib
 from lxml import etree
 from lxml.html import fragment_fromstring
@@ -382,7 +386,7 @@ def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
     """
     while 1:
         if not unbalanced_start:
-            # We have totally succeded in finding the position
+            # We have totally succeeded in finding the position
             break
         finding = unbalanced_start[0]
         finding_name = finding.split()[0].strip('<>')
@@ -621,7 +625,7 @@ def fixup_chunks(chunks):
                 % (cur_word, result, chunk, chunks))
             cur_word.post_tags.append(chunk)
         else:
-            assert(0)
+            assert False
 
     if not result:
         return [token('', pre_tags=tag_accum)]
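Note: `assert(0)` happens to work (one parenthesized expression), but the call-like spelling invites the classic `assert(cond, msg)` mistake, where the two-element tuple is always truthy; `assert False` avoids the pattern entirely. In isolation:

    try:
        assert (0, "this never fires")  # non-empty tuple is truthy
    except AssertionError:              # (3.8+ also emits a SyntaxWarning here)
        print("raised")
    else:
        print("silently passed")        # <- what actually happens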
@@ -799,7 +803,6 @@ def _move_el_inside_block(el, tag):
         if _contains_block_level_tag(child):
             break
     else:
-        import sys
         # No block-level tags in any child
         children_tag = etree.Element(tag)
         children_tag.text = el.text
Lib/site-packages/lxml/html/formfill.py

@@ -1,5 +1,5 @@
 from lxml.etree import XPath, ElementBase
-from lxml.html import fromstring, tostring, XHTML_NAMESPACE
+from lxml.html import fromstring, XHTML_NAMESPACE
 from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
 from lxml.html import defs
 import copy
Lib/site-packages/lxml/html/html5parser.py

@@ -1,15 +1,13 @@
 """
 An interface to html5lib that mimics the lxml.html interface.
 """
-
 import sys
 import string
 
 from html5lib import HTMLParser as _HTMLParser
 from html5lib.treebuilders.etree_lxml import TreeBuilder
-
 from lxml import etree
-from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
 
 # python3 compatibility
 try:
@@ -25,6 +23,7 @@ try:
 except ImportError:
     from urllib.parse import urlparse
 
+
 class HTMLParser(_HTMLParser):
     """An html5lib HTML parser with lxml as tree."""
 
@@ -53,28 +52,13 @@ def _find_tag(tree, tag):
     return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
 
 
-def document_fromstring(html, guess_charset=True, parser=None):
-    """Parse a whole document into a string."""
-    if not isinstance(html, _strings):
-        raise TypeError('string required')
-
-    if parser is None:
-        parser = html_parser
-
-    return parser.parse(html, useChardet=guess_charset).getroot()
-
-
-def fragments_fromstring(html, no_leading_text=False,
-                         guess_charset=False, parser=None):
-    """Parses several HTML elements, returning a list of elements.
-
-    The first item in the list may be a string.  If no_leading_text is true,
-    then it will be an error if there is leading text, and it will always be
-    a list of only elements.
-
-    If `guess_charset` is `True` and the text was not unicode but a
-    bytestring, the `chardet` library will perform charset guessing on the
-    string.
+def document_fromstring(html, guess_charset=None, parser=None):
+    """
+    Parse a whole document into a string.
+
+    If `guess_charset` is true, or if the input is not Unicode but a
+    byte string, the `chardet` library will perform charset guessing
+    on the string.
     """
     if not isinstance(html, _strings):
         raise TypeError('string required')
@@ -82,7 +66,41 @@ def fragments_fromstring(html, no_leading_text=False,
     if parser is None:
         parser = html_parser
 
-    children = parser.parseFragment(html, 'div', useChardet=guess_charset)
+    options = {}
+    if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument, if it
+        # detected the html argument would produce unicode objects.
+        guess_charset = True
+    if guess_charset is not None:
+        options['useChardet'] = guess_charset
+    return parser.parse(html, **options).getroot()
+
+
+def fragments_fromstring(html, no_leading_text=False,
+                         guess_charset=None, parser=None):
+    """Parses several HTML elements, returning a list of elements.
+
+    The first item in the list may be a string.  If no_leading_text is true,
+    then it will be an error if there is leading text, and it will always be
+    a list of only elements.
+
+    If `guess_charset` is true, the `chardet` library will perform charset
+    guessing on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+
+    if parser is None:
+        parser = html_parser
+
+    options = {}
+    if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument, if it
+        # detected the html argument would produce unicode objects.
+        guess_charset = False
+    if guess_charset is not None:
+        options['useChardet'] = guess_charset
+    children = parser.parseFragment(html, 'div', **options)
     if children and isinstance(children[0], _strings):
         if no_leading_text:
             if children[0].strip():
@@ -93,14 +111,17 @@
 
 
 def fragment_fromstring(html, create_parent=False,
-                        guess_charset=False, parser=None):
+                        guess_charset=None, parser=None):
     """Parses a single HTML element; it is an error if there is more than
     one element, or if anything but whitespace precedes or follows the
     element.
 
-    If create_parent is true (or is a tag name) then a parent node
+    If 'create_parent' is true (or is a tag name) then a parent node
     will be created to encapsulate the HTML in a single element.  In
     this case, leading or trailing text is allowed.
+
+    If `guess_charset` is true, the `chardet` library will perform charset
+    guessing on the string.
     """
     if not isinstance(html, _strings):
         raise TypeError('string required')
@@ -133,13 +154,18 @@
         return result
 
 
-def fromstring(html, guess_charset=True, parser=None):
+def fromstring(html, guess_charset=None, parser=None):
     """Parse the html, returning a single element/document.
 
     This tries to minimally parse the chunk of text, without knowing if it
     is a fragment or a document.
 
-    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
+    'base_url' will set the document's base_url attribute (and the tree's
+    docinfo.URL)
+
+    If `guess_charset` is true, or if the input is not Unicode but a
+    byte string, the `chardet` library will perform charset guessing
+    on the string.
     """
     if not isinstance(html, _strings):
         raise TypeError('string required')
@@ -147,7 +173,14 @@ def fromstring(html, guess_charset=True, parser=None):
                               guess_charset=guess_charset)
 
     # document starts with doctype or <html>, full document!
-    start = html[:50].lstrip().lower()
+    start = html[:50]
+    if isinstance(start, bytes):
+        # Allow text comparison in python3.
+        # Decode as ascii, that also covers latin-1 and utf-8 for the
+        # characters we need.
+        start = start.decode('ascii', 'replace')
+
+    start = start.lstrip().lower()
     if start.startswith('<html') or start.startswith('<!doctype'):
         return doc
 
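Note: the sniffing change decodes only the 50-character prefix, so the doctype check works for both `bytes` and `str` input. In isolation:

    html = b'  <!DOCTYPE html><html></html>'
    start = html[:50]
    if isinstance(start, bytes):
        # 'replace' keeps the comparison safe for any byte sequence
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    print(start.startswith('<!doctype'))  # True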
@@ -175,20 +208,40 @@ def fromstring(html, guess_charset=True, parser=None):
     return body
 
 
-def parse(filename_url_or_file, guess_charset=True, parser=None):
+def parse(filename_url_or_file, guess_charset=None, parser=None):
     """Parse a filename, URL, or file-like object into an HTML document
     tree.  Note: this returns a tree, not an element.  Use
     ``parse(...).getroot()`` to get the document root.
+
+    If ``guess_charset`` is true, the ``useChardet`` option is passed into
+    html5lib to enable character detection.  This option is on by default
+    when parsing from URLs, off by default when parsing from file(-like)
+    objects (which tend to return Unicode more often than not), and on by
+    default when parsing from a file path (which is read in binary mode).
     """
     if parser is None:
         parser = html_parser
     if not isinstance(filename_url_or_file, _strings):
         fp = filename_url_or_file
+        if guess_charset is None:
+            # assume that file-like objects return Unicode more often than bytes
+            guess_charset = False
     elif _looks_like_url(filename_url_or_file):
         fp = urlopen(filename_url_or_file)
+        if guess_charset is None:
+            # assume that URLs return bytes
+            guess_charset = True
     else:
         fp = open(filename_url_or_file, 'rb')
-    return parser.parse(fp, useChardet=guess_charset)
+        if guess_charset is None:
+            guess_charset = True
+
+    options = {}
+    # html5lib does not accept useChardet as an argument, if it
+    # detected the html argument would produce unicode objects.
+    if guess_charset:
+        options['useChardet'] = guess_charset
+    return parser.parse(fp, **options)
 
 
 def _looks_like_url(str):
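Note: across `document_fromstring`, `fragments_fromstring`, `fragment_fromstring`, `fromstring` and `parse`, `guess_charset` now defaults to `None` ("decide from the input") instead of a hard `True`/`False`, and `useChardet` is only forwarded to html5lib once resolved; byte input turns detection on for the document-level functions while fragment parsing leaves it off. A usage sketch, assuming html5lib (plus chardet for byte input) is installed:

    from lxml.html import html5parser

    # bytes in -> charset guessing defaults on
    doc = html5parser.fromstring(b'<p>caf\xc3\xa9</p>')
    # text in -> nothing to guess; useChardet is not passed to html5lib
    doc = html5parser.fromstring(u'<p>caf\xe9</p>')
    # an explicit value still overrides the per-input default
    doc = html5parser.fromstring(b'<p>hi</p>', guess_charset=False)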
Lib/site-packages/lxml/html/soupparser.py

@@ -9,12 +9,12 @@ from lxml import etree, html
 try:
     from bs4 import (
         BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
-        Declaration, CData, Doctype)
+        Declaration, Doctype)
     _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
 except ImportError:
     from BeautifulSoup import (
         BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
-        Declaration, CData)
+        Declaration)
     _DECLARATION_OR_DOCTYPE = Declaration
 
 
@@ -74,7 +74,7 @@ def _parse(source, beautifulsoup, makeelement, **bsargs):
         bsargs['convertEntities'] = 'html'
     if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"):  # bs4
         if 'features' not in bsargs:
-            bsargs['features'] = ['html.parser']  # use Python html parser
+            bsargs['features'] = 'html.parser'  # use Python html parser
     tree = beautifulsoup(source, **bsargs)
     root = _convert_tree(tree, makeelement)
     # from ET: wrap the document in a html root element, if necessary
@@ -129,9 +129,13 @@ def _convert_tree(beautiful_soup_tree, makeelement):
     # may be a soup like '<meta><head><title>Hello</head><body>Hi
     # all<\p>'. In this example roots is a list containing meta, head
     # and body elements.
-    pre_root = beautiful_soup_tree.contents[:first_element_idx]
-    roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
-    post_root = beautiful_soup_tree.contents[last_element_idx+1:]
+    if first_element_idx is None:
+        pre_root = post_root = []
+        roots = beautiful_soup_tree.contents
+    else:
+        pre_root = beautiful_soup_tree.contents[:first_element_idx]
+        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
+        post_root = beautiful_soup_tree.contents[last_element_idx+1:]
 
     # Reorganize so that there is one <html> root...
     if html_root is not None:
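Note: the new `first_element_idx is None` branch covers soups that contain no elements at all (only text or comments), which previously hit `contents[:None]` slicing. Roughly, assuming BeautifulSoup (bs4) is installed:

    from lxml.html import soupparser

    root = soupparser.fromstring('just plain text, no tags')
    print(root.tag)  # the text ends up wrapped under a generated root element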
@@ -255,7 +259,7 @@ def _init_node_converters(makeelement):
 
     @converter(Comment)
     def convert_comment(bs_node, parent):
-        res = etree.Comment(bs_node)
+        res = html.HtmlComment(bs_node)
         if parent is not None:
             parent.append(res)
         return res
@@ -288,7 +292,14 @@ except ImportError:
     from htmlentitydefs import name2codepoint
 
 
-handle_entities = re.compile("&(\w+);").sub
+handle_entities = re.compile(r"&(\w+);").sub
+
+
+try:
+    unichr
+except NameError:
+    # Python 3
+    unichr = chr
 
 
 def unescape(string):
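Note: the raw-string change is part of the same Python 3.7 housekeeping as the rest of the commit: "&(\w+);" contains an escape sequence that plain strings do not define, which Python 3.6+ flags with a DeprecationWarning, while the raw string is warning-free and identical at runtime:

    import re

    handle_entities = re.compile(r"&(\w+);").sub  # raw string: no warning
    fix = lambda m: {'amp': '&'}.get(m.group(1), m.group(0))
    print(handle_entities(fix, 'fish &amp; chips'))  # 'fish & chips'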