update windows build to Python 3.7

commit ddc59ab92d
j committed 2019-01-20 16:05:31 +05:30
5761 changed files with 750298 additions and 213405 deletions


@@ -46,7 +46,6 @@ import re
from functools import partial
try:
# while unnecessary, importing from 'collections.abc' is the right way to do it
from collections.abc import MutableMapping, MutableSet
except ImportError:
from collections import MutableMapping, MutableSet
@@ -239,6 +238,15 @@ class Classes(MutableSet):
class HtmlMixin(object):
def set(self, key, value=None):
"""set(self, key, value=None)
Sets an element attribute. If no value is provided, or if the value is None,
creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
for ``form.set('novalidate')``.
"""
super(HtmlElement, self).set(key, value)
@property
def classes(self):
"""
@@ -682,8 +690,9 @@ class HtmlComment(etree.CommentBase, HtmlMixin):
class HtmlElement(etree.ElementBase, HtmlMixin):
# Override etree.ElementBase.cssselect, despite the MRO
# Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
cssselect = HtmlMixin.cssselect
set = HtmlMixin.set
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
@@ -762,15 +771,14 @@ def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
def fragments_fromstring(html, no_leading_text=False, base_url=None,
parser=None, **kw):
"""
Parses several HTML elements, returning a list of elements.
"""Parses several HTML elements, returning a list of elements.
The first item in the list may be a string (though leading
whitespace is removed). If no_leading_text is true, then it will
be an error if there is leading text, and it will always be a list
of only elements.
The first item in the list may be a string.
If no_leading_text is true, then it will be an error if there is
leading text, and it will always be a list of only elements.
base_url will set the document's base_url attribute (and the tree's docinfo.URL)
base_url will set the document's base_url attribute
(and the tree's docinfo.URL).
"""
if parser is None:
parser = html_parser
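
The docstring reflow above does not change behavior; for reference, a small sketch of the leading-text rule it describes:

from lxml.html import fragments_fromstring

parts = fragments_fromstring('Hello <b>world</b>')
print(parts[0])        # leading text, e.g. 'Hello '
print(parts[1].tag)    # 'b'

# With no_leading_text=True, leading text is an error instead.
try:
    fragments_fromstring('Hello <b>world</b>', no_leading_text=True)
except Exception as exc:   # lxml raises an error for the leading text
    print(type(exc).__name__)
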
@@ -1010,7 +1018,7 @@ class FormElement(HtmlElement):
results = []
for el in self.inputs:
name = el.name
if not name:
if not name or 'disabled' in el.attrib:
continue
tag = _nons(el.tag)
if tag == 'textarea':
@@ -1027,7 +1035,7 @@ class FormElement(HtmlElement):
"Unexpected tag: %r" % el)
if el.checkable and not el.checked:
continue
if el.type in ('submit', 'image', 'reset'):
if el.type in ('submit', 'image', 'reset', 'file'):
continue
value = el.value
if value is not None:
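
Two behavior changes land in these hunks: controls carrying a disabled attribute are skipped, and file inputs are excluded from the successful form values, both matching how browsers serialize forms. A quick sketch (assuming an lxml with this change):

from lxml.html import fromstring

form = fromstring(
    '<form>'
    '<input name="user" value="alice">'
    '<input name="token" value="x" disabled>'
    '<input name="upload" type="file">'
    '</form>')
# Disabled controls and file inputs no longer show up here.
print(form.form_values())   # expected: [('user', 'alice')]
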
@@ -1128,6 +1136,8 @@ def open_http_urllib(method, url, values):
data = None
else:
data = urlencode(values)
if not isinstance(data, bytes):
data = data.encode('ASCII')
return urlopen(url, data)
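
On Python 3, urlencode() returns str while urlopen() requires bytes for POST data, hence the added encode. The same idiom in isolation (a sketch, not lxml's exact code path):

try:
    from urllib.parse import urlencode      # Python 3
    from urllib.request import urlopen
except ImportError:
    from urllib import urlencode, urlopen   # Python 2

data = urlencode({'q': 'lxml'})
if not isinstance(data, bytes):
    # urlopen() rejects str bodies on Python 3; ASCII covers the
    # percent-encoded output of urlencode().
    data = data.encode('ASCII')
# response = urlopen('https://example.org/search', data)  # network call
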
@@ -1312,15 +1322,19 @@ class SelectElement(InputMixin, HtmlElement):
"""
if self.multiple:
return MultipleSelectOptions(self)
for el in _options_xpath(self):
if el.get('selected') is not None:
value = el.get('value')
if value is None:
value = el.text or ''
if value:
value = value.strip()
return value
return None
options = _options_xpath(self)
try:
selected_option = next(el for el in reversed(options) if el.get('selected') is not None)
except StopIteration:
try:
selected_option = next(el for el in options if el.get('disabled') is None)
except StopIteration:
return None
value = selected_option.get('value')
if value is None:
value = (selected_option.text or '').strip()
return value
@value.setter
def value(self, value):
@@ -1333,13 +1347,10 @@ class SelectElement(InputMixin, HtmlElement):
return
checked_option = None
if value is not None:
value = value.strip()
for el in _options_xpath(self):
opt_value = el.get('value')
if opt_value is None:
opt_value = el.text or ''
if opt_value:
opt_value = opt_value.strip()
opt_value = (el.text or '').strip()
if opt_value == value:
checked_option = el
break
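
Together these two hunks tighten single-select semantics: the getter now prefers the last option flagged selected (as browsers do) and falls back to the first non-disabled option, while getter and setter both strip option text consistently. A sketch (assuming an lxml with this change):

from lxml.html import fromstring

select = fromstring(
    '<select name="color">'
    '<option value="red" selected>Red</option>'
    '<option value="blue" selected>Blue</option>'
    '</select>')
# When several options claim 'selected', the last one wins.
print(select.value)   # expected: 'blue'

# Without any 'selected' flag, the first non-disabled option is used.
select = fromstring(
    '<select><option disabled> a </option><option> b </option></select>')
print(select.value)   # expected: 'b' (option text is stripped)
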
@@ -1370,9 +1381,7 @@ class SelectElement(InputMixin, HtmlElement):
for el in _options_xpath(self):
value = el.get('value')
if value is None:
value = el.text or ''
if value:
value = value.strip()
value = (el.text or '').strip()
options.append(value)
return options
@@ -1417,18 +1426,14 @@ class MultipleSelectOptions(SetMixin):
if 'selected' in option.attrib:
opt_value = option.get('value')
if opt_value is None:
opt_value = option.text or ''
if opt_value:
opt_value = opt_value.strip()
opt_value = (option.text or '').strip()
yield opt_value
def add(self, item):
for option in self.options:
opt_value = option.get('value')
if opt_value is None:
opt_value = option.text or ''
if opt_value:
opt_value = opt_value.strip()
opt_value = (option.text or '').strip()
if opt_value == item:
option.set('selected', '')
break
@@ -1440,9 +1445,7 @@ class MultipleSelectOptions(SetMixin):
for option in self.options:
opt_value = option.get('value')
if opt_value is None:
opt_value = option.text or ''
if opt_value:
opt_value = opt_value.strip()
opt_value = (option.text or '').strip()
if opt_value == item:
if 'selected' in option.attrib:
del option.attrib['selected']


@@ -1,8 +1,10 @@
from __future__ import absolute_import
import optparse
import sys
import re
import os
from lxml.html.diff import htmldiff
from .diff import htmldiff
description = """\
"""
@@ -71,6 +73,7 @@ body_end_re = re.compile(
r"</body.*?>", re.I|re.S)
def split_body(html):
pre = post = ''
match = body_start_re.search(html)
if match:
pre = html[:match.end()]


@@ -1,4 +1,8 @@
from collections import MutableSet
try:
from collections.abc import MutableSet
except ImportError:
from collections import MutableSet
class SetMixin(MutableSet):

Binary file not shown.


@@ -1,19 +1,24 @@
# cython: language_level=2
"""A cleanup tool for HTML.
Removes unwanted tags and content. See the `Cleaner` class for
details.
"""
from __future__ import absolute_import
import re
import copy
try:
from urlparse import urlsplit
from urllib import unquote_plus
except ImportError:
# Python 3
from urllib.parse import urlsplit
from urllib.parse import urlsplit, unquote_plus
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import fromstring, XHTML_NAMESPACE
from lxml.html import xhtml_to_html, _transform_result
try:
@@ -26,11 +31,6 @@ try:
except NameError:
# Python 3
unicode = str
try:
bytes
except NameError:
# Python < 2.6
bytes = str
try:
basestring
except NameError:
@@ -95,6 +95,7 @@ _find_external_links = etree.XPath(
"descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
namespaces={'x':XHTML_NAMESPACE})
class Cleaner(object):
"""
Instances cleans the document of each of the possible offending
@@ -112,7 +113,10 @@ class Cleaner(object):
Removes any comments.
``style``:
Removes any style tags or attributes.
Removes any style tags.
``inline_style``
Removes any style attributes. Defaults to the value of the ``style`` option.
``links``:
Removes any ``<link>`` tags
@@ -191,6 +195,7 @@ class Cleaner(object):
javascript = True
comments = True
style = False
inline_style = None
links = True
meta = True
page_structure = True
@@ -207,7 +212,7 @@ class Cleaner(object):
safe_attrs = defs.safe_attrs
add_nofollow = False
host_whitelist = ()
whitelist_tags = set(['iframe', 'embed'])
whitelist_tags = {'iframe', 'embed'}
def __init__(self, **kw):
for name, value in kw.items():
@@ -215,6 +220,8 @@ class Cleaner(object):
raise TypeError(
"Unknown parameter: %s=%r" % (name, value))
setattr(self, name, value)
if self.inline_style is None and 'inline_style' not in kw:
self.inline_style = self.style
# Used to lookup the primary URL for a given tag that is up for
# removal:
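
The style option used to control both style tags and style attributes; the new inline_style option splits them and, when left unset, inherits the value of style so existing callers keep their behavior. A sketch (this hunk appears to be lxml.html.clean; assuming that module):

from lxml.html.clean import Cleaner

html = '<div style="color:red"><style>p {}</style><p>hi</p></div>'

# Default: inline_style follows style, so both are removed.
print(Cleaner(style=True).clean_html(html))
# expected: '<div><p>hi</p></div>'

# Now separable: drop <style> tags but keep style="..." attributes.
print(Cleaner(style=True, inline_style=False).clean_html(html))
# expected: '<div style="color:red"><p>hi</p></div>'
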
@@ -280,9 +287,9 @@ class Cleaner(object):
del attrib[aname]
doc.rewrite_links(self._remove_javascript_link,
resolve_base_href=False)
if not self.style:
# If we're deleting style then we don't have to remove JS links
# from styles, otherwise...
# If we're deleting style then we don't have to remove JS links
# from styles, otherwise...
if not self.inline_style:
for el in _find_styled_elements(doc):
old = el.get('style')
new = _css_javascript_re.sub('', old)
@@ -292,6 +299,7 @@ class Cleaner(object):
del el.attrib['style']
elif new != old:
el.set('style', new)
if not self.style:
for el in list(doc.iter('style')):
if el.get('type', '').lower().strip() == 'text/javascript':
el.drop_tree()
@@ -314,6 +322,7 @@ class Cleaner(object):
kill_tags.add(etree.ProcessingInstruction)
if self.style:
kill_tags.add('style')
if self.inline_style:
etree.strip_attributes(doc, 'style')
if self.links:
kill_tags.add('link')
@@ -473,7 +482,7 @@ class Cleaner(object):
def _remove_javascript_link(self, link):
# links like "j a v a s c r i p t:" might be interpreted in IE
new = _substitute_whitespace('', link)
new = _substitute_whitespace('', unquote_plus(link))
if _is_javascript_scheme(new):
# FIXME: should this be None to delete?
return ''
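
Decoding with unquote_plus() closes a bypass: a %-encoded scheme such as java%09script: survived the plain whitespace stripping. A self-contained sketch of the check (the two regexes below are simplified stand-ins for the module's private helpers, not copies):

import re
try:
    from urllib.parse import unquote_plus   # Python 3
except ImportError:
    from urllib import unquote_plus         # Python 2

_substitute_whitespace = re.compile(r'[\s\x00-\x19]+').sub
_is_javascript_scheme = re.compile(r'(?:javascript|jscript|vbscript):', re.I).search

def is_javascript_link(link):
    # Undo %-escapes first so 'java%09script:' cannot hide the scheme.
    return bool(_is_javascript_scheme(_substitute_whitespace('', unquote_plus(link))))

print(is_javascript_link('java\tscript:alert(1)'))     # True
print(is_javascript_link('java%09script%3Aalert(1)'))  # True only after unquoting
print(is_javascript_link('https://example.org/'))      # False
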
@@ -521,7 +530,7 @@ clean_html = clean.clean_html
_link_regexes = [
re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
# This is conservative, but autolinking can be a bit conservative:
re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
]
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
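
The old mailto host class read [a-z0-9_._] (a duplicated underscore and dot, and no hyphen at all), so addresses on hyphenated hosts never autolinked; [a-z0-9_.-] fixes that. A check using the new pattern:

import re

mailto_re = re.compile(
    r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I)

m = mailto_re.search('write to mailto:dev@my-project.example please')
print(m.group('host'))   # 'my-project.example', the hyphen now matches
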


@@ -6,12 +6,6 @@
# and http://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
# for html5_tags.
try:
frozenset
except NameError:
from sets import Set as frozenset
empty_tags = frozenset([
'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
'img', 'input', 'isindex', 'link', 'meta', 'param'])

Binary file not shown.


@@ -1,3 +1,7 @@
# cython: language_level=3
from __future__ import absolute_import
import difflib
from lxml import etree
from lxml.html import fragment_fromstring
@@ -382,7 +386,7 @@ def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
"""
while 1:
if not unbalanced_start:
# We have totally succeded in finding the position
# We have totally succeeded in finding the position
break
finding = unbalanced_start[0]
finding_name = finding.split()[0].strip('<>')
@@ -621,7 +625,7 @@ def fixup_chunks(chunks):
% (cur_word, result, chunk, chunks))
cur_word.post_tags.append(chunk)
else:
assert(0)
assert False
if not result:
return [token('', pre_tags=tag_accum)]
@@ -799,7 +803,6 @@ def _move_el_inside_block(el, tag):
if _contains_block_level_tag(child):
break
else:
import sys
# No block-level tags in any child
children_tag = etree.Element(tag)
children_tag.text = el.text


@@ -1,5 +1,5 @@
from lxml.etree import XPath, ElementBase
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import fromstring, XHTML_NAMESPACE
from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
from lxml.html import defs
import copy


@@ -1,15 +1,13 @@
"""
An interface to html5lib that mimics the lxml.html interface.
"""
import sys
import string
from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
from lxml import etree
from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
# python3 compatibility
try:
@@ -25,6 +23,7 @@ try:
except ImportError:
from urllib.parse import urlparse
class HTMLParser(_HTMLParser):
"""An html5lib HTML parser with lxml as tree."""
@@ -53,28 +52,13 @@ def _find_tag(tree, tag):
return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
def document_fromstring(html, guess_charset=True, parser=None):
"""Parse a whole document into a string."""
if not isinstance(html, _strings):
raise TypeError('string required')
def document_fromstring(html, guess_charset=None, parser=None):
"""
Parse a whole document into a string.
if parser is None:
parser = html_parser
return parser.parse(html, useChardet=guess_charset).getroot()
def fragments_fromstring(html, no_leading_text=False,
guess_charset=False, parser=None):
"""Parses several HTML elements, returning a list of elements.
The first item in the list may be a string. If no_leading_text is true,
then it will be an error if there is leading text, and it will always be
a list of only elements.
If `guess_charset` is `True` and the text was not unicode but a
bytestring, the `chardet` library will perform charset guessing on the
string.
If `guess_charset` is true, or if the input is not Unicode but a
byte string, the `chardet` library will perform charset guessing
on the string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
@@ -82,7 +66,41 @@ def fragments_fromstring(html, no_leading_text=False,
if parser is None:
parser = html_parser
children = parser.parseFragment(html, 'div', useChardet=guess_charset)
options = {}
if guess_charset is None and isinstance(html, bytes):
# html5lib does not accept useChardet as an argument, if it
# detected the html argument would produce unicode objects.
guess_charset = True
if guess_charset is not None:
options['useChardet'] = guess_charset
return parser.parse(html, **options).getroot()
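
Here guess_charset switches to a None default: byte input turns detection on, text input leaves it off, and useChardet is only forwarded to html5lib when a decision was actually made, since html5lib rejects the option for text input. Usage sketch (assuming html5lib, plus chardet for the byte case):

from lxml.html import html5parser

# Text input: useChardet is not passed at all.
root = html5parser.document_fromstring(u'<!DOCTYPE html><html><body>hi</body></html>')

# Byte input with the default guess_charset=None: detection is enabled.
root = html5parser.document_fromstring(b'<!DOCTYPE html><html><body>hi</body></html>')
print(root.tag)   # '{http://www.w3.org/1999/xhtml}html'
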
def fragments_fromstring(html, no_leading_text=False,
guess_charset=None, parser=None):
"""Parses several HTML elements, returning a list of elements.
The first item in the list may be a string. If no_leading_text is true,
then it will be an error if there is leading text, and it will always be
a list of only elements.
If `guess_charset` is true, the `chardet` library will perform charset
guessing on the string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
if parser is None:
parser = html_parser
options = {}
if guess_charset is None and isinstance(html, bytes):
# html5lib does not accept useChardet as an argument, if it
# detected the html argument would produce unicode objects.
guess_charset = False
if guess_charset is not None:
options['useChardet'] = guess_charset
children = parser.parseFragment(html, 'div', **options)
if children and isinstance(children[0], _strings):
if no_leading_text:
if children[0].strip():
@@ -93,14 +111,17 @@
def fragment_fromstring(html, create_parent=False,
guess_charset=False, parser=None):
guess_charset=None, parser=None):
"""Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
element.
If create_parent is true (or is a tag name) then a parent node
If 'create_parent' is true (or is a tag name) then a parent node
will be created to encapsulate the HTML in a single element. In
this case, leading or trailing text is allowed.
If `guess_charset` is true, the `chardet` library will perform charset
guessing on the string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
@@ -133,13 +154,18 @@ def fragment_fromstring(html, create_parent=False,
return result
def fromstring(html, guess_charset=True, parser=None):
def fromstring(html, guess_charset=None, parser=None):
"""Parse the html, returning a single element/document.
This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.
base_url will set the document's base_url attribute (and the tree's docinfo.URL)
'base_url' will set the document's base_url attribute (and the tree's
docinfo.URL)
If `guess_charset` is true, or if the input is not Unicode but a
byte string, the `chardet` library will perform charset guessing
on the string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
@@ -147,7 +173,14 @@ def fromstring(html, guess_charset=True, parser=None):
guess_charset=guess_charset)
# document starts with doctype or <html>, full document!
start = html[:50].lstrip().lower()
start = html[:50]
if isinstance(start, bytes):
# Allow text comparison in python3.
# Decode as ascii, that also covers latin-1 and utf-8 for the
# characters we need.
start = start.decode('ascii', 'replace')
start = start.lstrip().lower()
if start.startswith('<html') or start.startswith('<!doctype'):
return doc
@@ -175,20 +208,40 @@
return body
def parse(filename_url_or_file, guess_charset=True, parser=None):
def parse(filename_url_or_file, guess_charset=None, parser=None):
"""Parse a filename, URL, or file-like object into an HTML document
tree. Note: this returns a tree, not an element. Use
``parse(...).getroot()`` to get the document root.
If ``guess_charset`` is true, the ``useChardet`` option is passed into
html5lib to enable character detection. This option is on by default
when parsing from URLs, off by default when parsing from file(-like)
objects (which tend to return Unicode more often than not), and on by
default when parsing from a file path (which is read in binary mode).
"""
if parser is None:
parser = html_parser
if not isinstance(filename_url_or_file, _strings):
fp = filename_url_or_file
if guess_charset is None:
# assume that file-like objects return Unicode more often than bytes
guess_charset = False
elif _looks_like_url(filename_url_or_file):
fp = urlopen(filename_url_or_file)
if guess_charset is None:
# assume that URLs return bytes
guess_charset = True
else:
fp = open(filename_url_or_file, 'rb')
return parser.parse(fp, useChardet=guess_charset)
if guess_charset is None:
guess_charset = True
options = {}
# html5lib does not accept useChardet as an argument, if it
# detected the html argument would produce unicode objects.
if guess_charset:
options['useChardet'] = guess_charset
return parser.parse(fp, **options)
def _looks_like_url(str):
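
The same None default gets a three-way resolution in parse(): file-like objects default to no detection, while URLs and file paths (opened in binary mode) default to detection. Sketch (assuming html5lib is installed):

import io
from lxml.html import html5parser

# A file-like object defaults to guess_charset=False; note that parse()
# returns a tree, so use .getroot() for the element.
tree = html5parser.parse(io.StringIO(u'<html><body>hi</body></html>'))
print(tree.getroot().tag)   # '{http://www.w3.org/1999/xhtml}html'

# Paths and URLs default to guess_charset=True; override explicitly if needed:
# tree = html5parser.parse('page.html', guess_charset=False)
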


@@ -9,12 +9,12 @@ from lxml import etree, html
try:
from bs4 import (
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
Declaration, CData, Doctype)
Declaration, Doctype)
_DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
except ImportError:
from BeautifulSoup import (
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
Declaration, CData)
Declaration)
_DECLARATION_OR_DOCTYPE = Declaration
@@ -74,7 +74,7 @@ def _parse(source, beautifulsoup, makeelement, **bsargs):
bsargs['convertEntities'] = 'html'
if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
if 'features' not in bsargs:
bsargs['features'] = ['html.parser'] # use Python html parser
bsargs['features'] = 'html.parser' # use Python html parser
tree = beautifulsoup(source, **bsargs)
root = _convert_tree(tree, makeelement)
# from ET: wrap the document in a html root element, if necessary
@@ -129,9 +133,13 @@ def _convert_tree(beautiful_soup_tree, makeelement):
# may be a soup like '<meta><head><title>Hello</head><body>Hi
# all<\p>'. In this example roots is a list containing meta, head
# and body elements.
pre_root = beautiful_soup_tree.contents[:first_element_idx]
roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
post_root = beautiful_soup_tree.contents[last_element_idx+1:]
if first_element_idx is None:
pre_root = post_root = []
roots = beautiful_soup_tree.contents
else:
pre_root = beautiful_soup_tree.contents[:first_element_idx]
roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
post_root = beautiful_soup_tree.contents[last_element_idx+1:]
# Reorganize so that there is one <html> root...
if html_root is not None:
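
The new branch covers soups with no element children at all (only strings or comments): first_element_idx stays None, and the old slicing then produced an empty roots list. A sketch of the now-working case (assuming BeautifulSoup is installed):

from lxml.html import soupparser

# Input with no tags used to confuse the root-slicing logic; it now
# parses into a wrapping <html> element without error.
root = soupparser.fromstring('just some text, no tags here')
print(root.tag)              # expected: 'html'
print(root.text_content())   # expected: 'just some text, no tags here'
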
@@ -255,7 +259,7 @@ def _init_node_converters(makeelement):
@converter(Comment)
def convert_comment(bs_node, parent):
res = etree.Comment(bs_node)
res = html.HtmlComment(bs_node)
if parent is not None:
parent.append(res)
return res
@@ -288,7 +292,14 @@ except ImportError:
from htmlentitydefs import name2codepoint
handle_entities = re.compile("&(\w+);").sub
handle_entities = re.compile(r"&(\w+);").sub
try:
unichr
except NameError:
# Python 3
unichr = chr
def unescape(string):