openmedialibrary_platform/Shared/lib/python3.7/site-packages/html5lib/treewalkers/etree.py

131 lines
4.4 KiB
Python
Raw Normal View History

2013-10-11 17:28:32 +00:00
from __future__ import absolute_import, division, unicode_literals
2018-12-15 00:08:54 +00:00
from collections import OrderedDict
2013-10-11 17:28:32 +00:00
import re
2015-11-04 12:01:55 +00:00
from six import string_types
2013-10-11 17:28:32 +00:00
2018-12-15 00:08:54 +00:00
from . import base
from .._utils import moduleFactoryFactory
2013-10-11 17:28:32 +00:00
# Matches names written as "{namespace}tag"; group 1 is the text inside the
# braces and group 2 is everything after the closing brace.
tag_regexp = re.compile("{([^}]*)}(.*)")
def getETreeBuilder(ElementTreeImplementation):
    """Create the tree-walker class for one ElementTree implementation.

    ``ElementTreeImplementation`` must expose ``Comment``: the ``tag`` of a
    throwaway comment element is used to recognise comment nodes.

    Returns ``locals()`` of this function, so the mapping contains
    ``TreeWalker`` together with the other names bound here.
    """
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag

    class TreeWalker(base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
        """Walk an ElementTree without recursion.

        Apart from the root (which may be passed around as a bare element),
        every "node" handled by this walker is a 4-tuple:

        1. the current element;
        2. the index of that element inside its parent;
        3. a stack (list) of ancestor elements, shared and mutated in place
           as the walk descends and ascends;
        4. a flag, one of "text", "tail" or None, saying whether the node is
           actually a text node: the ``text`` or the ``tail`` of the element
           in slot 1.
        """

        def getNodeDetails(self, node):
            """Return the ``base`` token-description tuple for ``node``."""
            if isinstance(node, tuple):  # otherwise it may be the root Element
                element, _, _, kind = node
                if kind in ("text", "tail"):
                    # Text nodes live in the element's .text / .tail attribute.
                    return base.TEXT, getattr(element, kind)
                node = element

            # An object without a tag is taken to be a tree wrapper; use its root.
            if not hasattr(node, "tag"):
                node = node.getroot()

            raw_tag = node.tag
            if raw_tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
                return (base.DOCUMENT,)
            if raw_tag == "<!DOCTYPE>":
                return (base.DOCTYPE, node.text,
                        node.get("publicId"), node.get("systemId"))
            if raw_tag == ElementTreeCommentType:
                return base.COMMENT, node.text

            # Anything left is assumed to be an ordinary element.
            assert isinstance(raw_tag, string_types), type(raw_tag)
            qualified = tag_regexp.match(raw_tag)
            namespace, tag = qualified.groups() if qualified else (None, raw_tag)

            # Attribute names get the same "{namespace}name" split as tags.
            attrs = OrderedDict()
            for name, value in node.attrib.items():
                qualified = tag_regexp.match(name)
                if qualified:
                    attr_key = (qualified.group(1), qualified.group(2))
                else:
                    attr_key = (None, name)
                attrs[attr_key] = value

            # Last slot is truthy when the element has children or text.
            return (base.ELEMENT, namespace, tag,
                    attrs, len(node) or node.text)

        def getFirstChild(self, node):
            """Return the first child node of ``node``, or None if it has none."""
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                # Bare root element: start a fresh ancestor stack.
                element, key, parents, flag = node, None, [], None

            if flag in ("text", "tail"):
                return None  # text nodes never have children
            if element.text:
                # Leading text comes before any child element.
                return element, key, parents, "text"
            if len(element):
                parents.append(element)
                return element[0], 0, parents, None
            return None

        def getNextSibling(self, node):
            """Return the node following ``node`` at the same level, or None."""
            if not isinstance(node, tuple):
                return None
            element, key, parents, flag = node

            if flag == "text":
                # After an element's text comes its first child element, if any.
                if not len(element):
                    return None
                parents.append(element)
                return element[0], 0, parents, None

            # After an element comes its tail text, unless we are that tail.
            if element.tail and flag != "tail":
                return element, key, parents, "tail"
            if key < len(parents[-1]) - 1:
                return parents[-1][key + 1], key + 1, parents, None
            return None

        def getParentNode(self, node):
            """Return the parent node of ``node``, or None for a bare element."""
            if not isinstance(node, tuple):
                return None
            element, key, parents, flag = node

            if flag == "text":
                # The parent of an element's text is the element itself.
                return (element, key, parents, None) if parents else element

            parent = parents.pop()
            if not parents:
                return parent  # the root is represented as a bare element
            siblings = list(parents[-1])
            assert siblings.count(parent) == 1
            return parent, siblings.index(parent), parents, None

    return locals()
# Factory obtained by passing getETreeBuilder to moduleFactoryFactory
# (imported from .._utils).
# NOTE(review): presumably builds/caches one module per ElementTree
# implementation — confirm against _utils.moduleFactoryFactory.
getETreeModule = moduleFactoryFactory(getETreeBuilder)