openmedialibrary_platform/Shared/lib/python3.7/site-packages/html5lib/html5parser.py

2792 lines
116 KiB
Python
Raw Normal View History

2013-10-11 17:28:32 +00:00
from __future__ import absolute_import, division, unicode_literals
2018-12-15 00:08:54 +00:00
from six import with_metaclass, viewkeys
2013-10-11 17:28:32 +00:00
import types
2018-12-15 00:08:54 +00:00
from collections import OrderedDict
2013-10-11 17:28:32 +00:00
2018-12-15 00:08:54 +00:00
from . import _inputstream
from . import _tokenizer
2013-10-11 17:28:32 +00:00
from . import treebuilders
2018-12-15 00:08:54 +00:00
from .treebuilders.base import Marker
from . import _utils
from .constants import (
spaceCharacters, asciiUpper2Lower,
specialElements, headingElements, cdataElements, rcdataElements,
tokenTypes, tagTokenTypes,
namespaces,
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
adjustForeignAttributes as adjustForeignAttributesMap,
adjustMathMLAttributes, adjustSVGAttributes,
E,
_ReparseException
)
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse an HTML document as a string or file-like object into a tree
:arg doc: the document to parse as a string or file-like object
:arg treebuilder: the treebuilder to use when parsing
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:returns: parsed tree
Example:
>>> from html5lib.html5parser import parse
>>> parse('<html><body><p>This is a doc</p></body></html>')
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
"""
2013-10-11 17:28:32 +00:00
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
2018-12-15 00:08:54 +00:00
return p.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse an HTML fragment as a string or file-like object into a tree
:arg doc: the fragment to parse as a string or file-like object
2013-10-11 17:28:32 +00:00
2018-12-15 00:08:54 +00:00
:arg container: the container context to parse the fragment in
2013-10-11 17:28:32 +00:00
2018-12-15 00:08:54 +00:00
:arg treebuilder: the treebuilder to use when parsing
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:returns: parsed tree
Example:
>>> from html5lib.html5libparser import parseFragment
>>> parseFragment('<b>this is a fragment</b>')
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
"""
2013-10-11 17:28:32 +00:00
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
2018-12-15 00:08:54 +00:00
return p.parseFragment(doc, container=container, **kwargs)
2013-10-11 17:28:32 +00:00
def method_decorator_metaclass(function):
class Decorated(type):
def __new__(meta, classname, bases, classDict):
for attributeName, attribute in classDict.items():
if isinstance(attribute, types.FunctionType):
attribute = function(attribute)
classDict[attributeName] = attribute
return type.__new__(meta, classname, bases, classDict)
return Decorated
class HTMLParser(object):
2018-12-15 00:08:54 +00:00
"""HTML parser
Generates a tree structure from a stream of (possibly malformed) HTML.
2013-10-11 17:28:32 +00:00
2018-12-15 00:08:54 +00:00
"""
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
2013-10-11 17:28:32 +00:00
"""
2018-12-15 00:08:54 +00:00
:arg tree: a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
:arg strict: raise an exception when a parse error is encountered
:arg namespaceHTMLElements: whether or not to namespace HTML elements
2013-10-11 17:28:32 +00:00
2018-12-15 00:08:54 +00:00
:arg debug: whether or not to enable debug mode which logs things
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser() # generates parser with etree builder
>>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
2013-10-11 17:28:32 +00:00
"""
# Raise an exception on the first error encountered
self.strict = strict
if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
self.errors = []
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()])
2018-12-15 00:08:54 +00:00
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
2013-10-11 17:28:32 +00:00
self.innerHTMLMode = innerHTML
self.container = container
2018-12-15 00:08:54 +00:00
self.scripting = scripting
self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
2013-10-11 17:28:32 +00:00
self.reset()
2018-12-15 00:08:54 +00:00
try:
self.mainLoop()
except _ReparseException:
self.reset()
self.mainLoop()
2013-10-11 17:28:32 +00:00
def reset(self):
self.tree.reset()
self.firstStartTag = False
self.errors = []
self.log = [] # only used with debug mode
# "quirks" / "limited quirks" / "no quirks"
self.compatMode = "no quirks"
if self.innerHTMLMode:
self.innerHTML = self.container.lower()
if self.innerHTML in cdataElements:
self.tokenizer.state = self.tokenizer.rcdataState
elif self.innerHTML in rcdataElements:
self.tokenizer.state = self.tokenizer.rawtextState
elif self.innerHTML == 'plaintext':
self.tokenizer.state = self.tokenizer.plaintextState
else:
# state already is data state
# self.tokenizer.state = self.tokenizer.dataState
pass
self.phase = self.phases["beforeHtml"]
self.phase.insertHtmlElement()
self.resetInsertionMode()
else:
2018-12-15 00:08:54 +00:00
self.innerHTML = False # pylint:disable=redefined-variable-type
2013-10-11 17:28:32 +00:00
self.phase = self.phases["initial"]
self.lastPhase = None
self.beforeRCDataPhase = None
self.framesetOK = True
2015-11-04 12:01:55 +00:00
@property
def documentEncoding(self):
2018-12-15 00:08:54 +00:00
"""Name of the character encoding that was used to decode the input stream, or
:obj:`None` if that is not determined yet
2015-11-04 12:01:55 +00:00
"""
if not hasattr(self, 'tokenizer'):
return None
2018-12-15 00:08:54 +00:00
return self.tokenizer.stream.charEncoding[0].name
2015-11-04 12:01:55 +00:00
2013-10-11 17:28:32 +00:00
def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and
element.namespace == namespaces["mathml"]):
return ("encoding" in element.attributes and
element.attributes["encoding"].translate(
asciiUpper2Lower) in
("text/html", "application/xhtml+xml"))
else:
return (element.namespace, element.name) in htmlIntegrationPointElements
def isMathMLTextIntegrationPoint(self, element):
return (element.namespace, element.name) in mathmlTextIntegrationPointElements
def mainLoop(self):
CharactersToken = tokenTypes["Characters"]
SpaceCharactersToken = tokenTypes["SpaceCharacters"]
StartTagToken = tokenTypes["StartTag"]
EndTagToken = tokenTypes["EndTag"]
CommentToken = tokenTypes["Comment"]
DoctypeToken = tokenTypes["Doctype"]
ParseErrorToken = tokenTypes["ParseError"]
for token in self.normalizedTokens():
2018-12-15 00:08:54 +00:00
prev_token = None
2013-10-11 17:28:32 +00:00
new_token = token
while new_token is not None:
2018-12-15 00:08:54 +00:00
prev_token = new_token
2013-10-11 17:28:32 +00:00
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
currentNodeNamespace = currentNode.namespace if currentNode else None
currentNodeName = currentNode.name if currentNode else None
type = new_token["type"]
if type == ParseErrorToken:
self.parseError(new_token["data"], new_token.get("datavars", {}))
new_token = None
else:
if (len(self.tree.openElements) == 0 or
currentNodeNamespace == self.tree.defaultNamespace or
(self.isMathMLTextIntegrationPoint(currentNode) and
((type == StartTagToken and
token["name"] not in frozenset(["mglyph", "malignmark"])) or
type in (CharactersToken, SpaceCharactersToken))) or
(currentNodeNamespace == namespaces["mathml"] and
currentNodeName == "annotation-xml" and
2018-12-15 00:08:54 +00:00
type == StartTagToken and
2013-10-11 17:28:32 +00:00
token["name"] == "svg") or
(self.isHTMLIntegrationPoint(currentNode) and
type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
phase = self.phase
else:
phase = self.phases["inForeignContent"]
if type == CharactersToken:
new_token = phase.processCharacters(new_token)
elif type == SpaceCharactersToken:
new_token = phase.processSpaceCharacters(new_token)
elif type == StartTagToken:
new_token = phase.processStartTag(new_token)
elif type == EndTagToken:
new_token = phase.processEndTag(new_token)
elif type == CommentToken:
new_token = phase.processComment(new_token)
elif type == DoctypeToken:
new_token = phase.processDoctype(new_token)
2018-12-15 00:08:54 +00:00
if (type == StartTagToken and prev_token["selfClosing"] and
not prev_token["selfClosingAcknowledged"]):
2013-10-11 17:28:32 +00:00
self.parseError("non-void-element-with-trailing-solidus",
2018-12-15 00:08:54 +00:00
{"name": prev_token["name"]})
2013-10-11 17:28:32 +00:00
# When the loop finishes it's EOF
reprocess = True
phases = []
while reprocess:
phases.append(self.phase)
reprocess = self.phase.processEOF()
if reprocess:
assert self.phase not in phases
def normalizedTokens(self):
for token in self.tokenizer:
yield self.normalizeToken(token)
2018-12-15 00:08:54 +00:00
def parse(self, stream, *args, **kwargs):
2013-10-11 17:28:32 +00:00
"""Parse a HTML document into a well-formed tree
2018-12-15 00:08:54 +00:00
:arg stream: a file-like object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element).
:arg scripting: treat noscript elements as if JavaScript was turned on
:returns: parsed tree
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser()
>>> parser.parse('<html><body><p>This is a doc</p></body></html>')
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
2013-10-11 17:28:32 +00:00
"""
2018-12-15 00:08:54 +00:00
self._parse(stream, False, None, *args, **kwargs)
2013-10-11 17:28:32 +00:00
return self.tree.getDocument()
2018-12-15 00:08:54 +00:00
def parseFragment(self, stream, *args, **kwargs):
2013-10-11 17:28:32 +00:00
"""Parse a HTML fragment into a well-formed tree fragment
2018-12-15 00:08:54 +00:00
:arg container: name of the element we're setting the innerHTML
property if set to None, default to 'div'
:arg stream: a file-like object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
2013-10-11 17:28:32 +00:00
2018-12-15 00:08:54 +00:00
:arg scripting: treat noscript elements as if JavaScript was turned on
:returns: parsed tree
Example:
>>> from html5lib.html5libparser import HTMLParser
>>> parser = HTMLParser()
>>> parser.parseFragment('<b>this is a fragment</b>')
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
2013-10-11 17:28:32 +00:00
"""
2018-12-15 00:08:54 +00:00
self._parse(stream, True, *args, **kwargs)
2013-10-11 17:28:32 +00:00
return self.tree.getFragment()
2018-12-15 00:08:54 +00:00
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
2013-10-11 17:28:32 +00:00
# XXX The idea is to make errorcode mandatory.
2018-12-15 00:08:54 +00:00
if datavars is None:
datavars = {}
2013-10-11 17:28:32 +00:00
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict:
2015-11-04 12:01:55 +00:00
raise ParseError(E[errorcode] % datavars)
2013-10-11 17:28:32 +00:00
def normalizeToken(self, token):
2018-12-15 00:08:54 +00:00
# HTML5 specific normalizations to the token stream
2013-10-11 17:28:32 +00:00
if token["type"] == tokenTypes["StartTag"]:
2018-12-15 00:08:54 +00:00
raw = token["data"]
token["data"] = OrderedDict(raw)
if len(raw) > len(token["data"]):
# we had some duplicated attribute, fix so first wins
token["data"].update(raw[::-1])
2013-10-11 17:28:32 +00:00
return token
def adjustMathMLAttributes(self, token):
2018-12-15 00:08:54 +00:00
adjust_attributes(token, adjustMathMLAttributes)
2013-10-11 17:28:32 +00:00
def adjustSVGAttributes(self, token):
2018-12-15 00:08:54 +00:00
adjust_attributes(token, adjustSVGAttributes)
2013-10-11 17:28:32 +00:00
def adjustForeignAttributes(self, token):
2018-12-15 00:08:54 +00:00
adjust_attributes(token, adjustForeignAttributesMap)
2013-10-11 17:28:32 +00:00
def reparseTokenNormal(self, token):
2018-12-15 00:08:54 +00:00
# pylint:disable=unused-argument
2013-10-11 17:28:32 +00:00
self.parser.phase()
def resetInsertionMode(self):
# The name of this method is mostly historical. (It's also used in the
# specification.)
last = False
newModes = {
"select": "inSelect",
"td": "inCell",
"th": "inCell",
"tr": "inRow",
"tbody": "inTableBody",
"thead": "inTableBody",
"tfoot": "inTableBody",
"caption": "inCaption",
"colgroup": "inColumnGroup",
"table": "inTable",
"head": "inBody",
"body": "inBody",
"frameset": "inFrameset",
"html": "beforeHead"
}
for node in self.tree.openElements[::-1]:
nodeName = node.name
new_phase = None
if node == self.tree.openElements[0]:
assert self.innerHTML
last = True
nodeName = self.innerHTML
# Check for conditions that should only happen in the innerHTML
# case
if nodeName in ("select", "colgroup", "head", "html"):
assert self.innerHTML
if not last and node.namespace != self.tree.defaultNamespace:
continue
if nodeName in newModes:
new_phase = self.phases[newModes[nodeName]]
break
elif last:
new_phase = self.phases["inBody"]
break
self.phase = new_phase
def parseRCDataRawtext(self, token, contentType):
2018-12-15 00:08:54 +00:00
# Generic RCDATA/RAWTEXT Parsing algorithm
2013-10-11 17:28:32 +00:00
assert contentType in ("RAWTEXT", "RCDATA")
self.tree.insertElement(token)
if contentType == "RAWTEXT":
self.tokenizer.state = self.tokenizer.rawtextState
else:
self.tokenizer.state = self.tokenizer.rcdataState
self.originalPhase = self.phase
self.phase = self.phases["text"]
2018-12-15 00:08:54 +00:00
@_utils.memoize
2013-10-11 17:28:32 +00:00
def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in
2018-12-15 00:08:54 +00:00
tokenTypes.items())
2013-10-11 17:28:32 +00:00
def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
token = args[0]
try:
info = {"type": type_names[token['type']]}
except:
raise
2018-12-15 00:08:54 +00:00
if token['type'] in tagTokenTypes:
2013-10-11 17:28:32 +00:00
info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__,
self.parser.phase.__class__.__name__,
self.__class__.__name__,
function.__name__,
info))
return function(self, *args, **kwargs)
else:
return function(self, *args, **kwargs)
return wrapped
def getMetaclass(use_metaclass, metaclass_func):
if use_metaclass:
return method_decorator_metaclass(metaclass_func)
else:
return type
2018-12-15 00:08:54 +00:00
# pylint:disable=unused-argument
2013-10-11 17:28:32 +00:00
class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing
"""
def __init__(self, parser, tree):
self.parser = parser
self.tree = tree
def processEOF(self):
raise NotImplementedError
def processComment(self, token):
# For most phases the following is correct. Where it's not it will be
# overridden.
self.tree.insertComment(token, self.tree.openElements[-1])
def processDoctype(self, token):
self.parser.parseError("unexpected-doctype")
def processCharacters(self, token):
self.tree.insertText(token["data"])
def processSpaceCharacters(self, token):
self.tree.insertText(token["data"])
def processStartTag(self, token):
return self.startTagHandler[token["name"]](token)
def startTagHtml(self, token):
if not self.parser.firstStartTag and token["name"] == "html":
self.parser.parseError("non-html-root")
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError().
for attr, value in token["data"].items():
if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False
def processEndTag(self, token):
return self.endTagHandler[token["name"]](token)
class InitialPhase(Phase):
def processSpaceCharacters(self, token):
pass
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
correct = token["correct"]
if (name != "html" or publicId is not None or
systemId is not None and systemId != "about:legacy-compat"):
self.parser.parseError("unknown-doctype")
if publicId is None:
publicId = ""
self.tree.insertDoctype(token)
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
2018-12-15 00:08:54 +00:00
if (not correct or token["name"] != "html" or
publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//",
"-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//")) or
publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
"-/w3c/dtd html 4.0 transitional/en",
"html") or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is None or
systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
2013-10-11 17:28:32 +00:00
self.parser.compatMode = "quirks"
elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//",
2018-12-15 00:08:54 +00:00
"-//w3c//dtd xhtml 1.0 transitional//")) or
publicId.startswith(
2013-10-11 17:28:32 +00:00
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is not None):
self.parser.compatMode = "limited quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
def anythingElse(self):
self.parser.compatMode = "quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
def processCharacters(self, token):
self.parser.parseError("expected-doctype-but-got-chars")
self.anythingElse()
return token
def processStartTag(self, token):
self.parser.parseError("expected-doctype-but-got-start-tag",
{"name": token["name"]})
self.anythingElse()
return token
def processEndTag(self, token):
self.parser.parseError("expected-doctype-but-got-end-tag",
{"name": token["name"]})
self.anythingElse()
return token
def processEOF(self):
self.parser.parseError("expected-doctype-but-got-eof")
self.anythingElse()
return True
class BeforeHtmlPhase(Phase):
# helper methods
def insertHtmlElement(self):
self.tree.insertRoot(impliedTagToken("html", "StartTag"))
self.parser.phase = self.parser.phases["beforeHead"]
# other
def processEOF(self):
self.insertHtmlElement()
return True
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processSpaceCharacters(self, token):
pass
def processCharacters(self, token):
self.insertHtmlElement()
return token
def processStartTag(self, token):
if token["name"] == "html":
self.parser.firstStartTag = True
self.insertHtmlElement()
return token
def processEndTag(self, token):
if token["name"] not in ("head", "body", "html", "br"):
self.parser.parseError("unexpected-end-tag-before-html",
{"name": token["name"]})
else:
self.insertHtmlElement()
return token
class BeforeHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
(("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.startTagHead(impliedTagToken("head", "StartTag"))
return True
def processSpaceCharacters(self, token):
pass
def processCharacters(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagHead(self, token):
self.tree.insertElement(token)
self.tree.headPointer = self.tree.openElements[-1]
self.parser.phase = self.parser.phases["inHead"]
def startTagOther(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def endTagImplyHead(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def endTagOther(self, token):
self.parser.parseError("end-tag-after-implied-root",
{"name": token["name"]})
class InHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("title", self.startTagTitle),
2018-12-15 00:08:54 +00:00
(("noframes", "style"), self.startTagNoFramesStyle),
("noscript", self.startTagNoscript),
2013-10-11 17:28:32 +00:00
("script", self.startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
self.startTagBaseLinkCommand),
("meta", self.startTagMeta),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("head", self.endTagHead),
(("br", "html", "body"), self.endTagHtmlBodyBr)
])
self.endTagHandler.default = self.endTagOther
# the real thing
def processEOF(self):
self.anythingElse()
return True
def processCharacters(self, token):
self.anythingElse()
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagHead(self, token):
self.parser.parseError("two-heads-are-not-better-than-one")
def startTagBaseLinkCommand(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagMeta(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
attributes = token["data"]
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
elif ("content" in attributes and
"http-equiv" in attributes and
attributes["http-equiv"].lower() == "content-type"):
# Encoding it as UTF-8 here is a hack, as really we should pass
# the abstract Unicode string, and just use the
# ContentAttrParser on that, but using UTF-8 allows all chars
# to be encoded and as a ASCII-superset works.
2018-12-15 00:08:54 +00:00
data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = _inputstream.ContentAttrParser(data)
2013-10-11 17:28:32 +00:00
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
self.parser.parseRCDataRawtext(token, "RCDATA")
2018-12-15 00:08:54 +00:00
def startTagNoFramesStyle(self, token):
2013-10-11 17:28:32 +00:00
# Need to decide whether to implement the scripting-disabled case
self.parser.parseRCDataRawtext(token, "RAWTEXT")
2018-12-15 00:08:54 +00:00
def startTagNoscript(self, token):
if self.parser.scripting:
self.parser.parseRCDataRawtext(token, "RAWTEXT")
else:
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inHeadNoscript"]
2013-10-11 17:28:32 +00:00
def startTagScript(self, token):
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
self.parser.originalPhase = self.parser.phase
self.parser.phase = self.parser.phases["text"]
def startTagOther(self, token):
self.anythingElse()
return token
def endTagHead(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "head", "Expected head got %s" % node.name
self.parser.phase = self.parser.phases["afterHead"]
def endTagHtmlBodyBr(self, token):
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
self.endTagHead(impliedTagToken("head"))
2018-12-15 00:08:54 +00:00
class InHeadNoscriptPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
(("head", "noscript"), self.startTagHeadNoscript),
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("noscript", self.endTagNoscript),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.parser.parseError("eof-in-head-noscript")
self.anythingElse()
return True
def processComment(self, token):
return self.parser.phases["inHead"].processComment(token)
def processCharacters(self, token):
self.parser.parseError("char-in-head-noscript")
self.anythingElse()
return token
def processSpaceCharacters(self, token):
return self.parser.phases["inHead"].processSpaceCharacters(token)
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBaseLinkCommand(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagHeadNoscript(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagNoscript(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "noscript", "Expected noscript got %s" % node.name
self.parser.phase = self.parser.phases["inHead"]
def endTagBr(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
# Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript"))
2013-10-11 17:28:32 +00:00
class AfterHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
"style", "title"),
self.startTagFromHead),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
self.endTagHtmlBodyBr)])
2013-10-11 17:28:32 +00:00
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.anythingElse()
return True
def processCharacters(self, token):
self.anythingElse()
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBody(self, token):
self.parser.framesetOK = False
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inBody"]
def startTagFrameset(self, token):
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inFrameset"]
def startTagFromHead(self, token):
self.parser.parseError("unexpected-start-tag-out-of-my-head",
{"name": token["name"]})
self.tree.openElements.append(self.tree.headPointer)
self.parser.phases["inHead"].processStartTag(token)
for node in self.tree.openElements[::-1]:
if node.name == "head":
self.tree.openElements.remove(node)
break
def startTagHead(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.anythingElse()
return token
def endTagHtmlBodyBr(self, token):
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
self.tree.insertElement(impliedTagToken("body", "StartTag"))
self.parser.phase = self.parser.phases["inBody"]
self.parser.framesetOK = True
class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# the really-really-really-very crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
# Set this to the default handler
self.processSpaceCharacters = self.processSpaceCharactersNonPre
2013-10-11 17:28:32 +00:00
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
2015-11-04 12:01:55 +00:00
"script", "style", "title"),
2013-10-11 17:28:32 +00:00
self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
2018-12-15 00:08:54 +00:00
"dir", "div", "dl", "fieldset", "figcaption", "figure",
2013-10-11 17:28:32 +00:00
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
self.startTagCloseP),
(headingElements, self.startTagHeading),
(("pre", "listing"), self.startTagPreListing),
("form", self.startTagForm),
(("li", "dd", "dt"), self.startTagListItem),
("plaintext", self.startTagPlaintext),
("a", self.startTagA),
(("b", "big", "code", "em", "font", "i", "s", "small", "strike",
"strong", "tt", "u"), self.startTagFormatting),
("nobr", self.startTagNobr),
("button", self.startTagButton),
(("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
("xmp", self.startTagXmp),
("table", self.startTagTable),
(("area", "br", "embed", "img", "keygen", "wbr"),
self.startTagVoidFormatting),
(("param", "source", "track"), self.startTagParamSource),
("input", self.startTagInput),
("hr", self.startTagHr),
("image", self.startTagImage),
("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame),
2018-12-15 00:08:54 +00:00
("noscript", self.startTagNoscript),
(("noembed", "noframes"), self.startTagRawtext),
2013-10-11 17:28:32 +00:00
("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt),
(("math"), self.startTagMath),
(("svg"), self.startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
"tr"), self.startTagMisplaced)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("body", self.endTagBody),
("html", self.endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
"details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
"section", "summary", "ul"), self.endTagBlock),
("form", self.endTagForm),
("p", self.endTagP),
(("dd", "dt", "li"), self.endTagListItem),
(headingElements, self.endTagHeading),
(("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), self.endTagFormatting),
(("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def isMatchingFormattingElement(self, node1, node2):
2018-12-15 00:08:54 +00:00
return (node1.name == node2.name and
node1.namespace == node2.namespace and
node1.attributes == node2.attributes)
2013-10-11 17:28:32 +00:00
# helper
def addFormattingElement(self, token):
self.tree.insertElement(token)
element = self.tree.openElements[-1]
matchingElements = []
for node in self.tree.activeFormattingElements[::-1]:
if node is Marker:
break
elif self.isMatchingFormattingElement(node, element):
matchingElements.append(node)
assert len(matchingElements) <= 3
if len(matchingElements) == 3:
self.tree.activeFormattingElements.remove(matchingElements[-1])
self.tree.activeFormattingElements.append(element)
# the real deal
def processEOF(self):
allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
"tfoot", "th", "thead", "tr", "body",
"html"))
for node in self.tree.openElements[::-1]:
if node.name not in allowed_elements:
self.parser.parseError("expected-closing-tag-but-got-eof")
break
# Stop parsing
def processSpaceCharactersDropNewline(self, token):
# Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
# want to drop leading newlines
data = token["data"]
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and
2018-12-15 00:08:54 +00:00
self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
not self.tree.openElements[-1].hasContent()):
2013-10-11 17:28:32 +00:00
data = data[1:]
if data:
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(data)
def processCharacters(self, token):
if token["data"] == "\u0000":
# The tokenizer should always emit null on its own
return
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
# This must be bad for performance
if (self.parser.framesetOK and
any([char not in spaceCharacters
for char in token["data"]])):
self.parser.framesetOK = False
2018-12-15 00:08:54 +00:00
def processSpaceCharactersNonPre(self, token):
2013-10-11 17:28:32 +00:00
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
def startTagProcessInHead(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagBody(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "body"})
2018-12-15 00:08:54 +00:00
if (len(self.tree.openElements) == 1 or
self.tree.openElements[1].name != "body"):
2013-10-11 17:28:32 +00:00
assert self.parser.innerHTML
else:
self.parser.framesetOK = False
for attr, value in token["data"].items():
if attr not in self.tree.openElements[1].attributes:
self.tree.openElements[1].attributes[attr] = value
def startTagFrameset(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
elif not self.parser.framesetOK:
pass
else:
if self.tree.openElements[1].parent:
self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
while self.tree.openElements[-1].name != "html":
self.tree.openElements.pop()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inFrameset"]
def startTagCloseP(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
def startTagPreListing(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.parser.framesetOK = False
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagForm(self, token):
if self.tree.formPointer:
self.parser.parseError("unexpected-start-tag", {"name": "form"})
else:
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.tree.formPointer = self.tree.openElements[-1]
def startTagListItem(self, token):
self.parser.framesetOK = False
stopNamesMap = {"li": ["li"],
"dt": ["dt", "dd"],
"dd": ["dt", "dd"]}
stopNames = stopNamesMap[token["name"]]
for node in reversed(self.tree.openElements):
if node.name in stopNames:
self.parser.phase.processEndTag(
impliedTagToken(node.name, "EndTag"))
break
if (node.nameTuple in specialElements and
node.name not in ("address", "div", "p")):
break
if self.tree.elementInScope("p", variant="button"):
self.parser.phase.processEndTag(
impliedTagToken("p", "EndTag"))
self.tree.insertElement(token)
def startTagPlaintext(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
def startTagHeading(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
if self.tree.openElements[-1].name in headingElements:
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
self.tree.openElements.pop()
self.tree.insertElement(token)
def startTagA(self, token):
afeAElement = self.tree.elementInActiveFormattingElements("a")
if afeAElement:
self.parser.parseError("unexpected-start-tag-implies-end-tag",
{"startName": "a", "endName": "a"})
self.endTagFormatting(impliedTagToken("a"))
if afeAElement in self.tree.openElements:
self.tree.openElements.remove(afeAElement)
if afeAElement in self.tree.activeFormattingElements:
self.tree.activeFormattingElements.remove(afeAElement)
self.tree.reconstructActiveFormattingElements()
self.addFormattingElement(token)
def startTagFormatting(self, token):
self.tree.reconstructActiveFormattingElements()
self.addFormattingElement(token)
def startTagNobr(self, token):
self.tree.reconstructActiveFormattingElements()
if self.tree.elementInScope("nobr"):
self.parser.parseError("unexpected-start-tag-implies-end-tag",
{"startName": "nobr", "endName": "nobr"})
self.processEndTag(impliedTagToken("nobr"))
# XXX Need tests that trigger the following
self.tree.reconstructActiveFormattingElements()
self.addFormattingElement(token)
def startTagButton(self, token):
if self.tree.elementInScope("button"):
self.parser.parseError("unexpected-start-tag-implies-end-tag",
{"startName": "button", "endName": "button"})
self.processEndTag(impliedTagToken("button"))
return token
else:
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
self.parser.framesetOK = False
def startTagAppletMarqueeObject(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
self.tree.activeFormattingElements.append(Marker)
self.parser.framesetOK = False
def startTagXmp(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.reconstructActiveFormattingElements()
self.parser.framesetOK = False
self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagTable(self, token):
if self.parser.compatMode != "quirks":
if self.tree.elementInScope("p", variant="button"):
self.processEndTag(impliedTagToken("p"))
self.tree.insertElement(token)
self.parser.framesetOK = False
self.parser.phase = self.parser.phases["inTable"]
def startTagVoidFormatting(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
self.parser.framesetOK = False
def startTagInput(self, token):
framesetOK = self.parser.framesetOK
self.startTagVoidFormatting(token)
if ("type" in token["data"] and
token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
# input type=hidden doesn't change framesetOK
self.parser.framesetOK = framesetOK
def startTagParamSource(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagHr(self, token):
if self.tree.elementInScope("p", variant="button"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
self.parser.framesetOK = False
def startTagImage(self, token):
# No really...
self.parser.parseError("unexpected-start-tag-treated-as",
{"originalName": "image", "newName": "img"})
self.processStartTag(impliedTagToken("img", "StartTag",
attributes=token["data"],
selfClosing=token["selfClosing"]))
def startTagIsIndex(self, token):
self.parser.parseError("deprecated-tag", {"name": "isindex"})
if self.tree.formPointer:
return
form_attrs = {}
if "action" in token["data"]:
form_attrs["action"] = token["data"]["action"]
self.processStartTag(impliedTagToken("form", "StartTag",
attributes=form_attrs))
self.processStartTag(impliedTagToken("hr", "StartTag"))
self.processStartTag(impliedTagToken("label", "StartTag"))
# XXX Localization ...
if "prompt" in token["data"]:
prompt = token["data"]["prompt"]
else:
prompt = "This is a searchable index. Enter search keywords: "
self.processCharacters(
{"type": tokenTypes["Characters"], "data": prompt})
attributes = token["data"].copy()
if "action" in attributes:
del attributes["action"]
if "prompt" in attributes:
del attributes["prompt"]
attributes["name"] = "isindex"
self.processStartTag(impliedTagToken("input", "StartTag",
attributes=attributes,
2015-11-04 12:01:55 +00:00
selfClosing=token["selfClosing"]))
2013-10-11 17:28:32 +00:00
self.processEndTag(impliedTagToken("label"))
self.processStartTag(impliedTagToken("hr", "StartTag"))
self.processEndTag(impliedTagToken("form"))
def startTagTextarea(self, token):
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
self.parser.framesetOK = False
def startTagIFrame(self, token):
self.parser.framesetOK = False
self.startTagRawtext(token)
2018-12-15 00:08:54 +00:00
def startTagNoscript(self, token):
if self.parser.scripting:
self.startTagRawtext(token)
else:
self.startTagOther(token)
2013-10-11 17:28:32 +00:00
def startTagRawtext(self, token):
"""iframe, noembed noframes, noscript(if scripting enabled)"""
self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagOpt(self, token):
if self.tree.openElements[-1].name == "option":
self.parser.phase.processEndTag(impliedTagToken("option"))
self.tree.reconstructActiveFormattingElements()
self.parser.tree.insertElement(token)
def startTagSelect(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
self.parser.framesetOK = False
if self.parser.phase in (self.parser.phases["inTable"],
self.parser.phases["inCaption"],
self.parser.phases["inColumnGroup"],
self.parser.phases["inTableBody"],
self.parser.phases["inRow"],
self.parser.phases["inCell"]):
self.parser.phase = self.parser.phases["inSelectInTable"]
else:
self.parser.phase = self.parser.phases["inSelect"]
def startTagRpRt(self, token):
if self.tree.elementInScope("ruby"):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "ruby":
self.parser.parseError()
self.tree.insertElement(token)
def startTagMath(self, token):
self.tree.reconstructActiveFormattingElements()
self.parser.adjustMathMLAttributes(token)
self.parser.adjustForeignAttributes(token)
token["namespace"] = namespaces["mathml"]
self.tree.insertElement(token)
# Need to get the parse error right for the case where the token
# has a namespace not equal to the xmlns attribute
if token["selfClosing"]:
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagSvg(self, token):
self.tree.reconstructActiveFormattingElements()
self.parser.adjustSVGAttributes(token)
self.parser.adjustForeignAttributes(token)
token["namespace"] = namespaces["svg"]
self.tree.insertElement(token)
# Need to get the parse error right for the case where the token
# has a namespace not equal to the xmlns attribute
if token["selfClosing"]:
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagMisplaced(self, token):
""" Elements that should be children of other elements that have a
different insertion mode; here they are ignored
"caption", "col", "colgroup", "frame", "frameset", "head",
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
"tr", "noscript"
"""
self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
def startTagOther(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
def endTagP(self, token):
if not self.tree.elementInScope("p", variant="button"):
self.startTagCloseP(impliedTagToken("p", "StartTag"))
self.parser.parseError("unexpected-end-tag", {"name": "p"})
self.endTagP(impliedTagToken("p", "EndTag"))
else:
self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p":
self.parser.parseError("unexpected-end-tag", {"name": "p"})
node = self.tree.openElements.pop()
while node.name != "p":
node = self.tree.openElements.pop()
def endTagBody(self, token):
if not self.tree.elementInScope("body"):
self.parser.parseError()
return
elif self.tree.openElements[-1].name != "body":
for node in self.tree.openElements[2:]:
if node.name not in frozenset(("dd", "dt", "li", "optgroup",
"option", "p", "rp", "rt",
"tbody", "td", "tfoot",
"th", "thead", "tr", "body",
"html")):
# Not sure this is the correct name for the parse error
self.parser.parseError(
"expected-one-end-tag-but-got-another",
2018-12-15 00:08:54 +00:00
{"gotName": "body", "expectedName": node.name})
2013-10-11 17:28:32 +00:00
break
self.parser.phase = self.parser.phases["afterBody"]
def endTagHtml(self, token):
# We repeat the test for the body end tag token being ignored here
if self.tree.elementInScope("body"):
self.endTagBody(impliedTagToken("body"))
return token
def endTagBlock(self, token):
# Put us back in the right whitespace handling mode
if token["name"] == "pre":
self.processSpaceCharacters = self.processSpaceCharactersNonPre
inScope = self.tree.elementInScope(token["name"])
if inScope:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("end-tag-too-early", {"name": token["name"]})
if inScope:
node = self.tree.openElements.pop()
while node.name != token["name"]:
node = self.tree.openElements.pop()
def endTagForm(self, token):
node = self.tree.formPointer
self.tree.formPointer = None
if node is None or not self.tree.elementInScope(node):
self.parser.parseError("unexpected-end-tag",
{"name": "form"})
else:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1] != node:
self.parser.parseError("end-tag-too-early-ignored",
{"name": "form"})
self.tree.openElements.remove(node)
def endTagListItem(self, token):
if token["name"] == "li":
variant = "list"
else:
variant = None
if not self.tree.elementInScope(token["name"], variant=variant):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
else:
self.tree.generateImpliedEndTags(exclude=token["name"])
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError(
"end-tag-too-early",
{"name": token["name"]})
node = self.tree.openElements.pop()
while node.name != token["name"]:
node = self.tree.openElements.pop()
def endTagHeading(self, token):
for item in headingElements:
if self.tree.elementInScope(item):
self.tree.generateImpliedEndTags()
break
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("end-tag-too-early", {"name": token["name"]})
for item in headingElements:
if self.tree.elementInScope(item):
item = self.tree.openElements.pop()
while item.name not in headingElements:
item = self.tree.openElements.pop()
break
def endTagFormatting(self, token):
"""The much-feared adoption agency algorithm"""
# http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
# XXX Better parseError messages appreciated.
# Step 1
outerLoopCounter = 0
# Step 2
while outerLoopCounter < 8:
# Step 3
outerLoopCounter += 1
# Step 4:
# Let the formatting element be the last element in
# the list of active formatting elements that:
# - is between the end of the list and the last scope
# marker in the list, if any, or the start of the list
# otherwise, and
# - has the same tag name as the token.
formattingElement = self.tree.elementInActiveFormattingElements(
token["name"])
if (not formattingElement or
(formattingElement in self.tree.openElements and
not self.tree.elementInScope(formattingElement.name))):
# If there is no such node, then abort these steps
# and instead act as described in the "any other
# end tag" entry below.
self.endTagOther(token)
return
# Otherwise, if there is such a node, but that node is
# not in the stack of open elements, then this is a
# parse error; remove the element from the list, and
# abort these steps.
elif formattingElement not in self.tree.openElements:
self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
self.tree.activeFormattingElements.remove(formattingElement)
return
# Otherwise, if there is such a node, and that node is
# also in the stack of open elements, but the element
# is not in scope, then this is a parse error; ignore
# the token, and abort these steps.
elif not self.tree.elementInScope(formattingElement.name):
self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
return
# Otherwise, there is a formatting element and that
# element is in the stack and is in scope. If the
# element is not the current node, this is a parse
# error. In any case, proceed with the algorithm as
# written in the following steps.
else:
if formattingElement != self.tree.openElements[-1]:
self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
# Step 5:
# Let the furthest block be the topmost node in the
# stack of open elements that is lower in the stack
# than the formatting element, and is an element in
# the special category. There might not be one.
afeIndex = self.tree.openElements.index(formattingElement)
furthestBlock = None
for element in self.tree.openElements[afeIndex:]:
if element.nameTuple in specialElements:
furthestBlock = element
break
# Step 6:
# If there is no furthest block, then the UA must
# first pop all the nodes from the bottom of the stack
# of open elements, from the current node up to and
# including the formatting element, then remove the
# formatting element from the list of active
# formatting elements, and finally abort these steps.
if furthestBlock is None:
element = self.tree.openElements.pop()
while element != formattingElement:
element = self.tree.openElements.pop()
self.tree.activeFormattingElements.remove(element)
return
# Step 7
commonAncestor = self.tree.openElements[afeIndex - 1]
# Step 8:
# The bookmark is supposed to help us identify where to reinsert
# nodes in step 15. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 9.7
bookmark = self.tree.activeFormattingElements.index(formattingElement)
# Step 9
lastNode = node = furthestBlock
innerLoopCounter = 0
index = self.tree.openElements.index(node)
while innerLoopCounter < 3:
innerLoopCounter += 1
# Node is element before node in open elements
index -= 1
node = self.tree.openElements[index]
if node not in self.tree.activeFormattingElements:
self.tree.openElements.remove(node)
continue
# Step 9.6
if node == formattingElement:
break
# Step 9.7
if lastNode == furthestBlock:
bookmark = self.tree.activeFormattingElements.index(node) + 1
# Step 9.8
clone = node.cloneNode()
# Replace node with clone
self.tree.activeFormattingElements[
self.tree.activeFormattingElements.index(node)] = clone
self.tree.openElements[
self.tree.openElements.index(node)] = clone
node = clone
# Step 9.9
# Remove lastNode from its parents, if any
if lastNode.parent:
lastNode.parent.removeChild(lastNode)
node.appendChild(lastNode)
# Step 9.10
lastNode = node
# Step 10
# Foster parent lastNode if commonAncestor is a
# table, tbody, tfoot, thead, or tr we need to foster
# parent the lastNode
if lastNode.parent:
lastNode.parent.removeChild(lastNode)
if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
parent, insertBefore = self.tree.getTableMisnestedNodePosition()
parent.insertBefore(lastNode, insertBefore)
else:
commonAncestor.appendChild(lastNode)
# Step 11
clone = formattingElement.cloneNode()
# Step 12
furthestBlock.reparentChildren(clone)
# Step 13
furthestBlock.appendChild(clone)
# Step 14
self.tree.activeFormattingElements.remove(formattingElement)
self.tree.activeFormattingElements.insert(bookmark, clone)
# Step 15
self.tree.openElements.remove(formattingElement)
self.tree.openElements.insert(
self.tree.openElements.index(furthestBlock) + 1, clone)
def endTagAppletMarqueeObject(self, token):
if self.tree.elementInScope(token["name"]):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("end-tag-too-early", {"name": token["name"]})
if self.tree.elementInScope(token["name"]):
element = self.tree.openElements.pop()
while element.name != token["name"]:
element = self.tree.openElements.pop()
self.tree.clearActiveFormattingElements()
def endTagBr(self, token):
self.parser.parseError("unexpected-end-tag-treated-as",
{"originalName": "br", "newName": "br element"})
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(impliedTagToken("br", "StartTag"))
self.tree.openElements.pop()
def endTagOther(self, token):
for node in self.tree.openElements[::-1]:
if node.name == token["name"]:
self.tree.generateImpliedEndTags(exclude=token["name"])
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
while self.tree.openElements.pop() != node:
pass
break
else:
if node.nameTuple in specialElements:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
break
class TextPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([])
2013-10-11 17:28:32 +00:00
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("script", self.endTagScript)])
self.endTagHandler.default = self.endTagOther
def processCharacters(self, token):
self.tree.insertText(token["data"])
def processEOF(self):
self.parser.parseError("expected-named-closing-tag-but-got-eof",
{"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
self.parser.phase = self.parser.originalPhase
return True
def startTagOther(self, token):
assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
def endTagScript(self, token):
node = self.tree.openElements.pop()
assert node.name == "script"
self.parser.phase = self.parser.originalPhase
# The rest of this method is all stuff that only happens if
# document.write works
def endTagOther(self, token):
self.tree.openElements.pop()
self.parser.phase = self.parser.originalPhase
class InTablePhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("caption", self.startTagCaption),
("colgroup", self.startTagColgroup),
("col", self.startTagCol),
(("tbody", "tfoot", "thead"), self.startTagRowGroup),
(("td", "th", "tr"), self.startTagImplyTbody),
("table", self.startTagTable),
(("style", "script"), self.startTagStyleScript),
("input", self.startTagInput),
("form", self.startTagForm)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
# helper methods
def clearStackToTableContext(self):
# "clear the stack back to a table context"
while self.tree.openElements[-1].name not in ("table", "html"):
# self.parser.parseError("unexpected-implied-end-tag-in-table",
# {"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
# When the current node is <html> it's an innerHTML case
# processing methods
def processEOF(self):
if self.tree.openElements[-1].name != "html":
self.parser.parseError("eof-in-table")
else:
assert self.parser.innerHTML
# Stop parsing
def processSpaceCharacters(self, token):
originalPhase = self.parser.phase
self.parser.phase = self.parser.phases["inTableText"]
self.parser.phase.originalPhase = originalPhase
self.parser.phase.processSpaceCharacters(token)
def processCharacters(self, token):
originalPhase = self.parser.phase
self.parser.phase = self.parser.phases["inTableText"]
self.parser.phase.originalPhase = originalPhase
self.parser.phase.processCharacters(token)
def insertText(self, token):
# If we get here there must be at least one non-whitespace character
# Do the table magic!
self.tree.insertFromTable = True
self.parser.phases["inBody"].processCharacters(token)
self.tree.insertFromTable = False
def startTagCaption(self, token):
self.clearStackToTableContext()
self.tree.activeFormattingElements.append(Marker)
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inCaption"]
def startTagColgroup(self, token):
self.clearStackToTableContext()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inColumnGroup"]
def startTagCol(self, token):
self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
return token
def startTagRowGroup(self, token):
self.clearStackToTableContext()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inTableBody"]
def startTagImplyTbody(self, token):
self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
return token
def startTagTable(self, token):
self.parser.parseError("unexpected-start-tag-implies-end-tag",
{"startName": "table", "endName": "table"})
self.parser.phase.processEndTag(impliedTagToken("table"))
if not self.parser.innerHTML:
return token
def startTagStyleScript(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagInput(self, token):
if ("type" in token["data"] and
token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
self.parser.parseError("unexpected-hidden-input-in-table")
self.tree.insertElement(token)
# XXX associate with form
self.tree.openElements.pop()
else:
self.startTagOther(token)
def startTagForm(self, token):
self.parser.parseError("unexpected-form-in-table")
if self.tree.formPointer is None:
self.tree.insertElement(token)
self.tree.formPointer = self.tree.openElements[-1]
self.tree.openElements.pop()
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
# Do the table magic!
self.tree.insertFromTable = True
self.parser.phases["inBody"].processStartTag(token)
self.tree.insertFromTable = False
def endTagTable(self, token):
if self.tree.elementInScope("table", variant="table"):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "table":
self.parser.parseError("end-tag-too-early-named",
{"gotName": "table",
"expectedName": self.tree.openElements[-1].name})
while self.tree.openElements[-1].name != "table":
self.tree.openElements.pop()
self.tree.openElements.pop()
self.parser.resetInsertionMode()
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
# Do the table magic!
self.tree.insertFromTable = True
self.parser.phases["inBody"].processEndTag(token)
self.tree.insertFromTable = False
class InTableTextPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.originalPhase = None
self.characterTokens = []
def flushCharacters(self):
data = "".join([item["data"] for item in self.characterTokens])
if any([item not in spaceCharacters for item in data]):
token = {"type": tokenTypes["Characters"], "data": data}
self.parser.phases["inTable"].insertText(token)
elif data:
self.tree.insertText(data)
self.characterTokens = []
def processComment(self, token):
self.flushCharacters()
self.parser.phase = self.originalPhase
return token
def processEOF(self):
self.flushCharacters()
self.parser.phase = self.originalPhase
return True
def processCharacters(self, token):
if token["data"] == "\u0000":
return
self.characterTokens.append(token)
def processSpaceCharacters(self, token):
# pretty sure we should never reach here
self.characterTokens.append(token)
# assert False
def processStartTag(self, token):
self.flushCharacters()
self.parser.phase = self.originalPhase
return token
def processEndTag(self, token):
self.flushCharacters()
self.parser.phase = self.originalPhase
return token
class InCaptionPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableElement)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("caption", self.endTagCaption),
("table", self.endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
def ignoreEndTagCaption(self):
return not self.tree.elementInScope("caption", variant="table")
def processEOF(self):
self.parser.phases["inBody"].processEOF()
def processCharacters(self, token):
return self.parser.phases["inBody"].processCharacters(token)
def startTagTableElement(self, token):
self.parser.parseError()
# XXX Have to duplicate logic here to find out if the tag is ignored
ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag(impliedTagToken("caption"))
if not ignoreEndTag:
return token
def startTagOther(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def endTagCaption(self, token):
if not self.ignoreEndTagCaption():
# AT this code is quite similar to endTagTable in "InTable"
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "caption":
self.parser.parseError("expected-one-end-tag-but-got-another",
{"gotName": "caption",
"expectedName": self.tree.openElements[-1].name})
while self.tree.openElements[-1].name != "caption":
self.tree.openElements.pop()
self.tree.openElements.pop()
self.tree.clearActiveFormattingElements()
self.parser.phase = self.parser.phases["inTable"]
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, token):
self.parser.parseError()
ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag(impliedTagToken("caption"))
if not ignoreEndTag:
return token
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagOther(self, token):
return self.parser.phases["inBody"].processEndTag(token)
class InColumnGroupPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("col", self.startTagCol)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("colgroup", self.endTagColgroup),
("col", self.endTagCol)
])
self.endTagHandler.default = self.endTagOther
def ignoreEndTagColgroup(self):
return self.tree.openElements[-1].name == "html"
def processEOF(self):
if self.tree.openElements[-1].name == "html":
assert self.parser.innerHTML
return
else:
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
return True
def processCharacters(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
return token
def startTagCol(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
2018-12-15 00:08:54 +00:00
token["selfClosingAcknowledged"] = True
2013-10-11 17:28:32 +00:00
def startTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
return token
def endTagColgroup(self, token):
if self.ignoreEndTagColgroup():
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
else:
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTable"]
def endTagCol(self, token):
self.parser.parseError("no-end-tag", {"name": "col"})
def endTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
return token
class InTableBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("tr", self.startTagTr),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th",
"tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
# helper methods
def clearStackToTableBodyContext(self):
while self.tree.openElements[-1].name not in ("tbody", "tfoot",
"thead", "html"):
# self.parser.parseError("unexpected-implied-end-tag-in-table",
# {"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
if self.tree.openElements[-1].name == "html":
assert self.parser.innerHTML
# the rest
def processEOF(self):
self.parser.phases["inTable"].processEOF()
def processSpaceCharacters(self, token):
return self.parser.phases["inTable"].processSpaceCharacters(token)
def processCharacters(self, token):
return self.parser.phases["inTable"].processCharacters(token)
def startTagTr(self, token):
self.clearStackToTableBodyContext()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inRow"]
def startTagTableCell(self, token):
self.parser.parseError("unexpected-cell-in-table-body",
{"name": token["name"]})
self.startTagTr(impliedTagToken("tr", "StartTag"))
return token
def startTagTableOther(self, token):
# XXX AT Any ideas on how to share this with endTagTable?
if (self.tree.elementInScope("tbody", variant="table") or
self.tree.elementInScope("thead", variant="table") or
self.tree.elementInScope("tfoot", variant="table")):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(
impliedTagToken(self.tree.openElements[-1].name))
return token
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def startTagOther(self, token):
return self.parser.phases["inTable"].processStartTag(token)
def endTagTableRowGroup(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.clearStackToTableBodyContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTable"]
else:
self.parser.parseError("unexpected-end-tag-in-table-body",
{"name": token["name"]})
def endTagTable(self, token):
if (self.tree.elementInScope("tbody", variant="table") or
self.tree.elementInScope("thead", variant="table") or
self.tree.elementInScope("tfoot", variant="table")):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(
impliedTagToken(self.tree.openElements[-1].name))
return token
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag-in-table-body",
{"name": token["name"]})
def endTagOther(self, token):
return self.parser.phases["inTable"].processEndTag(token)
class InRowPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
"tr"), self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("tr", self.endTagTr),
("table", self.endTagTable),
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
(("body", "caption", "col", "colgroup", "html", "td", "th"),
self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
# helper methods (XXX unify this with other table helper methods)
def clearStackToTableRowContext(self):
while self.tree.openElements[-1].name not in ("tr", "html"):
self.parser.parseError("unexpected-implied-end-tag-in-table-row",
{"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
def ignoreEndTagTr(self):
return not self.tree.elementInScope("tr", variant="table")
# the rest
def processEOF(self):
self.parser.phases["inTable"].processEOF()
def processSpaceCharacters(self, token):
return self.parser.phases["inTable"].processSpaceCharacters(token)
def processCharacters(self, token):
return self.parser.phases["inTable"].processCharacters(token)
def startTagTableCell(self, token):
self.clearStackToTableRowContext()
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inCell"]
self.tree.activeFormattingElements.append(Marker)
def startTagTableOther(self, token):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr(impliedTagToken("tr"))
# XXX how are we sure it's always ignored in the innerHTML case?
if not ignoreEndTag:
return token
def startTagOther(self, token):
return self.parser.phases["inTable"].processStartTag(token)
def endTagTr(self, token):
if not self.ignoreEndTagTr():
self.clearStackToTableRowContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTableBody"]
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, token):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr(impliedTagToken("tr"))
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
if not ignoreEndTag:
return token
def endTagTableRowGroup(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.endTagTr(impliedTagToken("tr"))
return token
else:
self.parser.parseError()
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag-in-table-row",
{"name": token["name"]})
def endTagOther(self, token):
return self.parser.phases["inTable"].processEndTag(token)
class InCellPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
(("td", "th"), self.endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
])
self.endTagHandler.default = self.endTagOther
# helper
def closeCell(self):
if self.tree.elementInScope("td", variant="table"):
self.endTagTableCell(impliedTagToken("td"))
elif self.tree.elementInScope("th", variant="table"):
self.endTagTableCell(impliedTagToken("th"))
# the rest
def processEOF(self):
self.parser.phases["inBody"].processEOF()
def processCharacters(self, token):
return self.parser.phases["inBody"].processCharacters(token)
def startTagTableOther(self, token):
if (self.tree.elementInScope("td", variant="table") or
self.tree.elementInScope("th", variant="table")):
self.closeCell()
return token
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def startTagOther(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def endTagTableCell(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.tree.generateImpliedEndTags(token["name"])
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("unexpected-cell-end-tag",
{"name": token["name"]})
while True:
node = self.tree.openElements.pop()
if node.name == token["name"]:
break
else:
self.tree.openElements.pop()
self.tree.clearActiveFormattingElements()
self.parser.phase = self.parser.phases["inRow"]
else:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagIgnore(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagImply(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.closeCell()
return token
else:
# sometimes innerHTML case
self.parser.parseError()
def endTagOther(self, token):
return self.parser.phases["inBody"].processEndTag(token)
class InSelectPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("option", self.startTagOption),
("optgroup", self.startTagOptgroup),
("select", self.startTagSelect),
(("input", "keygen", "textarea"), self.startTagInput),
("script", self.startTagScript)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("option", self.endTagOption),
("optgroup", self.endTagOptgroup),
("select", self.endTagSelect)
])
self.endTagHandler.default = self.endTagOther
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
def processEOF(self):
if self.tree.openElements[-1].name != "html":
self.parser.parseError("eof-in-select")
else:
assert self.parser.innerHTML
def processCharacters(self, token):
if token["data"] == "\u0000":
return
self.tree.insertText(token["data"])
def startTagOption(self, token):
# We need to imply </option> if <option> is the current node.
if self.tree.openElements[-1].name == "option":
self.tree.openElements.pop()
self.tree.insertElement(token)
def startTagOptgroup(self, token):
if self.tree.openElements[-1].name == "option":
self.tree.openElements.pop()
if self.tree.openElements[-1].name == "optgroup":
self.tree.openElements.pop()
self.tree.insertElement(token)
def startTagSelect(self, token):
self.parser.parseError("unexpected-select-in-select")
self.endTagSelect(impliedTagToken("select"))
def startTagInput(self, token):
self.parser.parseError("unexpected-input-in-select")
if self.tree.elementInScope("select", variant="select"):
self.endTagSelect(impliedTagToken("select"))
return token
else:
assert self.parser.innerHTML
def startTagScript(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-in-select",
{"name": token["name"]})
def endTagOption(self, token):
if self.tree.openElements[-1].name == "option":
self.tree.openElements.pop()
else:
self.parser.parseError("unexpected-end-tag-in-select",
{"name": "option"})
def endTagOptgroup(self, token):
# </optgroup> implicitly closes <option>
if (self.tree.openElements[-1].name == "option" and
self.tree.openElements[-2].name == "optgroup"):
self.tree.openElements.pop()
# It also closes </optgroup>
if self.tree.openElements[-1].name == "optgroup":
self.tree.openElements.pop()
# But nothing else
else:
self.parser.parseError("unexpected-end-tag-in-select",
{"name": "optgroup"})
def endTagSelect(self, token):
if self.tree.elementInScope("select", variant="select"):
node = self.tree.openElements.pop()
while node.name != "select":
node = self.tree.openElements.pop()
self.parser.resetInsertionMode()
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-in-select",
{"name": token["name"]})
class InSelectInTablePhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.startTagTable)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.endTagTable)
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.parser.phases["inSelect"].processEOF()
def processCharacters(self, token):
return self.parser.phases["inSelect"].processCharacters(token)
def startTagTable(self, token):
self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
self.endTagOther(impliedTagToken("select"))
return token
def startTagOther(self, token):
return self.parser.phases["inSelect"].processStartTag(token)
def endTagTable(self, token):
self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
if self.tree.elementInScope(token["name"], variant="table"):
self.endTagOther(impliedTagToken("select"))
return token
def endTagOther(self, token):
return self.parser.phases["inSelect"].processEndTag(token)
class InForeignContentPhase(Phase):
breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
"center", "code", "dd", "div", "dl", "dt",
"em", "embed", "h1", "h2", "h3",
"h4", "h5", "h6", "head", "hr", "i", "img",
"li", "listing", "menu", "meta", "nobr",
"ol", "p", "pre", "ruby", "s", "small",
"span", "strong", "strike", "sub", "sup",
"table", "tt", "u", "ul", "var"])
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
def adjustSVGTagNames(self, token):
replacements = {"altglyph": "altGlyph",
"altglyphdef": "altGlyphDef",
"altglyphitem": "altGlyphItem",
"animatecolor": "animateColor",
"animatemotion": "animateMotion",
"animatetransform": "animateTransform",
"clippath": "clipPath",
"feblend": "feBlend",
"fecolormatrix": "feColorMatrix",
"fecomponenttransfer": "feComponentTransfer",
"fecomposite": "feComposite",
"feconvolvematrix": "feConvolveMatrix",
"fediffuselighting": "feDiffuseLighting",
"fedisplacementmap": "feDisplacementMap",
"fedistantlight": "feDistantLight",
"feflood": "feFlood",
"fefunca": "feFuncA",
"fefuncb": "feFuncB",
"fefuncg": "feFuncG",
"fefuncr": "feFuncR",
"fegaussianblur": "feGaussianBlur",
"feimage": "feImage",
"femerge": "feMerge",
"femergenode": "feMergeNode",
"femorphology": "feMorphology",
"feoffset": "feOffset",
"fepointlight": "fePointLight",
"fespecularlighting": "feSpecularLighting",
"fespotlight": "feSpotLight",
"fetile": "feTile",
"feturbulence": "feTurbulence",
"foreignobject": "foreignObject",
"glyphref": "glyphRef",
"lineargradient": "linearGradient",
"radialgradient": "radialGradient",
"textpath": "textPath"}
if token["name"] in replacements:
token["name"] = replacements[token["name"]]
def processCharacters(self, token):
if token["data"] == "\u0000":
token["data"] = "\uFFFD"
elif (self.parser.framesetOK and
any(char not in spaceCharacters for char in token["data"])):
self.parser.framesetOK = False
Phase.processCharacters(self, token)
def processStartTag(self, token):
currentNode = self.tree.openElements[-1]
if (token["name"] in self.breakoutElements or
(token["name"] == "font" and
set(token["data"].keys()) & set(["color", "face", "size"]))):
self.parser.parseError("unexpected-html-element-in-foreign-content",
{"name": token["name"]})
while (self.tree.openElements[-1].namespace !=
self.tree.defaultNamespace and
not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
self.tree.openElements.pop()
return token
else:
if currentNode.namespace == namespaces["mathml"]:
self.parser.adjustMathMLAttributes(token)
elif currentNode.namespace == namespaces["svg"]:
self.adjustSVGTagNames(token)
self.parser.adjustSVGAttributes(token)
self.parser.adjustForeignAttributes(token)
token["namespace"] = currentNode.namespace
self.tree.insertElement(token)
if token["selfClosing"]:
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def processEndTag(self, token):
nodeIndex = len(self.tree.openElements) - 1
node = self.tree.openElements[-1]
2018-12-15 00:08:54 +00:00
if node.name.translate(asciiUpper2Lower) != token["name"]:
2013-10-11 17:28:32 +00:00
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
while True:
if node.name.translate(asciiUpper2Lower) == token["name"]:
# XXX this isn't in the spec but it seems necessary
if self.parser.phase == self.parser.phases["inTableText"]:
self.parser.phase.flushCharacters()
self.parser.phase = self.parser.phase.originalPhase
while self.tree.openElements.pop() != node:
assert self.tree.openElements
new_token = None
break
nodeIndex -= 1
node = self.tree.openElements[nodeIndex]
if node.namespace != self.tree.defaultNamespace:
continue
else:
new_token = self.parser.phase.processEndTag(token)
break
return new_token
class AfterBodyPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
2013-10-11 17:28:32 +00:00
self.endTagHandler.default = self.endTagOther
def processEOF(self):
# Stop parsing
pass
def processComment(self, token):
# This is needed because data is to be appended to the <html> element
# here and not to whatever is currently open.
self.tree.insertComment(token, self.tree.openElements[0])
def processCharacters(self, token):
self.parser.parseError("unexpected-char-after-body")
self.parser.phase = self.parser.phases["inBody"]
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-after-body",
{"name": token["name"]})
self.parser.phase = self.parser.phases["inBody"]
return token
def endTagHtml(self, name):
if self.parser.innerHTML:
self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
else:
self.parser.phase = self.parser.phases["afterAfterBody"]
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-after-body",
{"name": token["name"]})
self.parser.phase = self.parser.phases["inBody"]
return token
class InFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("frameset", self.startTagFrameset),
("frame", self.startTagFrame),
("noframes", self.startTagNoframes)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("frameset", self.endTagFrameset)
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
if self.tree.openElements[-1].name != "html":
self.parser.parseError("eof-in-frameset")
else:
assert self.parser.innerHTML
def processCharacters(self, token):
self.parser.parseError("unexpected-char-in-frameset")
def startTagFrameset(self, token):
self.tree.insertElement(token)
def startTagFrame(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
def startTagNoframes(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-in-frameset",
{"name": token["name"]})
def endTagFrameset(self, token):
if self.tree.openElements[-1].name == "html":
# innerHTML case
self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
else:
self.tree.openElements.pop()
if (not self.parser.innerHTML and
self.tree.openElements[-1].name != "frameset"):
2018-12-15 00:08:54 +00:00
# If we're not in innerHTML mode and the current node is not a
2013-10-11 17:28:32 +00:00
# "frameset" element (anymore) then switch.
self.parser.phase = self.parser.phases["afterFrameset"]
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-in-frameset",
{"name": token["name"]})
class AfterFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#after3
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("noframes", self.startTagNoframes)
])
self.startTagHandler.default = self.startTagOther
2018-12-15 00:08:54 +00:00
self.endTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.endTagHtml)
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
# Stop parsing
pass
def processCharacters(self, token):
self.parser.parseError("unexpected-char-after-frameset")
def startTagNoframes(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-after-frameset",
{"name": token["name"]})
def endTagHtml(self, token):
self.parser.phase = self.parser.phases["afterAfterFrameset"]
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag-after-frameset",
{"name": token["name"]})
class AfterAfterBodyPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
def processEOF(self):
pass
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processSpaceCharacters(self, token):
return self.parser.phases["inBody"].processSpaceCharacters(token)
def processCharacters(self, token):
self.parser.parseError("expected-eof-but-got-char")
self.parser.phase = self.parser.phases["inBody"]
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("expected-eof-but-got-start-tag",
{"name": token["name"]})
self.parser.phase = self.parser.phases["inBody"]
return token
def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]})
self.parser.phase = self.parser.phases["inBody"]
return token
class AfterAfterFramesetPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
2018-12-15 00:08:54 +00:00
self.startTagHandler = _utils.MethodDispatcher([
2013-10-11 17:28:32 +00:00
("html", self.startTagHtml),
("noframes", self.startTagNoFrames)
])
self.startTagHandler.default = self.startTagOther
def processEOF(self):
pass
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processSpaceCharacters(self, token):
return self.parser.phases["inBody"].processSpaceCharacters(token)
def processCharacters(self, token):
self.parser.parseError("expected-eof-but-got-char")
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagNoFrames(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagOther(self, token):
self.parser.parseError("expected-eof-but-got-start-tag",
{"name": token["name"]})
def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]})
2018-12-15 00:08:54 +00:00
# pylint:enable=unused-argument
2013-10-11 17:28:32 +00:00
return {
"initial": InitialPhase,
"beforeHtml": BeforeHtmlPhase,
"beforeHead": BeforeHeadPhase,
"inHead": InHeadPhase,
2018-12-15 00:08:54 +00:00
"inHeadNoscript": InHeadNoscriptPhase,
2013-10-11 17:28:32 +00:00
"afterHead": AfterHeadPhase,
"inBody": InBodyPhase,
"text": TextPhase,
"inTable": InTablePhase,
"inTableText": InTableTextPhase,
"inCaption": InCaptionPhase,
"inColumnGroup": InColumnGroupPhase,
"inTableBody": InTableBodyPhase,
"inRow": InRowPhase,
"inCell": InCellPhase,
"inSelect": InSelectPhase,
"inSelectInTable": InSelectInTablePhase,
"inForeignContent": InForeignContentPhase,
"afterBody": AfterBodyPhase,
"inFrameset": InFramesetPhase,
"afterFrameset": AfterFramesetPhase,
"afterAfterBody": AfterAfterBodyPhase,
"afterAfterFrameset": AfterAfterFramesetPhase,
# XXX after after frameset
}
2018-12-15 00:08:54 +00:00
def adjust_attributes(token, replacements):
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
if needs_adjustment:
token['data'] = OrderedDict((replacements.get(k, k), v)
for k, v in token['data'].items())
2013-10-11 17:28:32 +00:00
def impliedTagToken(name, type="EndTag", attributes=None,
selfClosing=False):
if attributes is None:
attributes = {}
return {"type": tokenTypes[type], "name": name, "data": attributes,
"selfClosing": selfClosing}
class ParseError(Exception):
"""Error in parsed document"""
pass