1731 lines
75 KiB
Python
1731 lines
75 KiB
Python
from __future__ import absolute_import, division, unicode_literals
|
|
|
|
# Python 2 compatibility: when ``unichr`` exists, use it as ``chr`` so that
# numeric entities are converted with the unicode-aware function.
try:
    unichr
except NameError:
    # ``unichr`` is not defined here; keep the builtin ``chr``.
    pass
else:
    chr = unichr  # flake8: noqa
|
|
|
|
from collections import deque
|
|
|
|
from .constants import spaceCharacters
|
|
from .constants import entities
|
|
from .constants import asciiLetters, asciiUpper2Lower
|
|
from .constants import digits, hexDigits, EOF
|
|
from .constants import tokenTypes, tagTokenTypes
|
|
from .constants import replacementCharacters
|
|
|
|
from .inputstream import HTMLInputStream
|
|
|
|
from .trie import Trie
|
|
|
|
# Trie built from the ``entities`` table.  consumeEntity() queries it with
# has_keys_with_prefix() / longest_prefix() to match named entities while
# characters are read one at a time from the stream.
entitiesTrie = Trie(entities)
|
|
|
|
|
|
class HTMLTokenizer(object):
|
|
""" This class takes care of tokenizing HTML.
|
|
|
|
* self.currentToken
|
|
Holds the token that is currently being processed.
|
|
|
|
* self.state
|
|
Holds a reference to the state method to be invoked next.
|
|
|
|
* self.stream
|
|
Points to HTMLInputStream object.
|
|
"""
|
|
|
|
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
             lowercaseElementName=True, lowercaseAttrName=True, parser=None):
    """Create a tokenizer reading from an HTMLInputStream.

    stream, encoding, parseMeta and useChardet are handed unchanged to
    HTMLInputStream.  lowercaseElementName and lowercaseAttrName switch
    the lower-casing of tag names and attribute names on or off.  parser
    is stored on self.parser without further processing.
    """
    # Wrap the raw input first; everything else is plain bookkeeping.
    self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
    self.parser = parser

    # Case-conversion switches.
    self.lowercaseElementName, self.lowercaseAttrName = (
        lowercaseElementName, lowercaseAttrName)

    # Initial tokenizer state: start in the data state, nothing escaped.
    self.escapeFlag = self.escape = False
    self.lastFourChars = []
    self.state = self.dataState

    # No token is under construction yet.
    self.currentToken = None
    super(HTMLTokenizer, self).__init__()
|
|
|
|
def __iter__(self):
    """Run the state machine lazily, yielding one token at a time.

    Each pass invokes the current state method once.  Afterwards a
    ParseError token is yielded for every error recorded on the input
    stream, followed by every token the state placed on the queue.
    Iteration finishes as soon as a state method returns a false value,
    which the states do when they read EOF.
    """
    self.tokenQueue = deque([])
    while True:
        if not self.state():
            # A false return value signals the end of tokenization.
            break
        # Stream-level errors are reported before the queued tokens.
        while self.stream.errors:
            yield {"type": tokenTypes["ParseError"],
                   "data": self.stream.errors.pop(0)}
        while self.tokenQueue:
            yield self.tokenQueue.popleft()
|
|
|
|
def consumeNumberEntity(self, isHex):
    """Consume a numeric character reference and return its replacement.

    consumeEntity() calls this after it has read "&#" (plus "x"/"X" for
    hexadecimal references) and pushed the first digit back onto the
    stream.  All consecutive digits are read and converted to a code
    point, which is then mapped to the text to emit.

    A trailing ";" is discarded.  When it is missing, a
    "numeric-entity-without-semicolon" ParseError is queued and the
    character that ended the number is put back on the stream.

    isHex -- true to read hexadecimal digits, false for decimal digits.

    Returns the entry from replacementCharacters when the code point has
    one, U+FFFD for surrogates and values above U+10FFFF, and otherwise
    the character for the code point itself.  Every replaced or otherwise
    disallowed code point also queues an
    "illegal-codepoint-for-numeric-entity" ParseError.

    Raises ValueError (from int()) if no digit is available on the stream.
    """
    if isHex:
        allowed, radix = hexDigits, 16
    else:
        allowed, radix = digits, 10

    charStack = []

    # Consume all the characters that are in range while making sure we
    # don't hit an EOF.
    c = self.stream.char()
    while c in allowed and c is not EOF:
        charStack.append(c)
        c = self.stream.char()

    # Convert the set of characters consumed to an int.
    charAsInt = int("".join(charStack), radix)

    # Certain characters get replaced with others
    if charAsInt in replacementCharacters:
        char = replacementCharacters[charAsInt]
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "illegal-codepoint-for-numeric-entity",
                                "datavars": {"charAsInt": charAsInt}})
    elif ((0xD800 <= charAsInt <= 0xDFFF) or
          (charAsInt > 0x10FFFF)):
        # Surrogates and out-of-range values cannot be represented.
        char = "\uFFFD"
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "illegal-codepoint-for-numeric-entity",
                                "datavars": {"charAsInt": charAsInt}})
    else:
        # These code points are still emitted, but flagged as an error.
        # charAsInt is known to be <= 0x10FFFF here, so the mask test
        # below matches exactly U+nFFFE and U+nFFFF for planes 0..16,
        # without building a lookup set on every call.
        if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt == 0x000B or
                (charAsInt & 0xFFFE) == 0xFFFE):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        try:
            # Try/except needed as UCS-2 Python builds' unichar only works
            # within the BMP.
            char = chr(charAsInt)
        except ValueError:
            # Build the surrogate pair by hand.
            v = charAsInt - 0x10000
            char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

    # Discard the ; if present. Otherwise, put it back on the queue and
    # invoke parseError on parser.
    if c != ";":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "numeric-entity-without-semicolon"})
        self.stream.unget(c)

    return char
|
|
|
|
def consumeEntity(self, allowedChar=None, fromAttribute=False):
    """Consume a character reference after "&" and deliver the result.

    allowedChar   -- optional extra character that, when it directly
                     follows the "&", means this is not a reference.
    fromAttribute -- when true the result is appended to the value of the
                     last attribute of self.currentToken; otherwise it is
                     queued as a Characters or SpaceCharacters token.

    When nothing matches, the literal "&" plus whatever was consumed (and
    not pushed back) is delivered instead.
    """
    # Default output for when no entity is matched.
    result = "&"

    first = self.stream.char()
    consumed = [first]
    if (first in spaceCharacters or first in (EOF, "<", "&") or
            (allowedChar is not None and allowedChar == first)):
        # Not a character reference at all.
        self.stream.unget(first)

    elif first == "#":
        # Numeric reference: an optional x/X selects hexadecimal.
        consumed.append(self.stream.char())
        isHex = consumed[-1] in ("x", "X")
        if isHex:
            consumed.append(self.stream.char())

        # consumed[-1] should now be the first digit.
        if consumed[-1] in (hexDigits if isHex else digits):
            # At least one digit found, so consume the whole number.
            self.stream.unget(consumed[-1])
            result = self.consumeNumberEntity(isHex)
        else:
            # No digits found.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-numeric-entity"})
            self.stream.unget(consumed.pop())
            result = "&" + "".join(consumed)

    else:
        # Possibly a named entity.  Keep reading for as long as what has
        # been read so far is still a prefix of some entity name.
        while (consumed[-1] is not EOF and
               entitiesTrie.has_keys_with_prefix("".join(consumed))):
            consumed.append(self.stream.char())

        # The last character read broke the match (or is EOF); look for the
        # longest entity name that the preceding characters start with.
        try:
            entityName = entitiesTrie.longest_prefix("".join(consumed[:-1]))
        except KeyError:
            entityName = None
        else:
            entityLength = len(entityName)

        if entityName is None:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-named-entity"})
            self.stream.unget(consumed.pop())
            result = "&" + "".join(consumed)
        else:
            missingSemicolon = entityName[-1] != ";"
            if missingSemicolon:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "named-entity-without-semicolon"})
            if (missingSemicolon and fromAttribute and
                    (consumed[entityLength] in asciiLetters or
                     consumed[entityLength] in digits or
                     consumed[entityLength] == "=")):
                # In attributes an unterminated name followed by a letter,
                # digit or "=" is kept as literal text.
                self.stream.unget(consumed.pop())
                result = "&" + "".join(consumed)
            else:
                result = entities[entityName]
                self.stream.unget(consumed.pop())
                # Characters read beyond the entity name are kept as text.
                result += "".join(consumed[entityLength:])

    if fromAttribute:
        self.currentToken["data"][-1][1] += result
    else:
        kind = "SpaceCharacters" if result in spaceCharacters else "Characters"
        self.tokenQueue.append({"type": tokenTypes[kind], "data": result})
|
|
|
|
def processEntityInAttribute(self, allowedChar):
    """Consume a character reference found inside an attribute value.

    Used in place of a dedicated "entityInAttributeValueState": the work
    is delegated to consumeEntity() with fromAttribute=True, which appends
    the result to the attribute currently being built.
    """
    self.consumeEntity(fromAttribute=True, allowedChar=allowedChar)
|
|
|
|
def emitCurrentToken(self):
    """Queue self.currentToken and switch back to the data state.

    For tag tokens the name is lower-cased first when
    self.lowercaseElementName is set.  End tags that carry attributes or
    the self-closing flag each queue a ParseError ahead of the token.
    """
    tok = self.currentToken
    if tok["type"] in tagTokenTypes:
        if self.lowercaseElementName:
            tok["name"] = tok["name"].translate(asciiUpper2Lower)
        isEndTag = tok["type"] == tokenTypes["EndTag"]
        if isEndTag:
            if tok["data"]:
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "attributes-in-end-tag"})
            if tok["selfClosing"]:
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "self-closing-flag-on-end-tag"})
    # Add the token to the queue to be yielded.
    self.tokenQueue.append(tok)
    self.state = self.dataState
|
|
|
|
# Below are the various tokenizer states worked out.
|
|
def dataState(self):
    """Data state: read one character and act on it.

    "&" and "<" switch state; U+0000 queues a ParseError and is emitted
    unchanged; a run of space characters and a run of ordinary text are
    each emitted as one token.  Returns False at EOF, True otherwise.
    """
    char = self.stream.char()
    if char == "&":
        self.state = self.entityDataState
        return True
    if char == "<":
        self.state = self.tagOpenState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\u0000"})
        return True
    if char is EOF:
        # Tokenization ends.
        return False
    if char in spaceCharacters:
        # Space characters matter right after a token has been emitted,
        # so they get a token of their own.
        self.tokenQueue.append({
            "type": tokenTypes["SpaceCharacters"],
            "data": char + self.stream.charsUntil(spaceCharacters, True)})
        return True
    # Ordinary text: take everything up to the next special character.
    rest = self.stream.charsUntil(("&", "<", "\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": char + rest})
    return True
|
|
|
|
def entityDataState(self):
    """Handle an "&" seen in the data state.

    The reference is consumed by consumeEntity(), after which the
    tokenizer returns to the data state.  Always returns True.
    """
    self.consumeEntity()
    self.state = self.dataState
    return True
|
|
|
|
def rcdataState(self):
    """RCDATA state: read one character and act on it.

    "&" and "<" switch state; U+0000 queues a ParseError and is replaced
    by U+FFFD; a run of space characters and a run of ordinary text are
    each emitted as one token.  Returns False at EOF, True otherwise.
    """
    char = self.stream.char()
    if char == "&":
        self.state = self.characterReferenceInRcdata
        return True
    if char == "<":
        self.state = self.rcdataLessThanSignState
        return True
    if char == EOF:
        # Tokenization ends.
        return False
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        return True
    if char in spaceCharacters:
        # Space characters matter right after a token has been emitted,
        # so they get a token of their own.
        self.tokenQueue.append({
            "type": tokenTypes["SpaceCharacters"],
            "data": char + self.stream.charsUntil(spaceCharacters, True)})
        return True
    # Ordinary text: take everything up to the next special character.
    rest = self.stream.charsUntil(("&", "<", "\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": char + rest})
    return True
|
|
|
|
def characterReferenceInRcdata(self):
    """Handle an "&" seen in the RCDATA state.

    The reference is consumed by consumeEntity(), after which the
    tokenizer returns to the RCDATA state.  Always returns True.
    """
    self.consumeEntity()
    self.state = self.rcdataState
    return True
|
|
|
|
def rawtextState(self):
    """RAWTEXT state: read one character and act on it.

    "<" switches to the RAWTEXT less-than-sign state; U+0000 queues a
    ParseError and is replaced by U+FFFD; anything else is emitted with
    the text that follows it.  Returns False at EOF, True otherwise.
    """
    char = self.stream.char()
    if char == "<":
        self.state = self.rawtextLessThanSignState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        return True
    if char == EOF:
        # Tokenization ends.
        return False
    rest = self.stream.charsUntil(("<", "\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": char + rest})
    return True
|
|
|
|
def scriptDataState(self):
    """Script data state: read one character and act on it.

    "<" switches to the script data less-than-sign state; U+0000 queues a
    ParseError and is replaced by U+FFFD; anything else is emitted with
    the text that follows it.  Returns False at EOF, True otherwise.
    """
    char = self.stream.char()
    if char == "<":
        self.state = self.scriptDataLessThanSignState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        return True
    if char == EOF:
        # Tokenization ends.
        return False
    rest = self.stream.charsUntil(("<", "\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": char + rest})
    return True
|
|
|
|
def plaintextState(self):
    """PLAINTEXT state: everything up to EOF is character data.

    U+0000 queues a ParseError and is replaced by U+FFFD; any other
    character is emitted together with the text up to the next U+0000.
    Returns False at EOF, True otherwise.
    """
    char = self.stream.char()
    if char == EOF:
        # Tokenization ends.
        return False
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        return True
    self.tokenQueue.append({
        "type": tokenTypes["Characters"],
        "data": char + self.stream.charsUntil("\u0000")})
    return True
|
|
|
|
def tagOpenState(self):
    """Tag open state: decide what follows a "<" read in the data state.

    "!" and "/" switch to the markup declaration and close tag states, an
    ASCII letter starts a new StartTag token, and anything else queues a
    ParseError and is treated as text or a bogus comment.  Always returns
    True.
    """
    char = self.stream.char()
    if char == "!":
        self.state = self.markupDeclarationOpenState
        return True
    if char == "/":
        self.state = self.closeTagOpenState
        return True
    if char in asciiLetters:
        # First letter of a start tag name.
        self.currentToken = {"type": tokenTypes["StartTag"],
                             "name": char, "data": [],
                             "selfClosing": False,
                             "selfClosingAcknowledged": False}
        self.state = self.tagNameState
        return True
    if char == ">":
        # XXX In theory it could be something besides a tag name. But
        # do we really care?
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name-but-got-right-bracket"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
        self.state = self.dataState
        return True
    if char == "?":
        # XXX In theory it could be something besides a tag name. But
        # do we really care?
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name-but-got-question-mark"})
        self.stream.unget(char)
        self.state = self.bogusCommentState
        return True
    # Anything else: the "<" was just text; reprocess the character.
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "expected-tag-name"})
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
    self.stream.unget(char)
    self.state = self.dataState
    return True
|
|
|
|
def closeTagOpenState(self):
    """Close tag open state: decide what follows "</".

    An ASCII letter starts a new EndTag token.  ">" and EOF queue a
    ParseError and return to the data state (EOF also emits the "</" as
    text).  Any other character queues a ParseError and is reprocessed in
    the bogus comment state.  Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.currentToken = {"type": tokenTypes["EndTag"], "name": char,
                             "data": [], "selfClosing": False}
        self.state = self.tagNameState
        return True
    if char == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-right-bracket"})
        self.state = self.dataState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-eof"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
        self.state = self.dataState
        return True
    # XXX data can be _'_...
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "expected-closing-tag-but-got-char",
                            "datavars": {"data": char}})
    self.stream.unget(char)
    self.state = self.bogusCommentState
    return True
|
|
|
|
def tagNameState(self):
    """Tag name state: extend the current tag's name one character at a time.

    A space character, ">" or "/" ends the name.  EOF queues a ParseError
    and returns to the data state.  U+0000 queues a ParseError and is
    stored as U+FFFD.  Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-tag-name"})
        self.state = self.dataState
        return True
    if char == "/":
        self.state = self.selfClosingStartTagState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["name"] += "\uFFFD"
        return True
    # (Don't use charsUntil here, because tag names are
    # very short and it's faster to not do anything fancy)
    self.currentToken["name"] += char
    return True
|
|
|
|
def rcdataLessThanSignState(self):
    """Handle the character after "<" in RCDATA.

    "/" clears the temporary buffer and moves to the RCDATA end tag open
    state; anything else emits the "<" as text and is reprocessed in the
    RCDATA state.  Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.temporaryBuffer = ""
        self.state = self.rcdataEndTagOpenState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
|
|
|
|
def rcdataEndTagOpenState(self):
    """Handle the character after "</" in RCDATA.

    An ASCII letter is added to the temporary buffer and the end tag name
    state takes over; anything else emits "</" as text and is reprocessed
    in the RCDATA state.  Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.rcdataEndTagNameState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
|
|
|
|
def rcdataEndTagNameState(self):
    """Collect a possible end tag name while in RCDATA.

    The buffered name is "appropriate" when it equals, ignoring case, the
    name of self.currentToken.  An appropriate name followed by a space
    character, "/" or ">" becomes a real EndTag token; further ASCII
    letters extend the buffer; anything else emits "</" plus the buffer
    as text and is reprocessed in the RCDATA state.  Always returns True.
    """
    appropriate = (self.currentToken and
                   self.currentToken["name"].lower() == self.temporaryBuffer.lower())
    char = self.stream.char()
    endsName = char in spaceCharacters or char == "/" or char == ">"
    if endsName and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char == ">":
            self.emitCurrentToken()
            self.state = self.dataState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.state = self.beforeAttributeNameState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.rcdataState
    return True
|
|
|
|
def rawtextLessThanSignState(self):
    """Handle the character after "<" in RAWTEXT.

    "/" clears the temporary buffer and moves to the RAWTEXT end tag open
    state; anything else emits the "<" as text and is reprocessed in the
    RAWTEXT state.  Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.temporaryBuffer = ""
        self.state = self.rawtextEndTagOpenState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
|
|
|
|
def rawtextEndTagOpenState(self):
    """Handle the character after "</" in RAWTEXT.

    An ASCII letter is added to the temporary buffer and the end tag name
    state takes over; anything else emits "</" as text and is reprocessed
    in the RAWTEXT state.  Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.rawtextEndTagNameState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
|
|
|
|
def rawtextEndTagNameState(self):
    """Collect a possible end tag name while in RAWTEXT.

    The buffered name is "appropriate" when it equals, ignoring case, the
    name of self.currentToken.  An appropriate name followed by a space
    character, "/" or ">" becomes a real EndTag token; further ASCII
    letters extend the buffer; anything else emits "</" plus the buffer
    as text and is reprocessed in the RAWTEXT state.  Always returns True.
    """
    appropriate = (self.currentToken and
                   self.currentToken["name"].lower() == self.temporaryBuffer.lower())
    char = self.stream.char()
    endsName = char in spaceCharacters or char == "/" or char == ">"
    if endsName and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char == ">":
            self.emitCurrentToken()
            self.state = self.dataState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.state = self.beforeAttributeNameState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.rawtextState
    return True
|
|
|
|
def scriptDataLessThanSignState(self):
    """Handle the character after "<" in script data.

    "/" clears the temporary buffer and moves to the end tag open state;
    "!" emits "<!" and moves to the escape start state; anything else
    emits the "<" as text and is reprocessed in the script data state.
    Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.temporaryBuffer = ""
        self.state = self.scriptDataEndTagOpenState
        return True
    if char == "!":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
        self.state = self.scriptDataEscapeStartState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
|
|
|
def scriptDataEndTagOpenState(self):
    """Handle the character after "</" in script data.

    An ASCII letter is added to the temporary buffer and the end tag name
    state takes over; anything else emits "</" as text and is reprocessed
    in the script data state.  Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.scriptDataEndTagNameState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
|
|
|
def scriptDataEndTagNameState(self):
    """Collect a possible end tag name while in script data.

    The buffered name is "appropriate" when it equals, ignoring case, the
    name of self.currentToken.  An appropriate name followed by a space
    character, "/" or ">" becomes a real EndTag token; further ASCII
    letters extend the buffer; anything else emits "</" plus the buffer
    as text and is reprocessed in the script data state.  Always returns
    True.
    """
    appropriate = (self.currentToken and
                   self.currentToken["name"].lower() == self.temporaryBuffer.lower())
    char = self.stream.char()
    endsName = char in spaceCharacters or char == "/" or char == ">"
    if endsName and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char == ">":
            self.emitCurrentToken()
            self.state = self.dataState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.state = self.beforeAttributeNameState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.scriptDataState
    return True
|
|
|
|
def scriptDataEscapeStartState(self):
    """Handle the character after "<!" in script data.

    "-" is emitted and moves to the escape start dash state; anything
    else is reprocessed in the script data state.  Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataEscapeStartDashState
        return True
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
|
|
|
def scriptDataEscapeStartDashState(self):
    """Handle the character after "<!-" in script data.

    A second "-" is emitted and moves to the escaped dash-dash state;
    anything else is reprocessed in the script data state.  Always
    returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataEscapedDashDashState
        return True
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
|
|
|
def scriptDataEscapedState(self):
    """Script data escaped state: read one character and act on it.

    "-" is emitted and moves to the escaped dash state; "<" moves to the
    escaped less-than-sign state; U+0000 queues a ParseError and is
    replaced by U+FFFD; EOF returns to the data state; other text is
    emitted up to the next "<", "-" or U+0000.  Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataEscapedDashState
        return True
    if char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        return True
    if char == EOF:
        self.state = self.dataState
        return True
    rest = self.stream.charsUntil(("<", "-", "\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": char + rest})
    return True
|
|
|
|
def scriptDataEscapedDashState(self):
    """Handle the character after a single "-" in escaped script data.

    A second "-" is emitted and moves to the dash-dash state; "<" moves
    to the escaped less-than-sign state; EOF returns to the data state;
    U+0000 (as U+FFFD, with a ParseError) and any other character are
    emitted and return to the escaped state.  Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataEscapedDashDashState
        return True
    if char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataEscapedState
        return True
    if char == EOF:
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    self.state = self.scriptDataEscapedState
    return True
|
|
|
|
def scriptDataEscapedDashDashState(self):
    """Handle the character after "--" in escaped script data.

    Further "-" characters are emitted without changing state; "<" moves
    to the escaped less-than-sign state; ">" is emitted and returns to
    the script data state; EOF returns to the data state; U+0000 (as
    U+FFFD, with a ParseError) and any other character are emitted and
    return to the escaped state.  Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        return True
    if char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if char == ">":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
        self.state = self.scriptDataState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataEscapedState
        return True
    if char == EOF:
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    self.state = self.scriptDataEscapedState
    return True
|
|
|
|
def scriptDataEscapedLessThanSignState(self):
    """Handle the character after "<" in escaped script data.

    "/" clears the temporary buffer and moves to the escaped end tag open
    state; an ASCII letter emits "<" plus the letter, starts the buffer
    with it and moves to the double escape start state; anything else
    emits the "<" as text and is reprocessed in the escaped state.
    Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.temporaryBuffer = ""
        self.state = self.scriptDataEscapedEndTagOpenState
        return True
    if char in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "<" + char})
        self.temporaryBuffer = char
        self.state = self.scriptDataDoubleEscapeStartState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
|
|
|
|
def scriptDataEscapedEndTagOpenState(self):
    """Handle the character after "</" in escaped script data.

    An ASCII letter becomes the temporary buffer and the escaped end tag
    name state takes over; anything else emits "</" as text and is
    reprocessed in the escaped state.  Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.temporaryBuffer = char
        self.state = self.scriptDataEscapedEndTagNameState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
|
|
|
|
def scriptDataEscapedEndTagNameState(self):
    """Collect a possible end tag name while in escaped script data.

    The buffered name is "appropriate" when it equals, ignoring case, the
    name of self.currentToken.  An appropriate name followed by a space
    character, "/" or ">" becomes a real EndTag token; further ASCII
    letters extend the buffer; anything else emits "</" plus the buffer
    as text and is reprocessed in the escaped state.  Always returns
    True.
    """
    appropriate = (self.currentToken and
                   self.currentToken["name"].lower() == self.temporaryBuffer.lower())
    char = self.stream.char()
    endsName = char in spaceCharacters or char == "/" or char == ">"
    if endsName and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char == ">":
            self.emitCurrentToken()
            self.state = self.dataState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.state = self.beforeAttributeNameState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.scriptDataEscapedState
    return True
|
|
|
|
def scriptDataDoubleEscapeStartState(self):
    """Decide whether a tag-like run in escaped script data opens a script.

    A space character, "/" or ">" is emitted and ends the name: when the
    temporary buffer spells "script" (ignoring case) the double escaped
    state is entered, otherwise the escaped state.  ASCII letters are
    emitted and added to the buffer.  Anything else is reprocessed in the
    escaped state.  Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters or char in ("/", ">"):
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
        isScript = self.temporaryBuffer.lower() == "script"
        self.state = (self.scriptDataDoubleEscapedState if isScript
                      else self.scriptDataEscapedState)
        return True
    if char in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
        self.temporaryBuffer += char
        return True
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
|
|
|
|
def scriptDataDoubleEscapedState(self):
    """Script data double escaped state: read one character and act on it.

    "-" and "<" are emitted and move to the dash and less-than-sign
    states; U+0000 queues a ParseError and is replaced by U+FFFD; EOF
    queues a ParseError and returns to the data state; any other
    character is emitted as-is.  Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataDoubleEscapedDashState
        return True
    if char == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        return True
    if char == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    return True
|
|
|
|
def scriptDataDoubleEscapedDashState(self):
    """Handle the character after a single "-" in double escaped script data.

    A second "-" is emitted and moves to the dash-dash state; "<" is
    emitted and moves to the less-than-sign state; EOF queues a
    ParseError and returns to the data state; U+0000 (as U+FFFD, with a
    ParseError) and any other character are emitted and return to the
    double escaped state.  Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataDoubleEscapedDashDashState
        return True
    if char == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataDoubleEscapedState
        return True
    if char == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    self.state = self.scriptDataDoubleEscapedState
    return True
|
|
|
|
def scriptDataDoubleEscapedDashDashState(self):
    """Handle the character after "--" in double escaped script data.

    Further "-" characters are emitted without changing state; "<" is
    emitted and moves to the less-than-sign state; ">" is emitted and
    returns to the script data state; EOF queues a ParseError and returns
    to the data state; U+0000 (as U+FFFD, with a ParseError) and any
    other character are emitted and return to the double escaped state.
    Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        return True
    if char == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
        return True
    if char == ">":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
        self.state = self.scriptDataState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataDoubleEscapedState
        return True
    if char == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    self.state = self.scriptDataDoubleEscapedState
    return True
|
|
|
|
def scriptDataDoubleEscapedLessThanSignState(self):
    """Handle the character after "<" in double escaped script data.

    "/" is emitted, the temporary buffer is cleared and the double escape
    end state takes over; anything else is reprocessed in the double
    escaped state.  Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
        self.temporaryBuffer = ""
        self.state = self.scriptDataDoubleEscapeEndState
        return True
    self.stream.unget(char)
    self.state = self.scriptDataDoubleEscapedState
    return True
|
|
|
|
def scriptDataDoubleEscapeEndState(self):
    """Decide whether a "</..." run in double escaped script data ends it.

    A space character, "/" or ">" is emitted and ends the name: when the
    temporary buffer spells "script" (ignoring case) the tokenizer drops
    back to the escaped state, otherwise it stays double escaped.  ASCII
    letters are emitted and added to the buffer.  Anything else is
    reprocessed in the double escaped state.  Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters or char in ("/", ">"):
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
        isScript = self.temporaryBuffer.lower() == "script"
        self.state = (self.scriptDataEscapedState if isScript
                      else self.scriptDataDoubleEscapedState)
        return True
    if char in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
        self.temporaryBuffer += char
        return True
    self.stream.unget(char)
    self.state = self.scriptDataDoubleEscapedState
    return True
|
|
|
|
def beforeAttributeNameState(self):
    """Before attribute name state: find the start of the next attribute.

    Space characters are skipped.  ">" emits the current token and "/"
    moves to the self-closing start tag state.  EOF queues a ParseError
    and returns to the data state.  Every other character starts a new
    [name, value] pair on the current token; quotes, "=" and "<" queue a
    ParseError first, and U+0000 queues a ParseError and is stored as
    U+FFFD.  Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        # Skip the rest of the whitespace run.
        self.stream.charsUntil(spaceCharacters, True)
        return True
    if char in asciiLetters:
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char == "/":
        self.state = self.selfClosingStartTagState
        return True
    if char in ("'", '"', "=", "<"):
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "invalid-character-in-attribute-name"})
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
        return True
    if char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"].append(["\uFFFD", ""])
        self.state = self.attributeNameState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-attribute-name-but-got-eof"})
        self.state = self.dataState
        return True
    self.currentToken["data"].append([char, ""])
    self.state = self.attributeNameState
    return True
|
|
|
|
def attributeNameState(self):
    """Accumulate the name of the attribute currently being built.

    Characters are appended to the name of the last ``[name, value]`` pair
    on the current token until "=", ">", whitespace, "/" or EOF ends the
    name.  U+0000 is stored as U+FFFD and quotes / "<" are kept verbatim,
    each with a parse error.

    When the name ends it is lowercased if ``self.lowercaseAttrName`` is
    set, and a "duplicate-attribute" parse error is queued if an earlier
    attribute on the token has the same name.  For ">" the token is
    emitted only after that check.

    Always returns True.
    """
    char = self.stream.char()
    nameComplete = True
    closeTag = False

    if char == "=":
        self.state = self.beforeAttributeValueState
    elif char in asciiLetters:
        pair = self.currentToken["data"][-1]
        pair[0] += char + self.stream.charsUntil(asciiLetters, True)
        nameComplete = False
    elif char == ">":
        # Emission is postponed until after the duplicate check below:
        # emitting first would turn the attribute list into a dict and
        # the list-based check would then fail.
        closeTag = True
    elif char in spaceCharacters:
        self.state = self.afterAttributeNameState
    elif char == "/":
        self.state = self.selfClosingStartTagState
    elif char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["data"][-1][0] += "\uFFFD"
        nameComplete = False
    elif char in ("'", '"', "<"):
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-character-in-attribute-name",
        })
        self.currentToken["data"][-1][0] += char
        nameComplete = False
    elif char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-attribute-name",
        })
        self.state = self.dataState
    else:
        self.currentToken["data"][-1][0] += char
        nameComplete = False

    if not nameComplete:
        return True

    # Duplicates are only reported here, not dropped; that happens when
    # the start tag token is emitted, so values may still be appended to
    # this attribute safely in the meantime.
    pair = self.currentToken["data"][-1]
    if self.lowercaseAttrName:
        pair[0] = pair[0].translate(asciiUpper2Lower)
    finishedName = pair[0]
    if any(finishedName == name
           for name, _ in self.currentToken["data"][:-1]):
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "duplicate-attribute",
        })
    if closeTag:
        self.emitCurrentToken()
    return True
|
|
|
|
def afterAttributeNameState(self):
    """Decide what follows a completed attribute name.

    Whitespace is skipped, "=" moves on to the attribute value, ">" emits
    the current token and "/" switches to the self-closing start tag
    state.  EOF queues a parse error and returns to the data state.  Any
    other character begins a new ``[name, value]`` pair and re-enters the
    attribute name state; quotes and "<" also queue a parse error, and
    U+0000 queues a parse error and is stored as U+FFFD.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
        return True
    if char == "=":
        self.state = self.beforeAttributeValueState
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char in asciiLetters:
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
        return True
    if char == "/":
        self.state = self.selfClosingStartTagState
        return True

    if char == "\u0000":
        problem, nameStart = "invalid-codepoint", "\uFFFD"
    elif char in ("'", '"', "<"):
        problem, nameStart = "invalid-character-after-attribute-name", char
    elif char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "expected-end-of-tag-but-got-eof",
        })
        self.state = self.dataState
        return True
    else:
        problem, nameStart = None, char

    if problem is not None:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": problem,
        })
    self.currentToken["data"].append([nameStart, ""])
    self.state = self.attributeNameState
    return True
|
|
|
|
def beforeAttributeValueState(self):
    """Find out how the value of the current attribute is delimited.

    Whitespace is skipped.  A double or single quote selects the matching
    quoted-value state.  "&" is pushed back and handled by the unquoted
    value state.  ">" queues a parse error and emits the current token.
    EOF queues a parse error and returns to the data state.  Any other
    character becomes the first character of an unquoted value; U+0000
    (stored as U+FFFD) and "=", "<", "`" also queue a parse error.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
        return True
    if char == "\"":
        self.state = self.attributeValueDoubleQuotedState
        return True
    if char == "&":
        self.state = self.attributeValueUnQuotedState
        self.stream.unget(char)
        return True
    if char == "'":
        self.state = self.attributeValueSingleQuotedState
        return True
    if char == ">":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "expected-attribute-value-but-got-right-bracket",
        })
        self.emitCurrentToken()
        return True

    if char == "\u0000":
        problem, valueStart = "invalid-codepoint", "\uFFFD"
    elif char in ("=", "<", "`"):
        problem, valueStart = "equals-in-unquoted-attribute-value", char
    elif char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "expected-attribute-value-but-got-eof",
        })
        self.state = self.dataState
        return True
    else:
        problem, valueStart = None, char

    if problem is not None:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": problem,
        })
    self.currentToken["data"][-1][1] += valueStart
    self.state = self.attributeValueUnQuotedState
    return True
|
|
|
|
def attributeValueDoubleQuotedState(self):
    """Read the body of a double-quoted attribute value.

    The closing quote moves to the after-attribute-value state and "&" is
    delegated to ``processEntityInAttribute``.  U+0000 queues a parse
    error and is stored as U+FFFD.  EOF queues a parse error and returns
    to the data state.  Any other character is appended to the value of
    the last attribute together with the following run of characters up
    to the next quote, "&" or U+0000.

    Always returns True.
    """
    char = self.stream.char()

    if char == "\"":
        self.state = self.afterAttributeValueState
        return True
    if char == "&":
        self.processEntityInAttribute('"')
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["data"][-1][1] += "\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-attribute-value-double-quote",
        })
        self.state = self.dataState
        return True

    pair = self.currentToken["data"][-1]
    pair[1] += char + self.stream.charsUntil(("\"", "&", "\u0000"))
    return True
|
|
|
|
def attributeValueSingleQuotedState(self):
    """Read the body of a single-quoted attribute value.

    The closing apostrophe moves to the after-attribute-value state and
    "&" is delegated to ``processEntityInAttribute``.  U+0000 queues a
    parse error and is stored as U+FFFD.  EOF queues a parse error and
    returns to the data state.  Any other character is appended to the
    value of the last attribute together with the following run of
    characters up to the next apostrophe, "&" or U+0000.

    Always returns True.
    """
    char = self.stream.char()

    if char == "'":
        self.state = self.afterAttributeValueState
        return True
    if char == "&":
        self.processEntityInAttribute("'")
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["data"][-1][1] += "\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-attribute-value-single-quote",
        })
        self.state = self.dataState
        return True

    pair = self.currentToken["data"][-1]
    pair[1] += char + self.stream.charsUntil(("'", "&", "\u0000"))
    return True
|
|
|
|
def attributeValueUnQuotedState(self):
    """Read the body of an unquoted attribute value.

    Whitespace ends the value and returns to the before-attribute-name
    state; ">" emits the current token; "&" is delegated to
    ``processEntityInAttribute``.  Quotes, "=", "<" and "`" are kept in
    the value but queue a parse error; U+0000 queues a parse error and is
    stored as U+FFFD.  EOF queues a parse error and returns to the data
    state.  Any other character is appended along with the following run
    of characters up to the next character that needs special handling.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if char == "&":
        self.processEntityInAttribute(">")
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char in ('"', "'", "=", "<", "`"):
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "unexpected-character-in-unquoted-attribute-value",
        })
        self.currentToken["data"][-1][1] += char
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["data"][-1][1] += "\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-attribute-value-no-quotes",
        })
        self.state = self.dataState
        return True

    pair = self.currentToken["data"][-1]
    pair[1] += char + self.stream.charsUntil(
        frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
    return True
|
|
|
|
def afterAttributeValueState(self):
    """Handle the character right after a quoted attribute value.

    Whitespace returns to the before-attribute-name state, ">" emits the
    current token and "/" moves to the self-closing start tag state.
    Anything else queues a parse error and is pushed back onto the
    stream: EOF is then handled by the data state, any other character by
    the before-attribute-name state.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char == "/":
        self.state = self.selfClosingStartTagState
        return True

    if char is EOF:
        problem = "unexpected-EOF-after-attribute-value"
        followUp = self.dataState
    else:
        problem = "unexpected-character-after-attribute-value"
        followUp = self.beforeAttributeNameState
    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": problem,
    })
    self.stream.unget(char)
    self.state = followUp
    return True
|
|
|
|
def selfClosingStartTagState(self):
    """Handle the character following "/" inside a tag.

    ">" marks the current token as self-closing and emits it.  Anything
    else queues a parse error and is pushed back onto the stream: EOF is
    then handled by the data state, any other character by the
    before-attribute-name state.

    Always returns True.
    """
    char = self.stream.char()

    if char == ">":
        self.currentToken["selfClosing"] = True
        self.emitCurrentToken()
        return True

    if char is EOF:
        problem = "unexpected-EOF-after-solidus-in-tag"
        followUp = self.dataState
    else:
        problem = "unexpected-character-after-solidus-in-tag"
        followUp = self.beforeAttributeNameState
    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": problem,
    })
    self.stream.unget(char)
    self.state = followUp
    return True
|
|
|
|
def bogusCommentState(self):
    """Turn everything up to the next ">" (or EOF) into a Comment token.

    U+0000 characters in the collected text are replaced by U+FFFD.  The
    terminating ">" (or EOF) is consumed and the tokenizer returns to the
    data state.

    Always returns True.
    """
    # charsUntil stops by itself at EOF, so no separate EOF check is
    # needed while collecting the comment text.
    text = self.stream.charsUntil(">").replace("\u0000", "\uFFFD")
    self.tokenQueue.append({
        "type": tokenTypes["Comment"],
        "data": text,
    })

    # Discard the terminator that stopped the scan: ">" or EOF.
    self.stream.char()
    self.state = self.dataState
    return True
|
|
|
|
def markupDeclarationOpenState(self):
    """Recognise the start of a comment, DOCTYPE or CDATA section.

    * "--" creates an empty Comment token and enters the comment start
      state.
    * "doctype" (any letter case) creates a Doctype token and enters the
      doctype state.
    * "[CDATA[" enters the CDATA section state, but only when a parser is
      attached and its innermost open element is not in the tree's
      default namespace.

    If none of these match, a parse error is queued, every character read
    here is pushed back onto the stream and the bogus comment state takes
    over.

    Always returns True.
    """
    consumed = [self.stream.char()]
    first = consumed[0]

    if first == "-":
        consumed.append(self.stream.char())
        if consumed[-1] == "-":
            self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
            self.state = self.commentStartState
            return True
    elif first in ('d', 'D'):
        for lower, upper in zip("octype", "OCTYPE"):
            consumed.append(self.stream.char())
            if consumed[-1] not in (lower, upper):
                break
        else:
            self.currentToken = {
                "type": tokenTypes["Doctype"],
                "name": "",
                "publicId": None,
                "systemId": None,
                "correct": True,
            }
            self.state = self.doctypeState
            return True
    elif (first == "[" and
          self.parser is not None and
          self.parser.tree.openElements and
          self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
        for wanted in "CDATA[":
            consumed.append(self.stream.char())
            if consumed[-1] != wanted:
                break
        else:
            self.state = self.cdataSectionState
            return True

    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": "expected-dashes-or-doctype",
    })

    # Push the characters back last-read-first so the stream yields them
    # again in their original order.
    for char in reversed(consumed):
        self.stream.unget(char)
    self.state = self.bogusCommentState
    return True
|
|
|
|
def commentStartState(self):
    """Handle the first character after a comment has been opened.

    "-" moves to the comment start dash state.  ">" and EOF each queue a
    parse error, emit the comment token as it stands and return to the
    data state.  Every other character is added to the comment data and
    the tokenizer continues in the comment state; U+0000 is reported as a
    parse error and stored as U+FFFD.

    Always returns True.
    """
    char = self.stream.char()

    if char == "-":
        self.state = self.commentStartDashState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "\uFFFD"
        # The replaced NUL is ordinary comment content, so leave the start
        # state exactly as any other character does.  Staying here would
        # let a following ">" be mistaken for an abruptly closed comment.
        self.state = self.commentState
    elif char == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "incorrect-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += char
        self.state = self.commentState
    return True
|
|
|
|
def commentStartDashState(self):
    """Handle the character after a single "-" at the start of a comment.

    A second "-" moves to the comment end state.  ">" and EOF each queue a
    parse error, emit the comment token as it stands and return to the
    data state.  Every other character is appended to the comment data
    after the pending "-" and the tokenizer continues in the comment
    state; U+0000 is reported as a parse error and stored as U+FFFD.

    Always returns True.
    """
    char = self.stream.char()

    if char == "-":
        self.state = self.commentEndState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "-\uFFFD"
        # The pending dash and the replaced NUL are now comment content,
        # so continue in the comment state like the generic branch does.
        # Staying here would let a following ">" or "-" be interpreted as
        # if the comment were still at its very beginning.
        self.state = self.commentState
    elif char == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "incorrect-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += "-" + char
        self.state = self.commentState
    return True
|
|
|
|
def commentState(self):
    """Accumulate the text of a comment.

    "-" moves to the comment end dash state.  U+0000 queues a parse error
    and is stored as U+FFFD.  EOF queues a parse error, emits the comment
    token and returns to the data state.  Any other character is appended
    to the comment data together with the following run of characters up
    to the next "-" or U+0000.

    Always returns True.
    """
    char = self.stream.char()

    if char == "-":
        self.state = self.commentEndDashState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["data"] += "\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-comment",
        })
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.currentToken["data"] += char + self.stream.charsUntil(("-", "\u0000"))
    return True
|
|
|
|
def commentEndDashState(self):
    """Handle the character after a single "-" inside a comment.

    A second "-" moves to the comment end state.  EOF queues a parse
    error, emits the comment token and returns to the data state.  Any
    other character is appended to the comment data after the pending
    "-" and the tokenizer goes back to the comment state; U+0000 queues a
    parse error and is stored as U+FFFD.

    Always returns True.
    """
    char = self.stream.char()

    if char == "-":
        self.state = self.commentEndState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["data"] += "-\uFFFD"
        self.state = self.commentState
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-comment-end-dash",
        })
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.currentToken["data"] += "-" + char
    self.state = self.commentState
    return True
|
|
|
|
def commentEndState(self):
    """Handle the character after "--" inside a comment.

    ">" emits the comment token and returns to the data state.  "!" queues
    a parse error and moves to the comment end bang state.  A further "-"
    queues a parse error and is appended to the comment data.  EOF queues
    a parse error, emits the token and returns to the data state.  Any
    other character queues a parse error, is appended after the pending
    "--" and sends the tokenizer back to the comment state; U+0000 is
    stored as U+FFFD.

    Always returns True.
    """
    char = self.stream.char()

    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["data"] += "--\uFFFD"
        self.state = self.commentState
        return True
    if char == "!":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "unexpected-bang-after-double-dash-in-comment",
        })
        self.state = self.commentEndBangState
        return True
    if char == "-":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "unexpected-dash-after-double-dash-in-comment",
        })
        self.currentToken["data"] += char
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-comment-double-dash",
        })
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": "unexpected-char-in-comment",
    })
    self.currentToken["data"] += "--" + char
    self.state = self.commentState
    return True
|
|
|
|
def commentEndBangState(self):
    """Handle the character after "--!" inside a comment.

    ">" emits the comment token and returns to the data state.  "-"
    appends "--!" to the comment data and moves to the comment end dash
    state.  EOF queues a parse error, emits the token and returns to the
    data state.  Any other character is appended after "--!" and sends
    the tokenizer back to the comment state; U+0000 queues a parse error
    and is stored as U+FFFD.

    Always returns True.
    """
    char = self.stream.char()

    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == "-":
        self.currentToken["data"] += "--!"
        self.state = self.commentEndDashState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["data"] += "--!\uFFFD"
        self.state = self.commentState
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-comment-end-bang-state",
        })
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.currentToken["data"] += "--!" + char
    self.state = self.commentState
    return True
|
|
|
|
def doctypeState(self):
    """Handle the character right after the DOCTYPE keyword.

    Whitespace moves to the before-doctype-name state.  EOF queues a parse
    error, marks the token as not correct, emits it and returns to the
    data state.  Any other character queues a parse error and is pushed
    back so the before-doctype-name state can process it.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeDoctypeNameState
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "expected-doctype-name-but-got-eof",
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": "need-space-after-doctype",
    })
    self.stream.unget(char)
    self.state = self.beforeDoctypeNameState
    return True
|
|
|
|
def beforeDoctypeNameState(self):
    """Find the first character of the DOCTYPE name.

    Whitespace is ignored.  ">" and EOF each queue a parse error, mark the
    token as not correct, emit it and return to the data state.  Any
    other character becomes the start of the name and the tokenizer moves
    to the doctype name state; U+0000 queues a parse error and is stored
    as U+FFFD.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        return True

    if char == ">":
        problem = "expected-doctype-name-but-got-right-bracket"
    elif char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["name"] = "\uFFFD"
        self.state = self.doctypeNameState
        return True
    elif char is EOF:
        problem = "expected-doctype-name-but-got-eof"
    else:
        self.currentToken["name"] = char
        self.state = self.doctypeNameState
        return True

    # Only ">" and EOF reach this point: the DOCTYPE ends without a name.
    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": problem,
    })
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
|
|
|
|
def doctypeNameState(self):
    """Accumulate the DOCTYPE name.

    Whitespace ends the name and moves to the after-doctype-name state;
    ">" ends the name, emits the token and returns to the data state.
    EOF queues a parse error, marks the token as not correct, emits it
    and returns to the data state.  In all three cases the name is
    translated through ``asciiUpper2Lower`` first.  U+0000 queues a parse
    error and is stored as U+FFFD; any other character is appended to the
    name.

    Always returns True.
    """
    char = self.stream.char()
    token = self.currentToken

    if char in spaceCharacters:
        token["name"] = token["name"].translate(asciiUpper2Lower)
        self.state = self.afterDoctypeNameState
        return True
    if char == ">":
        token["name"] = token["name"].translate(asciiUpper2Lower)
        self.tokenQueue.append(token)
        self.state = self.dataState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        token["name"] += "\uFFFD"
        self.state = self.doctypeNameState
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-doctype-name",
        })
        token["correct"] = False
        token["name"] = token["name"].translate(asciiUpper2Lower)
        self.tokenQueue.append(token)
        self.state = self.dataState
        return True

    token["name"] += char
    return True
|
|
|
|
def afterDoctypeNameState(self):
    """Handle what follows the DOCTYPE name.

    Whitespace is ignored and ">" emits the token and returns to the data
    state.  EOF marks the token as not correct, queues a parse error,
    emits the token and returns to the data state.  The keywords "public"
    and "system" (any letter case) move to their respective
    after-keyword states.  Anything else pushes the offending character
    back, queues a parse error, marks the token as not correct and enters
    the bogus doctype state.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        return True
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        self.currentToken["correct"] = False
        self.stream.unget(char)
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-doctype",
        })
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # Pick the keyword (if any) that the first letter could be starting.
    if char in ("p", "P"):
        remainder = (("u", "U"), ("b", "B"), ("l", "L"),
                     ("i", "I"), ("c", "C"))
        keywordState = self.afterDoctypePublicKeywordState
    elif char in ("s", "S"):
        remainder = (("y", "Y"), ("s", "S"), ("t", "T"),
                     ("e", "E"), ("m", "M"))
        keywordState = self.afterDoctypeSystemKeywordState
    else:
        remainder = None
        keywordState = None

    if remainder is not None:
        for options in remainder:
            char = self.stream.char()
            if char not in options:
                break
        else:
            self.state = keywordState
            return True

    # Characters consumed before the current one were all keyword
    # letters, which are meaningless inside a bogus doctype and can be
    # dropped.  Only the most recent character may be ">" or EOF, so it
    # alone is pushed back.
    self.stream.unget(char)
    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": "expected-space-or-right-bracket-in-doctype",
        "datavars": {"data": char},
    })
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
|
|
|
|
def afterDoctypePublicKeywordState(self):
    """Handle the character right after the PUBLIC keyword.

    Whitespace moves to the before-public-identifier state.  EOF queues a
    parse error, marks the token as not correct, emits it and returns to
    the data state.  Any other character is pushed back for the
    before-public-identifier state to process; a quote additionally
    queues a parse error first.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeDoctypePublicIdentifierState
        return True

    if char in ("'", '"'):
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "unexpected-char-in-doctype",
        })
    elif char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-doctype",
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.stream.unget(char)
    self.state = self.beforeDoctypePublicIdentifierState
    return True
|
|
|
|
def beforeDoctypePublicIdentifierState(self):
    """Find the opening quote of the DOCTYPE public identifier.

    Whitespace is ignored.  A double or single quote resets ``publicId``
    to the empty string and enters the matching quoted state.  Everything
    else queues a parse error and marks the token as not correct: ">" and
    EOF then emit the token and return to the data state, any other
    character enters the bogus doctype state.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        return True
    if char == "\"":
        self.currentToken["publicId"] = ""
        self.state = self.doctypePublicIdentifierDoubleQuotedState
        return True
    if char == "'":
        self.currentToken["publicId"] = ""
        self.state = self.doctypePublicIdentifierSingleQuotedState
        return True

    if char == ">":
        problem, emitNow = "unexpected-end-of-doctype", True
    elif char is EOF:
        problem, emitNow = "eof-in-doctype", True
    else:
        problem, emitNow = "unexpected-char-in-doctype", False

    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": problem,
    })
    self.currentToken["correct"] = False
    if emitNow:
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.state = self.bogusDoctypeState
    return True
|
|
|
|
def doctypePublicIdentifierDoubleQuotedState(self):
    """Read a double-quoted DOCTYPE public identifier.

    The closing quote moves to the after-public-identifier state.  U+0000
    queues a parse error and is stored as U+FFFD.  ">" and EOF each queue
    a parse error, mark the token as not correct, emit it and return to
    the data state.  Any other character is appended to ``publicId``.

    Always returns True.
    """
    char = self.stream.char()

    if char == "\"":
        self.state = self.afterDoctypePublicIdentifierState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["publicId"] += "\uFFFD"
        return True
    if char == ">" or char is EOF:
        problem = ("unexpected-end-of-doctype" if char == ">"
                   else "eof-in-doctype")
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": problem,
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.currentToken["publicId"] += char
    return True
|
|
|
|
def doctypePublicIdentifierSingleQuotedState(self):
    """Read a single-quoted DOCTYPE public identifier.

    The closing apostrophe moves to the after-public-identifier state.
    U+0000 queues a parse error and is stored as U+FFFD.  ">" and EOF each
    queue a parse error, mark the token as not correct, emit it and
    return to the data state.  Any other character is appended to
    ``publicId``.

    Always returns True.
    """
    char = self.stream.char()

    if char == "'":
        self.state = self.afterDoctypePublicIdentifierState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["publicId"] += "\uFFFD"
        return True
    if char == ">" or char is EOF:
        problem = ("unexpected-end-of-doctype" if char == ">"
                   else "eof-in-doctype")
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": problem,
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.currentToken["publicId"] += char
    return True
|
|
|
|
def afterDoctypePublicIdentifierState(self):
    """Handle the character right after the DOCTYPE public identifier.

    Whitespace moves to the between-public-and-system-identifiers state
    and ">" emits the token and returns to the data state.  A quote
    queues a parse error, resets ``systemId`` to the empty string and
    enters the matching quoted system identifier state.  EOF queues a
    parse error, marks the token as not correct, emits it and returns to
    the data state.  Any other character queues a parse error, marks the
    token as not correct and enters the bogus doctype state.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        return True
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == '"':
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "unexpected-char-in-doctype",
        })
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
        return True
    if char == "'":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "unexpected-char-in-doctype",
        })
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-doctype",
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": "unexpected-char-in-doctype",
    })
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
|
|
|
|
def betweenDoctypePublicAndSystemIdentifiersState(self):
    """Handle input between the DOCTYPE public and system identifiers.

    Whitespace is ignored and ">" emits the token and returns to the data
    state.  A double or single quote resets ``systemId`` to the empty
    string and enters the matching quoted system identifier state.  EOF
    queues a parse error, marks the token as not correct, emits it and
    returns to the data state.  Any other character queues a parse error,
    marks the token as not correct and enters the bogus doctype state.

    Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data == '"':
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif data == "'":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    # EOF is a sentinel and is compared by identity, as the other
    # tokenizer states do, rather than with "==".
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
|
|
|
|
def afterDoctypeSystemKeywordState(self):
    """Handle the character right after the SYSTEM keyword.

    Whitespace moves to the before-system-identifier state.  EOF queues a
    parse error, marks the token as not correct, emits it and returns to
    the data state.  Any other character is pushed back for the
    before-system-identifier state to process; a quote additionally
    queues a parse error first.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeDoctypeSystemIdentifierState
        return True

    if char in ("'", '"'):
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "unexpected-char-in-doctype",
        })
    elif char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-doctype",
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.stream.unget(char)
    self.state = self.beforeDoctypeSystemIdentifierState
    return True
|
|
|
|
def beforeDoctypeSystemIdentifierState(self):
    """Find the opening quote of the DOCTYPE system identifier.

    Whitespace is ignored.  A double or single quote resets ``systemId``
    to the empty string and enters the matching quoted state.  Everything
    else queues a parse error and marks the token as not correct: ">" and
    EOF then emit the token and return to the data state, any other
    character enters the bogus doctype state.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        return True
    if char == "\"":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
        return True
    if char == "'":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
        return True

    if char == ">":
        problem, emitNow = "unexpected-char-in-doctype", True
    elif char is EOF:
        problem, emitNow = "eof-in-doctype", True
    else:
        problem, emitNow = "unexpected-char-in-doctype", False

    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": problem,
    })
    self.currentToken["correct"] = False
    if emitNow:
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.state = self.bogusDoctypeState
    return True
|
|
|
|
def doctypeSystemIdentifierDoubleQuotedState(self):
    """Read a double-quoted DOCTYPE system identifier.

    The closing quote moves to the after-system-identifier state.  U+0000
    queues a parse error and is stored as U+FFFD.  ">" and EOF each queue
    a parse error, mark the token as not correct, emit it and return to
    the data state.  Any other character is appended to ``systemId``.

    Always returns True.
    """
    char = self.stream.char()

    if char == "\"":
        self.state = self.afterDoctypeSystemIdentifierState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["systemId"] += "\uFFFD"
        return True
    if char == ">" or char is EOF:
        problem = ("unexpected-end-of-doctype" if char == ">"
                   else "eof-in-doctype")
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": problem,
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.currentToken["systemId"] += char
    return True
|
|
|
|
def doctypeSystemIdentifierSingleQuotedState(self):
    """Read a single-quoted DOCTYPE system identifier.

    The closing apostrophe moves to the after-system-identifier state.
    U+0000 queues a parse error and is stored as U+FFFD.  ">" and EOF each
    queue a parse error, mark the token as not correct, emit it and
    return to the data state.  Any other character is appended to
    ``systemId``.

    Always returns True.
    """
    char = self.stream.char()

    if char == "'":
        self.state = self.afterDoctypeSystemIdentifierState
        return True
    if char == "\u0000":
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "invalid-codepoint",
        })
        self.currentToken["systemId"] += "\uFFFD"
        return True
    if char == ">" or char is EOF:
        problem = ("unexpected-end-of-doctype" if char == ">"
                   else "eof-in-doctype")
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": problem,
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.currentToken["systemId"] += char
    return True
|
|
|
|
def afterDoctypeSystemIdentifierState(self):
    """Handle input after the DOCTYPE system identifier.

    Whitespace is ignored and ">" emits the token and returns to the data
    state.  EOF queues a parse error, marks the token as not correct,
    emits it and returns to the data state.  Any other character queues a
    parse error and enters the bogus doctype state; the token's
    ``correct`` flag is left untouched in that case.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        return True
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        self.tokenQueue.append({
            "type": tokenTypes["ParseError"],
            "data": "eof-in-doctype",
        })
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.tokenQueue.append({
        "type": tokenTypes["ParseError"],
        "data": "unexpected-char-in-doctype",
    })
    self.state = self.bogusDoctypeState
    return True
|
|
|
|
def bogusDoctypeState(self):
    """Skip the remainder of a malformed DOCTYPE.

    Characters are discarded one per call until ">" or EOF is read; the
    current token is then emitted and the tokenizer returns to the data
    state.  EOF is pushed back onto the stream before the token is
    emitted.

    Always returns True.
    """
    char = self.stream.char()

    if char != ">" and char is not EOF:
        # Still inside the bogus doctype: drop the character.
        return True

    if char is EOF:
        self.stream.unget(char)
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
|
|
|
|
def cdataSectionState(self):
    """Consume a CDATA section and queue its text as one Characters token.

    Input is collected until a ">" that is directly preceded by "]]" is
    read, or until EOF; the trailing "]]" and the ">" are not part of the
    text.  Each U+0000 in the text queues one "invalid-codepoint" parse
    error and is replaced by U+FFFD.  No Characters token is queued when
    the text is empty.  The tokenizer then returns to the data state.

    Always returns True.
    """
    pieces = []
    while True:
        pieces.append(self.stream.charsUntil("]"))
        pieces.append(self.stream.charsUntil(">"))
        nextChar = self.stream.char()
        # EOF is a sentinel and is compared by identity, as the other
        # tokenizer states do, rather than with "==".
        if nextChar is EOF:
            break
        assert nextChar == ">"
        if pieces[-1][-2:] == "]]":
            # Found the "]]>" terminator: drop the brackets and stop.
            pieces[-1] = pieces[-1][:-2]
            break
        # A lone ">" is ordinary CDATA content; keep scanning.
        pieces.append(nextChar)

    text = "".join(pieces)

    # Deal with null here rather than in the parser.
    nullCount = text.count("\u0000")
    if nullCount:
        # One distinct parse-error token per NUL that was found.
        self.tokenQueue.extend(
            {"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}
            for _ in range(nullCount))
        text = text.replace("\u0000", "\uFFFD")
    if text:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": text})
    self.state = self.dataState
    return True
|