update Shared
This commit is contained in:
parent
e7ebbedd38
commit
6881f3471a
184 changed files with 13080 additions and 13691 deletions
|
|
@ -28,7 +28,18 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
|||
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
||||
|
||||
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
||||
|
||||
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
|
||||
|
||||
if utils.supports_lone_surrogates:
|
||||
# Use one extra step of indirection and create surrogates with
|
||||
# unichr. Not using this indirection would introduce an illegal
|
||||
# unicode literal on platforms not supporting such lone
|
||||
# surrogates.
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
|
||||
eval('"\\uD800-\\uDFFF"'))
|
||||
else:
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
|
||||
|
||||
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||
|
|
@ -164,13 +175,18 @@ class HTMLUnicodeInputStream(object):
|
|||
|
||||
"""
|
||||
|
||||
# Craziness
|
||||
if len("\U0010FFFF") == 1:
|
||||
if not utils.supports_lone_surrogates:
|
||||
# Such platforms will have already checked for such
|
||||
# surrogate errors, so no need to do this checking.
|
||||
self.reportCharacterErrors = None
|
||||
self.replaceCharactersRegexp = None
|
||||
elif len("\U0010FFFF") == 1:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
|
||||
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
|
||||
else:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
||||
self.replaceCharactersRegexp = re.compile(
|
||||
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
|
||||
|
||||
# List of where new lines occur
|
||||
self.newLines = [0]
|
||||
|
|
@ -265,11 +281,12 @@ class HTMLUnicodeInputStream(object):
|
|||
self._bufferedCharacter = data[-1]
|
||||
data = data[:-1]
|
||||
|
||||
self.reportCharacterErrors(data)
|
||||
if self.reportCharacterErrors:
|
||||
self.reportCharacterErrors(data)
|
||||
|
||||
# Replace invalid characters
|
||||
# Note U+0000 is dealt with in the tokenizer
|
||||
data = self.replaceCharactersRegexp.sub("\ufffd", data)
|
||||
# Replace invalid characters
|
||||
# Note U+0000 is dealt with in the tokenizer
|
||||
data = self.replaceCharactersRegexp.sub("\ufffd", data)
|
||||
|
||||
data = data.replace("\r\n", "\n")
|
||||
data = data.replace("\r", "\n")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue