run update

This commit is contained in:
j 2018-12-15 01:08:54 +01:00
commit 6806bebb7c
607 changed files with 52543 additions and 31832 deletions

View file

@ -1,14 +1,23 @@
"""
HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that
HTML parsing library based on the `WHATWG HTML specification
<https://whatwg.org/html>`_. The parser is designed to be compatible with
existing HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.
Example usage:
Example usage::
import html5lib
f = open("my_document.html")
tree = html5lib.parse(f)
import html5lib
with open("my_document.html", "rb") as f:
tree = html5lib.parse(f)
For convenience, this module re-exports the following names:
* :func:`~.html5parser.parse`
* :func:`~.html5parser.parseFragment`
* :class:`~.html5parser.HTMLParser`
* :func:`~.treebuilders.getTreeBuilder`
* :func:`~.treewalkers.getTreeWalker`
* :func:`~.serializer.serialize`
"""
from __future__ import absolute_import, division, unicode_literals
@ -22,4 +31,5 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
# this has to be at the top level, see how setup.py parses this
__version__ = "0.9999999"
#: Distribution version number.
__version__ = "1.0.1"

View file

@ -175,18 +175,18 @@ def escapeRegexp(string):
return string
# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0
f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0
f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars=None,
def __init__(self,
dropXmlnsLocalName=False,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
@ -217,7 +217,7 @@ class InfosetFilter(object):
else:
return self.toXmlName(name)
def coerceElement(self, name, namespace=None):
def coerceElement(self, name):
return self.toXmlName(name)
def coerceComment(self, data):
@ -225,11 +225,14 @@ class InfosetFilter(object):
while "--" in data:
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
data = data.replace("--", "- -")
if data.endswith("-"):
warnings.warn("Comments cannot end in a dash", DataLossWarning)
data += " "
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
for i in range(data.count("\x0C")):
for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ")
# Other non-xml characters

View file

@ -1,13 +1,16 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from six.moves import http_client
from six import text_type, binary_type
from six.moves import http_client, urllib
import codecs
import re
import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import encodings, ReparseException
from . import utils
from .constants import _ReparseException
from . import _utils
from io import StringIO
@ -16,12 +19,6 @@ try:
except ImportError:
BytesIO = StringIO
try:
from io import BufferedIOBase
except ImportError:
class BufferedIOBase(object):
pass
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
@ -29,15 +26,17 @@ asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
if utils.supports_lone_surrogates:
if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# unichr. Not using this indirection would introduce an illegal
# eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
eval('"\\uD800-\\uDFFF"'))
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
"]")
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
@ -49,7 +48,7 @@ non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
@ -129,10 +128,13 @@ class BufferedStream(object):
return b"".join(rv)
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
if isinstance(source, http_client.HTTPResponse):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
# Also check for addinfourl wrapping HTTPResponse
(isinstance(source, urllib.response.addbase) and
isinstance(source.fp, http_client.HTTPResponse))):
isUnicode = False
elif hasattr(source, "read"):
isUnicode = isinstance(source.read(0), text_type)
@ -140,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type)
if isUnicode:
if encoding is not None:
raise TypeError("Cannot explicitly set an encoding with a unicode string")
encodings = [x for x in kwargs if x.endswith("_encoding")]
if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
return HTMLUnicodeInputStream(source)
return HTMLUnicodeInputStream(source, **kwargs)
else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
@ -171,27 +174,21 @@ class HTMLUnicodeInputStream(object):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
if not utils.supports_lone_surrogates:
if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile(
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
# List of where new lines occur
self.newLines = [0]
self.charEncoding = ("utf-8", "certain")
self.charEncoding = (lookupEncoding("utf-8"), "certain")
self.dataStream = self.openStream(source)
self.reset()
@ -284,10 +281,7 @@ class HTMLUnicodeInputStream(object):
if self.reportCharacterErrors:
self.reportCharacterErrors(data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
# Replace invalid characters
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
@ -297,7 +291,7 @@ class HTMLUnicodeInputStream(object):
return True
def characterErrorsUCS4(self, data):
for i in range(len(invalid_unicode_re.findall(data))):
for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
@ -310,9 +304,9 @@ class HTMLUnicodeInputStream(object):
codepoint = ord(match.group())
pos = match.start()
# Pretty sure there should be endianness issues here
if utils.isSurrogatePair(data[pos:pos + 2]):
if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair!
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
@ -395,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -408,8 +404,6 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
@ -417,27 +411,28 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
HTMLUnicodeInputStream.__init__(self, self.rawStream)
self.charEncoding = (codecName(encoding), "certain")
# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 512
self.numBytesMeta = 1024
# Number of bytes to use when detecting the encoding with chardet
self.numBytesChardet = 100
# Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding
# Detect encoding iff no explicit "transport level" encoding is supplied
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None
# Call superclass
self.reset()
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
HTMLUnicodeInputStream.reset(self)
def openStream(self, source):
@ -454,29 +449,50 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
try:
stream.seek(stream.tell())
except:
except: # pylint:disable=bare-except
stream = BufferedStream(stream)
return stream
def detectEncoding(self, parseMeta=True, chardet=True):
# First look for a BOM
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
# If there is no BOM need to look for meta elements with encoding
# information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
# Guess with chardet, if avaliable
if encoding is None and chardet:
confidence = "tentative"
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding
# If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding
# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Guess with chardet, if available
if chardet:
try:
try:
from charade.universaldetector import UniversalDetector
except ImportError:
from chardet.universaldetector import UniversalDetector
from chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
@ -487,37 +503,34 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence = "tentative"
encoding = self.defaultEncoding
if encoding is not None:
return encoding, "tentative"
# Substitute for equivalent encodings:
encodingSub = {"iso-8859-1": "windows-1252"}
# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()]
return encoding, confidence
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"
def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
newEncoding = codecName(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
newEncoding = lookupEncoding(newEncoding)
if newEncoding is None:
return
if newEncoding.name in ("utf-16be", "utf-16le"):
newEncoding = lookupEncoding("utf-8")
assert newEncoding is not None
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain")
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
self.reset()
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self):
"""Attempts to detect a BOM at the start of the stream. If
@ -525,8 +538,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
@ -546,9 +559,12 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
return encoding
if encoding:
self.rawStream.seek(seek)
return lookupEncoding(encoding)
else:
self.rawStream.seek(0)
return None
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
@ -559,8 +575,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
encoding = lookupEncoding("utf-8")
return encoding
@ -574,6 +590,7 @@ class EncodingBytes(bytes):
return bytes.__new__(self, value.lower())
def __init__(self, value):
# pylint:disable=unused-argument
self._position = -1
def __iter__(self):
@ -684,7 +701,7 @@ class EncodingParser(object):
(b"<!", self.handleOther),
(b"<?", self.handleOther),
(b"<", self.handlePossibleStartTag))
for byte in self.data:
for _ in self.data:
keepParsing = True
for key, method in methodDispatch:
if self.data.matchBytes(key):
@ -723,7 +740,7 @@ class EncodingParser(object):
return False
elif attr[0] == b"charset":
tentativeEncoding = attr[1]
codec = codecName(tentativeEncoding)
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
@ -731,7 +748,7 @@ class EncodingParser(object):
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if tentativeEncoding is not None:
codec = codecName(tentativeEncoding)
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
if hasPragma:
self.encoding = codec
@ -888,16 +905,19 @@ class ContentAttrParser(object):
return None
def codecName(encoding):
def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if isinstance(encoding, bytes):
if isinstance(encoding, binary_type):
try:
encoding = encoding.decode("ascii")
except UnicodeDecodeError:
return None
if encoding:
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
if encoding is not None:
try:
return webencodings.lookup(encoding)
except AttributeError:
return None
else:
return None

View file

@ -1,9 +1,6 @@
from __future__ import absolute_import, division, unicode_literals
try:
chr = unichr # flake8: noqa
except NameError:
pass
from six import unichr as chr
from collections import deque
@ -14,9 +11,9 @@ from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters
from .inputstream import HTMLInputStream
from ._inputstream import HTMLInputStream
from .trie import Trie
from ._trie import Trie
entitiesTrie = Trie(entities)
@ -34,16 +31,11 @@ class HTMLTokenizer(object):
Points to HTMLInputStream object.
"""
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
def __init__(self, stream, parser=None, **kwargs):
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
self.stream = HTMLInputStream(stream, **kwargs)
self.parser = parser
# Perform case conversions?
self.lowercaseElementName = lowercaseElementName
self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state
self.escapeFlag = False
self.lastFourChars = []
@ -147,8 +139,8 @@ class HTMLTokenizer(object):
output = "&"
charStack = [self.stream.char()]
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
or (allowedChar is not None and allowedChar == charStack[0])):
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
(allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0])
elif charStack[0] == "#":
@ -235,8 +227,7 @@ class HTMLTokenizer(object):
token = self.currentToken
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
if self.lowercaseElementName:
token["name"] = token["name"].translate(asciiUpper2Lower)
token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["EndTag"]:
if token["data"]:
self.tokenQueue.append({"type": tokenTypes["ParseError"],
@ -921,10 +912,9 @@ class HTMLTokenizer(object):
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
if self.lowercaseAttrName:
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, value in self.currentToken["data"][:-1]:
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"duplicate-attribute"})
@ -1716,11 +1706,11 @@ class HTMLTokenizer(object):
else:
data.append(char)
data = "".join(data)
data = "".join(data) # pylint:disable=redefined-variable-type
# Deal with null here rather than in the parser
nullCount = data.count("\u0000")
if nullCount > 0:
for i in range(nullCount):
for _ in range(nullCount):
self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data": "invalid-codepoint"})
data = data.replace("\u0000", "\uFFFD")

View file

@ -4,9 +4,11 @@ from .py import Trie as PyTrie
Trie = PyTrie
# pylint:disable=wrong-import-position
try:
from .datrie import Trie as DATrie
except ImportError:
pass
else:
Trie = DATrie
# pylint:enable=wrong-import-position

View file

@ -7,13 +7,13 @@ class Trie(Mapping):
"""Abstract base class for tries"""
def keys(self, prefix=None):
keys = super().keys()
# pylint:disable=arguments-differ
keys = super(Trie, self).keys()
if prefix is None:
return set(keys)
# Python 2.6: no set comprehensions
return set([x for x in keys if x.startswith(prefix)])
return {x for x in keys if x.startswith(prefix)}
def has_keys_with_prefix(self, prefix):
for key in self.keys():

View file

@ -22,12 +22,12 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
_x = eval('"\\uD800"')
_x = eval('"\\uD800"') # pylint:disable=eval-used
if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"')
_x = eval('u"\\uD800"') # pylint:disable=eval-used
assert isinstance(_x, text_type)
except:
except: # pylint:disable=bare-except
supports_lone_surrogates = False
else:
supports_lone_surrogates = True
@ -52,19 +52,20 @@ class MethodDispatcher(dict):
# anything here.
_dictEntries = []
for name, value in items:
if type(name) in (list, tuple, frozenset, set):
if isinstance(name, (list, tuple, frozenset, set)):
for item in name:
_dictEntries.append((item, value))
else:
_dictEntries.append((name, value))
dict.__init__(self, _dictEntries)
assert len(self) == len(_dictEntries)
self.default = None
def __getitem__(self, key):
return dict.get(self, key, self.default)
# Some utility functions to dal with weirdness around UCS2 vs UCS4
# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds
def isSurrogatePair(data):
@ -91,13 +92,33 @@ def moduleFactoryFactory(factory):
else:
name = b"_%s_factory" % baseModule.__name__
if name in moduleCache:
return moduleCache[name]
else:
kwargs_tuple = tuple(kwargs.items())
try:
return moduleCache[name][args][kwargs_tuple]
except KeyError:
mod = ModuleType(name)
objs = factory(baseModule, *args, **kwargs)
mod.__dict__.update(objs)
moduleCache[name] = mod
# Create only the cache levels that are still missing, then store the module.
# The previous guards tested the literal strings "name"/"args"/"kwargs",
# which never matched a real key, so every cache miss replaced
# moduleCache[name] and dropped the modules cached for other arguments.
moduleCache.setdefault(name, {}).setdefault(args, {})[kwargs_tuple] = mod
return mod
return moduleFactory
def memoize(func):
    """Return a wrapper around ``func`` that caches its results.

    Results are kept in a dict private to the wrapper, keyed by the
    positional arguments and the set of keyword arguments. A repeated call
    with equal arguments returns the stored result instead of calling
    ``func`` again. Entries are never evicted.

    :arg func: the callable to wrap; every argument later passed to the
        wrapper must be hashable, otherwise building the key raises
        ``TypeError``

    :returns: the caching wrapper, carrying ``func``'s name and docstring

    """
    from functools import wraps

    cache = {}

    @wraps(func)
    def wrapped(*args, **kwargs):
        # A frozenset makes the key independent of the order in which the
        # keyword arguments were written at the call site; a tuple of items
        # treated f(a=1, b=2) and f(b=2, a=1) as different calls.
        key = (args, frozenset(kwargs.items()))
        if key not in cache:
            cache[key] = func(*args, **kwargs)
        return cache[key]

    return wrapped

View file

@ -283,6 +283,12 @@ E = {
"Element %(name)s not allowed in a non-html context",
"unexpected-end-tag-before-html":
"Unexpected end tag (%(name)s) before html.",
"unexpected-inhead-noscript-tag":
"Element %(name)s not allowed in a inhead-noscript context",
"eof-in-head-noscript":
"Unexpected end of file. Expected inhead-noscript content",
"char-in-head-noscript":
"Unexpected non-space character. Expected inhead-noscript content",
"XXX-undefined-error":
"Undefined error (this sucks and should be fixed)",
}
@ -417,7 +423,7 @@ specialElements = frozenset([
])
htmlIntegrationPointElements = frozenset([
(namespaces["mathml"], "annotaion-xml"),
(namespaces["mathml"], "annotation-xml"),
(namespaces["svg"], "foreignObject"),
(namespaces["svg"], "desc"),
(namespaces["svg"], "title")
@ -431,6 +437,73 @@ mathmlTextIntegrationPointElements = frozenset([
(namespaces["mathml"], "mtext")
])
adjustSVGAttributes = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
adjustMathMLAttributes = {"definitionurl": "definitionURL"}
adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
@ -515,7 +588,7 @@ rcdataElements = frozenset([
])
booleanAttributes = {
"": frozenset(["irrelevant"]),
"": frozenset(["irrelevant", "itemscope"]),
"style": frozenset(["scoped"]),
"img": frozenset(["ismap"]),
"audio": frozenset(["autoplay", "controls"]),
@ -533,6 +606,7 @@ booleanAttributes = {
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
"output": frozenset(["disabled", "readonly"]),
"iframe": frozenset(["seamless"]),
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
@ -2813,7 +2887,6 @@ replacementCharacters = {
0x0d: "\u000D",
0x80: "\u20AC",
0x81: "\u0081",
0x81: "\u0081",
0x82: "\u201A",
0x83: "\u0192",
0x84: "\u201E",
@ -2846,235 +2919,6 @@ replacementCharacters = {
0x9F: "\u0178",
}
encodings = {
'437': 'cp437',
'850': 'cp850',
'852': 'cp852',
'855': 'cp855',
'857': 'cp857',
'860': 'cp860',
'861': 'cp861',
'862': 'cp862',
'863': 'cp863',
'865': 'cp865',
'866': 'cp866',
'869': 'cp869',
'ansix341968': 'ascii',
'ansix341986': 'ascii',
'arabic': 'iso8859-6',
'ascii': 'ascii',
'asmo708': 'iso8859-6',
'big5': 'big5',
'big5hkscs': 'big5hkscs',
'chinese': 'gbk',
'cp037': 'cp037',
'cp1026': 'cp1026',
'cp154': 'ptcp154',
'cp367': 'ascii',
'cp424': 'cp424',
'cp437': 'cp437',
'cp500': 'cp500',
'cp775': 'cp775',
'cp819': 'windows-1252',
'cp850': 'cp850',
'cp852': 'cp852',
'cp855': 'cp855',
'cp857': 'cp857',
'cp860': 'cp860',
'cp861': 'cp861',
'cp862': 'cp862',
'cp863': 'cp863',
'cp864': 'cp864',
'cp865': 'cp865',
'cp866': 'cp866',
'cp869': 'cp869',
'cp936': 'gbk',
'cpgr': 'cp869',
'cpis': 'cp861',
'csascii': 'ascii',
'csbig5': 'big5',
'cseuckr': 'cp949',
'cseucpkdfmtjapanese': 'euc_jp',
'csgb2312': 'gbk',
'cshproman8': 'hp-roman8',
'csibm037': 'cp037',
'csibm1026': 'cp1026',
'csibm424': 'cp424',
'csibm500': 'cp500',
'csibm855': 'cp855',
'csibm857': 'cp857',
'csibm860': 'cp860',
'csibm861': 'cp861',
'csibm863': 'cp863',
'csibm864': 'cp864',
'csibm865': 'cp865',
'csibm866': 'cp866',
'csibm869': 'cp869',
'csiso2022jp': 'iso2022_jp',
'csiso2022jp2': 'iso2022_jp_2',
'csiso2022kr': 'iso2022_kr',
'csiso58gb231280': 'gbk',
'csisolatin1': 'windows-1252',
'csisolatin2': 'iso8859-2',
'csisolatin3': 'iso8859-3',
'csisolatin4': 'iso8859-4',
'csisolatin5': 'windows-1254',
'csisolatin6': 'iso8859-10',
'csisolatinarabic': 'iso8859-6',
'csisolatincyrillic': 'iso8859-5',
'csisolatingreek': 'iso8859-7',
'csisolatinhebrew': 'iso8859-8',
'cskoi8r': 'koi8-r',
'csksc56011987': 'cp949',
'cspc775baltic': 'cp775',
'cspc850multilingual': 'cp850',
'cspc862latinhebrew': 'cp862',
'cspc8codepage437': 'cp437',
'cspcp852': 'cp852',
'csptcp154': 'ptcp154',
'csshiftjis': 'shift_jis',
'csunicode11utf7': 'utf-7',
'cyrillic': 'iso8859-5',
'cyrillicasian': 'ptcp154',
'ebcdiccpbe': 'cp500',
'ebcdiccpca': 'cp037',
'ebcdiccpch': 'cp500',
'ebcdiccphe': 'cp424',
'ebcdiccpnl': 'cp037',
'ebcdiccpus': 'cp037',
'ebcdiccpwt': 'cp037',
'ecma114': 'iso8859-6',
'ecma118': 'iso8859-7',
'elot928': 'iso8859-7',
'eucjp': 'euc_jp',
'euckr': 'cp949',
'extendedunixcodepackedformatforjapanese': 'euc_jp',
'gb18030': 'gb18030',
'gb2312': 'gbk',
'gb231280': 'gbk',
'gbk': 'gbk',
'greek': 'iso8859-7',
'greek8': 'iso8859-7',
'hebrew': 'iso8859-8',
'hproman8': 'hp-roman8',
'hzgb2312': 'hz',
'ibm037': 'cp037',
'ibm1026': 'cp1026',
'ibm367': 'ascii',
'ibm424': 'cp424',
'ibm437': 'cp437',
'ibm500': 'cp500',
'ibm775': 'cp775',
'ibm819': 'windows-1252',
'ibm850': 'cp850',
'ibm852': 'cp852',
'ibm855': 'cp855',
'ibm857': 'cp857',
'ibm860': 'cp860',
'ibm861': 'cp861',
'ibm862': 'cp862',
'ibm863': 'cp863',
'ibm864': 'cp864',
'ibm865': 'cp865',
'ibm866': 'cp866',
'ibm869': 'cp869',
'iso2022jp': 'iso2022_jp',
'iso2022jp2': 'iso2022_jp_2',
'iso2022kr': 'iso2022_kr',
'iso646irv1991': 'ascii',
'iso646us': 'ascii',
'iso88591': 'windows-1252',
'iso885910': 'iso8859-10',
'iso8859101992': 'iso8859-10',
'iso885911987': 'windows-1252',
'iso885913': 'iso8859-13',
'iso885914': 'iso8859-14',
'iso8859141998': 'iso8859-14',
'iso885915': 'iso8859-15',
'iso885916': 'iso8859-16',
'iso8859162001': 'iso8859-16',
'iso88592': 'iso8859-2',
'iso885921987': 'iso8859-2',
'iso88593': 'iso8859-3',
'iso885931988': 'iso8859-3',
'iso88594': 'iso8859-4',
'iso885941988': 'iso8859-4',
'iso88595': 'iso8859-5',
'iso885951988': 'iso8859-5',
'iso88596': 'iso8859-6',
'iso885961987': 'iso8859-6',
'iso88597': 'iso8859-7',
'iso885971987': 'iso8859-7',
'iso88598': 'iso8859-8',
'iso885981988': 'iso8859-8',
'iso88599': 'windows-1254',
'iso885991989': 'windows-1254',
'isoceltic': 'iso8859-14',
'isoir100': 'windows-1252',
'isoir101': 'iso8859-2',
'isoir109': 'iso8859-3',
'isoir110': 'iso8859-4',
'isoir126': 'iso8859-7',
'isoir127': 'iso8859-6',
'isoir138': 'iso8859-8',
'isoir144': 'iso8859-5',
'isoir148': 'windows-1254',
'isoir149': 'cp949',
'isoir157': 'iso8859-10',
'isoir199': 'iso8859-14',
'isoir226': 'iso8859-16',
'isoir58': 'gbk',
'isoir6': 'ascii',
'koi8r': 'koi8-r',
'koi8u': 'koi8-u',
'korean': 'cp949',
'ksc5601': 'cp949',
'ksc56011987': 'cp949',
'ksc56011989': 'cp949',
'l1': 'windows-1252',
'l10': 'iso8859-16',
'l2': 'iso8859-2',
'l3': 'iso8859-3',
'l4': 'iso8859-4',
'l5': 'windows-1254',
'l6': 'iso8859-10',
'l8': 'iso8859-14',
'latin1': 'windows-1252',
'latin10': 'iso8859-16',
'latin2': 'iso8859-2',
'latin3': 'iso8859-3',
'latin4': 'iso8859-4',
'latin5': 'windows-1254',
'latin6': 'iso8859-10',
'latin8': 'iso8859-14',
'latin9': 'iso8859-15',
'ms936': 'gbk',
'mskanji': 'shift_jis',
'pt154': 'ptcp154',
'ptcp154': 'ptcp154',
'r8': 'hp-roman8',
'roman8': 'hp-roman8',
'shiftjis': 'shift_jis',
'tis620': 'cp874',
'unicode11utf7': 'utf-7',
'us': 'ascii',
'usascii': 'ascii',
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = {
"Doctype": 0,
"Characters": 1,
@ -3095,8 +2939,9 @@ prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
class DataLossWarning(UserWarning):
    """Warning used when the current tree cannot represent the input data."""
class ReparseException(Exception):
class _ReparseException(Exception):
pass

View file

@ -1,20 +1,29 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from collections import OrderedDict
class Filter(_base.Filter):
def _attr_key(attr):
"""Return an appropriate key for an attribute for sorting
Attributes have a namespace that can be either ``None`` or a string. We
can't compare the two because they're different types, so we convert
``None`` to an empty string first.
"""
return (attr[0][0] or ''), attr[0][1]
class Filter(base.Filter):
"""Alphabetizes attributes for elements"""
def __iter__(self):
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
if token["type"] in ("StartTag", "EmptyTag"):
attrs = OrderedDict()
for name, value in sorted(token["data"].items(),
key=lambda x: x[0]):
key=_attr_key):
attrs[name] = value
token["data"] = attrs
yield token

View file

@ -1,11 +1,19 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
class Filter(_base.Filter):
class Filter(base.Filter):
"""Injects ``<meta charset=ENCODING>`` tag into head of document"""
def __init__(self, source, encoding):
_base.Filter.__init__(self, source)
"""Creates a Filter
:arg source: the source token stream
:arg encoding: the encoding to set
"""
base.Filter.__init__(self, source)
self.encoding = encoding
def __iter__(self):
@ -13,7 +21,7 @@ class Filter(_base.Filter):
meta_found = (self.encoding is None)
pending = []
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == "head":

View file

@ -1,90 +1,93 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from ..constants import cdataElements, rcdataElements, voidElements
from six import text_type
from . import base
from ..constants import namespaces, voidElements
from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
class LintError(Exception):
pass
class Filter(base.Filter):
"""Lints the token stream for errors
If it finds any errors, it'll raise an ``AssertionError``.
"""
def __init__(self, source, require_matching_tags=True):
"""Creates a Filter
:arg source: the source token stream
:arg require_matching_tags: whether or not to require matching tags
"""
super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags
class Filter(_base.Filter):
def __iter__(self):
open_elements = []
contentModelFlag = "PCDATA"
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
namespace = token["namespace"]
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
if not isinstance(name, str):
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name:
raise LintError("Empty tag name")
if type == "StartTag" and name in voidElements:
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
elif type == "EmptyTag" and name not in voidElements:
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, str):
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
if not name:
raise LintError("Empty attribute name")
if not isinstance(value, str):
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
contentModelFlag = "RCDATA"
elif name == "plaintext":
contentModelFlag = "PLAINTEXT"
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
assert isinstance(token["data"], dict)
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert type == "EmptyTag"
else:
assert type == "StartTag"
if type == "StartTag" and self.require_matching_tags:
open_elements.append((namespace, name))
for (namespace, name), value in token["data"].items():
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
assert isinstance(value, text_type)
elif type == "EndTag":
namespace = token["namespace"]
name = token["name"]
if not isinstance(name, str):
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name:
raise LintError("Empty tag name")
if name in voidElements:
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
start_name = open_elements.pop()
if start_name != name:
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
contentModelFlag = "PCDATA"
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
elif self.require_matching_tags:
start = open_elements.pop()
assert start == (namespace, name)
elif type == "Comment":
if contentModelFlag != "PCDATA":
raise LintError("Comment not in PCDATA content model flag")
data = token["data"]
assert isinstance(data, text_type)
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, str):
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
if not data:
raise LintError("%(type)s token with empty data" % {"type": type})
assert isinstance(data, text_type)
assert data != ""
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
assert data.strip(spaceCharacters) == ""
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
if not isinstance(name, str):
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
# XXX: what to do with token["data"] ?
# Every doctype field is optional; when present it must itself be text.
assert name is None or isinstance(name, text_type)
# Check each identifier's own value. Testing ``name`` here left the type of
# publicId/systemId unchecked and rejected tokens whose name is None.
assert token["publicId"] is None or isinstance(token["publicId"], text_type)
assert token["systemId"] is None or isinstance(token["systemId"], text_type)
elif type in ("ParseError", "SerializeError"):
pass
elif type == "Entity":
assert isinstance(token["name"], text_type)
elif type == "SerializerError":
assert isinstance(token["data"], text_type)
else:
raise LintError("Unknown token type: %(type)s" % {"type": type})
assert False, "Unknown token type: %(type)s" % {"type": type}
yield token

View file

@ -1,9 +1,10 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
class Filter(_base.Filter):
class Filter(base.Filter):
"""Removes optional tags from the token stream"""
def slider(self):
previous1 = previous2 = None
for token in self.source:
@ -11,7 +12,8 @@ class Filter(_base.Filter):
yield previous2, previous1, token
previous2 = previous1
previous1 = token
yield previous2, previous1, None
if previous1 is not None:
yield previous2, previous1, None
def __iter__(self):
for previous, token, next in self.slider():
@ -58,7 +60,7 @@ class Filter(_base.Filter):
elif tagname == 'colgroup':
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceeded by another colgroup element whose
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
@ -70,7 +72,7 @@ class Filter(_base.Filter):
elif tagname == 'tbody':
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceeded by a tbody, thead, or tfoot element
# not immediately preceded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == "StartTag":
# omit the thead and tfoot elements' end tag when they are

View file

@ -1,12 +1,896 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from ..sanitizer import HTMLSanitizerMixin
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from . import base
from ..constants import namespaces, prefixes
__all__ = ["Filter"]
class Filter(_base.Filter, HTMLSanitizerMixin):
allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
(namespaces['html'], 'acronym'),
(namespaces['html'], 'address'),
(namespaces['html'], 'area'),
(namespaces['html'], 'article'),
(namespaces['html'], 'aside'),
(namespaces['html'], 'audio'),
(namespaces['html'], 'b'),
(namespaces['html'], 'big'),
(namespaces['html'], 'blockquote'),
(namespaces['html'], 'br'),
(namespaces['html'], 'button'),
(namespaces['html'], 'canvas'),
(namespaces['html'], 'caption'),
(namespaces['html'], 'center'),
(namespaces['html'], 'cite'),
(namespaces['html'], 'code'),
(namespaces['html'], 'col'),
(namespaces['html'], 'colgroup'),
(namespaces['html'], 'command'),
(namespaces['html'], 'datagrid'),
(namespaces['html'], 'datalist'),
(namespaces['html'], 'dd'),
(namespaces['html'], 'del'),
(namespaces['html'], 'details'),
(namespaces['html'], 'dfn'),
(namespaces['html'], 'dialog'),
(namespaces['html'], 'dir'),
(namespaces['html'], 'div'),
(namespaces['html'], 'dl'),
(namespaces['html'], 'dt'),
(namespaces['html'], 'em'),
(namespaces['html'], 'event-source'),
(namespaces['html'], 'fieldset'),
(namespaces['html'], 'figcaption'),
(namespaces['html'], 'figure'),
(namespaces['html'], 'footer'),
(namespaces['html'], 'font'),
(namespaces['html'], 'form'),
(namespaces['html'], 'header'),
(namespaces['html'], 'h1'),
(namespaces['html'], 'h2'),
(namespaces['html'], 'h3'),
(namespaces['html'], 'h4'),
(namespaces['html'], 'h5'),
(namespaces['html'], 'h6'),
(namespaces['html'], 'hr'),
(namespaces['html'], 'i'),
(namespaces['html'], 'img'),
(namespaces['html'], 'input'),
(namespaces['html'], 'ins'),
(namespaces['html'], 'keygen'),
(namespaces['html'], 'kbd'),
(namespaces['html'], 'label'),
(namespaces['html'], 'legend'),
(namespaces['html'], 'li'),
(namespaces['html'], 'm'),
(namespaces['html'], 'map'),
(namespaces['html'], 'menu'),
(namespaces['html'], 'meter'),
(namespaces['html'], 'multicol'),
(namespaces['html'], 'nav'),
(namespaces['html'], 'nextid'),
(namespaces['html'], 'ol'),
(namespaces['html'], 'output'),
(namespaces['html'], 'optgroup'),
(namespaces['html'], 'option'),
(namespaces['html'], 'p'),
(namespaces['html'], 'pre'),
(namespaces['html'], 'progress'),
(namespaces['html'], 'q'),
(namespaces['html'], 's'),
(namespaces['html'], 'samp'),
(namespaces['html'], 'section'),
(namespaces['html'], 'select'),
(namespaces['html'], 'small'),
(namespaces['html'], 'sound'),
(namespaces['html'], 'source'),
(namespaces['html'], 'spacer'),
(namespaces['html'], 'span'),
(namespaces['html'], 'strike'),
(namespaces['html'], 'strong'),
(namespaces['html'], 'sub'),
(namespaces['html'], 'sup'),
(namespaces['html'], 'table'),
(namespaces['html'], 'tbody'),
(namespaces['html'], 'td'),
(namespaces['html'], 'textarea'),
(namespaces['html'], 'time'),
(namespaces['html'], 'tfoot'),
(namespaces['html'], 'th'),
(namespaces['html'], 'thead'),
(namespaces['html'], 'tr'),
(namespaces['html'], 'tt'),
(namespaces['html'], 'u'),
(namespaces['html'], 'ul'),
(namespaces['html'], 'var'),
(namespaces['html'], 'video'),
(namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'),
(namespaces['mathml'], 'mfrac'),
(namespaces['mathml'], 'mi'),
(namespaces['mathml'], 'mmultiscripts'),
(namespaces['mathml'], 'mn'),
(namespaces['mathml'], 'mo'),
(namespaces['mathml'], 'mover'),
(namespaces['mathml'], 'mpadded'),
(namespaces['mathml'], 'mphantom'),
(namespaces['mathml'], 'mprescripts'),
(namespaces['mathml'], 'mroot'),
(namespaces['mathml'], 'mrow'),
(namespaces['mathml'], 'mspace'),
(namespaces['mathml'], 'msqrt'),
(namespaces['mathml'], 'mstyle'),
(namespaces['mathml'], 'msub'),
(namespaces['mathml'], 'msubsup'),
(namespaces['mathml'], 'msup'),
(namespaces['mathml'], 'mtable'),
(namespaces['mathml'], 'mtd'),
(namespaces['mathml'], 'mtext'),
(namespaces['mathml'], 'mtr'),
(namespaces['mathml'], 'munder'),
(namespaces['mathml'], 'munderover'),
(namespaces['mathml'], 'none'),
(namespaces['svg'], 'a'),
(namespaces['svg'], 'animate'),
(namespaces['svg'], 'animateColor'),
(namespaces['svg'], 'animateMotion'),
(namespaces['svg'], 'animateTransform'),
(namespaces['svg'], 'clipPath'),
(namespaces['svg'], 'circle'),
(namespaces['svg'], 'defs'),
(namespaces['svg'], 'desc'),
(namespaces['svg'], 'ellipse'),
(namespaces['svg'], 'font-face'),
(namespaces['svg'], 'font-face-name'),
(namespaces['svg'], 'font-face-src'),
(namespaces['svg'], 'g'),
(namespaces['svg'], 'glyph'),
(namespaces['svg'], 'hkern'),
(namespaces['svg'], 'linearGradient'),
(namespaces['svg'], 'line'),
(namespaces['svg'], 'marker'),
(namespaces['svg'], 'metadata'),
(namespaces['svg'], 'missing-glyph'),
(namespaces['svg'], 'mpath'),
(namespaces['svg'], 'path'),
(namespaces['svg'], 'polygon'),
(namespaces['svg'], 'polyline'),
(namespaces['svg'], 'radialGradient'),
(namespaces['svg'], 'rect'),
(namespaces['svg'], 'set'),
(namespaces['svg'], 'stop'),
(namespaces['svg'], 'svg'),
(namespaces['svg'], 'switch'),
(namespaces['svg'], 'text'),
(namespaces['svg'], 'title'),
(namespaces['svg'], 'tspan'),
(namespaces['svg'], 'use'),
))
allowed_attributes = frozenset((
# HTML attributes
(None, 'abbr'),
(None, 'accept'),
(None, 'accept-charset'),
(None, 'accesskey'),
(None, 'action'),
(None, 'align'),
(None, 'alt'),
(None, 'autocomplete'),
(None, 'autofocus'),
(None, 'axis'),
(None, 'background'),
(None, 'balance'),
(None, 'bgcolor'),
(None, 'bgproperties'),
(None, 'border'),
(None, 'bordercolor'),
(None, 'bordercolordark'),
(None, 'bordercolorlight'),
(None, 'bottompadding'),
(None, 'cellpadding'),
(None, 'cellspacing'),
(None, 'ch'),
(None, 'challenge'),
(None, 'char'),
(None, 'charoff'),
(None, 'choff'),
(None, 'charset'),
(None, 'checked'),
(None, 'cite'),
(None, 'class'),
(None, 'clear'),
(None, 'color'),
(None, 'cols'),
(None, 'colspan'),
(None, 'compact'),
(None, 'contenteditable'),
(None, 'controls'),
(None, 'coords'),
(None, 'data'),
(None, 'datafld'),
(None, 'datapagesize'),
(None, 'datasrc'),
(None, 'datetime'),
(None, 'default'),
(None, 'delay'),
(None, 'dir'),
(None, 'disabled'),
(None, 'draggable'),
(None, 'dynsrc'),
(None, 'enctype'),
(None, 'end'),
(None, 'face'),
(None, 'for'),
(None, 'form'),
(None, 'frame'),
(None, 'galleryimg'),
(None, 'gutter'),
(None, 'headers'),
(None, 'height'),
(None, 'hidefocus'),
(None, 'hidden'),
(None, 'high'),
(None, 'href'),
(None, 'hreflang'),
(None, 'hspace'),
(None, 'icon'),
(None, 'id'),
(None, 'inputmode'),
(None, 'ismap'),
(None, 'keytype'),
(None, 'label'),
(None, 'leftspacing'),
(None, 'lang'),
(None, 'list'),
(None, 'longdesc'),
(None, 'loop'),
(None, 'loopcount'),
(None, 'loopend'),
(None, 'loopstart'),
(None, 'low'),
(None, 'lowsrc'),
(None, 'max'),
(None, 'maxlength'),
(None, 'media'),
(None, 'method'),
(None, 'min'),
(None, 'multiple'),
(None, 'name'),
(None, 'nohref'),
(None, 'noshade'),
(None, 'nowrap'),
(None, 'open'),
(None, 'optimum'),
(None, 'pattern'),
(None, 'ping'),
(None, 'point-size'),
(None, 'poster'),
(None, 'pqg'),
(None, 'preload'),
(None, 'prompt'),
(None, 'radiogroup'),
(None, 'readonly'),
(None, 'rel'),
(None, 'repeat-max'),
(None, 'repeat-min'),
(None, 'replace'),
(None, 'required'),
(None, 'rev'),
(None, 'rightspacing'),
(None, 'rows'),
(None, 'rowspan'),
(None, 'rules'),
(None, 'scope'),
(None, 'selected'),
(None, 'shape'),
(None, 'size'),
(None, 'span'),
(None, 'src'),
(None, 'start'),
(None, 'step'),
(None, 'style'),
(None, 'summary'),
(None, 'suppress'),
(None, 'tabindex'),
(None, 'target'),
(None, 'template'),
(None, 'title'),
(None, 'toppadding'),
(None, 'type'),
(None, 'unselectable'),
(None, 'usemap'),
(None, 'urn'),
(None, 'valign'),
(None, 'value'),
(None, 'variable'),
(None, 'volume'),
(None, 'vspace'),
(None, 'vrml'),
(None, 'width'),
(None, 'wrap'),
(namespaces['xml'], 'lang'),
# MathML attributes
(None, 'actiontype'),
(None, 'align'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnlines'),
(None, 'columnspacing'),
(None, 'columnspan'),
(None, 'depth'),
(None, 'display'),
(None, 'displaystyle'),
(None, 'equalcolumns'),
(None, 'equalrows'),
(None, 'fence'),
(None, 'fontstyle'),
(None, 'fontweight'),
(None, 'frame'),
(None, 'height'),
(None, 'linethickness'),
(None, 'lspace'),
(None, 'mathbackground'),
(None, 'mathcolor'),
(None, 'mathvariant'),
(None, 'mathvariant'),
(None, 'maxsize'),
(None, 'minsize'),
(None, 'other'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowlines'),
(None, 'rowspacing'),
(None, 'rowspan'),
(None, 'rspace'),
(None, 'scriptlevel'),
(None, 'selection'),
(None, 'separator'),
(None, 'stretchy'),
(None, 'width'),
(None, 'width'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'type'),
# SVG attributes
(None, 'accent-height'),
(None, 'accumulate'),
(None, 'additive'),
(None, 'alphabetic'),
(None, 'arabic-form'),
(None, 'ascent'),
(None, 'attributeName'),
(None, 'attributeType'),
(None, 'baseProfile'),
(None, 'bbox'),
(None, 'begin'),
(None, 'by'),
(None, 'calcMode'),
(None, 'cap-height'),
(None, 'class'),
(None, 'clip-path'),
(None, 'color'),
(None, 'color-rendering'),
(None, 'content'),
(None, 'cx'),
(None, 'cy'),
(None, 'd'),
(None, 'dx'),
(None, 'dy'),
(None, 'descent'),
(None, 'display'),
(None, 'dur'),
(None, 'end'),
(None, 'fill'),
(None, 'fill-opacity'),
(None, 'fill-rule'),
(None, 'font-family'),
(None, 'font-size'),
(None, 'font-stretch'),
(None, 'font-style'),
(None, 'font-variant'),
(None, 'font-weight'),
(None, 'from'),
(None, 'fx'),
(None, 'fy'),
(None, 'g1'),
(None, 'g2'),
(None, 'glyph-name'),
(None, 'gradientUnits'),
(None, 'hanging'),
(None, 'height'),
(None, 'horiz-adv-x'),
(None, 'horiz-origin-x'),
(None, 'id'),
(None, 'ideographic'),
(None, 'k'),
(None, 'keyPoints'),
(None, 'keySplines'),
(None, 'keyTimes'),
(None, 'lang'),
(None, 'marker-end'),
(None, 'marker-mid'),
(None, 'marker-start'),
(None, 'markerHeight'),
(None, 'markerUnits'),
(None, 'markerWidth'),
(None, 'mathematical'),
(None, 'max'),
(None, 'min'),
(None, 'name'),
(None, 'offset'),
(None, 'opacity'),
(None, 'orient'),
(None, 'origin'),
(None, 'overline-position'),
(None, 'overline-thickness'),
(None, 'panose-1'),
(None, 'path'),
(None, 'pathLength'),
(None, 'points'),
(None, 'preserveAspectRatio'),
(None, 'r'),
(None, 'refX'),
(None, 'refY'),
(None, 'repeatCount'),
(None, 'repeatDur'),
(None, 'requiredExtensions'),
(None, 'requiredFeatures'),
(None, 'restart'),
(None, 'rotate'),
(None, 'rx'),
(None, 'ry'),
(None, 'slope'),
(None, 'stemh'),
(None, 'stemv'),
(None, 'stop-color'),
(None, 'stop-opacity'),
(None, 'strikethrough-position'),
(None, 'strikethrough-thickness'),
(None, 'stroke'),
(None, 'stroke-dasharray'),
(None, 'stroke-dashoffset'),
(None, 'stroke-linecap'),
(None, 'stroke-linejoin'),
(None, 'stroke-miterlimit'),
(None, 'stroke-opacity'),
(None, 'stroke-width'),
(None, 'systemLanguage'),
(None, 'target'),
(None, 'text-anchor'),
(None, 'to'),
(None, 'transform'),
(None, 'type'),
(None, 'u1'),
(None, 'u2'),
(None, 'underline-position'),
(None, 'underline-thickness'),
(None, 'unicode'),
(None, 'unicode-range'),
(None, 'units-per-em'),
(None, 'values'),
(None, 'version'),
(None, 'viewBox'),
(None, 'visibility'),
(None, 'width'),
(None, 'widths'),
(None, 'x'),
(None, 'x-height'),
(None, 'x1'),
(None, 'x2'),
(namespaces['xlink'], 'actuate'),
(namespaces['xlink'], 'arcrole'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'role'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'title'),
(namespaces['xlink'], 'type'),
(namespaces['xml'], 'base'),
(namespaces['xml'], 'lang'),
(namespaces['xml'], 'space'),
(None, 'y'),
(None, 'y1'),
(None, 'y2'),
(None, 'zoomAndPan'),
))
# (namespace, name) pairs of attributes whose values are URIs.  The sanitizer
# parses these values and drops the attribute when its scheme is not listed
# in ``allowed_protocols``.
attr_val_is_uri = frozenset([
    (None, 'href'), (None, 'src'), (None, 'cite'), (None, 'action'),
    (None, 'longdesc'), (None, 'poster'), (None, 'background'),
    (None, 'datasrc'), (None, 'dynsrc'), (None, 'lowsrc'), (None, 'ping'),
    (namespaces['xlink'], 'href'),
    (namespaces['xml'], 'base'),
])
# (namespace, name) pairs of SVG attributes whose values may contain
# ``url(...)`` references; the sanitizer blanks out every reference that does
# not start with ``#``.
svg_attr_val_allows_ref = frozenset([
    (None, 'clip-path'), (None, 'color-profile'), (None, 'cursor'),
    (None, 'fill'), (None, 'filter'), (None, 'marker'),
    (None, 'marker-start'), (None, 'marker-mid'), (None, 'marker-end'),
    (None, 'mask'), (None, 'stroke'),
])
# (namespace, name) pairs naming the SVG elements for which the sanitizer
# applies its "local xlink:href only" rule.
svg_allow_local_href = frozenset([
    (None, 'altGlyph'), (None, 'animate'), (None, 'animateColor'),
    (None, 'animateMotion'), (None, 'animateTransform'), (None, 'cursor'),
    (None, 'feImage'), (None, 'filter'), (None, 'linearGradient'),
    (None, 'pattern'), (None, 'radialGradient'), (None, 'textpath'),
    (None, 'tref'), (None, 'set'), (None, 'use'),
])
# CSS property names that are kept when an inline ``style`` attribute is
# sanitized; declarations for other properties are dropped unless another
# rule in ``sanitize_css`` accepts them.
allowed_css_properties = frozenset([
    'azimuth', 'background-color', 'border-bottom-color', 'border-collapse',
    'border-color', 'border-left-color', 'border-right-color',
    'border-top-color', 'clear', 'color', 'cursor', 'direction', 'display',
    'elevation', 'float', 'font', 'font-family', 'font-size', 'font-style',
    'font-variant', 'font-weight', 'height', 'letter-spacing', 'line-height',
    'overflow', 'pause', 'pause-after', 'pause-before', 'pitch',
    'pitch-range', 'richness', 'speak', 'speak-header', 'speak-numeral',
    'speak-punctuation', 'speech-rate', 'stress', 'text-align',
    'text-decoration', 'text-indent', 'unicode-bidi', 'vertical-align',
    'voice-family', 'volume', 'white-space', 'width',
])
# Keyword values accepted inside background/border/margin/padding
# declarations when an inline ``style`` attribute is sanitized.
allowed_css_keywords = frozenset([
    'auto', 'aqua', 'black', 'block', 'blue', 'bold', 'both', 'bottom',
    'brown', 'center', 'collapse', 'dashed', 'dotted', 'fuchsia', 'gray',
    'green', '!important', 'italic', 'left', 'lime', 'maroon', 'medium',
    'none', 'navy', 'normal', 'nowrap', 'olive', 'pointer', 'purple', 'red',
    'right', 'solid', 'silver', 'teal', 'top', 'transparent', 'underline',
    'white', 'yellow',
])
# SVG presentation properties that are kept when they appear in an inline
# ``style`` attribute.
allowed_svg_properties = frozenset([
    'fill', 'fill-opacity', 'fill-rule',
    'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
    'stroke-opacity',
])
# URI schemes that are permitted in attributes listed in ``attr_val_is_uri``.
allowed_protocols = frozenset([
    'ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher',
    'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim',
    'rsync', 'tag', 'ssh', 'sftp', 'rtsp', 'afs', 'data',
])
# Content types that a ``data:`` URI may declare and still be kept.
allowed_content_types = frozenset([
    'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp',
    'text/plain',
])
# Parses the path component of a ``data:`` URI: a ``type/subtype`` content
# type (captured as ``content_type``), optional ``;charset=...`` and
# ``;base64`` parameters in either order, then a comma and the payload.
data_content_type = re.compile(
    r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
    flags=re.VERBOSE)
class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""

    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        :arg source: the token stream to sanitize (passed to the base
            filter)

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that may only carry a
            local (``#fragment``) ``xlink:href``--any other ``xlink:href``
            value on these elements is removed

        """
        super(Filter, self).__init__(source)
        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        """Yield the sanitized tokens of the wrapped stream, skipping dropped ones."""
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Return the sanitized form of ``token``, or ``None`` to drop it.

        Tag tokens are cleaned when their element is allowed and turned into
        escaped text otherwise; comments are dropped; every other token is
        returned unchanged.
        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            if ((namespace, name) in self.allowed_elements or
                (namespace is None and
                 (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            else:
                return self.disallowed_token(token)
        elif token_type == "Comment":
            # Comments are dropped: falling through returns None.
            pass
        else:
            return token

    def allowed_token(self, token):
        """Strip unsafe attributes from a token whose element is allowed.

        The token's attribute dict is modified in place and the token is
        returned.
        """
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values
            for attr in (attr_names & self.attr_val_is_uri):
                assert attr in attrs
                # I don't have a clue where this regexp comes from or why it matches those
                # characters, nor why we call unescape. I just know it's always been here.
                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
                # this will do is remove *more* than it otherwise would.
                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    del attrs[attr]
                    continue
                if not uri.scheme:
                    continue
                # Each attribute is deleted at most once.  The scheme check
                # and the data-URI check used to run independently, so a
                # ``data:`` URI with ``data`` missing from allowed_protocols
                # was deleted twice and raised KeyError.
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                elif uri.scheme == 'data':
                    m = data_content_type.match(uri.path)
                    if not m or m.group('content_type') not in self.allowed_content_types:
                        del attrs[attr]

            # Blank out url(...) references that do not start with '#'.
            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))

            # svg_allow_local_href holds (namespace, name) pairs, so testing
            # the bare element name alone never matched.  Accept the bare
            # name (caller-supplied sets of strings), the namespace-less pair
            # used by the default set, and the fully qualified pair.
            name = token["name"]
            local_href_only = (
                name in self.svg_allow_local_href or
                (None, name) in self.svg_allow_local_href or
                (token.get("namespace"), name) in self.svg_allow_local_href)
            xlink_href = (namespaces['xlink'], 'href')
            if (local_href_only and xlink_href in attrs and
                    re.search(r'^\s*[^#\s].*', attrs[xlink_href])):
                del attrs[xlink_href]

            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def disallowed_token(self, token):
        """Turn a tag token for a disallowed element into a Characters token.

        The tag is re-serialized as text (attribute values escaped) and
        stored in ``token["data"]``; the token is modified in place and
        returned.
        """
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to its allowed declarations.

        Returns an empty string when the style text fails either of the two
        overall shape checks.
        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                # Shorthand properties are kept only when every
                # whitespace-separated part is an allowed keyword or matches
                # the colour/length pattern below.
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

View file

@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
import re
from . import _base
from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(_base.Filter):
class Filter(base.Filter):
"""Collapses whitespace except in pre, textarea, and script elements"""
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self):
preserve = 0
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):

File diff suppressed because it is too large Load diff

View file

@ -1,300 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from .tokenizer import HTMLTokenizer
from .constants import tokenTypes
# Parses the path component of a ``data:`` URI: a ``type/subtype`` content
# type (captured as ``content_type``), optional ``;charset=...`` and
# ``;base64`` parameters in either order, then a comma and the payload.
content_type_rgx = re.compile(
    r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
    flags=re.VERBOSE)
class HTMLSanitizerMixin(object):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
                           'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']

    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
                       'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
                       'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
                       'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
                       'munderover', 'none']

    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
                    'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
                    'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
                    'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
                    'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
                    'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
                             'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
                             'background', 'balance', 'bgcolor', 'bgproperties', 'border',
                             'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
                             'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
                             'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
                             'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
                             'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
                             'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
                             'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
                             'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
                             'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
                             'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
                             'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
                             'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
                             'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
                             'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
                             'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
                             'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
                             'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
                             'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
                             'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
                             'width', 'wrap', 'xml:lang']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
                         'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
                         'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
                         'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
                         'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
                         'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
                         'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
                         'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
                         'xlink:type', 'xmlns', 'xmlns:xlink']

    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
                      'arabic-form', 'ascent', 'attributeName', 'attributeType',
                      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
                      'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
                      'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
                      'fill-opacity', 'fill-rule', 'font-family', 'font-size',
                      'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
                      'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
                      'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
                      'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
                      'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
                      'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
                      'opacity', 'orient', 'origin', 'overline-position',
                      'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
                      'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
                      'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
                      'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
                      'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
                      'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
                      'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
                      'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
                      'transform', 'type', 'u1', 'u2', 'underline-position',
                      'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
                      'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
                      'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
                      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
                      'y1', 'y2', 'zoomAndPan']

    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
                       'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']

    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
                               'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
                               'mask', 'stroke']

    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
                            'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
                            'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
                            'set', 'use']

    acceptable_css_properties = ['azimuth', 'background-color',
                                 'border-bottom-color', 'border-collapse', 'border-color',
                                 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
                                 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
                                 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
                                 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
                                 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
                                 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
                                 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
                                 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
                                 'white-space', 'width']

    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
                               'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
                               'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
                               'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
                               'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
                               'transparent', 'underline', 'white', 'yellow']

    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
                                 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
                                 'stroke-opacity']

    acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                            'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                            'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
                            'ssh', 'sftp', 'rtsp', 'afs', 'data']

    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
    allowed_css_properties = acceptable_css_properties
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols
    allowed_content_types = acceptable_content_types

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Return the sanitized form of ``token``, or ``None`` to drop it.

        Tag tokens are cleaned when their element is allowed and turned into
        escaped text otherwise; comments are dropped; every other token is
        returned unchanged.
        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in tokenTypes:
            token_type = tokenTypes[token_type]

        if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                          tokenTypes["EmptyTag"]):
            if token["name"] in self.allowed_elements:
                return self.allowed_token(token, token_type)
            else:
                return self.disallowed_token(token, token_type)
        elif token_type == tokenTypes["Comment"]:
            # Comments are dropped: falling through returns None.
            pass
        else:
            return token

    def allowed_token(self, token, token_type):
        """Strip unsafe attributes from a token whose element is allowed.

        ``token["data"]`` is replaced by a list of ``[name, value]`` pairs
        holding only the attributes that survive; the token is returned.
        """
        if "data" in token:
            # Iterating in reverse makes the first occurrence of a repeated
            # attribute name the one that ends up in the dict.
            attrs = dict((name, val) for name, val in token["data"][::-1]
                         if name in self.allowed_attributes)
            for attr in self.attr_val_is_uri:
                if attr not in attrs:
                    continue
                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    del attrs[attr]
                    continue
                if not uri.scheme:
                    continue
                # Each attribute is deleted at most once.  The scheme check
                # and the data-URI check used to run independently, so a
                # ``data:`` URI with ``data`` missing from allowed_protocols
                # was deleted twice and raised KeyError.
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                elif uri.scheme == 'data':
                    m = content_type_rgx.match(uri.path)
                    if not m or m.group('content_type') not in self.allowed_content_types:
                        del attrs[attr]

            # Blank out url(...) references that do not start with '#'.
            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            # On these elements only a local (#fragment) xlink:href is kept.
            if (token["name"] in self.svg_allow_local_href and
                    'xlink:href' in attrs and
                    re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                del attrs['xlink:href']
            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
            token["data"] = [[name, val] for name, val in list(attrs.items())]
        return token

    def disallowed_token(self, token, token_type):
        """Turn a tag token for a disallowed element into a Characters token.

        The tag is re-serialized as text (attribute values escaped) and
        stored in ``token["data"]``; the token is modified in place and
        returned.
        """
        if token_type == tokenTypes["EndTag"]:
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
            token["data"] = "<%s%s>" % (token["name"], attrs)
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        # Keep whichever type representation (name or numeric code) the
        # incoming token used.
        if token["type"] in tokenTypes:
            token["type"] = "Characters"
        else:
            token["type"] = tokenTypes["Characters"]

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to its allowed declarations.

        Returns an empty string when the style text fails either of the two
        overall shape checks.
        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                # Shorthand properties are kept only when every
                # whitespace-separated part is an allowed keyword or matches
                # the colour/length pattern below.  The overridable
                # ``allowed_css_keywords`` is consulted (not the
                # ``acceptable_`` default) and hex colours may use upper-case
                # digits.
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer whose token stream is passed through the sanitizer mixin."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False, parser=None):
        # Change case matching defaults as we only output lowercase html anyway
        # This solution doesn't seem ideal...
        HTMLTokenizer.__init__(
            self, stream, encoding, parseMeta, useChardet,
            lowercaseElementName, lowercaseAttrName,
            parser=parser,
        )

    def __iter__(self):
        """Yield each tokenizer token after sanitizing, skipping dropped ones."""
        for raw_token in HTMLTokenizer.__iter__(self):
            sanitized = self.sanitize_token(raw_token)
            if not sanitized:
                continue
            yield sanitized

View file

@ -0,0 +1,409 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
import re
from codecs import register_error, xmlcharrefreplace_errors
from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape
# Characters that force an attribute value to be quoted: the space
# characters plus the quote, equals, angle-bracket and backtick characters.
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
# Searched when ``quote_attr_values == "spec"``: any hit means the value is
# quoted.
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
# Searched when ``quote_attr_values == "legacy"``: the spec characters plus
# the additional control, slash, backtick and space-like characters listed
# below.
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")
# Reverse entity table used by ``htmlentityreplace_errors``: maps a code
# point to the name of an entity that expands to it.
_encode_entity_map = {}
# True when a non-BMP character is a single element of a text string.
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in entities.items():
    # skip multi-character entities (on a non-UCS4 build a single character
    # may take up to two elements)
    if len(v) > (1 if _is_ucs4 else 2):
        continue
    if v == "&":
        continue
    v = _utils.surrogatePairToCodepoint(v) if len(v) == 2 else ord(v)
    # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
    if k.islower() or v not in _encode_entity_map:
        _encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
    """Codec error handler that replaces unencodable text with HTML entities.

    For encode/translate errors the offending slice is rewritten code point
    by code point, using a named entity from ``_encode_entity_map`` when one
    exists and a hexadecimal character reference otherwise.  Any other
    exception is handed to ``xmlcharrefreplace_errors``.

    :arg exc: the Unicode error raised by the codec

    :returns: ``(replacement_text, position_to_resume_at)``
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    text = exc.object
    code_points = []
    second_half_pending = False
    for offset, char in enumerate(text[exc.start:exc.end]):
        if second_half_pending:
            # Second half of a surrogate pair that was already consumed.
            second_half_pending = False
            continue
        position = offset + exc.start
        if _utils.isSurrogatePair(text[position:min([exc.end, position + 2])]):
            code_points.append(
                _utils.surrogatePairToCodepoint(text[position:position + 2]))
            second_half_pending = True
        else:
            code_points.append(ord(char))

    pieces = []
    for cp in code_points:
        entity_name = _encode_entity_map.get(cp)
        if not entity_name:
            pieces.append("&#x%s;" % (hex(cp)[2:]))
            continue
        pieces.append("&")
        pieces.append(entity_name)
        if not entity_name.endswith(";"):
            pieces.append(";")
    return ("".join(pieces), exc.end)


register_error("htmlentityreplace", htmlentityreplace_errors)
def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serialize ``input`` by walking it with the named treewalker.

    :arg input: the tree to serialize; it is handed to the treewalker

    :arg tree: name of the treewalker to look up via
        ``treewalkers.getTreeWalker``

    :arg encoding: the encoding passed on to ``HTMLSerializer.render``

    :arg serializer_opts: keyword options forwarded to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: whatever ``HTMLSerializer.render`` returns (the serialized
        tree)

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker_factory = treewalkers.getTreeWalker(tree)
    serializer = HTMLSerializer(**serializer_opts)
    token_stream = walker_factory(input)
    return serializer.render(token_stream, encoding)
class HTMLSerializer(object):
# Class-level defaults for every serializer option; ``__init__`` copies each
# one onto the instance, using the keyword argument when one was passed.

# attribute quoting options
quote_attr_values = "legacy" # be secure by default
quote_char = '"'
# Switched off by ``__init__`` when the caller passes ``quote_char``.
use_best_quote_char = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True
# miscellaneous options
alphabetical_attributes = False
inject_meta_charset = True
strip_whitespace = False
sanitize = False
# Names accepted as keyword arguments by ``__init__``; any other keyword
# raises TypeError there.
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"omit_optional_tags", "minimize_boolean_attributes",
"use_trailing_solidus", "space_before_trailing_solidus",
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
"alphabetical_attributes", "inject_meta_charset",
"strip_whitespace", "sanitize")
def __init__(self, **kwargs):
"""Initialize HTMLSerializer
:arg inject_meta_charset: Whether or not to inject the meta charset.
Defaults to ``True``.
:arg quote_attr_values: Whether to quote attribute values that don't
require quoting per legacy browser behavior (``"legacy"``), when
required by the standard (``"spec"``), or always (``"always"``).
Defaults to ``"legacy"``.
:arg quote_char: Use given quote character for attribute quoting.
Defaults to ``"`` which will use double quotes unless attribute
value contains a double quote, in which case single quotes are
used.
:arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
values.
Defaults to ``False``.
:arg escape_rcdata: Whether to escape characters that need to be
escaped within normal elements within rcdata elements such as
style.
Defaults to ``False``.
:arg resolve_entities: Whether to resolve named character entities that
appear in the source tree. The XML predefined entities &lt; &gt;
&amp; &quot; &apos; are unaffected by this setting.
Defaults to ``True``.
:arg strip_whitespace: Whether to remove semantically meaningless
whitespace. (This compresses all whitespace to a single space
except within ``pre``.)
Defaults to ``False``.
:arg minimize_boolean_attributes: Shortens boolean attributes to give
just the attribute value, for example::
<input disabled="disabled">
becomes::
<input disabled>
Defaults to ``True``.
:arg use_trailing_solidus: Includes a close-tag slash at the end of the
start tag of void elements (empty elements whose end tag is
forbidden). E.g. ``<hr/>``.
Defaults to ``False``.
:arg space_before_trailing_solidus: Places a space immediately before
the closing slash in a tag using a trailing solidus. E.g.
``<hr />``. Requires ``use_trailing_solidus=True``.
Defaults to ``True``.
:arg sanitize: Strip all unsafe or unknown constructs from output.
See :py:class:`html5lib.filters.sanitizer.Filter`.
Defaults to ``False``.
:arg omit_optional_tags: Omit start/end tags that are optional.
Defaults to ``True``.
:arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
Defaults to ``False``.
"""
unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0:
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = []
self.strict = False
def encode(self, string):
    """Encode ``string`` with ``self.encoding``, or return it unchanged.

    Encoding uses the ``"htmlentityreplace"`` error handler; when
    ``self.encoding`` is falsy the text is returned as is.
    """
    assert(isinstance(string, text_type))
    if not self.encoding:
        return string
    return string.encode(self.encoding, "htmlentityreplace")
def encodeStrict(self, string):
    """Encode ``string`` with ``self.encoding`` using strict error handling.

    When ``self.encoding`` is falsy the text is returned unchanged.
    """
    assert(isinstance(string, text_type))
    if not self.encoding:
        return string
    return string.encode(self.encoding, "strict")
def serialize(self, treewalker, encoding=None):
    """Serialize the token stream from ``treewalker`` as a generator.

    :arg treewalker: the treewalker (an iterable of token dicts) to
        serialize

    :arg encoding: the output encoding; when given, every chunk yielded is
        encoded to it, otherwise text chunks are yielded

    :returns: generator of serialized chunks

    :raises ValueError: if ``quote_attr_values`` is not one of
        ``'always'``, ``'spec'`` or ``'legacy'``

    Problems found while serializing are reported through
    ``serializeError``, which records them in ``self.errors`` (and raises
    when ``self.strict`` is set).
    """
    # pylint:disable=too-many-nested-blocks
    self.encoding = encoding
    in_cdata = False
    self.errors = []

    if encoding and self.inject_meta_charset:
        from .filters.inject_meta_charset import Filter
        treewalker = Filter(treewalker, encoding)
    # Alphabetical attributes is here under the assumption that none of
    # the later filters add or change order of attributes; it needs to be
    # before the sanitizer so escaped elements come out correctly
    if self.alphabetical_attributes:
        from .filters.alphabeticalattributes import Filter
        treewalker = Filter(treewalker)
    # WhitespaceFilter should be used before OptionalTagFilter
    # for maximum efficiency of this latter filter
    if self.strip_whitespace:
        from .filters.whitespace import Filter
        treewalker = Filter(treewalker)
    if self.sanitize:
        from .filters.sanitizer import Filter
        treewalker = Filter(treewalker)
    if self.omit_optional_tags:
        from .filters.optionaltags import Filter
        treewalker = Filter(treewalker)

    for token in treewalker:
        token_type = token["type"]
        if token_type == "Doctype":
            doctype = "<!DOCTYPE %s" % token["name"]

            if token["publicId"]:
                doctype += ' PUBLIC "%s"' % token["publicId"]
            elif token["systemId"]:
                doctype += " SYSTEM"
            if token["systemId"]:
                # Quote the system identifier with whichever quote
                # character it does not contain itself.
                if token["systemId"].find('"') >= 0:
                    if token["systemId"].find("'") >= 0:
                        self.serializeError("System identifer contains both single and double quote characters")
                    quote_char = "'"
                else:
                    quote_char = '"'
                doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

            doctype += ">"
            yield self.encodeStrict(doctype)

        elif token_type in ("Characters", "SpaceCharacters"):
            if token_type == "SpaceCharacters" or in_cdata:
                # Whitespace and raw-text content are written unescaped.
                if in_cdata and token["data"].find("</") >= 0:
                    self.serializeError("Unexpected </ in CDATA")
                yield self.encode(token["data"])
            else:
                yield self.encode(escape(token["data"]))

        elif token_type in ("StartTag", "EmptyTag"):
            name = token["name"]
            yield self.encodeStrict("<%s" % name)
            if name in rcdataElements and not self.escape_rcdata:
                in_cdata = True
            elif in_cdata:
                self.serializeError("Unexpected child element of a CDATA element")
            for (_, attr_name), attr_value in token["data"].items():
                # TODO: Add namespace support here
                k = attr_name
                v = attr_value
                yield self.encodeStrict(' ')

                yield self.encodeStrict(k)
                # A boolean attribute is written as its bare name when
                # minimisation is on; everything else gets ``=value``.
                if not self.minimize_boolean_attributes or \
                    (k not in booleanAttributes.get(name, tuple()) and
                     k not in booleanAttributes.get("", tuple())):
                    yield self.encodeStrict("=")
                    if self.quote_attr_values == "always" or len(v) == 0:
                        quote_attr = True
                    elif self.quote_attr_values == "spec":
                        quote_attr = _quoteAttributeSpec.search(v) is not None
                    elif self.quote_attr_values == "legacy":
                        quote_attr = _quoteAttributeLegacy.search(v) is not None
                    else:
                        raise ValueError("quote_attr_values must be one of: "
                                         "'always', 'spec', or 'legacy'")
                    v = v.replace("&", "&amp;")
                    if self.escape_lt_in_attrs:
                        v = v.replace("<", "&lt;")
                    if quote_attr:
                        quote_char = self.quote_char
                        if self.use_best_quote_char:
                            # Prefer the quote character that does not
                            # occur in the value, to avoid escaping.
                            if "'" in v and '"' not in v:
                                quote_char = '"'
                            elif '"' in v and "'" not in v:
                                quote_char = "'"
                        if quote_char == "'":
                            v = v.replace("'", "&#39;")
                        else:
                            v = v.replace('"', "&quot;")
                        yield self.encodeStrict(quote_char)
                        yield self.encode(v)
                        yield self.encodeStrict(quote_char)
                    else:
                        yield self.encode(v)
            if name in voidElements and self.use_trailing_solidus:
                if self.space_before_trailing_solidus:
                    yield self.encodeStrict(" /")
                else:
                    yield self.encodeStrict("/")
            yield self.encode(">")

        elif token_type == "EndTag":
            name = token["name"]
            if name in rcdataElements:
                in_cdata = False
            elif in_cdata:
                self.serializeError("Unexpected child element of a CDATA element")
            yield self.encodeStrict("</%s>" % name)

        elif token_type == "Comment":
            data = token["data"]
            if data.find("--") >= 0:
                self.serializeError("Comment contains --")
            yield self.encodeStrict("<!--%s-->" % token["data"])

        elif token_type == "Entity":
            name = token["name"]
            key = name + ";"
            known = key in entities
            if not known:
                self.serializeError("Entity %s not recognized" % name)
            # Only a known entity can be resolved. In non-strict mode an
            # unknown one has been recorded above and is passed through
            # verbatim rather than failing with a KeyError.
            if self.resolve_entities and known and key not in xmlEntities:
                data = entities[key]
            else:
                data = "&%s;" % name
            yield self.encodeStrict(data)

        else:
            self.serializeError(token["data"])
def render(self, treewalker, encoding=None):
    """Serialize the whole treewalker stream and return it in one piece.

    :arg treewalker: the treewalker to serialize

    :arg encoding: the string encoding to use; when given, the chunks are
        joined as bytes, otherwise as text

    :returns: the serialized tree

    Example:

    >>> from html5lib import parse, getTreeWalker
    >>> from html5lib.serializer import HTMLSerializer
    >>> token_stream = parse('<html><body>Hi!</body></html>')
    >>> walker = getTreeWalker('etree')
    >>> serializer = HTMLSerializer(omit_optional_tags=False)
    >>> serializer.render(walker(token_stream))
    '<html><head></head><body>Hi!</body></html>'

    """
    if not encoding:
        return "".join(self.serialize(treewalker))
    chunks = self.serialize(treewalker, encoding)
    return b"".join(chunks)
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
    """Record a serialization problem and raise if in strict mode.

    :arg data: description of the problem; appended to ``self.errors``

    :raises SerializeError: when ``self.strict`` is true
    """
    # XXX The idea is to make data mandatory.
    self.errors.append(data)
    if not self.strict:
        return
    raise SerializeError
class SerializeError(Exception):
    """Signals an error found in the tree being serialized."""

View file

@ -1,16 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from .. import treewalkers
from .htmlserializer import HTMLSerializer
def serialize(input, tree="etree", format="html", encoding=None,
              **serializer_opts):
    """Serialize a parsed tree with an ``HTMLSerializer``.

    :arg input: the tree to serialize

    :arg tree: name of the tree type, passed to
        ``treewalkers.getTreeWalker`` to pick the walker

    :arg format: output format; only ``"html"`` is supported

    :arg encoding: passed on to ``HTMLSerializer.render``

    :arg serializer_opts: keyword options for ``HTMLSerializer``

    :returns: whatever ``HTMLSerializer.render`` returns for the tree

    :raises ValueError: if ``format`` is not ``"html"``
    """
    # Reject an unsupported format before building a walker or serializer,
    # and name the parameter that is actually wrong.
    if format != "html":
        raise ValueError("format must be html")
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)

View file

@ -1,317 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
try:
from functools import reduce
except ImportError:
pass
from ..constants import voidElements, booleanAttributes, spaceCharacters
from ..constants import rcdataElements, entities, xmlEntities
from .. import utils
from xml.sax.saxutils import escape
spaceCharacters = "".join(spaceCharacters)
# Choose the codec error handler name that HTMLSerializer.encode passes to
# str.encode: plain "strict" when codecs.register_error cannot be imported,
# otherwise the custom "htmlentityreplace" handler defined below.
# NOTE(review): indentation is lost in this view; everything from
# encode_entity_map down to `del register_error` presumably sits inside the
# `else:` branch (it needs register_error) -- confirm against the real file.
try:
from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
unicode_encode_errors = "strict"
else:
unicode_encode_errors = "htmlentityreplace"
# Reverse lookup used by the error handler: code point (int) -> entity name.
encode_entity_map = {}
# True when a character outside the BMP has length 1 in this build.
is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
# skip multi-character entities
if ((is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)):
continue
if v != "&":
# A two-unit value is converted via surrogatePairToCodepoint; a
# single character via ord().
if len(v) == 2:
v = utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if v not in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
"""Codec error handler that writes unencodable characters as HTML entities.
For UnicodeEncodeError / UnicodeTranslateError, each code point in
exc.object[exc.start:exc.end] becomes a named reference when
encode_entity_map has a name for it and a hexadecimal ``&#x...;`` reference
otherwise; the (replacement, exc.end) pair is returned. Any other exception
is handed to xmlcharrefreplace_errors.
"""
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
# Set after a surrogate pair is consumed so its second unit is skipped.
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
# Look at this unit and the next one, without reading past exc.end.
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
# Add the terminating semicolon if the stored name lacks one.
if not e.endswith(";"):
res.append(";")
else:
# No named entity: fall back to a hex numeric reference.
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
# Make the handler available to str.encode under the chosen name.
register_error(unicode_encode_errors, htmlentityreplace_errors)
del register_error
class HTMLSerializer(object):
    """Turns a treewalker token stream into HTML text (or encoded bytes).

    The class attributes below are the option defaults; ``__init__``
    overrides them per instance from its keyword arguments.
    """

    # attribute quoting options
    quote_attr_values = False
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Names of the keyword options __init__ copies onto the instance.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of
          the document.
        quote_attr_values=False|True
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute name,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.
        alphabetical_attributes=False|True
          Reorder attributes to be in alphabetical order.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        # An explicit quote_char switches off automatic quote selection
        # unless the caller also passes use_best_quote_char.
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` to ``self.encoding`` with entity replacement.

        Returns the text unchanged when no output encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        return string

    def encodeStrict(self, string):
        """Encode ``string`` to ``self.encoding`` with the strict handler.

        Returns the text unchanged when no output encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        return string

    def serialize(self, treewalker, encoding=None):
        """Serialize the token stream from ``treewalker`` as a generator.

        :arg treewalker: the treewalker (an iterable of token dicts)
        :arg encoding: output encoding; when given, chunks are encoded to it
        :returns: generator of serialized chunks

        Problems are reported through ``serializeError``.
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from ..filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from ..filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from ..filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from ..filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        # Alphabetical attributes must be last, as other filters
        # could add attributes and alter the order
        if self.alphabetical_attributes:
            from ..filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    # Quote the system identifier with whichever quote
                    # character it does not contain itself.
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifer contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            # Quoting is required as soon as the value
                            # holds whitespace or one of > " ' =
                            quote_attr = any(char in v for char in
                                             spaceCharacters + ">\"'=")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    self.serializeError("Entity %s not recognized" % name)
                # Only a known entity can be resolved. In non-strict mode
                # an unknown one has been recorded above and is passed
                # through verbatim rather than failing with a KeyError.
                if self.resolve_entities and known and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize the whole stream and return it as one string.

        Returns bytes when ``encoding`` is given, text otherwise.
        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record ``data`` in ``self.errors``; raise when ``self.strict``."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError
class SerializeError(Exception):
    """Error in serialized tree"""
    # Must be a class, not a function: HTMLSerializer.serializeError does
    # ``raise SerializeError``, which only works for exception classes.
    pass

View file

@ -0,0 +1,30 @@
"""Tree adapters let you convert from one tree structure to another
Example:
.. code-block:: python
import html5lib
from html5lib.treeadapters import genshi
doc = '<html><body>Hi!</body></html>'
treebuilder = html5lib.getTreeBuilder('etree')
parser = html5lib.HTMLParser(tree=treebuilder)
tree = parser.parse(doc)
TreeWalker = html5lib.getTreeWalker('etree')
genshi_tree = genshi.to_genshi(TreeWalker(tree))
"""
from __future__ import absolute_import, division, unicode_literals
from . import sax
__all__ = ["sax"]
# The genshi adapter is optional: it is added to __all__ only when its import
# succeeds; an ImportError is ignored and the package is left with just "sax".
try:
from . import genshi # noqa
except ImportError:
pass
else:
__all__.append("genshi")

View file

@ -0,0 +1,54 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName, Attrs
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
def to_genshi(walker):
    """Translate a treewalker token stream into genshi stream events.

    Runs of character tokens are merged into a single TEXT event, which is
    emitted just before the next non-character token (or at the end).

    :arg walker: the treewalker to use to walk the tree to convert it

    :returns: generator of genshi nodes

    """
    def qualified_name(tok):
        # Clark notation ("{namespace}name") when the token has a namespace.
        if tok["namespace"]:
            return "{%s}%s" % (tok["namespace"], tok["name"])
        return tok["name"]

    pending = []
    for token in walker:
        kind = token["type"]

        if kind in ("Characters", "SpaceCharacters"):
            # Nothing else applies to character tokens; keep buffering.
            pending.append(token["data"])
            continue

        if pending:
            yield TEXT, "".join(pending), (None, -1, -1)
            pending = []

        if kind in ("StartTag", "EmptyTag"):
            tag = qualified_name(token)
            attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
                           for attr, value in token["data"].items()])
            yield (START, (QName(tag), attrs), (None, -1, -1))

        # An empty tag is closed straight after it has been opened.
        if kind in ("EndTag", "EmptyTag"):
            yield END, QName(qualified_name(token)), (None, -1, -1)
        elif kind == "Comment":
            yield COMMENT, token["data"], (None, -1, -1)
        elif kind == "Doctype":
            yield DOCTYPE, (token["name"], token["publicId"],
                            token["systemId"]), (None, -1, -1)
        # FIXME: What to do with any other token type? (currently ignored)

    if pending:
        yield TEXT, "".join(pending), (None, -1, -1)

View file

@ -11,7 +11,13 @@ for prefix, localName, namespace in adjustForeignAttributes.values():
def to_sax(walker, handler):
"""Call SAX-like content handler based on treewalker walker"""
"""Call SAX-like content handler based on treewalker walker
:arg walker: the treewalker to use to walk the tree to convert it
:arg handler: SAX handler to use
"""
handler.startDocument()
for prefix, namespace in prefix_mapping.items():
handler.startPrefixMapping(prefix, namespace)

View file

@ -1,56 +1,68 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
"""A collection of modules for building different kinds of trees from HTML
documents.
To create a treebuilder for a new type of tree, you need to
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.etree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
1. A set of classes for various types of elements: Document, Doctype, Comment,
Element. These must implement the interface of ``base.treebuilders.Node``
(although comment nodes have a different signature for their constructor,
see ``treebuilders.etree.Comment``) Textual content may also be implemented
as another node type, or not, as your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
* ``documentClass`` - the class to use for the bottommost node of a document
* ``elementClass`` - the class to use for HTML Elements
* ``commentClass`` - the class to use for comments
* ``doctypeClass`` - the class to use for doctypes
It also has one required method:
* ``getDocument`` - Returns the root node of the complete document tree
3. If you wish to run the unit tests, you must also create a ``testSerializer``
method on your treebuilder which accepts a node and returns a string
containing Node and its children serialized according to the format used in
the unittests
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
"""
from __future__ import absolute_import, division, unicode_literals
from ..utils import default_etree
from .._utils import default_etree
treeBuilderCache = {}
def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of tree with built-in support
"""Get a TreeBuilder class for various types of trees with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are:
:arg treeType: the name of the tree type required (case-insensitive). Supported
values are:
"dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation.
"etree" - A generic builder for tree implementations exposing an
ElementTree-like interface, defaulting to
xml.etree.cElementTree if available and
xml.etree.ElementTree if not.
"lxml" - A etree-based builder for lxml.etree, handling
limitations of lxml's implementation.
* "dom" - A generic builder for DOM implementations, defaulting to a
xml.dom.minidom based implementation.
* "etree" - A generic builder for tree implementations exposing an
ElementTree-like interface, defaulting to xml.etree.cElementTree if
available and xml.etree.ElementTree if not.
* "lxml" - A etree-based builder for lxml.etree, handling limitations
of lxml's implementation.
implementation - (Currently applies to the "etree" and "dom" tree types). A
module implementing the tree type e.g.
xml.etree.ElementTree or xml.etree.cElementTree."""
:arg implementation: (Currently applies to the "etree" and "dom" tree
types). A module implementing the tree type e.g. xml.etree.ElementTree
or xml.etree.cElementTree.
:arg kwargs: Any additional options to pass to the TreeBuilder when
creating it.
Example:
>>> from html5lib.treebuilders import getTreeBuilder
>>> builder = getTreeBuilder('etree')
"""
treeType = treeType.lower()
if treeType not in treeBuilderCache:

View file

@ -21,22 +21,25 @@ listElementsMap = {
class Node(object):
"""Represents an item in the tree"""
def __init__(self, name):
"""Node representing an item in the tree.
name - The tag name associated with the node
parent - The parent of the current node (or None for the document node)
value - The value of the current node (applies to text nodes and
comments
attributes - a dict holding name, value pairs for attributes of the node
childNodes - a list of child nodes of the current node. This must
include all elements but not necessarily other node types
_flags - A list of miscellaneous flags that can be set on the node
"""Creates a Node
:arg name: The tag name associated with the node
"""
# The tag name associated with the node
self.name = name
# The parent of the current node (or None for the document node)
self.parent = None
# The value of the current node (applies to text nodes and comments)
self.value = None
# A dict holding name -> value pairs for attributes of the node
self.attributes = {}
# A list of child nodes of the current node. This must include all
# elements but not necessarily other node types.
self.childNodes = []
# A list of miscellaneous flags that can be set on the node.
self._flags = []
def __str__(self):
@ -53,23 +56,41 @@ class Node(object):
def appendChild(self, node):
"""Insert node as a child of the current node
:arg node: the node to insert
:raises NotImplementedError: always, in this base implementation
"""
raise NotImplementedError
def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text.
:arg data: the data to insert
:arg insertBefore: the node to insert the text before; when None (the
default) the text goes at the end of the node's text
:raises NotImplementedError: always, in this base implementation
"""
raise NotImplementedError
def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
the current node"""
the current node
:arg node: the node to insert
:arg refNode: the child node to insert the node before
"""
raise NotImplementedError
def removeChild(self, node):
"""Remove node from the children of the current node
:arg node: the child node to remove
:raises NotImplementedError: always, in this base implementation
"""
raise NotImplementedError
@ -77,6 +98,9 @@ class Node(object):
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
text in the correct way
:arg newParent: the node to move all this node's children to
"""
# XXX - should this method be made more general?
for child in self.childNodes:
@ -121,11 +145,14 @@ class ActiveFormattingElements(list):
class TreeBuilder(object):
"""Base treebuilder implementation
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
* documentClass - the class to use for the bottommost node of a document
* elementClass - the class to use for HTML Elements
* commentClass - the class to use for comments
* doctypeClass - the class to use for doctypes
"""
# pylint:disable=not-callable
# Document class
documentClass = None
@ -143,6 +170,11 @@ class TreeBuilder(object):
fragmentClass = None
def __init__(self, namespaceHTMLElements):
"""Create a TreeBuilder
:arg namespaceHTMLElements: whether or not to namespace HTML elements
"""
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
@ -166,12 +198,17 @@ class TreeBuilder(object):
# If we pass a node in we match that. if we pass a string
# match any node with that name
exactNode = hasattr(target, "nameTuple")
if not exactNode:
if isinstance(target, text_type):
target = (namespaces["html"], target)
assert isinstance(target, tuple)
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
if (node.name == target and not exactNode or
node == target and exactNode):
if exactNode and node == target:
return True
elif not exactNode and node.nameTuple == target:
return True
elif (invert ^ (node.nameTuple in listElements)):
return False
@ -353,19 +390,19 @@ class TreeBuilder(object):
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
and name != exclude):
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):
"Return the final tree"
"""Return the final tree"""
return self.document
def getFragment(self):
"Return the final fragment"
"""Return the final fragment"""
# assert self.innerHTML
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
@ -373,5 +410,8 @@ class TreeBuilder(object):
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests
node - the node from which to start serializing"""
:arg node: the node from which to start serializing
"""
raise NotImplementedError

View file

@ -1,54 +1,62 @@
from __future__ import absolute_import, division, unicode_literals
from collections import MutableMapping
from xml.dom import minidom, Node
import weakref
from . import _base
from . import base
from .. import constants
from ..constants import namespaces
from ..utils import moduleFactoryFactory
from .._utils import moduleFactoryFactory
def getDomBuilder(DomImplementation):
Dom = DomImplementation
class AttrList(object):
class AttrList(MutableMapping):
def __init__(self, element):
self.element = element
def __iter__(self):
return list(self.element.attributes.items()).__iter__()
return iter(self.element.attributes.keys())
def __setitem__(self, name, value):
self.element.setAttribute(name, value)
def __len__(self):
return len(list(self.element.attributes.items()))
def items(self):
return [(item[0], item[1]) for item in
list(self.element.attributes.items())]
def keys(self):
return list(self.element.attributes.keys())
def __getitem__(self, name):
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.hasAttribute(name)
attr = self.element.ownerDocument.createAttribute(name)
attr.value = value
self.element.attributes[name] = attr
class NodeBuilder(_base.Node):
def __len__(self):
return len(self.element.attributes)
def items(self):
return list(self.element.attributes.items())
def values(self):
return list(self.element.attributes.values())
def __getitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.attributes[name].value
def __delitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
del self.element.attributes[name]
class NodeBuilder(base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self: hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None)
namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
@ -109,7 +117,7 @@ def getDomBuilder(DomImplementation):
nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self)
@ -149,15 +157,16 @@ def getDomBuilder(DomImplementation):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
return base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data = data
if parent != self:
_base.TreeBuilder.insertText(self, data, parent)
base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
# pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)

View file

@ -1,13 +1,15 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
from six import text_type
import re
from . import _base
from .. import ihatexml
from . import base
from .. import _ihatexml
from .. import constants
from ..constants import namespaces
from ..utils import moduleFactoryFactory
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
@ -16,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(_base.Node):
class Element(base.Node):
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
@ -98,6 +100,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
node.parent = self
def removeChild(self, node):
self._childNodes.remove(node)
self._element.remove(node._element)
node.parent = None
@ -139,7 +142,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
@ -253,10 +256,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "\n".join(rv)
def tostring(element):
def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string"""
rv = []
filter = ihatexml.InfosetFilter()
filter = _ihatexml.InfosetFilter()
def serializeElement(element):
if isinstance(element, ElementTree.ElementTree):
@ -307,7 +310,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
@ -329,7 +332,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self.document._element.find("html")
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element
return base.TreeBuilder.getFragment(self)._element
return locals()

View file

@ -10,16 +10,17 @@ When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings
import re
import sys
from . import _base
from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import ihatexml
from .. import _ihatexml
import lxml.etree as etree
@ -53,8 +54,7 @@ class Document(object):
def testSerializer(element):
rv = []
finalText = None
infosetFilter = ihatexml.InfosetFilter()
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
@ -79,7 +79,7 @@ def testSerializer(element):
next_element = next_element.getnext()
elif isinstance(element, str) or isinstance(element, bytes):
# Text in a fragment
assert isinstance(element, str) or sys.version_info.major == 2
assert isinstance(element, str) or sys.version_info[0] == 2
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
# Fragment case
@ -128,16 +128,12 @@ def testSerializer(element):
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\"" % (' ' * 2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
@ -173,13 +169,10 @@ def tostring(element):
serializeElement(element)
if finalText is not None:
rv.append("%s\"" % (' ' * 2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
@ -189,13 +182,15 @@ class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value={}):
def __init__(self, element, value=None):
if value is None:
value = {}
self._element = element
dict.__init__(self, value)
dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
@ -257,12 +252,12 @@ class TreeBuilder(_base.TreeBuilder):
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = builder.Comment
self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
_base.TreeBuilder.reset(self)
base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
@ -303,19 +298,20 @@ class TreeBuilder(_base.TreeBuilder):
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
"""Create the document root"""
# Because of the way libxml2 works, it doesn't seem to be possible to
# alter information like the doctype after the tree has been parsed.
# Therefore we need to use the built-in parser to create our iniial
# Therefore we need to use the built-in parser to create our initial
# tree, after which we can add elements like normal
docStr = ""
if self.doctype:
@ -344,7 +340,8 @@ class TreeBuilder(_base.TreeBuilder):
# Append the initial comments:
for comment_token in self.initial_comments:
root.addprevious(etree.Comment(comment_token["data"]))
comment = self.commentClass(comment_token["data"])
root.addprevious(comment._element)
# Create the root document and add the ElementTree to it
self.document = self.documentClass()

View file

@ -10,13 +10,10 @@ returning an iterator generating tokens.
from __future__ import absolute_import, division, unicode_literals
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
"pulldom"]
import sys
from .. import constants
from ..utils import default_etree
from .._utils import default_etree
__all__ = ["getTreeWalker", "pprint"]
treeWalkerCache = {}
@ -24,34 +21,38 @@ treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are:
:arg str treeType: the name of the tree type required (case-insensitive).
Supported values are:
"dom" - The xml.dom.minidom DOM implementation
"pulldom" - The xml.dom.pulldom event stream
"etree" - A generic walker for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
"lxml" - Optimized walker for lxml.etree
"genshi" - a Genshi stream
* "dom": The xml.dom.minidom DOM implementation
* "etree": A generic walker for tree implementations exposing an
elementtree-like interface (known to work with ElementTree,
cElementTree and lxml.etree).
* "lxml": Optimized walker for lxml.etree
* "genshi": a Genshi stream
implementation - (Currently applies to the "etree" tree type only). A module
implementing the tree type e.g. xml.etree.ElementTree or
cElementTree."""
:arg implementation: A module implementing the tree type e.g.
xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
tree type only).
:arg kwargs: keyword arguments passed to the etree walker--for other
walkers, this has no effect
:returns: a TreeWalker class
"""
treeType = treeType.lower()
if treeType not in treeWalkerCache:
if treeType in ("dom", "pulldom"):
name = "%s.%s" % (__name__, treeType)
__import__(name)
mod = sys.modules[name]
treeWalkerCache[treeType] = mod.TreeWalker
if treeType == "dom":
from . import dom
treeWalkerCache[treeType] = dom.TreeWalker
elif treeType == "genshi":
from . import genshistream
treeWalkerCache[treeType] = genshistream.TreeWalker
from . import genshi
treeWalkerCache[treeType] = genshi.TreeWalker
elif treeType == "lxml":
from . import lxmletree
treeWalkerCache[treeType] = lxmletree.TreeWalker
from . import etree_lxml
treeWalkerCache[treeType] = etree_lxml.TreeWalker
elif treeType == "etree":
from . import etree
if implementation is None:
@ -77,7 +78,13 @@ def concatenateCharacterTokens(tokens):
def pprint(walker):
"""Pretty printer for tree walkers"""
"""Pretty printer for tree walkers
Takes a TreeWalker instance and pretty prints the output of walking the tree.
:arg walker: a TreeWalker instance
"""
output = []
indent = 0
for token in concatenateCharacterTokens(walker):

View file

@ -1,11 +1,11 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types
from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
@ -14,80 +14,115 @@ COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
from ..constants import voidElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
def to_text(s, blank_if_none=True):
    """Coerce *s* to ``six.text_type``, with special handling for ``None``.

    :arg s: the value to convert

    :arg blank_if_none: when *s* is ``None``, return ``""`` if this is
        truthy and ``None`` otherwise

    :returns: *s* unchanged if it is already a ``text_type`` instance,
        ``text_type(s)`` for any other non-``None`` value, or the ``None``
        replacement described above
    """
    # Guard clause: None never goes through text_type().
    if s is None:
        return "" if blank_if_none else None
    # Already text: hand back the very same object, no copy.
    if isinstance(s, text_type):
        return s
    return text_type(s)
def is_text_or_none(string):
    """Tell whether *string* is ``None`` or an instance of ``string_types``.

    :arg string: the value to check

    :returns: ``True`` for ``None`` and for ``string_types`` instances,
        ``False`` for everything else
    """
    if string is None:
        return True
    return isinstance(string, string_types)
class TreeWalker(object):
"""Walks a tree yielding tokens
Tokens are dicts that all have a ``type`` field specifying the type of the
token.
"""
def __init__(self, tree):
"""Creates a TreeWalker
:arg tree: the tree to walk
"""
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
    """Build an error token that carries *msg*

    :arg msg: the error message

    :returns: SerializeError token
    """
    token = {"type": "SerializeError"}
    token["data"] = msg
    return token
def emptyTag(self, namespace, name, attrs, hasChildren=False):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(name)
assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())
"""Generates an EmptyTag token
yield {"type": "EmptyTag", "name": to_text(name, False),
"namespace": to_text(namespace),
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
:arg hasChildren: whether or not to yield a SerializationError because
this tag shouldn't have children
:returns: EmptyTag token
"""
yield {"type": "EmptyTag", "name": name,
"namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(name)
assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())
"""Generates a StartTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:arg attrs: the attributes of the element as a dict
:returns: StartTag token
"""
return {"type": "StartTag",
"name": text_type(name),
"namespace": to_text(namespace),
"data": dict(((to_text(namespace, False), to_text(name)),
to_text(value, False))
for (namespace, name), value in attrs.items())}
"name": name,
"namespace": namespace,
"data": attrs}
def endTag(self, namespace, name):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(namespace)
"""Generates an EndTag token
:arg namespace: the namespace of the token--can be ``None``
:arg name: the name of the element
:returns: EndTag token
"""
return {"type": "EndTag",
"name": to_text(name, False),
"namespace": to_text(namespace),
"data": {}}
"name": name,
"namespace": namespace}
def text(self, data):
assert isinstance(data, string_types), type(data)
"""Generates SpaceCharacters and Characters tokens
data = to_text(data)
Depending on what's in the data, this generates one or more
``SpaceCharacters`` and ``Characters`` tokens.
For example:
>>> from html5lib.treewalkers.base import TreeWalker
>>> # Give it an empty tree just so it instantiates
>>> walker = TreeWalker([])
>>> list(walker.text(''))
[]
>>> list(walker.text(' '))
[{u'data': ' ', u'type': u'SpaceCharacters'}]
>>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE
[{u'data': ' ', u'type': u'SpaceCharacters'},
{u'data': u'abc', u'type': u'Characters'},
{u'data': u' ', u'type': u'SpaceCharacters'}]
:arg data: the text data
:returns: one or more ``SpaceCharacters`` and ``Characters`` tokens
"""
data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
@ -101,27 +136,44 @@ class TreeWalker(object):
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
assert isinstance(data, string_types), type(data)
"""Generates a Comment token
return {"type": "Comment", "data": text_type(data)}
:arg data: the comment
def doctype(self, name, publicId=None, systemId=None, correct=True):
assert is_text_or_none(name), type(name)
assert is_text_or_none(publicId), type(publicId)
assert is_text_or_none(systemId), type(systemId)
:returns: Comment token
"""
return {"type": "Comment", "data": data}
def doctype(self, name, publicId=None, systemId=None):
"""Generates a Doctype token
:arg name:
:arg publicId:
:arg systemId:
:returns: the Doctype token
"""
return {"type": "Doctype",
"name": to_text(name),
"publicId": to_text(publicId),
"systemId": to_text(systemId),
"correct": to_text(correct)}
"name": name,
"publicId": publicId,
"systemId": systemId}
def entity(self, name):
assert isinstance(name, string_types), type(name)
"""Generates an Entity token
return {"type": "Entity", "name": text_type(name)}
:arg name: the entity name
:returns: an Entity token
"""
return {"type": "Entity", "name": name}
def unknown(self, nodeType):
"""Handles unknown node types"""
return self.error("Unknown node type: " + nodeType)
@ -154,7 +206,7 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
@ -187,7 +239,7 @@ class NonRecursiveTreeWalker(TreeWalker):
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name not in voidElements:
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None

View file

@ -2,16 +2,16 @@ from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from . import _base
from . import base
class TreeWalker(_base.NonRecursiveTreeWalker):
class TreeWalker(base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
return _base.DOCTYPE, node.name, node.publicId, node.systemId
return base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
return _base.TEXT, node.nodeValue
return base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
attrs = {}
@ -21,17 +21,17 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs[(attr.namespaceURI, attr.localName)] = attr.value
else:
attrs[(None, attr.name)] = attr.value
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
return (base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue
return base.COMMENT, node.nodeValue
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
return (_base.DOCUMENT,)
return (base.DOCUMENT,)
else:
return _base.UNKNOWN, node.nodeType
return base.UNKNOWN, node.nodeType
def getFirstChild(self, node):
return node.firstChild

View file

@ -1,19 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
try:
from collections import OrderedDict
except ImportError:
try:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict
from collections import OrderedDict
import re
from six import string_types
from . import _base
from ..utils import moduleFactoryFactory
from . import base
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
@ -22,7 +15,7 @@ def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class TreeWalker(_base.NonRecursiveTreeWalker):
class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
@ -38,9 +31,9 @@ def getETreeBuilder(ElementTreeImplementation):
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, key, parents, flag = node
elt, _, _, flag = node
if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag)
return base.TEXT, getattr(elt, flag)
else:
node = elt
@ -48,14 +41,14 @@ def getETreeBuilder(ElementTreeImplementation):
node = node.getroot()
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
return (_base.DOCUMENT,)
return (base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return (_base.DOCTYPE, node.text,
return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif node.tag == ElementTreeCommentType:
return _base.COMMENT, node.text
return base.COMMENT, node.text
else:
assert isinstance(node.tag, string_types), type(node.tag)
@ -73,7 +66,7 @@ def getETreeBuilder(ElementTreeImplementation):
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (_base.ELEMENT, namespace, tag,
return (base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):
@ -129,6 +122,7 @@ def getETreeBuilder(ElementTreeImplementation):
if not parents:
return parent
else:
assert list(parents[-1]).count(parent) == 1
return parent, list(parents[-1]).index(parent), parents, None
return locals()

View file

@ -4,9 +4,9 @@ from six import text_type
from lxml import etree
from ..treebuilders.etree import tag_regexp
from . import _base
from . import base
from .. import ihatexml
from .. import _ihatexml
def ensure_str(s):
@ -15,20 +15,27 @@ def ensure_str(s):
elif isinstance(s, text_type):
return s
else:
return s.decode("utf-8", "strict")
return s.decode("ascii", "strict")
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
if et.docinfo.internalDTD:
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
root = et.getroot()
node = root
try:
if et.docinfo.internalDTD:
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
except AttributeError:
pass
try:
node = et.getroot()
except AttributeError:
node = et
while node.getprevious() is not None:
node = node.getprevious()
@ -115,35 +122,38 @@ class FragmentWrapper(object):
return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
class TreeWalker(base.NonRecursiveTreeWalker):
def __init__(self, tree):
if hasattr(tree, "getroot"):
tree = Root(tree)
elif isinstance(tree, list):
# pylint:disable=redefined-variable-type
if isinstance(tree, list):
self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
else:
self.fragmentChildren = set()
tree = Root(tree)
base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = _ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return _base.TEXT, ensure_str(getattr(node, key))
return base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
return (_base.DOCUMENT,)
return (base.DOCUMENT,)
elif isinstance(node, Doctype):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
return base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
return _base.TEXT, node.obj
return base.TEXT, ensure_str(node.obj)
elif node.tag == etree.Comment:
return _base.COMMENT, ensure_str(node.text)
return base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else:
# This is assumed to be an ordinary element
@ -162,7 +172,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
@ -197,5 +207,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
if key == "text":
return node
# else: fallback to "normal" processing
elif node in self.fragmentChildren:
return None
return node.getparent()

View file

@ -4,12 +4,12 @@ from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from . import _base
from . import base
from ..constants import voidElements, namespaces
class TreeWalker(_base.TreeWalker):
class TreeWalker(base.TreeWalker):
def __iter__(self):
# Buffer the events so we can pass in the following one
previous = None
@ -25,7 +25,7 @@ class TreeWalker(_base.TreeWalker):
yield token
def tokens(self, event, next):
kind, data, pos = event
kind, data, _ = event
if kind == START:
tag, attribs = data
name = tag.localname
@ -39,8 +39,8 @@ class TreeWalker(_base.TreeWalker):
if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
not next or next[0] != END
or next[1] != tag):
not next or next[0] != END or
next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, converted_attribs)
@ -48,7 +48,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == END:
name = data.localname
namespace = data.namespace
if name not in voidElements:
if namespace != namespaces["html"] or name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:

View file

@ -1,63 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
from . import _base
from ..constants import voidElements
class TreeWalker(_base.TreeWalker):
    """Tree walker over a stream of ``(event_type, node)`` pairs.

    ``self.tree`` is iterated once; each event is translated into walker
    tokens with one event of look-ahead, so that a void element can report
    whether anything other than its own end event follows it.
    """

    def __iter__(self):
        """Yield tokens for every event in ``self.tree``.

        After an ``EmptyTag`` token has been emitted for a node, all events
        are skipped until the next event that refers to that same node.

        :raises ValueError: if the stream ends while events are still being
            skipped, i.e. a void element never received its closing event
        """
        ignore_until = None
        previous = None
        for event in self.tree:
            # Events are processed one step late so tokens() can see the
            # event that follows the one being translated.
            if previous is not None and \
                    (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        # Swallow whatever sits inside the void element.
                        ignore_until = previous[1]
            previous = event

        # An empty stream has no trailing event to flush; without this guard
        # tokens() would try to unpack None and raise TypeError.
        if previous is None:
            return

        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        else:
            # Reaching here means ignore_until is set and the last event
            # did not close it.
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one event into zero or more walker tokens.

        :arg event: ``(event_type, node)`` pair to translate

        :arg next: the event that follows *event*, or ``None`` at the end
            of the stream; used to detect void elements that have children

        :returns: generator of tokens
        """
        kind, node = event
        if kind == START_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            attrs = {}
            for attr_name in list(node.attributes.keys()):
                attr = node.getAttributeNode(attr_name)
                attrs[(attr.namespaceURI, attr.localName)] = attr.value
            if name in voidElements:
                # A void element "has children" when the next event is
                # missing or belongs to a different node.
                for token in self.emptyTag(namespace,
                                           name,
                                           attrs,
                                           not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(namespace, name, attrs)

        elif kind == END_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            # Void elements were already emitted as a single EmptyTag.
            if name not in voidElements:
                yield self.endTag(namespace, name)

        elif kind == COMMENT:
            yield self.comment(node.nodeValue)

        elif kind in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(kind)