1028 lines
35 KiB
Python
1028 lines
35 KiB
Python
# -*- test-case-name: twisted.web.test.test_xml -*-
|
|
# Copyright (c) Twisted Matrix Laboratories.
|
|
# See LICENSE for details.
|
|
|
|
"""
|
|
Micro Document Object Model: a partial DOM implementation with SUX.
|
|
|
|
This is an implementation of what we consider to be the useful subset of the
|
|
DOM. The chief advantage of this library is that, not being burdened with
|
|
standards compliance, it can remain very stable between versions. We can also
|
|
implement utility 'pythonic' ways to access and mutate the XML tree.
|
|
|
|
Since this has not been subjected to a serious trial by fire, it is not recommended
|
|
to use this outside of Twisted applications. However, it seems to work just
|
|
fine for the documentation generator, which parses a fairly representative
|
|
sample of XML.
|
|
|
|
Microdom mainly focuses on working with HTML and XHTML.
|
|
"""
|
|
|
|
# System Imports
|
|
import re
|
|
from cStringIO import StringIO
|
|
|
|
# create NodeList class
|
|
from types import ListType as NodeList
|
|
from types import StringTypes, UnicodeType
|
|
|
|
# Twisted Imports
|
|
from twisted.web.sux import XMLParser, ParseError
|
|
from twisted.python.util import InsensitiveDict
|
|
|
|
|
|
def getElementsByTagName(iNode, name):
    """
    Return a list of all child elements of C{iNode} with a name matching
    C{name}.

    Note that this implementation does not conform to the DOM Level 1 Core
    specification because it may return C{iNode}.

    @param iNode: An element at which to begin searching.  If C{iNode} has a
        name matching C{name}, it will be included in the result.

    @param name: A C{str} giving the name of the elements to return.

    @return: A C{list} of direct or indirect child elements of C{iNode} with
        the name C{name}, in depth-first pre-order.  This may include
        C{iNode}.
    """
    found = []
    # Explicit stack of nodes still to visit; the next node to visit is
    # always at the end of the list.
    pending = [iNode]
    while pending:
        node = pending.pop()
        if node.nodeName == name:
            found.append(node)
        # Push the children back-to-front so the first child is visited next.
        children = list(node.childNodes)
        children.reverse()
        pending.extend(children)
    return found
|
|
|
|
|
|
|
|
def getElementsByTagNameNoCase(iNode, name):
    """
    Return a list of every node at or below C{iNode} whose C{nodeName}
    equals C{name} when both are lower-cased.

    @param iNode: The node at which to begin searching.  It is itself
        included in the result if its name matches.

    @param name: A C{str} giving the name to look for; case is ignored.

    @return: A C{list} of the matching nodes in depth-first pre-order.
    """
    wanted = name.lower()
    found = []
    # Explicit stack of nodes still to visit; the next node to visit is
    # always at the end of the list.
    pending = [iNode]
    while pending:
        node = pending.pop()
        if node.nodeName.lower() == wanted:
            found.append(node)
        # Push the children back-to-front so the first child is visited next.
        children = list(node.childNodes)
        children.reverse()
        pending.extend(children)
    return found
|
|
|
|
# Order is important: '&' must be escaped first (and therefore unescaped
# last), otherwise the ampersands introduced by the other entities would
# themselves be escaped.
HTML_ESCAPE_CHARS = (('&', '&amp;'),  # don't add any entities before this one
                     ('<', '&lt;'),
                     ('>', '&gt;'),
                     ('"', '&quot;'))
REV_HTML_ESCAPE_CHARS = list(HTML_ESCAPE_CHARS)
REV_HTML_ESCAPE_CHARS.reverse()

# XML additionally defines an entity for the apostrophe.
XML_ESCAPE_CHARS = HTML_ESCAPE_CHARS + (("'", '&apos;'),)
REV_XML_ESCAPE_CHARS = list(XML_ESCAPE_CHARS)
REV_XML_ESCAPE_CHARS.reverse()


def unescape(text, chars=REV_HTML_ESCAPE_CHARS):
    """
    Perform the exact opposite of L{escape}.

    @param text: The string in which entities are replaced.

    @param chars: A sequence of C{(character, entity)} pairs, applied in
        order; each entity found in C{text} is replaced by its character.
        Defaults to the HTML table in reverse order so that C{&amp;} is
        handled last.

    @return: C{text} with every listed entity replaced.
    """
    for s, h in chars:
        text = text.replace(h, s)
    return text


def escape(text, chars=HTML_ESCAPE_CHARS):
    """
    Escape a few XML special chars with XML entities.

    @param text: The string in which special characters are replaced.

    @param chars: A sequence of C{(character, entity)} pairs, applied in
        order; each character found in C{text} is replaced by its entity.

    @return: C{text} with every listed character replaced.
    """
    for s, h in chars:
        text = text.replace(s, h)
    return text
|
|
|
|
|
|
class MismatchedTags(Exception):
    """
    Exception describing a closing tag which does not match the tag that was
    expected.

    The constructor arguments are stored unchanged as attributes of the same
    names: C{filename}, C{expect}, C{got}, C{endLine}, C{endCol}, C{begLine}
    and C{begCol}.
    """

    def __init__(self, filename, expect, got, endLine, endCol, begLine, begCol):
        self.filename = filename
        self.expect = expect
        self.got = got
        self.begLine = begLine
        self.begCol = begCol
        self.endLine = endLine
        self.endCol = endCol

    def __str__(self):
        details = (self.expect, self.got, self.endLine, self.endCol,
                   self.begLine, self.begCol)
        template = ("expected </%s>, got </%s> line: %s col: %s, "
                    "began line: %s col: %s")
        return template % details
|
|
|
|
|
|
class Node(object):
    """
    Base class of all microdom nodes.

    @ivar parentNode: The node this node is a child of, or C{None}.

    @ivar childNodes: A C{list} of the children of this node, in order.
    """
    nodeName = "Node"

    def __init__(self, parentNode=None):
        self.parentNode = parentNode
        self.childNodes = []

    def isEqualToNode(self, other):
        """
        Compare this node to C{other}.  If the nodes have the same number of
        children and corresponding children are equal to each other, return
        C{True}, otherwise return C{False}.

        @type other: L{Node}
        @rtype: C{bool}
        """
        mine = self.childNodes
        theirs = other.childNodes
        if len(mine) != len(theirs):
            return False
        for a, b in zip(mine, theirs):
            if not a.isEqualToNode(b):
                return False
        return True

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Serialize this node to C{stream}.  Subclasses must override this.

        @raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def toxml(self, indent='', addindent='', newl='', strip=0, nsprefixes={},
              namespace=''):
        """
        Serialize this node with L{writexml} into an in-memory stream and
        return everything that was written.
        """
        buf = StringIO()
        self.writexml(buf, indent, addindent, newl, strip, nsprefixes,
                      namespace)
        return buf.getvalue()

    def writeprettyxml(self, stream, indent='', addindent=' ', newl='\n',
                       strip=0):
        """
        Like L{writexml}, but C{addindent} and C{newl} default to a space and
        a newline.
        """
        return self.writexml(stream, indent, addindent, newl, strip)

    def toprettyxml(self, indent='', addindent=' ', newl='\n', strip=0):
        """
        Like L{toxml}, but C{addindent} and C{newl} default to a space and a
        newline.
        """
        return self.toxml(indent, addindent, newl, strip)

    def cloneNode(self, deep=0, parent=None):
        """
        Return a copy of this node.  Subclasses must override this.

        @raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def hasChildNodes(self):
        """
        Return C{1} if this node has at least one child, C{0} otherwise.
        """
        if self.childNodes:
            return 1
        return 0

    def appendChild(self, child):
        """
        Make the given L{Node} the last child of this node.

        @param child: The L{Node} which will become a child of this node.

        @raise TypeError: If C{child} is not a C{Node} instance.
        """
        if not isinstance(child, Node):
            raise TypeError("expected Node instance")
        self.childNodes.append(child)
        child.parentNode = self

    def insertBefore(self, new, ref):
        """
        Make the given L{Node} C{new} a child of this node which comes before
        the L{Node} C{ref}.

        @param new: A L{Node} which will become a child of this node.

        @param ref: A L{Node} which is already a child of this node which
            C{new} will be inserted before.

        @raise TypeError: If C{new} or C{ref} is not a C{Node} instance.

        @raise ValueError: If C{ref} is not in this node's C{childNodes}.

        @return: C{new}
        """
        if not isinstance(new, Node) or not isinstance(ref, Node):
            raise TypeError("expected Node instance")
        # Look the position up first so nothing is modified when ref is not
        # one of our children.
        position = self.childNodes.index(ref)
        new.parentNode = self
        self.childNodes.insert(position, new)
        return new

    def removeChild(self, child):
        """
        Remove the given L{Node} from this node's children.

        @param child: A L{Node} which is a child of this node which will no
            longer be a child of this node after this method is called.  If
            it is not currently a child, nothing is changed.

        @raise TypeError: If C{child} is not a C{Node} instance.

        @return: C{child}
        """
        if not isinstance(child, Node):
            raise TypeError("expected Node instance")
        if child in self.childNodes:
            self.childNodes.remove(child)
            child.parentNode = None
        return child

    def replaceChild(self, newChild, oldChild):
        """
        Replace a L{Node} which is already a child of this node with a
        different node.

        @param newChild: A L{Node} which will be made a child of this node.

        @param oldChild: A L{Node} which is a child of this node which will
            give up its position to C{newChild}.

        @raise TypeError: If C{newChild} or C{oldChild} is not a C{Node}
            instance.

        @raise ValueError: If C{oldChild} is not a child of this C{Node}.
        """
        if not isinstance(newChild, Node) or not isinstance(oldChild, Node):
            raise TypeError("expected Node instance")
        if oldChild.parentNode is not self:
            raise ValueError("oldChild is not a child of this node")
        self.childNodes[self.childNodes.index(oldChild)] = newChild
        oldChild.parentNode = None
        newChild.parentNode = self

    def lastChild(self):
        """
        Return the last child of this node, or C{None} if it has no
        children (mirroring L{firstChild}).
        """
        if self.childNodes:
            return self.childNodes[-1]
        return None

    def firstChild(self):
        """
        Return the first child of this node, or C{None} if it has no
        children.
        """
        if self.childNodes:
            return self.childNodes[0]
        return None

    #def get_ownerDocument(self):
    #    """This doesn't really get the owner document; microdom nodes
    #    don't even have one necessarily.  This gets the root node,
    #    which is usually what you really meant.
    #    *NOT DOM COMPLIANT.*
    #    """
    #    node=self
    #    while (node.parentNode): node=node.parentNode
    #    return node
    #ownerDocument=node.get_ownerDocument()
    # leaving commented for discussion; see also domhelpers.getParents(node)
|
|
|
|
class Document(Node):
    """
    The root of a microdom tree: a node holding at most one child, the
    I{document element}, plus an optional doctype string.

    @ivar doctype: The text written inside C{<!DOCTYPE ...>} by L{writexml},
        or C{None} (the class default) to write no doctype.
    """

    doctype = None

    def __init__(self, documentElement=None):
        Node.__init__(self)
        if documentElement:
            self.appendChild(documentElement)

    def cloneNode(self, deep=0, parent=None):
        """
        Return a new L{Document} with the same doctype.

        @param deep: If true, the document element is cloned recursively.
            Otherwise the very same document element object is appended to
            the new document.
            NOTE(review): the shallow case re-parents the original document
            element onto the copy -- confirm this is intended.

        @param parent: Unused.
        """
        copy = Document()
        copy.doctype = self.doctype
        root = self.documentElement
        if deep:
            root = root.cloneNode(1, self)
        copy.appendChild(root)
        return copy

    def isEqualToDocument(self, n):
        """
        Return a true value if C{n} has the same doctype as this document
        and their children compare equal.
        """
        return (self.doctype == n.doctype) and Node.isEqualToNode(self, n)
    isEqualToNode = isEqualToDocument

    def get_documentElement(self):
        """
        Return the first child of this document.

        @raise IndexError: If the document has no children.
        """
        return self.childNodes[0]
    documentElement = property(get_documentElement)

    def appendChild(self, child):
        """
        Make the given L{Node} the I{document element} of this L{Document}.

        @param child: The L{Node} to make into this L{Document}'s document
            element.

        @raise ValueError: If this document already has a document element.
        """
        if self.childNodes:
            raise ValueError("Only one element per document.")
        Node.appendChild(self, child)

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Write the XML declaration, the doctype (if any) and then the
        document element to C{stream}.
        """
        write = stream.write
        write('<?xml version="1.0"?>' + newl)
        if self.doctype:
            write("<!DOCTYPE " + self.doctype + ">" + newl)
        self.documentElement.writexml(stream, indent, addindent, newl, strip,
                                      nsprefixes, namespace)

    # of dubious utility (?)
    def createElement(self, name, **kw):
        """
        Return a new, unattached L{Element}; C{kw} is passed to its
        initializer.
        """
        return Element(name, **kw)

    def createTextNode(self, text):
        """
        Return a new, unattached L{Text} node holding C{text}.
        """
        return Text(text)

    def createComment(self, text):
        """
        Return a new, unattached L{Comment} node holding C{text}.
        """
        return Comment(text)

    def getElementsByTagName(self, name):
        """
        Return the nodes of this document named C{name}, ignoring case when
        the document element is case insensitive.
        """
        if self.documentElement.caseInsensitive:
            finder = getElementsByTagNameNoCase
        else:
            finder = getElementsByTagName
        return finder(self, name)

    def getElementById(self, id):
        """
        Search the tree breadth-first and return the first node whose C{id}
        attribute equals C{id}, or C{None} if there is none.
        """
        queue = list(self.childNodes)
        while queue:
            node = queue.pop(0)
            if node.childNodes:
                queue.extend(node.childNodes)
            # Only nodes which can carry attributes are candidates.
            if hasattr(node, 'getAttribute') and node.getAttribute("id") == id:
                return node
        return None
|
|
|
|
|
|
class EntityReference(Node):
    """
    A node representing an entity reference such as C{&name;}.

    @ivar eref: The entity name, without the leading C{&} and trailing C{;}.

    @ivar nodeValue: The full reference text, C{"&" + eref + ";"}.  Also
        available as C{data}.
    """

    def __init__(self, eref, parentNode=None):
        Node.__init__(self, parentNode)
        self.eref = eref
        text = "&" + eref + ";"
        self.data = text
        self.nodeValue = text

    def isEqualToEntityReference(self, n):
        """
        Return a true value if C{n} is an L{EntityReference} with the same
        C{eref} and C{nodeValue}; return C{0} if it is not an
        L{EntityReference} at all.
        """
        if not isinstance(n, EntityReference):
            return 0
        return (self.eref == n.eref) and (self.nodeValue == n.nodeValue)
    isEqualToNode = isEqualToEntityReference

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Write the reference text to C{stream}; the formatting arguments are
        ignored.
        """
        stream.write(self.nodeValue)

    def cloneNode(self, deep=0, parent=None):
        """
        Return a new L{EntityReference} for the same entity with C{parent}
        as its parent node.
        """
        return EntityReference(self.eref, parent)
|
|
|
|
|
|
class CharacterData(Node):
    """
    Base class for nodes which hold a piece of character data.

    @ivar value: The data given to the initializer.  The same object is also
        available as C{data} and C{nodeValue}.
    """

    def __init__(self, data, parentNode=None):
        Node.__init__(self, parentNode)
        self.value = self.data = self.nodeValue = data

    def isEqualToCharacterData(self, n):
        """
        Return C{True} if C{n} is a L{CharacterData} holding an equal value.

        Nodes of any other kind (which have no C{value} attribute) compare
        unequal instead of raising C{AttributeError}, so that trees with
        mixed children can be compared safely.

        @rtype: C{bool}
        """
        return isinstance(n, CharacterData) and self.value == n.value
    isEqualToNode = isEqualToCharacterData
|
|
|
|
|
|
class Comment(CharacterData):
    """
    A comment node.
    """

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Write this comment to C{stream} as C{<!--data-->}.  Unicode data is
        encoded as UTF-8 first; the formatting arguments are ignored.
        """
        text = self.data
        if isinstance(text, UnicodeType):
            text = text.encode('utf8')
        stream.write("<!--%s-->" % text)

    def cloneNode(self, deep=0, parent=None):
        """
        Return a new L{Comment} with the same value and C{parent} as its
        parent node.
        """
        return Comment(self.nodeValue, parent)
|
|
|
|
|
|
class Text(CharacterData):
    """
    A text node.

    @ivar raw: If true, the value is written out by L{writexml} without
        escaping or whitespace stripping.
    """

    def __init__(self, data, parentNode=None, raw=0):
        CharacterData.__init__(self, data, parentNode)
        self.raw = raw

    def isEqualToNode(self, other):
        """
        Compare this text to C{other}.  If the underlying values and the
        C{raw} flag are the same, return C{True}, otherwise return C{False}.
        """
        return (
            CharacterData.isEqualToNode(self, other) and
            self.raw == other.raw)

    def cloneNode(self, deep=0, parent=None):
        """
        Return a new L{Text} with the same value and C{raw} flag and
        C{parent} as its parent node.
        """
        return Text(self.nodeValue, parent, self.raw)

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Write this text to C{stream}.

        Non-string values are converted with C{str}.  Unless C{raw} is set,
        runs of whitespace are collapsed to single spaces when C{strip} is
        true and the result is escaped with L{escape}.  Unicode output is
        encoded as UTF-8.
        """
        text = self.nodeValue
        if not isinstance(text, StringTypes):
            text = str(text)
        if not self.raw:
            if strip:
                text = ' '.join(text.split())
            text = escape(text)
        if isinstance(text, UnicodeType):
            text = text.encode('utf8')
        stream.write(text)

    def __repr__(self):
        return "Text(%r)" % (self.nodeValue,)
|
|
|
|
|
|
class CDATASection(CharacterData):
    """
    A CDATA section node; its value is written out verbatim between
    C{<![CDATA[} and C{]]>}.
    """

    def cloneNode(self, deep=0, parent=None):
        """
        Return a new L{CDATASection} with the same value and C{parent} as
        its parent node.
        """
        return CDATASection(self.nodeValue, parent)

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Write the opening marker, the unescaped value and the closing marker
        to C{stream} as three separate writes; the formatting arguments are
        ignored.
        """
        for piece in ("<![CDATA[", self.nodeValue, "]]>"):
            stream.write(piece)
|
|
|
|
def _genprefix():
    """
    Generate an endless series of prefix strings: C{'p0'}, C{'p1'},
    C{'p2'}, ...
    """
    i = 0
    while True:
        yield 'p' + str(i)
        i = i + 1

# genprefix() returns the next unused prefix each time it is called.  The
# generator method is called ``next`` on some Python versions and
# ``__next__`` on others; bind whichever one exists.
_prefixes = _genprefix()
try:
    genprefix = _prefixes.next
except AttributeError:
    genprefix = _prefixes.__next__
|
|
|
|
class _Attr(CharacterData):
    """
    Support class for getAttributeNode: a L{CharacterData} node wrapping an
    attribute value.
    """
|
|
|
|
class Element(Node):
    """
    An element node: a tag name, a mapping of attributes and child nodes.

    @ivar tagName: The name of the tag; also stored as C{nodeName}.

    @ivar endTagName: The name used for the closing tag when serializing.

    @ivar attributes: The attributes of this element: a C{dict}, or an
        C{InsensitiveDict} when the element is case insensitive.  Keys are
        attribute names or C{(namespace, name)} tuples.

    @ivar namespace: The namespace URI of this element, or C{None}.

    @ivar nsprefixes: C{None}, or a C{dict} mapping namespace URIs to the
        prefixes declared on this element.
    """

    preserveCase = 0
    caseInsensitive = 1
    nsprefixes = None

    def __init__(self, tagName, attributes=None, parentNode=None,
                 filename=None, markpos=None,
                 caseInsensitive=1, preserveCase=0,
                 namespace=None):
        """
        @param tagName: The tag name; lower-cased unless C{preserveCase} is
            true.

        @param attributes: C{None} or a C{dict} of attributes.  Note that
            the values of the given C{dict} are unescaped in place.

        @param parentNode: The parent of this element, or C{None}.

        @param filename: Where this element came from; only used by
            C{__repr__} and C{__str__}.

        @param markpos: The C{(line, column)} at which this element began;
            only used for reporting.

        @param caseInsensitive: If true, attribute lookup and tag name
            comparison ignore case.

        @param preserveCase: If true, the case of names is kept as given.

        @param namespace: The namespace URI of this element, or C{None}.
        """
        Node.__init__(self, parentNode)
        self.preserveCase = preserveCase or not caseInsensitive
        self.caseInsensitive = caseInsensitive
        if not preserveCase:
            tagName = tagName.lower()
        if attributes is None:
            self.attributes = {}
        else:
            self.attributes = attributes
            for k, v in self.attributes.items():
                self.attributes[k] = unescape(v)

        if caseInsensitive:
            self.attributes = InsensitiveDict(self.attributes,
                                              preserve=preserveCase)

        self.endTagName = self.nodeName = self.tagName = tagName
        self._filename = filename
        self._markpos = markpos
        self.namespace = namespace

    def addPrefixes(self, pfxs):
        """
        Add the namespace-URI-to-prefix mappings in C{pfxs} to
        C{nsprefixes}.  The first C{dict} given is stored as is, not copied.
        """
        if self.nsprefixes is None:
            self.nsprefixes = pfxs
        else:
            self.nsprefixes.update(pfxs)

    def endTag(self, endTagName):
        """
        Record the name which was used to close this element; it is
        lower-cased unless case is being preserved.
        """
        if not self.preserveCase:
            endTagName = endTagName.lower()
        self.endTagName = endTagName

    def isEqualToElement(self, n):
        """
        Return a true value if C{n} has equal attributes and an equal
        C{nodeName} (ignoring case if this element is case insensitive).
        Children are not compared.
        """
        if self.caseInsensitive:
            return ((self.attributes == n.attributes)
                    and (self.nodeName.lower() == n.nodeName.lower()))
        return (self.attributes == n.attributes) and (self.nodeName == n.nodeName)

    def isEqualToNode(self, other):
        """
        Compare this element to C{other}.  If the C{nodeName}, C{namespace},
        C{attributes}, and C{childNodes} are all the same, return C{True},
        otherwise return C{False}.  Node names are compared ignoring case.
        """
        return (
            self.nodeName.lower() == other.nodeName.lower() and
            self.namespace == other.namespace and
            self.attributes == other.attributes and
            Node.isEqualToNode(self, other))

    def cloneNode(self, deep=0, parent=None):
        """
        Return a copy of this element with the same tag name, namespace,
        case settings and attributes.

        @param deep: If true, the children are cloned recursively as well;
            otherwise the copy has no children.

        @param parent: The parent node of the copy.
        """
        clone = Element(
            self.tagName, parentNode=parent, namespace=self.namespace,
            preserveCase=self.preserveCase, caseInsensitive=self.caseInsensitive)
        clone.attributes.update(self.attributes)
        if deep:
            clone.childNodes = [child.cloneNode(1, clone) for child in self.childNodes]
        else:
            clone.childNodes = []
        return clone

    def getElementsByTagName(self, name):
        """
        Return the nodes at or below this element named C{name}, ignoring
        case if this element is case insensitive.
        """
        if self.caseInsensitive:
            return getElementsByTagNameNoCase(self, name)
        return getElementsByTagName(self, name)

    def hasAttributes(self):
        """
        Always return C{1}, whether or not any attribute is set.
        """
        return 1

    def getAttribute(self, name, default=None):
        """
        Return the value of the attribute C{name}, or C{default} if it is
        not set.
        """
        return self.attributes.get(name, default)

    def getAttributeNS(self, ns, name, default=None):
        """
        Return the value of the attribute C{name} in namespace C{ns}.

        The attribute stored under the key C{(ns, name)} is preferred.  If
        there is none and C{ns} is this element's own namespace, the
        attribute stored under the plain C{name} is used.  Otherwise
        C{default} is returned.
        """
        nsk = (ns, name)
        if nsk in self.attributes:
            return self.attributes[nsk]
        if ns == self.namespace:
            return self.attributes.get(name, default)
        return default

    def getAttributeNode(self, name):
        """
        Return an L{_Attr} node wrapping the value of the attribute C{name}
        with this element as its parent.
        """
        return _Attr(self.getAttribute(name), self)

    def setAttribute(self, name, attr):
        """
        Set the attribute C{name} to the value C{attr}.
        """
        self.attributes[name] = attr

    def removeAttribute(self, name):
        """
        Remove the attribute C{name} if it is set; do nothing otherwise.
        """
        if name in self.attributes:
            del self.attributes[name]

    def hasAttribute(self, name):
        """
        Return whether the attribute C{name} is set.
        """
        return name in self.attributes

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Serialize this L{Element} to the given stream.

        @param stream: A file-like object to which this L{Element} will be
            written.

        @param indent: The indentation written before this element if it is
            a block element.

        @param addindent: The additional indentation passed down to
            children.

        @param newl: The line separator written around block elements.

        @param strip: Passed unchanged to the children's C{writexml}.

        @param nsprefixes: A C{dict} mapping namespace URIs as C{str} to
            prefixes as C{str}.  This defines the prefixes which are already
            in scope in the document at the point at which this L{Element}
            exists.  This is essentially an implementation detail for
            namespace support.  Applications should not try to use it.

        @param namespace: The namespace URI as a C{str} which is the default
            at the point in the document at which this L{Element} exists.
            This is essentially an implementation detail for namespace
            support.  Applications should not try to use it.
        """
        # Tags which may be written as <tag /> when they have no children.
        ALLOWSINGLETON = ('img', 'br', 'hr', 'base', 'meta', 'link', 'param',
                          'area', 'input', 'col', 'basefont', 'isindex',
                          'frame')
        # Tags which are put on a line of their own.
        BLOCKELEMENTS = ('html', 'head', 'body', 'noscript', 'ins', 'del',
                         'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'script',
                         'ul', 'ol', 'dl', 'pre', 'hr', 'blockquote',
                         'address', 'p', 'div', 'fieldset', 'table', 'tr',
                         'form', 'object', 'fieldset', 'applet', 'map')
        # Block tags whose children are each put on a line of their own.
        FORMATNICELY = ('tr', 'ul', 'ol', 'head')

        # this should never be necessary unless people start
        # changing .tagName on the fly(?)
        if not self.preserveCase:
            self.endTagName = self.tagName
        write = stream.write
        join = ''.join

        # Prefixes declared on this element which are not already in scope.
        if self.nsprefixes:
            newprefixes = self.nsprefixes.copy()
            for ns in nsprefixes.keys():
                if ns in newprefixes:
                    del newprefixes[ns]
        else:
            newprefixes = {}

        # The pieces of the start tag are collected here and written in one
        # go once all attributes are known.
        isBlock = self.tagName in BLOCKELEMENTS
        if isBlock:
            begin = [newl, indent, '<']
        else:
            begin = ['<']

        def writeattr(attrName, attrValue):
            begin.extend((' ', attrName, '="', escape(attrValue), '"'))

        # Make a local for tracking what end tag will be used.  If namespace
        # prefixes are involved, this will be changed to account for that
        # before it's actually used.
        endTagName = self.endTagName

        if namespace != self.namespace and self.namespace is not None:
            # If the current default namespace is not the namespace of this
            # tag (and this tag has a namespace at all) then we'll write out
            # something related to namespaces.
            if self.namespace in nsprefixes:
                # This tag's namespace already has a prefix bound to it.
                # Use that prefix, for the end tag as well.
                prefix = nsprefixes[self.namespace]
                begin.append(prefix + ':' + self.tagName)
                endTagName = prefix + ':' + self.endTagName
            else:
                # This tag's namespace has no prefix bound to it.  Change
                # the default namespace to this tag's namespace so we don't
                # need prefixes.  Alternatively, we could add a new prefix
                # binding.  I'm not sure why the code was written one way
                # rather than the other. -exarkun
                begin.append(self.tagName)
                writeattr("xmlns", self.namespace)
                # The default namespace just changed.  Make sure any
                # children know about this.
                namespace = self.namespace
        else:
            # This tag has no namespace or its namespace is already the
            # default namespace.  Nothing extra to do here.
            begin.append(self.tagName)

        for attr, val in self.attributes.items():
            assert val is not None
            if isinstance(attr, tuple):
                # A namespaced attribute: find or invent a prefix for it.
                ns, key = attr
                if ns in nsprefixes:
                    prefix = nsprefixes[ns]
                else:
                    prefix = genprefix()
                    newprefixes[ns] = prefix
                writeattr(prefix + ':' + key, val)
            else:
                writeattr(attr, val)

        if newprefixes:
            # Declare the new prefixes here and hand the combined mapping
            # down to the children.
            for ns, prefix in newprefixes.items():
                if prefix:
                    writeattr('xmlns:' + prefix, ns)
            newprefixes.update(nsprefixes)
            downprefixes = newprefixes
        else:
            downprefixes = nsprefixes

        write(join(begin))
        if self.childNodes:
            write(">")
            newindent = indent + addindent
            separateChildren = isBlock and self.tagName in FORMATNICELY
            for child in self.childNodes:
                if separateChildren:
                    write(join((newl, newindent)))
                child.writexml(stream, newindent, addindent, newl, strip,
                               downprefixes, namespace)
            if isBlock:
                write(join((newl, indent)))
            write(join(('</', endTagName, '>')))
        elif self.tagName.lower() not in ALLOWSINGLETON:
            write(join(('></', endTagName, '>')))
        else:
            write(" />")

    def __repr__(self):
        rep = "Element(%s" % repr(self.nodeName)
        if self.attributes:
            rep += ", attributes=%r" % (self.attributes,)
        if self._filename:
            rep += ", filename=%r" % (self._filename,)
        if self._markpos:
            rep += ", markpos=%r" % (self._markpos,)
        return rep + ')'

    def __str__(self):
        rep = "<" + self.nodeName
        if self._filename or self._markpos:
            rep += " ("
        if self._filename:
            rep += repr(self._filename)
        if self._markpos:
            rep += " line %s column %s" % self._markpos
        if self._filename or self._markpos:
            rep += ")"
        for item in self.attributes.items():
            rep += " %s=%r" % item
        if self.hasChildNodes():
            rep += " >...</%s>" % self.nodeName
        else:
            rep += " />"
        return rep
|
|
|
|
def _unescapeDict(d):
    """
    Return a new C{dict} with the keys of C{d} and each value passed through
    L{unescape}.  C{d} itself is not modified.
    """
    return dict([(key, unescape(value)) for key, value in d.items()])
|
|
|
|
def _reverseDict(d):
    """
    Return a new C{dict} mapping each value of C{d} to its key.  If several
    keys share a value, the one iterated last wins.
    """
    return dict([(value, key) for key, value in d.items()])
|
|
|
|
class MicroDOMParser(XMLParser):
    """
    An L{XMLParser} which builds a microdom tree out of the parse events it
    receives.

    @ivar elementstack: The elements which are currently open, outermost
        first.

    @ivar nsstack: A stack of C{(prefix-to-namespace dict, element,
        namespace-to-prefix dict)} tuples, one for each open element which
        declared namespaces, on top of an initial entry.

    @ivar documents: The finished top-level nodes.

    @ivar beExtremelyLenient: If true, mismatched or missing tags are
        tolerated instead of raising L{MismatchedTags}.
    """

    # <dash> glyph: a quick scan thru the DTD says BODY, AREA, LINK, IMG, HR,
    # P, DT, DD, LI, INPUT, OPTION, THEAD, TFOOT, TBODY, COLGROUP, COL, TR, TH,
    # TD, HEAD, BASE, META, HTML all have optional closing tags

    # Tags which are closed as soon as they are opened (lenient mode only).
    soonClosers = 'area link br img hr input base meta'.split()
    # Maps the name of an open tag to the tags which implicitly close it
    # when they start inside it (lenient mode only).
    laterClosers = {'p': ['p', 'dt'],
                    'dt': ['dt','dd'],
                    'dd': ['dt', 'dd'],
                    'li': ['li'],
                    'tbody': ['thead', 'tfoot', 'tbody'],
                    'thead': ['thead', 'tfoot', 'tbody'],
                    'tfoot': ['thead', 'tfoot', 'tbody'],
                    'colgroup': ['colgroup'],
                    'col': ['col'],
                    'tr': ['tr'],
                    'td': ['td'],
                    'th': ['th'],
                    'head': ['body'],
                    'title': ['head', 'body'], # this looks wrong...
                    'option': ['option'],
                    }

    def __init__(self, beExtremelyLenient=0, caseInsensitive=1, preserveCase=0,
                 soonClosers=soonClosers, laterClosers=laterClosers):
        self.elementstack = []
        d = {'xmlns': 'xmlns', '': None}
        dr = _reverseDict(d)
        self.nsstack = [(d,None,dr)]
        self.documents = []
        self._mddoctype = None
        self.beExtremelyLenient = beExtremelyLenient
        self.caseInsensitive = caseInsensitive
        self.preserveCase = preserveCase or not caseInsensitive
        self.soonClosers = soonClosers
        self.laterClosers = laterClosers
        # self.indentlevel = 0

    def shouldPreserveSpace(self):
        """
        Return C{1} if any open element is a C{pre} tag or has an
        C{xml:space} attribute of C{'preserve'}, otherwise C{0}.
        """
        for el in self.elementstack:
            if el.tagName == 'pre' or el.getAttribute("xml:space", '') == 'preserve':
                return 1
        return 0

    def _getparent(self):
        """
        Return the innermost open element, or C{None} if none is open.
        """
        if self.elementstack:
            return self.elementstack[-1]
        else:
            return None

    # Matches a leading "//" or "/*" together with surrounding whitespace.
    COMMENT = re.compile(r"\s*/[/*]\s*")

    def _fixScriptElement(self, el):
        """
        In lenient mode, if the only child of C{el} is text which consists
        of a single comment or CDATA section (optionally preceded by a
        script comment marker), replace that text with the parsed node.
        """
        # this deals with case where there is comment or CDATA inside
        # <script> tag and we want to do the right thing with it
        if not self.beExtremelyLenient or not len(el.childNodes) == 1:
            return
        c = el.firstChild()
        if isinstance(c, Text):
            # deal with nasty people who do stuff like:
            #   <script> // <!--
            #      x = 1;
            #   // --></script>
            # tidy does this, for example.
            prefix = ""
            oldvalue = c.value
            match = self.COMMENT.match(oldvalue)
            if match:
                prefix = match.group()
                oldvalue = oldvalue[len(prefix):]

            # now see if contents are actual node and comment or CDATA
            try:
                e = parseString("<a>%s</a>" % oldvalue).childNodes[0]
            except (ParseError, MismatchedTags):
                return
            if len(e.childNodes) != 1:
                return
            e = e.firstChild()
            if isinstance(e, (CDATASection, Comment)):
                el.childNodes = []
                if prefix:
                    el.childNodes.append(Text(prefix))
                el.childNodes.append(e)

    def gotDoctype(self, doctype):
        """
        Remember the doctype so it can be attached to the document later.
        """
        self._mddoctype = doctype

    def gotTagStart(self, name, attributes):
        """
        Handle a start tag: create an L{Element}, resolve the namespaces
        declared in C{attributes}, and push the element on the stack.
        """
        # print ' '*self.indentlevel, 'start tag',name
        # self.indentlevel += 1
        parent = self._getparent()
        if (self.beExtremelyLenient and isinstance(parent, Element)):
            parentName = parent.tagName
            myName = name
            if self.caseInsensitive:
                parentName = parentName.lower()
                myName = myName.lower()
            # This tag implicitly closes its would-be parent.
            if myName in self.laterClosers.get(parentName, []):
                self.gotTagEnd(parent.tagName)
                parent = self._getparent()
        attributes = _unescapeDict(attributes)
        namespaces = self.nsstack[-1][0]
        newspaces = {}
        # Pull the namespace declarations out of the attributes.  Iterate
        # over a snapshot because entries are deleted along the way.
        for k, v in list(attributes.items()):
            if k.startswith('xmlns'):
                spacenames = k.split(':',1)
                if len(spacenames) == 2:
                    newspaces[spacenames[1]] = v
                else:
                    newspaces[''] = v
                del attributes[k]
        if newspaces:
            namespaces = namespaces.copy()
            namespaces.update(newspaces)
        # Re-key prefixed attributes as (namespace, name).  Again a snapshot,
        # since the dict is modified inside the loop.
        for k, v in list(attributes.items()):
            ksplit = k.split(':', 1)
            if len(ksplit) == 2:
                pfx, tv = ksplit
                if pfx != 'xml' and pfx in namespaces:
                    attributes[namespaces[pfx], tv] = v
                    del attributes[k]
        el = Element(name, attributes, parent,
                     self.filename, self.saveMark(),
                     caseInsensitive=self.caseInsensitive,
                     preserveCase=self.preserveCase,
                     namespace=namespaces.get(''))
        revspaces = _reverseDict(newspaces)
        el.addPrefixes(revspaces)

        if newspaces:
            rscopy = self.nsstack[-1][2].copy()
            rscopy.update(revspaces)
            self.nsstack.append((namespaces, el, rscopy))
        self.elementstack.append(el)
        if parent:
            parent.appendChild(el)
        if (self.beExtremelyLenient and el.tagName in self.soonClosers):
            self.gotTagEnd(name)

    def _gotStandalone(self, factory, data):
        """
        Create a node with C{factory(data, parent)} and append it to the
        innermost open element.  If no element is open, the node is kept as
        a top-level node in lenient mode and dropped otherwise.
        """
        parent = self._getparent()
        te = factory(data, parent)
        if parent:
            parent.appendChild(te)
        elif self.beExtremelyLenient:
            self.documents.append(te)

    def gotText(self, data):
        """
        Add a L{Text} node for C{data}, unless it is only whitespace and
        space is not being preserved.
        """
        if data.strip() or self.shouldPreserveSpace():
            self._gotStandalone(Text, data)

    def gotComment(self, data):
        """
        Add a L{Comment} node for C{data}.
        """
        self._gotStandalone(Comment, data)

    def gotEntityReference(self, entityRef):
        """
        Add an L{EntityReference} node for C{entityRef}.
        """
        self._gotStandalone(EntityReference, entityRef)

    def gotCData(self, cdata):
        """
        Add a L{CDATASection} node for C{cdata}.
        """
        self._gotStandalone(CDATASection, cdata)

    def gotTagEnd(self, name):
        """
        Handle an end tag: close the innermost open element, or in lenient
        mode try to recover from a mismatch.

        @raise MismatchedTags: If not in lenient mode and C{name} does not
            match the innermost open element, or no element is open.
        """
        # print ' '*self.indentlevel, 'end tag',name
        # self.indentlevel -= 1
        if not self.elementstack:
            if self.beExtremelyLenient:
                return
            raise MismatchedTags(*((self.filename, "NOTHING", name)
                                   +self.saveMark()+(0,0)))
        el = self.elementstack.pop()
        pfxdix = self.nsstack[-1][2]
        if self.nsstack[-1][1] is el:
            nstuple = self.nsstack.pop()
        else:
            nstuple = None
        if self.caseInsensitive:
            tn = el.tagName.lower()
            cname = name.lower()
        else:
            tn = el.tagName
            cname = name

        nsplit = name.split(':',1)
        if len(nsplit) == 2:
            pfx, newname = nsplit
            ns = pfxdix.get(pfx,None)
            if ns is not None:
                if el.namespace != ns:
                    if not self.beExtremelyLenient:
                        raise MismatchedTags(*((self.filename, el.tagName, name)
                                               +self.saveMark()+el._markpos))
        if not (tn == cname):
            if self.beExtremelyLenient:
                if self.elementstack:
                    lastEl = self.elementstack[0]
                    # Look outwards for an open element this tag can close.
                    for idx in range(len(self.elementstack)):
                        if self.elementstack[-(idx+1)].tagName == cname:
                            self.elementstack[-(idx+1)].endTag(name)
                            break
                    else:
                        # this was a garbage close tag; wait for a real one
                        self.elementstack.append(el)
                        if nstuple is not None:
                            self.nsstack.append(nstuple)
                        return
                    # Close the matching element and everything inside it.
                    del self.elementstack[-(idx+1):]
                    if not self.elementstack:
                        self.documents.append(lastEl)
                        return
            else:
                raise MismatchedTags(*((self.filename, el.tagName, name)
                                       +self.saveMark()+el._markpos))
        el.endTag(name)
        if not self.elementstack:
            self.documents.append(el)
        if self.beExtremelyLenient and el.tagName == "script":
            self._fixScriptElement(el)

    def connectionLost(self, reason):
        """
        Handle the end of the input.  If elements are still open, the
        outermost one becomes a top-level node in lenient mode; otherwise
        L{MismatchedTags} is raised for the innermost one.
        """
        XMLParser.connectionLost(self, reason) # This can cause more events!
        if self.elementstack:
            if self.beExtremelyLenient:
                self.documents.append(self.elementstack[0])
            else:
                # Report the tag name, like every other MismatchedTags
                # raised by this class, not the Element object itself.
                unclosed = self.elementstack[-1]
                raise MismatchedTags(*((self.filename, unclosed.tagName,
                                        "END_OF_FILE")
                                       +self.saveMark()
                                       +unclosed._markpos))
|
|
|
|
|
|
def parse(readable, *args, **kwargs):
    """
    Parse HTML or XML readable.

    @param readable: A file-like object with a C{read} method, or otherwise
        a filename which is opened in binary mode (and closed again once it
        has been read).

    @param args: Positional arguments for L{MicroDOMParser}.

    @param kwargs: Keyword arguments for L{MicroDOMParser}.

    @raise ParseError: If the input contains no top-level nodes.

    @return: A L{Document} holding the parsed tree.  In lenient mode, a
        single non-element node or several top-level nodes are wrapped in a
        new C{html} element.
    """
    # Only a file we opened ourselves is ours to close.
    opened = None
    if not hasattr(readable, "read"):
        readable = opened = open(readable, "rb")
    try:
        mdp = MicroDOMParser(*args, **kwargs)
        mdp.filename = getattr(readable, "name", "<xmlfile />")
        mdp.makeConnection(None)
        if hasattr(readable, "getvalue"):
            # In-memory streams can hand over everything at once.
            mdp.dataReceived(readable.getvalue())
        else:
            r = readable.read(1024)
            while r:
                mdp.dataReceived(r)
                r = readable.read(1024)
        mdp.connectionLost(None)
    finally:
        if opened is not None:
            opened.close()

    if not mdp.documents:
        raise ParseError(mdp.filename, 0, 0, "No top-level Nodes in document")

    if mdp.beExtremelyLenient:
        if len(mdp.documents) == 1:
            d = mdp.documents[0]
            if not isinstance(d, Element):
                el = Element("html")
                el.appendChild(d)
                d = el
        else:
            d = Element("html")
            for child in mdp.documents:
                d.appendChild(child)
    else:
        d = mdp.documents[0]
    doc = Document(d)
    doc.doctype = mdp._mddoctype
    return doc
|
|
|
|
def parseString(st, *args, **kw):
    """
    Parse the HTML or XML in the string C{st} with L{parse} and return the
    resulting document.  Unicode strings are encoded as UTF-16 first; all
    other arguments are passed on to L{parse}.
    """
    if isinstance(st, UnicodeType):
        # this isn't particularly ideal, but it does work.
        st = st.encode('UTF-16')
    return parse(StringIO(st), *args, **kw)
|
|
|
|
|
|
def parseXML(readable):
    """
    Parse an XML readable object: calls L{parse} with case sensitivity
    turned on and case preserved.
    """
    return parse(readable, caseInsensitive=0, preserveCase=1)
|
|
|
|
|
|
def parseXMLString(st):
    """
    Parse a string containing XML: calls L{parseString} with case
    sensitivity turned on and case preserved.
    """
    return parseString(st, caseInsensitive=0, preserveCase=1)
|
|
|
|
|
|
# Utility
|
|
|
|
class lmx:
    """
    Easy creation of XML.

    Wraps a node.  Calling any public attribute, e.g. C{x.div(id='a')},
    adds a child element of that name (see L{add}); item access reads and
    writes the attributes of the wrapped node.

    @ivar node: The wrapped node.
    """

    def __init__(self, node='div'):
        # A string is taken to be the tag name of a new element.
        if isinstance(node, StringTypes):
            node = Element(node)
        self.node = node

    def __getattr__(self, name):
        if name[0] == '_':
            raise AttributeError("no private attrs")

        def adder(**kw):
            return self.add(name, **kw)
        return adder

    def __setitem__(self, key, val):
        self.node.setAttribute(key, val)

    def __getitem__(self, key):
        return self.node.getAttribute(key)

    def text(self, txt, raw=0):
        """
        Append a L{Text} node holding C{txt} to the wrapped node.

        @return: This L{lmx} instance.
        """
        self.node.appendChild(Text(txt, raw=raw))
        return self

    def add(self, tagName, **kw):
        """
        Append a new child element named C{tagName} to the wrapped node and
        set its attributes from C{kw}.  One leading underscore is removed
        from each keyword name.

        @return: A new L{lmx} wrapping the child element.
        """
        child = Element(tagName, caseInsensitive=0, preserveCase=0)
        self.node.appendChild(child)
        wrapper = lmx(child)
        for key, value in kw.items():
            if key[0] == '_':
                key = key[1:]
            wrapper[key] = value
        return wrapper
|