637 lines
20 KiB
Python
637 lines
20 KiB
Python
|
# -*- test-case-name: twisted.web.test.test_xml -*-
|
||
|
#
|
||
|
# Copyright (c) Twisted Matrix Laboratories.
|
||
|
# See LICENSE for details.
|
||
|
|
||
|
|
||
|
"""
|
||
|
*S*mall, *U*ncomplicated *X*ML.
|
||
|
|
||
|
This is a very simple implementation of XML/HTML as a network
|
||
|
protocol. It is not at all clever. Its main features are that it
|
||
|
does not:
|
||
|
|
||
|
- support namespaces
|
||
|
- mung mnemonic entity references
|
||
|
- validate
|
||
|
- perform *any* external actions (such as fetching URLs or writing files)
|
||
|
under *any* circumstances
|
||
|
- has lots and lots of horrible hacks for supporting broken HTML (as an
|
||
|
option, they're not on by default).
|
||
|
"""
|
||
|
|
||
|
from twisted.internet.protocol import Protocol
|
||
|
from twisted.python.reflect import prefixedMethodNames
|
||
|
|
||
|
|
||
|
|
||
|
# Positions of the begin/do/end handlers inside each three-tuple stored in
# the parser's state table.
BEGIN_HANDLER, DO_HANDLER, END_HANDLER = range(3)

# Punctuation accepted (in addition to alphanumerics) in tag and attribute
# names; the lenient set adds further punctuation to the strict one.
identChars = '.-_:'
lenientIdentChars = ''.join([identChars, ';+#/%~'])
|
||
|
|
||
|
def nop(*args, **kw):
    """
    Do nothing.

    Any positional or keyword arguments are accepted and ignored; the result
    is always C{None}.
    """
    return None
|
||
|
|
||
|
|
||
|
def unionlist(*args):
    """
    Return the distinct items found across all of the given iterables.

    Items must be hashable, because a dictionary is used to drop the
    duplicates.  The result is that dictionary's C{keys()}.
    """
    seen = {}
    for sequence in args:
        for item in sequence:
            seen[item] = 1
    return seen.keys()
|
||
|
|
||
|
|
||
|
def zipfndict(*args, **kw):
    """
    Zip several dictionaries together, key by key.

    Every key that appears in at least one of the positional dictionaries is
    mapped to a tuple with one entry per dictionary, in argument order.  A
    dictionary that lacks the key contributes the C{default} keyword argument
    instead (L{nop} when no default is given).
    """
    fallback = kw.get('default', nop)
    allKeys = unionlist(*[table.keys() for table in args])
    return dict(
        (key, tuple([table.get(key, fallback) for table in args]))
        for key in allKeys)
|
||
|
|
||
|
|
||
|
def prefixedMethodClassDict(clazz, prefix):
    """
    Map each name reported by C{prefixedMethodNames(clazz, prefix)} to the
    attribute C{prefix + name} looked up on the class C{clazz} itself.
    """
    found = {}
    for name in prefixedMethodNames(clazz, prefix):
        found[name] = getattr(clazz, prefix + name)
    return found
|
||
|
|
||
|
|
||
|
def prefixedMethodObjDict(obj, prefix):
    """
    Map each name reported by C{prefixedMethodNames} for the class of C{obj}
    to the attribute C{prefix + name} looked up on C{obj}, so the values are
    bound to that particular instance.
    """
    found = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        found[name] = getattr(obj, prefix + name)
    return found
|
||
|
|
||
|
|
||
|
class ParseError(Exception):
    """
    Raised when the parser meets input it cannot accept.

    The string form is C{"filename:line:col: message"}.

    @ivar filename: name of the document being parsed.
    @ivar line: line number at which the problem was detected.
    @ivar col: column number at which the problem was detected.
    @ivar message: human-readable description of the problem.
    """

    def __init__(self, filename, line, col, message):
        # Pass the arguments on to Exception so that ``args`` is populated.
        # Without this, repr() shows an empty argument list and pickling or
        # copying the exception fails, because it is rebuilt from ``args``.
        Exception.__init__(self, filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self):
        return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
                                 self.message)
|
||
|
|
||
|
class XMLParser(Protocol):
    """
    A character-at-a-time XML/HTML tokenizer driven by a state machine.

    Each state C{X} is implemented by up to three methods: C{begin_X(byte)},
    called when the state is entered; C{do_X(byte)}, called for every
    subsequent character and returning the name of the next state (or C{None}
    to stay put); and C{end_X()}, called when the state is left.  Missing
    handlers default to L{nop}.  Parse results are reported through the
    C{got*} methods, which subclasses are expected to override.

    @ivar state: name of the current state, or C{None} before any data has
        been received.
    @ivar encodings: encodings applied to incoming data by L{_decode}; reset
        to an empty list by L{connectionMade}.
    @ivar filename: document name used in L{ParseError} messages.
    @ivar beExtremelyLenient: when true, enables the hacks for broken HTML.
    """

    state = None
    encodings = None
    filename = "<xml />"
    beExtremelyLenient = 0
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    # Per-instance cache filled in by _buildStateTable.
    _stateTableCache = None

    # Defaults for the attribute currently being parsed.
    attrname = ''
    attrval = ''

    def connectionMade(self):
        """
        Reset the position counters and the list of encodings.
        """
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        """
        Get the line number and column of the last character parsed.
        """
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """
        Raise a L{ParseError} carrying the current filename and position.
        """
        line, col = self.saveMark()
        raise ParseError(self.filename, line, col, message)

    def _buildStateTable(self):
        """
        Return a dictionary mapping each state name to its
        C{(begin, do, end)} tuple of handlers.

        The table holds methods bound to this instance, so it is cached on
        the instance.  The earlier class-level cache was looked up as
        C{'__stateTable'} but stored under the name-mangled
        C{_XMLParser__stateTable}, so it never hit, and it left the class
        holding a reference to the most recent parser.
        """
        stateTable = self._stateTableCache
        if stateTable is None:
            stateTable = self._stateTableCache = zipfndict(
                *[prefixedMethodObjDict(self, prefix)
                  for prefix in ('begin_', 'do_', 'end_')])
        return stateTable

    def _decode(self, data):
        """
        Decode C{data} with every encoding in C{self.encodings}, re-attaching
        the byte-order mark saved in C{self._prepend} first.
        """
        if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
            assert not len(data) & 1, 'UTF-16 must come in pairs for now'
        if self._prepend:
            data = self._prepend + data
        # NOTE: relies on the Python 2 ``unicode`` builtin.
        for encoding in self.encodings:
            data = unicode(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Choose the state that follows an opening tag in lenient mode.

        @return: C{'waitforendscript'} after a C{<script>} tag without a
            C{src} attribute, otherwise C{'bodydata'}.
        """
        if self.endtag:
            return 'bodydata'

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == 'script' and 'src' not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return 'waitforendscript'
        return 'bodydata'

    def dataReceived(self, data):
        """
        Feed a chunk of the document through the state machine.

        On the first chunk a UTF-16 byte-order mark, if present, is stripped
        and remembered.  Position and state are written back to the instance
        afterwards; if a handler raises, the position is still saved but
        C{self.state} keeps the value it had on entry.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith('\xff\xfe'):
                self._prepend = '\xff\xfe'
                self.encodings.append('UTF-16')
                data = data[2:]
            elif data.startswith('\xfe\xff'):
                self._prepend = '\xfe\xff'
                self.encodings.append('UTF-16')
                data = data[2:]
            self.state = 'begin'
        if self.encodings:
            data = self._decode(data)
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function so that handlers
        # reporting errors see the up-to-date local counters
        _saveMark = self.saveMark

        def saveMark():
            return (lineno, colno)
        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == '\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.

        If no data was ever received there is no state to end, so nothing is
        done (previously this raised C{KeyError}).
        """
        if self.state is None:
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        """
        Skip leading whitespace and expect the document to open with C{<}.
        """
        if byte.isspace():
            return
        if byte != '<':
            if self.beExtremelyLenient:
                # Keep the stray character so begin_bodydata can pick it up.
                self._leadingBodyData = byte
                return 'bodydata'
            self._parseError("First char of document [%r] wasn't <" % (byte,))
        return 'tagstart'

    def begin_comment(self, byte):
        self.commentbuf = ''

    def do_comment(self, byte):
        """
        Accumulate comment text until the closing C{-->} is seen.
        """
        self.commentbuf += byte
        if self.commentbuf.endswith('-->'):
            self.gotComment(self.commentbuf[:-3])
            return 'bodydata'

    def begin_tagstart(self, byte):
        self.tagName = ''           # name of the tag
        self.tagAttributes = {}     # attributes of the tag
        self.termtag = 0            # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """
        Collect the tag name and decide what kind of construct follows C{<}.
        """
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == '!--':
                return 'comment'
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return 'waitforgt'
                return 'attrs'
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == '>':
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            else:
                self.gotTagStart(self.tagName, {})
                if not self.beExtremelyLenient:
                    return 'bodydata'
                return self.maybeBodyData()
        elif byte == '/':
            if self.tagName:
                return 'afterslash'
            else:
                self.endtag = 1
        elif byte in '!?':
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == '[':
            if self.tagName == '!':
                return 'expectcdata'
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                # Treat the '<' as literal text rather than a tag opener.
                self.bodydata = '<'
                return 'unentity'
            self._parseError('Invalid tag character: %r' % byte)

    def begin_unentity(self, byte):
        self.bodydata += byte

    def do_unentity(self, byte):
        self.bodydata += byte
        return 'bodydata'

    def end_unentity(self):
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """
        Match the C{[CDATA[} header one character at a time.
        """
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = '[CDATA['
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return 'waitforgt'
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ''
            return 'cdata'
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """
        Accumulate CDATA content until the closing C{]]>} is seen.
        """
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return 'bodydata'

    def end_cdata(self):
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ''

    def do_attrs(self, byte):
        """
        Between attributes: work out what the next character starts.
        """
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == '!DOCTYPE':
                return 'doctype'
            if self.tagName[0] in '!?':
                return 'waitforgt'
            return 'attrname'
        elif byte.isspace():
            return
        elif byte == '>':
            self.gotTagStart(self.tagName, self.tagAttributes)
            if not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()
        elif byte == '/':
            return 'afterslash'
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        self.doctype = byte

    def do_doctype(self, byte):
        if byte == '>':
            return 'bodydata'
        self.doctype += byte

    def end_doctype(self):
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """
        Ignore everything up to the next C{>}.
        """
        if byte == '>':
            if self.endtag or not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """
        Collect an attribute name.
        """
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return 'beforeeq'
        elif self.beExtremelyLenient:
            if byte in '"\'':
                return 'attrval'
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            # NOTE(review): '/' is part of lenientIdentChars, so the check
            # above already consumes it and this branch is never reached.
            if byte == '/':
                self._attrname_termtag = 1
                return
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))

    def do_beforeattrval(self, byte):
        """
        After C{=}: expect the opening quote of the attribute value.
        """
        if byte in '"\'':
            return 'attrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return 'messyattr'
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == '\\':
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

    def begin_beforeeq(self, byte):
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """
        After whitespace following an attribute name: expect C{=}.
        """
        if byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                # A value-less attribute followed by another attribute.
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                return 'attrname'
            elif byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            elif byte == '/':
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        self.quotetype = byte
        self.attrval = ''

    def do_attrval(self, byte):
        if byte == self.quotetype:
            return 'attrs'
        self.attrval += byte

    def end_attrval(self):
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ''

    def begin_messyattr(self, byte):
        self.attrval = byte

    def do_messyattr(self, byte):
        """
        Collect an unquoted attribute value (lenient mode only).
        """
        if byte.isspace():
            return 'attrs'
        elif byte == '>':
            endTag = 0
            if self.attrval.endswith('/'):
                # A trailing slash closes the tag; it is not part of the value.
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        """
        Finish a self-terminating tag such as C{<foo/>}.
        """
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != '>':
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return 'bodydata'

    def begin_bodydata(self, byte):
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # Drop the instance value so the class-level None shows through.
            del self._leadingBodyData
        else:
            self.bodydata = ''

    def do_bodydata(self, byte):
        if byte == '<':
            return 'tagstart'
        if byte == '&':
            return 'entityref'
        self.bodydata += byte

    def end_bodydata(self):
        self.gotText(self.bodydata)
        self.bodydata = ''

    def do_waitforendscript(self, byte):
        if byte == '<':
            return 'waitscriptendtag'
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        self.temptagdata = ''
        self.tagName = ''
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        """
        Decide whether a C{<} inside a script body starts C{</script>}.
        """
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == '/':
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not 'script'.startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return 'waitforendscript'
            elif self.tagName == 'script':
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return 'waitforgt'
        # 3
        elif byte.isspace():
            return 'waitscriptendtag'
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'

    def begin_entityref(self, byte):
        self.erefbuf = ''
        self.erefextra = ''     # extra bit for lenient mode

    def do_entityref(self, byte):
        """
        Collect an entity reference name up to the terminating C{;}.
        """
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return 'spacebodydata'
            self._parseError("Bad entity reference")
        elif byte != ';':
            self.erefbuf += byte
        else:
            return 'bodydata'

    def end_entityref(self):
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        self.bodydata = self.erefextra
        self.erefextra = None
    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        """
        Encountered an opening tag.

        Default behaviour is to print.
        """
        print('begin %s %s' % (name, attributes))

    def gotText(self, data):
        """
        Encountered text.

        Default behaviour is to print.
        """
        print('text: %s' % (repr(data),))

    def gotEntityReference(self, entityRef):
        """
        Encountered mnemonic entity reference.

        Default behaviour is to print.
        """
        print('entityRef: &%s;' % entityRef)

    def gotComment(self, comment):
        """
        Encountered comment.

        Default behaviour is to ignore.
        """
        pass

    def gotCData(self, cdata):
        """
        Encountered CDATA.

        Default behaviour is to call the gotText method.
        """
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """
        Encountered DOCTYPE.

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print('!DOCTYPE %s' % (repr(doctype),))

    def gotTagEnd(self, name):
        """
        Encountered closing tag.

        Default behaviour is to print.
        """
        print('end %s' % (name,))
|