openmedialibrary_platform/Linux/lib/python2.7/site-packages/twisted/web/sux.py

# -*- test-case-name: twisted.web.test.test_xml -*-
#
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.


"""
*S*mall, *U*ncomplicated *X*ML.

This is a very simple implementation of XML/HTML as a network
protocol.  It is not at all clever.  Its main features are that it
does not:

  - support namespaces
  - mung mnemonic entity references
  - validate
  - perform *any* external actions (such as fetching URLs or writing files)
    under *any* circumstances

It does, however, have lots and lots of horrible hacks for supporting broken
HTML (as an option; they're not on by default).
"""

from twisted.internet.protocol import Protocol
from twisted.python.reflect import prefixedMethodNames


# Elements of the three-tuples in the state table.
BEGIN_HANDLER = 0
DO_HANDLER = 1
END_HANDLER = 2

identChars = '.-_:'
lenientIdentChars = identChars + ';+#/%~'

def nop(*args, **kw):
    """
    Accept any positional and keyword arguments and do nothing.

    @return: always C{None}.
    """
    return None


def unionlist(*args):
    """
    Return the union of several iterables as a list without duplicates.

    The items appear in the order in which they are first seen, so the result
    is deterministic, and it is always a real C{list} (the previous
    implementation returned C{dict.keys()}, whose order was arbitrary and
    which is a view object rather than a list on Python 3).

    @param args: any number of iterables of hashable items.
    @return: a new C{list} holding each distinct item once.
    """
    seen = set()
    union = []
    for sequence in args:
        for item in sequence:
            if item not in seen:
                seen.add(item)
                union.append(item)
    return union


def zipfndict(*args, **kw):
    """
    Combine several dictionaries into one dictionary of tuples.

    Every key found in any of the input dictionaries maps to a tuple holding
    that key's value from each input, in argument order.  Inputs that lack the
    key contribute the default instead.

    @param args: the dictionaries to combine.
    @param kw: may contain C{default}, the filler used for missing keys;
        it defaults to L{nop}.
    @return: a C{dict} mapping each key to a tuple with one entry per input.
    """
    filler = kw.get('default', nop)
    allKeys = unionlist(*[mapping.keys() for mapping in args])
    combined = {}
    for name in allKeys:
        combined[name] = tuple([mapping.get(name, filler) for mapping in args])
    return combined


def prefixedMethodClassDict(clazz, prefix):
    """
    Map each name reported by C{prefixedMethodNames(clazz, prefix)} to the
    attribute C{prefix + name} looked up on C{clazz}.

    @param clazz: the class to inspect.
    @param prefix: the method-name prefix to search for.
    @return: a C{dict} of name -> attribute fetched from the class.
    """
    methods = {}
    for name in prefixedMethodNames(clazz, prefix):
        methods[name] = getattr(clazz, prefix + name)
    return methods


def prefixedMethodObjDict(obj, prefix):
    """
    Map each name reported by C{prefixedMethodNames} for C{obj}'s class to the
    attribute C{prefix + name} looked up on C{obj} itself.

    @param obj: the instance whose attributes are collected.
    @param prefix: the method-name prefix to search for.
    @return: a C{dict} of name -> attribute fetched from the instance.
    """
    methods = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        methods[name] = getattr(obj, prefix + name)
    return methods


class ParseError(Exception):
    """
    Error carrying the position at which parsing failed.

    @ivar filename: name of the document being parsed.
    @ivar line: line number of the offending character.
    @ivar col: column number of the offending character.
    @ivar message: human-readable description of the problem.
    """

    def __init__(self, filename, line, col, message):
        # Pass the values on to Exception as well.  Without this, on Python 2
        # the exception's ``args`` tuple stays empty, so repr() shows no
        # details and a pickled instance cannot be re-created.
        Exception.__init__(self, filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self):
        return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
                                 self.message)

class XMLParser(Protocol):

    # Name of the current parser state; stays None until the first call to
    # dataReceived switches it to 'begin'.
    state = None
    # Codec names applied to incoming data by _decode; replaced with a fresh
    # list in connectionMade.
    encodings = None
    # Document name reported in ParseError messages.
    filename = "<xml />"
    # When true, the state handlers tolerate many kinds of malformed markup
    # instead of raising ParseError.
    beExtremelyLenient = 0
    # Byte-order mark found at the start of the stream; _decode puts it back in
    # front of every chunk before decoding.
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    def connectionMade(self):
        """
        Reset the position counters and the encoding list for a new document.
        """
        self.encodings = []
        self.lineno, self.colno = 1, 0

    def saveMark(self):
        """
        Get the line number and column of the last character parsed.

        @return: a C{(lineno, colno)} tuple.
        """
        # dataReceived temporarily shadows this method with a closure over its
        # own local counters and puts this one back when it is done.
        position = (self.lineno, self.colno)
        return position

    def _parseError(self, message):
        """
        Raise a L{ParseError} tagged with the filename and current position.

        @param message: description of what went wrong.
        @raise ParseError: always.
        """
        errorArgs = (self.filename,) + self.saveMark() + (message,)
        raise ParseError(*errorArgs)

    def _buildStateTable(self):
        """
        Return a dictionary of begin, do, end state function tuples.

        Each key is a state name and each value is a three-tuple indexed by
        C{BEGIN_HANDLER}, C{DO_HANDLER} and C{END_HANDLER}.  A state that lacks
        one of the three handlers gets L{zipfndict}'s default filler in that
        slot.

        @return: the state table for this parser instance.
        """
        # The table used to be "cached" on the class, but that cache never
        # worked: it was read with getattr(cls, '__stateTable') and written
        # with ``cls.__stateTable = ...``, which Python name-mangles to
        # '_XMLParser__stateTable', so the lookup always missed and the table
        # was rebuilt on every call.  The entries are bound methods of *this*
        # instance, so the cache belongs on the instance; a working
        # class-level cache would make every parser dispatch to the handlers
        # of whichever parser was created first.
        stateTable = getattr(self, '_stateTable', None)
        if stateTable is None:
            stateTable = self._stateTable = zipfndict(
                *[prefixedMethodObjDict(self, prefix)
                  for prefix in ('begin_', 'do_', 'end_')])
        return stateTable

    def _decode(self, data):
        """
        Decode a chunk of input using the encodings recorded for this stream.

        @param data: the raw chunk, without any byte-order mark.
        @return: the chunk after each entry of C{self.encodings} has been
            applied in order.
        """
        encodings = self.encodings
        if 'UTF-16' in encodings or 'UCS-2' in encodings:
            # a chunk must not end in the middle of a two-byte code unit
            assert not len(data) & 1, 'UTF-16 must come in pairs for now'
        bom = self._prepend
        if bom:
            # re-attach the byte-order mark that dataReceived stripped off
            data = bom + data
        for encoding in encodings:
            data = unicode(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Pick the state that follows a just-finished tag in lenient mode.

        @return: C{'waitforendscript'} after an opening C{<script>} tag that
            has no C{src} attribute, otherwise C{'bodydata'}.
        """
        # Everything between <script> and </script> is treated as plain text,
        # so that <script>if (foo < bar)</script> works.  <script src="foo">
        # keeps the regular lenient behaviour, because such tags may have no
        # matching </script>.
        if not self.endtag:
            if self.tagName == 'script' and 'src' not in self.tagAttributes:
                # The text buffer is initialised here rather than in a
                # begin_waitforendscript handler, because that state can be
                # entered several times and the buffer must only be reset the
                # first time.
                self.begin_bodydata(None)
                return 'waitforendscript'
        return 'bodydata'


    def dataReceived(self, data):
        """
        Feed a chunk of the document through the state machine.

        On the first call a UTF-16 byte-order mark at the start of C{data} is
        detected, stripped and remembered.  Every character is then handed to
        the current state's C{do_} handler; when that handler returns the name
        of a different state, the old state's C{end_} handler and the new
        state's C{begin_} handler are run before the next character.

        @param data: the next chunk of the document.
        @raise ParseError: when a state handler rejects the input.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # First chunk: look for a UTF-16 byte-order mark (little-endian,
            # then big-endian).  The mark is removed from the data here and
            # put back by _decode in front of every chunk.
            if data.startswith('\xff\xfe'):
                self._prepend = '\xff\xfe'
                self.encodings.append('UTF-16')
                data = data[2:]
            elif data.startswith('\xfe\xff'):
                self._prepend = '\xfe\xff'
                self.encodings.append('UTF-16')
                data = data[2:]
            self.state = 'begin'
        if self.encodings:
            data = self._decode(data)
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function, so that errors raised
        # while this chunk is processed report the local counters
        _saveMark = self.saveMark
        def saveMark():
            return (lineno, colno)
        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == '\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                # a handler returning None or its own state name means
                # "stay in the current state"
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    # the new state's begin handler sees the byte that caused
                    # the transition
                    beginFn(byte)
        finally:
            # restore the real saveMark and write the counters back, even when
            # a handler raised
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        # (this line is outside the finally block, so it only runs on success)
        self.state = curState


    def connectionLost(self, reason):
        """
        End the last state we were in.

        If no data was ever received there is no state to end, so nothing is
        done.  Previously that case raised C{KeyError}, because C{state} was
        still C{None} and was used as a key into the state table.

        @param reason: why the connection was lost; not used here.
        """
        if not self.state:
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()


    # state methods

    def do_begin(self, byte):
        """
        Handle a character seen before the first tag of the document.

        Leading whitespace is skipped.  A C{<} starts a tag.  Anything else is
        an error, unless the parser is lenient, in which case it becomes the
        first character of body text.

        @return: the next state name, or C{None} to stay in this state.
        """
        if byte.isspace():
            return None
        if byte == '<':
            return 'tagstart'
        if self.beExtremelyLenient:
            # keep the character so that begin_bodydata can start with it
            self._leadingBodyData = byte
            return 'bodydata'
        self._parseError("First char of document [%r] wasn't <" % (byte,))

    def begin_comment(self, byte):
        """
        Start a comment with an empty buffer.

        @param byte: the character that triggered the transition; unused.
        """
        self.commentbuf = ''

    def do_comment(self, byte):
        """
        Collect comment text until the closing C{-->} arrives, then report the
        comment (without the terminator) through L{gotComment}.

        @return: C{'bodydata'} once the comment is complete, else C{None}.
        """
        self.commentbuf += byte
        if not self.commentbuf.endswith('-->'):
            return None
        self.gotComment(self.commentbuf[:-3])
        return 'bodydata'

    def begin_tagstart(self, byte):
        """
        Reset the per-tag bookkeeping at the start of a new tag.

        @param byte: the character that triggered the transition; unused.
        """
        # name and attributes collected for the tag
        self.tagName = ''
        self.tagAttributes = {}
        # termtag: is the tag self-terminating; endtag: is it a closing tag
        self.termtag = self.endtag = 0

    def do_tagstart(self, byte):
        """
        Handle a character directly after C{<}, while the tag name is read.

        @return: the next state name, or C{None} to keep reading the name.
        @raise ParseError: for characters that cannot appear here, unless the
            parser is lenient.
        """
        # part of the tag name
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == '!--':
                return 'comment'
            return None

        # whitespace ends the name
        if byte.isspace():
            if not self.tagName:
                self._parseError("Whitespace before tag-name")
            if self.endtag:
                # properly strict thing to do here is probably to only
                # accept whitespace
                return 'waitforgt'
            return 'attrs'

        # end of a tag that has no attributes
        if byte == '>':
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            self.gotTagStart(self.tagName, {})
            if self.beExtremelyLenient:
                return self.maybeBodyData()
            return 'bodydata'

        # '/' before the name marks a closing tag, after it a self-closing one
        if byte == '/':
            if self.tagName:
                return 'afterslash'
            self.endtag = 1
            return None

        # '!' or '?' may only open the name (<!...> and <?...>)
        if byte in '!?':
            if not self.tagName:
                self.tagName += byte
                self.termtag = 1
            elif not self.beExtremelyLenient:
                self._parseError("Invalid character in tag-name")
            return None

        # '<![' may start a CDATA section
        if byte == '[':
            if self.tagName != '!':
                self._parseError("Invalid '[' in tag-name")
            return 'expectcdata'

        # anything else: in lenient mode the '<' was just text after all
        if self.beExtremelyLenient:
            self.bodydata = '<'
            return 'unentity'
        self._parseError('Invalid tag character: %r'% byte)

    def begin_unentity(self, byte):
        """
        Append the character that followed a stray C{<} to the text buffer.

        @param byte: the character that turned out not to start a tag.
        """
        self.bodydata += byte

    def do_unentity(self, byte):
        """
        Take one more character as text and go straight back to body data.

        @return: always C{'bodydata'}.
        """
        self.bodydata += byte
        return 'bodydata'

    def end_unentity(self):
        """
        Report the text gathered around the stray C{<} through L{gotText}.
        """
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        """
        Start matching a CDATA header, beginning with the triggering character.

        @param byte: the character that caused the transition into this state.
        """
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """
        Match the input character by character against C{[CDATA[}.

        @return: C{'cdata'} once the whole header has been seen, C{None} while
            it still matches, or C{'waitforgt'} when a lenient parser gives up
            on a mismatching header.
        @raise ParseError: for a mismatching header in strict mode.
        """
        header = '[CDATA['
        self.cdatabuf += byte
        seen = self.cdatabuf
        if seen == header:
            self.cdatabuf = ''
            return 'cdata'
        if len(seen) < len(header):
            if header.startswith(seen):
                return None
            if self.beExtremelyLenient:
                # MSWord9 generates HTML that includes bizarre
                # <![if !foo]> <![endif]> chunks; they are ignored as well as
                # possible.  This should really be a separate parse state.
                return 'waitforgt'
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """
        Collect CDATA content until the closing C{]]>} arrives.

        @return: C{'bodydata'} once the section is closed, else C{None}.
        """
        self.cdatabuf += byte
        if not self.cdatabuf.endswith("]]>"):
            return None
        # drop the terminator from the collected text
        self.cdatabuf = self.cdatabuf[:-3]
        return 'bodydata'

    def end_cdata(self):
        """
        Report the collected CDATA through L{gotCData} and clear the buffer.
        """
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ''

    def do_attrs(self, byte):
        """
        Handle a character between the tag name and the end of the tag, where
        attributes may start.

        @return: the next state name, or C{None} to stay in this state.
        @raise ParseError: for an unexpected character in strict mode.
        """
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == '!DOCTYPE':
                return 'doctype'
            if self.tagName[0] in '!?':
                return 'waitforgt'
            return 'attrname'
        if byte.isspace():
            return None
        if byte == '>':
            self.gotTagStart(self.tagName, self.tagAttributes)
            if self.beExtremelyLenient:
                return self.maybeBodyData()
            return 'bodydata'
        if byte == '/':
            return 'afterslash'
        if self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return None
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        """
        Start collecting the DOCTYPE text with the triggering character.

        @param byte: the first character after C{<!DOCTYPE} and whitespace.
        """
        self.doctype = byte

    def do_doctype(self, byte):
        """
        Collect DOCTYPE text up to the closing C{>}.

        @return: C{'bodydata'} at the closing C{>}, else C{None}.
        """
        if byte != '>':
            self.doctype += byte
            return None
        return 'bodydata'

    def end_doctype(self):
        """
        Report the collected DOCTYPE text through L{gotDoctype} and drop it.
        """
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """
        Skip everything up to the closing C{>} of the current tag.

        @return: C{None} until C{>} is seen; then C{'bodydata'}, or whatever
            L{maybeBodyData} picks for a lenient parser after an opening tag.
        """
        if byte != '>':
            return None
        if self.endtag or not self.beExtremelyLenient:
            return 'bodydata'
        return self.maybeBodyData()

    def begin_attrname(self, byte):
        """
        Start a new attribute name with its first character.

        @param byte: the first character of the attribute name.
        """
        self._attrname_termtag = 0
        self.attrname = byte

    def do_attrname(self, byte):
        """
        Handle a character while an attribute name is being read.

        In strict mode only name characters, C{=} and whitespace are accepted.
        In lenient mode a quote starts the value directly, a C{/} marks the tag
        as self-closing, the extra characters of C{lenientIdentChars} are
        accepted in the name, and a C{>} ends the tag with the attribute set to
        C{'True'}.

        @return: the next state name, or C{None} to stay in this state.
        @raise ParseError: for an invalid character in strict mode.
        """
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return 'beforeeq'
        elif self.beExtremelyLenient:
            if byte in '"\'':
                return 'attrval'
            # This test has to come before the lenientIdentChars test below:
            # '/' is one of the lenient identifier characters, so in the old
            # order this branch could never run.  '<foo bar/>' then produced
            # an attribute called 'bar/' and no end tag.
            if byte == '/':
                self._attrname_termtag = 1
                return
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            if byte == '>':
                # attribute without a value
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))

    def do_beforeattrval(self, byte):
        """
        Handle a character after C{=}, before the attribute value starts.

        @return: the next state name, or C{None} to stay in this state.
        @raise ParseError: when the value does not start with a quote and the
            parser is strict, or the character is not tolerated even in
            lenient mode.
        """
        if byte in '"\'':
            return 'attrval'
        if byte.isspace():
            return None
        if self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                # unquoted value
                return 'messyattr'
            if byte == '>':
                # tag ended right after '=': use 'True' as the value
                self.tagAttributes[self.attrname] = self.attrval = 'True'
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == '\\':
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return None
        self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

    # Class-level defaults for the attribute currently being parsed; the
    # attribute states overwrite them on the instance.  Presumably here so
    # that handlers can read them before any attribute has been seen.
    attrname = ''
    attrval = ''

    def begin_beforeeq(self, byte):
        """
        Clear the self-closing flag used while waiting for C{=}.

        @param byte: the character that triggered the transition; unused.
        """
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """
        Handle a character after an attribute name and whitespace, where a
        C{=} is expected.

        @return: the next state name, or C{None} to stay in this state.
        @raise ParseError: for anything but C{=} or whitespace in strict mode,
            and for characters not tolerated in lenient mode.
        """
        if byte == '=':
            return 'beforeattrval'
        if byte.isspace():
            return None
        if self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                # the previous attribute had no value; a new name begins
                self.tagAttributes[self.attrname] = self.attrval = 'True'
                return 'attrname'
            if byte == '>':
                self.tagAttributes[self.attrname] = self.attrval = 'True'
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            if byte == '/':
                # remember that the tag is self-closing
                self._beforeeq_termtag = 1
                return None
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        """
        Start a quoted attribute value.

        @param byte: the opening quote character; the value ends at the next
            occurrence of the same character.
        """
        self.attrval = ''
        self.quotetype = byte

    def do_attrval(self, byte):
        """
        Collect a quoted attribute value up to the matching quote.

        @return: C{'attrs'} at the closing quote, else C{None}.
        """
        if byte != self.quotetype:
            self.attrval += byte
            return None
        return 'attrs'

    def end_attrval(self):
        """
        Store the finished attribute and clear the name and value buffers.
        """
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = ''
        self.attrval = ''

    def begin_messyattr(self, byte):
        """
        Start an unquoted attribute value with its first character.

        @param byte: the first character of the value.
        """
        self.attrval = byte

    def do_messyattr(self, byte):
        """
        Collect an unquoted attribute value.

        Whitespace ends the value.  A C{>} ends both the value and the tag; if
        the value ends in C{/} the slash is removed and the tag is treated as
        self-closing.

        @return: the next state name, or C{None} to keep collecting.
        """
        if byte.isspace():
            return 'attrs'
        if byte != '>':
            self.attrval += byte
            return None
        selfClosing = self.attrval.endswith('/')
        if selfClosing:
            self.attrval = self.attrval[:-1]
        self.tagAttributes[self.attrname] = self.attrval
        self.gotTagStart(self.tagName, self.tagAttributes)
        if selfClosing:
            self.gotTagEnd(self.tagName)
            return 'bodydata'
        return self.maybeBodyData()

    def end_messyattr(self):
        """
        Store the unquoted attribute value, unless it is empty.
        """
        value = self.attrval
        if value:
            self.tagAttributes[self.attrname] = value

    def begin_afterslash(self, byte):
        """
        Clear the flag recording that the self-closing tag has been closed.

        @param byte: the character that triggered the transition; unused.
        """
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        """
        Handle a character after the slash of a self-terminating tag, e.g.
        C{<foo/>}.

        @return: C{'bodydata'} at the closing C{>}; C{None} when a lenient
            parser skips another character.
        @raise ParseError: for any character other than C{>} in strict mode.
        """
        if self._after_slash_closed:
            self._parseError("Mal-formed")#XXX When does this happen??
        if byte == '>':
            self._after_slash_closed = 1
            self.gotTagStart(self.tagName, self.tagAttributes)
            self.gotTagEnd(self.tagName)
            # don't need maybeBodyData here because there better not be
            # any javascript code after a <script/>... we'll see :(
            return 'bodydata'
        if self.beExtremelyLenient:
            return None
        self._parseError("No data allowed after '/'")

    def begin_bodydata(self, byte):
        """
        Start a run of text, seeded with any character that an earlier state
        set aside in C{_leadingBodyData}.

        @param byte: the character that triggered the transition; unused.
        """
        pending = self._leadingBodyData
        self.bodydata = pending or ''
        if pending:
            # consume the saved character; the class-level default shows again
            del self._leadingBodyData

    def do_bodydata(self, byte):
        """
        Collect text until a tag or an entity reference starts.

        @return: C{'tagstart'} at C{<}, C{'entityref'} at C{&}, else C{None}.
        """
        if byte == '<':
            return 'tagstart'
        elif byte == '&':
            return 'entityref'
        else:
            self.bodydata += byte
            return None

    def end_bodydata(self):
        """
        Report the collected text through L{gotText} and clear the buffer.
        """
        self.gotText(self.bodydata)
        self.bodydata = ''

    def do_waitforendscript(self, byte):
        """
        Collect script text until a C{<} shows up, which might start the
        closing tag.

        @return: C{'waitscriptendtag'} at C{<}, else C{None}.
        """
        if byte != '<':
            self.bodydata += byte
            return None
        return 'waitscriptendtag'

    def begin_waitscriptendtag(self, byte):
        """
        Reset the buffers used to recognise a possible closing script tag.

        @param byte: the character that triggered the transition; unused.
        """
        self.temptagdata = self.tagName = ''
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        """
        Decide whether the C{<} just seen inside a script starts the closing
        script tag.

        The rules are:
          1. a C{/} must come before any name character;
          2. the name characters that follow must spell C{script}; once they
             do, the script text and the end tag are reported;
          3. whitespace is ignored once the C{/} has been seen,
             e.g. C{< / script >} needs lenient handling elsewhere but
             C{</ script >} matches here;
          4. anything else means this was not a closing tag: everything read
             so far is put back into the script text.

        @return: C{'waitforgt'} when the closing tag is recognised,
            C{'waitforendscript'} when the attempt is abandoned,
            C{'waitscriptendtag'} or C{None} to keep looking.
        """
        # Remember everything read, in case it has to go back into bodydata.
        self.temptagdata += byte

        # 1
        if byte == '/':
            self.endtag = True
            return None

        if self.endtag and (byte.isalnum() or byte in identChars):
            # 2
            self.tagName += byte
            if self.tagName == 'script':
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return 'waitforgt'
            if 'script'.startswith(self.tagName):
                return None
        elif self.endtag and byte.isspace():
            # 3
            return 'waitscriptendtag'

        # 4 (also reached when no '/' was seen, or the name stopped matching)
        self.bodydata += "<" + self.temptagdata
        return 'waitforendscript'


    def begin_entityref(self, byte):
        """
        Start an entity reference with empty buffers.

        @param byte: the character that triggered the transition; unused.
        """
        self.erefbuf = ''
        # extra bit for lenient mode
        self.erefextra = ''

    def do_entityref(self, byte):
        """
        Collect the name of an entity reference up to the closing C{;}.

        Whitespace or C{<} inside a reference is an error in strict mode.  A
        lenient parser assumes a bare C{&} was meant: the reference becomes
        C{amp} and what was read so far is kept as text.

        @return: the next state name, or C{None} to keep collecting.
        @raise ParseError: for whitespace or C{<} in strict mode.
        """
        if byte == ';':
            return 'bodydata'
        if not (byte.isspace() or byte == "<"):
            self.erefbuf += byte
            return None
        if self.beExtremelyLenient:
            # '&foo' probably was '&amp;foo'
            if self.erefbuf and self.erefbuf != "amp":
                self.erefextra = self.erefbuf
            self.erefbuf = "amp"
            if byte == "<":
                return "tagstart"
            self.erefextra += byte
            return 'spacebodydata'
        self._parseError("Bad entity reference")

    def end_entityref(self):
        """
        Report the collected entity name through L{gotEntityReference}.
        """
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        """
        Start body text with what a lenient parser salvaged from a broken
        entity reference.

        @param byte: the character that triggered the transition; unused.
        """
        self.bodydata, self.erefextra = self.erefextra, None
    # Apart from how its text buffer is seeded, this state reuses the
    # 'bodydata' handlers unchanged.
    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        '''Encountered an opening tag.

        Default behaviour is to print.

        @param name: the tag name.
        @param attributes: a dictionary mapping attribute names to values.
        '''
        print 'begin', name, attributes

    def gotText(self, data):
        '''Encountered text

        Default behaviour is to print.

        @param data: the text that was collected.
        '''
        print 'text:', repr(data)

    def gotEntityReference(self, entityRef):
        '''Encountered mnemonic entity reference

        Default behaviour is to print.

        @param entityRef: the entity name, without the surrounding
            C{&} and C{;}.
        '''
        print 'entityRef: &%s;' % entityRef

    def gotComment(self, comment):
        """
        Encountered comment.

        Default behaviour is to ignore.

        @param comment: the comment text with the closing C{-->} removed.
        """

    def gotCData(self, cdata):
        """
        Encountered CDATA.

        Default behaviour is to call the gotText method.

        @param cdata: the content of the CDATA section, without the closing
            C{]]>}.
        """
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.

        Default behaviour is to print.

        @param doctype: the text collected for the DOCTYPE declaration.
        """
        print '!DOCTYPE', repr(doctype)

    def gotTagEnd(self, name):
        '''Encountered closing tag

        Default behaviour is to print.

        @param name: the name of the tag being closed.
        '''
        print 'end', name
Open Media Library Platform 2013-10-11 17:28:32 +00:00			`# -- test-case-name: twisted.web.test.test_xml --`
			`#`
			`# Copyright (c) Twisted Matrix Laboratories.`
			`# See LICENSE for details.`


			`"""`
			`Small, Uncomplicated XML.`

			`This is a very simple implementation of XML/HTML as a network`
			`protocol. It is not at all clever. Its main features are that it`
			`does not:`

			`- support namespaces`
			`- mung mnemonic entity references`
			`- validate`
			`- perform any external actions (such as fetching URLs or writing files)`
			`under any circumstances`
			`- has lots and lots of horrible hacks for supporting broken HTML (as an`
			`option, they're not on by default).`
			`"""`

			`from twisted.internet.protocol import Protocol`
			`from twisted.python.reflect import prefixedMethodNames`



			`# Elements of the three-tuples in the state table.`
			`BEGIN_HANDLER = 0`
			`DO_HANDLER = 1`
			`END_HANDLER = 2`

			`identChars = '.-_:'`
			`lenientIdentChars = identChars + ';+#/%~'`

			`def nop(args, *kw):`
			`"Do nothing."`


			`def unionlist(*args):`
			`l = []`
			`for x in args:`
			`l.extend(x)`
			`d = dict([(x, 1) for x in l])`
			`return d.keys()`


			`def zipfndict(args, *kw):`
			`default = kw.get('default', nop)`
			`d = {}`
			`for key in unionlist(*[fndict.keys() for fndict in args]):`
			`d[key] = tuple([x.get(key, default) for x in args])`
			`return d`


			`def prefixedMethodClassDict(clazz, prefix):`
			`return dict([(name, getattr(clazz, prefix + name)) for name in prefixedMethodNames(clazz, prefix)])`


			`def prefixedMethodObjDict(obj, prefix):`
			`return dict([(name, getattr(obj, prefix + name)) for name in prefixedMethodNames(obj.__class__, prefix)])`


			`class ParseError(Exception):`

			`def __init__(self, filename, line, col, message):`
			`self.filename = filename`
			`self.line = line`
			`self.col = col`
			`self.message = message`

			`def __str__(self):`
			`return "%s:%s:%s: %s" % (self.filename, self.line, self.col,`
			`self.message)`

			`class XMLParser(Protocol):`

			`state = None`
			`encodings = None`
			`filename = "<xml />"`
			`beExtremelyLenient = 0`
			`_prepend = None`

			`# _leadingBodyData will sometimes be set before switching to the`
			`# 'bodydata' state, when we "accidentally" read a byte of bodydata`
			`# in a different state.`
			`_leadingBodyData = None`

			`def connectionMade(self):`
			`self.lineno = 1`
			`self.colno = 0`
			`self.encodings = []`

			`def saveMark(self):`
			`'''Get the line number and column of the last character parsed'''`
			`# This gets replaced during dataReceived, restored afterwards`
			`return (self.lineno, self.colno)`

			`def _parseError(self, message):`
			`raise ParseError(*((self.filename,)+self.saveMark()+(message,)))`

			`def _buildStateTable(self):`
			`'''Return a dictionary of begin, do, end state function tuples'''`
			`# _buildStateTable leaves something to be desired but it does what it`
			`# does.. probably slowly, so I'm doing some evil caching so it doesn't`
			`# get called more than once per class.`
			`stateTable = getattr(self.__class__, '__stateTable', None)`
			`if stateTable is None:`
			`stateTable = self.__class__.__stateTable = zipfndict(`
			`*[prefixedMethodObjDict(self, prefix)`
			`for prefix in ('begin_', 'do_', 'end_')])`
			`return stateTable`

			`def _decode(self, data):`
			`if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:`
			`assert not len(data) & 1, 'UTF-16 must come in pairs for now'`
			`if self._prepend:`
			`data = self._prepend + data`
			`for encoding in self.encodings:`
			`data = unicode(data, encoding)`
			`return data`

			`def maybeBodyData(self):`
			`if self.endtag:`
			`return 'bodydata'`

			`# Get ready for fun! We're going to allow`
			`# <script>if (foo < bar)</script> to work!`
			`# We do this by making everything between <script> and`
			`# </script> a Text`
			`# BUT <script src="foo"> will be special-cased to do regular,`
			`# lenient behavior, because those may not have </script>`
			`# -radix`

			`if (self.tagName == 'script' and 'src' not in self.tagAttributes):`
			`# we do this ourselves rather than having begin_waitforendscript`
			`# becuase that can get called multiple times and we don't want`
			`# bodydata to get reset other than the first time.`
			`self.begin_bodydata(None)`
			`return 'waitforendscript'`
			`return 'bodydata'`



			`def dataReceived(self, data):`
			`stateTable = self._buildStateTable()`
			`if not self.state:`
			`# all UTF-16 starts with this string`
			`if data.startswith('\xff\xfe'):`
			`self._prepend = '\xff\xfe'`
			`self.encodings.append('UTF-16')`
			`data = data[2:]`
			`elif data.startswith('\xfe\xff'):`
			`self._prepend = '\xfe\xff'`
			`self.encodings.append('UTF-16')`
			`data = data[2:]`
			`self.state = 'begin'`
			`if self.encodings:`
			`data = self._decode(data)`
			`# bring state, lineno, colno into local scope`
			`lineno, colno = self.lineno, self.colno`
			`curState = self.state`
			`# replace saveMark with a nested scope function`
			`_saveMark = self.saveMark`
			`def saveMark():`
			`return (lineno, colno)`
			`self.saveMark = saveMark`
			`# fetch functions from the stateTable`
			`beginFn, doFn, endFn = stateTable[curState]`
			`try:`
			`for byte in data:`
			`# do newline stuff`
			`if byte == '\n':`
			`lineno += 1`
			`colno = 0`
			`else:`
			`colno += 1`
			`newState = doFn(byte)`
			`if newState is not None and newState != curState:`
			`# this is the endFn from the previous state`
			`endFn()`
			`curState = newState`
			`beginFn, doFn, endFn = stateTable[curState]`
			`beginFn(byte)`
			`finally:`
			`self.saveMark = _saveMark`
			`self.lineno, self.colno = lineno, colno`
			`# state doesn't make sense if there's an exception..`
			`self.state = curState`


			`def connectionLost(self, reason):`
			`"""`
			`End the last state we were in.`
			`"""`
			`stateTable = self._buildStateTable()`
			`stateTable[self.state][END_HANDLER]()`


			`# state methods`

			`def do_begin(self, byte):`
			`if byte.isspace():`
			`return`
			`if byte != '<':`
			`if self.beExtremelyLenient:`
			`self._leadingBodyData = byte`
			`return 'bodydata'`
			`self._parseError("First char of document [%r] wasn't <" % (byte,))`
			`return 'tagstart'`

			`def begin_comment(self, byte):`
			`self.commentbuf = ''`

			`def do_comment(self, byte):`
			`self.commentbuf += byte`
			`if self.commentbuf.endswith('-->'):`
			`self.gotComment(self.commentbuf[:-3])`
			`return 'bodydata'`

			`def begin_tagstart(self, byte):`
			`self.tagName = '' # name of the tag`
			`self.tagAttributes = {} # attributes of the tag`
			`self.termtag = 0 # is the tag self-terminating`
			`self.endtag = 0`

			`def do_tagstart(self, byte):`
			`if byte.isalnum() or byte in identChars:`
			`self.tagName += byte`
			`if self.tagName == '!--':`
			`return 'comment'`
			`elif byte.isspace():`
			`if self.tagName:`
			`if self.endtag:`
			`# properly strict thing to do here is probably to only`
			`# accept whitespace`
			`return 'waitforgt'`
			`return 'attrs'`
			`else:`
			`self._parseError("Whitespace before tag-name")`
			`elif byte == '>':`
			`if self.endtag:`
			`self.gotTagEnd(self.tagName)`
			`return 'bodydata'`
			`else:`
			`self.gotTagStart(self.tagName, {})`
			`return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()`
			`elif byte == '/':`
			`if self.tagName:`
			`return 'afterslash'`
			`else:`
			`self.endtag = 1`
			`elif byte in '!?':`
			`if self.tagName:`
			`if not self.beExtremelyLenient:`
			`self._parseError("Invalid character in tag-name")`
			`else:`
			`self.tagName += byte`
			`self.termtag = 1`
			`elif byte == '[':`
			`if self.tagName == '!':`
			`return 'expectcdata'`
			`else:`
			`self._parseError("Invalid '[' in tag-name")`
			`else:`
			`if self.beExtremelyLenient:`
			`self.bodydata = '<'`
			`return 'unentity'`
			`self._parseError('Invalid tag character: %r'% byte)`

			`def begin_unentity(self, byte):`
			`self.bodydata += byte`

			`def do_unentity(self, byte):`
			`self.bodydata += byte`
			`return 'bodydata'`

			`def end_unentity(self):`
			`self.gotText(self.bodydata)`

			`def begin_expectcdata(self, byte):`
			`self.cdatabuf = byte`

			`def do_expectcdata(self, byte):`
			`self.cdatabuf += byte`
			`cdb = self.cdatabuf`
			`cd = '[CDATA['`
			`if len(cd) > len(cdb):`
			`if cd.startswith(cdb):`
			`return`
			`elif self.beExtremelyLenient:`
			`## WHAT THE CRAP!? MSWord9 generates HTML that includes these`
			`## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore`
			`## 'em as best I can. this should really be a separate parse`
			`## state but I don't even have any idea what these _are_.`
			`return 'waitforgt'`
			`else:`
			`self._parseError("Mal-formed CDATA header")`
			`if cd == cdb:`
			`self.cdatabuf = ''`
			`return 'cdata'`
			`self._parseError("Mal-formed CDATA header")`

			`def do_cdata(self, byte):`
			`self.cdatabuf += byte`
			`if self.cdatabuf.endswith("]]>"):`
			`self.cdatabuf = self.cdatabuf[:-3]`
			`return 'bodydata'`

			`def end_cdata(self):`
			`self.gotCData(self.cdatabuf)`
			`self.cdatabuf = ''`

			`def do_attrs(self, byte):`
			`if byte.isalnum() or byte in identChars:`
			`# XXX FIXME really handle !DOCTYPE at some point`
			`if self.tagName == '!DOCTYPE':`
			`return 'doctype'`
			`if self.tagName[0] in '!?':`
			`return 'waitforgt'`
			`return 'attrname'`
			`elif byte.isspace():`
			`return`
			`elif byte == '>':`
			`self.gotTagStart(self.tagName, self.tagAttributes)`
			`return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()`
			`elif byte == '/':`
			`return 'afterslash'`
			`elif self.beExtremelyLenient:`
			`# discard and move on? Only case I've seen of this so far was:`
			`# <foo bar="baz"">`
			`return`
			`self._parseError("Unexpected character: %r" % byte)`

			`def begin_doctype(self, byte):`
			`self.doctype = byte`

			`def do_doctype(self, byte):`
			`if byte == '>':`
			`return 'bodydata'`
			`self.doctype += byte`

			`def end_doctype(self):`
			`self.gotDoctype(self.doctype)`
			`self.doctype = None`

			`def do_waitforgt(self, byte):`
			`if byte == '>':`
			`if self.endtag or not self.beExtremelyLenient:`
			`return 'bodydata'`
			`return self.maybeBodyData()`

			`def begin_attrname(self, byte):`
			`self.attrname = byte`
			`self._attrname_termtag = 0`

			def do_attrname(self, byte):
			    """
			    Consume one character of an attribute name.

			    @param byte: the single character to examine.
			    @return: C{None} while the name keeps growing, C{'beforeattrval'}
			        on C{'='}, C{'beforeeq'} on whitespace.  With
			        C{self.beExtremelyLenient} set: C{'attrval'} on a quote
			        character, and on C{'>'} either C{'bodydata'} (after a C{'/'}
			        was seen) or the result of C{self.maybeBodyData()}.

			    Without C{self.beExtremelyLenient}, any other character is reported
			    through C{self._parseError}.
			    """
			    if byte.isalnum() or byte in identChars:
			        self.attrname += byte
			        return None
			    if byte == '=':
			        return 'beforeattrval'
			    if byte.isspace():
			        return 'beforeeq'
			    if not self.beExtremelyLenient:
			        self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))
			        return None

			    # Everything below is lenient-mode recovery for broken markup.
			    if byte in '"\'':
			        return 'attrval'
			    if byte in lenientIdentChars or byte.isalnum():
			        self.attrname += byte
			    elif byte == '/':
			        # Remember the slash so that a following '>' closes the tag.
			        self._attrname_termtag = 1
			    elif byte == '>':
			        # Attribute without a value: record it as 'True'.
			        self.attrval = 'True'
			        self.tagAttributes[self.attrname] = self.attrval
			        self.gotTagStart(self.tagName, self.tagAttributes)
			        if self._attrname_termtag:
			            self.gotTagEnd(self.tagName)
			            return 'bodydata'
			        return self.maybeBodyData()
			    # something is really broken. let's leave this attribute where it
			    # is and move on to the next thing
			    return None

			def do_beforeattrval(self, byte):
			    """
			    Examine one character after an attribute's C{'='}, before its value.

			    @param byte: the single character to examine.
			    @return: C{'attrval'} on a quote character; C{None} on whitespace.
			        With C{self.beExtremelyLenient} set: C{'messyattr'} for an
			        unquoted value character, the result of
			        C{self.maybeBodyData()} on C{'>'}, and C{None} for a stray
			        backslash.

			    Every other character is reported through C{self._parseError}, in
			    lenient mode as well.
			    """
			    if byte in '"\'':
			        return 'attrval'
			    if byte.isspace():
			        return None
			    if self.beExtremelyLenient:
			        if byte in lenientIdentChars or byte.isalnum():
			            return 'messyattr'
			        if byte == '>':
			            # "<foo bar=>": treat the attribute as valueless.
			            self.attrval = 'True'
			            self.tagAttributes[self.attrname] = self.attrval
			            self.gotTagStart(self.tagName, self.tagAttributes)
			            return self.maybeBodyData()
			        if byte == '\\':
			            # I saw this in actual HTML once:
			            # <font size=\"3\"><sup>SM</sup></font>
			            return None
			    self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

			# Class-level defaults: empty attribute name and value.
			attrname = attrval = ''

			def begin_beforeeq(self, byte):
			    """
			    Enter the state between an attribute name and its C{'='}; reset the
			    flag that records a C{'/'} seen in this state.
			    """
			    self._beforeeq_termtag = 0

			def do_beforeeq(self, byte):
			    """
			    Examine one character between an attribute name and its C{'='}.

			    @param byte: the single character to examine.
			    @return: C{'beforeattrval'} on C{'='}; C{None} on whitespace.  With
			        C{self.beExtremelyLenient} set: C{'attrname'} when a new name
			        starts, C{None} on C{'/'}, and on C{'>'} either C{'bodydata'}
			        (after a C{'/'} was seen) or the result of
			        C{self.maybeBodyData()}.

			    Every other character is reported through C{self._parseError}.
			    """
			    if byte == '=':
			        return 'beforeattrval'
			    if byte.isspace():
			        return None
			    if self.beExtremelyLenient:
			        startsNextName = byte.isalnum() or byte in identChars
			        if startsNextName or byte == '>':
			            # The pending attribute had no value: record it as 'True'.
			            self.attrval = 'True'
			            self.tagAttributes[self.attrname] = self.attrval
			            if startsNextName:
			                return 'attrname'
			            self.gotTagStart(self.tagName, self.tagAttributes)
			            if self._beforeeq_termtag:
			                self.gotTagEnd(self.tagName)
			                return 'bodydata'
			            return self.maybeBodyData()
			        if byte == '/':
			            # Remember the slash so that a following '>' closes the tag.
			            self._beforeeq_termtag = 1
			            return None
			    self._parseError("Invalid attribute")

			def begin_attrval(self, byte):
			    """
			    Start a quoted attribute value: remember the opening quote
			    character in C{self.quotetype} and empty C{self.attrval}.
			    """
			    self.attrval = ''
			    self.quotetype = byte

			def do_attrval(self, byte):
			    """
			    Append C{byte} to the attribute value, or return C{'attrs'} when
			    C{byte} is the quote character that opened the value.
			    """
			    if byte != self.quotetype:
			        self.attrval += byte
			        return None
			    return 'attrs'

			def end_attrval(self):
			    """
			    Store the finished attribute in C{self.tagAttributes} and reset the
			    current attribute name and value to empty strings.
			    """
			    self.tagAttributes[self.attrname] = self.attrval
			    self.attrname = ''
			    self.attrval = ''

			def begin_messyattr(self, byte):
			    """
			    Start an unquoted attribute value; C{byte} is its first character.
			    """
			    self.attrval = byte

			def do_messyattr(self, byte):
			    """
			    Consume one character of an unquoted attribute value.

			    @param byte: the single character to examine.
			    @return: C{'attrs'} on whitespace; C{None} while the value keeps
			        growing; on C{'>'} either C{'bodydata'} (when the value ended
			        with C{'/'}, i.e. a self-closing tag) or the result of
			        C{self.maybeBodyData()}.
			    """
			    if byte.isspace():
			        return 'attrs'
			    if byte != '>':
			        self.attrval += byte
			        return None

			    # A trailing '/' belongs to the tag ("<foo bar=baz/>"), not the value.
			    selfClosing = self.attrval.endswith('/')
			    if selfClosing:
			        self.attrval = self.attrval[:-1]
			    self.tagAttributes[self.attrname] = self.attrval
			    self.gotTagStart(self.tagName, self.tagAttributes)
			    if selfClosing:
			        self.gotTagEnd(self.tagName)
			        return 'bodydata'
			    return self.maybeBodyData()

			def end_messyattr(self):
			    """
			    Store the unquoted attribute value in C{self.tagAttributes}, unless
			    the value is empty.
			    """
			    if not self.attrval:
			        return
			    self.tagAttributes[self.attrname] = self.attrval

			def begin_afterslash(self, byte):
			    """
			    Enter the state that follows a C{'/'} inside a tag; mark the tag as
			    not yet closed.
			    """
			    self._after_slash_closed = 0

			def do_afterslash(self, byte):
			    """
			    Handle the character following a self-terminating slash, e.g. the
			    C{'>'} of C{<foo/>}.

			    @param byte: the single character to examine.
			    @return: C{'bodydata'} after reporting the tag's start and end;
			        C{None} when a non-C{'>'} character is ignored because
			        C{self.beExtremelyLenient} is set.

			    Problems are reported through C{self._parseError}.
			    """
			    if self._after_slash_closed:
			        self._parseError("Mal-formed")  # XXX When does this happen??
			    if byte != '>':
			        if self.beExtremelyLenient:
			            return None
			        self._parseError("No data allowed after '/'")
			    self._after_slash_closed = 1
			    # An empty element is reported as a start immediately followed by
			    # an end.
			    self.gotTagStart(self.tagName, self.tagAttributes)
			    self.gotTagEnd(self.tagName)
			    # don't need maybeBodyData here because there better not be
			    # any javascript code after a <script/>... we'll see :(
			    return 'bodydata'

			def begin_bodydata(self, byte):
			    """
			    Start a run of body text.  If C{self._leadingBodyData} holds
			    anything, the run starts with it and the instance attribute is
			    deleted; otherwise the run starts empty.
			    """
			    pending = self._leadingBodyData
			    if not pending:
			        self.bodydata = ''
			        return
			    self.bodydata = pending
			    del self._leadingBodyData

			def do_bodydata(self, byte):
			    """
			    Consume one character of body text.

			    @return: C{'tagstart'} on C{'<'}, C{'entityref'} on C{'&'};
			        otherwise C{None} after appending C{byte} to C{self.bodydata}.
			    """
			    if byte == '<':
			        nextState = 'tagstart'
			    elif byte == '&':
			        nextState = 'entityref'
			    else:
			        self.bodydata += byte
			        nextState = None
			    return nextState

			def end_bodydata(self):
			    """
			    Deliver the accumulated body text to C{gotText} and then empty
			    C{self.bodydata}.
			    """
			    text = self.bodydata
			    self.gotText(text)
			    self.bodydata = ''

			def do_waitforendscript(self, byte):
			    """
			    Append C{byte} to C{self.bodydata}, or return
			    C{'waitscriptendtag'} when a C{'<'} is seen.
			    """
			    if byte != '<':
			        self.bodydata += byte
			        return None
			    return 'waitscriptendtag'

			def begin_waitscriptendtag(self, byte):
			    """
			    Reset the scratch state used while checking whether a C{'<'} seen
			    in script text starts a C{</script>} tag.
			    """
			    self.endtag = 0
			    self.tagName = ''
			    self.temptagdata = ''

			def do_waitscriptendtag(self, byte):
			    """
			    Decide, one character at a time, whether the text after a C{'<'}
			    is a C{</script>} end tag.

			    The rules:
			      1. the first character read must be C{'/'};
			      2. identifier characters after it must keep spelling a prefix of
			         C{"script"}; once the name is complete, C{gotText} receives
			         C{self.bodydata} and C{gotTagEnd} receives the tag name;
			      3. whitespace is accepted anywhere after the slash, e.g.
			         C{< / script >};
			      4. anything else means this was not an end tag: the characters
			         read so far are appended to C{self.bodydata}.

			    @param byte: the single character to examine.
			    @return: C{'waitforgt'} when C{script} has been matched,
			        C{'waitforendscript'} when the match fails,
			        C{'waitscriptendtag'} on whitespace after the slash, and
			        C{None} while the match is still in progress.
			    """
			    # If it turns out this _isn't_ a </script>, we need to
			    # remember all the data we've been through so we can append it
			    # to bodydata
			    self.temptagdata += byte

			    # 1
			    if byte == '/':
			        self.endtag = True
			        return None

			    if self.endtag:
			        # 2
			        if byte.isalnum() or byte in identChars:
			            self.tagName += byte
			            if self.tagName == 'script':
			                self.gotText(self.bodydata)
			                self.gotTagEnd(self.tagName)
			                return 'waitforgt'
			            if 'script'.startswith(self.tagName):
			                # Still a proper prefix of "script"; keep reading.
			                return None
			        # 3
			        elif byte.isspace():
			            return 'waitscriptendtag'

			    # 4: not an end tag after all; give the consumed text back.
			    self.bodydata += "<" + self.temptagdata
			    return 'waitforendscript'


			def begin_entityref(self, byte):
			    """
			    Start an entity reference: empty both the reference buffer and the
			    extra-text buffer used in lenient mode.
			    """
			    self.erefextra = ''  # extra bit for lenient mode
			    self.erefbuf = ''

			def do_entityref(self, byte):
			    """
			    Consume one character of an entity reference (the text after
			    C{'&'}).

			    @param byte: the single character to examine.
			    @return: C{'bodydata'} on the terminating C{';'}; C{None} while
			        the reference name keeps growing.  With
			        C{self.beExtremelyLenient} set, an unterminated reference
			        yields C{'tagstart'} when cut short by C{'<'} and
			        C{'spacebodydata'} when cut short by whitespace.

			    Without C{self.beExtremelyLenient}, an unterminated reference is
			    reported through C{self._parseError}.
			    """
			    if byte == ';':
			        return 'bodydata'
			    if not (byte.isspace() or byte == "<"):
			        self.erefbuf += byte
			        return None
			    if not self.beExtremelyLenient:
			        self._parseError("Bad entity reference")
			        return None

			    # A bare '&foo' was most likely meant as '&amp;foo': report the
			    # reference as "amp" and keep the text read so far as extra data.
			    if self.erefbuf and self.erefbuf != "amp":
			        self.erefextra = self.erefbuf
			    self.erefbuf = "amp"
			    if byte == "<":
			        return "tagstart"
			    self.erefextra += byte
			    return 'spacebodydata'

			def end_entityref(self):
			    """
			    Deliver the collected entity reference name to
			    C{gotEntityReference}.
			    """
			    name = self.erefbuf
			    self.gotEntityReference(name)

			# Hacky support for whitespace following '&' in an entity reference.
			# This state should only be reached when beExtremelyLenient is set.
			def begin_spacebodydata(self, byte):
			    """
			    Start a run of body text seeded with C{self.erefextra}, then set
			    C{self.erefextra} to C{None}.
			    """
			    carried = self.erefextra
			    self.erefextra = None
			    self.bodydata = carried
			# The spacebodydata state consumes and finishes text with the very same
			# handlers as the bodydata state.
			do_spacebodydata = do_bodydata
			end_spacebodydata = end_bodydata

			`# Sorta SAX-ish API`

			def gotTagStart(self, name, attributes):
			    """
			    Encountered an opening tag.

			    Default behaviour is to print C{begin}, the tag name and the
			    attributes on one line.

			    @param name: the tag name.
			    @param attributes: the attributes collected for the tag.
			    """
			    # A single pre-formatted argument prints identically whether print
			    # is a statement (Python 2) or a function (Python 3).
			    print('begin %s %s' % (name, attributes))

			def gotText(self, data):
			    """
			    Encountered text.

			    Default behaviour is to print C{text:} followed by the C{repr} of
			    the data.

			    @param data: the text that was read.
			    """
			    # A single pre-formatted argument prints identically whether print
			    # is a statement (Python 2) or a function (Python 3).
			    print('text: %r' % (data,))

			def gotEntityReference(self, entityRef):
			    """
			    Encountered mnemonic entity reference.

			    Default behaviour is to print it back in C{&name;} form.

			    @param entityRef: the reference name, without C{'&'} and C{';'}.
			    """
			    # Parenthesised single argument: valid as both the Python 2 print
			    # statement and the Python 3 print function.
			    print('entityRef: &%s;' % (entityRef,))

			def gotComment(self, comment):
			    """
			    Encountered comment.

			    Default behaviour is to ignore it.

			    @param comment: the comment that was read.
			    """
			    return None

			def gotCData(self, cdata):
			    """
			    Encountered CDATA.

			    Default behaviour is to treat it as ordinary text by calling the
			    C{gotText} method.

			    @param cdata: the contents of the CDATA section.
			    """
			    self.gotText(cdata)

			def gotDoctype(self, doctype):
			    """
			    Encountered DOCTYPE.

			    This is really grotty: it basically just gives you everything between
			    '<!DOCTYPE' and '>' as an argument.

			    Default behaviour is to print C{!DOCTYPE} followed by the C{repr}
			    of that text.

			    @param doctype: the raw text of the declaration.
			    """
			    # A single pre-formatted argument prints identically whether print
			    # is a statement (Python 2) or a function (Python 3).
			    print('!DOCTYPE %r' % (doctype,))

			def gotTagEnd(self, name):
			    """
			    Encountered closing tag.

			    Default behaviour is to print C{end} followed by the tag name.

			    @param name: the tag name.
			    """
			    # A single pre-formatted argument prints identically whether print
			    # is a statement (Python 2) or a function (Python 3).
			    print('end %s' % (name,))