split platform

j 2016-02-06 15:06:57 +05:30
commit 8c9b09577d
2261 changed files with 676163 additions and 0 deletions

View file

@@ -0,0 +1,223 @@
#
# ElementTree
# $Id: ElementInclude.py 1862 2004-06-18 07:31:02Z Fredrik $
#
# limited xinclude support for element trees
#
# history:
# 2003-08-15 fl created
# 2003-11-14 fl fixed default loader
#
# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------
"""
Limited XInclude support for the ElementTree package.
While lxml.etree has full support for XInclude (see
`etree.ElementTree.xinclude()`), this module provides a simpler, pure
Python, ElementTree compatible implementation that supports a simple
form of custom URL resolvers.
"""
from lxml import etree
import copy
try:
from urlparse import urljoin
from urllib2 import urlopen
except ImportError:
# Python 3
from urllib.parse import urljoin
from urllib.request import urlopen
try:
set
except NameError:
# Python 2.3
from sets import Set as set
XINCLUDE = "{http://www.w3.org/2001/XInclude}"
XINCLUDE_INCLUDE = XINCLUDE + "include"
XINCLUDE_FALLBACK = XINCLUDE + "fallback"
##
# Fatal include error.
class FatalIncludeError(etree.LxmlSyntaxError):
pass
##
# ET compatible default loader.
# This loader reads an included resource from disk.
#
# @param href Resource reference.
# @param parse Parse mode. Either "xml" or "text".
# @param encoding Optional text encoding.
# @return The expanded resource. If the parse mode is "xml", this
# is an ElementTree instance. If the parse mode is "text", this
# is a Unicode string. If the loader fails, it can return None
# or raise an IOError exception.
# @throws IOError If the loader fails to load the resource.
def default_loader(href, parse, encoding=None):
file = open(href, 'rb')
if parse == "xml":
data = etree.parse(file).getroot()
else:
data = file.read()
if not encoding:
encoding = 'utf-8'
data = data.decode(encoding)
file.close()
return data
##
# Default loader used by lxml.etree - handles custom resolvers properly
#
def _lxml_default_loader(href, parse, encoding=None, parser=None):
if parse == "xml":
data = etree.parse(href, parser).getroot()
else:
if "://" in href:
f = urlopen(href)
else:
f = open(href, 'rb')
data = f.read()
f.close()
if not encoding:
encoding = 'utf-8'
data = data.decode(encoding)
return data
##
# Wrapper for ET compatibility - drops the parser
def _wrap_et_loader(loader):
def load(href, parse, encoding=None, parser=None):
return loader(href, parse, encoding)
return load
##
# Expand XInclude directives.
#
# @param elem Root element.
# @param loader Optional resource loader. If omitted, it defaults
# to {@link default_loader}. If given, it should be a callable
# that implements the same interface as <b>default_loader</b>.
# @throws FatalIncludeError If the function fails to include a given
# resource, or if the tree contains malformed XInclude elements.
# @throws IOError If the function fails to load a given resource.
# @returns the node or its replacement if it was an XInclude node
def include(elem, loader=None, base_url=None):
if base_url is None:
if hasattr(elem, 'getroot'):
tree = elem
elem = elem.getroot()
else:
tree = elem.getroottree()
if hasattr(tree, 'docinfo'):
base_url = tree.docinfo.URL
elif hasattr(elem, 'getroot'):
elem = elem.getroot()
_include(elem, loader, base_url=base_url)
def _include(elem, loader=None, _parent_hrefs=None, base_url=None):
if loader is not None:
load_include = _wrap_et_loader(loader)
else:
load_include = _lxml_default_loader
if _parent_hrefs is None:
_parent_hrefs = set()
parser = elem.getroottree().parser
include_elements = list(
elem.iter('{http://www.w3.org/2001/XInclude}*'))
for e in include_elements:
if e.tag == XINCLUDE_INCLUDE:
# process xinclude directive
href = urljoin(base_url, e.get("href"))
parse = e.get("parse", "xml")
parent = e.getparent()
if parse == "xml":
if href in _parent_hrefs:
raise FatalIncludeError(
"recursive include of %r detected" % href
)
_parent_hrefs.add(href)
node = load_include(href, parse, parser=parser)
if node is None:
raise FatalIncludeError(
"cannot load %r as %r" % (href, parse)
)
node = _include(node, loader, _parent_hrefs)
if e.tail:
node.tail = (node.tail or "") + e.tail
if parent is None:
return node # replaced the root node!
parent.replace(e, node)
elif parse == "text":
text = load_include(href, parse, encoding=e.get("encoding"))
if text is None:
raise FatalIncludeError(
"cannot load %r as %r" % (href, parse)
)
predecessor = e.getprevious()
if predecessor is not None:
predecessor.tail = (predecessor.tail or "") + text
elif parent is None:
return text # replaced the root node!
else:
parent.text = (parent.text or "") + text + (e.tail or "")
parent.remove(e)
else:
raise FatalIncludeError(
"unknown parse type in xi:include tag (%r)" % parse
)
elif e.tag == XINCLUDE_FALLBACK:
parent = e.getparent()
if parent is not None and parent.tag != XINCLUDE_INCLUDE:
raise FatalIncludeError(
"xi:fallback tag must be child of xi:include (%r)" % e.tag
)
else:
raise FatalIncludeError(
"Invalid element found in XInclude namespace (%r)" % e.tag
)
return elem
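
For orientation, a minimal usage sketch of the module above; the file name doc.xml and the logging loader are illustrative, not part of this commit, and the module is assumed to be importable as lxml.ElementInclude:

from lxml import etree
from lxml import ElementInclude

def logging_loader(href, parse, encoding=None):
    # illustrative custom loader: log each resource, then defer to the default
    print("including %s (parse=%s)" % (href, parse))
    return ElementInclude.default_loader(href, parse, encoding)

tree = etree.parse("doc.xml")                      # hypothetical input file
ElementInclude.include(tree.getroot(), loader=logging_loader)
print(etree.tostring(tree, pretty_print=True).decode())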

View file

@@ -0,0 +1,20 @@
# this is a package
def get_include():
"""
Returns a list of header include paths (for lxml itself, libxml2
and libxslt) needed to compile C code against lxml if it was built
with statically linked libraries.
"""
import os
lxml_path = __path__[0]
include_path = os.path.join(lxml_path, 'includes')
includes = [include_path, lxml_path]
for name in os.listdir(include_path):
path = os.path.join(include_path, name)
if os.path.isdir(path):
includes.append(path)
return includes
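
A hedged sketch of how get_include() might be consumed from a setup script when compiling a C extension against a statically built lxml; the extension name and C source file are hypothetical:

from setuptools import setup, Extension
import lxml

setup(
    name="myext",
    ext_modules=[
        Extension(
            "myext",
            sources=["myext.c"],                 # hypothetical C source
            include_dirs=lxml.get_include(),     # lxml, libxml2 and libxslt headers
        )
    ],
)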

View file

@@ -0,0 +1,315 @@
#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
# 2007-09-10 fl new selection engine
# 2007-09-12 fl fixed parent selector
# 2007-09-13 fl added iterfind; changed findall to return a list
# 2007-11-30 fl added namespaces support
# 2009-10-30 fl added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------
##
# Implementation module for XPath support. There's usually no reason
# to import this module directly; the <b>ElementTree</b> does this for
# you, if needed.
##
import re
xpath_tokenizer_re = re.compile(
"("
"'[^']*'|\"[^\"]*\"|"
"::|"
"//?|"
"\.\.|"
"\(\)|"
"[/.*:\[\]\(\)@=])|"
"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
"\s+"
)
def xpath_tokenizer(pattern, namespaces=None):
for token in xpath_tokenizer_re.findall(pattern):
tag = token[1]
if tag and tag[0] != "{" and ":" in tag:
try:
prefix, uri = tag.split(":", 1)
if not namespaces:
raise KeyError
yield token[0], "{%s}%s" % (namespaces[prefix], uri)
except KeyError:
raise SyntaxError("prefix %r not found in prefix map" % prefix)
else:
yield token
def prepare_child(next, token):
tag = token[1]
def select(result):
for elem in result:
for e in elem.iterchildren(tag):
yield e
return select
def prepare_star(next, token):
def select(result):
for elem in result:
for e in elem.iterchildren('*'):
yield e
return select
def prepare_self(next, token):
def select(result):
return result
return select
def prepare_descendant(next, token):
token = next()
if token[0] == "*":
tag = "*"
elif not token[0]:
tag = token[1]
else:
raise SyntaxError("invalid descendant")
def select(result):
for elem in result:
for e in elem.iterdescendants(tag):
yield e
return select
def prepare_parent(next, token):
def select(result):
for elem in result:
parent = elem.getparent()
if parent is not None:
yield parent
return select
def prepare_predicate(next, token):
# FIXME: replace with real parser!!! refs:
# http://effbot.org/zone/simple-iterator-parser.htm
# http://javascript.crockford.com/tdop/tdop.html
signature = []
predicate = []
while 1:
token = next()
if token[0] == "]":
break
if token[0] and token[0][:1] in "'\"":
token = "'", token[0][1:-1]
signature.append(token[0] or "-")
predicate.append(token[1])
signature = "".join(signature)
# use signature to determine predicate type
if signature == "@-":
# [@attribute] predicate
key = predicate[1]
def select(result):
for elem in result:
if elem.get(key) is not None:
yield elem
return select
if signature == "@-='":
# [@attribute='value']
key = predicate[1]
value = predicate[-1]
def select(result):
for elem in result:
if elem.get(key) == value:
yield elem
return select
if signature == "-" and not re.match("-?\d+$", predicate[0]):
# [tag]
tag = predicate[0]
def select(result):
for elem in result:
for _ in elem.iterchildren(tag):
yield elem
break
return select
if signature == "-='" and not re.match("-?\d+$", predicate[0]):
# [tag='value']
tag = predicate[0]
value = predicate[-1]
def select(result):
for elem in result:
for e in elem.iterchildren(tag):
if "".join(e.itertext()) == value:
yield elem
break
return select
if signature == "-" or signature == "-()" or signature == "-()-":
# [index] or [last()] or [last()-index]
if signature == "-":
# [index]
index = int(predicate[0]) - 1
if index < 0:
if index == -1:
raise SyntaxError(
"indices in path predicates are 1-based, not 0-based")
else:
raise SyntaxError("path index >= 1 expected")
else:
if predicate[0] != "last":
raise SyntaxError("unsupported function")
if signature == "-()-":
try:
index = int(predicate[2]) - 1
except ValueError:
raise SyntaxError("unsupported expression")
else:
index = -1
def select(result):
for elem in result:
parent = elem.getparent()
if parent is None:
continue
try:
# FIXME: what if the selector is "*" ?
elems = list(parent.iterchildren(elem.tag))
if elems[index] is elem:
yield elem
except IndexError:
pass
return select
raise SyntaxError("invalid predicate")
ops = {
"": prepare_child,
"*": prepare_star,
".": prepare_self,
"..": prepare_parent,
"//": prepare_descendant,
"[": prepare_predicate,
}
# --------------------------------------------------------------------
_cache = {}
def _build_path_iterator(path, namespaces):
"""compile selector pattern"""
if namespaces and (None in namespaces or '' in namespaces):
raise ValueError("empty namespace prefix is not supported in ElementPath")
if path[-1:] == "/":
path += "*" # implicit all (FIXME: keep this?)
cache_key = (path, namespaces and tuple(sorted(namespaces.items())) or None)
try:
return _cache[cache_key]
except KeyError:
pass
if len(_cache) > 100:
_cache.clear()
if path[:1] == "/":
raise SyntaxError("cannot use absolute path on element")
stream = iter(xpath_tokenizer(path, namespaces))
try:
_next = stream.next
except AttributeError:
# Python 3
_next = stream.__next__
try:
token = _next()
except StopIteration:
raise SyntaxError("empty path expression")
selector = []
while 1:
try:
selector.append(ops[token[0]](_next, token))
except StopIteration:
raise SyntaxError("invalid path")
try:
token = _next()
if token[0] == "/":
token = _next()
except StopIteration:
break
_cache[cache_key] = selector
return selector
##
# Iterate over the matching nodes
def iterfind(elem, path, namespaces=None):
selector = _build_path_iterator(path, namespaces)
result = iter((elem,))
for select in selector:
result = select(result)
return result
##
# Find first matching object.
def find(elem, path, namespaces=None):
it = iterfind(elem, path, namespaces)
try:
try:
_next = it.next
except AttributeError:
return next(it)
else:
return _next()
except StopIteration:
return None
##
# Find all matching objects.
def findall(elem, path, namespaces=None):
return list(iterfind(elem, path, namespaces))
##
# Find text for first matching object.
def findtext(elem, path, default=None, namespaces=None):
el = find(elem, path, namespaces)
if el is None:
return default
else:
return el.text or ''
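
The path syntax implemented above is normally reached through Element.find(), findall() and iterfind(); a small sketch of the supported predicates (the sample document is made up):

from lxml import etree

root = etree.XML(
    "<root><a x='1'><b>one</b><b>two</b></a><a><b>three</b></a></root>")

print(root.find("a[@x='1']").tag)              # 'a'   - [@attribute='value']
print(root.findtext("a/b[2]"))                 # 'two' - 1-based [index]
print([b.text for b in root.iterfind(".//b")]) # ['one', 'two', 'three'] - '//'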

View file

@@ -0,0 +1,246 @@
#
# Element generator factory by Fredrik Lundh.
#
# Source:
# http://online.effbot.org/2006_11_01_archive.htm#et-builder
# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------
"""
The ``E`` Element factory for generating XML documents.
"""
import lxml.etree as ET
try:
from functools import partial
except ImportError:
# fake it for pre-2.5 releases
def partial(func, tag):
return lambda *args, **kwargs: func(tag, *args, **kwargs)
try:
callable
except NameError:
# Python 3
def callable(f):
return hasattr(f, '__call__')
try:
basestring
except NameError:
basestring = str
try:
unicode
except NameError:
unicode = str
class ElementMaker(object):
"""Element generator factory.
Unlike the ordinary Element factory, the E factory allows you to pass in
more than just a tag and some optional attributes; you can also pass in
text and other elements. The text is added as either text or tail
attributes, and elements are inserted at the right spot. Some small
examples::
>>> from lxml import etree as ET
>>> from lxml.builder import E
>>> ET.tostring(E("tag"))
'<tag/>'
>>> ET.tostring(E("tag", "text"))
'<tag>text</tag>'
>>> ET.tostring(E("tag", "text", key="value"))
'<tag key="value">text</tag>'
>>> ET.tostring(E("tag", E("subtag", "text"), "tail"))
'<tag><subtag>text</subtag>tail</tag>'
For simple tags, the factory also allows you to write ``E.tag(...)`` instead
of ``E('tag', ...)``::
>>> ET.tostring(E.tag())
'<tag/>'
>>> ET.tostring(E.tag("text"))
'<tag>text</tag>'
>>> ET.tostring(E.tag(E.subtag("text"), "tail"))
'<tag><subtag>text</subtag>tail</tag>'
Here's a somewhat larger example; this shows how to generate HTML
documents, using a mix of prepared factory functions for inline elements,
nested ``E.tag`` calls, and embedded XHTML fragments::
# some common inline elements
A = E.a
I = E.i
B = E.b
def CLASS(v):
# helper function, 'class' is a reserved word
return {'class': v}
page = (
E.html(
E.head(
E.title("This is a sample document")
),
E.body(
E.h1("Hello!", CLASS("title")),
E.p("This is a paragraph with ", B("bold"), " text in it!"),
E.p("This is another paragraph, with a ",
A("link", href="http://www.python.org"), "."),
E.p("Here are some reservered characters: <spam&egg>."),
ET.XML("<p>And finally, here is an embedded XHTML fragment.</p>"),
)
)
)
print ET.tostring(page)
Here's a prettyprinted version of the output from the above script::
<html>
<head>
<title>This is a sample document</title>
</head>
<body>
<h1 class="title">Hello!</h1>
<p>This is a paragraph with <b>bold</b> text in it!</p>
<p>This is another paragraph, with <a href="http://www.python.org">link</a>.</p>
<p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
<p>And finally, here is an embedded XHTML fragment.</p>
</body>
</html>
For namespace support, you can pass a namespace map (``nsmap``)
and/or a specific target ``namespace`` to the ElementMaker class::
>>> E = ElementMaker(namespace="http://my.ns/")
>>> print(ET.tostring( E.test ))
<test xmlns="http://my.ns/"/>
>>> E = ElementMaker(namespace="http://my.ns/", nsmap={'p':'http://my.ns/'})
>>> print(ET.tostring( E.test ))
<p:test xmlns:p="http://my.ns/"/>
"""
def __init__(self, typemap=None,
namespace=None, nsmap=None, makeelement=None):
if namespace is not None:
self._namespace = '{' + namespace + '}'
else:
self._namespace = None
if nsmap:
self._nsmap = dict(nsmap)
else:
self._nsmap = None
if makeelement is not None:
assert callable(makeelement)
self._makeelement = makeelement
else:
self._makeelement = ET.Element
# initialize type map for this element factory
if typemap:
typemap = typemap.copy()
else:
typemap = {}
def add_text(elem, item):
try:
elem[-1].tail = (elem[-1].tail or "") + item
except IndexError:
elem.text = (elem.text or "") + item
def add_cdata(elem, cdata):
if elem.text:
raise ValueError("Can't add a CDATA section. Element already has some text: %r" % elem.text)
elem.text = cdata
if str not in typemap:
typemap[str] = add_text
if unicode not in typemap:
typemap[unicode] = add_text
if ET.CDATA not in typemap:
typemap[ET.CDATA] = add_cdata
def add_dict(elem, item):
attrib = elem.attrib
for k, v in item.items():
if isinstance(v, basestring):
attrib[k] = v
else:
attrib[k] = typemap[type(v)](None, v)
if dict not in typemap:
typemap[dict] = add_dict
self._typemap = typemap
def __call__(self, tag, *children, **attrib):
get = self._typemap.get
if self._namespace is not None and tag[0] != '{':
tag = self._namespace + tag
elem = self._makeelement(tag, nsmap=self._nsmap)
if attrib:
get(dict)(elem, attrib)
for item in children:
if callable(item):
item = item()
t = get(type(item))
if t is None:
if ET.iselement(item):
elem.append(item)
continue
for basetype in type(item).__mro__:
# See if the typemap knows of any of this type's bases.
t = get(basetype)
if t is not None:
break
else:
raise TypeError("bad argument type: %s(%r)" %
(type(item).__name__, item))
v = t(elem, item)
if v:
get(type(v))(elem, v)
return elem
def __getattr__(self, tag):
return partial(self, tag)
# create factory object
E = ElementMaker()
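
Beyond the docstring examples above, a short sketch of the typemap hook; the int entry below is an assumption added for illustration, mapping integer children to text:

from lxml import etree
from lxml.builder import ElementMaker

# serialise int children through the typemap; add_text() then appends the result
M = ElementMaker(typemap={int: lambda elem, value: str(value)})
point = M.point(M.x(4), M.y(7), {"unit": "px"})   # dict children become attributes
print(etree.tostring(point))
# roughly: b'<point unit="px"><x>4</x><y>7</y></point>'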

View file

@@ -0,0 +1,102 @@
"""CSS Selectors based on XPath.
This module supports selecting XML/HTML tags based on CSS selectors.
See the `CSSSelector` class for details.
This is a thin wrapper around cssselect 0.7 or later.
"""
from __future__ import absolute_import
from . import etree
try:
import cssselect as external_cssselect
except ImportError:
raise ImportError(
'cssselect does not seem to be installed. '
'See http://packages.python.org/cssselect/')
SelectorSyntaxError = external_cssselect.SelectorSyntaxError
ExpressionError = external_cssselect.ExpressionError
SelectorError = external_cssselect.SelectorError
__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError',
'CSSSelector']
class LxmlTranslator(external_cssselect.GenericTranslator):
"""
A custom CSS selector to XPath translator with lxml-specific extensions.
"""
def xpath_contains_function(self, xpath, function):
# Defined there, removed in later drafts:
# http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
if function.argument_types() not in (['STRING'], ['IDENT']):
raise ExpressionError(
"Expected a single string or ident for :contains(), got %r"
% function.arguments)
value = function.arguments[0].value
return xpath.add_condition(
'contains(__lxml_internal_css:lower-case(string(.)), %s)'
% self.xpath_literal(value.lower()))
class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator):
"""
lxml extensions + HTML support.
"""
def _make_lower_case(context, s):
return s.lower()
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
ns.prefix = '__lxml_internal_css'
ns['lower-case'] = _make_lower_case
class CSSSelector(etree.XPath):
"""A CSS selector.
Usage::
>>> from lxml import etree, cssselect
>>> select = cssselect.CSSSelector("a tag > child")
>>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
>>> [ el.tag for el in select(root) ]
['child']
To use CSS namespaces, you need to pass a prefix-to-namespace
mapping as ``namespaces`` keyword argument::
>>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
>>> select_ns = cssselect.CSSSelector('root > rdf|Description',
... namespaces={'rdf': rdfns})
>>> rdf = etree.XML((
... '<root xmlns:rdf="%s">'
... '<rdf:Description>blah</rdf:Description>'
... '</root>') % rdfns)
>>> [(el.tag, el.text) for el in select_ns(rdf)]
[('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
"""
def __init__(self, css, namespaces=None, translator='xml'):
if translator == 'xml':
translator = LxmlTranslator()
elif translator == 'html':
translator = LxmlHTMLTranslator()
elif translator == 'xhtml':
translator = LxmlHTMLTranslator(xhtml=True)
path = translator.css_to_xpath(css)
etree.XPath.__init__(self, path, namespaces=namespaces)
self.css = css
def __repr__(self):
return '<%s %s for %r>' % (
self.__class__.__name__,
hex(abs(id(self)))[2:],
self.css)
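
A brief sketch of the :contains() extension wired up by LxmlTranslator above (a case-insensitive substring match on the element's text content); the sample markup is illustrative:

from lxml import etree
from lxml.cssselect import CSSSelector

root = etree.XML('<ul><li>Alpha</li><li>beta</li><li>Gamma</li></ul>')
select = CSSSelector('li:contains("alpha")')
print([el.text for el in select(root)])   # ['Alpha'] - matched case-insensitively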

View file

@@ -0,0 +1,508 @@
"""
lxml-based doctest output comparison.
Note: normally, you should just import the `lxml.usedoctest` and
`lxml.html.usedoctest` modules from within a doctest, instead of this
one::
>>> import lxml.usedoctest # for XML output
>>> import lxml.html.usedoctest # for HTML output
To use this module directly, you must call ``lxmldoctest.install()``,
which will cause doctest to use this in all subsequent calls.
This changes the way output is checked and comparisons are made for
XML or HTML-like content.
XML or HTML content is noticed because the example starts with ``<``
(it's HTML if it starts with ``<html``). You can also use the
``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.
Some rough wildcard-like things are allowed. Whitespace is generally
ignored (except in attributes). In text (attributes and text in the
body) you can use ``...`` as a wildcard. In an example it also
matches any trailing tags in the element, though it does not match
leading tags. You may create a tag ``<any>`` or include an ``any``
attribute in the tag. An ``any`` tag matches any tag, while the
attribute matches any and all attributes.
When a match fails, the reformatted example and the actual output are
displayed (indented), and a rough diff-like output is given. Anything
marked with ``+`` is in the output but wasn't supposed to be, and
similarly ``-`` means it's in the example but wasn't in the output.
You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP``
"""
from lxml import etree
import sys
import re
import doctest
try:
from html import escape as html_escape
except ImportError:
from cgi import escape as html_escape
__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
'LHTMLOutputChecker', 'install', 'temp_install']
try:
_basestring = basestring
except NameError:
_basestring = (str, bytes)
_IS_PYTHON_3 = sys.version_info[0] >= 3
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')
OutputChecker = doctest.OutputChecker
def strip(v):
if v is None:
return None
else:
return v.strip()
def norm_whitespace(v):
return _norm_whitespace_re.sub(' ', v)
_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)
def html_fromstring(html):
return etree.fromstring(html, _html_parser)
# We use this to distinguish repr()s from elements:
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
class LXMLOutputChecker(OutputChecker):
empty_tags = (
'param', 'img', 'area', 'br', 'basefont', 'input',
'base', 'meta', 'link', 'col')
def get_default_parser(self):
return etree.XML
def check_output(self, want, got, optionflags):
alt_self = getattr(self, '_temp_override_self', None)
if alt_self is not None:
super_method = self._temp_call_super_check_output
self = alt_self
else:
super_method = OutputChecker.check_output
parser = self.get_parser(want, got, optionflags)
if not parser:
return super_method(
self, want, got, optionflags)
try:
want_doc = parser(want)
except etree.XMLSyntaxError:
return False
try:
got_doc = parser(got)
except etree.XMLSyntaxError:
return False
return self.compare_docs(want_doc, got_doc)
def get_parser(self, want, got, optionflags):
parser = None
if NOPARSE_MARKUP & optionflags:
return None
if PARSE_HTML & optionflags:
parser = html_fromstring
elif PARSE_XML & optionflags:
parser = etree.XML
elif (want.strip().lower().startswith('<html')
and got.strip().startswith('<html')):
parser = html_fromstring
elif (self._looks_like_markup(want)
and self._looks_like_markup(got)):
parser = self.get_default_parser()
return parser
def _looks_like_markup(self, s):
s = s.strip()
return (s.startswith('<')
and not _repr_re.search(s))
def compare_docs(self, want, got):
if not self.tag_compare(want.tag, got.tag):
return False
if not self.text_compare(want.text, got.text, True):
return False
if not self.text_compare(want.tail, got.tail, True):
return False
if 'any' not in want.attrib:
want_keys = sorted(want.attrib.keys())
got_keys = sorted(got.attrib.keys())
if want_keys != got_keys:
return False
for key in want_keys:
if not self.text_compare(want.attrib[key], got.attrib[key], False):
return False
if want.text != '...' or len(want):
want_children = list(want)
got_children = list(got)
while want_children or got_children:
if not want_children or not got_children:
return False
want_first = want_children.pop(0)
got_first = got_children.pop(0)
if not self.compare_docs(want_first, got_first):
return False
if not got_children and want_first.tail == '...':
break
return True
def text_compare(self, want, got, strip):
want = want or ''
got = got or ''
if strip:
want = norm_whitespace(want).strip()
got = norm_whitespace(got).strip()
want = '^%s$' % re.escape(want)
want = want.replace(r'\.\.\.', '.*')
if re.search(want, got):
return True
else:
return False
def tag_compare(self, want, got):
if want == 'any':
return True
if (not isinstance(want, _basestring)
or not isinstance(got, _basestring)):
return want == got
want = want or ''
got = got or ''
if want.startswith('{...}'):
# Ellipsis on the namespace
return want.split('}')[-1] == got.split('}')[-1]
else:
return want == got
def output_difference(self, example, got, optionflags):
want = example.want
parser = self.get_parser(want, got, optionflags)
errors = []
if parser is not None:
try:
want_doc = parser(want)
except etree.XMLSyntaxError:
e = sys.exc_info()[1]
errors.append('In example: %s' % e)
try:
got_doc = parser(got)
except etree.XMLSyntaxError:
e = sys.exc_info()[1]
errors.append('In actual output: %s' % e)
if parser is None or errors:
value = OutputChecker.output_difference(
self, example, got, optionflags)
if errors:
errors.append(value)
return '\n'.join(errors)
else:
return value
html = parser is html_fromstring
diff_parts = []
diff_parts.append('Expected:')
diff_parts.append(self.format_doc(want_doc, html, 2))
diff_parts.append('Got:')
diff_parts.append(self.format_doc(got_doc, html, 2))
diff_parts.append('Diff:')
diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2))
return '\n'.join(diff_parts)
def html_empty_tag(self, el, html=True):
if not html:
return False
if el.tag not in self.empty_tags:
return False
if el.text or len(el):
# This shouldn't happen (contents in an empty tag)
return False
return True
def format_doc(self, doc, html, indent, prefix=''):
parts = []
if not len(doc):
# No children...
parts.append(' '*indent)
parts.append(prefix)
parts.append(self.format_tag(doc))
if not self.html_empty_tag(doc, html):
if strip(doc.text):
parts.append(self.format_text(doc.text))
parts.append(self.format_end_tag(doc))
if strip(doc.tail):
parts.append(self.format_text(doc.tail))
parts.append('\n')
return ''.join(parts)
parts.append(' '*indent)
parts.append(prefix)
parts.append(self.format_tag(doc))
if not self.html_empty_tag(doc, html):
parts.append('\n')
if strip(doc.text):
parts.append(' '*indent)
parts.append(self.format_text(doc.text))
parts.append('\n')
for el in doc:
parts.append(self.format_doc(el, html, indent+2))
parts.append(' '*indent)
parts.append(self.format_end_tag(doc))
parts.append('\n')
if strip(doc.tail):
parts.append(' '*indent)
parts.append(self.format_text(doc.tail))
parts.append('\n')
return ''.join(parts)
def format_text(self, text, strip=True):
if text is None:
return ''
if strip:
text = text.strip()
return html_escape(text, 1)
def format_tag(self, el):
attrs = []
if isinstance(el, etree.CommentBase):
# FIXME: probably PIs should be handled specially too?
return '<!--'
for name, value in sorted(el.attrib.items()):
attrs.append('%s="%s"' % (name, self.format_text(value, False)))
if not attrs:
return '<%s>' % el.tag
return '<%s %s>' % (el.tag, ' '.join(attrs))
def format_end_tag(self, el):
if isinstance(el, etree.CommentBase):
# FIXME: probably PIs should be handled specially too?
return '-->'
return '</%s>' % el.tag
def collect_diff(self, want, got, html, indent):
parts = []
if not len(want) and not len(got):
parts.append(' '*indent)
parts.append(self.collect_diff_tag(want, got))
if not self.html_empty_tag(got, html):
parts.append(self.collect_diff_text(want.text, got.text))
parts.append(self.collect_diff_end_tag(want, got))
parts.append(self.collect_diff_text(want.tail, got.tail))
parts.append('\n')
return ''.join(parts)
parts.append(' '*indent)
parts.append(self.collect_diff_tag(want, got))
parts.append('\n')
if strip(want.text) or strip(got.text):
parts.append(' '*indent)
parts.append(self.collect_diff_text(want.text, got.text))
parts.append('\n')
want_children = list(want)
got_children = list(got)
while want_children or got_children:
if not want_children:
parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+'))
continue
if not got_children:
parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-'))
continue
parts.append(self.collect_diff(
want_children.pop(0), got_children.pop(0), html, indent+2))
parts.append(' '*indent)
parts.append(self.collect_diff_end_tag(want, got))
parts.append('\n')
if strip(want.tail) or strip(got.tail):
parts.append(' '*indent)
parts.append(self.collect_diff_text(want.tail, got.tail))
parts.append('\n')
return ''.join(parts)
def collect_diff_tag(self, want, got):
if not self.tag_compare(want.tag, got.tag):
tag = '%s (got: %s)' % (want.tag, got.tag)
else:
tag = got.tag
attrs = []
any = want.tag == 'any' or 'any' in want.attrib
for name, value in sorted(got.attrib.items()):
if name not in want.attrib and not any:
attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
else:
if name in want.attrib:
text = self.collect_diff_text(want.attrib[name], value, False)
else:
text = self.format_text(value, False)
attrs.append('%s="%s"' % (name, text))
if not any:
for name, value in sorted(want.attrib.items()):
if name in got.attrib:
continue
attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
if attrs:
tag = '<%s %s>' % (tag, ' '.join(attrs))
else:
tag = '<%s>' % tag
return tag
def collect_diff_end_tag(self, want, got):
if want.tag != got.tag:
tag = '%s (got: %s)' % (want.tag, got.tag)
else:
tag = got.tag
return '</%s>' % tag
def collect_diff_text(self, want, got, strip=True):
if self.text_compare(want, got, strip):
if not got:
return ''
return self.format_text(got, strip)
text = '%s (got: %s)' % (want, got)
return self.format_text(text, strip)
class LHTMLOutputChecker(LXMLOutputChecker):
def get_default_parser(self):
return html_fromstring
def install(html=False):
"""
Install doctestcompare for all future doctests.
If html is true, then by default the HTML parser will be used;
otherwise the XML parser is used.
"""
if html:
doctest.OutputChecker = LHTMLOutputChecker
else:
doctest.OutputChecker = LXMLOutputChecker
def temp_install(html=False, del_module=None):
"""
Use this *inside* a doctest to enable this checker for this
doctest only.
If html is true, then by default the HTML parser will be used;
otherwise the XML parser is used.
"""
if html:
Checker = LHTMLOutputChecker
else:
Checker = LXMLOutputChecker
frame = _find_doctest_frame()
dt_self = frame.f_locals['self']
checker = Checker()
old_checker = dt_self._checker
dt_self._checker = checker
# The unfortunate thing is that there is a local variable 'check'
# in the function that runs the doctests, that is a bound method
# into the output checker. We have to update that. We can't
# modify the frame, so we have to modify the object in place. The
# only way to do this is to actually change the func_code
# attribute of the method. We change it, and then wait for
# __record_outcome to be run, which signals the end of the __run
# method, at which point we restore the previous check_output
# implementation.
if _IS_PYTHON_3:
check_func = frame.f_locals['check'].__func__
checker_check_func = checker.check_output.__func__
else:
check_func = frame.f_locals['check'].im_func
checker_check_func = checker.check_output.im_func
# Because we can't patch up func_globals, this is the only global
# in check_output that we care about:
doctest.etree = etree
_RestoreChecker(dt_self, old_checker, checker,
check_func, checker_check_func,
del_module)
class _RestoreChecker(object):
def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
del_module):
self.dt_self = dt_self
self.checker = old_checker
self.checker._temp_call_super_check_output = self.call_super
self.checker._temp_override_self = new_checker
self.check_func = check_func
self.clone_func = clone_func
self.del_module = del_module
self.install_clone()
self.install_dt_self()
def install_clone(self):
if _IS_PYTHON_3:
self.func_code = self.check_func.__code__
self.func_globals = self.check_func.__globals__
self.check_func.__code__ = self.clone_func.__code__
else:
self.func_code = self.check_func.func_code
self.func_globals = self.check_func.func_globals
self.check_func.func_code = self.clone_func.func_code
def uninstall_clone(self):
if _IS_PYTHON_3:
self.check_func.__code__ = self.func_code
else:
self.check_func.func_code = self.func_code
def install_dt_self(self):
self.prev_func = self.dt_self._DocTestRunner__record_outcome
self.dt_self._DocTestRunner__record_outcome = self
def uninstall_dt_self(self):
self.dt_self._DocTestRunner__record_outcome = self.prev_func
def uninstall_module(self):
if self.del_module:
import sys
del sys.modules[self.del_module]
if '.' in self.del_module:
package, module = self.del_module.rsplit('.', 1)
package_mod = sys.modules[package]
delattr(package_mod, module)
def __call__(self, *args, **kw):
self.uninstall_clone()
self.uninstall_dt_self()
del self.checker._temp_override_self
del self.checker._temp_call_super_check_output
result = self.prev_func(*args, **kw)
self.uninstall_module()
return result
def call_super(self, *args, **kw):
self.uninstall_clone()
try:
return self.check_func(*args, **kw)
finally:
self.install_clone()
def _find_doctest_frame():
import sys
frame = sys._getframe(1)
while frame:
l = frame.f_locals
if 'BOOM' in l:
# Sign of doctest
return frame
frame = frame.f_back
raise LookupError(
"Could not find doctest (only use this function *inside* a doctest)")
__test__ = {
'basic': '''
>>> temp_install()
>>> print """<xml a="1" b="2">stuff</xml>"""
<xml b="2" a="1">...</xml>
>>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>"""
<xml xmlns="...">
<tag attr="..." />
</xml>
>>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
<xml>...foo /></xml>
'''}
if __name__ == '__main__':
import doctest
doctest.testmod()
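
As a quick illustration, the output checker can also be driven directly, outside of doctest; a minimal sketch (the snippets compared are made up):

from lxml.doctestcompare import LXMLOutputChecker

checker = LXMLOutputChecker()
# attribute order and whitespace are normalised; '...' acts as a wildcard
print(checker.check_output('<a b="1" c="2">...</a>',
                           '<a c="2" b="1">text</a>', 0))   # True
print(checker.check_output('<a/>', '<b/>', 0))              # False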

Binary file not shown.

View file

@@ -0,0 +1,10 @@
__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
"""
__all__ = ["parse", "convert_tree"]
from soupparser import convert_tree, parse as _parse
def parse(file, beautifulsoup=None, makeelement=None):
root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
return root.getroot()
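
A legacy usage sketch, assuming BeautifulSoup is installed and the module is importable as lxml.html.ElementSoup; the input file name is hypothetical:

from lxml.html import ElementSoup

# parse tag soup with BeautifulSoup and get back an lxml root element
root = ElementSoup.parse(open("broken.html"))
print(root.tag)   # typically 'html'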

File diff suppressed because it is too large

View file

@@ -0,0 +1,87 @@
import optparse
import sys
import re
import os
from lxml.html.diff import htmldiff
description = """\
"""
parser = optparse.OptionParser(
usage="%prog [OPTIONS] FILE1 FILE2\n"
"%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
description=description,
)
parser.add_option(
'-o', '--output',
metavar="FILE",
dest="output",
default="-",
help="File to write the difference to",
)
parser.add_option(
'-a', '--annotation',
action="store_true",
dest="annotation",
help="Do an annotation")
def main(args=None):
if args is None:
args = sys.argv[1:]
options, args = parser.parse_args(args)
if options.annotation:
return annotate(options, args)
if len(args) != 2:
print('Error: you must give two files')
parser.print_help()
sys.exit(1)
file1, file2 = args
input1 = read_file(file1)
input2 = read_file(file2)
body1 = split_body(input1)[1]
pre, body2, post = split_body(input2)
result = htmldiff(body1, body2)
result = pre + result + post
if options.output == '-':
if not result.endswith('\n'):
result += '\n'
sys.stdout.write(result)
else:
f = open(options.output, 'wb')
f.write(result)
f.close()
def read_file(filename):
if filename == '-':
c = sys.stdin.read()
elif not os.path.exists(filename):
raise OSError(
"Input file %s does not exist" % filename)
else:
f = open(filename, 'rb')
c = f.read()
f.close()
return c
body_start_re = re.compile(
r"<body.*?>", re.I|re.S)
body_end_re = re.compile(
r"</body.*?>", re.I|re.S)
def split_body(html):
match = body_start_re.search(html)
if match:
pre = html[:match.end()]
html = html[match.end():]
match = body_end_re.search(html)
if match:
post = html[match.start():]
html = html[:match.start()]
return pre, html, post
def annotate(options, args):
print("Not yet implemented")
sys.exit(1)
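
The script above is a thin command-line wrapper around lxml.html.diff.htmldiff; the same comparison can be done directly in code (the input strings here are illustrative):

from lxml.html.diff import htmldiff

old = "<p>Hello world</p>"
new = "<p>Hello there, world</p>"
print(htmldiff(old, new))
# roughly: <p>Hello <ins>there, </ins>world</p>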

View file

@@ -0,0 +1,100 @@
"""
Legacy module - don't use in new code!
html5lib now has its own proper implementation.
This module implements a tree builder for html5lib that generates lxml
html element trees. This module uses camelCase as it follows the
html5lib style guide.
"""
from html5lib.treebuilders import _base, etree as etree_builders
from lxml import html, etree
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self.childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
def __init__(self, *args, **kwargs):
html_builder = etree_builders.getETreeModule(html, fullTree=False)
etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
self.elementClass = html_builder.Element
self.commentClass = etree_builder.Comment
_base.TreeBuilder.__init__(self, *args, **kwargs)
def reset(self):
_base.TreeBuilder.reset(self)
self.rootInserted = False
self.initialComments = []
self.doctype = None
def getDocument(self):
return self.document._elementTree
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, name, publicId, systemId):
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertComment(self, data, parent=None):
if not self.rootInserted:
self.initialComments.append(data)
else:
_base.TreeBuilder.insertComment(self, data, parent)
def insertRoot(self, name):
buf = []
if self.doctype and self.doctype.name:
buf.append('<!DOCTYPE %s' % self.doctype.name)
if self.doctype.publicId is not None or self.doctype.systemId is not None:
buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
self.doctype.systemId))
buf.append('>')
buf.append('<html></html>')
root = html.fromstring(''.join(buf))
# Append the initial comments:
for comment in self.initialComments:
root.addprevious(etree.Comment(comment))
# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Add the root element to the internal child/open data structures
root_element = self.elementClass(name)
root_element._element = root
self.document.childNodes.append(root_element)
self.openElements.append(root_element)
self.rootInserted = True

View file

@@ -0,0 +1,52 @@
from collections import MutableSet
class SetMixin(MutableSet):
"""
Mix-in for sets. You must define __iter__, add, remove
"""
def __len__(self):
length = 0
for item in self:
length += 1
return length
def __contains__(self, item):
for has_item in self:
if item == has_item:
return True
return False
issubset = MutableSet.__le__
issuperset = MutableSet.__ge__
union = MutableSet.__or__
intersection = MutableSet.__and__
difference = MutableSet.__sub__
symmetric_difference = MutableSet.__xor__
def copy(self):
return set(self)
def update(self, other):
self |= other
def intersection_update(self, other):
self &= other
def difference_update(self, other):
self -= other
def symmetric_difference_update(self, other):
self ^= other
def discard(self, item):
try:
self.remove(item)
except KeyError:
pass
@classmethod
def _from_iterable(cls, it):
return set(it)
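
A minimal sketch of a SetMixin subclass, defining only the three required methods and letting the mix-in supply the rest of the set interface (the ListSet name is made up):

class ListSet(SetMixin):
    """Small ordered set backed by a list - illustration only."""
    def __init__(self, items=()):
        self._items = []
        for item in items:
            self.add(item)
    def __iter__(self):
        return iter(self._items)
    def add(self, item):
        if item not in self._items:
            self._items.append(item)
    def remove(self, item):
        self._items.remove(item)

s = ListSet([1, 2, 3])
s.update([3, 4])                 # provided by SetMixin via MutableSet.__ior__
print(len(s), 4 in s)            # 4 True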

View file

@@ -0,0 +1,133 @@
# --------------------------------------------------------------------
# The ElementTree toolkit is
# Copyright (c) 1999-2004 by Fredrik Lundh
# --------------------------------------------------------------------
"""
A set of HTML generator tags for building HTML documents.
Usage::
>>> from lxml.html.builder import *
>>> html = HTML(
... HEAD( TITLE("Hello World") ),
... BODY( CLASS("main"),
... H1("Hello World !")
... )
... )
>>> import lxml.etree
>>> print lxml.etree.tostring(html, pretty_print=True)
<html>
<head>
<title>Hello World</title>
</head>
<body class="main">
<h1>Hello World !</h1>
</body>
</html>
"""
from lxml.builder import ElementMaker
from lxml.html import html_parser
E = ElementMaker(makeelement=html_parser.makeelement)
# elements
A = E.a # anchor
ABBR = E.abbr # abbreviated form (e.g., WWW, HTTP, etc.)
ACRONYM = E.acronym #
ADDRESS = E.address # information on author
APPLET = E.applet # Java applet (DEPRECATED)
AREA = E.area # client-side image map area
B = E.b # bold text style
BASE = E.base # document base URI
BASEFONT = E.basefont # base font size (DEPRECATED)
BDO = E.bdo # I18N BiDi over-ride
BIG = E.big # large text style
BLOCKQUOTE = E.blockquote # long quotation
BODY = E.body # document body
BR = E.br # forced line break
BUTTON = E.button # push button
CAPTION = E.caption # table caption
CENTER = E.center # shorthand for DIV align=center (DEPRECATED)
CITE = E.cite # citation
CODE = E.code # computer code fragment
COL = E.col # table column
COLGROUP = E.colgroup # table column group
DD = E.dd # definition description
DEL = getattr(E, 'del') # deleted text
DFN = E.dfn # instance definition
DIR = E.dir # directory list (DEPRECATED)
DIV = E.div # generic language/style container
DL = E.dl # definition list
DT = E.dt # definition term
EM = E.em # emphasis
FIELDSET = E.fieldset # form control group
FONT = E.font # local change to font (DEPRECATED)
FORM = E.form # interactive form
FRAME = E.frame # subwindow
FRAMESET = E.frameset # window subdivision
H1 = E.h1 # heading
H2 = E.h2 # heading
H3 = E.h3 # heading
H4 = E.h4 # heading
H5 = E.h5 # heading
H6 = E.h6 # heading
HEAD = E.head # document head
HR = E.hr # horizontal rule
HTML = E.html # document root element
I = E.i # italic text style
IFRAME = E.iframe # inline subwindow
IMG = E.img # Embedded image
INPUT = E.input # form control
INS = E.ins # inserted text
ISINDEX = E.isindex # single line prompt (DEPRECATED)
KBD = E.kbd # text to be entered by the user
LABEL = E.label # form field label text
LEGEND = E.legend # fieldset legend
LI = E.li # list item
LINK = E.link # a media-independent link
MAP = E.map # client-side image map
MENU = E.menu # menu list (DEPRECATED)
META = E.meta # generic metainformation
NOFRAMES = E.noframes # alternate content container for non frame-based rendering
NOSCRIPT = E.noscript # alternate content container for non script-based rendering
OBJECT = E.object # generic embedded object
OL = E.ol # ordered list
OPTGROUP = E.optgroup # option group
OPTION = E.option # selectable choice
P = E.p # paragraph
PARAM = E.param # named property value
PRE = E.pre # preformatted text
Q = E.q # short inline quotation
S = E.s # strike-through text style (DEPRECATED)
SAMP = E.samp # sample program output, scripts, etc.
SCRIPT = E.script # script statements
SELECT = E.select # option selector
SMALL = E.small # small text style
SPAN = E.span # generic language/style container
STRIKE = E.strike # strike-through text (DEPRECATED)
STRONG = E.strong # strong emphasis
STYLE = E.style # style info
SUB = E.sub # subscript
SUP = E.sup # superscript
TABLE = E.table #
TBODY = E.tbody # table body
TD = E.td # table data cell
TEXTAREA = E.textarea # multi-line text field
TFOOT = E.tfoot # table footer
TH = E.th # table header cell
THEAD = E.thead # table header
TITLE = E.title # document title
TR = E.tr # table row
TT = E.tt # teletype or monospaced text style
U = E.u # underlined text style (DEPRECATED)
UL = E.ul # unordered list
VAR = E.var # instance of a variable or program argument
# attributes (only reserved words are included here)
ATTR = dict
def CLASS(v): return {'class': v}
def FOR(v): return {'for': v}
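
A short sketch using the attribute helpers (ATTR, CLASS and FOR wrap names that are reserved words in Python); the form markup is illustrative:

from lxml.html import tostring
from lxml.html.builder import ATTR, CLASS, DIV, FOR, INPUT, LABEL

row = DIV(
    CLASS("row"),
    LABEL("Name:", FOR("name")),
    INPUT(ATTR(type="text", name="name", id="name")),
)
print(tostring(row, pretty_print=True).decode())
# roughly: <div class="row"><label for="name">Name:</label>
#          <input type="text" name="name" id="name"></div>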

View file

@@ -0,0 +1,732 @@
"""A cleanup tool for HTML.
Removes unwanted tags and content. See the `Cleaner` class for
details.
"""
import re
import copy
try:
from urlparse import urlsplit
except ImportError:
# Python 3
from urllib.parse import urlsplit
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import xhtml_to_html, _transform_result
try:
unichr
except NameError:
# Python 3
unichr = chr
try:
unicode
except NameError:
# Python 3
unicode = str
try:
bytes
except NameError:
# Python < 2.6
bytes = str
try:
basestring
except NameError:
basestring = (str, bytes)
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
'word_break', 'word_break_html']
# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
# Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
# whitelisted instead?
# max height?
# remove images? Also in CSS? background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
# allow *just* embedded YouTube movies)
# Log what was deleted and why?
# style="behavior: ..." might be bad in IE?
# Should we have something for just <meta http-equiv>? That's the worst of the
# metas.
# UTF-7 detections? Example:
# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
# you don't always have to have the charset set, if the page has no charset
# and there's UTF7-like code in it.
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
r'expression\s*\(.*?\)', re.S|re.I)
# Do I have to worry about @\nimport?
_css_import_re = re.compile(
r'@\s*import', re.I)
# All kinds of schemes besides just javascript: that can cause
# execution:
_is_image_dataurl = re.compile(
r'^data:image/.+;base64', re.I).search
_is_possibly_malicious_scheme = re.compile(
r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
re.I).search
def _is_javascript_scheme(s):
if _is_image_dataurl(s):
return None
return _is_possibly_malicious_scheme(s)
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
# FIXME: should data: be blocked?
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
_find_styled_elements = etree.XPath(
"descendant-or-self::*[@style]")
_find_external_links = etree.XPath(
("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
"descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
namespaces={'x':XHTML_NAMESPACE})
class Cleaner(object):
"""
Instances clean the document of each of the possible offending
elements. The cleaning is controlled by attributes; you can
override attributes in a subclass, or set them in the constructor.
``scripts``:
Removes any ``<script>`` tags.
``javascript``:
Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
as they could contain Javascript.
``comments``:
Removes any comments.
``style``:
Removes any style tags or attributes.
``links``:
Removes any ``<link>`` tags
``meta``:
Removes any ``<meta>`` tags
``page_structure``:
Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
``processing_instructions``:
Removes any processing instructions.
``embedded``:
Removes any embedded objects (flash, iframes)
``frames``:
Removes any frame-related tags
``forms``:
Removes any form tags
``annoying_tags``:
Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
``remove_tags``:
A list of tags to remove. Only the tags will be removed,
their content will get pulled up into the parent tag.
``kill_tags``:
A list of tags to kill. Killing also removes the tag's content,
i.e. the whole subtree, not just the tag itself.
``allow_tags``:
A list of tags to include (default include all).
``remove_unknown_tags``:
Remove any tags that aren't standard parts of HTML.
``safe_attrs_only``:
If true, only include 'safe' attributes (specifically the list
from the feedparser HTML sanitisation web site).
``safe_attrs``:
A set of attribute names to override the default list of attributes
considered 'safe' (when safe_attrs_only=True).
``add_nofollow``:
If true, then any <a> tags will have ``rel="nofollow"`` added to them.
``host_whitelist``:
A list or set of hosts that you can use for embedded content
(for content like ``<object>``, ``<link rel="stylesheet">``, etc).
You can also implement/override the method
``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
implement more complex rules for what can be embedded.
Anything that passes this test will be shown, regardless of
the value of (for instance) ``embedded``.
Note that this parameter might not work as intended if you do not
make the links absolute before doing the cleaning.
Note that you may also need to set ``whitelist_tags``.
``whitelist_tags``:
A set of tags that can be included with ``host_whitelist``.
The default is ``iframe`` and ``embed``; you may wish to
include other tags like ``script``, or you may want to
implement ``allow_embedded_url`` for more control. Set to None to
include all tags.
This modifies the document *in place*.
"""
scripts = True
javascript = True
comments = True
style = False
links = True
meta = True
page_structure = True
processing_instructions = True
embedded = True
frames = True
forms = True
annoying_tags = True
remove_tags = None
allow_tags = None
kill_tags = None
remove_unknown_tags = True
safe_attrs_only = True
safe_attrs = defs.safe_attrs
add_nofollow = False
host_whitelist = ()
whitelist_tags = set(['iframe', 'embed'])
def __init__(self, **kw):
for name, value in kw.items():
if not hasattr(self, name):
raise TypeError(
"Unknown parameter: %s=%r" % (name, value))
setattr(self, name, value)
# Used to lookup the primary URL for a given tag that is up for
# removal:
_tag_link_attrs = dict(
script='src',
link='href',
# From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
# From what I can tell, both attributes can contain a link:
applet=['code', 'object'],
iframe='src',
embed='src',
layer='src',
# FIXME: there doesn't really seem like a general way to figure out what
# links an <object> tag uses; links often go in <param> tags with values
# that we don't really know. You'd have to have knowledge about specific
# kinds of plugins (probably keyed off classid), and match against those.
##object=?,
# FIXME: not looking at the action currently, because it is more complex
# than that -- if you keep the form, you should keep the form controls.
##form='action',
a='href',
)
def __call__(self, doc):
"""
Cleans the document.
"""
if hasattr(doc, 'getroot'):
# ElementTree instance, instead of an element
doc = doc.getroot()
# convert XHTML to HTML
xhtml_to_html(doc)
# Normalize a case that IE treats <image> like <img>, and that
# can confuse either this step or later steps.
for el in doc.iter('image'):
el.tag = 'img'
if not self.comments:
# Of course, if we were going to kill comments anyway, we don't
# need to worry about this
self.kill_conditional_comments(doc)
kill_tags = set(self.kill_tags or ())
remove_tags = set(self.remove_tags or ())
allow_tags = set(self.allow_tags or ())
if self.scripts:
kill_tags.add('script')
if self.safe_attrs_only:
safe_attrs = set(self.safe_attrs)
for el in doc.iter(etree.Element):
attrib = el.attrib
for aname in attrib.keys():
if aname not in safe_attrs:
del attrib[aname]
if self.javascript:
if not (self.safe_attrs_only and
self.safe_attrs == defs.safe_attrs):
# safe_attrs handles events attributes itself
for el in doc.iter(etree.Element):
attrib = el.attrib
for aname in attrib.keys():
if aname.startswith('on'):
del attrib[aname]
doc.rewrite_links(self._remove_javascript_link,
resolve_base_href=False)
if not self.style:
# If we're deleting style then we don't have to remove JS links
# from styles, otherwise...
for el in _find_styled_elements(doc):
old = el.get('style')
new = _css_javascript_re.sub('', old)
new = _css_import_re.sub('', new)
if self._has_sneaky_javascript(new):
# Something tricky is going on...
del el.attrib['style']
elif new != old:
el.set('style', new)
for el in list(doc.iter('style')):
if el.get('type', '').lower().strip() == 'text/javascript':
el.drop_tree()
continue
old = el.text or ''
new = _css_javascript_re.sub('', old)
# The imported CSS can do anything; we just can't allow:
new = _css_import_re.sub('', new)
if self._has_sneaky_javascript(new):
# Something tricky is going on...
el.text = '/* deleted */'
elif new != old:
el.text = new
if self.comments or self.processing_instructions:
# FIXME: why either? I feel like there's some obscure reason
# because you can put PIs in comments...? But I've already
# forgotten it
kill_tags.add(etree.Comment)
if self.processing_instructions:
kill_tags.add(etree.ProcessingInstruction)
if self.style:
kill_tags.add('style')
etree.strip_attributes(doc, 'style')
if self.links:
kill_tags.add('link')
elif self.style or self.javascript:
# We must get rid of included stylesheets if Javascript is not
# allowed, as you can put Javascript in them
for el in list(doc.iter('link')):
if 'stylesheet' in el.get('rel', '').lower():
# Note this kills alternate stylesheets as well
if not self.allow_element(el):
el.drop_tree()
if self.meta:
kill_tags.add('meta')
if self.page_structure:
remove_tags.update(('head', 'html', 'title'))
if self.embedded:
# FIXME: is <layer> really embedded?
# We should get rid of any <param> tags not inside <applet>;
# These are not really valid anyway.
for el in list(doc.iter('param')):
found_parent = False
parent = el.getparent()
while parent is not None and parent.tag not in ('applet', 'object'):
parent = parent.getparent()
if parent is None:
el.drop_tree()
kill_tags.update(('applet',))
# The alternate contents that are in an iframe are a good fallback:
remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
if self.frames:
# FIXME: ideally we should look at the frame links, but
# generally frames don't mix properly with an HTML
# fragment anyway.
kill_tags.update(defs.frame_tags)
if self.forms:
remove_tags.add('form')
kill_tags.update(('button', 'input', 'select', 'textarea'))
if self.annoying_tags:
remove_tags.update(('blink', 'marquee'))
_remove = []
_kill = []
for el in doc.iter():
if el.tag in kill_tags:
if self.allow_element(el):
continue
_kill.append(el)
elif el.tag in remove_tags:
if self.allow_element(el):
continue
_remove.append(el)
if _remove and _remove[0] == doc:
# We have to drop the parent-most tag, which we can't
# do. Instead we'll rewrite it:
el = _remove.pop(0)
el.tag = 'div'
el.attrib.clear()
elif _kill and _kill[0] == doc:
# We have to drop the parent-most element, which we can't
# do. Instead we'll clear it:
el = _kill.pop(0)
if el.tag != 'html':
el.tag = 'div'
el.clear()
_kill.reverse() # start with innermost tags
for el in _kill:
el.drop_tree()
for el in _remove:
el.drop_tag()
if self.remove_unknown_tags:
if allow_tags:
raise ValueError(
"It does not make sense to pass in both allow_tags and remove_unknown_tags")
allow_tags = set(defs.tags)
if allow_tags:
bad = []
for el in doc.iter():
if el.tag not in allow_tags:
bad.append(el)
if bad:
if bad[0] is doc:
el = bad.pop(0)
el.tag = 'div'
el.attrib.clear()
for el in bad:
el.drop_tag()
if self.add_nofollow:
for el in _find_external_links(doc):
if not self.allow_follow(el):
rel = el.get('rel')
if rel:
if ('nofollow' in rel
and ' nofollow ' in (' %s ' % rel)):
continue
rel = '%s nofollow' % rel
else:
rel = 'nofollow'
el.set('rel', rel)
def allow_follow(self, anchor):
"""
Override to suppress rel="nofollow" on some anchors.
"""
return False
def allow_element(self, el):
if el.tag not in self._tag_link_attrs:
return False
attr = self._tag_link_attrs[el.tag]
if isinstance(attr, (list, tuple)):
for one_attr in attr:
url = el.get(one_attr)
if not url:
return False
if not self.allow_embedded_url(el, url):
return False
return True
else:
url = el.get(attr)
if not url:
return False
return self.allow_embedded_url(el, url)
def allow_embedded_url(self, el, url):
if (self.whitelist_tags is not None
and el.tag not in self.whitelist_tags):
return False
scheme, netloc, path, query, fragment = urlsplit(url)
netloc = netloc.lower().split(':', 1)[0]
if scheme not in ('http', 'https'):
return False
if netloc in self.host_whitelist:
return True
return False
def kill_conditional_comments(self, doc):
"""
IE conditional comments basically embed HTML that the parser
doesn't normally see. We can't allow anything like that, so
we'll kill any comments that could be conditional.
"""
bad = []
self._kill_elements(
doc, lambda el: _conditional_comment_re.search(el.text),
etree.Comment)
def _kill_elements(self, doc, condition, iterate=None):
bad = []
for el in doc.iter(iterate):
if condition(el):
bad.append(el)
for el in bad:
el.drop_tree()
def _remove_javascript_link(self, link):
# links like "j a v a s c r i p t:" might be interpreted in IE
new = _substitute_whitespace('', link)
if _is_javascript_scheme(new):
# FIXME: should this be None to delete?
return ''
return link
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
def _has_sneaky_javascript(self, style):
"""
Depending on the browser, stuff like ``e x p r e s s i o n(...)``
can get interpreted, or ``expre/* stuff */ssion(...)``. This
checks for attempts to do stuff like this.
Typically the response will be to kill the entire style; if you
have just a bit of Javascript in the style another rule will catch
that and remove only the Javascript from the style; this catches
more sneaky attempts.
"""
style = self._substitute_comments('', style)
style = style.replace('\\', '')
style = _substitute_whitespace('', style)
style = style.lower()
if 'javascript:' in style:
return True
if 'expression(' in style:
return True
return False
def clean_html(self, html):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
self(doc)
return _transform_result(result_type, doc)
clean = Cleaner()
clean_html = clean.clean_html
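# A minimal usage sketch (illustrative only; the host name below is an
# invented example).  A Cleaner instance is configured once and then
# applied repeatedly; ``clean_html`` returns the same type it was given
# (string in, string out; element in, element out):
#
#   >>> from lxml.html.clean import Cleaner
#   >>> cleaner = Cleaner(host_whitelist=['www.youtube.com'])
#   >>> cleaner.clean_html('<div onclick="go()"><script>x()</script>Hi</div>')
#
# For finer-grained control over which embedded URLs survive, subclass
# Cleaner and override ``allow_embedded_url(el, url)``.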
############################################################
## Autolinking
############################################################
_link_regexes = [
re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
# This is conservative, but autolinking can be a bit conservative:
re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
]
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
_avoid_hosts = [
re.compile(r'^localhost', re.I),
re.compile(r'\bexample\.(?:com|org|net)$', re.I),
re.compile(r'^127\.0\.0\.1$'),
]
_avoid_classes = ['nolink']
def autolink(el, link_regexes=_link_regexes,
avoid_elements=_avoid_elements,
avoid_hosts=_avoid_hosts,
avoid_classes=_avoid_classes):
"""
Turn any URLs into links.
It will search for links identified by the given regular
expressions (by default mailto and http(s) links).
It won't link text in an element in avoid_elements, or an element
with a class in avoid_classes. It won't link to anything with a
host that matches one of the regular expressions in avoid_hosts
(default localhost and 127.0.0.1).
If you pass in an element, the element's tail will not be
substituted, only the contents of the element.
"""
if el.tag in avoid_elements:
return
class_name = el.get('class')
if class_name:
class_name = class_name.split()
for match_class in avoid_classes:
if match_class in class_name:
return
for child in list(el):
autolink(child, link_regexes=link_regexes,
avoid_elements=avoid_elements,
avoid_hosts=avoid_hosts,
avoid_classes=avoid_classes)
if child.tail:
text, tail_children = _link_text(
child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
if tail_children:
child.tail = text
index = el.index(child)
el[index+1:index+1] = tail_children
if el.text:
text, pre_children = _link_text(
el.text, link_regexes, avoid_hosts, factory=el.makeelement)
if pre_children:
el.text = text
el[:0] = pre_children
def _link_text(text, link_regexes, avoid_hosts, factory):
leading_text = ''
links = []
last_pos = 0
while 1:
best_match, best_pos = None, None
for regex in link_regexes:
regex_pos = last_pos
while 1:
match = regex.search(text, pos=regex_pos)
if match is None:
break
host = match.group('host')
for host_regex in avoid_hosts:
if host_regex.search(host):
regex_pos = match.end()
break
else:
break
if match is None:
continue
if best_pos is None or match.start() < best_pos:
best_match = match
best_pos = match.start()
if best_match is None:
# No more matches
if links:
assert not links[-1].tail
links[-1].tail = text
else:
assert not leading_text
leading_text = text
break
link = best_match.group(0)
end = best_match.end()
if link.endswith('.') or link.endswith(','):
# These punctuation marks shouldn't end a link
end -= 1
link = link[:-1]
prev_text = text[:best_match.start()]
if links:
assert not links[-1].tail
links[-1].tail = prev_text
else:
assert not leading_text
leading_text = prev_text
anchor = factory('a')
anchor.set('href', link)
body = best_match.group('body')
if not body:
body = link
if body.endswith('.') or body.endswith(','):
body = body[:-1]
anchor.text = body
links.append(anchor)
text = text[end:]
return leading_text, links
def autolink_html(html, *args, **kw):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
autolink(doc, *args, **kw)
return _transform_result(result_type, doc)
autolink_html.__doc__ = autolink.__doc__
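# A minimal usage sketch (the URL is an invented example).  Hosts that
# match ``_avoid_hosts`` -- localhost, 127.0.0.1 and example.com/org/net
# -- are deliberately left unlinked:
#
#   >>> from lxml.html.clean import autolink_html
#   >>> autolink_html('<p>See http://lxml.de/ for details.</p>')
#
# The result is the same markup with the bare URL wrapped in an
# ``<a href="...">`` element.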
############################################################
## Word wrapping
############################################################
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
avoid_elements=_avoid_word_break_elements,
avoid_classes=_avoid_word_break_classes,
break_character=unichr(0x200b)):
"""
Breaks any long words found in the body of the text (not attributes).
Doesn't affect any of the tags in avoid_elements (by default
``<pre>``, ``<textarea>`` and ``<code>``).
Breaks words by inserting &#8203;, the Unicode zero-width space
character.  This generally takes up no space in rendering, but does
copy as a space, and in monospace contexts usually takes up space.
See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
"""
# Character suggestion of &#8203 comes from:
# http://www.cs.tut.fi/~jkorpela/html/nobr.html
if el.tag in avoid_elements:
return
class_name = el.get('class')
if class_name:
dont_break = False
class_name = class_name.split()
for avoid in avoid_classes:
if avoid in class_name:
dont_break = True
break
if dont_break:
return
if el.text:
el.text = _break_text(el.text, max_width, break_character)
for child in el:
word_break(child, max_width=max_width,
avoid_elements=avoid_elements,
avoid_classes=avoid_classes,
break_character=break_character)
if child.tail:
child.tail = _break_text(child.tail, max_width, break_character)
def word_break_html(html, *args, **kw):
result_type = type(html)
doc = fromstring(html)
word_break(doc, *args, **kw)
return _transform_result(result_type, doc)
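# A minimal usage sketch (the long token is an invented example).  Each
# run of ``max_width`` characters (40 by default) gets a U+200B
# zero-width space appended so that browsers can wrap the otherwise
# unbreakable string:
#
#   >>> from lxml.html.clean import word_break_html
#   >>> word_break_html(u'<p>%s</p>' % ('x' * 100))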
def _break_text(text, max_width, break_character):
words = text.split()
for word in words:
if len(word) > max_width:
replacement = _insert_break(word, max_width, break_character)
text = text.replace(word, replacement)
return text
_break_prefer_re = re.compile(r'[^a-z]', re.I)
def _insert_break(word, width, break_character):
orig_word = word
result = ''
while len(word) > width:
start = word[:width]
breaks = list(_break_prefer_re.finditer(start))
if breaks:
last_break = breaks[-1]
# Only walk back up to 10 characters to find a nice break:
if last_break.end() > width-10:
# FIXME: should the break character be at the end of the
# chunk, or the beginning of the next chunk?
start = word[:last_break.end()]
result += start + break_character
word = word[len(start):]
result += word
return result

View file

@ -0,0 +1,137 @@
# FIXME: this should all be confirmed against what a DTD says
# (probably in a test; this may not match the DTD exactly, but we
# should document just how it differs).
# Data taken from http://www.w3.org/TR/html401/index/elements.html
# and http://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
# for html5_tags.
try:
frozenset
except NameError:
from sets import Set as frozenset
empty_tags = frozenset([
'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
'img', 'input', 'isindex', 'link', 'meta', 'param'])
deprecated_tags = frozenset([
'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
'menu', 's', 'strike', 'u'])
# archive actually takes a space-separated list of URIs
link_attrs = frozenset([
'action', 'archive', 'background', 'cite', 'classid',
'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
'usemap',
# Not standard:
'dynsrc', 'lowsrc',
])
# Not in the HTML 4 spec:
# onerror, onresize
event_attrs = frozenset([
'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
'onunload',
])
safe_attrs = frozenset([
'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = frozenset([
'html', 'head', 'body', 'frameset',
])
head_tags = frozenset([
'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
])
general_block_tags = frozenset([
'address',
'blockquote',
'center',
'del',
'div',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'hr',
'ins',
'isindex',
'noscript',
'p',
'pre',
])
list_tags = frozenset([
'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
])
table_tags = frozenset([
'table', 'caption', 'colgroup', 'col',
'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
])
# just this one from
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
block_tags = general_block_tags | list_tags | table_tags | frozenset([
# Partial form tags
'fieldset', 'form', 'legend', 'optgroup', 'option',
])
form_tags = frozenset([
'form', 'button', 'fieldset', 'legend', 'input', 'label',
'select', 'optgroup', 'option', 'textarea',
])
special_inline_tags = frozenset([
'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
'img', 'map', 'area', 'object', 'param', 'q', 'script',
'span', 'sub', 'sup',
])
phrase_tags = frozenset([
'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
'ins', 'kbd', 'samp', 'strong', 'var',
])
font_style_tags = frozenset([
'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
])
frame_tags = frozenset([
'frameset', 'frame', 'noframes',
])
html5_tags = frozenset([
'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
'svg', 'time', 'track', 'video', 'wbr'
])
# These tags aren't standard
nonstandard_tags = frozenset(['blink', 'marquee'])
tags = (top_level_tags | head_tags | general_block_tags | list_tags
| table_tags | form_tags | special_inline_tags | phrase_tags
| font_style_tags | nonstandard_tags | html5_tags)
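# A few illustrative membership checks (these follow directly from the
# definitions above):
#
#   >>> 'video' in html5_tags
#   True
#   >>> 'script' in head_tags
#   True
#   >>> 'blink' in tags          # nonstandard, but still in the default set
#   True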

View file

@ -0,0 +1,881 @@
import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re
__all__ = ['html_annotate', 'htmldiff']
try:
from html import escape as html_escape
except ImportError:
from cgi import escape as html_escape
try:
_unicode = unicode
except NameError:
# Python 3
_unicode = str
try:
basestring
except NameError:
# Python 3
basestring = str
############################################################
## Annotation
############################################################
def default_markup(text, version):
return '<span title="%s">%s</span>' % (
html_escape(_unicode(version), 1), text)
def html_annotate(doclist, markup=default_markup):
"""
doclist should be ordered from oldest to newest, like::
>>> version1 = 'Hello World'
>>> version2 = 'Goodbye World'
>>> print(html_annotate([(version1, 'version 1'),
... (version2, 'version 2')]))
<span title="version 2">Goodbye</span> <span title="version 1">World</span>
The documents must be *fragments* (str/UTF8 or unicode), not
complete documents
The markup argument is a function to markup the spans of words.
This function is called like markup('Hello', 'version 2'), and
returns HTML. The first argument is text and never includes any
markup. The default uses a span with a title:
>>> print(default_markup('Some Text', 'by Joe'))
<span title="by Joe">Some Text</span>
"""
# The basic strategy we have is to split the documents up into
# logical tokens (which are words with attached markup). We then
# do diffs of each of the versions to track when a token first
# appeared in the document; the annotation attached to the token
# is the version where it first appeared.
tokenlist = [tokenize_annotated(doc, version)
for doc, version in doclist]
cur_tokens = tokenlist[0]
for tokens in tokenlist[1:]:
html_annotate_merge_annotations(cur_tokens, tokens)
cur_tokens = tokens
# After we've tracked all the tokens, we can combine spans of text
# that are adjacent and have the same annotation
cur_tokens = compress_tokens(cur_tokens)
# And finally add markup
result = markup_serialize_tokens(cur_tokens, markup)
return ''.join(result).strip()
def tokenize_annotated(doc, annotation):
"""Tokenize a document and add an annotation attribute to each token
"""
tokens = tokenize(doc, include_hrefs=False)
for tok in tokens:
tok.annotation = annotation
return tokens
def html_annotate_merge_annotations(tokens_old, tokens_new):
"""Merge the annotations from tokens_old into tokens_new, when the
tokens in the new document already existed in the old document.
"""
s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
commands = s.get_opcodes()
for command, i1, i2, j1, j2 in commands:
if command == 'equal':
eq_old = tokens_old[i1:i2]
eq_new = tokens_new[j1:j2]
copy_annotations(eq_old, eq_new)
def copy_annotations(src, dest):
"""
Copy annotations from the tokens listed in src to the tokens in dest
"""
assert len(src) == len(dest)
for src_tok, dest_tok in zip(src, dest):
dest_tok.annotation = src_tok.annotation
def compress_tokens(tokens):
"""
Combine adjacent tokens when there is no HTML between the tokens,
and they share an annotation
"""
result = [tokens[0]]
for tok in tokens[1:]:
if (not result[-1].post_tags and
not tok.pre_tags and
result[-1].annotation == tok.annotation):
compress_merge_back(result, tok)
else:
result.append(tok)
return result
def compress_merge_back(tokens, tok):
""" Merge tok into the last element of tokens (modifying the list of
tokens in-place). """
last = tokens[-1]
if type(last) is not token or type(tok) is not token:
tokens.append(tok)
else:
text = _unicode(last)
if last.trailing_whitespace:
text += last.trailing_whitespace
text += tok
merged = token(text,
pre_tags=last.pre_tags,
post_tags=tok.post_tags,
trailing_whitespace=tok.trailing_whitespace)
merged.annotation = last.annotation
tokens[-1] = merged
def markup_serialize_tokens(tokens, markup_func):
"""
Serialize the list of tokens into a list of text chunks, calling
markup_func around text to add annotations.
"""
for token in tokens:
for pre in token.pre_tags:
yield pre
html = token.html()
html = markup_func(html, token.annotation)
if token.trailing_whitespace:
html += token.trailing_whitespace
yield html
for post in token.post_tags:
yield post
############################################################
## HTML Diffs
############################################################
def htmldiff(old_html, new_html):
## FIXME: this should take parsed documents too, and use their body
## or other content.
""" Do a diff of the old and new document. The documents are HTML
*fragments* (str/UTF8 or unicode), they are not complete documents
(i.e., no <html> tag).
Returns HTML with <ins> and <del> tags added around the
appropriate text.
Markup is generally ignored, with the markup from new_html
preserved, and possibly some markup from old_html (though it is
considered acceptable to lose some of the old markup). Only the
words in the HTML are diffed. The exception is <img> tags, which
are treated like words, and the href attribute of <a> tags, which
are noted inside the tag itself when there are changes.
"""
old_html_tokens = tokenize(old_html)
new_html_tokens = tokenize(new_html)
result = htmldiff_tokens(old_html_tokens, new_html_tokens)
result = ''.join(result).strip()
return fixup_ins_del_tags(result)
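# A minimal usage sketch (both fragments are invented examples).  The
# return value is an HTML string in which changed words are wrapped in
# <ins> and <del> tags:
#
#   >>> from lxml.html.diff import htmldiff
#   >>> htmldiff('<p>Here is some text.</p>',
#   ...          '<p>Here is a lot of new text.</p>')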
def htmldiff_tokens(html1_tokens, html2_tokens):
""" Does a diff on the tokens themselves, returning a list of text
chunks (not tokens).
"""
# There are several passes as we do the differences. The tokens
# isolate the portion of the content we care to diff; difflib does
# all the actual hard work at that point.
#
# Then we must create a valid document from pieces of both the old
# document and the new document. We generally prefer to take
# markup from the new document, and only do a best effort attempt
# to keep markup from the old document; anything that we can't
# resolve we throw away. Also we try to put the deletes as close
# to the location where we think they would have been -- because
# we are only keeping the markup from the new document, it can be
# fuzzy where in the new document the old text would have gone.
# Again we just do a best effort attempt.
s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
commands = s.get_opcodes()
result = []
for command, i1, i2, j1, j2 in commands:
if command == 'equal':
result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
continue
if command == 'insert' or command == 'replace':
ins_tokens = expand_tokens(html2_tokens[j1:j2])
merge_insert(ins_tokens, result)
if command == 'delete' or command == 'replace':
del_tokens = expand_tokens(html1_tokens[i1:i2])
merge_delete(del_tokens, result)
# If deletes were inserted directly as <del> then we'd have an
# invalid document at this point. Instead we put in special
# markers, and when the complete diffed document has been created
# we try to move the deletes around and resolve any problems.
result = cleanup_delete(result)
return result
def expand_tokens(tokens, equal=False):
"""Given a list of tokens, return a generator of the chunks of
text for the data in the tokens.
"""
for token in tokens:
for pre in token.pre_tags:
yield pre
if not equal or not token.hide_when_equal:
if token.trailing_whitespace:
yield token.html() + token.trailing_whitespace
else:
yield token.html()
for post in token.post_tags:
yield post
def merge_insert(ins_chunks, doc):
""" doc is the already-handled document (as a list of text chunks);
here we add <ins>ins_chunks</ins> to the end of that. """
# Though we don't throw away unbalanced_start or unbalanced_end
# (we assume there is accompanying markup later or earlier in the
# document), we only put <ins> around the balanced portion.
unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
doc.extend(unbalanced_start)
if doc and not doc[-1].endswith(' '):
# Fix up the case where the word before the insert didn't end with
# a space
doc[-1] += ' '
doc.append('<ins>')
if balanced and balanced[-1].endswith(' '):
# We move space outside of </ins>
balanced[-1] = balanced[-1][:-1]
doc.extend(balanced)
doc.append('</ins> ')
doc.extend(unbalanced_end)
# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
pass
class DEL_END:
pass
class NoDeletes(Exception):
""" Raised when the document no longer contains any pending deletes
(DEL_START/DEL_END) """
def merge_delete(del_chunks, doc):
""" Adds the text chunks in del_chunks to the document doc (another
list of text chunks) with marker to show it is a delete.
cleanup_delete later resolves these markers into <del> tags."""
doc.append(DEL_START)
doc.extend(del_chunks)
doc.append(DEL_END)
def cleanup_delete(chunks):
""" Cleans up any DEL_START/DEL_END markers in the document, replacing
them with <del></del>. To do this while keeping the document
valid, it may need to drop some tags (either start or end tags).
It may also move the del into adjacent tags to try to move it to a
similar location where it was originally located (e.g., moving a
delete into preceding <div> tag, if the del looks like (DEL_START,
'Text</div>', DEL_END)"""
while 1:
# Find a pending DEL_START/DEL_END, splitting the document
# into stuff-preceding-DEL_START, stuff-inside, and
# stuff-following-DEL_END
try:
pre_delete, delete, post_delete = split_delete(chunks)
except NoDeletes:
# Nothing found, we've cleaned up the entire doc
break
# The stuff-inside-DEL_START/END may not be well balanced
# markup. First we figure out what unbalanced portions there are:
unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
# Then we move the span forward and/or backward based on these
# unbalanced portions:
locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
doc = pre_delete
if doc and not doc[-1].endswith(' '):
# Fix up case where the word before us didn't have a trailing space
doc[-1] += ' '
doc.append('<del>')
if balanced and balanced[-1].endswith(' '):
# We move space outside of </del>
balanced[-1] = balanced[-1][:-1]
doc.extend(balanced)
doc.append('</del> ')
doc.extend(post_delete)
chunks = doc
return chunks
def split_unbalanced(chunks):
"""Return (unbalanced_start, balanced, unbalanced_end), where each is
a list of text and tag chunks.
unbalanced_start is a list of all the tags that are opened, but
not closed in this span. Similarly, unbalanced_end is a list of
tags that are closed but were not opened. Extracting these might
mean some reordering of the chunks."""
start = []
end = []
tag_stack = []
balanced = []
for chunk in chunks:
if not chunk.startswith('<'):
balanced.append(chunk)
continue
endtag = chunk[1] == '/'
name = chunk.split()[0].strip('<>/')
if name in empty_tags:
balanced.append(chunk)
continue
if endtag:
if tag_stack and tag_stack[-1][0] == name:
balanced.append(chunk)
name, pos, tag = tag_stack.pop()
balanced[pos] = tag
elif tag_stack:
start.extend([tag for name, pos, tag in tag_stack])
tag_stack = []
end.append(chunk)
else:
end.append(chunk)
else:
tag_stack.append((name, len(balanced), chunk))
balanced.append(None)
start.extend(
[chunk for name, pos, chunk in tag_stack])
balanced = [chunk for chunk in balanced if chunk is not None]
return start, balanced, end
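# An illustrative call (the chunks are invented for the example).  The
# opening <b> is never closed inside the span, so it is reported as
# unbalanced-start, and the stray </div> as unbalanced-end:
#
#   >>> split_unbalanced(['<b>', 'hi', '</div>'])
#   (['<b>'], ['hi'], ['</div>'])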
def split_delete(chunks):
""" Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
stuff_after_DEL_END). Returns the first case found (there may be
more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
there's no DEL_START found. """
try:
pos = chunks.index(DEL_START)
except ValueError:
raise NoDeletes
pos2 = chunks.index(DEL_END)
return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
""" pre_delete and post_delete implicitly point to a place in the
document (where the two were split). This moves that point (by
popping items from one and pushing them onto the other). It moves
the point to try to find a place where unbalanced_start applies.
As an example::
>>> unbalanced_start = ['<div>']
>>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
>>> pre, post = doc[:3], doc[3:]
>>> pre, post
(['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
>>> locate_unbalanced_start(unbalanced_start, pre, post)
>>> pre, post
(['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
As you can see, we moved the point so that the dangling <div> that
we found will be effectively replaced by the div in the original
document. If this doesn't work out, we just throw away
unbalanced_start without doing anything.
"""
while 1:
if not unbalanced_start:
# We have totally succeeded in finding the position
break
finding = unbalanced_start[0]
finding_name = finding.split()[0].strip('<>')
if not post_delete:
break
next = post_delete[0]
if next is DEL_START or not next.startswith('<'):
# Reached a word, we can't move the delete text forward
break
if next[1] == '/':
# Reached a closing tag, can we go further? Maybe not...
break
name = next.split()[0].strip('<>')
if name == 'ins':
# Can't move into an insert
break
assert name != 'del', (
"Unexpected delete tag: %r" % next)
if name == finding_name:
unbalanced_start.pop(0)
pre_delete.append(post_delete.pop(0))
else:
# Found a tag that doesn't match
break
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
""" like locate_unbalanced_start, except handling end tags and
possibly moving the point earlier in the document. """
while 1:
if not unbalanced_end:
# Success
break
finding = unbalanced_end[-1]
finding_name = finding.split()[0].strip('<>/')
if not pre_delete:
break
next = pre_delete[-1]
if next is DEL_END or not next.startswith('</'):
# A word or a start tag
break
name = next.split()[0].strip('<>/')
if name == 'ins' or name == 'del':
# Can't move into an insert or delete
break
if name == finding_name:
unbalanced_end.pop()
post_delete.insert(0, pre_delete.pop())
else:
# Found a tag that doesn't match
break
class token(_unicode):
""" Represents a diffable token, generally a word that is displayed to
the user. Opening tags are attached to this token when they are
adjacent (pre_tags) and closing tags that follow the word
(post_tags). Some exceptions occur when there are empty tags
adjacent to a word, so there may be close tags in pre_tags, or
open tags in post_tags.
We also keep track of whether the word was originally followed by
whitespace, even though we do not want to treat the word as
equivalent to a similar word that does not have a trailing
space."""
# When this is true, the token will be eliminated from the
# displayed diff if no change has occurred:
hide_when_equal = False
def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
obj = _unicode.__new__(cls, text)
if pre_tags is not None:
obj.pre_tags = pre_tags
else:
obj.pre_tags = []
if post_tags is not None:
obj.post_tags = post_tags
else:
obj.post_tags = []
obj.trailing_whitespace = trailing_whitespace
return obj
def __repr__(self):
return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
self.post_tags, self.trailing_whitespace)
def html(self):
return _unicode(self)
class tag_token(token):
""" Represents a token that is actually a tag. Currently this is just
the <img> tag, which takes up visible space just like a word but
is only represented in a document by a tag. """
def __new__(cls, tag, data, html_repr, pre_tags=None,
post_tags=None, trailing_whitespace=""):
obj = token.__new__(cls, "%s: %s" % (tag, data),
pre_tags=pre_tags,
post_tags=post_tags,
trailing_whitespace=trailing_whitespace)
obj.tag = tag
obj.data = data
obj.html_repr = html_repr
return obj
def __repr__(self):
return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
self.tag,
self.data,
self.html_repr,
self.pre_tags,
self.post_tags,
self.trailing_whitespace)
def html(self):
return self.html_repr
class href_token(token):
""" Represents the href in an anchor tag. Unlike other words, we only
show the href when it changes. """
hide_when_equal = True
def html(self):
return ' Link: %s' % self
def tokenize(html, include_hrefs=True):
"""
Parse the given HTML and return token objects (words with attached tags).
This parses only the content of a page; anything in the head is
ignored, and the <head> and <body> elements are themselves
optional. The content is then parsed by lxml, which ensures the
validity of the resulting parsed document (though lxml may make
incorrect guesses when the markup is particularly bad).
<ins> and <del> tags are also eliminated from the document, as
that gets confusing.
If include_hrefs is true, then the href attribute of <a> tags is
included as a special kind of diffable token."""
if etree.iselement(html):
body_el = html
else:
body_el = parse_html(html, cleanup=True)
# Then we split the document into text chunks for each tag, word, and end tag:
chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
# Finally re-joining them into token objects:
return fixup_chunks(chunks)
def parse_html(html, cleanup=True):
"""
Parses an HTML fragment, returning an lxml element. Note that the HTML will be
wrapped in a <div> tag that was not in the original document.
If cleanup is true, make sure there's no <head> or <body>, and get
rid of any <ins> and <del> tags.
"""
if cleanup:
# This removes any extra markup or structure like <head>:
html = cleanup_html(html)
return fragment_fromstring(html, create_parent=True)
_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
def cleanup_html(html):
""" This 'cleans' the HTML, meaning that any page structure is removed
(only the contents of <body> are used, if there is a <body>).
Also <ins> and <del> tags are removed. """
match = _body_re.search(html)
if match:
html = html[match.end():]
match = _end_body_re.search(html)
if match:
html = html[:match.start()]
html = _ins_del_re.sub('', html)
return html
end_whitespace_re = re.compile(r'[ \t\n\r]$')
def split_trailing_whitespace(word):
"""
This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
"""
stripped_length = len(word.rstrip())
return word[0:stripped_length], word[stripped_length:]
def fixup_chunks(chunks):
"""
This function takes a list of chunks and produces a list of tokens.
"""
tag_accum = []
cur_word = None
result = []
for chunk in chunks:
if isinstance(chunk, tuple):
if chunk[0] == 'img':
src = chunk[1]
tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
cur_word = tag_token('img', src, html_repr=tag,
pre_tags=tag_accum,
trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
elif chunk[0] == 'href':
href = chunk[1]
cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
tag_accum = []
result.append(cur_word)
continue
if is_word(chunk):
chunk, trailing_whitespace = split_trailing_whitespace(chunk)
cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
elif is_start_tag(chunk):
tag_accum.append(chunk)
elif is_end_tag(chunk):
if tag_accum:
tag_accum.append(chunk)
else:
assert cur_word, (
"Weird state, cur_word=%r, result=%r, chunks=%r of %r"
% (cur_word, result, chunk, chunks))
cur_word.post_tags.append(chunk)
else:
assert(0)
if not result:
return [token('', pre_tags=tag_accum)]
else:
result[-1].post_tags.extend(tag_accum)
return result
# All the tags in HTML that don't require end tags:
empty_tags = (
'param', 'img', 'area', 'br', 'basefont', 'input',
'base', 'meta', 'link', 'col')
block_level_tags = (
'address',
'blockquote',
'center',
'dir',
'div',
'dl',
'fieldset',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'hr',
'isindex',
'menu',
'noframes',
'noscript',
'ol',
'p',
'pre',
'table',
'ul',
)
block_level_container_tags = (
'dd',
'dt',
'frameset',
'li',
'tbody',
'td',
'tfoot',
'th',
'thead',
'tr',
)
def flatten_el(el, include_hrefs, skip_tag=False):
""" Takes an lxml element el, and generates all the text chunks for
that tag. Each start tag is a chunk, each word is a chunk, and each
end tag is a chunk.
If skip_tag is true, then the outermost container tag is
not returned (just its contents)."""
if not skip_tag:
if el.tag == 'img':
yield ('img', el.get('src'), start_tag(el))
else:
yield start_tag(el)
if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
return
start_words = split_words(el.text)
for word in start_words:
yield html_escape(word)
for child in el:
for item in flatten_el(child, include_hrefs=include_hrefs):
yield item
if el.tag == 'a' and el.get('href') and include_hrefs:
yield ('href', el.get('href'))
if not skip_tag:
yield end_tag(el)
end_words = split_words(el.tail)
for word in end_words:
yield html_escape(word)
split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
def split_words(text):
""" Splits some text into words. Includes trailing whitespace
on each word when appropriate. """
if not text or not text.strip():
return []
words = split_words_re.findall(text)
return words
start_whitespace_re = re.compile(r'^[ \t\n\r]')
def start_tag(el):
"""
The text representation of the start tag for a tag.
"""
return '<%s%s>' % (
el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
for name, value in el.attrib.items()]))
def end_tag(el):
""" The text representation of an end tag for a tag. Includes
trailing whitespace when appropriate. """
if el.tail and start_whitespace_re.search(el.tail):
extra = ' '
else:
extra = ''
return '</%s>%s' % (el.tag, extra)
def is_word(tok):
return not tok.startswith('<')
def is_end_tag(tok):
return tok.startswith('</')
def is_start_tag(tok):
return tok.startswith('<') and not tok.startswith('</')
def fixup_ins_del_tags(html):
""" Given an html string, move any <ins> or <del> tags inside of any
block-level elements, e.g. transform <ins><p>word</p></ins> to
<p><ins>word</ins></p> """
doc = parse_html(html, cleanup=False)
_fixup_ins_del_tags(doc)
html = serialize_html_fragment(doc, skip_outer=True)
return html
def serialize_html_fragment(el, skip_outer=False):
""" Serialize a single lxml element as HTML. The serialized form
includes the element's tail.
If skip_outer is true, then don't serialize the outermost tag
"""
assert not isinstance(el, basestring), (
"You should pass in an element, not a string like %r" % el)
html = etree.tostring(el, method="html", encoding=_unicode)
if skip_outer:
# Get rid of the extra starting tag:
html = html[html.find('>')+1:]
# Get rid of the extra end tag:
html = html[:html.rfind('<')]
return html.strip()
else:
return html
def _fixup_ins_del_tags(doc):
"""fixup_ins_del_tags that works on an lxml document in-place
"""
for tag in ['ins', 'del']:
for el in doc.xpath('descendant-or-self::%s' % tag):
if not _contains_block_level_tag(el):
continue
_move_el_inside_block(el, tag=tag)
el.drop_tag()
#_merge_element_contents(el)
def _contains_block_level_tag(el):
"""True if the element contains any block-level elements, like <p>, <td>, etc.
"""
if el.tag in block_level_tags or el.tag in block_level_container_tags:
return True
for child in el:
if _contains_block_level_tag(child):
return True
return False
def _move_el_inside_block(el, tag):
""" helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
and moves them inside any block-level tags. """
for child in el:
if _contains_block_level_tag(child):
break
else:
# No block-level tags in any child
children_tag = etree.Element(tag)
children_tag.text = el.text
el.text = None
children_tag.extend(list(el))
el[:] = [children_tag]
return
for child in list(el):
if _contains_block_level_tag(child):
_move_el_inside_block(child, tag)
if child.tail:
tail_tag = etree.Element(tag)
tail_tag.text = child.tail
child.tail = None
el.insert(el.index(child)+1, tail_tag)
else:
child_tag = etree.Element(tag)
el.replace(child, child_tag)
child_tag.append(child)
if el.text:
text_tag = etree.Element(tag)
text_tag.text = el.text
el.text = None
el.insert(0, text_tag)
def _merge_element_contents(el):
"""
Removes an element, but merges its contents into its place, e.g.,
given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
<p>Hi there!</p>
"""
parent = el.getparent()
text = el.text or ''
if el.tail:
if not len(el):
text += el.tail
else:
if el[-1].tail:
el[-1].tail += el.tail
else:
el[-1].tail = el.tail
index = parent.index(el)
if text:
if index == 0:
previous = None
else:
previous = parent[index-1]
if previous is None:
if parent.text:
parent.text += text
else:
parent.text = text
else:
if previous.tail:
previous.tail += text
else:
previous.tail = text
parent[index:index+1] = el.getchildren()
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
"""
Acts like SequenceMatcher, but tries not to find very small equal
blocks amidst large spans of changes
"""
threshold = 2
def get_matching_blocks(self):
size = min(len(self.a), len(self.b))
threshold = min(self.threshold, size / 4)
actual = difflib.SequenceMatcher.get_matching_blocks(self)
return [item for item in actual
if item[2] > threshold
or not item[2]]
if __name__ == '__main__':
from lxml.html import _diffcommand
_diffcommand.main()

View file

@ -0,0 +1,299 @@
from lxml.etree import XPath, ElementBase
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
from lxml.html import defs
import copy
try:
basestring
except NameError:
# Python 3
basestring = str
__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
'insert_errors', 'insert_errors_html',
'DefaultErrorCreator']
class FormNotFound(LookupError):
"""
Raised when no form can be found
"""
_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
namespaces={'x':XHTML_NAMESPACE})
_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
namespaces={'x':XHTML_NAMESPACE})
_name_xpath = XPath('descendant-or-self::*[@name=$name]')
def fill_form(
el,
values,
form_id=None,
form_index=None,
):
el = _find_form(el, form_id=form_id, form_index=form_index)
_fill_form(el, values)
def fill_form_html(html, values, form_id=None, form_index=None):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
fill_form(doc, values, form_id=form_id, form_index=form_index)
return _transform_result(result_type, doc)
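# A minimal usage sketch (the form markup and values are invented
# examples).  The named controls are filled in and the result comes
# back in the same type that was passed in:
#
#   >>> from lxml.html.formfill import fill_form_html
#   >>> fill_form_html('<form><input type="text" name="email"></form>',
#   ...                {'email': 'someone@example.org'})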
def _fill_form(el, values):
counts = {}
if hasattr(values, 'mixed'):
# For Paste request parameters
values = values.mixed()
inputs = _input_xpath(el)
for input in inputs:
name = input.get('name')
if not name:
continue
if _takes_multiple(input):
value = values.get(name, [])
if not isinstance(value, (list, tuple)):
value = [value]
_fill_multiple(input, value)
elif name not in values:
continue
else:
index = counts.get(name, 0)
counts[name] = index + 1
value = values[name]
if isinstance(value, (list, tuple)):
try:
value = value[index]
except IndexError:
continue
elif index > 0:
continue
_fill_single(input, value)
def _takes_multiple(input):
if _nons(input.tag) == 'select' and input.get('multiple'):
# FIXME: multiple="0"?
return True
type = input.get('type', '').lower()
if type in ('radio', 'checkbox'):
return True
return False
def _fill_multiple(input, value):
type = input.get('type', '').lower()
if type == 'checkbox':
v = input.get('value')
if v is None:
if not value:
result = False
else:
result = value[0]
if isinstance(value, basestring):
# The only valid "on" value for an unnamed checkbox is 'on'
result = result == 'on'
_check(input, result)
else:
_check(input, v in value)
elif type == 'radio':
v = input.get('value')
_check(input, v in value)
else:
assert _nons(input.tag) == 'select'
for option in _options_xpath(input):
v = option.get('value')
if v is None:
# This seems to be the default, at least on IE
# FIXME: but I'm not sure
v = option.text_content()
_select(option, v in value)
def _check(el, check):
if check:
el.set('checked', '')
else:
if 'checked' in el.attrib:
del el.attrib['checked']
def _select(el, select):
if select:
el.set('selected', '')
else:
if 'selected' in el.attrib:
del el.attrib['selected']
def _fill_single(input, value):
if _nons(input.tag) == 'textarea':
input.text = value
else:
input.set('value', value)
def _find_form(el, form_id=None, form_index=None):
if form_id is None and form_index is None:
forms = _forms_xpath(el)
for form in forms:
return form
raise FormNotFound(
"No forms in page")
if form_id is not None:
form = el.get_element_by_id(form_id)
if form is not None:
return form
forms = _form_name_xpath(el, name=form_id)
if forms:
return forms[0]
else:
raise FormNotFound(
"No form with the name or id of %r (forms: %s)"
% (form_id, ', '.join(_find_form_ids(el))))
if form_index is not None:
forms = _forms_xpath(el)
try:
return forms[form_index]
except IndexError:
raise FormNotFound(
"There is no form with the index %r (%i forms found)"
% (form_index, len(forms)))
def _find_form_ids(el):
forms = _forms_xpath(el)
if not forms:
yield '(no forms)'
return
for index, form in enumerate(forms):
if form.get('id'):
if form.get('name'):
yield '%s or %s' % (form.get('id'),
form.get('name'))
else:
yield form.get('id')
elif form.get('name'):
yield form.get('name')
else:
yield '(unnamed form %s)' % index
############################################################
## Error filling
############################################################
class DefaultErrorCreator(object):
insert_before = True
block_inside = True
error_container_tag = 'div'
error_message_class = 'error-message'
error_block_class = 'error-block'
default_message = "Invalid"
def __init__(self, **kw):
for name, value in kw.items():
if not hasattr(self, name):
raise TypeError(
"Unexpected keyword argument: %s" % name)
setattr(self, name, value)
def __call__(self, el, is_block, message):
error_el = el.makeelement(self.error_container_tag)
if self.error_message_class:
error_el.set('class', self.error_message_class)
if is_block and self.error_block_class:
error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
if message is None or message == '':
message = self.default_message
if isinstance(message, ElementBase):
error_el.append(message)
else:
assert isinstance(message, basestring), (
"Bad message; should be a string or element: %r" % message)
error_el.text = message or self.default_message
if is_block and self.block_inside:
if self.insert_before:
error_el.tail = el.text
el.text = None
el.insert(0, error_el)
else:
el.append(error_el)
else:
parent = el.getparent()
pos = parent.index(el)
if self.insert_before:
parent.insert(pos, error_el)
else:
error_el.tail = el.tail
el.tail = None
parent.insert(pos+1, error_el)
default_error_creator = DefaultErrorCreator()
def insert_errors(
el,
errors,
form_id=None,
form_index=None,
error_class="error",
error_creator=default_error_creator,
):
el = _find_form(el, form_id=form_id, form_index=form_index)
for name, error in errors.items():
if error is None:
continue
for error_el, message in _find_elements_for_name(el, name, error):
assert isinstance(message, (basestring, type(None), ElementBase)), (
"Bad message: %r" % message)
_insert_error(error_el, message, error_class, error_creator)
def insert_errors_html(html, values, **kw):
result_type = type(html)
if isinstance(html, basestring):
doc = fromstring(html)
else:
doc = copy.deepcopy(html)
insert_errors(doc, values, **kw)
return _transform_result(result_type, doc)
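# A minimal usage sketch (markup and error text invented for the
# example).  Each named control gets the error class and an error
# message element inserted next to it:
#
#   >>> from lxml.html.formfill import insert_errors_html
#   >>> insert_errors_html(
#   ...     '<form><input type="text" name="age"></form>',
#   ...     {'age': 'Please enter a number'})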
def _insert_error(el, error, error_class, error_creator):
if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
is_block = False
else:
is_block = True
if _nons(el.tag) != 'form' and error_class:
_add_class(el, error_class)
if el.get('id'):
labels = _label_for_xpath(el, for_id=el.get('id'))
if labels:
for label in labels:
_add_class(label, error_class)
error_creator(el, is_block, error)
def _add_class(el, class_name):
if el.get('class'):
el.set('class', el.get('class')+' '+class_name)
else:
el.set('class', class_name)
def _find_elements_for_name(form, name, error):
if name is None:
# An error for the entire form
yield form, error
return
if name.startswith('#'):
# By id
el = form.get_element_by_id(name[1:])
if el is not None:
yield el, error
return
els = _name_xpath(form, name=name)
if not els:
# FIXME: should this raise an exception?
return
if not isinstance(error, (list, tuple)):
yield els[0], error
return
# FIXME: if error is longer than els, should it raise an error?
for el, err in zip(els, error):
if err is None:
continue
yield el, err

View file

@ -0,0 +1,207 @@
"""
An interface to html5lib that mimics the lxml.html interface.
"""
import sys
import string
from html5lib import HTMLParser as _HTMLParser
from html5lib.treebuilders.etree_lxml import TreeBuilder
from lxml import etree
from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
# python3 compatibility
try:
_strings = basestring
except NameError:
_strings = (bytes, str)
try:
from urllib2 import urlopen
except ImportError:
from urllib.request import urlopen
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
class HTMLParser(_HTMLParser):
"""An html5lib HTML parser with lxml as tree."""
def __init__(self, strict=False, **kwargs):
_HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
try:
from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
pass
else:
class XHTMLParser(_XHTMLParser):
"""An html5lib XHTML Parser with lxml as tree."""
def __init__(self, strict=False, **kwargs):
_XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
xhtml_parser = XHTMLParser()
def _find_tag(tree, tag):
elem = tree.find(tag)
if elem is not None:
return elem
return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
def document_fromstring(html, guess_charset=True, parser=None):
"""Parse a whole document into a string."""
if not isinstance(html, _strings):
raise TypeError('string required')
if parser is None:
parser = html_parser
return parser.parse(html, useChardet=guess_charset).getroot()
def fragments_fromstring(html, no_leading_text=False,
guess_charset=False, parser=None):
"""Parses several HTML elements, returning a list of elements.
The first item in the list may be a string. If no_leading_text is true,
then it will be an error if there is leading text, and it will always be
a list of only elements.
If `guess_charset` is `True` and the text was not unicode but a
bytestring, the `chardet` library will perform charset guessing on the
string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
if parser is None:
parser = html_parser
children = parser.parseFragment(html, 'div', useChardet=guess_charset)
if children and isinstance(children[0], _strings):
if no_leading_text:
if children[0].strip():
raise etree.ParserError('There is leading text: %r' %
children[0])
del children[0]
return children
def fragment_fromstring(html, create_parent=False,
guess_charset=False, parser=None):
"""Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
element.
If create_parent is true (or is a tag name) then a parent node
will be created to encapsulate the HTML in a single element. In
this case, leading or trailing text is allowed.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
accept_leading_text = bool(create_parent)
elements = fragments_fromstring(
html, guess_charset=guess_charset, parser=parser,
no_leading_text=not accept_leading_text)
if create_parent:
if not isinstance(create_parent, _strings):
create_parent = 'div'
new_root = Element(create_parent)
if elements:
if isinstance(elements[0], _strings):
new_root.text = elements[0]
del elements[0]
new_root.extend(elements)
return new_root
if not elements:
raise etree.ParserError('No elements found')
if len(elements) > 1:
raise etree.ParserError('Multiple elements found')
result = elements[0]
if result.tail and result.tail.strip():
raise etree.ParserError('Element followed by text: %r' % result.tail)
result.tail = None
return result
def fromstring(html, guess_charset=True, parser=None):
"""Parse the html, returning a single element/document.
This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.
"""
if not isinstance(html, _strings):
raise TypeError('string required')
doc = document_fromstring(html, parser=parser,
guess_charset=guess_charset)
# document starts with doctype or <html>, full document!
start = html[:50].lstrip().lower()
if start.startswith('<html') or start.startswith('<!doctype'):
return doc
head = _find_tag(doc, 'head')
# if the head is not empty we have a full document
if len(head):
return doc
body = _find_tag(doc, 'body')
# The body has just one element, so it was probably a single
# element passed in
if (len(body) == 1 and (not body.text or not body.text.strip())
and (not body[-1].tail or not body[-1].tail.strip())):
return body[0]
# Now we have a body which represents a bunch of tags which have the
# content that was passed in. We will create a fake container, which
# is the body tag, except <body> implies too much structure.
if _contains_block_level_tag(body):
body.tag = 'div'
else:
body.tag = 'span'
return body
def parse(filename_url_or_file, guess_charset=True, parser=None):
"""Parse a filename, URL, or file-like object into an HTML document
tree. Note: this returns a tree, not an element. Use
``parse(...).getroot()`` to get the document root.
"""
if parser is None:
parser = html_parser
if not isinstance(filename_url_or_file, _strings):
fp = filename_url_or_file
elif _looks_like_url(filename_url_or_file):
fp = urlopen(filename_url_or_file)
else:
fp = open(filename_url_or_file, 'rb')
return parser.parse(fp, useChardet=guess_charset)
def _looks_like_url(str):
scheme = urlparse(str)[0]
if not scheme:
return False
elif (sys.platform == 'win32' and
scheme in string.ascii_letters
and len(scheme) == 1):
# looks like a 'normal' absolute path
return False
else:
return True
html_parser = HTMLParser()
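# A minimal usage sketch (requires the html5lib package; the markup is
# an invented example).  Note that html5lib builds namespaced trees, so
# the returned element's tag is in the XHTML namespace rather than the
# plain 'p' that lxml.html.fromstring would give:
#
#   >>> from lxml.html import html5parser
#   >>> el = html5parser.fromstring(b'<p>Hello world</p>')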

View file

@ -0,0 +1,303 @@
"""External interface to the BeautifulSoup HTML parser.
"""
__all__ = ["fromstring", "parse", "convert_tree"]
import re
from lxml import etree, html
try:
from bs4 import (
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
Declaration, CData, Doctype)
_DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
except ImportError:
from BeautifulSoup import (
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
Declaration, CData)
_DECLARATION_OR_DOCTYPE = Declaration
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
"""Parse a string of HTML data into an Element tree using the
BeautifulSoup parser.
Returns the root ``<html>`` Element of the tree.
You can pass a different BeautifulSoup parser through the
`beautifulsoup` keyword, and a different Element factory function
through the `makeelement` keyword. By default, the standard
``BeautifulSoup`` class and the default factory of `lxml.html` are
used.
"""
return _parse(data, beautifulsoup, makeelement, **bsargs)
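# A minimal usage sketch (requires BeautifulSoup; the markup is an
# invented, deliberately broken example).  The soup parser tolerates
# badly formed markup and still returns an ``<html>`` root element:
#
#   >>> from lxml.html import soupparser
#   >>> root = soupparser.fromstring('<meta><head><title>Hi</title><body>Hello')
#   >>> root.tag
#   'html'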
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
"""Parse a file into an ElemenTree using the BeautifulSoup parser.
You can pass a different BeautifulSoup parser through the
`beautifulsoup` keyword, and a different Element factory function
through the `makeelement` keyword. By default, the standard
``BeautifulSoup`` class and the default factory of `lxml.html` are
used.
"""
if not hasattr(file, 'read'):
file = open(file)
root = _parse(file, beautifulsoup, makeelement, **bsargs)
return etree.ElementTree(root)
def convert_tree(beautiful_soup_tree, makeelement=None):
"""Convert a BeautifulSoup tree to a list of Element trees.
Returns a list instead of a single root Element to support
HTML-like soup with more than one root element.
You can pass a different Element factory through the `makeelement`
keyword.
"""
root = _convert_tree(beautiful_soup_tree, makeelement)
children = root.getchildren()
for child in children:
root.remove(child)
return children
# helpers
def _parse(source, beautifulsoup, makeelement, **bsargs):
if beautifulsoup is None:
beautifulsoup = BeautifulSoup
if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3
if 'convertEntities' not in bsargs:
bsargs['convertEntities'] = 'html'
if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
if 'features' not in bsargs:
bsargs['features'] = ['html.parser'] # use Python html parser
tree = beautifulsoup(source, **bsargs)
root = _convert_tree(tree, makeelement)
# from ET: wrap the document in a html root element, if necessary
if len(root) == 1 and root[0].tag == "html":
return root[0]
root.tag = "html"
return root
_parse_doctype_declaration = re.compile(
r'(?:\s|[<!])*DOCTYPE\s*HTML'
r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
re.IGNORECASE).match
class _PseudoTag:
# Minimal imitation of BeautifulSoup.Tag
def __init__(self, contents):
self.name = 'html'
self.attrs = []
self.contents = contents
def __iter__(self):
return self.contents.__iter__()
def _convert_tree(beautiful_soup_tree, makeelement):
if makeelement is None:
makeelement = html.html_parser.makeelement
# Split the tree into three parts:
# i) everything before the root element: document type
# declaration, comments, processing instructions, whitespace
# ii) the root(s),
# iii) everything after the root: comments, processing
# instructions, whitespace
first_element_idx = last_element_idx = None
html_root = declaration = None
for i, e in enumerate(beautiful_soup_tree):
if isinstance(e, Tag):
if first_element_idx is None:
first_element_idx = i
last_element_idx = i
if html_root is None and e.name and e.name.lower() == 'html':
html_root = e
elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
declaration = e
# For a nice, well-formatted document, the variable roots below is
# a list consisting of a single <html> element. However, the document
# may be a soup like '<meta><head><title>Hello</head><body>Hi
# all</p>'. In this example roots is a list containing meta, head
# and body elements.
pre_root = beautiful_soup_tree.contents[:first_element_idx]
roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
post_root = beautiful_soup_tree.contents[last_element_idx+1:]
# Reorganize so that there is one <html> root...
if html_root is not None:
# ... use existing one if possible, ...
i = roots.index(html_root)
html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
else:
# ... otherwise create a new one.
html_root = _PseudoTag(roots)
convert_node = _init_node_converters(makeelement)
# Process pre_root
res_root = convert_node(html_root)
prev = res_root
for e in reversed(pre_root):
converted = convert_node(e)
if converted is not None:
prev.addprevious(converted)
prev = converted
# ditto for post_root
prev = res_root
for e in post_root:
converted = convert_node(e)
if converted is not None:
prev.addnext(converted)
prev = converted
if declaration is not None:
try:
# bs4 provides full Doctype string
doctype_string = declaration.output_ready()
except AttributeError:
doctype_string = declaration.string
match = _parse_doctype_declaration(doctype_string)
if not match:
# Something is wrong if we end up in here. Since soupparser should
# tolerate errors, do not raise Exception, just let it pass.
pass
else:
external_id, sys_uri = match.groups()
docinfo = res_root.getroottree().docinfo
# strip quotes and update DOCTYPE values (any of None, '', '...')
docinfo.public_id = external_id and external_id[1:-1]
docinfo.system_url = sys_uri and sys_uri[1:-1]
return res_root
def _init_node_converters(makeelement):
converters = {}
ordered_node_types = []
def converter(*types):
def add(handler):
for t in types:
converters[t] = handler
ordered_node_types.append(t)
return handler
return add
def find_best_converter(node):
for t in ordered_node_types:
if isinstance(node, t):
return converters[t]
return None
def convert_node(bs_node, parent=None):
# duplicated in convert_tag() below
try:
handler = converters[type(bs_node)]
except KeyError:
handler = converters[type(bs_node)] = find_best_converter(bs_node)
if handler is None:
return None
return handler(bs_node, parent)
def map_attrs(bs_attrs):
if isinstance(bs_attrs, dict): # bs4
attribs = {}
for k, v in bs_attrs.items():
if isinstance(v, list):
v = " ".join(v)
attribs[k] = unescape(v)
else:
attribs = dict((k, unescape(v)) for k, v in bs_attrs)
return attribs
def append_text(parent, text):
if len(parent) == 0:
parent.text = (parent.text or '') + text
else:
parent[-1].tail = (parent[-1].tail or '') + text
# converters are tried in order of their definition
@converter(Tag, _PseudoTag)
def convert_tag(bs_node, parent):
attrs = bs_node.attrs
if parent is not None:
attribs = map_attrs(attrs) if attrs else None
res = etree.SubElement(parent, bs_node.name, attrib=attribs)
else:
attribs = map_attrs(attrs) if attrs else {}
res = makeelement(bs_node.name, attrib=attribs)
for child in bs_node:
# avoid double recursion by inlining convert_node(), see above
try:
handler = converters[type(child)]
except KeyError:
pass
else:
if handler is not None:
handler(child, res)
continue
convert_node(child, res)
return res
@converter(Comment)
def convert_comment(bs_node, parent):
res = etree.Comment(bs_node)
if parent is not None:
parent.append(res)
return res
@converter(ProcessingInstruction)
def convert_pi(bs_node, parent):
if bs_node.endswith('?'):
# The PI is of XML style (<?as df?>) but BeautifulSoup
# interpreted it as being SGML style (<?as df>). Fix.
bs_node = bs_node[:-1]
res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
if parent is not None:
parent.append(res)
return res
@converter(NavigableString)
def convert_text(bs_node, parent):
if parent is not None:
append_text(parent, unescape(bs_node))
return None
return convert_node
# copied from ET's ElementSoup
try:
    from html.entities import name2codepoint  # Python 3
    unichr = chr  # Python 3: the unichr() builtin is gone, chr() handles unicode
except ImportError:
    from htmlentitydefs import name2codepoint
handle_entities = re.compile(r"&(\w+);").sub
def unescape(string):
if not string:
return ''
# work around oddities in BeautifulSoup's entity handling
def unescape_entity(m):
try:
return unichr(name2codepoint[m.group(1)])
except KeyError:
return m.group(0) # use as is
return handle_entities(unescape_entity, string)

View file

@@ -0,0 +1,13 @@
"""Doctest module for HTML comparison.
Usage::
>>> import lxml.html.usedoctest
>>> # now do your HTML doctests ...
See `lxml.doctestcompare`.
"""
from lxml import doctestcompare
doctestcompare.temp_install(html=True, del_module=__name__)

View file

@@ -0,0 +1,26 @@
from lxml.includes.tree cimport xmlDoc, xmlOutputBuffer, xmlChar
from lxml.includes.xpath cimport xmlNodeSet
cdef extern from "libxml/c14n.h":
cdef int xmlC14NDocDumpMemory(xmlDoc* doc,
xmlNodeSet* nodes,
int exclusive,
xmlChar** inclusive_ns_prefixes,
int with_comments,
xmlChar** doc_txt_ptr) nogil
cdef int xmlC14NDocSave(xmlDoc* doc,
xmlNodeSet* nodes,
int exclusive,
xmlChar** inclusive_ns_prefixes,
int with_comments,
char* filename,
int compression) nogil
cdef int xmlC14NDocSaveTo(xmlDoc* doc,
xmlNodeSet* nodes,
int exclusive,
xmlChar** inclusive_ns_prefixes,
int with_comments,
xmlOutputBuffer* buffer) nogil

View file

@@ -0,0 +1,3 @@
cdef extern from "etree_defs.h":
cdef bint ENABLE_THREADING
cdef bint ENABLE_SCHEMATRON

View file

@@ -0,0 +1,18 @@
from lxml.includes cimport tree
from lxml.includes.tree cimport xmlDoc, xmlDtd
cdef extern from "libxml/valid.h" nogil:
ctypedef void (*xmlValidityErrorFunc)(void * ctx, const char * msg, ...)
ctypedef void (*xmlValidityWarningFunc)(void * ctx, const char * msg, ...)
ctypedef struct xmlValidCtxt:
void *userData
xmlValidityErrorFunc error
xmlValidityWarningFunc warning
cdef xmlValidCtxt* xmlNewValidCtxt()
cdef void xmlFreeValidCtxt(xmlValidCtxt* cur)
cdef int xmlValidateDtd(xmlValidCtxt* ctxt, xmlDoc* doc, xmlDtd* dtd)
cdef tree.xmlElement* xmlGetDtdElementDesc(
xmlDtd* dtd, tree.const_xmlChar* name)

View file

@@ -0,0 +1,373 @@
#ifndef HAS_ETREE_DEFS_H
#define HAS_ETREE_DEFS_H
/* quick check for Python/libxml2/libxslt devel setup */
#include "Python.h"
#ifndef PY_VERSION_HEX
# error the development package of Python (header files etc.) is not installed correctly
#else
# if PY_VERSION_HEX < 0x02060000 || PY_MAJOR_VERSION >= 3 && PY_VERSION_HEX < 0x03020000
# error this version of lxml requires Python 2.6, 2.7, 3.2 or later
# endif
#endif
#include "libxml/xmlversion.h"
#ifndef LIBXML_VERSION
# error the development package of libxml2 (header files etc.) is not installed correctly
#else
#if LIBXML_VERSION < 20700
# error minimum required version of libxml2 is 2.7.0
#endif
#endif
#include "libxslt/xsltconfig.h"
#ifndef LIBXSLT_VERSION
# error the development package of libxslt (header files etc.) is not installed correctly
#else
#if LIBXSLT_VERSION < 10123
# error minimum required version of libxslt is 1.1.23
#endif
#endif
/* v_arg functions */
#define va_int(ap) va_arg(ap, int)
#define va_charptr(ap) va_arg(ap, char *)
#ifdef PYPY_VERSION
# define IS_PYPY 1
#else
# define IS_PYPY 0
#endif
#if PY_MAJOR_VERSION >= 3
# define IS_PYTHON3 1
#else
# define IS_PYTHON3 0
#endif
#if IS_PYTHON3
#undef LXML_UNICODE_STRINGS
#define LXML_UNICODE_STRINGS 1
#else
#ifndef LXML_UNICODE_STRINGS
#define LXML_UNICODE_STRINGS 0
#endif
#endif
#if !IS_PYPY
# define PyWeakref_LockObject(obj) (NULL)
#endif
/* Threading is not currently supported by PyPy */
#if IS_PYPY
# ifndef WITHOUT_THREADING
# define WITHOUT_THREADING
# endif
#endif
#if IS_PYPY
# undef PyFile_AsFile
# define PyFile_AsFile(o) (NULL)
# undef PyByteArray_Check
# define PyByteArray_Check(o) (0)
#elif IS_PYTHON3
/* Python 3 doesn't have PyFile_*() anymore */
# define PyFile_AsFile(o) (NULL)
#endif
#if PY_VERSION_HEX <= 0x03030000 && !(defined(CYTHON_PEP393_ENABLED) && CYTHON_PEP393_ENABLED)
#define PyUnicode_IS_READY(op) (0)
#define PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u)
#define PyUnicode_KIND(u) (sizeof(Py_UNICODE))
#define PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u))
#endif
#if IS_PYPY
# ifndef PyUnicode_FromFormat
# define PyUnicode_FromFormat PyString_FromFormat
# endif
# if IS_PYTHON3 && !defined(PyBytes_FromFormat)
# ifdef PyString_FromFormat
# define PyBytes_FromFormat PyString_FromFormat
# else
#include <stdarg.h>
static PyObject* PyBytes_FromFormat(const char* format, ...) {
PyObject *string;
va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
#else
va_start(vargs);
#endif
string = PyUnicode_FromFormatV(format, vargs);
va_end(vargs);
if (string && PyUnicode_Check(string)) {
PyObject *bstring = PyUnicode_AsUTF8String(string);
Py_DECREF(string);
string = bstring;
}
if (string && !PyBytes_CheckExact(string)) {
Py_DECREF(string);
string = NULL;
PyErr_SetString(PyExc_TypeError, "String formatting and encoding failed to return bytes object");
}
return string;
}
# endif
# endif
#endif
/* PySlice_GetIndicesEx() has wrong signature in Py<=3.1 */
#if PY_VERSION_HEX >= 0x03020000
# define _lx_PySlice_GetIndicesEx(o, l, b, e, s, sl) PySlice_GetIndicesEx(o, l, b, e, s, sl)
#else
# define _lx_PySlice_GetIndicesEx(o, l, b, e, s, sl) PySlice_GetIndicesEx(((PySliceObject*)o), l, b, e, s, sl)
#endif
#ifdef WITHOUT_THREADING
# undef PyEval_SaveThread
# define PyEval_SaveThread() (NULL)
# undef PyEval_RestoreThread
# define PyEval_RestoreThread(state) if (state); else {}
# undef PyGILState_Ensure
# define PyGILState_Ensure() (PyGILState_UNLOCKED)
# undef PyGILState_Release
# define PyGILState_Release(state) if (state); else {}
# undef Py_UNBLOCK_THREADS
# define Py_UNBLOCK_THREADS _save = NULL;
# undef Py_BLOCK_THREADS
# define Py_BLOCK_THREADS if (_save); else {}
#endif
#ifdef WITHOUT_THREADING
# define ENABLE_THREADING 0
#else
# define ENABLE_THREADING 1
#endif
#if LIBXML_VERSION < 20704
/* FIXME: hack to make new error reporting compile in old libxml2 versions */
# define xmlStructuredErrorContext NULL
# define xmlXIncludeProcessTreeFlagsData(n,o,d) xmlXIncludeProcessTreeFlags(n,o)
#endif
/* schematron was added in libxml2 2.6.21 */
#ifdef LIBXML_SCHEMATRON_ENABLED
# define ENABLE_SCHEMATRON 1
#else
# define ENABLE_SCHEMATRON 0
# define XML_SCHEMATRON_OUT_QUIET 0
# define XML_SCHEMATRON_OUT_XML 0
# define XML_SCHEMATRON_OUT_ERROR 0
typedef void xmlSchematron;
typedef void xmlSchematronParserCtxt;
typedef void xmlSchematronValidCtxt;
# define xmlSchematronNewDocParserCtxt(doc) NULL
# define xmlSchematronNewParserCtxt(file) NULL
# define xmlSchematronParse(ctxt) NULL
# define xmlSchematronFreeParserCtxt(ctxt)
# define xmlSchematronFree(schema)
# define xmlSchematronNewValidCtxt(schema, options) NULL
# define xmlSchematronValidateDoc(ctxt, doc) 0
# define xmlSchematronFreeValidCtxt(ctxt)
# define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data)
#endif
#if LIBXML_VERSION < 20900
# define XML_PARSE_BIG_LINES 4194304
#endif
#include "libxml/tree.h"
#ifndef LIBXML2_NEW_BUFFER
typedef xmlBuffer xmlBuf;
# define xmlBufContent(buf) xmlBufferContent(buf)
# define xmlBufUse(buf) xmlBufferLength(buf)
#endif
/* libexslt 1.1.25+ support EXSLT functions in XPath */
#if LIBXSLT_VERSION < 10125
#define exsltDateXpathCtxtRegister(ctxt, prefix)
#define exsltSetsXpathCtxtRegister(ctxt, prefix)
#define exsltMathXpathCtxtRegister(ctxt, prefix)
#define exsltStrXpathCtxtRegister(ctxt, prefix)
#endif
/* work around MSDEV 6.0 */
#if (_MSC_VER == 1200) && (WINVER < 0x0500)
long _ftol( double ); //defined by VC6 C libs
long _ftol2( double dblSource ) { return _ftol( dblSource ); }
#endif
#ifdef __GNUC__
/* Test for GCC > 2.95 */
#if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))
#define unlikely_condition(x) __builtin_expect((x), 0)
#else /* __GNUC__ > 2 ... */
#define unlikely_condition(x) (x)
#endif /* __GNUC__ > 2 ... */
#else /* __GNUC__ */
#define unlikely_condition(x) (x)
#endif /* __GNUC__ */
#ifndef Py_TYPE
#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)
#endif
#define PY_NEW(T) \
(((PyTypeObject*)(T))->tp_new( \
(PyTypeObject*)(T), __pyx_empty_tuple, NULL))
#define _fqtypename(o) ((Py_TYPE(o))->tp_name)
#define lxml_malloc(count, item_size) \
(unlikely_condition((size_t)(count) > (size_t) (PY_SSIZE_T_MAX / item_size)) ? NULL : \
(PyMem_Malloc((count) * item_size)))
#define lxml_realloc(mem, count, item_size) \
(unlikely_condition((size_t)(count) > (size_t) (PY_SSIZE_T_MAX / item_size)) ? NULL : \
(PyMem_Realloc(mem, (count) * item_size)))
#define lxml_free(mem) PyMem_Free(mem)
#if PY_MAJOR_VERSION < 3
#define _isString(obj) (PyString_CheckExact(obj) || \
PyUnicode_CheckExact(obj) || \
PyType_IsSubtype(Py_TYPE(obj), &PyBaseString_Type))
#else
/* builtin subtype type checks are almost as fast as exact checks in Py2.7+
* and Unicode is more common in Py3 */
#define _isString(obj) (PyUnicode_Check(obj) || PyBytes_Check(obj))
#endif
#define _isElement(c_node) \
(((c_node)->type == XML_ELEMENT_NODE) || \
((c_node)->type == XML_COMMENT_NODE) || \
((c_node)->type == XML_ENTITY_REF_NODE) || \
((c_node)->type == XML_PI_NODE))
#define _isElementOrXInclude(c_node) \
(_isElement(c_node) || \
((c_node)->type == XML_XINCLUDE_START) || \
((c_node)->type == XML_XINCLUDE_END))
#define _getNs(c_node) \
(((c_node)->ns == 0) ? 0 : ((c_node)->ns->href))
/* Macro pair implementation of a depth first tree walker
*
* Calls the code block between the BEGIN and END macros for all elements
* below c_tree_top (exclusively), starting at c_node (inclusively iff
* 'inclusive' is 1). The _ELEMENT_ variants will only stop on nodes
* that match _isElement(), the normal variant will stop on every node
* except text nodes.
*
* To traverse the node and all of its children and siblings in Pyrex, call
* cdef xmlNode* some_node
* BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 1)
* # do something with some_node
* END_FOR_EACH_ELEMENT_FROM(some_node)
*
* To traverse only the children and siblings of a node, call
* cdef xmlNode* some_node
* BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 0)
* # do something with some_node
* END_FOR_EACH_ELEMENT_FROM(some_node)
*
* To traverse only the children, do:
* cdef xmlNode* some_node
* some_node = parent_node.children
* BEGIN_FOR_EACH_ELEMENT_FROM(parent_node, some_node, 1)
* # do something with some_node
* END_FOR_EACH_ELEMENT_FROM(some_node)
*
* NOTE: 'some_node' MUST be a plain 'xmlNode*' !
*
* NOTE: parent modification during the walk can divert the iterator, but
* should not segfault !
*/
#define _LX__ELEMENT_MATCH(c_node, only_elements) \
((only_elements) ? (_isElement(c_node)) : 1)
#define _LX__ADVANCE_TO_NEXT(c_node, only_elements) \
while ((c_node != 0) && (!_LX__ELEMENT_MATCH(c_node, only_elements))) \
c_node = c_node->next;
#define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \
{ \
/* walk through children first */ \
xmlNode* _lx__next = c_node->children; \
if (_lx__next != 0) { \
if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \
_lx__next = 0; \
} else { \
_LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
} \
} \
if ((_lx__next == 0) && (c_node != c_stop_node)) { \
/* try siblings */ \
_lx__next = c_node->next; \
_LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
/* back off through parents */ \
while (_lx__next == 0) { \
c_node = c_node->parent; \
if (c_node == 0) \
break; \
if (c_node == c_stop_node) \
break; \
if ((only_elements) && !_isElement(c_node)) \
break; \
/* we already traversed the parents -> siblings */ \
_lx__next = c_node->next; \
_LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
} \
} \
c_node = _lx__next; \
}
#define _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, only_elements) \
{ \
if (c_node != 0) { \
const xmlNode* _lx__tree_top = (c_tree_top); \
const int _lx__only_elements = (only_elements); \
/* make sure we start at an element */ \
if (!_LX__ELEMENT_MATCH(c_node, _lx__only_elements)) { \
/* we skip the node, so 'inclusive' is irrelevant */ \
if (c_node == _lx__tree_top) \
c_node = 0; /* nothing to traverse */ \
else { \
c_node = c_node->next; \
_LX__ADVANCE_TO_NEXT(c_node, _lx__only_elements) \
} \
} else if (! (inclusive)) { \
/* skip the first node */ \
_LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \
} \
\
/* now run the user code on the elements we find */ \
while (c_node != 0) { \
/* here goes the code to be run for each element */
#define _LX__END_FOR_EACH_FROM(c_node) \
_LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \
} \
} \
}
#define BEGIN_FOR_EACH_ELEMENT_FROM(c_tree_top, c_node, inclusive) \
_LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 1)
#define END_FOR_EACH_ELEMENT_FROM(c_node) \
_LX__END_FOR_EACH_FROM(c_node)
#define BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive) \
_LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 0)
#define END_FOR_EACH_FROM(c_node) \
_LX__END_FOR_EACH_FROM(c_node)
#endif /* HAS_ETREE_DEFS_H */

View file

@@ -0,0 +1,234 @@
# public Cython/C interface to lxml.etree
from lxml.includes cimport tree
from lxml.includes.tree cimport const_xmlChar
cdef extern from "lxml-version.h":
cdef char* LXML_VERSION_STRING
cdef extern from "etree_defs.h":
# test if c_node is considered an Element (i.e. Element, Comment, etc.)
cdef bint _isElement(tree.xmlNode* c_node) nogil
# return the namespace URI of the node or NULL
cdef const_xmlChar* _getNs(tree.xmlNode* node) nogil
# pair of macros for tree traversal
cdef void BEGIN_FOR_EACH_ELEMENT_FROM(tree.xmlNode* tree_top,
tree.xmlNode* start_node,
int start_node_inclusive) nogil
cdef void END_FOR_EACH_ELEMENT_FROM(tree.xmlNode* start_node) nogil
cdef extern from "lxml.etree_api.h":
# first function to call!
cdef int import_lxml__etree() except -1
##########################################################################
# public ElementTree API classes
cdef class lxml.etree._Document [ object LxmlDocument ]:
cdef tree.xmlDoc* _c_doc
cdef class lxml.etree._Element [ object LxmlElement ]:
cdef _Document _doc
cdef tree.xmlNode* _c_node
cdef class lxml.etree.ElementBase(_Element) [ object LxmlElementBase ]:
pass
cdef class lxml.etree._ElementTree [ object LxmlElementTree ]:
cdef _Document _doc
cdef _Element _context_node
cdef class lxml.etree.ElementClassLookup [ object LxmlElementClassLookup ]:
cdef object (*_lookup_function)(object, _Document, tree.xmlNode*)
cdef class lxml.etree.FallbackElementClassLookup(ElementClassLookup) \
[ object LxmlFallbackElementClassLookup ]:
cdef ElementClassLookup fallback
cdef object (*_fallback_function)(object, _Document, tree.xmlNode*)
##########################################################################
# creating Element objects
# create an Element for a C-node in the Document
cdef _Element elementFactory(_Document doc, tree.xmlNode* c_node)
# create an ElementTree for an Element
cdef _ElementTree elementTreeFactory(_Element context_node)
# create an ElementTree subclass for an Element
cdef _ElementTree newElementTree(_Element context_node, object subclass)
# create a new Element for an existing or new document (doc = None)
# builds Python object after setting text, tail, namespaces and attributes
cdef _Element makeElement(tag, _Document doc, parser,
text, tail, attrib, nsmap)
# create a new SubElement for an existing parent
# builds Python object after setting text, tail, namespaces and attributes
cdef _Element makeSubElement(_Element parent, tag, text, tail,
attrib, nsmap)
# deep copy a node to include it in the Document
cdef _Element deepcopyNodeToDocument(_Document doc, tree.xmlNode* c_root)
# set the internal lookup function for Element/Comment/PI classes
# use setElementClassLookupFunction(NULL, None) to reset it
# note that the lookup function *must always* return an _Element subclass!
cdef void setElementClassLookupFunction(
object (*function)(object, _Document, tree.xmlNode*), object state)
# lookup function that always returns the default Element class
# note that the first argument is expected to be None!
cdef object lookupDefaultElementClass(_1, _Document _2,
tree.xmlNode* c_node)
# lookup function for namespace/tag specific Element classes
# note that the first argument is expected to be None!
cdef object lookupNamespaceElementClass(_1, _Document _2,
tree.xmlNode* c_node)
# call the fallback lookup function of a FallbackElementClassLookup
cdef object callLookupFallback(FallbackElementClassLookup lookup,
_Document doc, tree.xmlNode* c_node)
##########################################################################
# XML attribute access
# return an attribute value for a C attribute on a C element node
cdef object attributeValue(tree.xmlNode* c_element,
tree.xmlAttr* c_attrib_node)
# return the value of the attribute with 'ns' and 'name' (or None)
cdef object attributeValueFromNsName(tree.xmlNode* c_element,
const_xmlChar* c_ns, const_xmlChar* c_name)
# return the value of attribute "{ns}name", or the default value
cdef object getAttributeValue(_Element element, key, default)
# return an iterator over attribute names (1), values (2) or items (3)
# attributes must not be removed during iteration!
cdef object iterattributes(_Element element, int keysvalues)
# return the list of all attribute names (1), values (2) or items (3)
cdef list collectAttributes(tree.xmlNode* c_element, int keysvalues)
# set an attribute value on an element
# on failure, sets an exception and returns -1
cdef int setAttributeValue(_Element element, key, value) except -1
# delete an attribute
# on failure, sets an exception and returns -1
cdef int delAttribute(_Element element, key) except -1
# delete an attribute based on name and namespace URI
# returns -1 if the attribute was not found (no exception)
cdef int delAttributeFromNsName(tree.xmlNode* c_element,
const_xmlChar* c_href, const_xmlChar* c_name)
##########################################################################
# XML node helper functions
# check if the element has at least one child
cdef bint hasChild(tree.xmlNode* c_node) nogil
# find child element number 'index' (supports negative indexes)
cdef tree.xmlNode* findChild(tree.xmlNode* c_node,
Py_ssize_t index) nogil
# find child element number 'index' starting at first one
cdef tree.xmlNode* findChildForwards(tree.xmlNode* c_node,
Py_ssize_t index) nogil
# find child element number 'index' starting at last one
cdef tree.xmlNode* findChildBackwards(tree.xmlNode* c_node,
Py_ssize_t index) nogil
# return next/previous sibling element of the node
cdef tree.xmlNode* nextElement(tree.xmlNode* c_node) nogil
cdef tree.xmlNode* previousElement(tree.xmlNode* c_node) nogil
##########################################################################
# iterators (DEPRECATED API, don't use in new code!)
cdef class lxml.etree._ElementTagMatcher [ object LxmlElementTagMatcher ]:
cdef char* _href
cdef char* _name
# store "{ns}tag" (or None) filter for this matcher or element iterator
# ** unless _href *and* _name are set up 'by hand', this function *must*
# ** be called when subclassing the iterator below!
cdef void initTagMatch(_ElementTagMatcher matcher, tag)
cdef class lxml.etree._ElementIterator(_ElementTagMatcher) [
object LxmlElementIterator ]:
cdef _Element _node
cdef tree.xmlNode* (*_next_element)(tree.xmlNode*)
# store the initial node of the iterator if it matches the required tag
# or its next matching sibling if not
cdef void iteratorStoreNext(_ElementIterator iterator, _Element node)
##########################################################################
# other helper functions
# check if a C node matches a tag name and namespace
# (NULL allowed for each => always matches)
cdef int tagMatches(tree.xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name)
# convert a UTF-8 char* to a Python string or unicode string
cdef object pyunicode(const_xmlChar* s)
# convert the string to UTF-8 using the normal lxml.etree semantics
cdef bytes utf8(object s)
# split a tag into a (URI, name) tuple, return None as URI for '{}tag'
cdef tuple getNsTag(object tag)
# split a tag into a (URI, name) tuple, return b'' as URI for '{}tag'
cdef tuple getNsTagWithEmptyNs(object tag)
# get the "{ns}tag" string for a C node
cdef object namespacedName(tree.xmlNode* c_node)
# get the "{ns}tag" string for a href/tagname pair (c_ns may be NULL)
cdef object namespacedNameFromNsName(const_xmlChar* c_ns, const_xmlChar* c_tag)
# check if the node has a text value (which may be '')
cdef bint hasText(tree.xmlNode* c_node) nogil
# check if the node has a tail value (which may be '')
cdef bint hasTail(tree.xmlNode* c_node) nogil
# get the text content of an element (or None)
cdef object textOf(tree.xmlNode* c_node)
# get the tail content of an element (or None)
cdef object tailOf(tree.xmlNode* c_node)
# set the text value of an element
cdef int setNodeText(tree.xmlNode* c_node, text) except -1
# set the tail text value of an element
cdef int setTailText(tree.xmlNode* c_node, text) except -1
# append an element to the children of a parent element
# deprecated: don't use, does not propagate exceptions!
# use appendChildToElement() instead
cdef void appendChild(_Element parent, _Element child)
# added in lxml 3.3 as a safe replacement for appendChild()
# return -1 for exception, 0 for ok
cdef int appendChildToElement(_Element parent, _Element child) except -1
# recursively lookup a namespace in element or ancestors, or create it
cdef tree.xmlNs* findOrBuildNodeNsPrefix(
_Document doc, tree.xmlNode* c_node, const_xmlChar* href, const_xmlChar* prefix)
# find the Document of an Element, ElementTree or Document (itself!)
cdef _Document documentOrRaise(object input)
# find the root Element of an Element (itself!), ElementTree or Document
cdef _Element rootNodeOrRaise(object input)

View file

@@ -0,0 +1,56 @@
from libc.string cimport const_char
from lxml.includes.tree cimport xmlDoc
from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
from lxml.includes.xmlparser cimport xmlParserCtxt, xmlSAXHandler, xmlSAXHandlerV1
cdef extern from "libxml/HTMLparser.h":
ctypedef enum htmlParserOption:
HTML_PARSE_NOERROR # suppress error reports
HTML_PARSE_NOWARNING # suppress warning reports
HTML_PARSE_PEDANTIC # pedantic error reporting
HTML_PARSE_NOBLANKS # remove blank nodes
HTML_PARSE_NONET # Forbid network access
# libxml2 2.6.21+ only:
HTML_PARSE_RECOVER # Relaxed parsing
HTML_PARSE_COMPACT # compact small text nodes
# libxml2 2.7.7+ only:
HTML_PARSE_NOIMPLIED # Do not add implied html/body... elements
# libxml2 2.7.8+ only:
HTML_PARSE_NODEFDTD # do not default a doctype if not found
# libxml2 2.8.0+ only:
XML_PARSE_IGNORE_ENC # ignore internal document encoding hint
xmlSAXHandlerV1 htmlDefaultSAXHandler
cdef xmlParserCtxt* htmlCreateMemoryParserCtxt(
char* buffer, int size) nogil
cdef xmlParserCtxt* htmlCreateFileParserCtxt(
char* filename, char* encoding) nogil
cdef xmlParserCtxt* htmlCreatePushParserCtxt(xmlSAXHandler* sax,
void* user_data,
char* chunk, int size,
char* filename, int enc) nogil
cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt) nogil
cdef void htmlCtxtReset(xmlParserCtxt* ctxt) nogil
cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) nogil
cdef int htmlParseDocument(xmlParserCtxt* ctxt) nogil
cdef int htmlParseChunk(xmlParserCtxt* ctxt,
char* chunk, int size, int terminate) nogil
cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt,
char* filename, const_char* encoding,
int options) nogil
cdef xmlDoc* htmlCtxtReadDoc(xmlParserCtxt* ctxt,
char* buffer, char* URL, const_char* encoding,
int options) nogil
cdef xmlDoc* htmlCtxtReadIO(xmlParserCtxt* ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void* ioctx,
char* URL, const_char* encoding,
int options) nogil
cdef xmlDoc* htmlCtxtReadMemory(xmlParserCtxt* ctxt,
char* buffer, int size,
char* filename, const_char* encoding,
int options) nogil

View file

@@ -0,0 +1,3 @@
#ifndef LXML_VERSION_STRING
#define LXML_VERSION_STRING "3.5.0"
#endif

View file

@@ -0,0 +1,64 @@
from lxml.includes.tree cimport xmlDoc
from lxml.includes.xmlerror cimport xmlStructuredErrorFunc
cdef extern from "libxml/relaxng.h":
ctypedef struct xmlRelaxNG
ctypedef struct xmlRelaxNGParserCtxt
ctypedef struct xmlRelaxNGValidCtxt
ctypedef enum xmlRelaxNGValidErr:
XML_RELAXNG_OK = 0
XML_RELAXNG_ERR_MEMORY = 1
XML_RELAXNG_ERR_TYPE = 2
XML_RELAXNG_ERR_TYPEVAL = 3
XML_RELAXNG_ERR_DUPID = 4
XML_RELAXNG_ERR_TYPECMP = 5
XML_RELAXNG_ERR_NOSTATE = 6
XML_RELAXNG_ERR_NODEFINE = 7
XML_RELAXNG_ERR_LISTEXTRA = 8
XML_RELAXNG_ERR_LISTEMPTY = 9
XML_RELAXNG_ERR_INTERNODATA = 10
XML_RELAXNG_ERR_INTERSEQ = 11
XML_RELAXNG_ERR_INTEREXTRA = 12
XML_RELAXNG_ERR_ELEMNAME = 13
XML_RELAXNG_ERR_ATTRNAME = 14
XML_RELAXNG_ERR_ELEMNONS = 15
XML_RELAXNG_ERR_ATTRNONS = 16
XML_RELAXNG_ERR_ELEMWRONGNS = 17
XML_RELAXNG_ERR_ATTRWRONGNS = 18
XML_RELAXNG_ERR_ELEMEXTRANS = 19
XML_RELAXNG_ERR_ATTREXTRANS = 20
XML_RELAXNG_ERR_ELEMNOTEMPTY = 21
XML_RELAXNG_ERR_NOELEM = 22
XML_RELAXNG_ERR_NOTELEM = 23
XML_RELAXNG_ERR_ATTRVALID = 24
XML_RELAXNG_ERR_CONTENTVALID = 25
XML_RELAXNG_ERR_EXTRACONTENT = 26
XML_RELAXNG_ERR_INVALIDATTR = 27
XML_RELAXNG_ERR_DATAELEM = 28
XML_RELAXNG_ERR_VALELEM = 29
XML_RELAXNG_ERR_LISTELEM = 30
XML_RELAXNG_ERR_DATATYPE = 31
XML_RELAXNG_ERR_VALUE = 32
XML_RELAXNG_ERR_LIST = 33
XML_RELAXNG_ERR_NOGRAMMAR = 34
XML_RELAXNG_ERR_EXTRADATA = 35
XML_RELAXNG_ERR_LACKDATA = 36
XML_RELAXNG_ERR_INTERNAL = 37
XML_RELAXNG_ERR_ELEMWRONG = 38
XML_RELAXNG_ERR_TEXTWRONG = 39
cdef xmlRelaxNGValidCtxt* xmlRelaxNGNewValidCtxt(xmlRelaxNG* schema) nogil
cdef int xmlRelaxNGValidateDoc(xmlRelaxNGValidCtxt* ctxt, xmlDoc* doc) nogil
cdef xmlRelaxNG* xmlRelaxNGParse(xmlRelaxNGParserCtxt* ctxt) nogil
cdef xmlRelaxNGParserCtxt* xmlRelaxNGNewParserCtxt(char* URL) nogil
cdef xmlRelaxNGParserCtxt* xmlRelaxNGNewDocParserCtxt(xmlDoc* doc) nogil
cdef void xmlRelaxNGFree(xmlRelaxNG* schema) nogil
cdef void xmlRelaxNGFreeParserCtxt(xmlRelaxNGParserCtxt* ctxt) nogil
cdef void xmlRelaxNGFreeValidCtxt(xmlRelaxNGValidCtxt* ctxt) nogil
cdef void xmlRelaxNGSetValidStructuredErrors(
xmlRelaxNGValidCtxt* ctxt, xmlStructuredErrorFunc serror, void *ctx) nogil
cdef void xmlRelaxNGSetParserStructuredErrors(
xmlRelaxNGParserCtxt* ctxt, xmlStructuredErrorFunc serror, void *ctx) nogil

View file

@@ -0,0 +1,34 @@
from lxml.includes cimport xmlerror
from lxml.includes.tree cimport xmlDoc
cdef extern from "libxml/schematron.h":
ctypedef struct xmlSchematron
ctypedef struct xmlSchematronParserCtxt
ctypedef struct xmlSchematronValidCtxt
ctypedef enum xmlSchematronValidOptions:
XML_SCHEMATRON_OUT_QUIET = 1 # quiet no report
XML_SCHEMATRON_OUT_TEXT = 2 # build a textual report
XML_SCHEMATRON_OUT_XML = 4 # output SVRL
XML_SCHEMATRON_OUT_ERROR = 8 # output via xmlStructuredErrorFunc
XML_SCHEMATRON_OUT_FILE = 256 # output to a file descriptor
XML_SCHEMATRON_OUT_BUFFER = 512 # output to a buffer
XML_SCHEMATRON_OUT_IO = 1024 # output to I/O mechanism
cdef xmlSchematronParserCtxt* xmlSchematronNewDocParserCtxt(
xmlDoc* doc) nogil
cdef xmlSchematronParserCtxt* xmlSchematronNewParserCtxt(
char* filename) nogil
cdef xmlSchematronValidCtxt* xmlSchematronNewValidCtxt(
xmlSchematron* schema, int options) nogil
cdef xmlSchematron* xmlSchematronParse(xmlSchematronParserCtxt* ctxt) nogil
cdef int xmlSchematronValidateDoc(xmlSchematronValidCtxt* ctxt,
xmlDoc* instance) nogil
cdef void xmlSchematronFreeParserCtxt(xmlSchematronParserCtxt* ctxt) nogil
cdef void xmlSchematronFreeValidCtxt(xmlSchematronValidCtxt* ctxt) nogil
cdef void xmlSchematronFree(xmlSchematron* schema) nogil
cdef void xmlSchematronSetValidStructuredErrors(
xmlSchematronValidCtxt* ctxt,
xmlerror.xmlStructuredErrorFunc error_func, void *data)

View file

@@ -0,0 +1,474 @@
from libc cimport stdio
from libc.string cimport const_char, const_uchar
cdef extern from "lxml-version.h":
# deprecated declaration, use etreepublic.pxd instead
cdef char* LXML_VERSION_STRING
cdef extern from "libxml/xmlversion.h":
cdef const_char* xmlParserVersion
cdef int LIBXML_VERSION
cdef extern from "libxml/xmlstring.h":
ctypedef unsigned char xmlChar
ctypedef const xmlChar const_xmlChar "const xmlChar"
cdef int xmlStrlen(const_xmlChar* str) nogil
cdef xmlChar* xmlStrdup(const_xmlChar* cur) nogil
cdef int xmlStrncmp(const_xmlChar* str1, const_xmlChar* str2, int length) nogil
cdef int xmlStrcmp(const_xmlChar* str1, const_xmlChar* str2) nogil
cdef int xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) nogil
cdef const_xmlChar* xmlStrstr(const_xmlChar* str1, const_xmlChar* str2) nogil
cdef const_xmlChar* xmlStrchr(const_xmlChar* str1, xmlChar ch) nogil
cdef const_xmlChar* _xcstr "(const xmlChar*)PyBytes_AS_STRING" (object s)
cdef extern from "libxml/encoding.h":
ctypedef enum xmlCharEncoding:
XML_CHAR_ENCODING_ERROR = -1 # No char encoding detected
XML_CHAR_ENCODING_NONE = 0 # No char encoding detected
XML_CHAR_ENCODING_UTF8 = 1 # UTF-8
XML_CHAR_ENCODING_UTF16LE = 2 # UTF-16 little endian
XML_CHAR_ENCODING_UTF16BE = 3 # UTF-16 big endian
XML_CHAR_ENCODING_UCS4LE = 4 # UCS-4 little endian
XML_CHAR_ENCODING_UCS4BE = 5 # UCS-4 big endian
XML_CHAR_ENCODING_EBCDIC = 6 # EBCDIC uh!
XML_CHAR_ENCODING_UCS4_2143 = 7 # UCS-4 unusual ordering
XML_CHAR_ENCODING_UCS4_3412 = 8 # UCS-4 unusual ordering
XML_CHAR_ENCODING_UCS2 = 9 # UCS-2
XML_CHAR_ENCODING_8859_1 = 10 # ISO-8859-1 ISO Latin 1
XML_CHAR_ENCODING_8859_2 = 11 # ISO-8859-2 ISO Latin 2
XML_CHAR_ENCODING_8859_3 = 12 # ISO-8859-3
XML_CHAR_ENCODING_8859_4 = 13 # ISO-8859-4
XML_CHAR_ENCODING_8859_5 = 14 # ISO-8859-5
XML_CHAR_ENCODING_8859_6 = 15 # ISO-8859-6
XML_CHAR_ENCODING_8859_7 = 16 # ISO-8859-7
XML_CHAR_ENCODING_8859_8 = 17 # ISO-8859-8
XML_CHAR_ENCODING_8859_9 = 18 # ISO-8859-9
XML_CHAR_ENCODING_2022_JP = 19 # ISO-2022-JP
XML_CHAR_ENCODING_SHIFT_JIS = 20 # Shift_JIS
XML_CHAR_ENCODING_EUC_JP = 21 # EUC-JP
XML_CHAR_ENCODING_ASCII = 22 # pure ASCII
ctypedef struct xmlCharEncodingHandler
cdef xmlCharEncodingHandler* xmlFindCharEncodingHandler(char* name) nogil
cdef xmlCharEncodingHandler* xmlGetCharEncodingHandler(
xmlCharEncoding enc) nogil
cdef int xmlCharEncCloseFunc(xmlCharEncodingHandler* handler) nogil
cdef xmlCharEncoding xmlDetectCharEncoding(const_xmlChar* text, int len) nogil
cdef const_char* xmlGetCharEncodingName(xmlCharEncoding enc) nogil
cdef xmlCharEncoding xmlParseCharEncoding(char* name) nogil
ctypedef int (*xmlCharEncodingOutputFunc)(
unsigned char *out_buf, int *outlen, const_uchar *in_buf, int *inlen)
cdef extern from "libxml/chvalid.h":
cdef int xmlIsChar_ch(char c) nogil
cdef extern from "libxml/hash.h":
ctypedef struct xmlHashTable
ctypedef void (*xmlHashScanner)(void* payload, void* data, const_xmlChar* name) # may require GIL!
void xmlHashScan(xmlHashTable* table, xmlHashScanner f, void* data) nogil
void* xmlHashLookup(xmlHashTable* table, const_xmlChar* name) nogil
ctypedef void (*xmlHashDeallocator)(void *payload, xmlChar *name)
cdef xmlHashTable* xmlHashCreate(int size)
cdef xmlHashTable* xmlHashCreateDict(int size, xmlDict *dict)
cdef int xmlHashSize(xmlHashTable* table)
cdef void xmlHashFree(xmlHashTable* table, xmlHashDeallocator f)
cdef extern from *: # actually "libxml/dict.h"
# libxml/dict.h appears to be broken to include in C
ctypedef struct xmlDict
cdef const_xmlChar* xmlDictLookup(xmlDict* dict, const_xmlChar* name, int len) nogil
cdef const_xmlChar* xmlDictExists(xmlDict* dict, const_xmlChar* name, int len) nogil
cdef int xmlDictOwns(xmlDict* dict, const_xmlChar* name) nogil
cdef size_t xmlDictSize(xmlDict* dict) nogil
cdef extern from "libxml/tree.h":
ctypedef struct xmlDoc
ctypedef struct xmlAttr
ctypedef struct xmlNotationTable
ctypedef enum xmlElementType:
XML_ELEMENT_NODE= 1
XML_ATTRIBUTE_NODE= 2
XML_TEXT_NODE= 3
XML_CDATA_SECTION_NODE= 4
XML_ENTITY_REF_NODE= 5
XML_ENTITY_NODE= 6
XML_PI_NODE= 7
XML_COMMENT_NODE= 8
XML_DOCUMENT_NODE= 9
XML_DOCUMENT_TYPE_NODE= 10
XML_DOCUMENT_FRAG_NODE= 11
XML_NOTATION_NODE= 12
XML_HTML_DOCUMENT_NODE= 13
XML_DTD_NODE= 14
XML_ELEMENT_DECL= 15
XML_ATTRIBUTE_DECL= 16
XML_ENTITY_DECL= 17
XML_NAMESPACE_DECL= 18
XML_XINCLUDE_START= 19
XML_XINCLUDE_END= 20
ctypedef enum xmlElementTypeVal:
XML_ELEMENT_TYPE_UNDEFINED= 0
XML_ELEMENT_TYPE_EMPTY= 1
XML_ELEMENT_TYPE_ANY= 2
XML_ELEMENT_TYPE_MIXED= 3
XML_ELEMENT_TYPE_ELEMENT= 4
ctypedef enum xmlElementContentType:
XML_ELEMENT_CONTENT_PCDATA= 1
XML_ELEMENT_CONTENT_ELEMENT= 2
XML_ELEMENT_CONTENT_SEQ= 3
XML_ELEMENT_CONTENT_OR= 4
ctypedef enum xmlElementContentOccur:
XML_ELEMENT_CONTENT_ONCE= 1
XML_ELEMENT_CONTENT_OPT= 2
XML_ELEMENT_CONTENT_MULT= 3
XML_ELEMENT_CONTENT_PLUS= 4
ctypedef enum xmlAttributeType:
XML_ATTRIBUTE_CDATA = 1
XML_ATTRIBUTE_ID= 2
XML_ATTRIBUTE_IDREF= 3
XML_ATTRIBUTE_IDREFS= 4
XML_ATTRIBUTE_ENTITY= 5
XML_ATTRIBUTE_ENTITIES= 6
XML_ATTRIBUTE_NMTOKEN= 7
XML_ATTRIBUTE_NMTOKENS= 8
XML_ATTRIBUTE_ENUMERATION= 9
XML_ATTRIBUTE_NOTATION= 10
ctypedef enum xmlAttributeDefault:
XML_ATTRIBUTE_NONE= 1
XML_ATTRIBUTE_REQUIRED= 2
XML_ATTRIBUTE_IMPLIED= 3
XML_ATTRIBUTE_FIXED= 4
ctypedef enum xmlEntityType:
XML_INTERNAL_GENERAL_ENTITY= 1
XML_EXTERNAL_GENERAL_PARSED_ENTITY= 2
XML_EXTERNAL_GENERAL_UNPARSED_ENTITY= 3
XML_INTERNAL_PARAMETER_ENTITY= 4
XML_EXTERNAL_PARAMETER_ENTITY= 5
XML_INTERNAL_PREDEFINED_ENTITY= 6
ctypedef struct xmlNs:
const_xmlChar* href
const_xmlChar* prefix
xmlNs* next
ctypedef struct xmlNode:
void* _private
xmlElementType type
const_xmlChar* name
xmlNode* children
xmlNode* last
xmlNode* parent
xmlNode* next
xmlNode* prev
xmlDoc* doc
xmlChar* content
xmlAttr* properties
xmlNs* ns
xmlNs* nsDef
unsigned short line
ctypedef struct xmlElementContent:
xmlElementContentType type
xmlElementContentOccur ocur
const_xmlChar *name
xmlElementContent *c1
xmlElementContent *c2
xmlElementContent *parent
const_xmlChar *prefix
ctypedef struct xmlEnumeration:
xmlEnumeration *next
const_xmlChar *name
ctypedef struct xmlAttribute:
void* _private
xmlElementType type
const_xmlChar* name
xmlNode* children
xmlNode* last
xmlDtd* parent
xmlNode* next
xmlNode* prev
xmlDoc* doc
xmlAttribute* nexth
xmlAttributeType atype
xmlAttributeDefault def_ "def"
const_xmlChar* defaultValue
xmlEnumeration* tree
const_xmlChar* prefix
const_xmlChar* elem
ctypedef struct xmlElement:
void* _private
xmlElementType type
const_xmlChar* name
xmlNode* children
xmlNode* last
xmlNode* parent
xmlNode* next
xmlNode* prev
xmlDoc* doc
xmlElementTypeVal etype
xmlElementContent* content
xmlAttribute* attributes
const_xmlChar* prefix
void *contModel
ctypedef struct xmlEntity:
void* _private
xmlElementType type
const_xmlChar* name
xmlNode* children
xmlNode* last
xmlDtd* parent
xmlNode* next
xmlNode* prev
xmlDoc* doc
xmlChar* orig
xmlChar* content
int length
xmlEntityType etype
const_xmlChar* ExternalID
const_xmlChar* SystemID
xmlEntity* nexte
const_xmlChar* URI
int owner
int checked
ctypedef struct xmlDtd:
const_xmlChar* name
const_xmlChar* ExternalID
const_xmlChar* SystemID
void* notations
void* entities
void* pentities
void* attributes
void* elements
xmlNode* children
xmlNode* last
xmlDoc* doc
ctypedef struct xmlDoc:
xmlElementType type
char* name
xmlNode* children
xmlNode* last
xmlNode* parent
xmlNode* next
xmlNode* prev
xmlDoc* doc
xmlDict* dict
xmlHashTable* ids
int standalone
const_xmlChar* version
const_xmlChar* encoding
const_xmlChar* URL
void* _private
xmlDtd* intSubset
xmlDtd* extSubset
ctypedef struct xmlAttr:
void* _private
xmlElementType type
const_xmlChar* name
xmlNode* children
xmlNode* last
xmlNode* parent
xmlAttr* next
xmlAttr* prev
xmlDoc* doc
xmlNs* ns
ctypedef struct xmlID:
const_xmlChar* value
const_xmlChar* name
xmlAttr* attr
xmlDoc* doc
ctypedef struct xmlBuffer
ctypedef struct xmlBuf # new in libxml2 2.9
ctypedef struct xmlOutputBuffer:
xmlBuf* buffer
xmlBuf* conv
int error
const_xmlChar* XML_XML_NAMESPACE
cdef void xmlFreeDoc(xmlDoc* cur) nogil
cdef void xmlFreeDtd(xmlDtd* cur) nogil
cdef void xmlFreeNode(xmlNode* cur) nogil
cdef void xmlFreeNsList(xmlNs* ns) nogil
cdef void xmlFreeNs(xmlNs* ns) nogil
cdef void xmlFree(void* buf) nogil
cdef xmlNode* xmlNewNode(xmlNs* ns, const_xmlChar* name) nogil
cdef xmlNode* xmlNewDocText(xmlDoc* doc, const_xmlChar* content) nogil
cdef xmlNode* xmlNewDocComment(xmlDoc* doc, const_xmlChar* content) nogil
cdef xmlNode* xmlNewDocPI(xmlDoc* doc, const_xmlChar* name, const_xmlChar* content) nogil
cdef xmlNode* xmlNewReference(xmlDoc* doc, const_xmlChar* name) nogil
cdef xmlNode* xmlNewCDataBlock(xmlDoc* doc, const_xmlChar* text, int len) nogil
cdef xmlNs* xmlNewNs(xmlNode* node, const_xmlChar* href, const_xmlChar* prefix) nogil
cdef xmlNode* xmlAddChild(xmlNode* parent, xmlNode* cur) nogil
cdef xmlNode* xmlReplaceNode(xmlNode* old, xmlNode* cur) nogil
cdef xmlNode* xmlAddPrevSibling(xmlNode* cur, xmlNode* elem) nogil
cdef xmlNode* xmlAddNextSibling(xmlNode* cur, xmlNode* elem) nogil
cdef xmlNode* xmlNewDocNode(xmlDoc* doc, xmlNs* ns,
const_xmlChar* name, const_xmlChar* content) nogil
cdef xmlDoc* xmlNewDoc(const_xmlChar* version) nogil
cdef xmlAttr* xmlNewProp(xmlNode* node, const_xmlChar* name, const_xmlChar* value) nogil
cdef xmlAttr* xmlNewNsProp(xmlNode* node, xmlNs* ns,
const_xmlChar* name, const_xmlChar* value) nogil
cdef xmlChar* xmlGetNoNsProp(xmlNode* node, const_xmlChar* name) nogil
cdef xmlChar* xmlGetNsProp(xmlNode* node, const_xmlChar* name, const_xmlChar* nameSpace) nogil
cdef void xmlSetNs(xmlNode* node, xmlNs* ns) nogil
cdef xmlAttr* xmlSetProp(xmlNode* node, const_xmlChar* name, const_xmlChar* value) nogil
cdef xmlAttr* xmlSetNsProp(xmlNode* node, xmlNs* ns,
const_xmlChar* name, const_xmlChar* value) nogil
cdef int xmlRemoveProp(xmlAttr* cur) nogil
cdef xmlChar* xmlGetNodePath(xmlNode* node) nogil
cdef void xmlDocDumpMemory(xmlDoc* cur, char** mem, int* size) nogil
cdef void xmlDocDumpMemoryEnc(xmlDoc* cur, char** mem, int* size,
char* encoding) nogil
cdef int xmlSaveFileTo(xmlOutputBuffer* out, xmlDoc* cur,
char* encoding) nogil
cdef void xmlUnlinkNode(xmlNode* cur) nogil
cdef xmlNode* xmlDocSetRootElement(xmlDoc* doc, xmlNode* root) nogil
cdef xmlNode* xmlDocGetRootElement(xmlDoc* doc) nogil
cdef void xmlSetTreeDoc(xmlNode* tree, xmlDoc* doc) nogil
cdef xmlAttr* xmlHasProp(xmlNode* node, const_xmlChar* name) nogil
cdef xmlAttr* xmlHasNsProp(xmlNode* node, const_xmlChar* name, const_xmlChar* nameSpace) nogil
cdef xmlChar* xmlNodeGetContent(xmlNode* cur) nogil
cdef int xmlNodeBufGetContent(xmlBuffer* buffer, xmlNode* cur) nogil
cdef xmlNs* xmlSearchNs(xmlDoc* doc, xmlNode* node, const_xmlChar* prefix) nogil
cdef xmlNs* xmlSearchNsByHref(xmlDoc* doc, xmlNode* node, const_xmlChar* href) nogil
cdef int xmlIsBlankNode(xmlNode* node) nogil
cdef long xmlGetLineNo(xmlNode* node) nogil
cdef void xmlElemDump(stdio.FILE* f, xmlDoc* doc, xmlNode* cur) nogil
cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf,
xmlDoc* doc, xmlNode* cur, int level,
int format, const_char* encoding) nogil
cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name) nogil
cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content) nogil
cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd) nogil
cdef xmlDoc* xmlCopyDoc(xmlDoc* doc, int recursive) nogil
cdef xmlNode* xmlCopyNode(xmlNode* node, int extended) nogil
cdef xmlNode* xmlDocCopyNode(xmlNode* node, xmlDoc* doc, int extended) nogil
cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree) nogil
cdef xmlNs* xmlNewReconciliedNs(xmlDoc* doc, xmlNode* tree, xmlNs* ns) nogil
cdef xmlBuffer* xmlBufferCreate() nogil
cdef void xmlBufferWriteChar(xmlBuffer* buf, char* string) nogil
cdef void xmlBufferFree(xmlBuffer* buf) nogil
cdef const_xmlChar* xmlBufferContent(xmlBuffer* buf) nogil
cdef int xmlBufferLength(xmlBuffer* buf) nogil
cdef const_xmlChar* xmlBufContent(xmlBuf* buf) nogil # new in libxml2 2.9
cdef size_t xmlBufUse(xmlBuf* buf) nogil # new in libxml2 2.9
cdef int xmlKeepBlanksDefault(int val) nogil
cdef xmlChar* xmlNodeGetBase(xmlDoc* doc, xmlNode* node) nogil
cdef xmlDtd* xmlCreateIntSubset(xmlDoc* doc, const_xmlChar* name,
const_xmlChar* ExternalID, const_xmlChar* SystemID) nogil
cdef void xmlNodeSetBase(xmlNode* node, const_xmlChar* uri) nogil
cdef int xmlValidateNCName(const_xmlChar* value, int space) nogil
cdef extern from "libxml/uri.h":
cdef const_xmlChar* xmlBuildURI(const_xmlChar* href, const_xmlChar* base) nogil
cdef extern from "libxml/HTMLtree.h":
cdef void htmlNodeDumpFormatOutput(xmlOutputBuffer* buf,
xmlDoc* doc, xmlNode* cur,
char* encoding, int format) nogil
cdef xmlDoc* htmlNewDoc(const_xmlChar* uri, const_xmlChar* externalID) nogil
cdef extern from "libxml/valid.h":
cdef xmlAttr* xmlGetID(xmlDoc* doc, const_xmlChar* ID) nogil
cdef void xmlDumpNotationTable(xmlBuffer* buffer,
xmlNotationTable* table) nogil
cdef int xmlValidateNameValue(const_xmlChar* value) nogil
cdef extern from "libxml/xmlIO.h":
cdef int xmlOutputBufferWrite(xmlOutputBuffer* out,
int len, const_char* str) nogil
cdef int xmlOutputBufferWriteString(xmlOutputBuffer* out, const_char* str) nogil
cdef int xmlOutputBufferWriteEscape(xmlOutputBuffer* out,
const_xmlChar* str,
xmlCharEncodingOutputFunc escapefunc) nogil
cdef int xmlOutputBufferFlush(xmlOutputBuffer* out) nogil
cdef int xmlOutputBufferClose(xmlOutputBuffer* out) nogil
ctypedef int (*xmlInputReadCallback)(void* context,
char* buffer, int len)
ctypedef int (*xmlInputCloseCallback)(void* context)
ctypedef int (*xmlOutputWriteCallback)(void* context,
char* buffer, int len)
ctypedef int (*xmlOutputCloseCallback)(void* context)
cdef xmlOutputBuffer* xmlAllocOutputBuffer(
xmlCharEncodingHandler* encoder) nogil
cdef xmlOutputBuffer* xmlOutputBufferCreateIO(
xmlOutputWriteCallback iowrite,
xmlOutputCloseCallback ioclose,
void * ioctx,
xmlCharEncodingHandler* encoder) nogil
cdef xmlOutputBuffer* xmlOutputBufferCreateFile(
stdio.FILE* file, xmlCharEncodingHandler* encoder) nogil
cdef xmlOutputBuffer* xmlOutputBufferCreateFilename(
char* URI, xmlCharEncodingHandler* encoder, int compression) nogil
cdef extern from "libxml/xmlsave.h":
ctypedef struct xmlSaveCtxt
ctypedef enum xmlSaveOption:
XML_SAVE_FORMAT = 1 # format save output (2.6.17)
XML_SAVE_NO_DECL = 2 # drop the xml declaration (2.6.21)
XML_SAVE_NO_EMPTY = 4 # no empty tags (2.6.22)
XML_SAVE_NO_XHTML = 8 # disable XHTML1 specific rules (2.6.22)
XML_SAVE_XHTML = 16 # force XHTML1 specific rules (2.7.2)
XML_SAVE_AS_XML = 32 # force XML serialization on HTML doc (2.7.2)
XML_SAVE_AS_HTML = 64 # force HTML serialization on XML doc (2.7.2)
cdef xmlSaveCtxt* xmlSaveToFilename(char* filename, char* encoding,
int options) nogil
cdef xmlSaveCtxt* xmlSaveToBuffer(xmlBuffer* buffer, char* encoding,
int options) nogil # libxml2 2.6.23
cdef long xmlSaveDoc(xmlSaveCtxt* ctxt, xmlDoc* doc) nogil
cdef long xmlSaveTree(xmlSaveCtxt* ctxt, xmlNode* node) nogil
cdef int xmlSaveClose(xmlSaveCtxt* ctxt) nogil
cdef int xmlSaveFlush(xmlSaveCtxt* ctxt) nogil
cdef int xmlSaveSetAttrEscape(xmlSaveCtxt* ctxt, void* escape_func) nogil
cdef int xmlSaveSetEscape(xmlSaveCtxt* ctxt, void* escape_func) nogil
cdef extern from "libxml/globals.h":
cdef int xmlThrDefKeepBlanksDefaultValue(int onoff) nogil
cdef int xmlThrDefLineNumbersDefaultValue(int onoff) nogil
cdef int xmlThrDefIndentTreeOutput(int onoff) nogil
cdef extern from "libxml/xmlmemory.h" nogil:
cdef void* xmlMalloc(size_t size)
cdef int xmlMemBlocks()
cdef int xmlMemUsed()
cdef void xmlMemDisplay(stdio.FILE* file)
cdef void xmlMemDisplayLast(stdio.FILE* file, long num_bytes)
cdef void xmlMemShow(stdio.FILE* file, int count)
cdef extern from "etree_defs.h":
cdef bint _isElement(xmlNode* node) nogil
cdef bint _isElementOrXInclude(xmlNode* node) nogil
cdef const_xmlChar* _getNs(xmlNode* node) nogil
cdef void BEGIN_FOR_EACH_ELEMENT_FROM(xmlNode* tree_top,
xmlNode* start_node,
bint inclusive) nogil
cdef void END_FOR_EACH_ELEMENT_FROM(xmlNode* start_node) nogil
cdef void BEGIN_FOR_EACH_FROM(xmlNode* tree_top,
xmlNode* start_node,
bint inclusive) nogil
cdef void END_FOR_EACH_FROM(xmlNode* start_node) nogil

View file

@@ -0,0 +1,5 @@
cdef extern from "libxml/uri.h":
ctypedef struct xmlURI
cdef xmlURI* xmlParseURI(char* str)
cdef void xmlFreeURI(xmlURI* uri)

View file

@@ -0,0 +1,22 @@
from lxml.includes.tree cimport xmlDoc, xmlNode
cdef extern from "libxml/xinclude.h":
ctypedef struct xmlXIncludeCtxt
cdef int xmlXIncludeProcess(xmlDoc* doc) nogil
cdef int xmlXIncludeProcessFlags(xmlDoc* doc, int parser_opts) nogil
cdef int xmlXIncludeProcessTree(xmlNode* doc) nogil
cdef int xmlXIncludeProcessTreeFlags(xmlNode* doc, int parser_opts) nogil
# libxml2 >= 2.7.4
cdef int xmlXIncludeProcessTreeFlagsData(
xmlNode* doc, int parser_opts, void* data) nogil
cdef xmlXIncludeCtxt* xmlXIncludeNewContext(xmlDoc* doc) nogil
cdef int xmlXIncludeProcessNode(xmlXIncludeCtxt* ctxt, xmlNode* node) nogil
cdef int xmlXIncludeSetFlags(xmlXIncludeCtxt* ctxt, int flags) nogil
# libxml2 >= 2.6.27
cdef int xmlXIncludeProcessFlagsData(
xmlDoc* doc, int flags, void* data) nogil

View file

@@ -0,0 +1,850 @@
# --- BEGIN: GENERATED CONSTANTS ---
# This section is generated by the script 'update-error-constants.py'.
cdef extern from "libxml/xmlerror.h":
ctypedef enum xmlErrorLevel:
XML_ERR_NONE = 0
XML_ERR_WARNING = 1 # A simple warning
XML_ERR_ERROR = 2 # A recoverable error
XML_ERR_FATAL = 3 # A fatal error
ctypedef enum xmlErrorDomain:
XML_FROM_NONE = 0
XML_FROM_PARSER = 1 # The XML parser
XML_FROM_TREE = 2 # The tree module
XML_FROM_NAMESPACE = 3 # The XML Namespace module
XML_FROM_DTD = 4 # The XML DTD validation with parser context
XML_FROM_HTML = 5 # The HTML parser
XML_FROM_MEMORY = 6 # The memory allocator
XML_FROM_OUTPUT = 7 # The serialization code
XML_FROM_IO = 8 # The Input/Output stack
XML_FROM_FTP = 9 # The FTP module
XML_FROM_HTTP = 10 # The HTTP module
XML_FROM_XINCLUDE = 11 # The XInclude processing
XML_FROM_XPATH = 12 # The XPath module
XML_FROM_XPOINTER = 13 # The XPointer module
XML_FROM_REGEXP = 14 # The regular expressions module
XML_FROM_DATATYPE = 15 # The W3C XML Schemas Datatype module
XML_FROM_SCHEMASP = 16 # The W3C XML Schemas parser module
XML_FROM_SCHEMASV = 17 # The W3C XML Schemas validation module
XML_FROM_RELAXNGP = 18 # The Relax-NG parser module
XML_FROM_RELAXNGV = 19 # The Relax-NG validator module
XML_FROM_CATALOG = 20 # The Catalog module
XML_FROM_C14N = 21 # The Canonicalization module
XML_FROM_XSLT = 22 # The XSLT engine from libxslt
XML_FROM_VALID = 23 # The XML DTD validation with valid context
XML_FROM_CHECK = 24 # The error checking module
XML_FROM_WRITER = 25 # The xmlwriter module
XML_FROM_MODULE = 26 # The dynamically loaded module module
XML_FROM_I18N = 27 # The module handling character conversion
XML_FROM_SCHEMATRONV = 28 # The Schematron validator module
XML_FROM_BUFFER = 29 # The buffers module
XML_FROM_URI = 30 # The URI module
ctypedef enum xmlParserErrors:
XML_ERR_OK = 0
XML_ERR_INTERNAL_ERROR = 1
XML_ERR_NO_MEMORY = 2
XML_ERR_DOCUMENT_START = 3
XML_ERR_DOCUMENT_EMPTY = 4
XML_ERR_DOCUMENT_END = 5
XML_ERR_INVALID_HEX_CHARREF = 6
XML_ERR_INVALID_DEC_CHARREF = 7
XML_ERR_INVALID_CHARREF = 8
XML_ERR_INVALID_CHAR = 9
XML_ERR_CHARREF_AT_EOF = 10
XML_ERR_CHARREF_IN_PROLOG = 11
XML_ERR_CHARREF_IN_EPILOG = 12
XML_ERR_CHARREF_IN_DTD = 13
XML_ERR_ENTITYREF_AT_EOF = 14
XML_ERR_ENTITYREF_IN_PROLOG = 15
XML_ERR_ENTITYREF_IN_EPILOG = 16
XML_ERR_ENTITYREF_IN_DTD = 17
XML_ERR_PEREF_AT_EOF = 18
XML_ERR_PEREF_IN_PROLOG = 19
XML_ERR_PEREF_IN_EPILOG = 20
XML_ERR_PEREF_IN_INT_SUBSET = 21
XML_ERR_ENTITYREF_NO_NAME = 22
XML_ERR_ENTITYREF_SEMICOL_MISSING = 23
XML_ERR_PEREF_NO_NAME = 24
XML_ERR_PEREF_SEMICOL_MISSING = 25
XML_ERR_UNDECLARED_ENTITY = 26
XML_WAR_UNDECLARED_ENTITY = 27
XML_ERR_UNPARSED_ENTITY = 28
XML_ERR_ENTITY_IS_EXTERNAL = 29
XML_ERR_ENTITY_IS_PARAMETER = 30
XML_ERR_UNKNOWN_ENCODING = 31
XML_ERR_UNSUPPORTED_ENCODING = 32
XML_ERR_STRING_NOT_STARTED = 33
XML_ERR_STRING_NOT_CLOSED = 34
XML_ERR_NS_DECL_ERROR = 35
XML_ERR_ENTITY_NOT_STARTED = 36
XML_ERR_ENTITY_NOT_FINISHED = 37
XML_ERR_LT_IN_ATTRIBUTE = 38
XML_ERR_ATTRIBUTE_NOT_STARTED = 39
XML_ERR_ATTRIBUTE_NOT_FINISHED = 40
XML_ERR_ATTRIBUTE_WITHOUT_VALUE = 41
XML_ERR_ATTRIBUTE_REDEFINED = 42
XML_ERR_LITERAL_NOT_STARTED = 43
XML_ERR_LITERAL_NOT_FINISHED = 44
XML_ERR_COMMENT_NOT_FINISHED = 45
XML_ERR_PI_NOT_STARTED = 46
XML_ERR_PI_NOT_FINISHED = 47
XML_ERR_NOTATION_NOT_STARTED = 48
XML_ERR_NOTATION_NOT_FINISHED = 49
XML_ERR_ATTLIST_NOT_STARTED = 50
XML_ERR_ATTLIST_NOT_FINISHED = 51
XML_ERR_MIXED_NOT_STARTED = 52
XML_ERR_MIXED_NOT_FINISHED = 53
XML_ERR_ELEMCONTENT_NOT_STARTED = 54
XML_ERR_ELEMCONTENT_NOT_FINISHED = 55
XML_ERR_XMLDECL_NOT_STARTED = 56
XML_ERR_XMLDECL_NOT_FINISHED = 57
XML_ERR_CONDSEC_NOT_STARTED = 58
XML_ERR_CONDSEC_NOT_FINISHED = 59
XML_ERR_EXT_SUBSET_NOT_FINISHED = 60
XML_ERR_DOCTYPE_NOT_FINISHED = 61
XML_ERR_MISPLACED_CDATA_END = 62
XML_ERR_CDATA_NOT_FINISHED = 63
XML_ERR_RESERVED_XML_NAME = 64
XML_ERR_SPACE_REQUIRED = 65
XML_ERR_SEPARATOR_REQUIRED = 66
XML_ERR_NMTOKEN_REQUIRED = 67
XML_ERR_NAME_REQUIRED = 68
XML_ERR_PCDATA_REQUIRED = 69
XML_ERR_URI_REQUIRED = 70
XML_ERR_PUBID_REQUIRED = 71
XML_ERR_LT_REQUIRED = 72
XML_ERR_GT_REQUIRED = 73
XML_ERR_LTSLASH_REQUIRED = 74
XML_ERR_EQUAL_REQUIRED = 75
XML_ERR_TAG_NAME_MISMATCH = 76
XML_ERR_TAG_NOT_FINISHED = 77
XML_ERR_STANDALONE_VALUE = 78
XML_ERR_ENCODING_NAME = 79
XML_ERR_HYPHEN_IN_COMMENT = 80
XML_ERR_INVALID_ENCODING = 81
XML_ERR_EXT_ENTITY_STANDALONE = 82
XML_ERR_CONDSEC_INVALID = 83
XML_ERR_VALUE_REQUIRED = 84
XML_ERR_NOT_WELL_BALANCED = 85
XML_ERR_EXTRA_CONTENT = 86
XML_ERR_ENTITY_CHAR_ERROR = 87
XML_ERR_ENTITY_PE_INTERNAL = 88
XML_ERR_ENTITY_LOOP = 89
XML_ERR_ENTITY_BOUNDARY = 90
XML_ERR_INVALID_URI = 91
XML_ERR_URI_FRAGMENT = 92
XML_WAR_CATALOG_PI = 93
XML_ERR_NO_DTD = 94
XML_ERR_CONDSEC_INVALID_KEYWORD = 95
XML_ERR_VERSION_MISSING = 96
XML_WAR_UNKNOWN_VERSION = 97
XML_WAR_LANG_VALUE = 98
XML_WAR_NS_URI = 99
XML_WAR_NS_URI_RELATIVE = 100
XML_ERR_MISSING_ENCODING = 101
XML_WAR_SPACE_VALUE = 102
XML_ERR_NOT_STANDALONE = 103
XML_ERR_ENTITY_PROCESSING = 104
XML_ERR_NOTATION_PROCESSING = 105
XML_WAR_NS_COLUMN = 106
XML_WAR_ENTITY_REDEFINED = 107
XML_ERR_UNKNOWN_VERSION = 108
XML_ERR_VERSION_MISMATCH = 109
XML_ERR_NAME_TOO_LONG = 110
XML_ERR_USER_STOP = 111
XML_NS_ERR_XML_NAMESPACE = 200
XML_NS_ERR_UNDEFINED_NAMESPACE = 201
XML_NS_ERR_QNAME = 202
XML_NS_ERR_ATTRIBUTE_REDEFINED = 203
XML_NS_ERR_EMPTY = 204
XML_NS_ERR_COLON = 205
XML_DTD_ATTRIBUTE_DEFAULT = 500
XML_DTD_ATTRIBUTE_REDEFINED = 501
XML_DTD_ATTRIBUTE_VALUE = 502
XML_DTD_CONTENT_ERROR = 503
XML_DTD_CONTENT_MODEL = 504
XML_DTD_CONTENT_NOT_DETERMINIST = 505
XML_DTD_DIFFERENT_PREFIX = 506
XML_DTD_ELEM_DEFAULT_NAMESPACE = 507
XML_DTD_ELEM_NAMESPACE = 508
XML_DTD_ELEM_REDEFINED = 509
XML_DTD_EMPTY_NOTATION = 510
XML_DTD_ENTITY_TYPE = 511
XML_DTD_ID_FIXED = 512
XML_DTD_ID_REDEFINED = 513
XML_DTD_ID_SUBSET = 514
XML_DTD_INVALID_CHILD = 515
XML_DTD_INVALID_DEFAULT = 516
XML_DTD_LOAD_ERROR = 517
XML_DTD_MISSING_ATTRIBUTE = 518
XML_DTD_MIXED_CORRUPT = 519
XML_DTD_MULTIPLE_ID = 520
XML_DTD_NO_DOC = 521
XML_DTD_NO_DTD = 522
XML_DTD_NO_ELEM_NAME = 523
XML_DTD_NO_PREFIX = 524
XML_DTD_NO_ROOT = 525
XML_DTD_NOTATION_REDEFINED = 526
XML_DTD_NOTATION_VALUE = 527
XML_DTD_NOT_EMPTY = 528
XML_DTD_NOT_PCDATA = 529
XML_DTD_NOT_STANDALONE = 530
XML_DTD_ROOT_NAME = 531
XML_DTD_STANDALONE_WHITE_SPACE = 532
XML_DTD_UNKNOWN_ATTRIBUTE = 533
XML_DTD_UNKNOWN_ELEM = 534
XML_DTD_UNKNOWN_ENTITY = 535
XML_DTD_UNKNOWN_ID = 536
XML_DTD_UNKNOWN_NOTATION = 537
XML_DTD_STANDALONE_DEFAULTED = 538
XML_DTD_XMLID_VALUE = 539
XML_DTD_XMLID_TYPE = 540
XML_DTD_DUP_TOKEN = 541
XML_HTML_STRUCURE_ERROR = 800
XML_HTML_UNKNOWN_TAG = 801
XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000
XML_RNGP_ATTR_CONFLICT = 1001
XML_RNGP_ATTRIBUTE_CHILDREN = 1002
XML_RNGP_ATTRIBUTE_CONTENT = 1003
XML_RNGP_ATTRIBUTE_EMPTY = 1004
XML_RNGP_ATTRIBUTE_NOOP = 1005
XML_RNGP_CHOICE_CONTENT = 1006
XML_RNGP_CHOICE_EMPTY = 1007
XML_RNGP_CREATE_FAILURE = 1008
XML_RNGP_DATA_CONTENT = 1009
XML_RNGP_DEF_CHOICE_AND_INTERLEAVE = 1010
XML_RNGP_DEFINE_CREATE_FAILED = 1011
XML_RNGP_DEFINE_EMPTY = 1012
XML_RNGP_DEFINE_MISSING = 1013
XML_RNGP_DEFINE_NAME_MISSING = 1014
XML_RNGP_ELEM_CONTENT_EMPTY = 1015
XML_RNGP_ELEM_CONTENT_ERROR = 1016
XML_RNGP_ELEMENT_EMPTY = 1017
XML_RNGP_ELEMENT_CONTENT = 1018
XML_RNGP_ELEMENT_NAME = 1019
XML_RNGP_ELEMENT_NO_CONTENT = 1020
XML_RNGP_ELEM_TEXT_CONFLICT = 1021
XML_RNGP_EMPTY = 1022
XML_RNGP_EMPTY_CONSTRUCT = 1023
XML_RNGP_EMPTY_CONTENT = 1024
XML_RNGP_EMPTY_NOT_EMPTY = 1025
XML_RNGP_ERROR_TYPE_LIB = 1026
XML_RNGP_EXCEPT_EMPTY = 1027
XML_RNGP_EXCEPT_MISSING = 1028
XML_RNGP_EXCEPT_MULTIPLE = 1029
XML_RNGP_EXCEPT_NO_CONTENT = 1030
XML_RNGP_EXTERNALREF_EMTPY = 1031
XML_RNGP_EXTERNAL_REF_FAILURE = 1032
XML_RNGP_EXTERNALREF_RECURSE = 1033
XML_RNGP_FORBIDDEN_ATTRIBUTE = 1034
XML_RNGP_FOREIGN_ELEMENT = 1035
XML_RNGP_GRAMMAR_CONTENT = 1036
XML_RNGP_GRAMMAR_EMPTY = 1037
XML_RNGP_GRAMMAR_MISSING = 1038
XML_RNGP_GRAMMAR_NO_START = 1039
XML_RNGP_GROUP_ATTR_CONFLICT = 1040
XML_RNGP_HREF_ERROR = 1041
XML_RNGP_INCLUDE_EMPTY = 1042
XML_RNGP_INCLUDE_FAILURE = 1043
XML_RNGP_INCLUDE_RECURSE = 1044
XML_RNGP_INTERLEAVE_ADD = 1045
XML_RNGP_INTERLEAVE_CREATE_FAILED = 1046
XML_RNGP_INTERLEAVE_EMPTY = 1047
XML_RNGP_INTERLEAVE_NO_CONTENT = 1048
XML_RNGP_INVALID_DEFINE_NAME = 1049
XML_RNGP_INVALID_URI = 1050
XML_RNGP_INVALID_VALUE = 1051
XML_RNGP_MISSING_HREF = 1052
XML_RNGP_NAME_MISSING = 1053
XML_RNGP_NEED_COMBINE = 1054
XML_RNGP_NOTALLOWED_NOT_EMPTY = 1055
XML_RNGP_NSNAME_ATTR_ANCESTOR = 1056
XML_RNGP_NSNAME_NO_NS = 1057
XML_RNGP_PARAM_FORBIDDEN = 1058
XML_RNGP_PARAM_NAME_MISSING = 1059
XML_RNGP_PARENTREF_CREATE_FAILED = 1060
XML_RNGP_PARENTREF_NAME_INVALID = 1061
XML_RNGP_PARENTREF_NO_NAME = 1062
XML_RNGP_PARENTREF_NO_PARENT = 1063
XML_RNGP_PARENTREF_NOT_EMPTY = 1064
XML_RNGP_PARSE_ERROR = 1065
XML_RNGP_PAT_ANYNAME_EXCEPT_ANYNAME = 1066
XML_RNGP_PAT_ATTR_ATTR = 1067
XML_RNGP_PAT_ATTR_ELEM = 1068
XML_RNGP_PAT_DATA_EXCEPT_ATTR = 1069
XML_RNGP_PAT_DATA_EXCEPT_ELEM = 1070
XML_RNGP_PAT_DATA_EXCEPT_EMPTY = 1071
XML_RNGP_PAT_DATA_EXCEPT_GROUP = 1072
XML_RNGP_PAT_DATA_EXCEPT_INTERLEAVE = 1073
XML_RNGP_PAT_DATA_EXCEPT_LIST = 1074
XML_RNGP_PAT_DATA_EXCEPT_ONEMORE = 1075
XML_RNGP_PAT_DATA_EXCEPT_REF = 1076
XML_RNGP_PAT_DATA_EXCEPT_TEXT = 1077
XML_RNGP_PAT_LIST_ATTR = 1078
XML_RNGP_PAT_LIST_ELEM = 1079
XML_RNGP_PAT_LIST_INTERLEAVE = 1080
XML_RNGP_PAT_LIST_LIST = 1081
XML_RNGP_PAT_LIST_REF = 1082
XML_RNGP_PAT_LIST_TEXT = 1083
XML_RNGP_PAT_NSNAME_EXCEPT_ANYNAME = 1084
XML_RNGP_PAT_NSNAME_EXCEPT_NSNAME = 1085
XML_RNGP_PAT_ONEMORE_GROUP_ATTR = 1086
XML_RNGP_PAT_ONEMORE_INTERLEAVE_ATTR = 1087
XML_RNGP_PAT_START_ATTR = 1088
XML_RNGP_PAT_START_DATA = 1089
XML_RNGP_PAT_START_EMPTY = 1090
XML_RNGP_PAT_START_GROUP = 1091
XML_RNGP_PAT_START_INTERLEAVE = 1092
XML_RNGP_PAT_START_LIST = 1093
XML_RNGP_PAT_START_ONEMORE = 1094
XML_RNGP_PAT_START_TEXT = 1095
XML_RNGP_PAT_START_VALUE = 1096
XML_RNGP_PREFIX_UNDEFINED = 1097
XML_RNGP_REF_CREATE_FAILED = 1098
XML_RNGP_REF_CYCLE = 1099
XML_RNGP_REF_NAME_INVALID = 1100
XML_RNGP_REF_NO_DEF = 1101
XML_RNGP_REF_NO_NAME = 1102
XML_RNGP_REF_NOT_EMPTY = 1103
XML_RNGP_START_CHOICE_AND_INTERLEAVE = 1104
XML_RNGP_START_CONTENT = 1105
XML_RNGP_START_EMPTY = 1106
XML_RNGP_START_MISSING = 1107
XML_RNGP_TEXT_EXPECTED = 1108
XML_RNGP_TEXT_HAS_CHILD = 1109
XML_RNGP_TYPE_MISSING = 1110
XML_RNGP_TYPE_NOT_FOUND = 1111
XML_RNGP_TYPE_VALUE = 1112
XML_RNGP_UNKNOWN_ATTRIBUTE = 1113
XML_RNGP_UNKNOWN_COMBINE = 1114
XML_RNGP_UNKNOWN_CONSTRUCT = 1115
XML_RNGP_UNKNOWN_TYPE_LIB = 1116
XML_RNGP_URI_FRAGMENT = 1117
XML_RNGP_URI_NOT_ABSOLUTE = 1118
XML_RNGP_VALUE_EMPTY = 1119
XML_RNGP_VALUE_NO_CONTENT = 1120
XML_RNGP_XMLNS_NAME = 1121
XML_RNGP_XML_NS = 1122
XML_XPATH_EXPRESSION_OK = 1200
XML_XPATH_NUMBER_ERROR = 1201
XML_XPATH_UNFINISHED_LITERAL_ERROR = 1202
XML_XPATH_START_LITERAL_ERROR = 1203
XML_XPATH_VARIABLE_REF_ERROR = 1204
XML_XPATH_UNDEF_VARIABLE_ERROR = 1205
XML_XPATH_INVALID_PREDICATE_ERROR = 1206
XML_XPATH_EXPR_ERROR = 1207
XML_XPATH_UNCLOSED_ERROR = 1208
XML_XPATH_UNKNOWN_FUNC_ERROR = 1209
XML_XPATH_INVALID_OPERAND = 1210
XML_XPATH_INVALID_TYPE = 1211
XML_XPATH_INVALID_ARITY = 1212
XML_XPATH_INVALID_CTXT_SIZE = 1213
XML_XPATH_INVALID_CTXT_POSITION = 1214
XML_XPATH_MEMORY_ERROR = 1215
XML_XPTR_SYNTAX_ERROR = 1216
XML_XPTR_RESOURCE_ERROR = 1217
XML_XPTR_SUB_RESOURCE_ERROR = 1218
XML_XPATH_UNDEF_PREFIX_ERROR = 1219
XML_XPATH_ENCODING_ERROR = 1220
XML_XPATH_INVALID_CHAR_ERROR = 1221
XML_TREE_INVALID_HEX = 1300
XML_TREE_INVALID_DEC = 1301
XML_TREE_UNTERMINATED_ENTITY = 1302
XML_TREE_NOT_UTF8 = 1303
XML_SAVE_NOT_UTF8 = 1400
XML_SAVE_CHAR_INVALID = 1401
XML_SAVE_NO_DOCTYPE = 1402
XML_SAVE_UNKNOWN_ENCODING = 1403
XML_REGEXP_COMPILE_ERROR = 1450
XML_IO_UNKNOWN = 1500
XML_IO_EACCES = 1501
XML_IO_EAGAIN = 1502
XML_IO_EBADF = 1503
XML_IO_EBADMSG = 1504
XML_IO_EBUSY = 1505
XML_IO_ECANCELED = 1506
XML_IO_ECHILD = 1507
XML_IO_EDEADLK = 1508
XML_IO_EDOM = 1509
XML_IO_EEXIST = 1510
XML_IO_EFAULT = 1511
XML_IO_EFBIG = 1512
XML_IO_EINPROGRESS = 1513
XML_IO_EINTR = 1514
XML_IO_EINVAL = 1515
XML_IO_EIO = 1516
XML_IO_EISDIR = 1517
XML_IO_EMFILE = 1518
XML_IO_EMLINK = 1519
XML_IO_EMSGSIZE = 1520
XML_IO_ENAMETOOLONG = 1521
XML_IO_ENFILE = 1522
XML_IO_ENODEV = 1523
XML_IO_ENOENT = 1524
XML_IO_ENOEXEC = 1525
XML_IO_ENOLCK = 1526
XML_IO_ENOMEM = 1527
XML_IO_ENOSPC = 1528
XML_IO_ENOSYS = 1529
XML_IO_ENOTDIR = 1530
XML_IO_ENOTEMPTY = 1531
XML_IO_ENOTSUP = 1532
XML_IO_ENOTTY = 1533
XML_IO_ENXIO = 1534
XML_IO_EPERM = 1535
XML_IO_EPIPE = 1536
XML_IO_ERANGE = 1537
XML_IO_EROFS = 1538
XML_IO_ESPIPE = 1539
XML_IO_ESRCH = 1540
XML_IO_ETIMEDOUT = 1541
XML_IO_EXDEV = 1542
XML_IO_NETWORK_ATTEMPT = 1543
XML_IO_ENCODER = 1544
XML_IO_FLUSH = 1545
XML_IO_WRITE = 1546
XML_IO_NO_INPUT = 1547
XML_IO_BUFFER_FULL = 1548
XML_IO_LOAD_ERROR = 1549
XML_IO_ENOTSOCK = 1550
XML_IO_EISCONN = 1551
XML_IO_ECONNREFUSED = 1552
XML_IO_ENETUNREACH = 1553
XML_IO_EADDRINUSE = 1554
XML_IO_EALREADY = 1555
XML_IO_EAFNOSUPPORT = 1556
XML_XINCLUDE_RECURSION = 1600
XML_XINCLUDE_PARSE_VALUE = 1601
XML_XINCLUDE_ENTITY_DEF_MISMATCH = 1602
XML_XINCLUDE_NO_HREF = 1603
XML_XINCLUDE_NO_FALLBACK = 1604
XML_XINCLUDE_HREF_URI = 1605
XML_XINCLUDE_TEXT_FRAGMENT = 1606
XML_XINCLUDE_TEXT_DOCUMENT = 1607
XML_XINCLUDE_INVALID_CHAR = 1608
XML_XINCLUDE_BUILD_FAILED = 1609
XML_XINCLUDE_UNKNOWN_ENCODING = 1610
XML_XINCLUDE_MULTIPLE_ROOT = 1611
XML_XINCLUDE_XPTR_FAILED = 1612
XML_XINCLUDE_XPTR_RESULT = 1613
XML_XINCLUDE_INCLUDE_IN_INCLUDE = 1614
XML_XINCLUDE_FALLBACKS_IN_INCLUDE = 1615
XML_XINCLUDE_FALLBACK_NOT_IN_INCLUDE = 1616
XML_XINCLUDE_DEPRECATED_NS = 1617
XML_XINCLUDE_FRAGMENT_ID = 1618
XML_CATALOG_MISSING_ATTR = 1650
XML_CATALOG_ENTRY_BROKEN = 1651
XML_CATALOG_PREFER_VALUE = 1652
XML_CATALOG_NOT_CATALOG = 1653
XML_CATALOG_RECURSION = 1654
XML_SCHEMAP_PREFIX_UNDEFINED = 1700
XML_SCHEMAP_ATTRFORMDEFAULT_VALUE = 1701
XML_SCHEMAP_ATTRGRP_NONAME_NOREF = 1702
XML_SCHEMAP_ATTR_NONAME_NOREF = 1703
XML_SCHEMAP_COMPLEXTYPE_NONAME_NOREF = 1704
XML_SCHEMAP_ELEMFORMDEFAULT_VALUE = 1705
XML_SCHEMAP_ELEM_NONAME_NOREF = 1706
XML_SCHEMAP_EXTENSION_NO_BASE = 1707
XML_SCHEMAP_FACET_NO_VALUE = 1708
XML_SCHEMAP_FAILED_BUILD_IMPORT = 1709
XML_SCHEMAP_GROUP_NONAME_NOREF = 1710
XML_SCHEMAP_IMPORT_NAMESPACE_NOT_URI = 1711
XML_SCHEMAP_IMPORT_REDEFINE_NSNAME = 1712
XML_SCHEMAP_IMPORT_SCHEMA_NOT_URI = 1713
XML_SCHEMAP_INVALID_BOOLEAN = 1714
XML_SCHEMAP_INVALID_ENUM = 1715
XML_SCHEMAP_INVALID_FACET = 1716
XML_SCHEMAP_INVALID_FACET_VALUE = 1717
XML_SCHEMAP_INVALID_MAXOCCURS = 1718
XML_SCHEMAP_INVALID_MINOCCURS = 1719
XML_SCHEMAP_INVALID_REF_AND_SUBTYPE = 1720
XML_SCHEMAP_INVALID_WHITE_SPACE = 1721
XML_SCHEMAP_NOATTR_NOREF = 1722
XML_SCHEMAP_NOTATION_NO_NAME = 1723
XML_SCHEMAP_NOTYPE_NOREF = 1724
XML_SCHEMAP_REF_AND_SUBTYPE = 1725
XML_SCHEMAP_RESTRICTION_NONAME_NOREF = 1726
XML_SCHEMAP_SIMPLETYPE_NONAME = 1727
XML_SCHEMAP_TYPE_AND_SUBTYPE = 1728
XML_SCHEMAP_UNKNOWN_ALL_CHILD = 1729
XML_SCHEMAP_UNKNOWN_ANYATTRIBUTE_CHILD = 1730
XML_SCHEMAP_UNKNOWN_ATTR_CHILD = 1731
XML_SCHEMAP_UNKNOWN_ATTRGRP_CHILD = 1732
XML_SCHEMAP_UNKNOWN_ATTRIBUTE_GROUP = 1733
XML_SCHEMAP_UNKNOWN_BASE_TYPE = 1734
XML_SCHEMAP_UNKNOWN_CHOICE_CHILD = 1735
XML_SCHEMAP_UNKNOWN_COMPLEXCONTENT_CHILD = 1736
XML_SCHEMAP_UNKNOWN_COMPLEXTYPE_CHILD = 1737
XML_SCHEMAP_UNKNOWN_ELEM_CHILD = 1738
XML_SCHEMAP_UNKNOWN_EXTENSION_CHILD = 1739
XML_SCHEMAP_UNKNOWN_FACET_CHILD = 1740
XML_SCHEMAP_UNKNOWN_FACET_TYPE = 1741
XML_SCHEMAP_UNKNOWN_GROUP_CHILD = 1742
XML_SCHEMAP_UNKNOWN_IMPORT_CHILD = 1743
XML_SCHEMAP_UNKNOWN_LIST_CHILD = 1744
XML_SCHEMAP_UNKNOWN_NOTATION_CHILD = 1745
XML_SCHEMAP_UNKNOWN_PROCESSCONTENT_CHILD = 1746
XML_SCHEMAP_UNKNOWN_REF = 1747
XML_SCHEMAP_UNKNOWN_RESTRICTION_CHILD = 1748
XML_SCHEMAP_UNKNOWN_SCHEMAS_CHILD = 1749
XML_SCHEMAP_UNKNOWN_SEQUENCE_CHILD = 1750
XML_SCHEMAP_UNKNOWN_SIMPLECONTENT_CHILD = 1751
XML_SCHEMAP_UNKNOWN_SIMPLETYPE_CHILD = 1752
XML_SCHEMAP_UNKNOWN_TYPE = 1753
XML_SCHEMAP_UNKNOWN_UNION_CHILD = 1754
XML_SCHEMAP_ELEM_DEFAULT_FIXED = 1755
XML_SCHEMAP_REGEXP_INVALID = 1756
XML_SCHEMAP_FAILED_LOAD = 1757
XML_SCHEMAP_NOTHING_TO_PARSE = 1758
XML_SCHEMAP_NOROOT = 1759
XML_SCHEMAP_REDEFINED_GROUP = 1760
XML_SCHEMAP_REDEFINED_TYPE = 1761
XML_SCHEMAP_REDEFINED_ELEMENT = 1762
XML_SCHEMAP_REDEFINED_ATTRGROUP = 1763
XML_SCHEMAP_REDEFINED_ATTR = 1764
XML_SCHEMAP_REDEFINED_NOTATION = 1765
XML_SCHEMAP_FAILED_PARSE = 1766
XML_SCHEMAP_UNKNOWN_PREFIX = 1767
XML_SCHEMAP_DEF_AND_PREFIX = 1768
XML_SCHEMAP_UNKNOWN_INCLUDE_CHILD = 1769
XML_SCHEMAP_INCLUDE_SCHEMA_NOT_URI = 1770
XML_SCHEMAP_INCLUDE_SCHEMA_NO_URI = 1771
XML_SCHEMAP_NOT_SCHEMA = 1772
XML_SCHEMAP_UNKNOWN_MEMBER_TYPE = 1773
XML_SCHEMAP_INVALID_ATTR_USE = 1774
XML_SCHEMAP_RECURSIVE = 1775
XML_SCHEMAP_SUPERNUMEROUS_LIST_ITEM_TYPE = 1776
XML_SCHEMAP_INVALID_ATTR_COMBINATION = 1777
XML_SCHEMAP_INVALID_ATTR_INLINE_COMBINATION = 1778
XML_SCHEMAP_MISSING_SIMPLETYPE_CHILD = 1779
XML_SCHEMAP_INVALID_ATTR_NAME = 1780
XML_SCHEMAP_REF_AND_CONTENT = 1781
XML_SCHEMAP_CT_PROPS_CORRECT_1 = 1782
XML_SCHEMAP_CT_PROPS_CORRECT_2 = 1783
XML_SCHEMAP_CT_PROPS_CORRECT_3 = 1784
XML_SCHEMAP_CT_PROPS_CORRECT_4 = 1785
XML_SCHEMAP_CT_PROPS_CORRECT_5 = 1786
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_1 = 1787
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_1 = 1788
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_2 = 1789
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_2 = 1790
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_3 = 1791
XML_SCHEMAP_WILDCARD_INVALID_NS_MEMBER = 1792
XML_SCHEMAP_INTERSECTION_NOT_EXPRESSIBLE = 1793
XML_SCHEMAP_UNION_NOT_EXPRESSIBLE = 1794
XML_SCHEMAP_SRC_IMPORT_3_1 = 1795
XML_SCHEMAP_SRC_IMPORT_3_2 = 1796
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_1 = 1797
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_2 = 1798
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_3 = 1799
XML_SCHEMAP_COS_CT_EXTENDS_1_3 = 1800
XML_SCHEMAV_NOROOT = 1801
XML_SCHEMAV_UNDECLAREDELEM = 1802
XML_SCHEMAV_NOTTOPLEVEL = 1803
XML_SCHEMAV_MISSING = 1804
XML_SCHEMAV_WRONGELEM = 1805
XML_SCHEMAV_NOTYPE = 1806
XML_SCHEMAV_NOROLLBACK = 1807
XML_SCHEMAV_ISABSTRACT = 1808
XML_SCHEMAV_NOTEMPTY = 1809
XML_SCHEMAV_ELEMCONT = 1810
XML_SCHEMAV_HAVEDEFAULT = 1811
XML_SCHEMAV_NOTNILLABLE = 1812
XML_SCHEMAV_EXTRACONTENT = 1813
XML_SCHEMAV_INVALIDATTR = 1814
XML_SCHEMAV_INVALIDELEM = 1815
XML_SCHEMAV_NOTDETERMINIST = 1816
XML_SCHEMAV_CONSTRUCT = 1817
XML_SCHEMAV_INTERNAL = 1818
XML_SCHEMAV_NOTSIMPLE = 1819
XML_SCHEMAV_ATTRUNKNOWN = 1820
XML_SCHEMAV_ATTRINVALID = 1821
XML_SCHEMAV_VALUE = 1822
XML_SCHEMAV_FACET = 1823
XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_1 = 1824
XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_2 = 1825
XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_3 = 1826
XML_SCHEMAV_CVC_TYPE_3_1_1 = 1827
XML_SCHEMAV_CVC_TYPE_3_1_2 = 1828
XML_SCHEMAV_CVC_FACET_VALID = 1829
XML_SCHEMAV_CVC_LENGTH_VALID = 1830
XML_SCHEMAV_CVC_MINLENGTH_VALID = 1831
XML_SCHEMAV_CVC_MAXLENGTH_VALID = 1832
XML_SCHEMAV_CVC_MININCLUSIVE_VALID = 1833
XML_SCHEMAV_CVC_MAXINCLUSIVE_VALID = 1834
XML_SCHEMAV_CVC_MINEXCLUSIVE_VALID = 1835
XML_SCHEMAV_CVC_MAXEXCLUSIVE_VALID = 1836
XML_SCHEMAV_CVC_TOTALDIGITS_VALID = 1837
XML_SCHEMAV_CVC_FRACTIONDIGITS_VALID = 1838
XML_SCHEMAV_CVC_PATTERN_VALID = 1839
XML_SCHEMAV_CVC_ENUMERATION_VALID = 1840
XML_SCHEMAV_CVC_COMPLEX_TYPE_2_1 = 1841
XML_SCHEMAV_CVC_COMPLEX_TYPE_2_2 = 1842
XML_SCHEMAV_CVC_COMPLEX_TYPE_2_3 = 1843
XML_SCHEMAV_CVC_COMPLEX_TYPE_2_4 = 1844
XML_SCHEMAV_CVC_ELT_1 = 1845
XML_SCHEMAV_CVC_ELT_2 = 1846
XML_SCHEMAV_CVC_ELT_3_1 = 1847
XML_SCHEMAV_CVC_ELT_3_2_1 = 1848
XML_SCHEMAV_CVC_ELT_3_2_2 = 1849
XML_SCHEMAV_CVC_ELT_4_1 = 1850
XML_SCHEMAV_CVC_ELT_4_2 = 1851
XML_SCHEMAV_CVC_ELT_4_3 = 1852
XML_SCHEMAV_CVC_ELT_5_1_1 = 1853
XML_SCHEMAV_CVC_ELT_5_1_2 = 1854
XML_SCHEMAV_CVC_ELT_5_2_1 = 1855
XML_SCHEMAV_CVC_ELT_5_2_2_1 = 1856
XML_SCHEMAV_CVC_ELT_5_2_2_2_1 = 1857
XML_SCHEMAV_CVC_ELT_5_2_2_2_2 = 1858
XML_SCHEMAV_CVC_ELT_6 = 1859
XML_SCHEMAV_CVC_ELT_7 = 1860
XML_SCHEMAV_CVC_ATTRIBUTE_1 = 1861
XML_SCHEMAV_CVC_ATTRIBUTE_2 = 1862
XML_SCHEMAV_CVC_ATTRIBUTE_3 = 1863
XML_SCHEMAV_CVC_ATTRIBUTE_4 = 1864
XML_SCHEMAV_CVC_COMPLEX_TYPE_3_1 = 1865
XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_1 = 1866
XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_2 = 1867
XML_SCHEMAV_CVC_COMPLEX_TYPE_4 = 1868
XML_SCHEMAV_CVC_COMPLEX_TYPE_5_1 = 1869
XML_SCHEMAV_CVC_COMPLEX_TYPE_5_2 = 1870
XML_SCHEMAV_ELEMENT_CONTENT = 1871
XML_SCHEMAV_DOCUMENT_ELEMENT_MISSING = 1872
XML_SCHEMAV_CVC_COMPLEX_TYPE_1 = 1873
XML_SCHEMAV_CVC_AU = 1874
XML_SCHEMAV_CVC_TYPE_1 = 1875
XML_SCHEMAV_CVC_TYPE_2 = 1876
XML_SCHEMAV_CVC_IDC = 1877
XML_SCHEMAV_CVC_WILDCARD = 1878
XML_SCHEMAV_MISC = 1879
XML_XPTR_UNKNOWN_SCHEME = 1900
XML_XPTR_CHILDSEQ_START = 1901
XML_XPTR_EVAL_FAILED = 1902
XML_XPTR_EXTRA_OBJECTS = 1903
XML_C14N_CREATE_CTXT = 1950
XML_C14N_REQUIRES_UTF8 = 1951
XML_C14N_CREATE_STACK = 1952
XML_C14N_INVALID_NODE = 1953
XML_C14N_UNKNOW_NODE = 1954
XML_C14N_RELATIVE_NAMESPACE = 1955
XML_FTP_PASV_ANSWER = 2000
XML_FTP_EPSV_ANSWER = 2001
XML_FTP_ACCNT = 2002
XML_FTP_URL_SYNTAX = 2003
XML_HTTP_URL_SYNTAX = 2020
XML_HTTP_USE_IP = 2021
XML_HTTP_UNKNOWN_HOST = 2022
XML_SCHEMAP_SRC_SIMPLE_TYPE_1 = 3000
XML_SCHEMAP_SRC_SIMPLE_TYPE_2 = 3001
XML_SCHEMAP_SRC_SIMPLE_TYPE_3 = 3002
XML_SCHEMAP_SRC_SIMPLE_TYPE_4 = 3003
XML_SCHEMAP_SRC_RESOLVE = 3004
XML_SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE = 3005
XML_SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE = 3006
XML_SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES = 3007
XML_SCHEMAP_ST_PROPS_CORRECT_1 = 3008
XML_SCHEMAP_ST_PROPS_CORRECT_2 = 3009
XML_SCHEMAP_ST_PROPS_CORRECT_3 = 3010
XML_SCHEMAP_COS_ST_RESTRICTS_1_1 = 3011
XML_SCHEMAP_COS_ST_RESTRICTS_1_2 = 3012
XML_SCHEMAP_COS_ST_RESTRICTS_1_3_1 = 3013
XML_SCHEMAP_COS_ST_RESTRICTS_1_3_2 = 3014
XML_SCHEMAP_COS_ST_RESTRICTS_2_1 = 3015
XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_1 = 3016
XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_2 = 3017
XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_1 = 3018
XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_2 = 3019
XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_3 = 3020
XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_4 = 3021
XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_5 = 3022
XML_SCHEMAP_COS_ST_RESTRICTS_3_1 = 3023
XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1 = 3024
XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1_2 = 3025
XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_2 = 3026
XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_1 = 3027
XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_3 = 3028
XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_4 = 3029
XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_5 = 3030
XML_SCHEMAP_COS_ST_DERIVED_OK_2_1 = 3031
XML_SCHEMAP_COS_ST_DERIVED_OK_2_2 = 3032
XML_SCHEMAP_S4S_ELEM_NOT_ALLOWED = 3033
XML_SCHEMAP_S4S_ELEM_MISSING = 3034
XML_SCHEMAP_S4S_ATTR_NOT_ALLOWED = 3035
XML_SCHEMAP_S4S_ATTR_MISSING = 3036
XML_SCHEMAP_S4S_ATTR_INVALID_VALUE = 3037
XML_SCHEMAP_SRC_ELEMENT_1 = 3038
XML_SCHEMAP_SRC_ELEMENT_2_1 = 3039
XML_SCHEMAP_SRC_ELEMENT_2_2 = 3040
XML_SCHEMAP_SRC_ELEMENT_3 = 3041
XML_SCHEMAP_P_PROPS_CORRECT_1 = 3042
XML_SCHEMAP_P_PROPS_CORRECT_2_1 = 3043
XML_SCHEMAP_P_PROPS_CORRECT_2_2 = 3044
XML_SCHEMAP_E_PROPS_CORRECT_2 = 3045
XML_SCHEMAP_E_PROPS_CORRECT_3 = 3046
XML_SCHEMAP_E_PROPS_CORRECT_4 = 3047
XML_SCHEMAP_E_PROPS_CORRECT_5 = 3048
XML_SCHEMAP_E_PROPS_CORRECT_6 = 3049
XML_SCHEMAP_SRC_INCLUDE = 3050
XML_SCHEMAP_SRC_ATTRIBUTE_1 = 3051
XML_SCHEMAP_SRC_ATTRIBUTE_2 = 3052
XML_SCHEMAP_SRC_ATTRIBUTE_3_1 = 3053
XML_SCHEMAP_SRC_ATTRIBUTE_3_2 = 3054
XML_SCHEMAP_SRC_ATTRIBUTE_4 = 3055
XML_SCHEMAP_NO_XMLNS = 3056
XML_SCHEMAP_NO_XSI = 3057
XML_SCHEMAP_COS_VALID_DEFAULT_1 = 3058
XML_SCHEMAP_COS_VALID_DEFAULT_2_1 = 3059
XML_SCHEMAP_COS_VALID_DEFAULT_2_2_1 = 3060
XML_SCHEMAP_COS_VALID_DEFAULT_2_2_2 = 3061
XML_SCHEMAP_CVC_SIMPLE_TYPE = 3062
XML_SCHEMAP_COS_CT_EXTENDS_1_1 = 3063
XML_SCHEMAP_SRC_IMPORT_1_1 = 3064
XML_SCHEMAP_SRC_IMPORT_1_2 = 3065
XML_SCHEMAP_SRC_IMPORT_2 = 3066
XML_SCHEMAP_SRC_IMPORT_2_1 = 3067
XML_SCHEMAP_SRC_IMPORT_2_2 = 3068
XML_SCHEMAP_INTERNAL = 3069 # 3069 non-W3C
XML_SCHEMAP_NOT_DETERMINISTIC = 3070 # 3070 non-W3C
XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_1 = 3071
XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_2 = 3072
XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_3 = 3073
XML_SCHEMAP_MG_PROPS_CORRECT_1 = 3074
XML_SCHEMAP_MG_PROPS_CORRECT_2 = 3075
XML_SCHEMAP_SRC_CT_1 = 3076
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_3 = 3077
XML_SCHEMAP_AU_PROPS_CORRECT_2 = 3078
XML_SCHEMAP_A_PROPS_CORRECT_2 = 3079
XML_SCHEMAP_C_PROPS_CORRECT = 3080
XML_SCHEMAP_SRC_REDEFINE = 3081
XML_SCHEMAP_SRC_IMPORT = 3082
XML_SCHEMAP_WARN_SKIP_SCHEMA = 3083
XML_SCHEMAP_WARN_UNLOCATED_SCHEMA = 3084
XML_SCHEMAP_WARN_ATTR_REDECL_PROH = 3085
XML_SCHEMAP_WARN_ATTR_POINTLESS_PROH = 3086 # 3085
XML_SCHEMAP_AG_PROPS_CORRECT = 3087 # 3086
XML_SCHEMAP_COS_CT_EXTENDS_1_2 = 3088 # 3087
XML_SCHEMAP_AU_PROPS_CORRECT = 3089 # 3088
XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 # 3089
XML_SCHEMAP_COS_ALL_LIMITED = 3091 # 3090
XML_SCHEMATRONV_ASSERT = 4000
XML_SCHEMATRONV_REPORT = 4001
XML_MODULE_OPEN = 4900
XML_MODULE_CLOSE = 4901
XML_CHECK_FOUND_ELEMENT = 5000
XML_CHECK_FOUND_ATTRIBUTE = 5001
XML_CHECK_FOUND_TEXT = 5002
XML_CHECK_FOUND_CDATA = 5003
XML_CHECK_FOUND_ENTITYREF = 5004
XML_CHECK_FOUND_ENTITY = 5005
XML_CHECK_FOUND_PI = 5006
XML_CHECK_FOUND_COMMENT = 5007
XML_CHECK_FOUND_DOCTYPE = 5008
XML_CHECK_FOUND_FRAGMENT = 5009
XML_CHECK_FOUND_NOTATION = 5010
XML_CHECK_UNKNOWN_NODE = 5011
XML_CHECK_ENTITY_TYPE = 5012
XML_CHECK_NO_PARENT = 5013
XML_CHECK_NO_DOC = 5014
XML_CHECK_NO_NAME = 5015
XML_CHECK_NO_ELEM = 5016
XML_CHECK_WRONG_DOC = 5017
XML_CHECK_NO_PREV = 5018
XML_CHECK_WRONG_PREV = 5019
XML_CHECK_NO_NEXT = 5020
XML_CHECK_WRONG_NEXT = 5021
XML_CHECK_NOT_DTD = 5022
XML_CHECK_NOT_ATTR = 5023
XML_CHECK_NOT_ATTR_DECL = 5024
XML_CHECK_NOT_ELEM_DECL = 5025
XML_CHECK_NOT_ENTITY_DECL = 5026
XML_CHECK_NOT_NS_DECL = 5027
XML_CHECK_NO_HREF = 5028
XML_CHECK_WRONG_PARENT = 5029
XML_CHECK_NS_SCOPE = 5030
XML_CHECK_NS_ANCESTOR = 5031
XML_CHECK_NOT_UTF8 = 5032
XML_CHECK_NO_DICT = 5033
XML_CHECK_NOT_NCNAME = 5034
XML_CHECK_OUTSIDE_DICT = 5035
XML_CHECK_WRONG_NAME = 5036
XML_CHECK_NAME_NOT_NULL = 5037
XML_I18N_NO_NAME = 6000
XML_I18N_NO_HANDLER = 6001
XML_I18N_EXCESS_HANDLER = 6002
XML_I18N_CONV_FAILED = 6003
XML_I18N_NO_OUTPUT = 6004
XML_BUF_OVERFLOW = 7000
ctypedef enum xmlRelaxNGValidErr:
XML_RELAXNG_OK = 0
XML_RELAXNG_ERR_MEMORY = 1
XML_RELAXNG_ERR_TYPE = 2
XML_RELAXNG_ERR_TYPEVAL = 3
XML_RELAXNG_ERR_DUPID = 4
XML_RELAXNG_ERR_TYPECMP = 5
XML_RELAXNG_ERR_NOSTATE = 6
XML_RELAXNG_ERR_NODEFINE = 7
XML_RELAXNG_ERR_LISTEXTRA = 8
XML_RELAXNG_ERR_LISTEMPTY = 9
XML_RELAXNG_ERR_INTERNODATA = 10
XML_RELAXNG_ERR_INTERSEQ = 11
XML_RELAXNG_ERR_INTEREXTRA = 12
XML_RELAXNG_ERR_ELEMNAME = 13
XML_RELAXNG_ERR_ATTRNAME = 14
XML_RELAXNG_ERR_ELEMNONS = 15
XML_RELAXNG_ERR_ATTRNONS = 16
XML_RELAXNG_ERR_ELEMWRONGNS = 17
XML_RELAXNG_ERR_ATTRWRONGNS = 18
XML_RELAXNG_ERR_ELEMEXTRANS = 19
XML_RELAXNG_ERR_ATTREXTRANS = 20
XML_RELAXNG_ERR_ELEMNOTEMPTY = 21
XML_RELAXNG_ERR_NOELEM = 22
XML_RELAXNG_ERR_NOTELEM = 23
XML_RELAXNG_ERR_ATTRVALID = 24
XML_RELAXNG_ERR_CONTENTVALID = 25
XML_RELAXNG_ERR_EXTRACONTENT = 26
XML_RELAXNG_ERR_INVALIDATTR = 27
XML_RELAXNG_ERR_DATAELEM = 28
XML_RELAXNG_ERR_VALELEM = 29
XML_RELAXNG_ERR_LISTELEM = 30
XML_RELAXNG_ERR_DATATYPE = 31
XML_RELAXNG_ERR_VALUE = 32
XML_RELAXNG_ERR_LIST = 33
XML_RELAXNG_ERR_NOGRAMMAR = 34
XML_RELAXNG_ERR_EXTRADATA = 35
XML_RELAXNG_ERR_LACKDATA = 36
XML_RELAXNG_ERR_INTERNAL = 37
XML_RELAXNG_ERR_ELEMWRONG = 38
XML_RELAXNG_ERR_TEXTWRONG = 39
# --- END: GENERATED CONSTANTS ---
cdef extern from "libxml/xmlerror.h":
ctypedef struct xmlError:
int domain
int code
char* message
xmlErrorLevel level
char* file
char* str1
char* str2
char* str3
int line
int int1
int int2
ctypedef void (*xmlGenericErrorFunc)(void* ctxt, char* msg, ...) nogil
ctypedef void (*xmlStructuredErrorFunc)(void* userData,
xmlError* error) nogil
cdef void xmlSetGenericErrorFunc(
void* ctxt, xmlGenericErrorFunc func) nogil
cdef void xmlSetStructuredErrorFunc(
void* ctxt, xmlStructuredErrorFunc func) nogil
cdef extern from "libxml/globals.h":
cdef xmlStructuredErrorFunc xmlStructuredError
cdef void* xmlStructuredErrorContext

View file

@ -0,0 +1,248 @@
from libc.string cimport const_char
from lxml.includes.tree cimport (
xmlDoc, xmlNode, xmlDict, xmlDtd, xmlChar, const_xmlChar)
from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc
cdef extern from "libxml/parser.h":
ctypedef void (*startElementNsSAX2Func)(void* ctx,
const_xmlChar* localname,
const_xmlChar* prefix,
const_xmlChar* URI,
int nb_namespaces,
const_xmlChar** namespaces,
int nb_attributes,
int nb_defaulted,
const_xmlChar** attributes)
ctypedef void (*endElementNsSAX2Func)(void* ctx,
const_xmlChar* localname,
const_xmlChar* prefix,
const_xmlChar* URI)
ctypedef void (*startElementSAXFunc)(void* ctx, const_xmlChar* name, const_xmlChar** atts)
ctypedef void (*endElementSAXFunc)(void* ctx, const_xmlChar* name)
ctypedef void (*charactersSAXFunc)(void* ctx, const_xmlChar* ch, int len)
ctypedef void (*cdataBlockSAXFunc)(void* ctx, const_xmlChar* value, int len)
ctypedef void (*commentSAXFunc)(void* ctx, const_xmlChar* value)
ctypedef void (*processingInstructionSAXFunc)(void* ctx,
const_xmlChar* target,
const_xmlChar* data)
ctypedef void (*internalSubsetSAXFunc)(void* ctx,
const_xmlChar* name,
const_xmlChar* externalID,
const_xmlChar* systemID)
ctypedef void (*endDocumentSAXFunc)(void* ctx)
ctypedef void (*startDocumentSAXFunc)(void* ctx)
ctypedef void (*referenceSAXFunc)(void * ctx, const_xmlChar* name)
cdef int XML_SAX2_MAGIC
cdef extern from "libxml/tree.h":
ctypedef struct xmlParserInput:
int line
int length
const_xmlChar* base
const_xmlChar* cur
const_xmlChar* end
ctypedef struct xmlParserInputBuffer:
void* context
xmlInputReadCallback readcallback
xmlInputCloseCallback closecallback
ctypedef struct xmlSAXHandlerV1:
# same as xmlSAXHandler, but without namespaces
pass
ctypedef struct xmlSAXHandler:
internalSubsetSAXFunc internalSubset
startElementNsSAX2Func startElementNs
endElementNsSAX2Func endElementNs
startElementSAXFunc startElement
endElementSAXFunc endElement
charactersSAXFunc characters
cdataBlockSAXFunc cdataBlock
referenceSAXFunc reference
commentSAXFunc comment
processingInstructionSAXFunc processingInstruction
startDocumentSAXFunc startDocument
endDocumentSAXFunc endDocument
int initialized
xmlStructuredErrorFunc serror
void* _private
cdef extern from "libxml/SAX2.h" nogil:
cdef void xmlSAX2StartDocument(void* ctxt)
cdef extern from "libxml/xmlIO.h" nogil:
cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)
cdef extern from "libxml/parser.h":
cdef xmlDict* xmlDictCreate() nogil
cdef xmlDict* xmlDictCreateSub(xmlDict* subdict) nogil
cdef void xmlDictFree(xmlDict* sub) nogil
cdef int xmlDictReference(xmlDict* dict) nogil
cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes
cdef int XML_SKIP_IDS # SAX option for not building an XML ID dict
ctypedef enum xmlParserInputState:
XML_PARSER_EOF = -1 # nothing is to be parsed
XML_PARSER_START = 0 # nothing has been parsed
XML_PARSER_MISC = 1 # Misc* before int subset
XML_PARSER_PI = 2 # Within a processing instruction
XML_PARSER_DTD = 3 # within some DTD content
XML_PARSER_PROLOG = 4 # Misc* after internal subset
XML_PARSER_COMMENT = 5 # within a comment
XML_PARSER_START_TAG = 6 # within a start tag
XML_PARSER_CONTENT = 7 # within the content
XML_PARSER_CDATA_SECTION = 8 # within a CDATA section
XML_PARSER_END_TAG = 9 # within a closing tag
XML_PARSER_ENTITY_DECL = 10 # within an entity declaration
XML_PARSER_ENTITY_VALUE = 11 # within an entity value in a decl
XML_PARSER_ATTRIBUTE_VALUE = 12 # within an attribute value
XML_PARSER_SYSTEM_LITERAL = 13 # within a SYSTEM value
XML_PARSER_EPILOG = 14 # the Misc* after the last end tag
XML_PARSER_IGNORE = 15 # within an IGNORED section
XML_PARSER_PUBLIC_LITERAL = 16 # within a PUBLIC value
ctypedef struct xmlParserCtxt:
xmlDoc* myDoc
xmlDict* dict
int dictNames
void* _private
bint wellFormed
bint recovery
int options
bint disableSAX
int errNo
xmlParserInputState instate
bint replaceEntities
int loadsubset # != 0 if enabled, int value == why
bint validate
xmlError lastError
xmlNode* node
xmlSAXHandler* sax
void* userData
int* spaceTab
int spaceMax
bint html
bint progressive
int inSubset
int charset
xmlParserInput* input
ctypedef enum xmlParserOption:
XML_PARSE_RECOVER = 1 # recover on errors
XML_PARSE_NOENT = 2 # substitute entities
XML_PARSE_DTDLOAD = 4 # load the external subset
XML_PARSE_DTDATTR = 8 # default DTD attributes
XML_PARSE_DTDVALID = 16 # validate with the DTD
XML_PARSE_NOERROR = 32 # suppress error reports
XML_PARSE_NOWARNING = 64 # suppress warning reports
XML_PARSE_PEDANTIC = 128 # pedantic error reporting
XML_PARSE_NOBLANKS = 256 # remove blank nodes
XML_PARSE_SAX1 = 512 # use the SAX1 interface internally
XML_PARSE_XINCLUDE = 1024 # Implement XInclude substitution
XML_PARSE_NONET = 2048 # Forbid network access
XML_PARSE_NODICT = 4096 # Do not reuse the context dictionary
XML_PARSE_NSCLEAN = 8192 # remove redundant namespace declarations
XML_PARSE_NOCDATA = 16384 # merge CDATA as text nodes
XML_PARSE_NOXINCNODE = 32768 # do not generate XINCLUDE START/END nodes
# libxml2 2.6.21+ only:
XML_PARSE_COMPACT = 65536 # compact small text nodes
# libxml2 2.7.0+ only:
XML_PARSE_OLD10 = 131072 # parse using XML-1.0 before update 5
XML_PARSE_NOBASEFIX = 262144 # do not fixup XINCLUDE xml:base uris
XML_PARSE_HUGE = 524288 # relax any hardcoded limit from the parser
# libxml2 2.7.3+ only:
XML_PARSE_OLDSAX = 1048576 # parse using SAX2 interface before 2.7.0
# libxml2 2.8.0+ only:
XML_PARSE_IGNORE_ENC = 2097152 # ignore internal document encoding hint
# libxml2 2.9.0+ only:
XML_PARSE_BIG_LINES = 4194304 # Store big line numbers in text PSVI field
cdef void xmlInitParser() nogil
cdef void xmlCleanupParser() nogil
cdef int xmlLineNumbersDefault(int onoff) nogil
cdef xmlParserCtxt* xmlNewParserCtxt() nogil
cdef xmlParserInput* xmlNewIOInputStream(xmlParserCtxt* ctxt,
xmlParserInputBuffer* input,
int enc) nogil
cdef int xmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) nogil
cdef void xmlFreeParserCtxt(xmlParserCtxt* ctxt) nogil
cdef void xmlCtxtReset(xmlParserCtxt* ctxt) nogil
cdef void xmlClearParserCtxt(xmlParserCtxt* ctxt) nogil
cdef int xmlParseChunk(xmlParserCtxt* ctxt,
char* chunk, int size, int terminate) nogil
cdef xmlDoc* xmlCtxtReadDoc(xmlParserCtxt* ctxt,
char* cur, char* URL, char* encoding,
int options) nogil
cdef xmlDoc* xmlCtxtReadFile(xmlParserCtxt* ctxt,
char* filename, char* encoding,
int options) nogil
cdef xmlDoc* xmlCtxtReadIO(xmlParserCtxt* ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void* ioctx,
char* URL, char* encoding,
int options) nogil
cdef xmlDoc* xmlCtxtReadMemory(xmlParserCtxt* ctxt,
char* buffer, int size,
char* filename, const_char* encoding,
int options) nogil
# iterparse:
cdef xmlParserCtxt* xmlCreatePushParserCtxt(xmlSAXHandler* sax,
void* user_data,
char* chunk,
int size,
char* filename) nogil
cdef int xmlCtxtResetPush(xmlParserCtxt* ctxt,
char* chunk,
int size,
char* filename,
char* encoding) nogil
# entity loaders:
ctypedef xmlParserInput* (*xmlExternalEntityLoader)(
const_char * URL, const_char * ID, xmlParserCtxt* context) nogil
cdef xmlExternalEntityLoader xmlGetExternalEntityLoader() nogil
cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f) nogil
# DTDs:
cdef xmlDtd* xmlParseDTD(const_xmlChar* ExternalID, const_xmlChar* SystemID) nogil
cdef xmlDtd* xmlIOParseDTD(xmlSAXHandler* sax,
xmlParserInputBuffer* input,
int enc) nogil
cdef extern from "libxml/parserInternals.h":
cdef xmlParserInput* xmlNewInputStream(xmlParserCtxt* ctxt)
cdef xmlParserInput* xmlNewStringInputStream(xmlParserCtxt* ctxt,
char* buffer) nogil
cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt,
char* filename) nogil
cdef void xmlFreeInputStream(xmlParserInput* input) nogil
cdef int xmlSwitchEncoding(xmlParserCtxt* ctxt, int enc) nogil

View file

@ -0,0 +1,35 @@
from lxml.includes.tree cimport xmlDoc
from lxml.includes.xmlparser cimport xmlSAXHandler
from lxml.includes.xmlerror cimport xmlStructuredErrorFunc
cdef extern from "libxml/xmlschemas.h":
ctypedef struct xmlSchema
ctypedef struct xmlSchemaParserCtxt
ctypedef struct xmlSchemaSAXPlugStruct
ctypedef struct xmlSchemaValidCtxt
ctypedef enum xmlSchemaValidOption:
XML_SCHEMA_VAL_VC_I_CREATE = 1
cdef xmlSchemaValidCtxt* xmlSchemaNewValidCtxt(xmlSchema* schema) nogil
cdef void xmlSchemaSetParserStructuredErrors(xmlSchemaParserCtxt* ctxt,
xmlStructuredErrorFunc serror, void *ctx)
cdef void xmlSchemaSetValidStructuredErrors(xmlSchemaValidCtxt* ctxt,
xmlStructuredErrorFunc serror, void *ctx)
cdef int xmlSchemaValidateDoc(xmlSchemaValidCtxt* ctxt, xmlDoc* doc) nogil
cdef xmlSchema* xmlSchemaParse(xmlSchemaParserCtxt* ctxt) nogil
cdef xmlSchemaParserCtxt* xmlSchemaNewParserCtxt(char* URL) nogil
cdef xmlSchemaParserCtxt* xmlSchemaNewDocParserCtxt(xmlDoc* doc) nogil
cdef void xmlSchemaFree(xmlSchema* schema) nogil
cdef void xmlSchemaFreeParserCtxt(xmlSchemaParserCtxt* ctxt) nogil
cdef void xmlSchemaFreeValidCtxt(xmlSchemaValidCtxt* ctxt) nogil
cdef int xmlSchemaSetValidOptions(xmlSchemaValidCtxt* ctxt,
int options) nogil
cdef xmlSchemaSAXPlugStruct* xmlSchemaSAXPlug(xmlSchemaValidCtxt* ctxt,
xmlSAXHandler** sax,
void** data) nogil
cdef int xmlSchemaSAXUnplug(xmlSchemaSAXPlugStruct* sax_plug)
cdef int xmlSchemaIsValid(xmlSchemaValidCtxt* ctxt)

View file

@ -0,0 +1,135 @@
from lxml.includes cimport tree
from lxml.includes cimport xmlerror
from libc.string cimport const_char
from lxml.includes.tree cimport xmlChar, const_xmlChar
cdef extern from "libxml/xpath.h":
ctypedef enum xmlXPathObjectType:
XPATH_UNDEFINED = 0
XPATH_NODESET = 1
XPATH_BOOLEAN = 2
XPATH_NUMBER = 3
XPATH_STRING = 4
XPATH_POINT = 5
XPATH_RANGE = 6
XPATH_LOCATIONSET = 7
XPATH_USERS = 8
XPATH_XSLT_TREE = 9
ctypedef enum xmlXPathError:
XPATH_EXPRESSION_OK = 0
XPATH_NUMBER_ERROR = 1
XPATH_UNFINISHED_LITERAL_ERROR = 2
XPATH_START_LITERAL_ERROR = 3
XPATH_VARIABLE_REF_ERROR = 4
XPATH_UNDEF_VARIABLE_ERROR = 5
XPATH_INVALID_PREDICATE_ERROR = 6
XPATH_EXPR_ERROR = 7
XPATH_UNCLOSED_ERROR = 8
XPATH_UNKNOWN_FUNC_ERROR = 9
XPATH_INVALID_OPERAND = 10
XPATH_INVALID_TYPE = 11
XPATH_INVALID_ARITY = 12
XPATH_INVALID_CTXT_SIZE = 13
XPATH_INVALID_CTXT_POSITION = 14
XPATH_MEMORY_ERROR = 15
XPTR_SYNTAX_ERROR = 16
XPTR_RESOURCE_ERROR = 17
XPTR_SUB_RESOURCE_ERROR = 18
XPATH_UNDEF_PREFIX_ERROR = 19
XPATH_ENCODING_ERROR = 20
XPATH_INVALID_CHAR_ERROR = 21
XPATH_INVALID_CTXT = 22
ctypedef struct xmlNodeSet:
int nodeNr
int nodeMax
tree.xmlNode** nodeTab
ctypedef struct xmlXPathObject:
xmlXPathObjectType type
xmlNodeSet* nodesetval
bint boolval
double floatval
xmlChar* stringval
ctypedef struct xmlXPathContext:
tree.xmlDoc* doc
tree.xmlNode* node
tree.xmlDict* dict
tree.xmlHashTable* nsHash
const_xmlChar* function
const_xmlChar* functionURI
xmlerror.xmlStructuredErrorFunc error
xmlerror.xmlError lastError
void* userData
ctypedef struct xmlXPathParserContext:
xmlXPathContext* context
xmlXPathObject* value
tree.xmlNode* ancestor
int error
ctypedef struct xmlXPathCompExpr
ctypedef void (*xmlXPathFunction)(xmlXPathParserContext* ctxt, int nargs) nogil
ctypedef xmlXPathFunction (*xmlXPathFuncLookupFunc)(void* ctxt,
const_xmlChar* name,
const_xmlChar* ns_uri) nogil
cdef xmlXPathContext* xmlXPathNewContext(tree.xmlDoc* doc) nogil
cdef xmlXPathObject* xmlXPathEvalExpression(const_xmlChar* str,
xmlXPathContext* ctxt) nogil
cdef xmlXPathObject* xmlXPathCompiledEval(xmlXPathCompExpr* comp,
xmlXPathContext* ctxt) nogil
cdef xmlXPathCompExpr* xmlXPathCompile(const_xmlChar* str) nogil
cdef xmlXPathCompExpr* xmlXPathCtxtCompile(xmlXPathContext* ctxt,
const_xmlChar* str) nogil
cdef void xmlXPathFreeContext(xmlXPathContext* ctxt) nogil
cdef void xmlXPathFreeCompExpr(xmlXPathCompExpr* comp) nogil
cdef void xmlXPathFreeObject(xmlXPathObject* obj) nogil
cdef int xmlXPathRegisterNs(xmlXPathContext* ctxt,
const_xmlChar* prefix, const_xmlChar* ns_uri) nogil
cdef xmlNodeSet* xmlXPathNodeSetCreate(tree.xmlNode* val) nogil
cdef void xmlXPathFreeNodeSet(xmlNodeSet* val) nogil
cdef extern from "libxml/xpathInternals.h":
cdef int xmlXPathRegisterFunc(xmlXPathContext* ctxt,
const_xmlChar* name,
xmlXPathFunction f) nogil
cdef int xmlXPathRegisterFuncNS(xmlXPathContext* ctxt,
const_xmlChar* name,
const_xmlChar* ns_uri,
xmlXPathFunction f) nogil
cdef void xmlXPathRegisterFuncLookup(xmlXPathContext *ctxt,
xmlXPathFuncLookupFunc f,
void *funcCtxt) nogil
cdef int xmlXPathRegisterVariable(xmlXPathContext *ctxt,
const_xmlChar* name,
xmlXPathObject* value) nogil
cdef int xmlXPathRegisterVariableNS(xmlXPathContext *ctxt,
const_xmlChar* name,
const_xmlChar* ns_uri,
xmlXPathObject* value) nogil
cdef void xmlXPathRegisteredVariablesCleanup(xmlXPathContext *ctxt) nogil
cdef void xmlXPathRegisteredNsCleanup(xmlXPathContext *ctxt) nogil
cdef xmlXPathObject* valuePop (xmlXPathParserContext *ctxt) nogil
cdef int valuePush(xmlXPathParserContext* ctxt, xmlXPathObject *value) nogil
cdef xmlXPathObject* xmlXPathNewCString(const_char *val) nogil
cdef xmlXPathObject* xmlXPathWrapCString(const_char * val) nogil
cdef xmlXPathObject* xmlXPathNewString(const_xmlChar *val) nogil
cdef xmlXPathObject* xmlXPathWrapString(const_xmlChar * val) nogil
cdef xmlXPathObject* xmlXPathNewFloat(double val) nogil
cdef xmlXPathObject* xmlXPathNewBoolean(int val) nogil
cdef xmlXPathObject* xmlXPathNewNodeSet(tree.xmlNode* val) nogil
cdef xmlXPathObject* xmlXPathNewValueTree(tree.xmlNode* val) nogil
cdef void xmlXPathNodeSetAdd(xmlNodeSet* cur,
tree.xmlNode* val) nogil
cdef void xmlXPathNodeSetAddUnique(xmlNodeSet* cur,
tree.xmlNode* val) nogil
cdef xmlXPathObject* xmlXPathWrapNodeSet(xmlNodeSet* val) nogil
cdef void xmlXPathErr(xmlXPathParserContext* ctxt, int error) nogil

View file

@ -0,0 +1,176 @@
from lxml.includes.tree cimport xmlDoc, xmlNode, xmlDict, xmlChar, const_xmlChar
from lxml.includes.xpath cimport xmlXPathContext, xmlXPathFunction
from libc.string cimport const_char
cdef extern from "libxslt/xslt.h":
cdef int xsltLibxsltVersion
cdef int xsltMaxDepth
cdef extern from "libxslt/xsltconfig.h":
cdef int LIBXSLT_VERSION
cdef extern from "libxslt/xsltInternals.h":
ctypedef enum xsltTransformState:
XSLT_STATE_OK # 0
XSLT_STATE_ERROR # 1
XSLT_STATE_STOPPED # 2
ctypedef struct xsltDocument:
xmlDoc* doc
ctypedef struct xsltStylesheet:
xmlChar* encoding
xmlDoc* doc
int errors
ctypedef struct xsltTransformContext:
xsltStylesheet* style
xmlXPathContext* xpathCtxt
xsltDocument* document
void* _private
xmlDict* dict
int profile
xmlNode* node
xmlDoc* output
xmlNode* insert
xmlNode* inst
xsltTransformState state
ctypedef struct xsltStackElem
ctypedef struct xsltTemplate
cdef xsltStylesheet* xsltParseStylesheetDoc(xmlDoc* doc) nogil
cdef void xsltFreeStylesheet(xsltStylesheet* sheet) nogil
cdef extern from "libxslt/extensions.h":
ctypedef void (*xsltTransformFunction)(xsltTransformContext* ctxt,
xmlNode* context_node,
xmlNode* inst,
void* precomp_unused) nogil
cdef int xsltRegisterExtFunction(xsltTransformContext* ctxt,
const_xmlChar* name,
const_xmlChar* URI,
xmlXPathFunction function) nogil
cdef int xsltRegisterExtModuleFunction(const_xmlChar* name, const_xmlChar* URI,
xmlXPathFunction function) nogil
cdef int xsltUnregisterExtModuleFunction(const_xmlChar* name, const_xmlChar* URI)
cdef xmlXPathFunction xsltExtModuleFunctionLookup(
const_xmlChar* name, const_xmlChar* URI) nogil
cdef int xsltRegisterExtPrefix(xsltStylesheet* style,
const_xmlChar* prefix, const_xmlChar* URI) nogil
cdef int xsltRegisterExtElement(xsltTransformContext* ctxt,
const_xmlChar* name, const_xmlChar* URI,
xsltTransformFunction function) nogil
cdef extern from "libxslt/documents.h":
ctypedef enum xsltLoadType:
XSLT_LOAD_START
XSLT_LOAD_STYLESHEET
XSLT_LOAD_DOCUMENT
ctypedef xmlDoc* (*xsltDocLoaderFunc)(const_xmlChar* URI, xmlDict* dict,
int options,
void* ctxt,
xsltLoadType type) nogil
cdef xsltDocLoaderFunc xsltDocDefaultLoader
cdef void xsltSetLoaderFunc(xsltDocLoaderFunc f) nogil
cdef extern from "libxslt/transform.h":
cdef xmlDoc* xsltApplyStylesheet(xsltStylesheet* style, xmlDoc* doc,
const_char** params) nogil
cdef xmlDoc* xsltApplyStylesheetUser(xsltStylesheet* style, xmlDoc* doc,
const_char** params, const_char* output,
void* profile,
xsltTransformContext* context) nogil
cdef void xsltProcessOneNode(xsltTransformContext* ctxt,
xmlNode* contextNode,
xsltStackElem* params) nogil
cdef xsltTransformContext* xsltNewTransformContext(xsltStylesheet* style,
xmlDoc* doc) nogil
cdef void xsltFreeTransformContext(xsltTransformContext* context) nogil
cdef void xsltApplyOneTemplate(xsltTransformContext* ctxt,
xmlNode* contextNode, xmlNode* list,
xsltTemplate* templ,
xsltStackElem* params) nogil
cdef extern from "libxslt/xsltutils.h":
cdef int xsltSaveResultToString(xmlChar** doc_txt_ptr,
int* doc_txt_len,
xmlDoc* result,
xsltStylesheet* style) nogil
cdef void xsltSetGenericErrorFunc(
void* ctxt, void (*handler)(void* ctxt, char* msg, ...)) nogil
cdef void xsltSetTransformErrorFunc(
xsltTransformContext*, void* ctxt,
void (*handler)(void* ctxt, char* msg, ...) nogil) nogil
cdef void xsltTransformError(xsltTransformContext* ctxt,
xsltStylesheet* style,
xmlNode* node, char* msg, ...)
cdef void xsltSetCtxtParseOptions(
xsltTransformContext* ctxt, int options)
cdef extern from "libxslt/security.h":
ctypedef struct xsltSecurityPrefs
ctypedef enum xsltSecurityOption:
XSLT_SECPREF_READ_FILE = 1
XSLT_SECPREF_WRITE_FILE = 2
XSLT_SECPREF_CREATE_DIRECTORY = 3
XSLT_SECPREF_READ_NETWORK = 4
XSLT_SECPREF_WRITE_NETWORK = 5
ctypedef int (*xsltSecurityCheck)(xsltSecurityPrefs* sec,
xsltTransformContext* ctxt,
char* value) nogil
cdef xsltSecurityPrefs* xsltNewSecurityPrefs() nogil
cdef void xsltFreeSecurityPrefs(xsltSecurityPrefs* sec) nogil
cdef int xsltSecurityForbid(xsltSecurityPrefs* sec,
xsltTransformContext* ctxt,
char* value) nogil
cdef int xsltSecurityAllow(xsltSecurityPrefs* sec,
xsltTransformContext* ctxt,
char* value) nogil
cdef int xsltSetSecurityPrefs(xsltSecurityPrefs* sec,
xsltSecurityOption option,
xsltSecurityCheck func) nogil
cdef xsltSecurityCheck xsltGetSecurityPrefs(
xsltSecurityPrefs* sec,
xsltSecurityOption option) nogil
cdef int xsltSetCtxtSecurityPrefs(xsltSecurityPrefs* sec,
xsltTransformContext* ctxt) nogil
cdef xmlDoc* xsltGetProfileInformation(xsltTransformContext* ctxt) nogil
cdef extern from "libxslt/variables.h":
cdef int xsltQuoteUserParams(xsltTransformContext* ctxt,
const_char** params)
cdef int xsltQuoteOneUserParam(xsltTransformContext* ctxt,
const_xmlChar* name,
const_xmlChar* value)
cdef extern from "libxslt/extra.h":
const_xmlChar* XSLT_LIBXSLT_NAMESPACE
const_xmlChar* XSLT_XALAN_NAMESPACE
const_xmlChar* XSLT_SAXON_NAMESPACE
const_xmlChar* XSLT_XT_NAMESPACE
cdef xmlXPathFunction xsltFunctionNodeSet
cdef void xsltRegisterAllExtras() nogil
cdef extern from "libexslt/exslt.h":
cdef void exsltRegisterAll() nogil
# libexslt 1.1.25+
const_xmlChar* EXSLT_DATE_NAMESPACE
const_xmlChar* EXSLT_SETS_NAMESPACE
const_xmlChar* EXSLT_MATH_NAMESPACE
const_xmlChar* EXSLT_STRINGS_NAMESPACE
cdef int exsltDateXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
cdef int exsltSetsXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
cdef int exsltMathXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
cdef int exsltStrXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)

View file

@ -0,0 +1,334 @@
"""The ``lxml.isoschematron`` package implements ISO Schematron support on top
of the pure-xslt 'skeleton' implementation.
"""
import sys
import os.path
from lxml import etree as _etree # due to validator __init__ signature
# some compat stuff, borrowed from lxml.html
try:
unicode
except NameError:
# Python 3
unicode = str
try:
basestring
except NameError:
# Python 3
basestring = str
__all__ = ['extract_xsd', 'extract_rng', 'iso_dsdl_include',
'iso_abstract_expand', 'iso_svrl_for_xslt1',
'svrl_validation_errors', 'schematron_schema_valid',
'stylesheet_params', 'Schematron']
# some namespaces
#FIXME: Maybe lxml should provide a dedicated place for common namespace
#FIXME: definitions?
XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema"
RELAXNG_NS = "http://relaxng.org/ns/structure/1.0"
SCHEMATRON_NS = "http://purl.oclc.org/dsdl/schematron"
SVRL_NS = "http://purl.oclc.org/dsdl/svrl"
# some helpers
_schematron_root = '{%s}schema' % SCHEMATRON_NS
_xml_schema_root = '{%s}schema' % XML_SCHEMA_NS
_resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
# the iso-schematron skeleton implementation steps aka xsl transformations
extract_xsd = _etree.XSLT(_etree.parse(
os.path.join(_resources_dir, 'xsl', 'XSD2Schtrn.xsl')))
extract_rng = _etree.XSLT(_etree.parse(
os.path.join(_resources_dir, 'xsl', 'RNG2Schtrn.xsl')))
iso_dsdl_include = _etree.XSLT(_etree.parse(
os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
'iso_dsdl_include.xsl')))
iso_abstract_expand = _etree.XSLT(_etree.parse(
os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
'iso_abstract_expand.xsl')))
iso_svrl_for_xslt1 = _etree.XSLT(_etree.parse(
os.path.join(_resources_dir,
'xsl', 'iso-schematron-xslt1', 'iso_svrl_for_xslt1.xsl')))
# svrl result accessors
svrl_validation_errors = _etree.XPath(
'//svrl:failed-assert', namespaces={'svrl': SVRL_NS})
# RelaxNG validator for schematron schemas
schematron_schema_valid = _etree.RelaxNG(_etree.parse(
os.path.join(_resources_dir, 'rng', 'iso-schematron.rng')))
def stylesheet_params(**kwargs):
"""Convert keyword args to a dictionary of stylesheet parameters.
XSL stylesheet parameters must be XPath expressions, i.e.:
* string expressions, like "'5'"
* simple (number) expressions, like "5"
* valid XPath expressions, like "/a/b/text()"
This function converts native Python keyword arguments to stylesheet
parameters following these rules:
If an arg is a string wrap it with XSLT.strparam().
If an arg is an XPath object use its path string.
If arg is None raise TypeError.
Else convert arg to string.
"""
result = {}
for key, val in kwargs.items():
if isinstance(val, basestring):
val = _etree.XSLT.strparam(val)
elif val is None:
raise TypeError('None not allowed as a stylesheet parameter')
elif not isinstance(val, _etree.XPath):
val = unicode(val)
result[key] = val
return result
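# A minimal illustrative sketch, assuming the rules documented above and using
# hypothetical keyword arguments:
#     stylesheet_params(phase="phase.a", visit_text=True)
# would wrap the string "phase.a" via _etree.XSLT.strparam(), convert the
# boolean to the text u'True', and pass an _etree.XPath instance through
# unchanged.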
# helper function for use in Schematron __init__
def _stylesheet_param_dict(paramsDict, kwargsDict):
"""Return a copy of paramsDict, updated with kwargsDict entries, wrapped as
stylesheet arguments.
kwargsDict entries with a value of None are ignored.
"""
# beware of changing mutable default arg
paramsDict = dict(paramsDict)
for k, v in kwargsDict.items():
if v is not None: # None values do not override
paramsDict[k] = v
paramsDict = stylesheet_params(**paramsDict)
return paramsDict
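# A small illustrative sketch of the merge semantics above (hypothetical values):
#     _stylesheet_param_dict({'phase': 'phase.a'}, {'phase': None})
# keeps 'phase.a' because the None keyword value is ignored, while a non-None
# keyword value would override it; the merged dict is then passed through
# stylesheet_params() so every value ends up stylesheet-ready.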
class Schematron(_etree._Validator):
"""An ISO Schematron validator.
Pass a root Element or an ElementTree to turn it into a validator.
Alternatively, pass a filename as keyword argument 'file' to parse from
the file system.
Schematron is a less well known, but very powerful schema language.
The main idea is to use the capabilities of XPath to put restrictions on
the structure and the content of XML documents.
The standard behaviour is to fail on ``failed-assert`` findings only
(``ASSERTS_ONLY``). To change this, you can either pass a report filter
function to the ``error_finder`` parameter (e.g. ``ASSERTS_AND_REPORTS``
or a custom ``XPath`` object), or subclass isoschematron.Schematron for
complete control of the validation process.
Built on the Schematron language 'reference' skeleton pure-xslt
implementation, the validator is created as an XSLT 1.0 stylesheet using
these steps:
0) (Extract from XML Schema or RelaxNG schema)
1) Process inclusions
2) Process abstract patterns
3) Compile the schematron schema to XSLT
The ``include`` and ``expand`` keyword arguments can be used to switch off
steps 1) and 2).
To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
keyword arguments ``include_params``, ``expand_params`` or
``compile_params``.
For convenience, the compile-step parameter ``phase`` is also exposed as a
keyword argument ``phase``. This takes precedence if the parameter is also
given in the parameter dictionary.
If ``store_schematron`` is set to True, the (included-and-expanded)
schematron document tree is stored and available through the ``schematron``
property.
If ``store_xslt`` is set to True, the validation XSLT document tree will be
stored and can be retrieved through the ``validator_xslt`` property.
With ``store_report`` set to True (default: False), the resulting validation
report document gets stored and can be accessed as the ``validation_report``
property.
Here is a usage example::
>>> from lxml import etree
>>> from lxml.isoschematron import Schematron
>>> schematron = Schematron(etree.XML('''
... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
... <pattern id="id_only_attribute">
... <title>id is the only permitted attribute name</title>
... <rule context="*">
... <report test="@*[not(name()='id')]">Attribute
... <name path="@*[not(name()='id')]"/> is forbidden<name/>
... </report>
... </rule>
... </pattern>
... </schema>'''),
... error_finder=Schematron.ASSERTS_AND_REPORTS)
>>> xml = etree.XML('''
... <AAA name="aaa">
... <BBB id="bbb"/>
... <CCC color="ccc"/>
... </AAA>
... ''')
>>> schematron.validate(xml)
False
>>> xml = etree.XML('''
... <AAA id="aaa">
... <BBB id="bbb"/>
... <CCC/>
... </AAA>
... ''')
>>> schematron.validate(xml)
True
"""
# libxml2 error categorization for validation errors
_domain = _etree.ErrorDomains.SCHEMATRONV
_level = _etree.ErrorLevels.ERROR
_error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT
# convenience definitions for common behaviours
ASSERTS_ONLY = svrl_validation_errors # Default
ASSERTS_AND_REPORTS = _etree.XPath(
'//svrl:failed-assert | //svrl:successful-report',
namespaces={'svrl': SVRL_NS})
def _extract(self, element):
"""Extract embedded schematron schema from non-schematron host schema.
This method will only be called by __init__ if the given schema document
is not a schematron schema by itself.
Must return a schematron schema document tree or None.
"""
schematron = None
if element.tag == _xml_schema_root:
schematron = self._extract_xsd(element)
elif element.nsmap[element.prefix] == RELAXNG_NS:
# RelaxNG does not have a single unique root element
schematron = self._extract_rng(element)
return schematron
# customization points
# etree.XSLT objects that provide the extract, include, expand, compile
# steps
_extract_xsd = extract_xsd
_extract_rng = extract_rng
_include = iso_dsdl_include
_expand = iso_abstract_expand
_compile = iso_svrl_for_xslt1
# etree.XPath object that determines input document validity when applied to
# the svrl result report; must return a list of result elements (empty if
# valid)
_validation_errors = ASSERTS_ONLY
def __init__(self, etree=None, file=None, include=True, expand=True,
include_params={}, expand_params={}, compile_params={},
store_schematron=False, store_xslt=False, store_report=False,
phase=None, error_finder=ASSERTS_ONLY):
super(Schematron, self).__init__()
self._store_report = store_report
self._schematron = None
self._validator_xslt = None
self._validation_report = None
if error_finder is not self.ASSERTS_ONLY:
self._validation_errors = error_finder
# parse schema document, may be a schematron schema or an XML Schema or
# a RelaxNG schema with embedded schematron rules
root = None
try:
if etree is not None:
if _etree.iselement(etree):
root = etree
else:
root = etree.getroot()
elif file is not None:
root = _etree.parse(file).getroot()
except Exception:
raise _etree.SchematronParseError(
"No tree or file given: %s" % sys.exc_info()[1])
if root is None:
raise ValueError("Empty tree")
if root.tag == _schematron_root:
schematron = root
else:
schematron = self._extract(root)
if schematron is None:
raise _etree.SchematronParseError(
"Document is not a schematron schema or schematron-extractable")
# perform the iso-schematron skeleton implementation steps to get a
# validating xslt
if include:
schematron = self._include(schematron, **include_params)
if expand:
schematron = self._expand(schematron, **expand_params)
if not schematron_schema_valid(schematron):
raise _etree.SchematronParseError(
"invalid schematron schema: %s" %
schematron_schema_valid.error_log)
if store_schematron:
self._schematron = schematron
# add new compile keyword args here if exposing them
compile_kwargs = {'phase': phase}
compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
validator_xslt = self._compile(schematron, **compile_params)
if store_xslt:
self._validator_xslt = validator_xslt
self._validator = _etree.XSLT(validator_xslt)
def __call__(self, etree):
"""Validate doc using Schematron.
Returns true if document is valid, false if not.
"""
self._clear_error_log()
result = self._validator(etree)
if self._store_report:
self._validation_report = result
errors = self._validation_errors(result)
if errors:
if _etree.iselement(etree):
fname = etree.getroottree().docinfo.URL or '<file>'
else:
fname = etree.docinfo.URL or '<file>'
for error in errors:
# Does svrl report the line number, anywhere? Don't think so.
self._append_log_message(
domain=self._domain, type=self._error_type,
level=self._level, line=0,
message=_etree.tostring(error, encoding='unicode'),
filename=fname)
return False
return True
@property
def schematron(self):
"""ISO-schematron schema document (None if object has been initialized
with store_schematron=False).
"""
return self._schematron
@property
def validator_xslt(self):
"""ISO-schematron skeleton implementation XSLT validator document (None
if object has been initialized with store_xslt=False).
"""
return self._validator_xslt
@property
def validation_report(self):
"""ISO-schematron validation result report (None if result-storing has
been turned off).
"""
return self._validation_report

View file

@ -0,0 +1,622 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
(c) International Organization for Standardization 2005.
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879,
provided this notice is included in all copies.
-->
<grammar ns="http://purl.oclc.org/dsdl/schematron" xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
<start>
<ref name="schema"/>
</start>
<!-- Element declarations -->
<define name="schema">
<element name="schema">
<optional>
<attribute name="id">
<data type="ID"/>
</attribute>
</optional>
<ref name="rich"/>
<optional>
<attribute name="schemaVersion">
<ref name="non-empty-string"/>
</attribute>
</optional>
<optional>
<attribute name="defaultPhase">
<data type="IDREF"/>
</attribute>
</optional>
<optional>
<attribute name="queryBinding">
<ref name="non-empty-string"/>
</attribute>
</optional>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<ref name="inclusion"/>
</zeroOrMore>
<group>
<optional>
<ref name="title"/>
</optional>
<zeroOrMore>
<ref name="ns"/>
</zeroOrMore>
<zeroOrMore>
<ref name="p"/>
</zeroOrMore>
<zeroOrMore>
<ref name="let"/>
</zeroOrMore>
<zeroOrMore>
<ref name="phase"/>
</zeroOrMore>
<oneOrMore>
<ref name="pattern"/>
</oneOrMore>
<zeroOrMore>
<ref name="p"/>
</zeroOrMore>
<optional>
<ref name="diagnostics"/>
</optional>
</group>
</interleave>
</element>
</define>
<define name="active">
<element name="active">
<attribute name="pattern">
<data type="IDREF"/>
</attribute>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<choice>
<text/>
<ref name="dir"/>
<ref name="emph"/>
<ref name="span"/>
</choice>
</zeroOrMore>
</interleave>
</element>
</define>
<define name="assert">
<element name="assert">
<attribute name="test">
<ref name="exprValue"/>
</attribute>
<optional>
<attribute name="flag">
<ref name="flagValue"/>
</attribute>
</optional>
<optional>
<attribute name="id">
<data type="ID"/>
</attribute>
</optional>
<optional>
<attribute name="diagnostics">
<data type="IDREFS"/>
</attribute>
</optional>
<ref name="rich"/>
<ref name="linkable"/>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<choice>
<text/>
<ref name="name"/>
<ref name="value-of"/>
<ref name="emph"/>
<ref name="dir"/>
<ref name="span"/>
</choice>
</zeroOrMore>
</interleave>
</element>
</define>
<define name="diagnostic">
<element name="diagnostic">
<attribute name="id">
<data type="ID"/>
</attribute>
<ref name="rich"/>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<choice>
<text/>
<ref name="value-of"/>
<ref name="emph"/>
<ref name="dir"/>
<ref name="span"/>
</choice>
</zeroOrMore>
</interleave>
</element>
</define>
<define name="diagnostics">
<element name="diagnostics">
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<ref name="inclusion"/>
</zeroOrMore>
<zeroOrMore>
<ref name="diagnostic"/>
</zeroOrMore>
</interleave>
</element>
</define>
<define name="dir">
<element name="dir">
<optional>
<attribute name="value">
<choice>
<value>ltr</value>
<value>rtl</value>
</choice>
</attribute>
</optional>
<interleave>
<ref name="foreign"/>
<text/>
</interleave>
</element>
</define>
<define name="emph">
<element name="emph">
<text/>
</element>
</define>
<define name="extends">
<element name="extends">
<attribute name="rule">
<data type="IDREF"/>
</attribute>
<ref name="foreign-empty"/>
</element>
</define>
<define name="let">
<element name="let">
<attribute name="name">
<ref name="nameValue"/>
</attribute>
<attribute name="value">
<data type="string" datatypeLibrary=""/>
</attribute>
</element>
</define>
<define name="name">
<element name="name">
<optional>
<attribute name="path">
<ref name="pathValue"/>
</attribute>
</optional>
<ref name="foreign-empty"/>
</element>
</define>
<define name="ns">
<element name="ns">
<attribute name="uri">
<ref name="uriValue"/>
</attribute>
<attribute name="prefix">
<ref name="nameValue"/>
</attribute>
<ref name="foreign-empty"/>
</element>
</define>
<define name="p">
<element name="p">
<optional>
<attribute name="id">
<data type="ID"/>
</attribute>
</optional>
<optional>
<attribute name="class">
<ref name="classValue"/>
</attribute>
</optional>
<optional>
<attribute name="icon">
<ref name="uriValue"/>
</attribute>
</optional>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<choice>
<text/>
<ref name="dir"/>
<ref name="emph"/>
<ref name="span"/>
</choice>
</zeroOrMore>
</interleave>
</element>
</define>
<define name="param">
<element name="param">
<attribute name="name">
<ref name="nameValue"/>
</attribute>
<attribute name="value">
<ref name="non-empty-string"/>
</attribute>
</element>
</define>
<define name="pattern">
<element name="pattern">
<ref name="rich"/>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<ref name="inclusion"/>
</zeroOrMore>
<choice>
<group>
<attribute name="abstract">
<value>true</value>
</attribute>
<attribute name="id">
<data type="ID"/>
</attribute>
<optional>
<ref name="title"/>
</optional>
<group>
<zeroOrMore>
<ref name="p"/>
</zeroOrMore>
<zeroOrMore>
<ref name="let"/>
</zeroOrMore>
<zeroOrMore>
<ref name="rule"/>
</zeroOrMore>
</group>
</group>
<group>
<optional>
<attribute name="abstract">
<value>false</value>
</attribute>
</optional>
<optional>
<attribute name="id">
<data type="ID"/>
</attribute>
</optional>
<optional>
<ref name="title"/>
</optional>
<group>
<zeroOrMore>
<ref name="p"/>
</zeroOrMore>
<zeroOrMore>
<ref name="let"/>
</zeroOrMore>
<zeroOrMore>
<ref name="rule"/>
</zeroOrMore>
</group>
</group>
<group>
<optional>
<attribute name="abstract">
<value>false</value>
</attribute>
</optional>
<attribute name="is-a">
<data type="IDREF"/>
</attribute>
<optional>
<attribute name="id">
<data type="ID"/>
</attribute>
</optional>
<optional>
<ref name="title"/>
</optional>
<group>
<zeroOrMore>
<ref name="p"/>
</zeroOrMore>
<zeroOrMore>
<ref name="param"/>
</zeroOrMore>
</group>
</group>
</choice>
</interleave>
</element>
</define>
<define name="phase">
<element name="phase">
<attribute name="id">
<data type="ID"/>
</attribute>
<ref name="rich"/>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<ref name="inclusion"/>
</zeroOrMore>
<group>
<zeroOrMore>
<ref name="p"/>
</zeroOrMore>
<zeroOrMore>
<ref name="let"/>
</zeroOrMore>
<zeroOrMore>
<ref name="active"/>
</zeroOrMore>
</group>
</interleave>
</element>
</define>
<define name="report">
<element name="report">
<attribute name="test">
<ref name="exprValue"/>
</attribute>
<optional>
<attribute name="flag">
<ref name="flagValue"/>
</attribute>
</optional>
<optional>
<attribute name="id">
<data type="ID"/>
</attribute>
</optional>
<optional>
<attribute name="diagnostics">
<data type="IDREFS"/>
</attribute>
</optional>
<ref name="rich"/>
<ref name="linkable"/>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<choice>
<text/>
<ref name="name"/>
<ref name="value-of"/>
<ref name="emph"/>
<ref name="dir"/>
<ref name="span"/>
</choice>
</zeroOrMore>
</interleave>
</element>
</define>
<define name="rule">
<element name="rule">
<optional>
<attribute name="flag">
<ref name="flagValue"/>
</attribute>
</optional>
<ref name="rich"/>
<ref name="linkable"/>
<interleave>
<ref name="foreign"/>
<zeroOrMore>
<ref name="inclusion"/>
</zeroOrMore>
<choice>
<group>
<attribute name="abstract">
<value>true</value>
</attribute>
<attribute name="id">
<data type="ID"/>
</attribute>
<zeroOrMore>
<ref name="let"/>
</zeroOrMore>
<oneOrMore>
<choice>
<ref name="assert"/>
<ref name="report"/>
<ref name="extends"/>
</choice>
</oneOrMore>
</group>
<group>
<attribute name="context">
<ref name="pathValue"/>
</attribute>
<optional>
<attribute name="id">
<data type="ID"/>
</attribute>
</optional>
<optional>
<attribute name="abstract">
<value>false</value>
</attribute>
</optional>
<zeroOrMore>
<ref name="let"/>
</zeroOrMore>
<oneOrMore>
<choice>
<ref name="assert"/>
<ref name="report"/>
<ref name="extends"/>
</choice>
</oneOrMore>
</group>
</choice>
</interleave>
</element>
</define>
<define name="span">
<element name="span">
<attribute name="class">
<ref name="classValue"/>
</attribute>
<interleave>
<ref name="foreign"/>
<text/>
</interleave>
</element>
</define>
<define name="title">
<element name="title">
<zeroOrMore>
<choice>
<text/>
<ref name="dir"/>
</choice>
</zeroOrMore>
</element>
</define>
<define name="value-of">
<element name="value-of">
<attribute name="select">
<ref name="pathValue"/>
</attribute>
<ref name="foreign-empty"/>
</element>
</define>
<!-- common declarations -->
<define name="inclusion">
<element name="include">
<attribute name="href">
<ref name="uriValue"/>
</attribute>
</element>
</define>
<define name="rich">
<optional>
<attribute name="icon">
<ref name="uriValue"/>
</attribute>
</optional>
<optional>
<attribute name="see">
<ref name="uriValue"/>
</attribute>
</optional>
<optional>
<attribute name="fpi">
<ref name="fpiValue"/>
</attribute>
</optional>
<optional>
<attribute name="xml:lang">
<ref name="langValue"/>
</attribute>
</optional>
<optional>
<attribute name="xml:space">
<choice>
<value>preserve</value>
<value>default</value>
</choice>
</attribute>
</optional>
</define>
<define name="linkable">
<optional>
<attribute name="role">
<ref name="roleValue"/>
</attribute>
</optional>
<optional>
<attribute name="subject">
<ref name="pathValue"/>
</attribute>
</optional>
</define>
<define name="foreign">
<ref name="foreign-attributes"/>
<zeroOrMore>
<ref name="foreign-element"/>
</zeroOrMore>
</define>
<define name="foreign-empty">
<ref name="foreign-attributes"/>
</define>
<define name="foreign-attributes">
<zeroOrMore>
<attribute>
<anyName>
<except>
<nsName ns=""/>
<nsName ns="http://www.w3.org/XML/1998/namespace"/>
</except>
</anyName>
</attribute>
</zeroOrMore>
</define>
<define name="foreign-element">
<element>
<anyName>
<except>
<nsName/>
</except>
</anyName>
<zeroOrMore>
<choice>
<attribute>
<anyName/>
</attribute>
<ref name="foreign-element"/>
<ref name="schema"/>
<text/>
</choice>
</zeroOrMore>
</element>
</define>
<!-- Data types -->
<define name="uriValue">
<data type="anyURI"/>
</define>
<define name="pathValue">
<data type="string" datatypeLibrary=""/>
</define>
<define name="exprValue">
<data type="string" datatypeLibrary=""/>
</define>
<define name="fpiValue">
<data type="string" datatypeLibrary=""/>
</define>
<define name="langValue">
<data type="language"/>
</define>
<define name="roleValue">
<data type="string" datatypeLibrary=""/>
</define>
<define name="flagValue">
<data type="string" datatypeLibrary=""/>
</define>
<define name="nameValue">
<data type="string" datatypeLibrary=""/>
</define>
<!-- In the default query language binding, xsd:NCNAME -->
<define name="classValue">
<data type="string" datatypeLibrary=""/>
</define>
<define name="non-empty-string">
<data type="token">
<param name="minLength">1</param>
</data>
</define>
</grammar>

View file

@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Stylesheet for extracting Schematron information from a RELAX-NG schema.
Based on the stylesheet for extracting Schematron information from W3C XML Schema.
Created by Eddie Robertsson 2002/06/01
2009/12/10 hj: changed Schematron namespace to ISO URI (Holger Joukl)
-->
<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:sch="http://purl.oclc.org/dsdl/schematron" xmlns:rng="http://relaxng.org/ns/structure/1.0">
<!-- Set the output to be XML with an XML declaration and use indentation -->
<xsl:output method="xml" omit-xml-declaration="no" indent="yes" standalone="yes"/>
<!-- -->
<!-- match schema and call recursive template to extract included schemas -->
<!-- -->
<xsl:template match="/rng:grammar | /rng:element">
<!-- call the schema definition template ... -->
<xsl:call-template name="gatherSchema">
<!-- ... with current node as the $schemas parameter ... -->
<xsl:with-param name="schemas" select="."/>
<!-- ... and any includes in the $include parameter -->
<xsl:with-param name="includes" select="document(/rng:grammar/rng:include/@href
| //rng:externalRef/@href)"/>
</xsl:call-template>
</xsl:template>
<!-- -->
<!-- gather all included schemas into a single parameter variable -->
<!-- -->
<xsl:template name="gatherSchema">
<xsl:param name="schemas"/>
<xsl:param name="includes"/>
<xsl:choose>
<xsl:when test="count($schemas) &lt; count($schemas | $includes)">
<!-- when $includes includes something new, recurse ... -->
<xsl:call-template name="gatherSchema">
<!-- ... with current $includes added to the $schemas parameter ... -->
<xsl:with-param name="schemas" select="$schemas | $includes"/>
<!-- ... and any *new* includes in the $include parameter -->
<xsl:with-param name="includes" select="document($includes/rng:grammar/rng:include/@href
| $includes//rng:externalRef/@href)"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<!-- we have the complete set of included schemas, so now let's output the embedded schematron -->
<xsl:call-template name="output">
<xsl:with-param name="schemas" select="$schemas"/>
</xsl:call-template>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<!-- -->
<!-- output the schematron information -->
<!-- -->
<xsl:template name="output">
<xsl:param name="schemas"/>
<!-- -->
<sch:schema>
<!-- get header-type elements - eg title and especially ns -->
<!-- title (just one) -->
<xsl:copy-of select="$schemas//sch:title[1]"/>
<!-- get remaining schematron schema children -->
<!-- get non-blank namespace elements, dropping duplicates -->
<xsl:for-each select="$schemas//sch:ns">
<xsl:if test="generate-id(.) = generate-id($schemas//sch:ns[@prefix = current()/@prefix][1])">
<xsl:copy-of select="."/>
</xsl:if>
</xsl:for-each>
<xsl:copy-of select="$schemas//sch:phase"/>
<xsl:copy-of select="$schemas//sch:pattern"/>
<sch:diagnostics>
<xsl:copy-of select="$schemas//sch:diagnostics/*"/>
</sch:diagnostics>
</sch:schema>
</xsl:template>
<!-- -->
</xsl:transform>

View file

@@ -0,0 +1,77 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
based on an original transform by Eddie Robertsson
2001/04/21 fn: added support for included schemas
2001/06/27 er: changed XML Schema prefix from xsd: to xs: and changed to the Rec namespace
2009/12/10 hj: changed Schematron namespace to ISO URI (Holger Joukl)
-->
<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:sch="http://purl.oclc.org/dsdl/schematron" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<!-- Set the output to be XML with an XML declaration and use indentation -->
<xsl:output method="xml" omit-xml-declaration="no" indent="yes" standalone="yes"/>
<!-- -->
<!-- match schema and call recursive template to extract included schemas -->
<!-- -->
<xsl:template match="xs:schema">
<!-- call the schema definition template ... -->
<xsl:call-template name="gatherSchema">
<!-- ... with the current root as the $schemas parameter ... -->
<xsl:with-param name="schemas" select="/"/>
<!-- ... and any includes in the $include parameter -->
<xsl:with-param name="includes"
select="document(/xs:schema/xs:*[self::xs:include or self::xs:import or self::xs:redefine]/@schemaLocation)"/>
</xsl:call-template>
</xsl:template>
<!-- -->
<!-- gather all included schemas into a single parameter variable -->
<!-- -->
<xsl:template name="gatherSchema">
<xsl:param name="schemas"/>
<xsl:param name="includes"/>
<xsl:choose>
<xsl:when test="count($schemas) &lt; count($schemas | $includes)">
<!-- when $includes includes something new, recurse ... -->
<xsl:call-template name="gatherSchema">
<!-- ... with current $includes added to the $schemas parameter ... -->
<xsl:with-param name="schemas" select="$schemas | $includes"/>
<!-- ... and any *new* includes in the $include parameter -->
<xsl:with-param name="includes"
select="document($includes/xs:schema/xs:*[self::xs:include or self::xs:import or self::xs:redefine]/@schemaLocation)"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<!-- we have the complete set of included schemas,
so now let's output the embedded schematron -->
<xsl:call-template name="output">
<xsl:with-param name="schemas" select="$schemas"/>
</xsl:call-template>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<!-- -->
<!-- output the schematron information -->
<!-- -->
<xsl:template name="output">
<xsl:param name="schemas"/>
<!-- -->
<sch:schema>
<!-- get header-type elements - eg title and especially ns -->
<!-- title (just one) -->
<xsl:copy-of select="$schemas//xs:appinfo/sch:title[1]"/>
<!-- get remaining schematron schema children -->
<!-- get non-blank namespace elements, dropping duplicates -->
<xsl:for-each select="$schemas//xs:appinfo/sch:ns">
<xsl:if test="generate-id(.) =
generate-id($schemas//xs:appinfo/sch:ns[@prefix = current()/@prefix][1])">
<xsl:copy-of select="."/>
</xsl:if>
</xsl:for-each>
<xsl:copy-of select="$schemas//xs:appinfo/sch:phase"/>
<xsl:copy-of select="$schemas//xs:appinfo/sch:pattern"/>
<sch:diagnostics>
<xsl:copy-of select="$schemas//xs:appinfo/sch:diagnostics/*"/>
</sch:diagnostics>
</sch:schema>
</xsl:template>
<!-- -->
</xsl:transform>

View file

@@ -0,0 +1,296 @@
<?xml version="1.0" encoding="UTF-8"?><?xar XSLT?>
<!--
OVERVIEW - iso_abstract_expand.xsl
This is a preprocessor for ISO Schematron, which implements abstract patterns.
It also
* extracts a particular schema using an ID, where there are multiple
schemas, such as when they are embedded in the same NVDL script
* experimentally, allows parameter recognition and substitution inside
text as well as @context, @test, & @select.
This should be used after iso-dsdl-include.xsl and before the skeleton or
meta-stylesheet (e.g. iso-svrl.xsl). It only requires XSLT 1.
Each kind of inclusion can be turned off (or on) on the command line.
-->
<!--
VERSION INFORMATION
2008-09-18 RJ
* move out param test from iso:schema template to work with XSLT 1. (Noah Fontes)
2008-07-29 RJ
* Create. Pull out as distinct XSL in its own namespace from old iso_pre_pro.xsl
* Put everything in private namespace
* Rewrite replace_substring named template so that copyright is clear
2008-07-24 RJ
* correct abstract patterns so that they use the correct names: param/@name and
param/@value
2007-01-12 RJ
* Use ISO namespace
* Use pattern/@id not pattern/@name
* Add Oliver Becker's suggestions from the old Schematron-love-in list for <copy>
* Add XT -ism?
2003 RJ
* Original written for old namespace
* http://www.topologi.com/resources/iso-pre-pro.xsl
-->
<!--
LEGAL INFORMATION
Copyright (c) 2000-2008 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from
the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim
that you wrote the original software. If you use this software in a product,
an acknowledgment in the product documentation would be appreciated but is
not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
-->
<xslt:stylesheet version="1.0" xmlns:xslt="http://www.w3.org/1999/XSL/Transform"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:iso="http://purl.oclc.org/dsdl/schematron"
xmlns:nvdl="http://purl.oclc.org/dsdl/nvdl"
xmlns:iae="http://www.schematron.com/namespace/iae"
>
<xslt:param name="schema-id"></xslt:param>
<!-- Driver for the mode -->
<xsl:template match="/">
<xsl:apply-templates select="." mode="iae:go" />
</xsl:template>
<!-- ================================================================================== -->
<!-- Normal processing rules -->
<!-- ================================================================================== -->
<!-- Output only the selected schema -->
<xslt:template match="iso:schema" >
<xsl:if test="string-length($schema-id) =0 or @id= $schema-id ">
<xslt:copy>
<xslt:copy-of select="@*" />
<xslt:apply-templates mode="iae:go" />
</xslt:copy>
</xsl:if>
</xslt:template>
<!-- Strip out any foreign elements above the Schematron schema.
-->
<xslt:template match="*[not(ancestor-or-self::iso:*)]" mode="iae:go" >
<xslt:apply-templates mode="iae:go" />
</xslt:template>
<!-- ================================================================================== -->
<!-- Handle Schematron abstract pattern preprocessing -->
<!-- abstract-to-real calls
do-pattern calls
macro-expand calls
multi-macro-expand
replace-substring -->
<!-- ================================================================================== -->
<!--
Abstract patterns allow you to say, for example
<pattern name="htmlTable" is-a="table">
<param name="row" value="html:tr"/>
<param name="cell" value="html:td" />
<param name="table" value="html:table" />
</pattern>
For a good introduction, see Uche Ogbujii's article for IBM DeveloperWorks
"Discover the flexibility of Schematron abstract patterns"
http://www-128.ibm.com/developerworks/xml/library/x-stron.html
However, note that ISO Schematron uses @name and @value attributes on
the iso:param element, and @id not @name on the pattern element.
-->
<!-- Suppress declarations of abstract patterns -->
<xslt:template match="iso:pattern[@abstract='true']" mode="iae:go" >
<xslt:comment>Suppressed abstract pattern <xslt:value-of select="@id"/> was here</xslt:comment>
</xslt:template>
<!-- Suppress uses of abstract patterns -->
<xslt:template match="iso:pattern[@is-a]" mode="iae:go" >
<xslt:comment>Start pattern based on abstract <xslt:value-of select="@is-a"/></xslt:comment>
<xslt:call-template name="iae:abstract-to-real" >
<xslt:with-param name="caller" select="@id" />
<xslt:with-param name="is-a" select="@is-a" />
</xslt:call-template>
</xslt:template>
<!-- output everything else unchanged -->
<xslt:template match="*" priority="-1" mode="iae:go" >
<xslt:copy>
<xslt:copy-of select="@*" />
<xslt:apply-templates mode="iae:go"/>
</xslt:copy>
</xslt:template>
<!-- Templates for macro expansion of abstract patterns -->
<!-- Sets up the initial conditions for the recursive call -->
<xslt:template name="iae:macro-expand">
<xslt:param name="caller"/>
<xslt:param name="text" />
<xslt:call-template name="iae:multi-macro-expand">
<xslt:with-param name="caller" select="$caller"/>
<xslt:with-param name="text" select="$text"/>
<xslt:with-param name="paramNumber" select="1"/>
</xslt:call-template>
</xslt:template>
<!-- Template to replace the current parameter and then
recurse to replace subsequent parameters. -->
<xslt:template name="iae:multi-macro-expand">
<xslt:param name="caller"/>
<xslt:param name="text" />
<xslt:param name="paramNumber" />
<xslt:choose>
<xslt:when test="//iso:pattern[@id=$caller]/iso:param[ $paramNumber]">
<xslt:call-template name="iae:multi-macro-expand">
<xslt:with-param name="caller" select="$caller"/>
<xslt:with-param name="paramNumber" select="$paramNumber + 1"/>
<xslt:with-param name="text" >
<xslt:call-template name="iae:replace-substring">
<xslt:with-param name="original" select="$text"/>
<xslt:with-param name="substring"
select="concat('$', //iso:pattern[@id=$caller]/iso:param[ $paramNumber ]/@name)"/>
<xslt:with-param name="replacement"
select="//iso:pattern[@id=$caller]/iso:param[ $paramNumber ]/@value"/>
</xslt:call-template>
</xslt:with-param>
</xslt:call-template>
</xslt:when>
<xslt:otherwise><xslt:value-of select="$text" /></xslt:otherwise>
</xslt:choose>
</xslt:template>
<!-- generate the real pattern from an abstract pattern + parameters-->
<xslt:template name="iae:abstract-to-real" >
<xslt:param name="caller"/>
<xslt:param name="is-a" />
<xslt:for-each select="//iso:pattern[@id= $is-a]">
<xslt:copy>
<xslt:choose>
<xslt:when test=" string-length( $caller ) = 0">
<xslt:attribute name="id"><xslt:value-of select="concat( generate-id(.) , $is-a)" /></xslt:attribute>
</xslt:when>
<xslt:otherwise>
<xslt:attribute name="id"><xslt:value-of select="$caller" /></xslt:attribute>
</xslt:otherwise>
</xslt:choose>
<xslt:apply-templates select="*|text()" mode="iae:do-pattern" >
<xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
</xslt:apply-templates>
</xslt:copy>
</xslt:for-each>
</xslt:template>
<!-- Generate a non-abstract pattern -->
<xslt:template mode="iae:do-pattern" match="*">
<xslt:param name="caller"/>
<xslt:copy>
<xslt:for-each select="@*[name()='test' or name()='context' or name()='select']">
<xslt:attribute name="{name()}">
<xslt:call-template name="iae:macro-expand">
<xslt:with-param name="text"><xslt:value-of select="."/></xslt:with-param>
<xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
</xslt:call-template>
</xslt:attribute>
</xslt:for-each>
<xslt:copy-of select="@*[name()!='test'][name()!='context'][name()!='select']" />
<xsl:for-each select="node()">
<xsl:choose>
<!-- Experiment: replace macros in text as well, to allow parameterized assertions
and so on, without having to have spurious <iso:value-of> calls and multiple
delimiting -->
<xsl:when test="self::text()">
<xslt:call-template name="iae:macro-expand">
<xslt:with-param name="text"><xslt:value-of select="."/></xslt:with-param>
<xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
</xslt:call-template>
</xsl:when>
<xsl:otherwise>
<xslt:apply-templates select="." mode="iae:do-pattern">
<xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
</xslt:apply-templates>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
</xslt:copy>
</xslt:template>
<!-- UTILITIES -->
<!-- Simple version of replace-substring function -->
<xslt:template name="iae:replace-substring">
<xslt:param name="original" />
<xslt:param name="substring" />
<xslt:param name="replacement" select="''"/>
<xsl:choose>
<xsl:when test="not($original)" />
<xsl:when test="not(string($substring))">
<xsl:value-of select="$original" />
</xsl:when>
<xsl:when test="contains($original, $substring)">
<xsl:variable name="before" select="substring-before($original, $substring)" />
<xsl:variable name="after" select="substring-after($original, $substring)" />
<xsl:value-of select="$before" />
<xsl:value-of select="$replacement" />
<!-- recursion -->
<xsl:call-template name="iae:replace-substring">
<xsl:with-param name="original" select="$after" />
<xsl:with-param name="substring" select="$substring" />
<xsl:with-param name="replacement" select="$replacement" />
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<!-- no substitution -->
<xsl:value-of select="$original" />
</xsl:otherwise>
</xsl:choose>
</xslt:template>
</xslt:stylesheet>

View file

@@ -0,0 +1,55 @@
<?xml version="1.0" ?><?xar XSLT?>
<!-- Implementation for the Schematron XML Schema Language.
http://www.ascc.net/xml/resource/schematron/schematron.html
Copyright (c) 2000,2001 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from
the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim
that you wrote the original software. If you use this software in a product,
an acknowledgment in the product documentation would be appreciated but is
not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
-->
<!-- Schematron message -->
<xsl:stylesheet
version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias">
<xsl:import href="iso_schematron_skeleton_for_xslt1.xsl"/>
<xsl:template name="process-prolog">
<axsl:output method="text" />
</xsl:template>
<!-- use default rule for process-root: copy contents / ignore title -->
<!-- use default rule for process-pattern: ignore name and see -->
<!-- use default rule for process-name: output name -->
<!-- use default rule for process-assert and process-report:
call process-message -->
<xsl:template name="process-message">
<xsl:param name="pattern" />
<xsl:param name="role" />
<axsl:message>
<xsl:apply-templates mode="text"
/> (<xsl:value-of select="$pattern" />
<xsl:if test="$role"> / <xsl:value-of select="$role" />
</xsl:if>)</axsl:message>
</xsl:template>
</xsl:stylesheet>

View file

@@ -0,0 +1,588 @@
<?xml version="1.0" ?>
<!--
ISO_SVRL.xsl
Implementation of Schematron Validation Report Language from ISO Schematron
ISO/IEC 19757 Document Schema Definition Languages (DSDL)
Part 3: Rule-based validation Schematron
Annex D: Schematron Validation Report Language
This ISO Standard is available free as a Publicly Available Specification in PDF from ISO.
Also see www.schematron.com for drafts and other information.
This implementation of SVRL is designed to run with the "Skeleton" implementation
of Schematron which Oliver Becker devised. The skeleton code provides a
Schematron implementation but with named templates for handling all output;
the skeleton provides basic templates for output using this API, but client
validators can be written to import the skeleton and override the default output
templates as required. (In order to understand this, you must understand that
a named template such as "process-assert" in this XSLT stylesheet overrides and
replaces any template with the same name in the imported skeleton XSLT file.)
The other important thing to understand in this code is that there are different
versions of the Schematron skeleton. These track the development of Schematron through
Schematron 1.5, Schematron 1.6 and now ISO Schematron. Only one skeleton must be
imported. The code has templates for the different skeletons commented out for
convenience. ISO Schematron has a different namespace than Schematron 1.5 and 1.6;
so the ISO Schematron skeleton itself has been written with an optional import
statement that in turn imports the Schematron 1.6 skeleton. This will allow you to
validate with schemas from either namespace.
History:
2009-03-18
* Fix attribute with space "see " which generates a wrong name in some processors
2008-08-11
* RJ Fix attribute/@select which saxon allows in XSLT 1
2008-08-07
* RJ Add output-encoding attribute to specify final encoding to use
* Alter allow-foreign functionality so that Schematron span, emph and dir elements make
it to the output, for better formatting and because span can be used to mark up
semantically interesting information embedded in diagnostics, which reduces the
need to extend SVRL itself
* Diagnostic-reference had an invalid attribute @id that duplicated @diagnostic: removed
2008-08-06
* RJ Fix invalid output: svrl:diagnostic-reference is not contained in an svrl:text
* Output comment to SVRL file giving filename if available (from command-line parameter)
2008-08-04
* RJ move sch: prefix to schold: prefix to prevent confusion (we want people to
be able to switch from old namespace to new namespace without changing the
sch: prefix, so it is better to keep that prefix completely out of the XSLT)
* Extra signature fixes (PH)
2008-08-03
* Repair missing class parameter on process-p
2008-07-31
* Update skeleton names
2007-04-03
* Add option generate-fired-rule (RG)
2007-02-07
* Prefer true|false for parameters, but allow yes|no on some older ones for compatibility
* DP Diagnostics output to svrl:text. Diagnosis put out after assertion text.
* Removed non-SVRL elements and attributes: better handled as an extra layer that invokes this one
* Add more formal parameters
* Correct confusion between $schemaVersion and $queryBinding
* Indent
* Validate against RNC schemas for XSLT 1 and 2 (with regex tests removed)
* Validate output with UniversalTest.sch against RNC schema for ISO SVRL
2007-02-01
* DP. Update formal parameters of overriding named templates to handle more attributes.
* DP. Refactor handling of rich and linkable parameters to a named template.
2007-01-22
* DP change svrl:ns to svrl:ns-in-attribute-value
* Change default when no queryBinding from "unknown" to "xslt"
2007-01-18:
* Improve documentation
* KH Add command-line options to generate paths or not
* Use axsl:attribute rather than xsl:attribute to shut XSLT2 up
* Add extra command-line options to pass to the iso_schematron_skeleton
2006-12-01: iso_svrl.xsl Rick Jelliffe,
* update namespace,
* update phase handling,
* add flag param to process-assert and process-report & @ flag on output
2001: Conformance1-5.xsl Rick Jelliffe,
* Created, using the skeleton code contributed by Oliver Becker
-->
<!--
Derived from Conformance1-5.xsl.
Copyright (c) 2001, 2006 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from
the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim
that you wrote the original software. If you use this software in a product,
an acknowledgment in the product documentation would be appreciated but is
not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
-->
<!-- Ideas nabbed from schematrons by Francis N., Miloslav N. and David C. -->
<!-- The command-line parameters are:
phase NMTOKEN | "#ALL" (default) Select the phase for validation
allow-foreign "true" | "false" (default) Pass non-Schematron elements and rich markup to the generated stylesheet
diagnose= true | false|yes|no Add the diagnostics to the assertion test in reports (yes|no are obsolete)
generate-paths=true|false|yes|no generate the @location attribute with XPaths (yes|no are obsolete)
sch.exslt.imports semi-colon delimited string of filenames for some EXSLT implementations
optimize "visit-no-attributes" Use only when the schema has no attributes as the context nodes
generate-fired-rule "true"(default) | "false" Generate fired-rule elements
-->
<xsl:stylesheet
version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias"
xmlns:schold="http://www.ascc.net/xml/schematron"
xmlns:iso="http://purl.oclc.org/dsdl/schematron"
xmlns:svrl="http://purl.oclc.org/dsdl/svrl"
>
<!-- Select the import statement and adjust the path as
necessary for your system.
If not XSLT2 then also remove svrl:active-pattern/@document="{document-uri()}" from process-pattern()
-->
<!--
<xsl:import href="iso_schematron_skeleton_for_saxon.xsl"/>
-->
<xsl:import href="iso_schematron_skeleton_for_xslt1.xsl"/>
<!--
<xsl:import href="iso_schematron_skeleton.xsl"/>
<xsl:import href="skeleton1-5.xsl"/>
<xsl:import href="skeleton1-6.xsl"/>
-->
<xsl:param name="diagnose" >true</xsl:param>
<xsl:param name="phase" >
<xsl:choose>
<!-- Handle Schematron 1.5 and 1.6 phases -->
<xsl:when test="//schold:schema/@defaultPhase">
<xsl:value-of select="//schold:schema/@defaultPhase"/>
</xsl:when>
<!-- Handle ISO Schematron phases -->
<xsl:when test="//iso:schema/@defaultPhase">
<xsl:value-of select="//iso:schema/@defaultPhase"/>
</xsl:when>
<xsl:otherwise>#ALL</xsl:otherwise>
</xsl:choose>
</xsl:param>
<xsl:param name="allow-foreign" >false</xsl:param>
<xsl:param name="generate-paths" >true</xsl:param>
<xsl:param name="generate-fired-rule" >true</xsl:param>
<xsl:param name="optimize"/>
<xsl:param name="output-encoding" ></xsl:param>
<!-- e.g. saxon file.xml file.xsl "sch.exslt.imports=.../string.xsl;.../math.xsl" -->
<xsl:param name="sch.exslt.imports" />
<!-- Experimental: if this file is called, then it must be generating SVRL -->
<xsl:variable name="svrlTest" select="true()" />
<!-- ================================================================ -->
<xsl:template name="process-prolog">
<axsl:output method="xml" omit-xml-declaration="no" standalone="yes"
indent="yes">
<xsl:if test=" string-length($output-encoding) &gt; 0">
<xsl:attribute name="encoding"><xsl:value-of select=" $output-encoding" /></xsl:attribute>
</xsl:if>
</axsl:output>
</xsl:template>
<!-- Overrides skeleton.xsl -->
<xsl:template name="process-root">
<xsl:param name="title"/>
<xsl:param name="contents" />
<xsl:param name="queryBinding" >xslt1</xsl:param>
<xsl:param name="schemaVersion" />
<xsl:param name="id" />
<xsl:param name="version"/>
<!-- "Rich" parameters -->
<xsl:param name="fpi" />
<xsl:param name="icon" />
<xsl:param name="lang" />
<xsl:param name="see" />
<xsl:param name="space" />
<svrl:schematron-output title="{$title}" schemaVersion="{$schemaVersion}" >
<xsl:if test=" string-length( normalize-space( $phase )) &gt; 0 and
not( normalize-space( $phase ) = '#ALL') ">
<axsl:attribute name="phase">
<xsl:value-of select=" $phase " />
</axsl:attribute>
</xsl:if>
<xsl:if test=" $allow-foreign = 'true'">
</xsl:if>
<xsl:if test=" $allow-foreign = 'true'">
<xsl:call-template name='richParms'>
<xsl:with-param name="fpi" select="$fpi" />
<xsl:with-param name="icon" select="$icon"/>
<xsl:with-param name="lang" select="$lang"/>
<xsl:with-param name="see" select="$see" />
<xsl:with-param name="space" select="$space" />
</xsl:call-template>
</xsl:if>
<axsl:comment><axsl:value-of select="$archiveDirParameter"/> &#xA0;
<axsl:value-of select="$archiveNameParameter"/> &#xA0;
<axsl:value-of select="$fileNameParameter"/> &#xA0;
<axsl:value-of select="$fileDirParameter"/></axsl:comment>
<xsl:apply-templates mode="do-schema-p" />
<xsl:copy-of select="$contents" />
</svrl:schematron-output>
</xsl:template>
<xsl:template name="process-assert">
<xsl:param name="test"/>
<xsl:param name="diagnostics" />
<xsl:param name="id" />
<xsl:param name="flag" />
<!-- "Linkable" parameters -->
<xsl:param name="role"/>
<xsl:param name="subject"/>
<!-- "Rich" parameters -->
<xsl:param name="fpi" />
<xsl:param name="icon" />
<xsl:param name="lang" />
<xsl:param name="see" />
<xsl:param name="space" />
<svrl:failed-assert test="{$test}" >
<xsl:if test="string-length( $id ) &gt; 0">
<axsl:attribute name="id">
<xsl:value-of select=" $id " />
</axsl:attribute>
</xsl:if>
<xsl:if test=" string-length( $flag ) &gt; 0">
<axsl:attribute name="flag">
<xsl:value-of select=" $flag " />
</axsl:attribute>
</xsl:if>
<!-- Process rich attributes. -->
<xsl:call-template name="richParms">
<xsl:with-param name="fpi" select="$fpi"/>
<xsl:with-param name="icon" select="$icon"/>
<xsl:with-param name="lang" select="$lang"/>
<xsl:with-param name="see" select="$see" />
<xsl:with-param name="space" select="$space" />
</xsl:call-template>
<xsl:call-template name='linkableParms'>
<xsl:with-param name="role" select="$role" />
<xsl:with-param name="subject" select="$subject"/>
</xsl:call-template>
<xsl:if test=" $generate-paths = 'true' or $generate-paths= 'yes' ">
<!-- true/false is the new way -->
<axsl:attribute name="location">
<axsl:apply-templates select="." mode="schematron-get-full-path"/>
</axsl:attribute>
</xsl:if>
<svrl:text>
<xsl:apply-templates mode="text" />
</svrl:text>
<xsl:if test="$diagnose = 'yes' or $diagnose= 'true' ">
<!-- true/false is the new way -->
<xsl:call-template name="diagnosticsSplit">
<xsl:with-param name="str" select="$diagnostics"/>
</xsl:call-template>
</xsl:if>
</svrl:failed-assert>
</xsl:template>
<xsl:template name="process-report">
<xsl:param name="id"/>
<xsl:param name="test"/>
<xsl:param name="diagnostics"/>
<xsl:param name="flag" />
<!-- "Linkable" parameters -->
<xsl:param name="role"/>
<xsl:param name="subject"/>
<!-- "Rich" parameters -->
<xsl:param name="fpi" />
<xsl:param name="icon" />
<xsl:param name="lang" />
<xsl:param name="see" />
<xsl:param name="space" />
<svrl:successful-report test="{$test}" >
<xsl:if test=" string-length( $id ) &gt; 0">
<axsl:attribute name="id">
<xsl:value-of select=" $id " />
</axsl:attribute>
</xsl:if>
<xsl:if test=" string-length( $flag ) &gt; 0">
<axsl:attribute name="flag">
<xsl:value-of select=" $flag " />
</axsl:attribute>
</xsl:if>
<!-- Process rich attributes. -->
<xsl:call-template name="richParms">
<xsl:with-param name="fpi" select="$fpi"/>
<xsl:with-param name="icon" select="$icon"/>
<xsl:with-param name="lang" select="$lang"/>
<xsl:with-param name="see" select="$see" />
<xsl:with-param name="space" select="$space" />
</xsl:call-template>
<xsl:call-template name='linkableParms'>
<xsl:with-param name="role" select="$role" />
<xsl:with-param name="subject" select="$subject"/>
</xsl:call-template>
<xsl:if test=" $generate-paths = 'yes' or $generate-paths = 'true' ">
<!-- true/false is the new way -->
<axsl:attribute name="location">
<axsl:apply-templates select="." mode="schematron-get-full-path"/>
</axsl:attribute>
</xsl:if>
<svrl:text>
<xsl:apply-templates mode="text" />
</svrl:text>
<xsl:if test="$diagnose = 'yes' or $diagnose='true' ">
<!-- true/false is the new way -->
<xsl:call-template name="diagnosticsSplit">
<xsl:with-param name="str" select="$diagnostics"/>
</xsl:call-template>
</xsl:if>
</svrl:successful-report>
</xsl:template>
<!-- Overrides skeleton -->
<xsl:template name="process-dir" >
<xsl:param name="value" />
<xsl:choose>
<xsl:when test=" $allow-foreign = 'true'">
<xsl:copy-of select="."/>
</xsl:when>
<xsl:otherwise>
<!-- We generate too much whitespace rather than risking concatenation -->
<axsl:text> </axsl:text>
<xsl:apply-templates mode="inline-text"/>
<axsl:text> </axsl:text>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template name="process-diagnostic">
<xsl:param name="id"/>
<!-- Rich parameters -->
<xsl:param name="fpi" />
<xsl:param name="icon" />
<xsl:param name="lang" />
<xsl:param name="see" />
<xsl:param name="space" />
<svrl:diagnostic-reference diagnostic="{$id}" >
<xsl:call-template name="richParms">
<xsl:with-param name="fpi" select="$fpi"/>
<xsl:with-param name="icon" select="$icon"/>
<xsl:with-param name="lang" select="$lang"/>
<xsl:with-param name="see" select="$see" />
<xsl:with-param name="space" select="$space" />
</xsl:call-template>
<xsl:text>
</xsl:text>
<xsl:apply-templates mode="text"/>
</svrl:diagnostic-reference>
</xsl:template>
<!-- Overrides skeleton -->
<xsl:template name="process-emph" >
<xsl:param name="class" />
<xsl:choose>
<xsl:when test=" $allow-foreign = 'true'">
<xsl:copy-of select="."/>
</xsl:when>
<xsl:otherwise>
<!-- We generate too much whitespace rather than risking concatenation -->
<axsl:text> </axsl:text>
<xsl:apply-templates mode="inline-text"/>
<axsl:text> </axsl:text>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template name="process-rule">
<xsl:param name="id"/>
<xsl:param name="context"/>
<xsl:param name="flag"/>
<!-- "Linkable" parameters -->
<xsl:param name="role"/>
<xsl:param name="subject"/>
<!-- "Rich" parameters -->
<xsl:param name="fpi" />
<xsl:param name="icon" />
<xsl:param name="lang" />
<xsl:param name="see" />
<xsl:param name="space" />
<xsl:if test=" $generate-fired-rule = 'true'">
<svrl:fired-rule context="{$context}" >
<!-- Process rich attributes. -->
<xsl:call-template name="richParms">
<xsl:with-param name="fpi" select="$fpi"/>
<xsl:with-param name="icon" select="$icon"/>
<xsl:with-param name="lang" select="$lang"/>
<xsl:with-param name="see" select="$see" />
<xsl:with-param name="space" select="$space" />
</xsl:call-template>
<xsl:if test=" string( $id )">
<xsl:attribute name="id">
<xsl:value-of select=" $id " />
</xsl:attribute>
</xsl:if>
<xsl:if test=" string-length( $role ) &gt; 0">
<xsl:attribute name="role">
<xsl:value-of select=" $role " />
</xsl:attribute>
</xsl:if>
</svrl:fired-rule>
</xsl:if>
</xsl:template>
<xsl:template name="process-ns">
<xsl:param name="prefix"/>
<xsl:param name="uri"/>
<svrl:ns-prefix-in-attribute-values uri="{$uri}" prefix="{$prefix}" />
</xsl:template>
<xsl:template name="process-p">
<xsl:param name="icon"/>
<xsl:param name="class"/>
<xsl:param name="id"/>
<xsl:param name="lang"/>
<svrl:text>
<xsl:apply-templates mode="text"/>
</svrl:text>
</xsl:template>
<xsl:template name="process-pattern">
<xsl:param name="name"/>
<xsl:param name="id"/>
<xsl:param name="is-a"/>
<!-- "Rich" parameters -->
<xsl:param name="fpi" />
<xsl:param name="icon" />
<xsl:param name="lang" />
<xsl:param name="see" />
<xsl:param name="space" />
<svrl:active-pattern >
<xsl:if test=" string( $id )">
<axsl:attribute name="id">
<xsl:value-of select=" $id " />
</axsl:attribute>
</xsl:if>
<xsl:if test=" string( $name )">
<axsl:attribute name="name">
<xsl:value-of select=" $name " />
</axsl:attribute>
</xsl:if>
<xsl:call-template name='richParms'>
<xsl:with-param name="fpi" select="$fpi"/>
<xsl:with-param name="icon" select="$icon"/>
<xsl:with-param name="lang" select="$lang"/>
<xsl:with-param name="see" select="$see" />
<xsl:with-param name="space" select="$space" />
</xsl:call-template>
<!-- ?? report that this screws up iso:title processing -->
<xsl:apply-templates mode="do-pattern-p"/>
<!-- ?? Seems that this apply-templates is never triggered DP -->
<axsl:apply-templates />
</svrl:active-pattern>
</xsl:template>
<!-- Overrides skeleton -->
<xsl:template name="process-message" >
<xsl:param name="pattern"/>
<xsl:param name="role"/>
</xsl:template>
<!-- Overrides skeleton -->
<xsl:template name="process-span" >
<xsl:param name="class" />
<xsl:choose>
<xsl:when test=" $allow-foreign = 'true'">
<xsl:copy-of select="."/>
</xsl:when>
<xsl:otherwise>
<!-- We generate too much whitespace rather than risking concatenation -->
<axsl:text> </axsl:text>
<xsl:apply-templates mode="inline-text"/>
<axsl:text> </axsl:text>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<!-- =========================================================================== -->
<!-- processing rich parameters. -->
<xsl:template name='richParms'>
<!-- "Rich" parameters -->
<xsl:param name="fpi" />
<xsl:param name="icon" />
<xsl:param name="lang" />
<xsl:param name="see" />
<xsl:param name="space" />
<!-- Process rich attributes. -->
<xsl:if test=" $allow-foreign = 'true'">
<xsl:if test="string($fpi)">
<axsl:attribute name="fpi">
<xsl:value-of select="$fpi"/>
</axsl:attribute>
</xsl:if>
<xsl:if test="string($icon)">
<axsl:attribute name="icon">
<xsl:value-of select="$icon"/>
</axsl:attribute>
</xsl:if>
<xsl:if test="string($see)">
<axsl:attribute name="see">
<xsl:value-of select="$see"/>
</axsl:attribute>
</xsl:if>
</xsl:if>
<xsl:if test="string($space)">
<axsl:attribute name="xml:space">
<xsl:value-of select="$space"/>
</axsl:attribute>
</xsl:if>
<xsl:if test="string($lang)">
<axsl:attribute name="xml:lang">
<xsl:value-of select="$lang"/>
</axsl:attribute>
</xsl:if>
</xsl:template>
<!-- processing linkable parameters. -->
<xsl:template name='linkableParms'>
<xsl:param name="role"/>
<xsl:param name="subject"/>
<!-- ISO SVRL has a role attribute to match the Schematron role attribute -->
<xsl:if test=" string($role )">
<axsl:attribute name="role">
<xsl:value-of select=" $role " />
</axsl:attribute>
</xsl:if>
<!-- ISO SVRL does not have a subject attribute to match the Schematron subject attribute.
Instead, the Schematron subject attribute is folded into the location attribute -->
</xsl:template>
</xsl:stylesheet>

View file

@@ -0,0 +1,83 @@
ISO SCHEMATRON 2009
XSLT implementation by Rick Jelliffe with assistance from members of the Schematron-love-in mailing list.
2009-03-18
Two distributions are available. One is for XSLT1 engines.
The other is for XSLT2 engines, such as SAXON 9.
This version of Schematron splits the process into a pipeline of several different XSLT stages.
1) First, preprocess your Schematron schema with iso_dsdl_include.xsl.
This is a macro processor to assemble the schema from various parts.
If your schema is not in separate parts, you can skip this stage.
2) Second, preprocess the output from stage 1 with iso_abstract_expand.xsl.
This is a macro processor to convert abstract patterns to real patterns.
If your schema does not use abstract patterns, you can skip this
stage.
3) Third, compile the Schematron schema into an XSLT script.
This will typically use iso_svrl_for_xslt1.xsl or iso_svrl_for_xslt2.xsl
(which in turn invoke iso_schematron_skeleton_for_xslt1.xsl or iso_schematron_skeleton_for_saxon.xsl)
However, other "meta-styleseets" are also in common use; the principle of operation is the same.
If your schema uses Schematron phases, supply these as command line/invocation parameters
to this process.
4) Fourth, run the script generated by stage 3 against the document being validated.
If you are using the SVRL script, then the output of validation will be an XML document.
If your schema uses Schematron parameters, supply these as command line/invocation parameters
to this process.
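For illustration, the four stages above can also be driven from Python with lxml
(lxml uses libxslt, an XSLT 1 engine, so the XSLT1 stylesheets apply). The following
is only a sketch, assuming the three stylesheets plus test.sch and instance.xml
sit in the current directory:

from lxml import etree

# Stages 1-3: assemble inclusions, expand abstract patterns,
# and compile the Schematron schema into a validating XSLT.
include = etree.XSLT(etree.parse("iso_dsdl_include.xsl"))
expand = etree.XSLT(etree.parse("iso_abstract_expand.xsl"))
compile_schema = etree.XSLT(etree.parse("iso_svrl_for_xslt1.xsl"))

schema = etree.parse("test.sch")
compiled = compile_schema(expand(include(schema)),
                          phase=etree.XSLT.strparam("#ALL"))
validator = etree.XSLT(compiled)

# Stage 4: validate the instance document; the output is an SVRL report.
svrl = validator(etree.parse("instance.xml"))
print(str(svrl))

Any failed assertions then appear as svrl:failed-assert elements in the resulting report.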
The XSLT2 distribution also features several next generation features,
such as validating multiple documents. See the source code for details.
Schematron assertions can be written in any language, of course; the file
sch-messages-en.xhtml contains the diagnostics messages from the XSLT2 skeleton
in English, and this can be used as a template to localize the skeleton's
error messages. Note that programming errors in Schematron are typically XPath
errors, which require localized messages from the XSLT engine.
ANT
---
To give an example of how to process a document, here is a sample ANT task.
<target name="schematron-compile-test" >
<!-- expand inclusions -->
<xslt basedir="test/schematron"
style="iso_dsdl_include.xsl" in="test.sch" out="test1.sch">
<classpath>
<pathelement location="${lib.dir}/saxon9.jar"/>
</classpath>
</xslt>
<!-- expand abstract patterns -->
<xslt basedir="test/schematron"
style="iso_abstract_expand.xsl" in="test1.sch" out="test2.sch">
<classpath>
<pathelement location="${lib.dir}/saxon9.jar"/>
</classpath>
</xslt>
<!-- compile it -->
<xslt basedir="test/schematron"
style="iso_svrl_for_xslt2.xsl" in="test2.sch" out="test.xsl">
<classpath>
<pathelement location="${lib.dir}/saxon9.jar"/>
</classpath>
</xslt>
<!-- validate -->
<xslt basedir="test/schematron"
style="test.xsl" in="instance.xml" out="instance.svrlt">
<classpath>
<pathelement location="${lib.dir}/saxon9.jar"/>
</classpath>
</xslt>
</target>

View file

@@ -0,0 +1,219 @@
/* Generated by Cython 0.23.4 */
#ifndef __PYX_HAVE__lxml__etree
#define __PYX_HAVE__lxml__etree
struct LxmlDocument;
struct LxmlElement;
struct LxmlElementTree;
struct LxmlElementTagMatcher;
struct LxmlElementIterator;
struct LxmlElementBase;
struct LxmlElementClassLookup;
struct LxmlFallbackElementClassLookup;
/* "src/lxml/lxml.etree.pyx":328
*
* # type of a function that steps from node to node
* ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*) # <<<<<<<<<<<<<<
*
*
*/
typedef xmlNode *(*_node_to_node_function)(xmlNode *);
/* "src/lxml/lxml.etree.pyx":344
* @cython.final
* @cython.freelist(8)
* cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]: # <<<<<<<<<<<<<<
* u"""Internal base class to reference a libxml document.
*
*/
struct LxmlDocument {
PyObject_HEAD
struct __pyx_vtabstruct_4lxml_5etree__Document *__pyx_vtab;
int _ns_counter;
PyObject *_prefix_tail;
xmlDoc *_c_doc;
struct __pyx_obj_4lxml_5etree__BaseParser *_parser;
};
/* "src/lxml/lxml.etree.pyx":696
*
* @cython.no_gc_clear
* cdef public class _Element [ type LxmlElementType, object LxmlElement ]: # <<<<<<<<<<<<<<
* u"""Element class.
*
*/
struct LxmlElement {
PyObject_HEAD
struct LxmlDocument *_doc;
xmlNode *_c_node;
PyObject *_tag;
};
/* "src/lxml/lxml.etree.pyx":1858
*
*
* cdef public class _ElementTree [ type LxmlElementTreeType, # <<<<<<<<<<<<<<
* object LxmlElementTree ]:
* cdef _Document _doc
*/
struct LxmlElementTree {
PyObject_HEAD
struct __pyx_vtabstruct_4lxml_5etree__ElementTree *__pyx_vtab;
struct LxmlDocument *_doc;
struct LxmlElement *_context_node;
};
/* "src/lxml/lxml.etree.pyx":2572
*
*
* cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher, # <<<<<<<<<<<<<<
* type LxmlElementTagMatcherType ]:
* """
*/
struct LxmlElementTagMatcher {
PyObject_HEAD
struct __pyx_vtabstruct_4lxml_5etree__ElementTagMatcher *__pyx_vtab;
PyObject *_pystrings;
int _node_type;
char *_href;
char *_name;
};
/* "src/lxml/lxml.etree.pyx":2603
* self._name = NULL
*
* cdef public class _ElementIterator(_ElementTagMatcher) [ # <<<<<<<<<<<<<<
* object LxmlElementIterator, type LxmlElementIteratorType ]:
* """
*/
struct LxmlElementIterator {
struct LxmlElementTagMatcher __pyx_base;
struct LxmlElement *_node;
_node_to_node_function _next_element;
};
/* "src/lxml/classlookup.pxi":6
* # Custom Element classes
*
* cdef public class ElementBase(_Element) [ type LxmlElementBaseType, # <<<<<<<<<<<<<<
* object LxmlElementBase ]:
* u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)
*/
struct LxmlElementBase {
struct LxmlElement __pyx_base;
};
/* "src/lxml/classlookup.pxi":211
* # Element class lookup
*
* ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*) # <<<<<<<<<<<<<<
*
* # class to store element class lookup functions
*/
typedef PyObject *(*_element_class_lookup_function)(PyObject *, struct LxmlDocument *, xmlNode *);
/* "src/lxml/classlookup.pxi":214
*
* # class to store element class lookup functions
* cdef public class ElementClassLookup [ type LxmlElementClassLookupType, # <<<<<<<<<<<<<<
* object LxmlElementClassLookup ]:
* u"""ElementClassLookup(self)
*/
struct LxmlElementClassLookup {
PyObject_HEAD
_element_class_lookup_function _lookup_function;
};
/* "src/lxml/classlookup.pxi":223
* self._lookup_function = NULL # use default lookup
*
* cdef public class FallbackElementClassLookup(ElementClassLookup) \ # <<<<<<<<<<<<<<
* [ type LxmlFallbackElementClassLookupType,
* object LxmlFallbackElementClassLookup ]:
*/
struct LxmlFallbackElementClassLookup {
struct LxmlElementClassLookup __pyx_base;
struct __pyx_vtabstruct_4lxml_5etree_FallbackElementClassLookup *__pyx_vtab;
struct LxmlElementClassLookup *fallback;
_element_class_lookup_function _fallback_function;
};
#ifndef __PYX_HAVE_API__lxml__etree
#ifndef __PYX_EXTERN_C
#ifdef __cplusplus
#define __PYX_EXTERN_C extern "C"
#else
#define __PYX_EXTERN_C extern
#endif
#endif
#ifndef DL_IMPORT
#define DL_IMPORT(_T) _T
#endif
__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlDocumentType;
__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementType;
__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementTreeType;
__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementTagMatcherType;
__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementIteratorType;
__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementBaseType;
__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlElementClassLookupType;
__PYX_EXTERN_C DL_IMPORT(PyTypeObject) LxmlFallbackElementClassLookupType;
__PYX_EXTERN_C DL_IMPORT(struct LxmlElement) *deepcopyNodeToDocument(struct LxmlDocument *, xmlNode *);
__PYX_EXTERN_C DL_IMPORT(struct LxmlElementTree) *elementTreeFactory(struct LxmlElement *);
__PYX_EXTERN_C DL_IMPORT(struct LxmlElementTree) *newElementTree(struct LxmlElement *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(struct LxmlElement) *elementFactory(struct LxmlDocument *, xmlNode *);
__PYX_EXTERN_C DL_IMPORT(struct LxmlElement) *makeElement(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(struct LxmlElement) *makeSubElement(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(void) setElementClassLookupFunction(_element_class_lookup_function, PyObject *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *lookupDefaultElementClass(PyObject *, PyObject *, xmlNode *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *lookupNamespaceElementClass(PyObject *, PyObject *, xmlNode *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *callLookupFallback(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *);
__PYX_EXTERN_C DL_IMPORT(int) tagMatches(xmlNode *, const xmlChar *, const xmlChar *);
__PYX_EXTERN_C DL_IMPORT(struct LxmlDocument) *documentOrRaise(PyObject *);
__PYX_EXTERN_C DL_IMPORT(struct LxmlElement) *rootNodeOrRaise(PyObject *);
__PYX_EXTERN_C DL_IMPORT(int) hasText(xmlNode *);
__PYX_EXTERN_C DL_IMPORT(int) hasTail(xmlNode *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *textOf(xmlNode *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *tailOf(xmlNode *);
__PYX_EXTERN_C DL_IMPORT(int) setNodeText(xmlNode *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(int) setTailText(xmlNode *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *attributeValue(xmlNode *, xmlAttr *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *attributeValueFromNsName(xmlNode *, const xmlChar *, const xmlChar *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *getAttributeValue(struct LxmlElement *, PyObject *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *iterattributes(struct LxmlElement *, int);
__PYX_EXTERN_C DL_IMPORT(PyObject) *collectAttributes(xmlNode *, int);
__PYX_EXTERN_C DL_IMPORT(int) setAttributeValue(struct LxmlElement *, PyObject *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(int) delAttribute(struct LxmlElement *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(int) delAttributeFromNsName(xmlNode *, const xmlChar *, const xmlChar *);
__PYX_EXTERN_C DL_IMPORT(int) hasChild(xmlNode *);
__PYX_EXTERN_C DL_IMPORT(xmlNode) *findChild(xmlNode *, Py_ssize_t);
__PYX_EXTERN_C DL_IMPORT(xmlNode) *findChildForwards(xmlNode *, Py_ssize_t);
__PYX_EXTERN_C DL_IMPORT(xmlNode) *findChildBackwards(xmlNode *, Py_ssize_t);
__PYX_EXTERN_C DL_IMPORT(xmlNode) *nextElement(xmlNode *);
__PYX_EXTERN_C DL_IMPORT(xmlNode) *previousElement(xmlNode *);
__PYX_EXTERN_C DL_IMPORT(void) appendChild(struct LxmlElement *, struct LxmlElement *);
__PYX_EXTERN_C DL_IMPORT(int) appendChildToElement(struct LxmlElement *, struct LxmlElement *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *pyunicode(const xmlChar *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *utf8(PyObject *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *getNsTag(PyObject *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *getNsTagWithEmptyNs(PyObject *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *namespacedName(xmlNode *);
__PYX_EXTERN_C DL_IMPORT(PyObject) *namespacedNameFromNsName(const xmlChar *, const xmlChar *);
__PYX_EXTERN_C DL_IMPORT(void) iteratorStoreNext(struct LxmlElementIterator *, struct LxmlElement *);
__PYX_EXTERN_C DL_IMPORT(void) initTagMatch(struct LxmlElementTagMatcher *, PyObject *);
__PYX_EXTERN_C DL_IMPORT(xmlNs) *findOrBuildNodeNsPrefix(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *);
#endif /* !__PYX_HAVE_API__lxml__etree */
#if PY_MAJOR_VERSION < 3
PyMODINIT_FUNC initetree(void);
#else
PyMODINIT_FUNC PyInit_etree(void);
#endif
#endif /* !__PYX_HAVE__lxml__etree */

View file

@@ -0,0 +1,230 @@
/* Generated by Cython 0.23.4 */
#ifndef __PYX_HAVE_API__lxml__etree
#define __PYX_HAVE_API__lxml__etree
#include "Python.h"
#include "lxml.etree.h"
static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument)(struct LxmlDocument *, xmlNode *) = 0;
#define deepcopyNodeToDocument __pyx_api_f_4lxml_5etree_deepcopyNodeToDocument
static struct LxmlElementTree *(*__pyx_api_f_4lxml_5etree_elementTreeFactory)(struct LxmlElement *) = 0;
#define elementTreeFactory __pyx_api_f_4lxml_5etree_elementTreeFactory
static struct LxmlElementTree *(*__pyx_api_f_4lxml_5etree_newElementTree)(struct LxmlElement *, PyObject *) = 0;
#define newElementTree __pyx_api_f_4lxml_5etree_newElementTree
static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_elementFactory)(struct LxmlDocument *, xmlNode *) = 0;
#define elementFactory __pyx_api_f_4lxml_5etree_elementFactory
static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_makeElement)(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *) = 0;
#define makeElement __pyx_api_f_4lxml_5etree_makeElement
static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_makeSubElement)(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *) = 0;
#define makeSubElement __pyx_api_f_4lxml_5etree_makeSubElement
static void (*__pyx_api_f_4lxml_5etree_setElementClassLookupFunction)(_element_class_lookup_function, PyObject *) = 0;
#define setElementClassLookupFunction __pyx_api_f_4lxml_5etree_setElementClassLookupFunction
static PyObject *(*__pyx_api_f_4lxml_5etree_lookupDefaultElementClass)(PyObject *, PyObject *, xmlNode *) = 0;
#define lookupDefaultElementClass __pyx_api_f_4lxml_5etree_lookupDefaultElementClass
static PyObject *(*__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass)(PyObject *, PyObject *, xmlNode *) = 0;
#define lookupNamespaceElementClass __pyx_api_f_4lxml_5etree_lookupNamespaceElementClass
static PyObject *(*__pyx_api_f_4lxml_5etree_callLookupFallback)(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *) = 0;
#define callLookupFallback __pyx_api_f_4lxml_5etree_callLookupFallback
static int (*__pyx_api_f_4lxml_5etree_tagMatches)(xmlNode *, const xmlChar *, const xmlChar *) = 0;
#define tagMatches __pyx_api_f_4lxml_5etree_tagMatches
static struct LxmlDocument *(*__pyx_api_f_4lxml_5etree_documentOrRaise)(PyObject *) = 0;
#define documentOrRaise __pyx_api_f_4lxml_5etree_documentOrRaise
static struct LxmlElement *(*__pyx_api_f_4lxml_5etree_rootNodeOrRaise)(PyObject *) = 0;
#define rootNodeOrRaise __pyx_api_f_4lxml_5etree_rootNodeOrRaise
static int (*__pyx_api_f_4lxml_5etree_hasText)(xmlNode *) = 0;
#define hasText __pyx_api_f_4lxml_5etree_hasText
static int (*__pyx_api_f_4lxml_5etree_hasTail)(xmlNode *) = 0;
#define hasTail __pyx_api_f_4lxml_5etree_hasTail
static PyObject *(*__pyx_api_f_4lxml_5etree_textOf)(xmlNode *) = 0;
#define textOf __pyx_api_f_4lxml_5etree_textOf
static PyObject *(*__pyx_api_f_4lxml_5etree_tailOf)(xmlNode *) = 0;
#define tailOf __pyx_api_f_4lxml_5etree_tailOf
static int (*__pyx_api_f_4lxml_5etree_setNodeText)(xmlNode *, PyObject *) = 0;
#define setNodeText __pyx_api_f_4lxml_5etree_setNodeText
static int (*__pyx_api_f_4lxml_5etree_setTailText)(xmlNode *, PyObject *) = 0;
#define setTailText __pyx_api_f_4lxml_5etree_setTailText
static PyObject *(*__pyx_api_f_4lxml_5etree_attributeValue)(xmlNode *, xmlAttr *) = 0;
#define attributeValue __pyx_api_f_4lxml_5etree_attributeValue
static PyObject *(*__pyx_api_f_4lxml_5etree_attributeValueFromNsName)(xmlNode *, const xmlChar *, const xmlChar *) = 0;
#define attributeValueFromNsName __pyx_api_f_4lxml_5etree_attributeValueFromNsName
static PyObject *(*__pyx_api_f_4lxml_5etree_getAttributeValue)(struct LxmlElement *, PyObject *, PyObject *) = 0;
#define getAttributeValue __pyx_api_f_4lxml_5etree_getAttributeValue
static PyObject *(*__pyx_api_f_4lxml_5etree_iterattributes)(struct LxmlElement *, int) = 0;
#define iterattributes __pyx_api_f_4lxml_5etree_iterattributes
static PyObject *(*__pyx_api_f_4lxml_5etree_collectAttributes)(xmlNode *, int) = 0;
#define collectAttributes __pyx_api_f_4lxml_5etree_collectAttributes
static int (*__pyx_api_f_4lxml_5etree_setAttributeValue)(struct LxmlElement *, PyObject *, PyObject *) = 0;
#define setAttributeValue __pyx_api_f_4lxml_5etree_setAttributeValue
static int (*__pyx_api_f_4lxml_5etree_delAttribute)(struct LxmlElement *, PyObject *) = 0;
#define delAttribute __pyx_api_f_4lxml_5etree_delAttribute
static int (*__pyx_api_f_4lxml_5etree_delAttributeFromNsName)(xmlNode *, const xmlChar *, const xmlChar *) = 0;
#define delAttributeFromNsName __pyx_api_f_4lxml_5etree_delAttributeFromNsName
static int (*__pyx_api_f_4lxml_5etree_hasChild)(xmlNode *) = 0;
#define hasChild __pyx_api_f_4lxml_5etree_hasChild
static xmlNode *(*__pyx_api_f_4lxml_5etree_findChild)(xmlNode *, Py_ssize_t) = 0;
#define findChild __pyx_api_f_4lxml_5etree_findChild
static xmlNode *(*__pyx_api_f_4lxml_5etree_findChildForwards)(xmlNode *, Py_ssize_t) = 0;
#define findChildForwards __pyx_api_f_4lxml_5etree_findChildForwards
static xmlNode *(*__pyx_api_f_4lxml_5etree_findChildBackwards)(xmlNode *, Py_ssize_t) = 0;
#define findChildBackwards __pyx_api_f_4lxml_5etree_findChildBackwards
static xmlNode *(*__pyx_api_f_4lxml_5etree_nextElement)(xmlNode *) = 0;
#define nextElement __pyx_api_f_4lxml_5etree_nextElement
static xmlNode *(*__pyx_api_f_4lxml_5etree_previousElement)(xmlNode *) = 0;
#define previousElement __pyx_api_f_4lxml_5etree_previousElement
static void (*__pyx_api_f_4lxml_5etree_appendChild)(struct LxmlElement *, struct LxmlElement *) = 0;
#define appendChild __pyx_api_f_4lxml_5etree_appendChild
static int (*__pyx_api_f_4lxml_5etree_appendChildToElement)(struct LxmlElement *, struct LxmlElement *) = 0;
#define appendChildToElement __pyx_api_f_4lxml_5etree_appendChildToElement
static PyObject *(*__pyx_api_f_4lxml_5etree_pyunicode)(const xmlChar *) = 0;
#define pyunicode __pyx_api_f_4lxml_5etree_pyunicode
static PyObject *(*__pyx_api_f_4lxml_5etree_utf8)(PyObject *) = 0;
#define utf8 __pyx_api_f_4lxml_5etree_utf8
static PyObject *(*__pyx_api_f_4lxml_5etree_getNsTag)(PyObject *) = 0;
#define getNsTag __pyx_api_f_4lxml_5etree_getNsTag
static PyObject *(*__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs)(PyObject *) = 0;
#define getNsTagWithEmptyNs __pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs
static PyObject *(*__pyx_api_f_4lxml_5etree_namespacedName)(xmlNode *) = 0;
#define namespacedName __pyx_api_f_4lxml_5etree_namespacedName
static PyObject *(*__pyx_api_f_4lxml_5etree_namespacedNameFromNsName)(const xmlChar *, const xmlChar *) = 0;
#define namespacedNameFromNsName __pyx_api_f_4lxml_5etree_namespacedNameFromNsName
static void (*__pyx_api_f_4lxml_5etree_iteratorStoreNext)(struct LxmlElementIterator *, struct LxmlElement *) = 0;
#define iteratorStoreNext __pyx_api_f_4lxml_5etree_iteratorStoreNext
static void (*__pyx_api_f_4lxml_5etree_initTagMatch)(struct LxmlElementTagMatcher *, PyObject *) = 0;
#define initTagMatch __pyx_api_f_4lxml_5etree_initTagMatch
static xmlNs *(*__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix)(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *) = 0;
#define findOrBuildNodeNsPrefix __pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix
#if !defined(__Pyx_PyIdentifier_FromString)
#if PY_MAJOR_VERSION < 3
#define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s)
#else
#define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s)
#endif
#endif
#ifndef __PYX_HAVE_RT_ImportModule
#define __PYX_HAVE_RT_ImportModule
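/* Import the module named `name` and return a new reference to it, or
   NULL with an exception set if the import fails. */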
static PyObject *__Pyx_ImportModule(const char *name) {
PyObject *py_name = 0;
PyObject *py_module = 0;
py_name = __Pyx_PyIdentifier_FromString(name);
if (!py_name)
goto bad;
py_module = PyImport_Import(py_name);
Py_DECREF(py_name);
return py_module;
bad:
Py_XDECREF(py_name);
return 0;
}
#endif
#ifndef __PYX_HAVE_RT_ImportFunction
#define __PYX_HAVE_RT_ImportFunction
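/* Look up `funcname` in the module's __pyx_capi__ dict, check that the
   recorded C signature matches `sig`, and store the wrapped function
   pointer in *f.  Uses PyCapsule on Python >= 2.7 and PyCObject on older
   versions.  Returns 0 on success, -1 with an exception set on failure. */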
static int __Pyx_ImportFunction(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
PyObject *d = 0;
PyObject *cobj = 0;
union {
void (*fp)(void);
void *p;
} tmp;
d = PyObject_GetAttrString(module, (char *)"__pyx_capi__");
if (!d)
goto bad;
cobj = PyDict_GetItemString(d, funcname);
if (!cobj) {
PyErr_Format(PyExc_ImportError,
"%.200s does not export expected C function %.200s",
PyModule_GetName(module), funcname);
goto bad;
}
#if PY_VERSION_HEX >= 0x02070000
if (!PyCapsule_IsValid(cobj, sig)) {
PyErr_Format(PyExc_TypeError,
"C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
PyModule_GetName(module), funcname, sig, PyCapsule_GetName(cobj));
goto bad;
}
tmp.p = PyCapsule_GetPointer(cobj, sig);
#else
{const char *desc, *s1, *s2;
desc = (const char *)PyCObject_GetDesc(cobj);
if (!desc)
goto bad;
s1 = desc; s2 = sig;
while (*s1 != '\0' && *s1 == *s2) { s1++; s2++; }
if (*s1 != *s2) {
PyErr_Format(PyExc_TypeError,
"C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
PyModule_GetName(module), funcname, sig, desc);
goto bad;
}
tmp.p = PyCObject_AsVoidPtr(cobj);}
#endif
*f = tmp.fp;
if (!(*f))
goto bad;
Py_DECREF(d);
return 0;
bad:
Py_XDECREF(d);
return -1;
}
#endif
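/* Resolve all function pointers declared above by importing lxml.etree and
   binding each name exported through its __pyx_capi__ table.  Returns 0 on
   success, -1 on failure. */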
static int import_lxml__etree(void) {
PyObject *module = 0;
module = __Pyx_ImportModule("lxml.etree");
if (!module) goto bad;
if (__Pyx_ImportFunction(module, "deepcopyNodeToDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "elementTreeFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementTreeFactory, "struct LxmlElementTree *(struct LxmlElement *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "newElementTree", (void (**)(void))&__pyx_api_f_4lxml_5etree_newElementTree, "struct LxmlElementTree *(struct LxmlElement *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "elementFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementFactory, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "makeElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeElement, "struct LxmlElement *(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "makeSubElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeSubElement, "struct LxmlElement *(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "setElementClassLookupFunction", (void (**)(void))&__pyx_api_f_4lxml_5etree_setElementClassLookupFunction, "void (_element_class_lookup_function, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "lookupDefaultElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupDefaultElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "lookupNamespaceElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "callLookupFallback", (void (**)(void))&__pyx_api_f_4lxml_5etree_callLookupFallback, "PyObject *(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "tagMatches", (void (**)(void))&__pyx_api_f_4lxml_5etree_tagMatches, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "documentOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_documentOrRaise, "struct LxmlDocument *(PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "rootNodeOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_rootNodeOrRaise, "struct LxmlElement *(PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "hasText", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasText, "int (xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "hasTail", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasTail, "int (xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "textOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_textOf, "PyObject *(xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "tailOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_tailOf, "PyObject *(xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "setNodeText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setNodeText, "int (xmlNode *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "setTailText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setTailText, "int (xmlNode *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "attributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValue, "PyObject *(xmlNode *, xmlAttr *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "attributeValueFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValueFromNsName, "PyObject *(xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "getAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_getAttributeValue, "PyObject *(struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "iterattributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_iterattributes, "PyObject *(struct LxmlElement *, int)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "collectAttributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_collectAttributes, "PyObject *(xmlNode *, int)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "setAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_setAttributeValue, "int (struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "delAttribute", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttribute, "int (struct LxmlElement *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "delAttributeFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttributeFromNsName, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "hasChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasChild, "int (xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "findChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChild, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "findChildForwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildForwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "findChildBackwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildBackwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "nextElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_nextElement, "xmlNode *(xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "previousElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_previousElement, "xmlNode *(xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "appendChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChild, "void (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "appendChildToElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChildToElement, "int (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "pyunicode", (void (**)(void))&__pyx_api_f_4lxml_5etree_pyunicode, "PyObject *(const xmlChar *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "utf8", (void (**)(void))&__pyx_api_f_4lxml_5etree_utf8, "PyObject *(PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "getNsTag", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTag, "PyObject *(PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "getNsTagWithEmptyNs", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs, "PyObject *(PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "namespacedName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedName, "PyObject *(xmlNode *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "namespacedNameFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedNameFromNsName, "PyObject *(const xmlChar *, const xmlChar *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "iteratorStoreNext", (void (**)(void))&__pyx_api_f_4lxml_5etree_iteratorStoreNext, "void (struct LxmlElementIterator *, struct LxmlElement *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "initTagMatch", (void (**)(void))&__pyx_api_f_4lxml_5etree_initTagMatch, "void (struct LxmlElementTagMatcher *, PyObject *)") < 0) goto bad;
if (__Pyx_ImportFunction(module, "findOrBuildNodeNsPrefix", (void (**)(void))&__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix, "xmlNs *(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
Py_DECREF(module); module = 0;
return 0;
bad:
Py_XDECREF(module);
return -1;
}
#endif /* !__PYX_HAVE_API__lxml__etree */

View file

@ -0,0 +1,3 @@
# dummy module for backwards compatibility
from lxml.etree import PythonElementClassLookup

View file

@ -0,0 +1,248 @@
"""
SAX-based adapter to copy trees from/to the Python standard library.
Use the `ElementTreeContentHandler` class to build an ElementTree from
SAX events.
Use the `ElementTreeProducer` class or the `saxify()` function to fire
the SAX events of an ElementTree against a SAX ContentHandler.
See http://codespeak.net/lxml/sax.html
"""
from xml.sax.handler import ContentHandler
from lxml import etree
from lxml.etree import ElementTree, SubElement
from lxml.etree import Comment, ProcessingInstruction
class SaxError(etree.LxmlError):
"""General SAX error.
"""
pass
def _getNsTag(tag):
if tag[0] == '{':
return tuple(tag[1:].split('}', 1))
else:
return (None, tag)
class ElementTreeContentHandler(ContentHandler):
"""Build an lxml ElementTree from SAX events.
"""
def __init__(self, makeelement=None):
ContentHandler.__init__(self)
self._root = None
self._root_siblings = []
self._element_stack = []
self._default_ns = None
self._ns_mapping = { None : [None] }
self._new_mappings = {}
if makeelement is None:
makeelement = etree.Element
self._makeelement = makeelement
def _get_etree(self):
"Contains the generated ElementTree after parsing is finished."
return ElementTree(self._root)
etree = property(_get_etree, doc=_get_etree.__doc__)
def setDocumentLocator(self, locator):
pass
def startDocument(self):
pass
def endDocument(self):
pass
def startPrefixMapping(self, prefix, uri):
self._new_mappings[prefix] = uri
try:
self._ns_mapping[prefix].append(uri)
except KeyError:
self._ns_mapping[prefix] = [uri]
if prefix is None:
self._default_ns = uri
def endPrefixMapping(self, prefix):
ns_uri_list = self._ns_mapping[prefix]
ns_uri_list.pop()
if prefix is None:
self._default_ns = ns_uri_list[-1]
def _buildTag(self, ns_name_tuple):
ns_uri, local_name = ns_name_tuple
if ns_uri:
el_tag = "{%s}%s" % ns_name_tuple
elif self._default_ns:
el_tag = "{%s}%s" % (self._default_ns, local_name)
else:
el_tag = local_name
return el_tag
def startElementNS(self, ns_name, qname, attributes=None):
el_name = self._buildTag(ns_name)
if attributes:
attrs = {}
try:
iter_attributes = attributes.iteritems()
except AttributeError:
iter_attributes = attributes.items()
for name_tuple, value in iter_attributes:
if name_tuple[0]:
attr_name = "{%s}%s" % name_tuple
else:
attr_name = name_tuple[1]
attrs[attr_name] = value
else:
attrs = None
element_stack = self._element_stack
if self._root is None:
element = self._root = \
self._makeelement(el_name, attrs, self._new_mappings)
if self._root_siblings and hasattr(element, 'addprevious'):
for sibling in self._root_siblings:
element.addprevious(sibling)
del self._root_siblings[:]
else:
element = SubElement(element_stack[-1], el_name,
attrs, self._new_mappings)
element_stack.append(element)
self._new_mappings.clear()
def processingInstruction(self, target, data):
pi = ProcessingInstruction(target, data)
if self._root is None:
self._root_siblings.append(pi)
else:
self._element_stack[-1].append(pi)
def endElementNS(self, ns_name, qname):
element = self._element_stack.pop()
el_tag = self._buildTag(ns_name)
if el_tag != element.tag:
raise SaxError("Unexpected element closed: " + el_tag)
def startElement(self, name, attributes=None):
if attributes:
attributes = dict(
[((None, k), v) for k, v in attributes.items()]
)
self.startElementNS((None, name), name, attributes)
def endElement(self, name):
self.endElementNS((None, name), name)
def characters(self, data):
last_element = self._element_stack[-1]
try:
# if there already is a child element, we must append to its tail
last_element = last_element[-1]
last_element.tail = (last_element.tail or '') + data
except IndexError:
# otherwise: append to the text
last_element.text = (last_element.text or '') + data
ignorableWhitespace = characters
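# Illustrative sketch: the handler above can also be driven by the standard
# library's SAX machinery ('input.xml' is a placeholder file name):
#
#     import xml.sax
#     handler = ElementTreeContentHandler()
#     xml.sax.parse('input.xml', handler)
#     tree = handler.etree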
class ElementTreeProducer(object):
"""Produces SAX events for an element and children.
"""
def __init__(self, element_or_tree, content_handler):
try:
element = element_or_tree.getroot()
except AttributeError:
element = element_or_tree
self._element = element
self._content_handler = content_handler
from xml.sax.xmlreader import AttributesNSImpl as attr_class
self._attr_class = attr_class
self._empty_attributes = attr_class({}, {})
def saxify(self):
self._content_handler.startDocument()
element = self._element
if hasattr(element, 'getprevious'):
siblings = []
sibling = element.getprevious()
while getattr(sibling, 'tag', None) is ProcessingInstruction:
siblings.append(sibling)
sibling = sibling.getprevious()
for sibling in siblings[::-1]:
self._recursive_saxify(sibling, {})
self._recursive_saxify(element, {})
if hasattr(element, 'getnext'):
sibling = element.getnext()
while getattr(sibling, 'tag', None) is ProcessingInstruction:
self._recursive_saxify(sibling, {})
sibling = sibling.getnext()
self._content_handler.endDocument()
def _recursive_saxify(self, element, prefixes):
content_handler = self._content_handler
tag = element.tag
if tag is Comment or tag is ProcessingInstruction:
if tag is ProcessingInstruction:
content_handler.processingInstruction(
element.target, element.text)
if element.tail:
content_handler.characters(element.tail)
return
new_prefixes = []
build_qname = self._build_qname
attribs = element.items()
if attribs:
attr_values = {}
attr_qnames = {}
for attr_ns_name, value in attribs:
attr_ns_tuple = _getNsTag(attr_ns_name)
attr_values[attr_ns_tuple] = value
attr_qnames[attr_ns_tuple] = build_qname(
attr_ns_tuple[0], attr_ns_tuple[1], prefixes, new_prefixes)
sax_attributes = self._attr_class(attr_values, attr_qnames)
else:
sax_attributes = self._empty_attributes
ns_uri, local_name = _getNsTag(tag)
qname = build_qname(ns_uri, local_name, prefixes, new_prefixes)
for prefix, uri in new_prefixes:
content_handler.startPrefixMapping(prefix, uri)
content_handler.startElementNS((ns_uri, local_name),
qname, sax_attributes)
if element.text:
content_handler.characters(element.text)
for child in element:
self._recursive_saxify(child, prefixes)
content_handler.endElementNS((ns_uri, local_name), qname)
for prefix, uri in new_prefixes:
content_handler.endPrefixMapping(prefix)
if element.tail:
content_handler.characters(element.tail)
def _build_qname(self, ns_uri, local_name, prefixes, new_prefixes):
if ns_uri is None:
return local_name
try:
prefix = prefixes[ns_uri]
except KeyError:
prefix = prefixes[ns_uri] = 'ns%02d' % len(prefixes)
new_prefixes.append( (prefix, ns_uri) )
return prefix + ':' + local_name
def saxify(element_or_tree, content_handler):
"""One-shot helper to generate SAX events from an XML tree and fire
them against a SAX ContentHandler.
"""
return ElementTreeProducer(element_or_tree, content_handler).saxify()

View file

@ -0,0 +1,13 @@
"""Doctest module for XML comparison.
Usage::
>>> import lxml.usedoctest
>>> # now do your XML doctests ...
See `lxml.doctestcompare`
"""
from lxml import doctestcompare
doctestcompare.temp_install(del_module=__name__)
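# Illustrative sketch of the effect (assumed behaviour, see lxml.doctestcompare):
# once this module is imported, doctest output that looks like XML/HTML is
# compared structurally rather than character by character, and '...' inside
# the expected markup acts as a wildcard, e.g.
#
#     >>> import lxml.usedoctest
#     >>> print('<root><child>some text</child></root>')
#     <root><child>...</child></root>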