aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/lxml/etree.pyx
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/lxml/etree.pyx')
-rw-r--r--.venv/lib/python3.12/site-packages/lxml/etree.pyx3732
1 files changed, 3732 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/lxml/etree.pyx b/.venv/lib/python3.12/site-packages/lxml/etree.pyx
new file mode 100644
index 00000000..f7da01c1
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/lxml/etree.pyx
@@ -0,0 +1,3732 @@
+# cython: binding=True
+# cython: auto_pickle=False
+# cython: language_level=3
+
+"""
+The ``lxml.etree`` module implements the extended ElementTree API for XML.
+"""
+
+__docformat__ = "restructuredtext en"
+
+__all__ = [
+ 'AttributeBasedElementClassLookup', 'C14NError', 'C14NWriterTarget', 'CDATA',
+ 'Comment', 'CommentBase', 'CustomElementClassLookup', 'DEBUG',
+ 'DTD', 'DTDError', 'DTDParseError', 'DTDValidateError',
+ 'DocumentInvalid', 'ETCompatXMLParser', 'ETXPath', 'Element',
+ 'ElementBase', 'ElementClassLookup', 'ElementDefaultClassLookup',
+ 'ElementNamespaceClassLookup', 'ElementTree', 'Entity', 'EntityBase',
+ 'Error', 'ErrorDomains', 'ErrorLevels', 'ErrorTypes', 'Extension',
+ 'FallbackElementClassLookup', 'FunctionNamespace', 'HTML', 'HTMLParser',
+ 'ICONV_COMPILED_VERSION',
+ 'LIBXML_COMPILED_VERSION', 'LIBXML_VERSION',
+ 'LIBXSLT_COMPILED_VERSION', 'LIBXSLT_VERSION',
+ 'LXML_VERSION',
+ 'LxmlError', 'LxmlRegistryError', 'LxmlSyntaxError',
+ 'NamespaceRegistryError', 'PI', 'PIBase', 'ParseError',
+ 'ParserBasedElementClassLookup', 'ParserError', 'ProcessingInstruction',
+ 'PyErrorLog', 'PythonElementClassLookup', 'QName', 'RelaxNG',
+ 'RelaxNGError', 'RelaxNGErrorTypes', 'RelaxNGParseError',
+ 'RelaxNGValidateError', 'Resolver', 'Schematron', 'SchematronError',
+ 'SchematronParseError', 'SchematronValidateError', 'SerialisationError',
+ 'SubElement', 'TreeBuilder', 'XInclude', 'XIncludeError', 'XML',
+ 'XMLDTDID', 'XMLID', 'XMLParser', 'XMLSchema', 'XMLSchemaError',
+ 'XMLSchemaParseError', 'XMLSchemaValidateError', 'XMLSyntaxError',
+ 'XMLTreeBuilder', 'XPath', 'XPathDocumentEvaluator', 'XPathError',
+ 'XPathEvalError', 'XPathEvaluator', 'XPathFunctionError', 'XPathResultError',
+ 'XPathSyntaxError', 'XSLT', 'XSLTAccessControl', 'XSLTApplyError',
+ 'XSLTError', 'XSLTExtension', 'XSLTExtensionError', 'XSLTParseError',
+ 'XSLTSaveError', 'canonicalize',
+ 'cleanup_namespaces', 'clear_error_log', 'dump',
+ 'fromstring', 'fromstringlist', 'get_default_parser', 'iselement',
+ 'iterparse', 'iterwalk', 'parse', 'parseid', 'register_namespace',
+ 'set_default_parser', 'set_element_class_lookup', 'strip_attributes',
+ 'strip_elements', 'strip_tags', 'tostring', 'tostringlist', 'tounicode',
+ 'use_global_python_log'
+ ]
+
+cimport cython
+
+from lxml cimport python
+from lxml.includes cimport tree, config
+from lxml.includes.tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs
+from lxml.includes.tree cimport const_xmlChar, xmlChar, _xcstr
+from lxml.python cimport _cstr, _isString
+from lxml.includes cimport xpath
+from lxml.includes cimport c14n
+
+# Cython's standard declarations
+cimport cpython.mem
+cimport cpython.ref
+from libc cimport limits, stdio, stdlib
+from libc cimport string as cstring_h # not to be confused with stdlib 'string'
+from libc.string cimport const_char
+
+cdef object os_path_abspath
+from os.path import abspath as os_path_abspath
+
+cdef object BytesIO, StringIO
+from io import BytesIO, StringIO
+
+cdef object OrderedDict
+from collections import OrderedDict
+
+cdef object _elementpath
+from lxml import _elementpath
+
+cdef object sys
+import sys
+
+cdef object re
+import re
+
+cdef object partial
+from functools import partial
+
+cdef object islice
+from itertools import islice
+
+cdef object ITER_EMPTY = iter(())
+
+cdef object MutableMapping
+from collections.abc import MutableMapping
+
+class _ImmutableMapping(MutableMapping):
+    """Mapping that is always empty and rejects every keyed operation.
+
+    Lookup, assignment and deletion all raise KeyError for the given key.
+    """
+    def __len__(self):
+        return 0
+
+    def __contains__(self, key):
+        return False
+
+    def __iter__(self):
+        # module-level iterator created from an empty tuple
+        return ITER_EMPTY
+
+    # Python 2 style iteration names, all bound to __iter__
+    iterkeys = itervalues = iteritems = __iter__
+
+    def __getitem__(self, key):
+        raise KeyError, key
+
+    def __setitem__(self, key, value):
+        raise KeyError, key
+
+    def __delitem__(self, key):
+        raise KeyError, key
+
+# Create the one shared instance, then drop the class name from the
+# module namespace.
+cdef object IMMUTABLE_EMPTY_MAPPING = _ImmutableMapping()
+del _ImmutableMapping
+
+
+# the rules
+# ---------
+# any libxml C argument/variable is prefixed with c_
+# any non-public function/class is prefixed with an underscore
+# instance creation is always through factories
+
+# what to do with libxml2/libxslt error messages?
+# 0 : drop
+# 1 : use log
+DEF __DEBUG = 1
+
+# maximum number of lines in the libxml2/xslt log if __DEBUG == 1
+DEF __MAX_LOG_SIZE = 100
+
+# make the compiled-in debug state publicly available
+DEBUG = __DEBUG
+
+# A struct to store a cached qualified tag name+href pair.
+# While we can borrow the c_name from the document dict,
+# PyPy requires us to store a Python reference for the
+# namespace in order to keep the byte buffer alive.
+cdef struct qname:
+ const_xmlChar* c_name
+ python.PyObject* href
+
+# initialize parser (and threading)
+xmlparser.xmlInitParser()
+
+# global per-thread setup
+tree.xmlThrDefIndentTreeOutput(1)
+tree.xmlThrDefLineNumbersDefaultValue(1)
+
+_initThreadLogging()
+
+# filename encoding
+cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8")
+cdef char* _C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
+
+# set up some default namespace prefixes
+cdef dict _DEFAULT_NAMESPACE_PREFIXES = {
+ b"http://www.w3.org/XML/1998/namespace": b'xml',
+ b"http://www.w3.org/1999/xhtml": b"html",
+ b"http://www.w3.org/1999/XSL/Transform": b"xsl",
+ b"http://www.w3.org/1999/02/22-rdf-syntax-ns#": b"rdf",
+ b"http://schemas.xmlsoap.org/wsdl/": b"wsdl",
+ # xml schema
+ b"http://www.w3.org/2001/XMLSchema": b"xs",
+ b"http://www.w3.org/2001/XMLSchema-instance": b"xsi",
+ # dublin core
+ b"http://purl.org/dc/elements/1.1/": b"dc",
+ # objectify
+ b"http://codespeak.net/lxml/objectify/pytype" : b"py",
+}
+
+# To avoid runtime encoding overhead, we keep a Unicode copy
+# of the uri-prefix mapping as (str, str) items view.
+cdef object _DEFAULT_NAMESPACE_PREFIXES_ITEMS = []
+
+cdef _update_default_namespace_prefixes_items():
+    """Rebuild the decoded (str -> str) items view of the prefix registry.
+
+    Decodes every URI/prefix pair of _DEFAULT_NAMESPACE_PREFIXES from UTF-8
+    and rebinds the module global _DEFAULT_NAMESPACE_PREFIXES_ITEMS to the
+    items view of the resulting dict.
+    """
+    cdef bytes ns, prefix
+    cdef dict decoded = {}
+    global _DEFAULT_NAMESPACE_PREFIXES_ITEMS
+    for ns, prefix in _DEFAULT_NAMESPACE_PREFIXES.items():
+        decoded[ns.decode('utf-8')] = prefix.decode('utf-8')
+    _DEFAULT_NAMESPACE_PREFIXES_ITEMS = decoded.items()
+
+_update_default_namespace_prefixes_items()
+
+cdef object _check_internal_prefix = re.compile(br"ns\d+$").match
+
+def register_namespace(prefix, uri):
+    """Registers a namespace prefix that newly created Elements in that
+    namespace will use.  The registry is global, and any existing
+    mapping for either the given prefix or the namespace URI will be
+    removed.
+
+    Raises ValueError if the prefix has the reserved form ``ns<digits>``
+    or if the call would map the 'xml' prefix or the XML namespace to
+    anything but each other.
+    """
+    xml_ns_uri = b"http://www.w3.org/XML/1998/namespace"
+    prefix_utf = _utf8(prefix)
+    uri_utf = _utf8(uri)
+    if _check_internal_prefix(prefix_utf):
+        raise ValueError("Prefix format reserved for internal use")
+    _tagValidOrRaise(prefix_utf)
+    _uriValidOrRaise(uri_utf)
+    # exactly one side referring to 'xml' means the pair would be split up
+    if (uri_utf == xml_ns_uri) != (prefix_utf == b'xml'):
+        raise ValueError("Cannot change the 'xml' prefix of the XML namespace")
+    # collect first, delete afterwards: the dict must not change while
+    # it is being iterated
+    stale_uris = [
+        known_uri
+        for known_uri, known_prefix in _DEFAULT_NAMESPACE_PREFIXES.items()
+        if known_uri == uri_utf or known_prefix == prefix_utf
+    ]
+    for known_uri in stale_uris:
+        del _DEFAULT_NAMESPACE_PREFIXES[known_uri]
+    _DEFAULT_NAMESPACE_PREFIXES[uri_utf] = prefix_utf
+    _update_default_namespace_prefixes_items()
+
+
+# Error superclass for ElementTree compatibility.
+# Adds nothing to Exception itself; LxmlError below derives from it.
+cdef class Error(Exception):
+ pass
+
+# module level superclass for all exceptions
+cdef class LxmlError(Error):
+    """Main exception base class for lxml.  All other exceptions inherit from
+    this one.
+
+    The instance attribute ``error_log`` holds a copy of the ``error_log``
+    argument, or the result of ``__copyGlobalErrorLog()`` if none was given.
+    """
+    def __init__(self, message, error_log=None):
+        # super(_Error, ...) starts the __init__ lookup behind Error in the
+        # MRO of the concrete exception class
+        super(_Error, self).__init__(message)
+        self.error_log = (
+            __copyGlobalErrorLog() if error_log is None else error_log.copy())
+
+cdef object _Error = Error
+
+
+# superclass for all syntax errors
+class LxmlSyntaxError(LxmlError, SyntaxError):
+ """Base class for all syntax errors.
+
+ Derives from both LxmlError and the built-in SyntaxError, so handlers
+ for either class catch it.
+ """
+
+cdef class C14NError(LxmlError):
+ """Error during C14N (canonical XML) serialisation.
+ """
+
+# version information
+cdef tuple __unpackDottedVersion(version):
+    """Turn a dotted version byte string into a tuple of four fields.
+
+    '-' is treated like '.'.  Missing fields are padded with 0, surplus
+    fields are dropped.  Numeric fields become ints.  Fields starting with
+    'dev', 'alpha' or 'beta' become -300, -200 or -100 plus the number that
+    follows the label (if any).  Any other field is kept as text.
+    """
+    components = []
+    fields = version.decode("ascii").replace('-', '.').split('.')
+    for field in (fields + [0]*4)[:4]:
+        try:
+            field = int(field)
+        except ValueError:
+            for label, base in (('dev', -300), ('alpha', -200), ('beta', -100)):
+                if field.startswith(label):
+                    suffix = field[len(label):]
+                    field = base + int(suffix) if suffix else base
+                    break
+        components.append(field)
+    return tuple(components)
+
+cdef tuple __unpackIntVersion(int c_version, int base=100):
+    """Split a packed integer version into a (major, minor, patch) tuple.
+
+    Each component is one digit of ``c_version`` in radix ``base``, e.g.
+    20912 with the default base 100 gives (2, 9, 12).
+    """
+    cdef int major = (c_version // (base * base)) % base
+    cdef int minor = (c_version // base) % base
+    cdef int patch = c_version % base
+    return (major, minor, patch)
+
+# Version of the libxml2 library in use, taken from the leading digits
+# of tree.xmlParserVersion.
+cdef int _LIBXML_VERSION_INT
+try:
+ _LIBXML_VERSION_INT = int(
+ re.match('[0-9]+', (<unsigned char*>tree.xmlParserVersion).decode("ascii")).group(0))
+except Exception:
+ # best effort: report the unparsable version string and fall back to 0
+ print("Unknown libxml2 version: " + (<unsigned char*>tree.xmlParserVersion).decode("latin1"))
+ _LIBXML_VERSION_INT = 0
+
+# public version tuples
+LIBXML_VERSION = __unpackIntVersion(_LIBXML_VERSION_INT)
+LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION)
+LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING)
+
+__version__ = tree.LXML_VERSION_STRING.decode("ascii")
+
+cdef extern from *:
+ """
+ #ifdef ZLIB_VERNUM
+ #define __lxml_zlib_version (ZLIB_VERNUM >> 4)
+ #else
+ #define __lxml_zlib_version 0
+ #endif
+ #ifdef _LIBICONV_VERSION
+ #define __lxml_iconv_version (_LIBICONV_VERSION << 8)
+ #else
+ #define __lxml_iconv_version 0
+ #endif
+ """
+ # zlib isn't included automatically by libxml2's headers
+ #long ZLIB_HEX_VERSION "__lxml_zlib_version"
+ long LIBICONV_HEX_VERSION "__lxml_iconv_version"
+
+#ZLIB_COMPILED_VERSION = __unpackIntVersion(ZLIB_HEX_VERSION, base=0x10)
+ICONV_COMPILED_VERSION = __unpackIntVersion(LIBICONV_HEX_VERSION, base=0x100)[:2]
+
+
+# class for temporary storage of Python references,
+# used e.g. for XPath results
+@cython.final
+@cython.internal
+cdef class _TempStore:
+    """Holds references to Python objects until clear() is called."""
+    cdef list _storage
+
+    def __init__(self):
+        self._storage = list()
+
+    cdef int add(self, obj) except -1:
+        """Append ``obj`` to the store.  Returns 0."""
+        self._storage.append(obj)
+        return 0
+
+    cdef int clear(self) except -1:
+        """Empty the store in place (the list object is kept).  Returns 0."""
+        del self._storage[:]
+        return 0
+
+
+# class for temporarily storing exceptions raised in extensions
+@cython.internal
+cdef class _ExceptionContext:
+    """Stores one raised exception so that it can be re-raised later.
+
+    ``_exc_info`` is either None or a 3-tuple: the result of
+    ``sys.exc_info()`` or ``(exception, None, None)``.
+    """
+    cdef object _exc_info
+
+    cdef int clear(self) except -1:
+        """Forget any stored exception.  Returns 0."""
+        self._exc_info = None
+        return 0
+
+    cdef void _store_raised(self) noexcept:
+        """Remember the exception that is currently being handled.
+
+        Never raises: an error while fetching the exception info is stored
+        instead, and the ``return`` in ``finally`` discards anything else.
+        """
+        try:
+            self._exc_info = sys.exc_info()
+        except BaseException as e:
+            self._store_exception(e)
+        finally:
+            return  # and swallow any further exceptions
+
+    cdef int _store_exception(self, exception) except -1:
+        """Remember ``exception`` without value or traceback.  Returns 0."""
+        self._exc_info = (exception, None, None)
+        return 0
+
+    cdef bint _has_raised(self) except -1:
+        """Tell whether an exception is currently stored."""
+        return self._exc_info is not None
+
+    cdef int _raise_if_stored(self) except -1:
+        """Re-raise the stored exception, if any, and forget it.
+
+        Returns 0 if nothing was stored.
+        """
+        if self._exc_info is None:
+            return 0
+        exc_type, exc_value, exc_tb = self._exc_info
+        # reset before raising so that the exception is raised only once
+        self._exc_info = None
+        if exc_value is None and exc_tb is None:
+            raise exc_type
+        raise exc_type, exc_value, exc_tb
+
+
+# type of a function that steps from node to node
+ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*)
+
+
+################################################################################
+# Include submodules
+
+include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.)
+include "apihelpers.pxi" # Private helper functions
+include "xmlerror.pxi" # Error and log handling
+
+
+################################################################################
+# Public Python API
+
+@cython.final
+@cython.freelist(8)
+cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]:
+ """Internal base class to reference a libxml document.
+
+ When instances of this class are garbage collected, the libxml
+ document is cleaned up.
+ """
+ # running number used by buildNewPrefix() for generated "ns<N>" prefixes
+ cdef int _ns_counter
+ # suffix that buildNewPrefix() appends once the counter has overflowed
+ cdef bytes _prefix_tail
+ # the wrapped libxml2 document, freed in __dealloc__
+ cdef xmlDoc* _c_doc
+ # parser associated with this document (assigned in _documentFactory)
+ cdef _BaseParser _parser
+
+ def __dealloc__(self):
+ # if there are no more references to the document, it is safe
+ # to clean the whole thing up, as all nodes have a reference to
+ # the document
+ tree.xmlFreeDoc(self._c_doc)
+
+ @cython.final
+ cdef getroot(self):
+ # return an element proxy for the document root
+ # (None if the document has no root element)
+ cdef xmlNode* c_node
+ c_node = tree.xmlDocGetRootElement(self._c_doc)
+ if c_node is NULL:
+ return None
+ return _elementFactory(self, c_node)
+
+ @cython.final
+ cdef bint hasdoctype(self) noexcept:
+ # DOCTYPE gets parsed into internal subset (xmlDTD*)
+ return self._c_doc is not NULL and self._c_doc.intSubset is not NULL
+
+ @cython.final
+ cdef getdoctype(self):
+ # get doctype info: root tag, public/system ID (or None if not known)
+ # returns the tuple (root_name, public_id, sys_url)
+ cdef tree.xmlDtd* c_dtd
+ cdef xmlNode* c_root_node
+ public_id = None
+ sys_url = None
+ # the internal subset is consulted first ...
+ c_dtd = self._c_doc.intSubset
+ if c_dtd is not NULL:
+ if c_dtd.ExternalID is not NULL:
+ public_id = funicode(c_dtd.ExternalID)
+ if c_dtd.SystemID is not NULL:
+ sys_url = funicode(c_dtd.SystemID)
+ # ... the external subset only fills in what is still missing
+ c_dtd = self._c_doc.extSubset
+ if c_dtd is not NULL:
+ if not public_id and c_dtd.ExternalID is not NULL:
+ public_id = funicode(c_dtd.ExternalID)
+ if not sys_url and c_dtd.SystemID is not NULL:
+ sys_url = funicode(c_dtd.SystemID)
+ # the root name is taken from the root element, not from the DTD
+ c_root_node = tree.xmlDocGetRootElement(self._c_doc)
+ if c_root_node is NULL:
+ root_name = None
+ else:
+ root_name = funicode(c_root_node.name)
+ return root_name, public_id, sys_url
+
+ @cython.final
+ cdef getxmlinfo(self):
+ # return XML version and encoding (or None if not known)
+ cdef xmlDoc* c_doc = self._c_doc
+ if c_doc.version is NULL:
+ version = None
+ else:
+ version = funicode(c_doc.version)
+ if c_doc.encoding is NULL:
+ encoding = None
+ else:
+ encoding = funicode(c_doc.encoding)
+ return version, encoding
+
+ @cython.final
+ cdef isstandalone(self):
+ # returns True for standalone="yes" (field value 1),
+ # None for the field value -1, False for any other value
+ if self._c_doc.standalone == -1:
+ return None
+ else:
+ return <bint>(self._c_doc.standalone == 1)
+
+ @cython.final
+ cdef bytes buildNewPrefix(self):
+ # get a new unique prefix ("nsX") for this document
+ cdef bytes ns
+ # low counter values are served from the prebuilt _PREFIX_CACHE
+ if self._ns_counter < len(_PREFIX_CACHE):
+ ns = _PREFIX_CACHE[self._ns_counter]
+ else:
+ ns = python.PyBytes_FromFormat("ns%d", self._ns_counter)
+ if self._prefix_tail is not None:
+ ns += self._prefix_tail
+ self._ns_counter += 1
+ if self._ns_counter < 0:
+ # overflow!
+ # restart at 0 and grow the tail by one "A" so that the
+ # prefixes generated from now on differ from the earlier ones
+ self._ns_counter = 0
+ if self._prefix_tail is None:
+ self._prefix_tail = b"A"
+ else:
+ self._prefix_tail += b"A"
+ return ns
+
+ @cython.final
+ cdef xmlNs* _findOrBuildNodeNs(self, xmlNode* c_node,
+ const_xmlChar* c_href, const_xmlChar* c_prefix,
+ bint is_attribute) except NULL:
+ """Get or create namespace structure for a node. Reuses the prefix if
+ possible.
+
+ If ``c_prefix`` is NULL, a prefix registered in
+ _DEFAULT_NAMESPACE_PREFIXES is tried first, then a generated one.
+ Raises MemoryError if the namespace cannot be allocated.
+ """
+ cdef xmlNs* c_ns
+ cdef xmlNs* c_doc_ns
+ cdef python.PyObject* dict_result
+ # the outer test repeats the assert condition, so the assert and its
+ # message formatting are only reached for non-element nodes
+ if c_node.type != tree.XML_ELEMENT_NODE:
+ assert c_node.type == tree.XML_ELEMENT_NODE, \
+ "invalid node type %d, expected %d" % (
+ c_node.type, tree.XML_ELEMENT_NODE)
+ # look for existing ns declaration
+ c_ns = _searchNsByHref(c_node, c_href, is_attribute)
+ if c_ns is not NULL:
+ if is_attribute and c_ns.prefix is NULL:
+ # do not put namespaced attributes into the default
+ # namespace as this would break serialisation
+ pass
+ else:
+ return c_ns
+
+ # none found => determine a suitable new prefix
+ if c_prefix is NULL:
+ dict_result = python.PyDict_GetItem(
+ _DEFAULT_NAMESPACE_PREFIXES, <unsigned char*>c_href)
+ if dict_result is not NULL:
+ prefix = <object>dict_result
+ else:
+ prefix = self.buildNewPrefix()
+ # 'prefix' stays referenced while c_prefix points into it
+ c_prefix = _xcstr(prefix)
+
+ # make sure the prefix is not in use already
+ while tree.xmlSearchNs(self._c_doc, c_node, c_prefix) is not NULL:
+ prefix = self.buildNewPrefix()
+ c_prefix = _xcstr(prefix)
+
+ # declare the namespace and return it
+ c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
+ if c_ns is NULL:
+ raise MemoryError()
+ return c_ns
+
+ @cython.final
+ cdef int _setNodeNs(self, xmlNode* c_node, const_xmlChar* c_href) except -1:
+ "Lookup namespace structure and set it for the node."
+ # no prefix is requested (NULL); the node is handled as a
+ # non-attribute (is_attribute=0)
+ c_ns = self._findOrBuildNodeNs(c_node, c_href, NULL, 0)
+ tree.xmlSetNs(c_node, c_ns)
+
+cdef tuple __initPrefixCache():
+    """Build the tuple of the 30 byte-string prefixes b"ns0" .. b"ns29"."""
+    cdef int i
+    prefixes = []
+    for i in range(30):
+        prefixes.append(python.PyBytes_FromFormat("ns%d", i))
+    return tuple(prefixes)
+
+# prebuilt prefixes, indexed by the namespace counter of a document
+cdef tuple _PREFIX_CACHE = __initPrefixCache()
+
+cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser):
+    """Create a _Document wrapper around ``c_doc``.
+
+    The namespace counter starts at 0 without a prefix tail.  If ``parser``
+    is None, the default parser of the global parser context is used.
+    """
+    cdef _Document doc = _Document.__new__(_Document)
+    # attach the C document before anything else is done
+    doc._c_doc = c_doc
+    doc._ns_counter = 0
+    doc._prefix_tail = None
+    doc._parser = (
+        __GLOBAL_PARSER_CONTEXT.getDefaultParser() if parser is None else parser)
+    return doc
+
+
+cdef object _find_invalid_public_id_characters = re.compile(
+ ur"[^\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]+").search
+
+
+cdef class DocInfo:
+    """Document information provided by parser and DTD.
+
+    Gives access to the DOCTYPE (root name, public ID, system URL), the
+    XML declaration values (version, encoding, standalone flag) and the
+    source URL of the document behind an ElementTree or root Element.
+    """
+    cdef _Document _doc
+    def __cinit__(self, tree):
+        "Create a DocInfo object for an ElementTree object or root Element."
+        self._doc = _documentOrRaise(tree)
+        root_name, public_id, system_url = self._doc.getdoctype()
+        # DOCTYPE IDs without a root element cannot be represented
+        if not root_name and (public_id or system_url):
+            raise ValueError, "Could not find root node"
+
+    @property
+    def root_name(self):
+        """Returns the name of the root node as defined by the DOCTYPE."""
+        root_name, public_id, system_url = self._doc.getdoctype()
+        return root_name
+
+    @cython.final
+    cdef tree.xmlDtd* _get_c_dtd(self):
+        """Return the DTD. Create it if it does not yet exist.
+
+        Returns the existing internal subset, or a new one named after the
+        root element (or without a name if there is no root element).
+        The result of the creation call is passed on unchecked, so callers
+        have to test it for NULL.
+        """
+        cdef xmlDoc* c_doc = self._doc._c_doc
+        cdef xmlNode* c_root_node
+        cdef const_xmlChar* c_name
+
+        if c_doc.intSubset:
+            return c_doc.intSubset
+
+        c_root_node = tree.xmlDocGetRootElement(c_doc)
+        c_name = c_root_node.name if c_root_node else NULL
+        return tree.xmlCreateIntSubset(c_doc, c_name, NULL, NULL)
+
+    def clear(self):
+        """Removes DOCTYPE and internal subset from the document."""
+        cdef xmlDoc* c_doc = self._doc._c_doc
+        cdef tree.xmlNode* c_dtd = <xmlNode*>c_doc.intSubset
+        if c_dtd is NULL:
+            return
+        tree.xmlUnlinkNode(c_dtd)
+        tree.xmlFreeNode(c_dtd)
+
+    property public_id:
+        """Public ID of the DOCTYPE.
+
+        Mutable.  May be set to a valid string or None.  If a DTD does not
+        exist, setting this variable (even to None) will create one.
+
+        Setting raises ValueError for characters that are not allowed in a
+        public ID and MemoryError if an allocation fails.
+        """
+        def __get__(self):
+            root_name, public_id, system_url = self._doc.getdoctype()
+            return public_id
+
+        def __set__(self, value):
+            cdef xmlChar* c_value = NULL
+            if value is not None:
+                match = _find_invalid_public_id_characters(value)
+                if match:
+                    raise ValueError, f'Invalid character(s) {match.group(0)!r} in public_id.'
+                value = _utf8(value)
+                c_value = tree.xmlStrdup(_xcstr(value))
+                if not c_value:
+                    raise MemoryError()
+
+            c_dtd = self._get_c_dtd()
+            if not c_dtd:
+                # do not leak the copy made above
+                tree.xmlFree(c_value)
+                raise MemoryError()
+            if c_dtd.ExternalID:
+                tree.xmlFree(<void*>c_dtd.ExternalID)
+            c_dtd.ExternalID = c_value
+
+    property system_url:
+        """System ID of the DOCTYPE.
+
+        Mutable.  May be set to a valid string or None.  If a DTD does not
+        exist, setting this variable (even to None) will create one.
+
+        Setting raises ValueError if the value contains both quote
+        characters and MemoryError if an allocation fails.
+        """
+        def __get__(self):
+            root_name, public_id, system_url = self._doc.getdoctype()
+            return system_url
+
+        def __set__(self, value):
+            cdef xmlChar* c_value = NULL
+            if value is not None:
+                bvalue = _utf8(value)
+                # sys_url may be any valid unicode string that can be
+                # enclosed in single quotes or quotes.
+                if b"'" in bvalue and b'"' in bvalue:
+                    raise ValueError(
+                        'System URL may not contain both single (\') and double quotes (").')
+                c_value = tree.xmlStrdup(_xcstr(bvalue))
+                if not c_value:
+                    raise MemoryError()
+
+            c_dtd = self._get_c_dtd()
+            if not c_dtd:
+                # do not leak the copy made above
+                tree.xmlFree(c_value)
+                raise MemoryError()
+            if c_dtd.SystemID:
+                tree.xmlFree(<void*>c_dtd.SystemID)
+            c_dtd.SystemID = c_value
+
+    @property
+    def xml_version(self):
+        """Returns the XML version as declared by the document."""
+        xml_version, encoding = self._doc.getxmlinfo()
+        return xml_version
+
+    @property
+    def encoding(self):
+        """Returns the encoding name as declared by the document."""
+        xml_version, encoding = self._doc.getxmlinfo()
+        return encoding
+
+    @property
+    def standalone(self):
+        """Returns the standalone flag as declared by the document.  The possible
+        values are True (``standalone='yes'``), False
+        (``standalone='no'`` or flag not provided in the declaration),
+        and None (unknown or no declaration found).  Note that a
+        normal truth test on this value will always tell if the
+        ``standalone`` flag was set to ``'yes'`` or not.
+        """
+        return self._doc.isstandalone()
+
+    property URL:
+        """The source URL of the document (or None if unknown).
+
+        Setting None removes the URL.  Setting raises MemoryError if the
+        new value cannot be copied; the previous URL is kept in that case.
+        """
+        def __get__(self):
+            if self._doc._c_doc.URL is NULL:
+                return None
+            return _decodeFilename(self._doc._c_doc.URL)
+        def __set__(self, url):
+            cdef xmlChar* c_newurl = NULL
+            url = _encodeFilename(url)
+            if url is not None:
+                # copy first and fail before touching the document, so that
+                # an allocation failure does not silently drop the old URL
+                c_newurl = tree.xmlStrdup(_xcstr(url))
+                if c_newurl is NULL:
+                    raise MemoryError()
+            c_oldurl = self._doc._c_doc.URL
+            self._doc._c_doc.URL = c_newurl
+            if c_oldurl is not NULL:
+                tree.xmlFree(<void*>c_oldurl)
+
+    @property
+    def doctype(self):
+        """Returns a DOCTYPE declaration string for the document.
+
+        The result is the empty string if the document has neither IDs nor
+        an internal subset.
+        """
+        root_name, public_id, system_url = self._doc.getdoctype()
+        if system_url:
+            # If '"' in system_url, we must escape it with single
+            # quotes, otherwise escape with double quotes. If url
+            # contains both a single quote and a double quote, XML
+            # standard is being violated.
+            if '"' in system_url:
+                quoted_system_url = f"'{system_url}'"
+            else:
+                quoted_system_url = f'"{system_url}"'
+        if public_id:
+            if system_url:
+                return f'<!DOCTYPE {root_name} PUBLIC "{public_id}" {quoted_system_url}>'
+            else:
+                return f'<!DOCTYPE {root_name} PUBLIC "{public_id}">'
+        elif system_url:
+            return f'<!DOCTYPE {root_name} SYSTEM {quoted_system_url}>'
+        elif self._doc.hasdoctype():
+            return f'<!DOCTYPE {root_name}>'
+        else:
+            return ''
+
+    @property
+    def internalDTD(self):
+        """Returns a DTD validator based on the internal subset of the document."""
+        return _dtdFactory(self._doc._c_doc.intSubset)
+
+    @property
+    def externalDTD(self):
+        """Returns a DTD validator based on the external subset of the document."""
+        return _dtdFactory(self._doc._c_doc.extSubset)
+
+
+@cython.no_gc_clear
+cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
+ """Element class.
+
+ References a document object and a libxml node.
+
+ By pointing to a Document instance, a reference is kept to
+ _Document as long as there is some pointer to a node in it.
+ """
+ cdef _Document _doc
+ cdef xmlNode* _c_node
+ cdef object _tag
+
+ def _init(self):
+ """_init(self)
+
+ Called after object initialisation. Custom subclasses may override
+ this if they recursively call _init() in the superclasses.
+
+ The implementation in this class does nothing.
+ """
+
+ @cython.linetrace(False)
+ @cython.profile(False)
+ def __dealloc__(self):
+ # debugging aids, disabled:
+ #print("trying to free node:", <int>self._c_node)
+ #displayNode(self._c_node, 0)
+ # a proxy without a C node has nothing to release
+ if self._c_node is not NULL:
+ # NOTE(review): presumably detaches this proxy from the node and
+ # then frees the node if nothing else needs it - confirm in proxy.pxi
+ _unregisterProxy(self)
+ attemptDeallocation(self._c_node)
+
+ # MANIPULATORS
+
+ def __setitem__(self, x, value):
+     """__setitem__(self, x, value)
+
+     Replaces the given subelement index or slice.
+
+     For an index, ``value`` must be an Element; it takes the place of
+     the child at that index and the tail text of the replaced child is
+     removed.  For a slice, the replacement is delegated to the slice
+     helper.
+
+     Raises ValueError if ``value`` is None, or if (for an index) it is
+     this element or one of its ancestors.  Raises IndexError if there
+     is no child at the given index.
+     """
+     cdef xmlNode* c_node = NULL
+     cdef xmlNode* c_next
+     cdef xmlDoc* c_source_doc
+     cdef _Element element
+     cdef bint left_to_right
+     cdef Py_ssize_t slicelength = 0, step = 0
+     _assertValidNode(self)
+     if value is None:
+         raise ValueError, "cannot assign None"
+     if isinstance(x, slice):
+         # slice assignment
+         _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
+         if step > 0:
+             left_to_right = 1
+         else:
+             # the helper gets a positive step plus the direction flag
+             left_to_right = 0
+             step = -step
+         _replaceSlice(self, c_node, slicelength, step, left_to_right, value)
+         return
+
+     # otherwise: normal item assignment
+     element = value
+     _assertValidNode(element)
+     c_node = _findChild(self._c_node, x)
+     if c_node is NULL:
+         raise IndexError, "list index out of range"
+     # prevent cycles (same guard as in insert() and replace()); checked
+     # before the tree is modified in any way
+     if _isAncestorOrSame(element._c_node, self._c_node):
+         raise ValueError("cannot append parent to itself")
+     # remember where the new node came from before it gets relinked
+     c_source_doc = element._c_node.doc
+     c_next = element._c_node.next
+     _removeText(c_node.next)
+     tree.xmlReplaceNode(c_node, element._c_node)
+     _moveTail(c_next, element._c_node)
+     moveNodeToDocument(self._doc, c_source_doc, element._c_node)
+     # the replaced node may still be referenced from Python
+     if not attemptDeallocation(c_node):
+         moveNodeToDocument(self._doc, c_node.doc, c_node)
+
+ def __delitem__(self, x):
+     """__delitem__(self, x)
+
+     Deletes the given subelement or a slice.
+
+     Raises IndexError if ``x`` is an index without a matching child.
+     """
+     cdef xmlNode* c_node = NULL
+     cdef xmlNode* c_next
+     cdef Py_ssize_t step = 0, slicelength = 0
+     _assertValidNode(self)
+     if not isinstance(x, slice):
+         # item deletion
+         c_node = _findChild(self._c_node, x)
+         if c_node is NULL:
+             raise IndexError, f"index out of range: {x}"
+         _removeNode(self._doc, c_node)
+         return
+
+     if not _isFullSlice(<slice>x):
+         # partial slice deletion
+         _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
+         _deleteSlice(self._doc, c_node, slicelength, step)
+         return
+
+     # full slice: remove every element child, starting at the first one
+     c_node = self._c_node.children
+     if c_node is not NULL and not _isElement(c_node):
+         c_node = _nextElement(c_node)
+     while c_node is not NULL:
+         # fetch the successor before the current node gets removed
+         c_next = _nextElement(c_node)
+         _removeNode(self._doc, c_node)
+         c_node = c_next
+
+ def __deepcopy__(self, memo):
+ """__deepcopy__(self, memo)
+
+ Returns the result of __copy__(); the ``memo`` argument is not used.
+ """
+ return self.__copy__()
+
+ def __copy__(self):
+ """__copy__(self)
+
+ Copies this node into a new document and returns the copy.
+ """
+ cdef xmlDoc* c_doc
+ cdef xmlNode* c_node
+ cdef _Document new_doc
+ _assertValidNode(self)
+ c_doc = _copyDocRoot(self._doc._c_doc, self._c_node) # recursive
+ # the new document reuses the parser of the original document
+ new_doc = _documentFactory(c_doc, self._doc._parser)
+ root = new_doc.getroot()
+ if root is not None:
+ return root
+ # Comment/PI
+ # no root element: take the first top-level node of the same type
+ c_node = c_doc.children
+ while c_node is not NULL and c_node.type != self._c_node.type:
+ c_node = c_node.next
+ if c_node is NULL:
+ return None
+ return _elementFactory(new_doc, c_node)
+
+ def set(self, key, value):
+ """set(self, key, value)
+
+ Sets an element attribute.
+ In HTML documents (not XML or XHTML), the value None is allowed and creates
+ an attribute without value (just the attribute name).
+ """
+ # validate the proxy first, then delegate to the attribute helper
+ _assertValidNode(self)
+ _setAttributeValue(self, key, value)
+
+ def append(self, _Element element not None):
+ """append(self, element)
+
+ Adds a subelement to the end of this element.
+
+ ``element`` must be an Element; None is rejected by the signature.
+ """
+ _assertValidNode(self)
+ _assertValidNode(element)
+ _appendChild(self, element)
+
+ def addnext(self, _Element element not None):
+ """addnext(self, element)
+
+ Adds the element as a following sibling directly after this
+ element.
+
+ This is normally used to set a processing instruction or comment after
+ the root node of a document. Note that tail text is automatically
+ discarded when adding at the root level.
+
+ Raises TypeError when anything other than a processing instruction
+ or comment is added at the root level.
+ """
+ _assertValidNode(self)
+ _assertValidNode(element)
+ # root level: this node has a parent that is not an element
+ if self._c_node.parent != NULL and not _isElement(self._c_node.parent):
+ if element._c_node.type not in (tree.XML_PI_NODE, tree.XML_COMMENT_NODE):
+ raise TypeError, "Only processing instructions and comments can be siblings of the root element"
+ element.tail = None
+ _appendSibling(self, element)
+
+ def addprevious(self, _Element element not None):
+     """addprevious(self, element)
+
+     Adds the element as a preceding sibling directly before this
+     element.
+
+     This is normally used to set a processing instruction or comment
+     before the root node of a document.  Note that tail text is
+     automatically discarded when adding at the root level.
+
+     Raises TypeError when anything other than a processing instruction
+     or comment is added at the root level.
+     """
+     _assertValidNode(self)
+     _assertValidNode(element)
+     # root level: this node has a parent that is not an element
+     if self._c_node.parent != NULL and not _isElement(self._c_node.parent):
+         if element._c_node.type not in (tree.XML_PI_NODE, tree.XML_COMMENT_NODE):
+             raise TypeError, "Only processing instructions and comments can be siblings of the root element"
+         element.tail = None
+     _prependSibling(self, element)
+
+ def extend(self, elements):
+ """extend(self, elements)
+
+ Extends the current children by the elements in the iterable.
+
+ Raises TypeError if the iterable yields None. Items are appended
+ one by one, so the items before a failing one stay appended.
+ """
+ # typed loop variable: every item has to be an _Element (or None)
+ cdef _Element element
+ _assertValidNode(self)
+ for element in elements:
+ if element is None:
+ raise TypeError, "Node must not be None"
+ _assertValidNode(element)
+ _appendChild(self, element)
+
+ def clear(self, bint keep_tail=False):
+ """clear(self, keep_tail=False)
+
+ Resets an element. This function removes all subelements, clears
+ all attributes and sets the text and tail properties to None.
+
+ Pass ``keep_tail=True`` to leave the tail text untouched.
+ """
+ cdef xmlAttr* c_attr
+ cdef xmlAttr* c_attr_next
+ cdef xmlNode* c_node
+ cdef xmlNode* c_node_next
+ _assertValidNode(self)
+ c_node = self._c_node
+ # remove self.text and self.tail
+ _removeText(c_node.children)
+ if not keep_tail:
+ _removeText(c_node.next)
+ # remove all attributes
+ c_attr = c_node.properties
+ if c_attr:
+ # detach the list from the node before freeing it
+ c_node.properties = NULL
+ tree.xmlFreePropList(c_attr)
+ # remove all subelements
+ c_node = c_node.children
+ if c_node and not _isElement(c_node):
+ c_node = _nextElement(c_node)
+ while c_node is not NULL:
+ # fetch the successor before the current node gets removed
+ c_node_next = _nextElement(c_node)
+ _removeNode(self._doc, c_node)
+ c_node = c_node_next
+
+ def insert(self, index: int, _Element element not None):
+ """insert(self, index, element)
+
+ Inserts a subelement at the given position in this element
+
+ If there is no child at ``index``, the element is appended instead.
+ Raises ValueError if ``element`` is this element or one of its
+ ancestors.
+ """
+ cdef xmlNode* c_node
+ cdef xmlNode* c_next
+ cdef xmlDoc* c_source_doc
+ _assertValidNode(self)
+ _assertValidNode(element)
+ c_node = _findChild(self._c_node, index)
+ if c_node is NULL:
+ # no child at that position
+ _appendChild(self, element)
+ return
+ # prevent cycles
+ if _isAncestorOrSame(element._c_node, self._c_node):
+ raise ValueError("cannot append parent to itself")
+ # remember where the element came from before it gets relinked
+ c_source_doc = element._c_node.doc
+ c_next = element._c_node.next
+ tree.xmlAddPrevSibling(c_node, element._c_node)
+ _moveTail(c_next, element._c_node)
+ moveNodeToDocument(self._doc, c_source_doc, element._c_node)
+
+ def remove(self, _Element element not None):
+ """remove(self, element)
+
+ Removes a matching subelement. Unlike the find methods, this
+ method compares elements based on identity, not on tag value
+ or contents.
+
+ Raises ValueError if ``element`` is not a direct child of this
+ element.
+ """
+ cdef xmlNode* c_node
+ cdef xmlNode* c_next
+ _assertValidNode(self)
+ _assertValidNode(element)
+ c_node = element._c_node
+ if c_node.parent is not self._c_node:
+ raise ValueError, "Element is not a child of this node."
+ # remember the following sibling before unlinking, it is handed to
+ # _moveTail() together with the removed node
+ c_next = element._c_node.next
+ tree.xmlUnlinkNode(c_node)
+ _moveTail(c_next, c_node)
+ # fix namespace declarations
+ moveNodeToDocument(self._doc, c_node.doc, c_node)
+
+ def replace(self, _Element old_element not None,
+ _Element new_element not None):
+ """replace(self, old_element, new_element)
+
+ Replaces a subelement with the element passed as second argument.
+
+ Raises ValueError if ``old_element`` is not a direct child of this
+ element, or if ``new_element`` is this element or one of its
+ ancestors.
+ """
+ cdef xmlNode* c_old_node
+ cdef xmlNode* c_old_next
+ cdef xmlNode* c_new_node
+ cdef xmlNode* c_new_next
+ cdef xmlDoc* c_source_doc
+ _assertValidNode(self)
+ _assertValidNode(old_element)
+ _assertValidNode(new_element)
+ c_old_node = old_element._c_node
+ if c_old_node.parent is not self._c_node:
+ raise ValueError, "Element is not a child of this node."
+ c_new_node = new_element._c_node
+ # prevent cycles
+ if _isAncestorOrSame(c_new_node, self._c_node):
+ raise ValueError("cannot append parent to itself")
+ # replace node
+ # (siblings and source document are read before the nodes are relinked)
+ c_old_next = c_old_node.next
+ c_new_next = c_new_node.next
+ c_source_doc = c_new_node.doc
+ tree.xmlReplaceNode(c_old_node, c_new_node)
+ _moveTail(c_new_next, c_new_node)
+ _moveTail(c_old_next, c_old_node)
+ moveNodeToDocument(self._doc, c_source_doc, c_new_node)
+ # fix namespace declarations
+ moveNodeToDocument(self._doc, c_old_node.doc, c_old_node)
+
+ # PROPERTIES
+ property tag:
+ """Element tag
+
+ The value is computed from the C node on first access and cached.
+ Setting validates the name (with the HTML rules if the document's
+ parser is an HTML parser) and updates the node name and namespace.
+ """
+ def __get__(self):
+ # serve the cached value if there is one
+ if self._tag is not None:
+ return self._tag
+ _assertValidNode(self)
+ self._tag = _namespacedName(self._c_node)
+ return self._tag
+
+ def __set__(self, value):
+ cdef _BaseParser parser
+ _assertValidNode(self)
+ ns, name = _getNsTag(value)
+ parser = self._doc._parser
+ if parser is not None and parser._for_html:
+ _htmlTagValidOrRaise(name)
+ else:
+ _tagValidOrRaise(name)
+ # the value is cached as given by the caller
+ self._tag = value
+ tree.xmlNodeSetName(self._c_node, _xcstr(name))
+ if ns is None:
+ self._c_node.ns = NULL
+ else:
+ self._doc._setNodeNs(self._c_node, _xcstr(ns))
+
+    @property
+    def attrib(self):
+        """Element attribute dictionary.  Where possible, prefer get(),
+        set(), keys(), values() and items() for attribute access.
+        """
+        attributes = _Attrib.__new__(_Attrib, self)
+        return attributes
+
+    property text:
+        """Text before the first subelement.  Either a string, or None
+        if there was no text.
+        """
+        def __get__(self):
+            cdef xmlNode* c_first_child
+            _assertValidNode(self)
+            c_first_child = self._c_node.children
+            return _collectText(c_first_child)
+
+        def __set__(self, value):
+            _assertValidNode(self)
+            # QName values are resolved against this element first and
+            # stored as their decoded text form.
+            if isinstance(value, QName):
+                resolved = _resolveQNameText(self, value)
+                value = resolved.decode('utf8')
+            _setNodeText(self._c_node, value)
+
+        # using 'del el.text' is the wrong thing to do
+        #def __del__(self):
+        #    _setNodeText(self._c_node, None)
+
+    property tail:
+        """Text that follows this element's end tag and precedes the
+        start tag of the next sibling element.  Either a string, or None
+        if there was no text.
+        """
+        def __get__(self):
+            cdef xmlNode* c_following
+            _assertValidNode(self)
+            c_following = self._c_node.next
+            return _collectText(c_following)
+
+        def __set__(self, value):
+            _assertValidNode(self)
+            _setTailText(self._c_node, value)
+
+        # using 'del el.tail' is the wrong thing to do
+        #def __del__(self):
+        #    _setTailText(self._c_node, None)
+
+    # not in ElementTree, read-only
+    @property
+    def prefix(self):
+        """Namespace prefix or None.
+
+        None is returned both when the node has no namespace and when its
+        namespace has no prefix.
+        """
+        # Validate the proxy before touching the C node, like the sibling
+        # properties do: self._c_node is dereferenced directly below.
+        _assertValidNode(self)
+        if self._c_node.ns is not NULL:
+            if self._c_node.ns.prefix is not NULL:
+                return funicode(self._c_node.ns.prefix)
+        return None
+
+    # not in ElementTree
+    property sourceline:
+        """Original line number as found by the parser or None if unknown.
+        """
+        def __get__(self):
+            cdef long line
+            _assertValidNode(self)
+            line = tree.xmlGetLineNo(self._c_node)
+            # Only positive line numbers are reported.
+            if line > 0:
+                return line
+            return None
+
+        def __set__(self, line):
+            _assertValidNode(self)
+            # Non-positive values are stored as 0.
+            self._c_node.line = 0 if line <= 0 else line
+
+    # not in ElementTree, read-only
+    @property
+    def nsmap(self):
+        """Namespace prefix->URI mapping known in the context of this
+        Element, including the namespace declarations of all parents.
+
+        Changing the returned dict has no effect on the Element.
+        """
+        _assertValidNode(self)
+        mapping = _build_nsmap(self._c_node)
+        return mapping
+
+    # not in ElementTree
+    property base:
+        """The base URI of the Element (xml:base or HTML base URL).
+        None if the base URI is unknown.
+
+        If neither the Element nor its ancestors carry an xml:base
+        attribute, the value depends on the URL of the document that
+        holds the Element.
+
+        Setting this property will set an xml:base attribute on the
+        Element, regardless of the document type (XML or HTML).
+        """
+        def __get__(self):
+            _assertValidNode(self)
+            c_base = tree.xmlNodeGetBase(self._doc._c_doc, self._c_node)
+            if c_base is not NULL:
+                # The returned buffer is released with xmlFree(), also
+                # when decoding fails.
+                try:
+                    return _decodeFilename(c_base)
+                finally:
+                    tree.xmlFree(c_base)
+            # No base found: fall back to the document URL, if any.
+            if self._doc._c_doc.URL is NULL:
+                return None
+            return _decodeFilename(self._doc._c_doc.URL)
+
+        def __set__(self, url):
+            _assertValidNode(self)
+            if url is not None:
+                url = _encodeFilename(url)
+                c_base = _xcstr(url)
+            else:
+                c_base = <const_xmlChar*>NULL
+            tree.xmlNodeSetBase(self._c_node, c_base)
+
+ # ACCESSORS
+    def __repr__(self):
+        "__repr__(self)"
+        tag = self.tag
+        address = id(self)
+        return "<Element %s at 0x%x>" % (tag, address)
+
+ def __getitem__(self, x):
+ """Returns the subelement at the given position or the requested
+ slice.
+
+ A slice yields a (possibly empty) list of elements; an index that
+ matches no child raises IndexError.
+ """
+ cdef xmlNode* c_node = NULL
+ cdef Py_ssize_t step = 0, slicelength = 0
+ cdef Py_ssize_t c, i
+ cdef _node_to_node_function next_element
+ cdef list result
+ _assertValidNode(self)
+ if isinstance(x, slice):
+ # slicing
+ if _isFullSlice(<slice>x):
+ return _collectChildren(self)
+ # fills in the first node of the slice plus its step and length
+ _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
+ if c_node is NULL:
+ return []
+ # walk forwards for a positive step, otherwise backwards using the
+ # absolute step value
+ if step > 0:
+ next_element = _nextElement
+ else:
+ step = -step
+ next_element = _previousElement
+ result = []
+ c = 0
+ while c_node is not NULL and c < slicelength:
+ result.append(_elementFactory(self._doc, c_node))
+ c += 1
+ # advance 'step' elements, stopping early when the siblings run out
+ for i in range(step):
+ c_node = next_element(c_node)
+ if c_node is NULL:
+ break
+ return result
+ else:
+ # indexing
+ c_node = _findChild(self._c_node, x)
+ if c_node is NULL:
+ raise IndexError, "list index out of range"
+ return _elementFactory(self._doc, c_node)
+
+    def __len__(self):
+        """__len__(self)
+
+        Returns the number of subelements.
+        """
+        cdef xmlNode* c_first_child
+        _assertValidNode(self)
+        c_first_child = self._c_node.children
+        return _countElements(c_first_child)
+
+    def __bool__(self):
+        """__bool__(self)
+
+        Issues a FutureWarning, then returns the result of the child
+        check to emulate the old truth-testing behaviour.
+        """
+        import warnings
+        message = (
+            "Truth-testing of elements was a source of confusion and will always "
+            "return True in future versions. "
+            "Use specific 'len(elem)' or 'elem is not None' test instead."
+        )
+        warnings.warn(message, FutureWarning)
+        # emulate old behaviour
+        _assertValidNode(self)
+        return _hasChild(self._c_node)
+
+    def __contains__(self, element):
+        """__contains__(self, element)
+
+        True only for _Element instances whose C node has this element's
+        C node as its parent.
+        """
+        cdef xmlNode* c_candidate
+        _assertValidNode(self)
+        if isinstance(element, _Element):
+            c_candidate = (<_Element>element)._c_node
+            return c_candidate is not NULL and c_candidate.parent is self._c_node
+        return 0
+
+    def __iter__(self):
+        """__iter__(self)
+
+        Returns an ElementChildIterator over this element.
+        """
+        child_iterator = ElementChildIterator(self)
+        return child_iterator
+
+    def __reversed__(self):
+        """__reversed__(self)
+
+        Returns an ElementChildIterator over this element with
+        ``reversed=True``.
+        """
+        child_iterator = ElementChildIterator(self, reversed=True)
+        return child_iterator
+
+ def index(self, child: _Element, start: int = None, stop: int = None):
+ """index(self, child, start=None, stop=None)
+
+ Find the position of the child within the parent.
+
+ Only element nodes (as decided by ``_isElement``) are counted.
+ Raises ValueError if ``child`` is not a direct child of this
+ element or is not found within the requested range.
+
+ This method is not part of the original ElementTree API.
+ """
+ cdef Py_ssize_t k, l
+ cdef Py_ssize_t c_start, c_stop
+ cdef xmlNode* c_child
+ cdef xmlNode* c_start_node
+ _assertValidNode(self)
+ _assertValidNode(child)
+ c_child = child._c_node
+ if c_child.parent is not self._c_node:
+ raise ValueError, "Element is not a child of this node."
+
+ # handle the unbounded search straight away (normal case)
+ # the index is the number of element siblings that precede the child
+ if stop is None and (start is None or start == 0):
+ k = 0
+ c_child = c_child.prev
+ while c_child is not NULL:
+ if _isElement(c_child):
+ k += 1
+ c_child = c_child.prev
+ return k
+
+ # check indices
+ # a missing start or stop is represented as 0 in the C variables
+ if start is None:
+ c_start = 0
+ else:
+ c_start = start
+ if stop is None:
+ c_stop = 0
+ else:
+ c_stop = stop
+ if c_stop == 0 or \
+ c_start >= c_stop and (c_stop > 0 or c_start < 0):
+ raise ValueError, "list.index(x): x not in slice"
+
+ # for negative slice indices, check slice before searching index
+ if c_start < 0 or c_stop < 0:
+ # start from right, at most up to leftmost(c_start, c_stop)
+ if c_start < c_stop:
+ k = -c_start
+ else:
+ k = -c_stop
+ c_start_node = self._c_node.last
+ # l is the 1-based position counted from the last child
+ l = 1
+ while c_start_node != c_child and l < k:
+ if _isElement(c_start_node):
+ l += 1
+ c_start_node = c_start_node.prev
+ if c_start_node == c_child:
+ # found! before slice end?
+ if c_stop < 0 and l <= -c_stop:
+ raise ValueError, "list.index(x): x not in slice"
+ elif c_start < 0:
+ raise ValueError, "list.index(x): x not in slice"
+
+ # now determine the index backwards from child
+ c_child = c_child.prev
+ k = 0
+ if c_stop > 0:
+ # we can optimize: stop after c_stop elements if not found
+ while c_child != NULL and k < c_stop:
+ if _isElement(c_child):
+ k += 1
+ c_child = c_child.prev
+ if k < c_stop:
+ return k
+ else:
+ # traverse all
+ while c_child != NULL:
+ if _isElement(c_child):
+ k = k + 1
+ c_child = c_child.prev
+ if c_start > 0:
+ if k >= c_start:
+ return k
+ else:
+ return k
+ # falling through to here means the child lies outside the range
+ if c_start != 0 or c_stop != 0:
+ raise ValueError, "list.index(x): x not in slice"
+ else:
+ raise ValueError, "list.index(x): x not in list"
+
+    def get(self, key, default=None):
+        """get(self, key, default=None)
+
+        Gets an element attribute.  ``default`` is handed to the lookup
+        helper as the fallback value.
+        """
+        _assertValidNode(self)
+        value = _getAttributeValue(self, key, default)
+        return value
+
+    def keys(self):
+        """keys(self)
+
+        Gets a list of attribute names, in arbitrary order (just like
+        for an ordinary Python dictionary).
+        """
+        _assertValidNode(self)
+        names = _collectAttributes(self._c_node, 1)
+        return names
+
+    def values(self):
+        """values(self)
+
+        Gets element attribute values as a sequence of strings, in
+        arbitrary order.
+        """
+        _assertValidNode(self)
+        attribute_values = _collectAttributes(self._c_node, 2)
+        return attribute_values
+
+    def items(self):
+        """items(self)
+
+        Gets element attributes as a sequence, in arbitrary order.
+        """
+        _assertValidNode(self)
+        attribute_items = _collectAttributes(self._c_node, 3)
+        return attribute_items
+
+    def getchildren(self):
+        """getchildren(self)
+
+        Returns all direct children, in document order.
+
+        :deprecated: This method has been deprecated as of
+            ElementTree 1.3 and lxml 2.0.  New code should use
+            ``list(element)`` or simply iterate over elements.
+        """
+        _assertValidNode(self)
+        children = _collectChildren(self)
+        return children
+
+    def getparent(self):
+        """getparent(self)
+
+        Returns the parent of this element or None for the root element.
+        """
+        # no _assertValidNode(self) here - marked as not needed
+        cdef xmlNode* c_parent = _parentElement(self._c_node)
+        if c_parent is not NULL:
+            return _elementFactory(self._doc, c_parent)
+        return None
+
+    def getnext(self):
+        """getnext(self)
+
+        Returns the following sibling of this element or None.
+        """
+        # no _assertValidNode(self) here - marked as not needed
+        cdef xmlNode* c_sibling = _nextElement(self._c_node)
+        if c_sibling is not NULL:
+            return _elementFactory(self._doc, c_sibling)
+        return None
+
+    def getprevious(self):
+        """getprevious(self)
+
+        Returns the preceding sibling of this element or None.
+        """
+        # no _assertValidNode(self) here - marked as not needed
+        cdef xmlNode* c_sibling = _previousElement(self._c_node)
+        if c_sibling is not NULL:
+            return _elementFactory(self._doc, c_sibling)
+        return None
+
+    def itersiblings(self, tag=None, *tags, preceding=False):
+        """itersiblings(self, tag=None, *tags, preceding=False)
+
+        Iterate over the following or preceding siblings of this element.
+
+        The 'preceding' keyword selects the direction.  It defaults to
+        False, i.e. forward iteration over the following siblings.  When
+        True, the preceding siblings are yielded in reverse document
+        order, starting right before the current element and going
+        backwards.
+
+        Can be restricted to find only elements with specific tags,
+        see `iter`.
+        """
+        # Shortcut: nothing to iterate if no node exists in the requested
+        # direction.
+        if self._c_node:
+            if preceding:
+                if self._c_node.prev is NULL:
+                    return ITER_EMPTY
+            elif self._c_node.next is NULL:
+                return ITER_EMPTY
+        if tag is not None:
+            tags = tags + (tag,)
+        return SiblingsIterator(self, tags, preceding=preceding)
+
+    def iterancestors(self, tag=None, *tags):
+        """iterancestors(self, tag=None, *tags)
+
+        Iterate over the ancestors of this element (from parent to parent).
+
+        Can be restricted to find only elements with specific tags,
+        see `iter`.
+        """
+        # Shortcut: a node without parent has no ancestors.
+        if self._c_node and self._c_node.parent is NULL:
+            return ITER_EMPTY
+        if tag is not None:
+            tags = tags + (tag,)
+        return AncestorsIterator(self, tags)
+
+    def iterdescendants(self, tag=None, *tags):
+        """iterdescendants(self, tag=None, *tags)
+
+        Iterate over the descendants of this element in document order.
+
+        Unlike ``el.iter()``, this iterator does not yield the element
+        itself.  The returned elements can be restricted to find only
+        elements with specific tags, see `iter`.
+        """
+        # Shortcut: a node without child nodes has no descendants.
+        if self._c_node and self._c_node.children is NULL:
+            return ITER_EMPTY
+        if tag is not None:
+            tags = tags + (tag,)
+        return ElementDepthFirstIterator(self, tags, inclusive=False)
+
+    def iterchildren(self, tag=None, *tags, reversed=False):
+        """iterchildren(self, tag=None, *tags, reversed=False)
+
+        Iterate over the children of this element.
+
+        Unlike normal iteration on this element, the returned elements
+        can be reversed with the 'reversed' keyword and restricted to
+        find only elements with specific tags, see `iter`.
+        """
+        # Shortcut: a node without child nodes has nothing to iterate.
+        if self._c_node and self._c_node.children is NULL:
+            return ITER_EMPTY
+        if tag is not None:
+            tags = tags + (tag,)
+        return ElementChildIterator(self, tags, reversed=reversed)
+
+    def getroottree(self):
+        """getroottree(self)
+
+        Return an ElementTree for the root node of the document that
+        contains this element.
+
+        This is the same as following element.getparent() up the tree
+        until it returns None (for the root element) and then building an
+        ElementTree for the last parent that was returned."""
+        _assertValidDoc(self._doc)
+        root_tree = _elementTreeFactory(self._doc, None)
+        return root_tree
+
+    def getiterator(self, tag=None, *tags):
+        """getiterator(self, tag=None, *tags)
+
+        Returns a sequence or iterator of all elements in the subtree in
+        document order (depth first pre-order), starting with this
+        element.
+
+        Can be restricted to find only elements with specific tags,
+        see `iter`.
+
+        :deprecated: This method is deprecated as of ElementTree 1.3
+            and lxml 2.0.  In lxml it returns an iterator, which
+            diverges from the original ElementTree behaviour.  For an
+            efficient iterator, use the ``element.iter()`` method
+            instead.  Only use this method in new code if you require
+            backwards compatibility with older versions of lxml or
+            ElementTree.
+        """
+        if tag is not None:
+            tags = tags + (tag,)
+        return ElementDepthFirstIterator(self, tags)
+
+    def iter(self, tag=None, *tags):
+        """iter(self, tag=None, *tags)
+
+        Iterate over all elements in the subtree in document order (depth
+        first pre-order), starting with this element.
+
+        Can be restricted to find only elements with specific tags:
+        pass ``"{ns}localname"`` as tag.  Either or both of ``ns`` and
+        ``localname`` can be ``*`` for a wildcard; ``ns`` can be empty
+        for no namespace.  ``"localname"`` is equivalent to
+        ``"{}localname"`` (i.e. no namespace) but ``"*"`` is ``"{*}*"``
+        (any or no namespace), not ``"{}*"``.
+
+        You can also pass the Element, Comment, ProcessingInstruction and
+        Entity factory functions to look only for the specific element
+        type.
+
+        Passing multiple tags (or a sequence of tags) instead of a single
+        tag will let the iterator return all elements matching any of
+        these tags, in document order.
+        """
+        if tag is not None:
+            tags = tags + (tag,)
+        return ElementDepthFirstIterator(self, tags)
+
+    def itertext(self, tag=None, *tags, with_tail=True):
+        """itertext(self, tag=None, *tags, with_tail=True)
+
+        Iterates over the text content of a subtree.
+
+        Tag names can be passed to restrict text content to specific
+        elements, see `iter`.
+
+        Set the ``with_tail`` keyword argument to ``False`` to skip over
+        tail text.
+        """
+        if tag is not None:
+            tags = tags + (tag,)
+        return ElementTextIterator(self, tags, with_tail=with_tail)
+
+    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
+        """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
+
+        Creates a new element associated with the same document.
+        """
+        _assertValidDoc(self._doc)
+        new_element = _makeElement(
+            _tag, NULL, self._doc, None, None, None, attrib, nsmap, _extra)
+        return new_element
+
+    def find(self, path, namespaces=None):
+        """find(self, path, namespaces=None)
+
+        Finds the first matching subelement, by tag name or path.
+
+        The optional ``namespaces`` argument accepts a
+        prefix-to-namespace mapping that allows the usage of XPath
+        prefixes in the path expression.
+        """
+        # QName paths are searched by their text form.
+        path_text = (<QName>path).text if isinstance(path, QName) else path
+        use_prefixes = not _isHtmlDocument(self)
+        return _elementpath.find(self, path_text, namespaces, with_prefixes=use_prefixes)
+
+    def findtext(self, path, default=None, namespaces=None):
+        """findtext(self, path, default=None, namespaces=None)
+
+        Finds text for the first matching subelement, by tag name or path.
+
+        The optional ``namespaces`` argument accepts a
+        prefix-to-namespace mapping that allows the usage of XPath
+        prefixes in the path expression.
+        """
+        # QName paths are searched by their text form.
+        path_text = (<QName>path).text if isinstance(path, QName) else path
+        use_prefixes = not _isHtmlDocument(self)
+        return _elementpath.findtext(self, path_text, default, namespaces, with_prefixes=use_prefixes)
+
+    def findall(self, path, namespaces=None):
+        """findall(self, path, namespaces=None)
+
+        Finds all matching subelements, by tag name or path.
+
+        The optional ``namespaces`` argument accepts a
+        prefix-to-namespace mapping that allows the usage of XPath
+        prefixes in the path expression.
+        """
+        # QName paths are searched by their text form.
+        path_text = (<QName>path).text if isinstance(path, QName) else path
+        use_prefixes = not _isHtmlDocument(self)
+        return _elementpath.findall(self, path_text, namespaces, with_prefixes=use_prefixes)
+
+    def iterfind(self, path, namespaces=None):
+        """iterfind(self, path, namespaces=None)
+
+        Iterates over all matching subelements, by tag name or path.
+
+        The optional ``namespaces`` argument accepts a
+        prefix-to-namespace mapping that allows the usage of XPath
+        prefixes in the path expression.
+        """
+        # QName paths are searched by their text form.
+        path_text = (<QName>path).text if isinstance(path, QName) else path
+        use_prefixes = not _isHtmlDocument(self)
+        return _elementpath.iterfind(self, path_text, namespaces, with_prefixes=use_prefixes)
+
+    def xpath(self, _path, *, namespaces=None, extensions=None,
+              smart_strings=True, **_variables):
+        """xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
+
+        Evaluate an xpath expression using the element as context node.
+
+        Extra keyword arguments are passed to the evaluator call together
+        with the path.
+        """
+        evaluate = XPathElementEvaluator(
+            self,
+            namespaces=namespaces,
+            extensions=extensions,
+            smart_strings=smart_strings,
+        )
+        return evaluate(_path, **_variables)
+
+    def cssselect(self, expr, *, translator='xml'):
+        """
+        Run the CSS expression on this element and its children,
+        returning a list of the results.
+
+        Equivalent to
+        lxml.cssselect.CSSSelector(expr, translator=translator)(self) --
+        note that pre-compiling the expression can provide a substantial
+        speedup.
+        """
+        # Imported lazily so that the dependency stays optional.
+        from lxml.cssselect import CSSSelector
+        selector = CSSSelector(expr, translator=translator)
+        return selector(self)
+
+
+cdef extern from "includes/etree_defs.h":
+ # macro call to 't->tp_new()' for fast instantiation
+ # Declared here as NEW_ELEMENT; the C-level name is "PY_NEW".
+ cdef object NEW_ELEMENT "PY_NEW" (object t)
+
+
+# Returns the _Element proxy for c_node: an already registered proxy if
+# getProxy() finds one, None for a NULL node, otherwise a newly created and
+# registered instance of the class chosen by the element class lookup.
+@cython.linetrace(False)
+cdef _Element _elementFactory(_Document doc, xmlNode* c_node):
+ cdef _Element result
+ result = getProxy(c_node)
+ if result is not None:
+ return result
+ if c_node is NULL:
+ return None
+
+ element_class = LOOKUP_ELEMENT_CLASS(
+ ELEMENT_CLASS_LOOKUP_STATE, doc, c_node)
+ if hasProxy(c_node):
+ # prevent re-entry race condition - we just called into Python
+ return getProxy(c_node)
+ result = NEW_ELEMENT(element_class)
+ if hasProxy(c_node):
+ # prevent re-entry race condition - we just called into Python
+ # the freshly created object is discarded; its node pointer is reset
+ # first so that it does not refer to c_node
+ result._c_node = NULL
+ return getProxy(c_node)
+
+ _registerProxy(result, doc, c_node)
+ # _init() is only called for classes other than plain _Element
+ if element_class is not _Element:
+ result._init()
+ return result
+
+
+@cython.internal
+cdef class __ContentOnlyElement(_Element):
+    """Base class of _Comment, _ProcessingInstruction and _Entity.
+
+    Instances expose text content only: the mutating methods raise
+    TypeError and the child/attribute accessors report "empty".
+    """
+    cdef int _raiseImmutable(self) except -1:
+        # Shared failure path of all mutating methods below.
+        raise TypeError, "this element does not have children or attributes"
+
+    def set(self, key, value):
+        "set(self, key, value)"
+        self._raiseImmutable()
+
+    def append(self, value):
+        "append(self, value)"
+        self._raiseImmutable()
+
+    def insert(self, index, value):
+        "insert(self, index, value)"
+        self._raiseImmutable()
+
+    def __setitem__(self, index, value):
+        "__setitem__(self, index, value)"
+        self._raiseImmutable()
+
+    @property
+    def attrib(self):
+        return IMMUTABLE_EMPTY_MAPPING
+
+    property text:
+        def __get__(self):
+            _assertValidNode(self)
+            return funicodeOrEmpty(self._c_node.content)
+
+        def __set__(self, value):
+            _assertValidNode(self)
+            # None is passed on as a NULL content pointer.
+            if value is None:
+                c_text = <const_xmlChar*>NULL
+            else:
+                value = _utf8(value)
+                c_text = _xcstr(value)
+            tree.xmlNodeSetContent(self._c_node, c_text)
+
+    # ACCESSORS
+    def __getitem__(self, x):
+        "__getitem__(self, x)"
+        # There are no children: slices are empty, indices out of range.
+        if isinstance(x, slice):
+            return []
+        else:
+            raise IndexError, "list index out of range"
+
+    def __len__(self):
+        "__len__(self)"
+        return 0
+
+    def get(self, key, default=None):
+        """get(self, key, default=None)
+
+        There are no attributes, so no key is ever found and ``default``
+        is returned.
+        """
+        return default
+
+    def keys(self):
+        "keys(self)"
+        return []
+
+    def items(self):
+        "items(self)"
+        return []
+
+    def values(self):
+        "values(self)"
+        return []
+
+cdef class _Comment(__ContentOnlyElement):
+    @property
+    def tag(self):
+        # The tag is the Comment factory function itself.
+        return Comment
+
+    def __repr__(self):
+        comment_text = self.text
+        return "<!--%s-->" % comment_text
+
+cdef class _ProcessingInstruction(__ContentOnlyElement):
+    @property
+    def tag(self):
+        # The tag is the ProcessingInstruction factory function itself.
+        return ProcessingInstruction
+
+    property target:
+        # not in ElementTree
+        def __get__(self):
+            _assertValidNode(self)
+            return funicode(self._c_node.name)
+
+        def __set__(self, value):
+            _assertValidNode(self)
+            value = _utf8(value)
+            tree.xmlNodeSetName(self._c_node, _xcstr(value))
+
+    def __repr__(self):
+        text = self.text
+        if not text:
+            return "<?%s?>" % self.target
+        return "<?%s %s?>" % (self.target, text)
+
+    def get(self, key, default=None):
+        """get(self, key, default=None)
+
+        Try to parse pseudo-attributes from the text content of the
+        processing instruction, look for one named ``key`` and return
+        its associated value.
+
+        This is only a convenience method for the most common case
+        that all text content is structured in attribute-like
+        name-value pairs with properly quoted values.  It is not
+        guaranteed to work for all possible text content.
+        """
+        pseudo_attributes = self.attrib
+        return pseudo_attributes.get(key, default)
+
+    @property
+    def attrib(self):
+        """Returns a dict containing all pseudo-attributes that can be
+        parsed from the text content of this processing instruction.
+        Modifying the dict currently has no effect on the XML node,
+        although this is not guaranteed to stay this way.
+        """
+        pseudo_attributes = {}
+        # Each match carries the name plus the single-quoted and the
+        # double-quoted value; the first non-empty one is used.
+        for attr, value1, value2 in _FIND_PI_ATTRIBUTES(' ' + self.text):
+            pseudo_attributes[attr] = value1 or value2
+        return pseudo_attributes
+
+# Bound ``findall`` of a pattern matching ``name='value'`` / ``name="value"``
+# pairs that are preceded by whitespace.  Each match is a 3-tuple of
+# (name, single-quoted value, double-quoted value).
+cdef object _FIND_PI_ATTRIBUTES = re.compile(r'\s+(\w+)\s*=\s*(?:\'([^\']*)\'|"([^"]*)")', re.U).findall
+
+cdef class _Entity(__ContentOnlyElement):
+    @property
+    def tag(self):
+        # The tag is the Entity factory function itself.
+        return Entity
+
+    property name:
+        # not in ElementTree
+        def __get__(self):
+            _assertValidNode(self)
+            return funicode(self._c_node.name)
+
+        def __set__(self, value):
+            _assertValidNode(self)
+            name_utf = _utf8(value)
+            # '&' and ';' delimit the entity reference and are rejected
+            # inside the name.
+            if b'&' in name_utf or b';' in name_utf:
+                raise ValueError, f"Invalid entity name '{value}'"
+            tree.xmlNodeSetName(self._c_node, _xcstr(name_utf))
+
+    @property
+    def text(self):
+        # FIXME: should this be None or '&[VALUE];' or the resolved
+        # entity value ?
+        _assertValidNode(self)
+        return f'&{funicode(self._c_node.name)};'
+
+    def __repr__(self):
+        entity_name = self.name
+        return "&%s;" % entity_name
+
+
+cdef class QName:
+    """QName(text_or_uri_or_element, tag=None)
+
+    QName wrapper for qualified XML names.
+
+    Pass a tag name by itself or a namespace URI and a tag name to
+    create a qualified name.  Alternatively, pass an Element to
+    extract its tag name.  ``None`` as first argument is ignored in
+    order to allow for generic 2-argument usage.
+
+    The ``text`` property holds the qualified name in
+    ``{namespace}tagname`` notation.  The ``namespace`` and
+    ``localname`` properties hold the respective parts of the tag
+    name.
+
+    QName objects can be passed wherever a tag name is expected.
+    Also, setting Element text from a QName will resolve the namespace
+    prefix on assignment and set a qualified text value.  This is
+    helpful in XML languages like SOAP or XML-Schema that use prefixed
+    tag names in their text content.
+    """
+    cdef readonly unicode text
+    cdef readonly unicode localname
+    cdef readonly unicode namespace
+
+    def __init__(self, text_or_uri_or_element, tag=None):
+        if text_or_uri_or_element is None:
+            # Allow None as no namespace.
+            text_or_uri_or_element, tag = tag, None
+        if not _isString(text_or_uri_or_element):
+            # Non-string input: reduce Elements and QNames to their tag
+            # text, stringify anything else except None.
+            if isinstance(text_or_uri_or_element, _Element):
+                text_or_uri_or_element = (<_Element>text_or_uri_or_element).tag
+                if not _isString(text_or_uri_or_element):
+                    raise ValueError, f"Invalid input tag of type {type(text_or_uri_or_element)!r}"
+            elif isinstance(text_or_uri_or_element, QName):
+                text_or_uri_or_element = (<QName>text_or_uri_or_element).text
+            elif text_or_uri_or_element is None:
+                raise ValueError, f"Invalid input tag of type {type(text_or_uri_or_element)!r}"
+            else:
+                text_or_uri_or_element = unicode(text_or_uri_or_element)
+
+        ns_bytes, local_bytes = _getNsTag(text_or_uri_or_element)
+        if tag is not None:
+            # either ('ns', 'tag') or ('{ns}oldtag', 'newtag')
+            if ns_bytes is None:
+                # case 1: namespace ended up as tag name
+                ns_bytes = local_bytes
+            local_bytes = _utf8(tag)
+        _tagValidOrRaise(local_bytes)
+        self.localname = (<bytes>local_bytes).decode('utf8')
+        if ns_bytes is not None:
+            self.namespace = (<bytes>ns_bytes).decode('utf8')
+            self.text = "{%s}%s" % (self.namespace, self.localname)
+        else:
+            self.namespace = None
+            self.text = self.localname
+
+    def __str__(self):
+        return self.text
+
+    def __hash__(self):
+        return hash(self.text)
+
+    def __richcmp__(self, other, int op):
+        # Compare by text: other QNames contribute their text, anything
+        # that is not already a string is converted first.
+        try:
+            if type(other) is QName:
+                other_text = (<QName>other).text
+            elif isinstance(other, unicode):
+                other_text = other
+            else:
+                other_text = unicode(other)
+        except (ValueError, UnicodeDecodeError):
+            return NotImplemented
+        return python.PyObject_RichCompare(self.text, other_text, op)
+
+
+cdef public class _ElementTree [ type LxmlElementTreeType,
+ object LxmlElementTree ]:
+ cdef _Document _doc
+ cdef _Element _context_node
+
+ # Note that _doc is only used to store the original document if we do not
+ # have a _context_node. All methods should prefer self._context_node._doc
+ # to honour tree restructuring. _doc can happily be None!
+
+    @cython.final
+    cdef int _assertHasRoot(self) except -1:
+        """Assert that this tree has a context node.
+
+        Care is needed here because the document may lack a root node:
+        this happens when ElementTree() is called without any argument
+        and the caller 'forgets' to call parse() afterwards, which is a
+        bug in the calling program.
+        """
+        assert self._context_node is not None, (
+            "ElementTree not initialized, missing root")
+        return 0
+
+    def parse(self, source, _BaseParser parser=None, *, base_url=None):
+        """parse(self, source, parser=None, base_url=None)
+
+        Updates self with the content of source and returns its root.
+        """
+        cdef _Document parsed_doc = None
+        try:
+            parsed_doc = _parseDocument(source, parser, base_url)
+        except _TargetParserResult as target_result:
+            # raises a TypeError if we don't get an _Element
+            self._context_node = target_result.result
+        else:
+            self._context_node = parsed_doc.getroot()
+        # The document itself is only kept when there is no root element.
+        if self._context_node is not None:
+            self._doc = None
+        else:
+            self._doc = parsed_doc
+        return self._context_node
+
+    def _setroot(self, _Element root not None):
+        """_setroot(self, root)
+
+        Relocate the ElementTree to a new root node.
+
+        Raises TypeError if ``root`` is not an element node.
+        """
+        _assertValidNode(root)
+        if root._c_node.type == tree.XML_ELEMENT_NODE:
+            self._context_node = root
+            self._doc = None
+        else:
+            raise TypeError, "Only elements can be the root of an ElementTree"
+
+    def getroot(self):
+        """getroot(self)
+
+        Gets the root element for this tree (the stored context node,
+        which may be None).
+        """
+        root = self._context_node
+        return root
+
+    def __copy__(self):
+        # The new tree is built from the same document and context node.
+        tree_copy = _elementTreeFactory(self._doc, self._context_node)
+        return tree_copy
+
+    def __deepcopy__(self, memo):
+        cdef _Element root
+        cdef _Document doc
+        cdef xmlDoc* c_doc
+        if self._context_node is None:
+            if self._doc is None:
+                # so what ...
+                return self
+            # No root element: copy the whole stored document instead.
+            _assertValidDoc(self._doc)
+            c_doc = tree.xmlCopyDoc(self._doc._c_doc, 1)
+            if c_doc is NULL:
+                raise MemoryError()
+            doc = _documentFactory(c_doc, self._doc._parser)
+            return _elementTreeFactory(doc, None)
+        # Copy the root element, then its non-element siblings.
+        root = self._context_node.__copy__()
+        assert root is not None
+        _assertValidNode(root)
+        _copyNonElementSiblings(self._context_node._c_node, root._c_node)
+        return _elementTreeFactory(None, root)
+
+    # not in ElementTree
+    @property
+    def docinfo(self) -> DocInfo:
+        """Information about the document provided by parser and DTD."""
+        cdef _Document doc
+        self._assertHasRoot()
+        doc = self._context_node._doc
+        return DocInfo(doc)
+
+    # not in ElementTree, read-only
+    @property
+    def parser(self):
+        """The parser that was used to parse the document in this ElementTree.
+        """
+        cdef _Document doc = None
+        # Prefer the document of the context node, fall back to the
+        # stored document.
+        if self._context_node is not None:
+            doc = self._context_node._doc
+        if doc is None:
+            doc = self._doc
+        if doc is None:
+            return None
+        return doc._parser
+
+ def write(self, file, *, encoding=None, method="xml",
+ bint pretty_print=False, xml_declaration=None, bint with_tail=True,
+ standalone=None, doctype=None, compression=0,
+ bint exclusive=False, inclusive_ns_prefixes=None,
+ bint with_comments=True, bint strip_text=False,
+ docstring=None):
+ """write(self, file, encoding=None, method="xml",
+ pretty_print=False, xml_declaration=None, with_tail=True,
+ standalone=None, doctype=None, compression=0,
+ exclusive=False, inclusive_ns_prefixes=None,
+ with_comments=True, strip_text=False)
+
+ Write the tree to a filename, file or file-like object.
+
+ Defaults to ASCII encoding and writing a declaration as needed.
+
+ The keyword argument 'method' selects the output method:
+ 'xml', 'html', 'text', 'c14n' or 'c14n2'. Default is 'xml'.
+
+ With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
+ ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
+ C14N, include comments, and list the inclusive prefixes respectively.
+
+ With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
+ ``strip_text`` options control the output of comments and text space
+ according to C14N 2.0.
+
+ Passing a boolean value to the ``standalone`` option will
+ output an XML declaration with the corresponding
+ ``standalone`` flag.
+
+ The ``doctype`` option allows passing in a plain string that will
+ be serialised before the XML tree. Note that passing in non
+ well-formed content here will make the XML output non well-formed.
+ Also, an existing doctype in the document tree will not be removed
+ when serialising an ElementTree instance.
+
+ The ``docstring`` option is a deprecated alias of ``doctype``. It is
+ only used when ``doctype`` is not given and triggers a
+ DeprecationWarning.
+
+ The ``compression`` option enables GZip compression level 1-9.
+
+ The ``inclusive_ns_prefixes`` should be a list of namespace strings
+ (i.e. ['xs', 'xsi']) that will be promoted to the top-level element
+ during exclusive C14N serialisation. This parameter is ignored if
+ exclusive mode=False.
+
+ If exclusive=True and no list is provided, a namespace will only be
+ rendered if it is used by the immediate parent or one of its attributes
+ and its prefix and values have not already been rendered by an ancestor
+ of the namespace node's parent element.
+
+ Raises ValueError if ``encoding`` or a true ``xml_declaration`` is
+ combined with a C14N method, or if ``with_comments`` is false for a
+ non-C14N method.
+ """
+ cdef bint write_declaration
+ cdef int is_standalone
+
+ self._assertHasRoot()
+ _assertValidNode(self._context_node)
+ # a missing or negative compression level is treated as 0
+ if compression is None or compression < 0:
+ compression = 0
+
+ # C14N serialisation
+ if method in ('c14n', 'c14n2'):
+ if encoding is not None:
+ raise ValueError("Cannot specify encoding with C14N")
+ if xml_declaration:
+ raise ValueError("Cannot enable XML declaration in C14N")
+
+ if method == 'c14n':
+ _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
+ compression, inclusive_ns_prefixes)
+ else: # c14n2
+ with _open_utf8_file(file, compression=compression) as f:
+ target = C14NWriterTarget(
+ f.write, with_comments=with_comments, strip_text=strip_text)
+ _tree_to_target(self, target)
+ # C14N output is complete here; the generic serialiser below is skipped
+ return
+
+ if not with_comments:
+ raise ValueError("Can only discard comments in C14N serialisation")
+ # suppress decl. in default case (purely for ElementTree compatibility)
+ if xml_declaration is not None:
+ write_declaration = xml_declaration
+ if encoding is None:
+ encoding = 'ASCII'
+ else:
+ encoding = encoding.upper()
+ elif encoding is None:
+ encoding = 'ASCII'
+ write_declaration = 0
+ else:
+ encoding = encoding.upper()
+ write_declaration = encoding not in (
+ 'US-ASCII', 'ASCII', 'UTF8', 'UTF-8')
+ # any explicit standalone value (true or false) forces a declaration;
+ # -1 stands for "not specified"
+ if standalone is None:
+ is_standalone = -1
+ elif standalone:
+ write_declaration = 1
+ is_standalone = 1
+ else:
+ write_declaration = 1
+ is_standalone = 0
+
+ # legacy alias: 'docstring' is only honoured when 'doctype' is absent
+ if docstring is not None and doctype is None:
+ import warnings
+ warnings.warn(
+ "The 'docstring' option is deprecated. Use 'doctype' instead.",
+ DeprecationWarning)
+ doctype = docstring
+
+ _tofilelike(file, self._context_node, encoding, doctype, method,
+ write_declaration, 1, pretty_print, with_tail,
+ is_standalone, compression)
+
+    def getpath(self, _Element element not None):
+        """getpath(self, element)
+
+        Returns a structural, absolute XPath expression to find the element.
+
+        For namespaced elements, the expression uses prefixes from the
+        document, which therefore need to be provided in order to make any
+        use of the expression in XPath.
+
+        Also see the method getelementpath(self, element), which returns a
+        self-contained ElementPath expression.
+
+        Raises ValueError if the tree is empty or ``element`` belongs to
+        a different document, and MemoryError if libxml2 returns no path.
+        """
+        cdef _Document doc
+        cdef _Element root
+        cdef xmlDoc* c_doc
+        _assertValidNode(element)
+        # Prefer the context node's document, fall back to the stored one.
+        if self._context_node is not None:
+            root = self._context_node
+            doc = root._doc
+        elif self._doc is not None:
+            doc = self._doc
+            root = doc.getroot()
+        else:
+            raise ValueError, "Element is not in this tree."
+        _assertValidDoc(doc)
+        _assertValidNode(root)
+        if element._doc is not doc:
+            raise ValueError, "Element is not in this tree."
+
+        # The path is computed while a fake root document is in place,
+        # which is destroyed again right afterwards.
+        c_doc = _fakeRootDoc(doc._c_doc, root._c_node)
+        c_path = tree.xmlGetNodePath(element._c_node)
+        _destroyFakeDoc(doc._c_doc, c_doc)
+        if c_path is NULL:
+            raise MemoryError()
+        # Release the C string even if decoding it raises.
+        try:
+            path = funicode(c_path)
+        finally:
+            tree.xmlFree(c_path)
+        return path
+
+ def getelementpath(self, _Element element not None):
+ """getelementpath(self, element)
+
+ Returns a structural, absolute ElementPath expression to find the
+ element. This path can be used in the .find() method to look up
+ the element, provided that the elements along the path and their
+ list of immediate children were not modified in between.
+
+ ElementPath has the advantage over an XPath expression (as returned
+ by the .getpath() method) that it does not require additional prefix
+ declarations. It is always self-contained.
+
+ Returns '.' if the element is the root of this tree.
+
+ Raises ValueError if the input is not an Element node, if this tree
+ has neither a context node nor a document, if the element belongs to
+ a different document, or if the root is not among its ancestors.
+ """
+ cdef _Element root
+ cdef Py_ssize_t count
+ _assertValidNode(element)
+ if element._c_node.type != tree.XML_ELEMENT_NODE:
+ raise ValueError, "input is not an Element"
+ if self._context_node is not None:
+ root = self._context_node
+ elif self._doc is not None:
+ root = self._doc.getroot()
+ else:
+ raise ValueError, "Element is not in this tree"
+ _assertValidNode(root)
+ if element._doc is not root._doc:
+ raise ValueError, "Element is not in this tree"
+
+ # collect one path step per level while climbing from the element up to the root
+ path = []
+ c_element = element._c_node
+ while c_element is not root._c_node:
+ c_name = c_element.name
+ c_href = _getNs(c_element)
+ tag = _namespacedNameFromNsName(c_href, c_name)
+ if c_href is NULL:
+ c_href = <const_xmlChar*>b'' # no namespace (NULL is wildcard)
+ # use tag[N] if there are preceding siblings with the same tag
+ count = 0
+ c_node = c_element.prev
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if _tagMatches(c_node, c_href, c_name):
+ count += 1
+ c_node = c_node.prev
+ if count:
+ tag = f'{tag}[{count+1}]'
+ else:
+ # use tag[1] if there are following siblings with the same tag
+ c_node = c_element.next
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if _tagMatches(c_node, c_href, c_name):
+ tag += '[1]'
+ break
+ c_node = c_node.next
+
+ path.append(tag)
+ c_element = c_element.parent
+ # running out of element parents means the root was never reached
+ if c_element is NULL or c_element.type != tree.XML_ELEMENT_NODE:
+ raise ValueError, "Element is not in this tree."
+ if not path:
+ return '.'
+ # steps were collected bottom-up, the expression reads top-down
+ path.reverse()
+ return '/'.join(path)
+
+    def getiterator(self, tag=None, *tags):
+        """getiterator(self, tag=None, *tags)
+
+        Returns a sequence or iterator of all elements in document order
+        (depth first pre-order), starting with the root element.  If the
+        tree has no root, an empty iterator is returned.
+
+        Can be restricted to find only elements with specific tags,
+        see `_Element.iter`.
+
+        :deprecated: Note that this method is deprecated as of
+          ElementTree 1.3 and lxml 2.0.  It returns an iterator in
+          lxml, which diverges from the original ElementTree
+          behaviour.  If you want an efficient iterator, use the
+          ``tree.iter()`` method instead.  You should only use this
+          method in new code if you require backwards compatibility
+          with older versions of lxml or ElementTree.
+        """
+        top = self.getroot()
+        if top is None:
+            return ITER_EMPTY
+        # fold the single 'tag' argument into the tuple of requested tags
+        selected = tags if tag is None else tags + (tag,)
+        return top.getiterator(*selected)
+
+    def iter(self, tag=None, *tags):
+        """iter(self, tag=None, *tags)
+
+        Creates an iterator for the root element.  The iterator loops over
+        all elements in this tree, in document order.  Note that siblings
+        of the root element (comments or processing instructions) are not
+        returned by the iterator.  If the tree has no root, an empty
+        iterator is returned.
+
+        Can be restricted to find only elements with specific tags,
+        see `_Element.iter`.
+        """
+        top = self.getroot()
+        if top is None:
+            return ITER_EMPTY
+        # fold the single 'tag' argument into the tuple of requested tags
+        selected = tags if tag is None else tags + (tag,)
+        return top.iter(*selected)
+
+    def find(self, path, namespaces=None):
+        """find(self, path, namespaces=None)
+
+        Finds the first toplevel element with given tag.  Same as
+        ``tree.getroot().find(path)``.
+
+        The optional ``namespaces`` argument accepts a
+        prefix-to-namespace mapping that allows the usage of XPath
+        prefixes in the path expression.
+
+        A string path that starts with "/" is searched as "." + path and
+        triggers a FutureWarning.
+        """
+        self._assertHasRoot()
+        start = self.getroot()
+        if _isString(path) and path[:1] == "/":
+            # absolute paths are evaluated relative to the root element
+            path = "." + path
+            import warnings
+            message = (
+                "This search incorrectly ignores the root element, and will be "
+                "fixed in a future version.  If you rely on the current "
+                f"behaviour, change it to {path!r}"
+            )
+            warnings.warn(message, FutureWarning, stacklevel=1)
+        return start.find(path, namespaces)
+
+    def findtext(self, path, default=None, namespaces=None):
+        """findtext(self, path, default=None, namespaces=None)
+
+        Finds the text for the first element matching the ElementPath
+        expression.  Same as getroot().findtext(path)
+
+        The optional ``namespaces`` argument accepts a
+        prefix-to-namespace mapping that allows the usage of XPath
+        prefixes in the path expression.
+
+        A string path that starts with "/" is searched as "." + path and
+        triggers a FutureWarning.
+        """
+        self._assertHasRoot()
+        start = self.getroot()
+        if _isString(path) and path[:1] == "/":
+            # absolute paths are evaluated relative to the root element
+            path = "." + path
+            import warnings
+            message = (
+                "This search incorrectly ignores the root element, and will be "
+                "fixed in a future version.  If you rely on the current "
+                f"behaviour, change it to {path!r}"
+            )
+            warnings.warn(message, FutureWarning, stacklevel=1)
+        return start.findtext(path, default, namespaces)
+
+    def findall(self, path, namespaces=None):
+        """findall(self, path, namespaces=None)
+
+        Finds all elements matching the ElementPath expression.  Same as
+        getroot().findall(path).
+
+        The optional ``namespaces`` argument accepts a
+        prefix-to-namespace mapping that allows the usage of XPath
+        prefixes in the path expression.
+
+        A string path that starts with "/" is searched as "." + path and
+        triggers a FutureWarning.
+        """
+        self._assertHasRoot()
+        start = self.getroot()
+        if _isString(path) and path[:1] == "/":
+            # absolute paths are evaluated relative to the root element
+            path = "." + path
+            import warnings
+            message = (
+                "This search incorrectly ignores the root element, and will be "
+                "fixed in a future version.  If you rely on the current "
+                f"behaviour, change it to {path!r}"
+            )
+            warnings.warn(message, FutureWarning, stacklevel=1)
+        return start.findall(path, namespaces)
+
+    def iterfind(self, path, namespaces=None):
+        """iterfind(self, path, namespaces=None)
+
+        Iterates over all elements matching the ElementPath expression.
+        Same as getroot().iterfind(path).
+
+        The optional ``namespaces`` argument accepts a
+        prefix-to-namespace mapping that allows the usage of XPath
+        prefixes in the path expression.
+
+        A string path that starts with "/" is searched as "." + path and
+        triggers a FutureWarning.
+        """
+        self._assertHasRoot()
+        start = self.getroot()
+        if _isString(path) and path[:1] == "/":
+            # absolute paths are evaluated relative to the root element
+            path = "." + path
+            import warnings
+            message = (
+                "This search incorrectly ignores the root element, and will be "
+                "fixed in a future version.  If you rely on the current "
+                f"behaviour, change it to {path!r}"
+            )
+            warnings.warn(message, FutureWarning, stacklevel=1)
+        return start.iterfind(path, namespaces)
+
+    def xpath(self, _path, *, namespaces=None, extensions=None,
+              smart_strings=True, **_variables):
+        """xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
+
+        XPath evaluate in context of document.
+
+        ``namespaces`` is an optional dictionary with prefix to namespace URI
+        mappings, used by XPath.  ``extensions`` defines additional extension
+        functions.  Any further keyword arguments are passed to the
+        evaluator call as ``_variables``.
+
+        Returns a list (nodeset), or bool, float or string.
+
+        In case of a list result, return Element for element nodes,
+        string for text and attribute values.
+
+        Note: if you are going to apply multiple XPath expressions
+        against the same document, it is more efficient to use
+        XPathEvaluator directly.
+        """
+        self._assertHasRoot()
+        # one-shot evaluator: built for this call and invoked right away
+        return XPathDocumentEvaluator(
+            self,
+            namespaces=namespaces,
+            extensions=extensions,
+            smart_strings=smart_strings,
+        )(_path, **_variables)
+
+    def xslt(self, _xslt, extensions=None, access_control=None, **_kw):
+        """xslt(self, _xslt, extensions=None, access_control=None, **_kw)
+
+        Transform this document using other document.
+
+        xslt is a tree that should be XSLT
+        keyword parameters are XSLT transformation parameters.
+
+        Returns the transformed tree.
+
+        Note: if you are going to apply the same XSLT stylesheet against
+        multiple documents, it is more efficient to use the XSLT
+        class directly.
+        """
+        self._assertHasRoot()
+        # one-shot stylesheet: built for this call and applied right away
+        return XSLT(
+            _xslt, extensions=extensions, access_control=access_control
+        )(self, **_kw)
+
+    def relaxng(self, relaxng):
+        """relaxng(self, relaxng)
+
+        Validate this document using other document.
+
+        The relaxng argument is a tree that should contain a Relax NG schema.
+
+        Returns True or False, depending on whether validation
+        succeeded.
+
+        Note: if you are going to apply the same Relax NG schema against
+        multiple documents, it is more efficient to use the RelaxNG
+        class directly.
+        """
+        self._assertHasRoot()
+        # one-shot validator: built for this call only
+        return RelaxNG(relaxng).validate(self)
+
+    def xmlschema(self, xmlschema):
+        """xmlschema(self, xmlschema)
+
+        Validate this document using other document.
+
+        The xmlschema argument is a tree that should contain an XML Schema.
+
+        Returns True or False, depending on whether validation
+        succeeded.
+
+        Note: If you are going to apply the same XML Schema against
+        multiple documents, it is more efficient to use the XMLSchema
+        class directly.
+        """
+        self._assertHasRoot()
+        # one-shot validator: built for this call only
+        return XMLSchema(xmlschema).validate(self)
+
+    def xinclude(self):
+        """xinclude(self)
+
+        Process the XInclude nodes in this document and include the
+        referenced XML fragments.
+
+        There is support for loading files through the file system, HTTP and
+        FTP.
+
+        Note that XInclude does not support custom resolvers in Python space
+        due to restrictions of libxml2 <= 2.6.29.
+        """
+        self._assertHasRoot()
+        # the processor is applied to the context node of this tree
+        processor = XInclude()
+        processor(self._context_node)
+
+    def write_c14n(self, file, *, bint exclusive=False, bint with_comments=True,
+                   compression=0, inclusive_ns_prefixes=None):
+        """write_c14n(self, file, exclusive=False, with_comments=True,
+                       compression=0, inclusive_ns_prefixes=None)
+
+        C14N write of document. Always writes UTF-8.
+
+        The ``compression`` option enables GZip compression level 1-9.
+        ``None`` and negative values are treated as 0.
+
+        The ``inclusive_ns_prefixes`` should be a list of namespace strings
+        (i.e. ['xs', 'xsi']) that will be promoted to the top-level element
+        during exclusive C14N serialisation.  This parameter is ignored if
+        exclusive mode=False.
+
+        If exclusive=True and no list is provided, a namespace will only be
+        rendered if it is used by the immediate parent or one of its attributes
+        and its prefix and values have not already been rendered by an ancestor
+        of the namespace node's parent element.
+
+        NOTE: This method is deprecated as of lxml 4.4 and will be removed in a
+        future release.  Use ``.write(f, method="c14n")`` instead.
+        """
+        cdef _Element root
+        self._assertHasRoot()
+        root = self._context_node
+        _assertValidNode(root)
+        # normalise "no compression" requests to level 0
+        level = 0 if (compression is None or compression < 0) else compression
+        _tofilelikeC14N(file, root, exclusive, with_comments,
+                        level, inclusive_ns_prefixes)
+
+cdef _ElementTree _elementTreeFactory(_Document doc, _Element context_node):
+    # Convenience wrapper: builds the tree wrapper with the plain
+    # _ElementTree class as base class.
+    cdef _ElementTree wrapper = _newElementTree(doc, context_node, _ElementTree)
+    return wrapper
+
+cdef _ElementTree _newElementTree(_Document doc, _Element context_node,
+                                  object baseclass):
+    # Instantiate 'baseclass' and attach either a context node or, when no
+    # node is available, the bare document.
+    cdef _ElementTree et = baseclass()
+    if context_node is None and doc is not None:
+        # fall back to the root element of the document
+        context_node = doc.getroot()
+    if context_node is not None:
+        _assertValidNode(context_node)
+    else:
+        # no element at all: the tree only references the document
+        _assertValidDoc(doc)
+        et._doc = doc
+    et._context_node = context_node
+    return et
+
+
+@cython.final
+@cython.freelist(16)
+cdef class _Attrib:
+    """A dict-like proxy for the ``Element.attrib`` property.
+
+    Each operation validates the wrapped element first and then works
+    directly on the attributes of its underlying node.
+    """
+    cdef _Element _element
+
+    def __cinit__(self, _Element element not None):
+        _assertValidNode(element)
+        self._element = element
+
+    # MANIPULATORS
+    def __setitem__(self, key, value):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        _setAttributeValue(owner, key, value)
+
+    def __delitem__(self, key):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        _delAttribute(owner, key)
+
+    def update(self, sequence_or_dict):
+        """Set attributes from a dict, an _Attrib or an iterable of
+        (key, value) pairs.
+        """
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        pairs = sequence_or_dict
+        if isinstance(pairs, (dict, _Attrib)):
+            pairs = pairs.items()
+        for name, content in pairs:
+            _setAttributeValue(owner, name, content)
+
+    def pop(self, key, *default):
+        """Remove the attribute and return its value.  If it is missing,
+        return the given default or raise KeyError when none was passed.
+        """
+        if len(default) > 1:
+            raise TypeError, f"pop expected at most 2 arguments, got {len(default)+1}"
+        _assertValidNode(self._element)
+        value = _getAttributeValue(self._element, key, None)
+        if value is not None:
+            _delAttribute(self._element, key)
+            return value
+        if default:
+            return default[0]
+        raise KeyError, key
+
+    def clear(self):
+        """Remove all attributes from the element."""
+        cdef xmlAttr* c_attrs
+        _assertValidNode(self._element)
+        c_attrs = self._element._c_node.properties
+        if c_attrs is not NULL:
+            # detach the list from the node before freeing it
+            self._element._c_node.properties = NULL
+            tree.xmlFreePropList(c_attrs)
+
+    # ACCESSORS
+    def __repr__(self):
+        _assertValidNode(self._element)
+        snapshot = dict(_collectAttributes(self._element._c_node, 3))
+        return repr(snapshot)
+
+    def __copy__(self):
+        # copies are plain dicts, not proxies
+        _assertValidNode(self._element)
+        snapshot = dict(_collectAttributes(self._element._c_node, 3))
+        return snapshot
+
+    def __deepcopy__(self, memo):
+        # copies are plain dicts, not proxies
+        _assertValidNode(self._element)
+        snapshot = dict(_collectAttributes(self._element._c_node, 3))
+        return snapshot
+
+    def __getitem__(self, key):
+        _assertValidNode(self._element)
+        value = _getAttributeValue(self._element, key, None)
+        if value is None:
+            raise KeyError, key
+        return value
+
+    def __bool__(self):
+        cdef xmlAttr* c_attr
+        _assertValidNode(self._element)
+        c_attr = self._element._c_node.properties
+        # skip entries of the property list that are not attribute nodes
+        while c_attr is not NULL and c_attr.type != tree.XML_ATTRIBUTE_NODE:
+            c_attr = c_attr.next
+        return 1 if c_attr is not NULL else 0
+
+    def __len__(self):
+        cdef xmlAttr* c_attr
+        cdef Py_ssize_t total = 0
+        _assertValidNode(self._element)
+        c_attr = self._element._c_node.properties
+        while c_attr is not NULL:
+            if c_attr.type == tree.XML_ATTRIBUTE_NODE:
+                total += 1
+            c_attr = c_attr.next
+        return total
+
+    def get(self, key, default=None):
+        """Return the attribute value, or ``default`` if it is not set."""
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        return _getAttributeValue(owner, key, default)
+
+    # The second argument of _collectAttributes() selects the result:
+    # 1 is used for keys, 2 for values and 3 for (key, value) items.
+    def keys(self):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        return _collectAttributes(owner._c_node, 1)
+
+    def __iter__(self):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        return iter(_collectAttributes(owner._c_node, 1))
+
+    def iterkeys(self):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        return iter(_collectAttributes(owner._c_node, 1))
+
+    def values(self):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        return _collectAttributes(owner._c_node, 2)
+
+    def itervalues(self):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        return iter(_collectAttributes(owner._c_node, 2))
+
+    def items(self):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        return _collectAttributes(owner._c_node, 3)
+
+    def iteritems(self):
+        cdef _Element owner = self._element
+        _assertValidNode(owner)
+        return iter(_collectAttributes(owner._c_node, 3))
+
+    def has_key(self, key):
+        """Same as ``key in attrib``."""
+        _assertValidNode(self._element)
+        return key in self
+
+    def __contains__(self, key):
+        cdef xmlNode* c_node
+        cdef const_xmlChar* c_href = NULL
+        _assertValidNode(self._element)
+        ns, tag = _getNsTag(key)
+        c_node = self._element._c_node
+        if ns is not None:
+            c_href = _xcstr(ns)
+        return 1 if tree.xmlHasNsProp(c_node, _xcstr(tag), c_href) else 0
+
+    def __richcmp__(self, other, int op):
+        # compare as plain dicts; anything that cannot be turned into a
+        # dict is left to the other operand
+        try:
+            mine = dict(self.items())
+            theirs = other if isinstance(other, dict) else dict(other)
+        except (TypeError, ValueError):
+            return NotImplemented
+        return python.PyObject_RichCompare(mine, theirs, op)
+
+MutableMapping.register(_Attrib)
+
+
+@cython.final
+@cython.internal
+cdef class _AttribIterator:
+    """Attribute iterator - for internal use only!
+
+    Walks the property list of an element and yields keys, values or
+    (key, value) items, depending on ``_keysvalues``.
+    """
+    # XML attributes must not be removed while running!
+    cdef _Element _node
+    cdef xmlAttr* _c_attr
+    cdef int _keysvalues # 1 - keys, 2 - values, 3 - items (key, value)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        cdef xmlAttr* c_current
+        if self._node is None:
+            raise StopIteration
+        c_current = self._c_attr
+        # advance to the next real attribute node, if any
+        while c_current is not NULL:
+            if c_current.type == tree.XML_ATTRIBUTE_NODE:
+                break
+            c_current = c_current.next
+        if c_current is NULL:
+            # exhausted: drop the element reference
+            self._node = None
+            raise StopIteration
+
+        self._c_attr = c_current.next
+        if self._keysvalues == 2:
+            return _attributeValue(self._node._c_node, c_current)
+        key = _namespacedName(<xmlNode*>c_current)
+        if self._keysvalues == 1:
+            return key
+        return (key, _attributeValue(self._node._c_node, c_current))
+
+cdef object _attributeIteratorFactory(_Element element, int keysvalues):
+    # Returns the shared empty iterator for elements without properties,
+    # otherwise an _AttribIterator positioned on the first property.
+    cdef _AttribIterator iterator
+    cdef xmlAttr* c_first = element._c_node.properties
+    if c_first is NULL:
+        return ITER_EMPTY
+    iterator = _AttribIterator()
+    iterator._node = element
+    iterator._c_attr = c_first
+    iterator._keysvalues = keysvalues
+    return iterator
+
+
+cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher,
+                                      type LxmlElementTagMatcherType ]:
+    """
+    Dead but public. :)
+    """
+    cdef object _pystrings
+    cdef int _node_type
+    cdef char* _href
+    cdef char* _name
+
+    cdef _initTagMatch(self, tag):
+        # Translate 'tag' into a node type plus optional namespace/name
+        # pointers.  A node type of 0 is used when no tag was given.
+        self._href = NULL
+        self._name = NULL
+        if tag is None:
+            self._node_type = 0
+        elif tag is Comment:
+            self._node_type = tree.XML_COMMENT_NODE
+        elif tag is ProcessingInstruction:
+            self._node_type = tree.XML_PI_NODE
+        elif tag is Entity:
+            self._node_type = tree.XML_ENTITY_REF_NODE
+        else:
+            # the Element factory and tag names both select element nodes
+            self._node_type = tree.XML_ELEMENT_NODE
+            if tag is not Element:
+                # keep the parsed (namespace, name) pair referenced, the
+                # C pointers below point into it
+                self._pystrings = _getNsTag(tag)
+                if self._pystrings[0] is not None:
+                    self._href = _cstr(self._pystrings[0])
+                self._name = _cstr(self._pystrings[1])
+                if self._name[0] == c'*' and self._name[1] == c'\0':
+                    # a plain '*' name is stored as "no name restriction"
+                    self._name = NULL
+
+cdef public class _ElementIterator(_ElementTagMatcher) [
+    object LxmlElementIterator, type LxmlElementIteratorType ]:
+    """
+    Dead but public. :)
+    """
+    # we keep Python references here to control GC
+    cdef _Element _node
+    cdef _node_to_node_function _next_element
+
+    def __iter__(self):
+        return self
+
+    cdef void _storeNext(self, _Element node):
+        # Find the next node after 'node' that passes the tag filter and
+        # remember it (or None) as the upcoming result.
+        cdef xmlNode* c_next = self._next_element(node._c_node)
+        if self._node_type != 0:
+            # a filter is active: skip nodes of another type or tag
+            while c_next is not NULL and (
+                    <tree.xmlElementType>self._node_type != c_next.type or
+                    not _tagMatches(c_next, <const_xmlChar*>self._href,
+                                    <const_xmlChar*>self._name)):
+                c_next = self._next_element(c_next)
+        # Python ref:
+        self._node = _elementFactory(node._doc, c_next) if c_next is not NULL else None
+
+    def __next__(self):
+        # Python ref:
+        cdef _Element current = self._node
+        if current is None:
+            raise StopIteration
+        self._storeNext(current)
+        return current
+
+@cython.final
+@cython.internal
+cdef class _MultiTagMatcher:
+ """
+ Match an xmlNode against a list of tags.
+
+ Tags are first collected on the Python side (``initTagMatch()``) and
+ then mapped into a C array for a specific document (``cacheTags()``),
+ which the ``matches*()`` methods read.
+ """
+ # (href, name) tuples collected by _storeTags()
+ cdef list _py_tags
+ # C array filled by _mapTagsToQnameMatchArray() in cacheTags()
+ cdef qname* _cached_tags
+ # number of entries of _cached_tags that are currently in use
+ cdef size_t _tag_count
+ # xmlDictSize() of the cached document at caching time
+ cdef size_t _cached_size
+ cdef _Document _cached_doc
+ # bit mask of accepted node types, one bit (1 << node_type) per type
+ cdef int _node_types
+
+ def __cinit__(self, tags):
+ self._py_tags = []
+ self.initTagMatch(tags)
+
+ def __dealloc__(self):
+ self._clear()
+
+ cdef bint rejectsAll(self) noexcept:
+ return not self._tag_count and not self._node_types
+
+ cdef bint rejectsAllAttributes(self) noexcept:
+ return not self._tag_count
+
+ cdef bint matchesType(self, int node_type) noexcept:
+ if node_type == tree.XML_ELEMENT_NODE and self._tag_count:
+ return True
+ return self._node_types & (1 << node_type)
+
+ cdef void _clear(self) noexcept:
+ # release the href references of all cached entries and free the array
+ cdef size_t i, count
+ count = self._tag_count
+ self._tag_count = 0
+ if self._cached_tags:
+ for i in range(count):
+ cpython.ref.Py_XDECREF(self._cached_tags[i].href)
+ python.lxml_free(self._cached_tags)
+ self._cached_tags = NULL
+
+ cdef initTagMatch(self, tags):
+ # drop all previous state before storing the new tag selection
+ self._cached_doc = None
+ del self._py_tags[:]
+ self._clear()
+ if tags is None or tags == ():
+ # no selection in tags argument => match anything
+ self._node_types = (
+ 1 << tree.XML_COMMENT_NODE |
+ 1 << tree.XML_PI_NODE |
+ 1 << tree.XML_ENTITY_REF_NODE |
+ 1 << tree.XML_ELEMENT_NODE)
+ else:
+ self._node_types = 0
+ self._storeTags(tags, set())
+
+ cdef _storeTags(self, tag, set seen):
+ # factory functions select a node type, strings and QNames select
+ # tag names, anything else is iterated recursively
+ if tag is Comment:
+ self._node_types |= 1 << tree.XML_COMMENT_NODE
+ elif tag is ProcessingInstruction:
+ self._node_types |= 1 << tree.XML_PI_NODE
+ elif tag is Entity:
+ self._node_types |= 1 << tree.XML_ENTITY_REF_NODE
+ elif tag is Element:
+ self._node_types |= 1 << tree.XML_ELEMENT_NODE
+ elif python._isString(tag):
+ # 'seen' prevents storing the same tag string twice
+ if tag in seen:
+ return
+ seen.add(tag)
+ if tag in ('*', '{*}*'):
+ self._node_types |= 1 << tree.XML_ELEMENT_NODE
+ else:
+ href, name = _getNsTag(tag)
+ if name == b'*':
+ name = None
+ if href is None:
+ href = b'' # no namespace
+ elif href == b'*':
+ href = None # wildcard: any namespace, including none
+ self._py_tags.append((href, name))
+ elif isinstance(tag, QName):
+ self._storeTags(tag.text, seen)
+ else:
+ # support a sequence of tags
+ for item in tag:
+ self._storeTags(item, seen)
+
+ cdef inline int cacheTags(self, _Document doc, bint force_into_dict=False) except -1:
+ """
+ Look up the tag names in the doc dict to enable string pointer comparisons.
+
+ Returns 0 on success and raises MemoryError if the array cannot
+ be allocated.
+ """
+ cdef size_t dict_size = tree.xmlDictSize(doc._c_doc.dict)
+ if doc is self._cached_doc and dict_size == self._cached_size:
+ # doc and dict didn't change => names already cached
+ return 0
+ self._tag_count = 0
+ if not self._py_tags:
+ # no tag names to map, only remember the cache state
+ self._cached_doc = doc
+ self._cached_size = dict_size
+ return 0
+ if not self._cached_tags:
+ # allocate one entry per collected tag
+ self._cached_tags = <qname*>python.lxml_malloc(len(self._py_tags), sizeof(qname))
+ if not self._cached_tags:
+ self._cached_doc = None
+ raise MemoryError()
+ self._tag_count = <size_t>_mapTagsToQnameMatchArray(
+ doc._c_doc, self._py_tags, self._cached_tags, force_into_dict)
+ self._cached_doc = doc
+ self._cached_size = dict_size
+ return 0
+
+ cdef inline bint matches(self, xmlNode* c_node) noexcept:
+ # accept by node type first, then by tag name for element nodes
+ cdef qname* c_qname
+ if self._node_types & (1 << c_node.type):
+ return True
+ elif c_node.type == tree.XML_ELEMENT_NODE:
+ for c_qname in self._cached_tags[:self._tag_count]:
+ if _tagMatchesExactly(c_node, c_qname):
+ return True
+ return False
+
+ cdef inline bint matchesNsTag(self, const_xmlChar* c_href,
+ const_xmlChar* c_name) noexcept:
+ # same as matches(), but for a namespace/name pair instead of a node
+ cdef qname* c_qname
+ if self._node_types & (1 << tree.XML_ELEMENT_NODE):
+ return True
+ for c_qname in self._cached_tags[:self._tag_count]:
+ if _nsTagMatchesExactly(c_href, c_name, c_qname):
+ return True
+ return False
+
+ cdef inline bint matchesAttribute(self, xmlAttr* c_attr) noexcept:
+ """Attribute matches differ from Element matches in that they do
+ not care about node types.
+ """
+ cdef qname* c_qname
+ for c_qname in self._cached_tags[:self._tag_count]:
+ if _tagMatchesExactly(<xmlNode*>c_attr, c_qname):
+ return True
+ return False
+
+cdef class _ElementMatchIterator:
+    # Base class of the tag filtering iterators below.  Subclasses choose
+    # the stepping function (_next_element) and the first node.
+    cdef _Element _node
+    cdef _node_to_node_function _next_element
+    cdef _MultiTagMatcher _matcher
+
+    @cython.final
+    cdef _initTagMatcher(self, tags):
+        self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tags)
+
+    def __iter__(self):
+        return self
+
+    @cython.final
+    cdef int _storeNext(self, _Element node) except -1:
+        # Step from 'node' to the next matching node and remember it
+        # (or None) as the upcoming result.
+        cdef xmlNode* c_candidate
+        self._matcher.cacheTags(node._doc)
+        c_candidate = self._next_element(node._c_node)
+        while c_candidate is not NULL:
+            if self._matcher.matches(c_candidate):
+                break
+            c_candidate = self._next_element(c_candidate)
+        if c_candidate is NULL:
+            self._node = None
+        else:
+            # store Python ref to next node to make sure it's kept alive
+            self._node = _elementFactory(node._doc, c_candidate)
+        return 0
+
+    def __next__(self):
+        cdef _Element found = self._node
+        if found is None:
+            raise StopIteration
+        # look ahead before handing out the current node
+        self._storeNext(found)
+        return found
+
+cdef class ElementChildIterator(_ElementMatchIterator):
+    """ElementChildIterator(self, node, tag=None, reversed=False)
+    Iterates over the children of an element.
+
+    Pass ``reversed=True`` to start at the last child and walk backwards.
+    """
+    def __cinit__(self, _Element node not None, tag=None, *, bint reversed=False):
+        cdef xmlNode* c_child
+        _assertValidNode(node)
+        self._initTagMatcher(tag)
+        if not reversed:
+            c_child = _findChildForwards(node._c_node, 0)
+            self._next_element = _nextElement
+        else:
+            c_child = _findChildBackwards(node._c_node, 0)
+            self._next_element = _previousElement
+        self._matcher.cacheTags(node._doc)
+        # move on to the first child that passes the tag filter
+        while c_child is not NULL:
+            if self._matcher.matches(c_child):
+                break
+            c_child = self._next_element(c_child)
+        if c_child is NULL:
+            self._node = None
+        else:
+            # store Python ref to next node to make sure it's kept alive
+            self._node = _elementFactory(node._doc, c_child)
+
+cdef class SiblingsIterator(_ElementMatchIterator):
+    """SiblingsIterator(self, node, tag=None, preceding=False)
+    Iterates over the siblings of an element.
+
+    You can pass the boolean keyword ``preceding`` to specify the direction.
+    The start element itself is not returned.
+    """
+    def __cinit__(self, _Element node not None, tag=None, *, bint preceding=False):
+        _assertValidNode(node)
+        self._initTagMatcher(tag)
+        if not preceding:
+            self._next_element = _nextElement
+        else:
+            self._next_element = _previousElement
+        # position on the first matching sibling after/before 'node'
+        self._storeNext(node)
+
+cdef class AncestorsIterator(_ElementMatchIterator):
+    """AncestorsIterator(self, node, tag=None)
+    Iterates over the ancestors of an element (from parent to parent).
+
+    The start element itself is not returned.
+    """
+    def __cinit__(self, _Element node not None, tag=None):
+        _assertValidNode(node)
+        self._next_element = _parentElement
+        self._initTagMatcher(tag)
+        # position on the first matching ancestor of 'node'
+        self._storeNext(node)
+
+cdef class ElementDepthFirstIterator:
+ """ElementDepthFirstIterator(self, node, tag=None, inclusive=True)
+ Iterates over an element and its sub-elements in document order (depth
+ first pre-order).
+
+ Note that this also includes comments, entities and processing
+ instructions. To filter them out, check if the ``tag`` property
+ of the returned element is a string (i.e. not None and not a
+ factory function), or pass the ``Element`` factory for the ``tag``
+ argument to receive only Elements.
+
+ If the optional ``tag`` argument is not None, the iterator returns only
+ the elements that match the respective name and namespace.
+
+ The optional boolean argument 'inclusive' defaults to True and can be set
+ to False to exclude the start element itself.
+
+ Note that the behaviour of this iterator is completely undefined if the
+ tree it traverses is modified during iteration.
+ """
+ # we keep Python references here to control GC
+ # keep the next Element after the one we return, and the (s)top node
+ cdef _Element _next_node
+ cdef _Element _top_node
+ cdef _MultiTagMatcher _matcher
+ def __cinit__(self, _Element node not None, tag=None, *, bint inclusive=True):
+ _assertValidNode(node)
+ self._top_node = node
+ self._next_node = node
+ self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
+ self._matcher.cacheTags(node._doc)
+ # skip the start node if it is excluded or does not match the filter
+ if not inclusive or not self._matcher.matches(node._c_node):
+ # find start node (this cannot raise StopIteration, self._next_node != None)
+ next(self)
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ # returns the stored node and looks up its successor for the next call
+ cdef xmlNode* c_node
+ cdef _Element current_node = self._next_node
+ if current_node is None:
+ raise StopIteration
+ c_node = current_node._c_node
+ self._matcher.cacheTags(current_node._doc)
+ if not self._matcher._tag_count:
+ # no tag name was found in the dict => not in document either
+ # try to match by node type
+ c_node = self._nextNodeAnyTag(c_node)
+ else:
+ c_node = self._nextNodeMatchTag(c_node)
+ if c_node is NULL:
+ self._next_node = None
+ else:
+ self._next_node = _elementFactory(current_node._doc, c_node)
+ return current_node
+
+ @cython.final
+ cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node) noexcept:
+ # node type only search; returns NULL if no node type is accepted
+ # or no further node of an accepted type is found
+ cdef int node_types = self._matcher._node_types
+ if not node_types:
+ return NULL
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
+ if node_types & (1 << c_node.type):
+ return c_node
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+ return NULL
+
+ @cython.final
+ cdef xmlNode* _nextNodeMatchTag(self, xmlNode* c_node) noexcept:
+ # full matcher search; returns NULL if no further node matches
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
+ if self._matcher.matches(c_node):
+ return c_node
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+ return NULL
+
+
+cdef class ElementTextIterator:
+    """ElementTextIterator(self, element, tag=None, with_tail=True)
+    Iterates over the text content of a subtree.
+
+    You can pass the ``tag`` keyword argument to restrict text content to a
+    specific tag name.
+
+    You can set the ``with_tail`` keyword argument to ``False`` to skip over
+    tail text (e.g. if you know that it's only whitespace from pretty-printing).
+    """
+    cdef object _events
+    cdef _Element _start_element
+
+    def __cinit__(self, _Element element not None, tag=None, *, bint with_tail=True):
+        _assertValidNode(element)
+        # tail text is picked up on the non-"start" events
+        wanted = ("start", "comment", "pi", "end") if with_tail else ("start",)
+        self._start_element = element
+        self._events = iterwalk(element, events=wanted, tag=tag)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        cdef _Element element
+        while True:
+            event, element = next(self._events) # raises StopIteration
+            if event == "start":
+                text = element.text
+            elif element is not self._start_element:
+                text = element.tail
+            else:
+                # the tail of the start element is never returned
+                continue
+            if text is not None:
+                return text
+
+
+cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL:
+    # Create a new element node named 'name_utf' for 'c_doc' using
+    # xmlNewDocNode().  Raises MemoryError if no node could be created.
+    cdef xmlNode* c_node
+    c_node = tree.xmlNewDocNode(c_doc, NULL, _xcstr(name_utf), NULL)
+    if c_node is NULL:
+        # 'except NULL' declares NULL as "exception raised", so returning a
+        # bare NULL without an exception set must be avoided
+        raise MemoryError()
+    return c_node
+
+cdef xmlNode* _createComment(xmlDoc* c_doc, const_xmlChar* text) noexcept:
+    # Thin wrapper around xmlNewDocComment(); the result is returned as is.
+    return tree.xmlNewDocComment(c_doc, text)
+
+cdef xmlNode* _createPI(xmlDoc* c_doc, const_xmlChar* target, const_xmlChar* text) noexcept:
+    # Thin wrapper around xmlNewDocPI(); the result is returned as is.
+    return tree.xmlNewDocPI(c_doc, target, text)
+
+cdef xmlNode* _createEntity(xmlDoc* c_doc, const_xmlChar* name) noexcept:
+    # Thin wrapper around xmlNewReference(); the result is returned as is.
+    return tree.xmlNewReference(c_doc, name)
+
+# module-level API for ElementTree
+
+def Element(_tag, attrib=None, nsmap=None, **_extra):
+    """Element(_tag, attrib=None, nsmap=None, **_extra)
+
+    Element factory.  Creates and returns a new object implementing the
+    Element interface.
+
+    Also look at the `_Element.makeelement()` and
+    `_BaseParser.makeelement()` methods, which provide a faster way to
+    create an Element within a specific document or parser context.
+    """
+    # no existing C document, document, parser, text or tail is passed
+    new_element = _makeElement(
+        _tag, NULL, None, None, None, None, attrib, nsmap, _extra)
+    return new_element
+
+
+def Comment(text=None):
+    """Comment(text=None)
+
+    Comment element factory. This factory function creates a special element that will
+    be serialized as an XML comment.
+
+    Raises ValueError if the text contains '--' or ends with '-'.
+    Raises MemoryError if the comment node cannot be created.
+    """
+    cdef _Document doc
+    cdef xmlNode* c_node
+    cdef xmlDoc* c_doc
+
+    if text is None:
+        text = b''
+    else:
+        text = _utf8(text)
+        if b'--' in text or text.endswith(b'-'):
+            raise ValueError("Comment may not contain '--' or end with '-'")
+
+    # the comment lives in a fresh document of its own
+    c_doc = _newXMLDoc()
+    doc = _documentFactory(c_doc, None)
+    c_node = _createComment(c_doc, _xcstr(text))
+    if c_node is NULL:
+        # _createComment() is 'noexcept', so a failure shows up only as NULL
+        raise MemoryError()
+    tree.xmlAddChild(<xmlNode*>c_doc, c_node)
+    return _elementFactory(doc, c_node)
+
+
+def ProcessingInstruction(target, text=None):
+    """ProcessingInstruction(target, text=None)
+
+    ProcessingInstruction element factory. This factory function creates a
+    special element that will be serialized as an XML processing instruction.
+
+    Raises ValueError if the target is 'xml' (in any letter case) or if
+    the text contains '?>'.
+    Raises MemoryError if the processing instruction node cannot be created.
+    """
+    cdef _Document doc
+    cdef xmlNode* c_node
+    cdef xmlDoc* c_doc
+
+    target = _utf8(target)
+    _tagValidOrRaise(target)
+    if target.lower() == b'xml':
+        # 'target' is a byte string here; decode it so that the message
+        # shows the plain name instead of a bytes literal
+        pi_name = target.decode('utf8')
+        raise ValueError, f"Invalid PI name '{pi_name}'"
+
+    if text is None:
+        text = b''
+    else:
+        text = _utf8(text)
+        if b'?>' in text:
+            raise ValueError, "PI text must not contain '?>'"
+
+    # the processing instruction lives in a fresh document of its own
+    c_doc = _newXMLDoc()
+    doc = _documentFactory(c_doc, None)
+    c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
+    if c_node is NULL:
+        # _createPI() is 'noexcept', so a failure shows up only as NULL
+        raise MemoryError()
+    tree.xmlAddChild(<xmlNode*>c_doc, c_node)
+    return _elementFactory(doc, c_node)
+
+PI = ProcessingInstruction
+
+
+cdef class CDATA:
+    """CDATA(data)
+
+    Factory for CDATA content.  The created object is an opaque data
+    container that can be assigned as the text of an Element::
+
+        >>> el = Element('content')
+        >>> el.text = CDATA('a string')
+
+        >>> print(el.text)
+        a string
+        >>> print(tostring(el, encoding="unicode"))
+        <content><![CDATA[a string]]></content>
+    """
+    # The wrapped data, converted once at construction time.
+    cdef bytes _utf8_data
+
+    def __cinit__(self, data):
+        self._utf8_data = _utf8(data)
+
+
+def Entity(name):
+    """Entity(name)
+
+    Factory function for entities.  It creates a special element that is
+    serialized as an XML entity reference or, if the name starts with
+    ``#``, as a character reference.
+
+    Note that entities are not declared in the document automatically.
+    A document that uses entity references requires a DTD that defines
+    them.
+
+    Raises `ValueError` for invalid character references and for names
+    that are not valid XML names.
+    """
+    cdef _Document doc
+    cdef xmlNode* c_node
+    cdef xmlDoc* c_doc
+    # NOTE: 'name_utf' stays referenced for as long as 'c_name' is in use;
+    # presumably the C string points into its buffer, so do not merge
+    # these two lines.
+    name_utf = _utf8(name)
+    c_name = _xcstr(name_utf)
+    if c_name[0] != c'#':
+        if not _xmlNameIsValid(c_name):
+            raise ValueError, f"Invalid entity reference: '{name}'"
+    elif not _characterReferenceIsValid(c_name + 1):
+        # the check skips the leading '#'
+        raise ValueError, f"Invalid character reference: '{name}'"
+
+    # The entity becomes the child of a new document.
+    c_doc = _newXMLDoc()
+    doc = _documentFactory(c_doc, None)
+    c_node = _createEntity(c_doc, c_name)
+    tree.xmlAddChild(<xmlNode*>c_doc, c_node)
+    return _elementFactory(doc, c_node)
+
+
+def SubElement(_Element _parent not None, _tag,
+               attrib=None, nsmap=None, **_extra):
+    """SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)
+
+    Factory function for subelements.  It creates a new element instance
+    and appends it to the existing element ``_parent``, which must not
+    be None.
+    """
+    return _makeSubElement(
+        _parent, _tag, None, None, attrib, nsmap, _extra)
+
+
+def ElementTree(_Element element=None, *, file=None, _BaseParser parser=None):
+    """ElementTree(element=None, file=None, parser=None)
+
+    ElementTree wrapper class.
+
+    The arguments are considered in this order:
+
+    - If ``element`` is given, the tree wraps the document of that
+      element.  ``file`` and ``parser`` are not used in this case.
+    - Otherwise, if ``file`` is given, it is parsed using ``parser``.
+      If the parser has a target, the result of the parser target is
+      returned instead of an ElementTree.
+    - Otherwise, a new empty document is created for ``parser``.
+    """
+    cdef xmlDoc* c_doc
+    cdef _Document doc
+
+    if element is not None:
+        doc = element._doc
+    elif file is not None:
+        try:
+            doc = _parseDocument(file, parser, None)
+        except _TargetParserResult as result_container:
+            # A parser target produced its own result object.
+            return result_container.result
+    else:
+        c_doc = _newXMLDoc()
+        doc = _documentFactory(c_doc, parser)
+
+    return _elementTreeFactory(doc, element)
+
+
+def HTML(text, _BaseParser parser=None, *, base_url=None):
+    """HTML(text, parser=None, base_url=None)
+
+    Parses an HTML document from a string constant and returns its root
+    node (or the result returned by a parser target).  This function can
+    be used to embed "HTML literals" in Python code.
+
+    Pass a different ``HTMLParser`` as the ``parser`` keyword argument to
+    override the parser.  If no parser is passed, the current default
+    parser is used if it is an ``HTMLParser``, and the default HTML
+    parser otherwise.
+
+    The ``base_url`` keyword argument sets the original base URL of the
+    document.  This supports relative paths when looking up external
+    entities (DTD, XInclude, ...).
+    """
+    cdef _Document doc
+    cdef _BaseParser default_parser
+    if parser is None:
+        default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+        if isinstance(default_parser, HTMLParser):
+            parser = default_parser
+        else:
+            parser = __DEFAULT_HTML_PARSER
+    try:
+        doc = _parseMemoryDocument(text, base_url, parser)
+        return doc.getroot()
+    except _TargetParserResult as target_result:
+        return target_result.result
+
+
+def XML(text, _BaseParser parser=None, *, base_url=None):
+    """XML(text, parser=None, base_url=None)
+
+    Parses an XML document or fragment from a string constant and
+    returns its root node (or the result returned by a parser target).
+    This function can be used to embed "XML literals" in Python code,
+    like in
+
+       >>> root = XML("<root><test/></root>")
+       >>> print(root.tag)
+       root
+
+    Pass a different ``XMLParser`` as the ``parser`` keyword argument to
+    override the parser.  If no parser is passed, the current default
+    parser is used if it is an ``XMLParser``, and the default XML parser
+    otherwise.
+
+    The ``base_url`` keyword argument sets the original base URL of the
+    document.  This supports relative paths when looking up external
+    entities (DTD, XInclude, ...).
+    """
+    cdef _Document doc
+    cdef _BaseParser default_parser
+    if parser is None:
+        default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+        if isinstance(default_parser, XMLParser):
+            parser = default_parser
+        else:
+            parser = __DEFAULT_XML_PARSER
+    try:
+        doc = _parseMemoryDocument(text, base_url, parser)
+        return doc.getroot()
+    except _TargetParserResult as target_result:
+        return target_result.result
+
+
+def fromstring(text, _BaseParser parser=None, *, base_url=None):
+    """fromstring(text, parser=None, base_url=None)
+
+    Parses an XML document or fragment from a string and returns its
+    root node (or the result returned by a parser target).
+
+    Pass a parser as the ``parser`` keyword argument to use it instead
+    of the default parser.
+
+    The ``base_url`` keyword argument sets the original base URL of the
+    document.  This supports relative paths when looking up external
+    entities (DTD, XInclude, ...).
+    """
+    cdef _Document doc
+    try:
+        doc = _parseMemoryDocument(text, base_url, parser)
+        return doc.getroot()
+    except _TargetParserResult as target_result:
+        # the parser target provided the result itself
+        return target_result.result
+
+
+def fromstringlist(strings, _BaseParser parser=None):
+    """fromstringlist(strings, parser=None)
+
+    Parses an XML document from a sequence of strings.  Returns the
+    root node (or the result returned by a parser target).
+
+    The strings are fed into the parser one by one, followed by a call
+    to its ``close()`` method, the result of which is returned.
+
+    To override the default parser with a different parser you can pass it to
+    the ``parser`` keyword argument.
+
+    Raises `ValueError` if a single byte string or unicode string is
+    passed instead of a sequence of strings.
+    """
+    if isinstance(strings, (bytes, unicode)):
+        raise ValueError("passing a single string into fromstringlist() is not"
+                         " efficient, use fromstring() instead")
+    if parser is None:
+        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+    # Look up the bound method only once, outside of the loop.
+    feed = parser.feed
+    for data in strings:
+        feed(data)
+    return parser.close()
+
+
+def iselement(element):
+    """iselement(element)
+
+    Checks if an object appears to be a valid element object, i.e. an
+    element instance that refers to a tree node.
+    """
+    if not isinstance(element, _Element):
+        return False
+    return (<_Element>element)._c_node is not NULL
+
+
+def indent(tree, space="  ", *, Py_ssize_t level=0):
+    """indent(tree, space="  ", level=0)
+
+    Indent an XML document by inserting newlines and indentation space
+    after elements.
+
+    *tree* is the ElementTree or Element to modify.  The (root) element
+    itself will not be changed, but the tail text of all elements in its
+    subtree will be adapted.
+
+    *space* is the whitespace to insert for each indentation level, two
+    space characters by default.
+
+    *level* is the initial indentation level.  Setting this to a higher
+    value than 0 can be used for indenting subtrees that are more deeply
+    nested inside of a document.  A `ValueError` is raised for negative
+    values.
+    """
+    root = _rootNodeOrRaise(tree)
+    if level < 0:
+        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
+    if _hasChild(root._c_node):
+        space = _utf8(space)
+        # Indentation at the level of the root element.  Not called 'indent'
+        # in order to avoid shadowing this function's own name.
+        root_indentation = b"\n" + level * space
+        # The list seeds the cache of indentation strings for levels 0 and 1.
+        _indent_children(root._c_node, 1, space,
+                         [root_indentation, root_indentation + space])
+
+
+cdef int _indent_children(xmlNode* c_node, Py_ssize_t level, bytes one_space, list indentations) except -1:
+    # Indents the children of 'c_node', which are at nesting depth 'level'.
+    # 'indentations' caches one whitespace string per level and grows on
+    # demand, so that the strings can be reused for speed.
+    if level >= len(indentations):
+        indentations.append(indentations[-1] + one_space)
+
+    # The text of the parent opens the new indentation level for the
+    # first child, unless it has non-whitespace text.
+    inner_indentation = indentations[level]
+    if not _hasNonWhitespaceText(c_node):
+        _setNodeText(c_node, inner_indentation)
+
+    # Walk through the children and indent them recursively.
+    cdef xmlNode* c_child = _findChildForwards(c_node, 0)
+    while c_child is not NULL:
+        if _hasChild(c_child):
+            _indent_children(c_child, level+1, one_space, indentations)
+        c_next_child = _nextElement(c_child)
+        if not _hasNonWhitespaceTail(c_child):
+            if c_next_child is not NULL:
+                _setTailText(c_child, inner_indentation)
+            else:
+                # The tail of the last child dedents to the parent's level.
+                _setTailText(c_child, indentations[level-1])
+        c_child = c_next_child
+    return 0
+
+
+def dump(_Element elem not None, *, bint pretty_print=True, bint with_tail=True):
+    """dump(elem, pretty_print=True, with_tail=True)
+
+    Writes an element tree or element structure to sys.stdout.  Use this
+    function for debugging only.
+
+    A newline is appended to the output if ``pretty_print`` is disabled.
+    """
+    serialised = tostring(
+        elem, encoding='unicode',
+        pretty_print=pretty_print, with_tail=with_tail)
+    sys.stdout.write(serialised if pretty_print else serialised + '\n')
+
+
+def tostring(element_or_tree, *, encoding=None, method="xml",
+             xml_declaration=None, bint pretty_print=False, bint with_tail=True,
+             standalone=None, doctype=None,
+             # method='c14n'
+             bint exclusive=False, inclusive_ns_prefixes=None,
+             # method='c14n2'
+             bint with_comments=True, bint strip_text=False,
+             ):
+    """tostring(element_or_tree, encoding=None, method="xml",
+                xml_declaration=None, pretty_print=False, with_tail=True,
+                standalone=None, doctype=None,
+                exclusive=False, inclusive_ns_prefixes=None,
+                with_comments=True, strip_text=False,
+                )
+
+    Serialize an element (or an ElementTree) to an encoded string
+    representation of its XML tree.
+
+    By default, the output is ASCII encoded and has no XML declaration.
+    Use the keyword arguments 'encoding' (string) and 'xml_declaration'
+    (bool) to change this.  Note that choosing an encoding that is not
+    compatible with UTF-8 enables the declaration by default.
+
+    Passing the name ``'unicode'`` as encoding (or the ``str`` function
+    in Py3 or ``unicode`` in Py2) serialises to a Unicode string without
+    declaration.  The return value is then an unencoded unicode string
+    instead of a byte string.  Requesting an XML declaration together
+    with this encoding raises a `ValueError`.
+
+    The keyword argument 'pretty_print' (bool) enables formatted XML.
+
+    The keyword argument 'method' selects the output method: 'xml',
+    'html', plain 'text' (text content without tags), 'c14n' or 'c14n2'.
+    Default is 'xml'.
+
+    With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
+    ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
+    C14N, include comments, and list the inclusive prefixes respectively.
+
+    With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
+    ``strip_text`` options control the output of comments and text space
+    according to C14N 2.0.
+
+    Both C14N methods reject an explicit ``encoding`` and an enabled
+    ``xml_declaration`` with a `ValueError`.  Conversely, disabling
+    ``with_comments`` or enabling ``strip_text`` with any other method
+    raises a `ValueError` as well.
+
+    Passing a boolean value to the ``standalone`` option will output
+    an XML declaration with the corresponding ``standalone`` flag.
+
+    The ``doctype`` option allows passing in a plain string that will
+    be serialised before the XML tree.  Note that passing in non
+    well-formed content here will make the XML output non well-formed.
+    Also, an existing doctype in the document tree will not be removed
+    when serialising an ElementTree instance.
+
+    You can prevent the tail text of the element from being serialised
+    by passing the boolean ``with_tail`` option.  This has no impact
+    on the tail text of children, which will always be serialised.
+
+    A `TypeError` is raised if ``element_or_tree`` is neither an element
+    nor an ElementTree.
+    """
+    cdef bint write_declaration
+    cdef int is_standalone
+
+    # C14N serialisation
+    if method == 'c14n' or method == 'c14n2':
+        if encoding is not None:
+            raise ValueError("Cannot specify encoding with C14N")
+        if xml_declaration:
+            raise ValueError("Cannot enable XML declaration in C14N")
+        if method == 'c14n':
+            return _tostringC14N(
+                element_or_tree, exclusive, with_comments, inclusive_ns_prefixes)
+        # C14N 2.0 is written through a writer target into a bytes buffer.
+        out = BytesIO()
+        target = C14NWriterTarget(
+            utf8_writer(out).write,
+            with_comments=with_comments, strip_text=strip_text)
+        _tree_to_target(element_or_tree, target)
+        return out.getvalue()
+
+    # From here on, only the non-C14N methods remain.
+    if not with_comments:
+        raise ValueError("Can only discard comments in C14N serialisation")
+    if strip_text:
+        raise ValueError("Can only strip text in C14N 2.0 serialisation")
+
+    if encoding is unicode or (encoding is not None and encoding.lower() == 'unicode'):
+        if xml_declaration:
+            raise ValueError, \
+                "Serialisation to unicode must not request an XML declaration"
+        write_declaration = 0
+        encoding = unicode
+    elif xml_declaration is None:
+        # by default, write an XML declaration only for non-standard encodings
+        write_declaration = encoding is not None and encoding.upper() not in \
+            ('ASCII', 'UTF-8', 'UTF8', 'US-ASCII')
+    else:
+        write_declaration = xml_declaration
+
+    if encoding is None:
+        encoding = 'ASCII'
+
+    if standalone is None:
+        is_standalone = -1
+    else:
+        # Any explicit standalone value switches the declaration on.
+        write_declaration = 1
+        is_standalone = 1 if standalone else 0
+
+    # The sixth argument is passed as 0 for elements and as 1 for trees.
+    if isinstance(element_or_tree, _Element):
+        return _tostring(<_Element>element_or_tree, encoding, doctype, method,
+                         write_declaration, 0, pretty_print, with_tail,
+                         is_standalone)
+    if isinstance(element_or_tree, _ElementTree):
+        return _tostring((<_ElementTree>element_or_tree)._context_node,
+                         encoding, doctype, method, write_declaration, 1,
+                         pretty_print, with_tail, is_standalone)
+    raise TypeError, f"Type '{python._fqtypename(element_or_tree).decode('utf8')}' cannot be serialized."
+
+
+
+def tostringlist(element_or_tree, *args, **kwargs):
+    """tostringlist(element_or_tree, *args, **kwargs)
+
+    Serialize an element to an encoded string representation of its XML
+    tree and return it as a list of partial strings.
+
+    This function only exists for compatibility with ElementTree 1.3.
+    All arguments are passed on to ``tostring()``, and the returned list
+    contains nothing but its single result string.
+    """
+    serialised = tostring(element_or_tree, *args, **kwargs)
+    return [serialised]
+
+
+def tounicode(element_or_tree, *, method="xml", bint pretty_print=False,
+              bint with_tail=True, doctype=None):
+    """tounicode(element_or_tree, method="xml", pretty_print=False,
+                 with_tail=True, doctype=None)
+
+    Serialize an element to the Python unicode representation of its XML
+    tree.
+
+    :deprecated: use ``tostring(el, encoding='unicode')`` instead.
+
+    Note that the result does not carry an XML encoding declaration and is
+    therefore not necessarily suited for serialization to byte streams without
+    further treatment.
+
+    The boolean keyword argument 'pretty_print' enables formatted XML.
+
+    The keyword argument 'method' selects the output method: 'xml',
+    'html' or plain 'text'.
+
+    You can prevent the tail text of the element from being serialised
+    by passing the boolean ``with_tail`` option.  This has no impact
+    on the tail text of children, which will always be serialised.
+
+    A `TypeError` is raised if ``element_or_tree`` is neither an element
+    nor an ElementTree.
+    """
+    if isinstance(element_or_tree, _Element):
+        return _tostring(<_Element>element_or_tree, unicode, doctype, method,
+                         0, 0, pretty_print, with_tail, -1)
+    elif isinstance(element_or_tree, _ElementTree):
+        return _tostring((<_ElementTree>element_or_tree)._context_node,
+                         unicode, doctype, method, 0, 1, pretty_print,
+                         with_tail, -1)
+    else:
+        # Report the plain type name like tostring() does.  Formatting the
+        # type object itself would nest "<class '...'>" inside the quotes.
+        raise TypeError, f"Type '{python._fqtypename(element_or_tree).decode('utf8')}' cannot be serialized."
+
+
+def parse(source, _BaseParser parser=None, *, base_url=None):
+    """parse(source, parser=None, base_url=None)
+
+    Return an ElementTree object loaded with source elements.  The
+    default parser is used unless a parser is passed as second argument.
+
+    The ``source`` can be any of the following:
+
+    - a file name/path
+    - a file object
+    - a file-like object
+    - a URL using the HTTP or FTP protocol
+
+    Use the ``fromstring()`` function instead to parse from a string.
+
+    Note that parsing from a file path or URL is generally faster than
+    parsing from an open file object or file-like object.  Transparent
+    decompression from gzip compressed sources is supported (unless
+    explicitly disabled in libxml2).
+
+    The ``base_url`` keyword sets a URL for the document when parsing
+    from a file-like object.  This is needed when looking up external
+    entities (DTD, XInclude, ...) with relative paths.
+
+    If the parser has a target, the result of the parser target is
+    returned instead of an ElementTree.
+    """
+    cdef _Document doc
+    try:
+        doc = _parseDocument(source, parser, base_url)
+        return _elementTreeFactory(doc, None)
+    except _TargetParserResult as target_result:
+        return target_result.result
+
+
+def adopt_external_document(capsule, _BaseParser parser=None):
+    """adopt_external_document(capsule, parser=None)
+
+    Unpack a libxml2 document pointer from a PyCapsule and wrap it in an
+    lxml ElementTree object.
+
+    External libraries can use this to build XML/HTML trees with libxml2
+    and then pass them efficiently into lxml for further processing.
+
+    A ``parser`` can be provided to configure the lxml document.  It is
+    not used for parsing anything.
+
+    The name of the capsule must be ``"libxml2:xmlDoc"`` and its pointer
+    value must reference a correct libxml2 document of type ``xmlDoc*``.
+    It is up to the creator of the capsule to correctly clean up the
+    document using an appropriate capsule destructor.  By default, the
+    libxml2 document is copied, so that lxml can safely own the memory
+    of the internal tree that it uses.
+
+    A non-NULL capsule context must point to a C string that can be
+    compared using ``strcmp()``.  If the context string equals
+    ``"destructor:xmlFreeDoc"``, the libxml2 document is not copied.
+    Instead, the capsule gets invalidated by clearing its destructor and
+    name.  lxml then takes ownership of the libxml2 document in memory
+    without creating a copy first, and the capsule destructor will not
+    be called.  Once the document is no longer used, lxml will eventually
+    clean it up using the libxml2 API function ``xmlFreeDoc()``.
+
+    If no copy is made, the tree should not be modified outside of lxml
+    any more after transferring the ownership.
+    """
+    # Set by the unpacking call below and passed on to the adoption step.
+    cdef bint is_owned = False
+    cdef xmlDoc* c_doc = <xmlDoc*> python.lxml_unpack_xmldoc_capsule(capsule, &is_owned)
+    doc = _adoptForeignDoc(c_doc, parser, is_owned)
+    return _elementTreeFactory(doc, None)
+
+
+################################################################################
+# Include submodules
+
+include "readonlytree.pxi" # Read-only implementation of Element proxies
+include "classlookup.pxi" # Element class lookup mechanisms
+include "nsclasses.pxi" # Namespace implementation and registry
+include "docloader.pxi" # Support for custom document loaders
+include "parser.pxi" # XML and HTML parsers
+include "saxparser.pxi" # SAX-like Parser interface and tree builder
+include "parsertarget.pxi" # ET Parser target
+include "serializer.pxi" # XML output functions
+include "iterparse.pxi" # incremental XML parsing
+include "xmlid.pxi" # XMLID and IDDict
+include "xinclude.pxi" # XInclude
+include "cleanup.pxi" # Cleanup and recursive element removal functions
+
+
+################################################################################
+# Include submodules for XPath and XSLT
+
+include "extensions.pxi" # XPath/XSLT extension functions
+include "xpath.pxi" # XPath evaluation
+include "xslt.pxi" # XSL transformations
+include "xsltext.pxi" # XSL extension elements
+
+
+################################################################################
+# Validation
+
+cdef class DocumentInvalid(LxmlError):
+    """Validation error.
+
+    Raised by all document validators when their ``assertValid(tree)``
+    method fails, i.e. when the validated document does not comply
+    with the schema of the validator.
+    """
+
+
+cdef class _Validator:
+    """Base class for XML validators.
+
+    Instances are expected to be callable with a document.  The methods
+    below call ``self(etree)`` to run the actual validation and read the
+    collected messages from the error log of the validator.
+    """
+    # Collects the messages of the validation runs.
+    cdef _ErrorLog _error_log
+    def __cinit__(self):
+        self._error_log = _ErrorLog()
+
+    def validate(self, etree):
+        """validate(self, etree)
+
+        Validate the document using this schema.
+
+        Returns true if document is valid, false if not.
+        """
+        return self(etree)
+
+    def assertValid(self, etree):
+        """assertValid(self, etree)
+
+        Raises `DocumentInvalid` if the document does not comply with the schema.
+        """
+        if not self(etree):
+            raise DocumentInvalid(self._error_log._buildExceptionMessage(
+                    "Document does not comply with schema"),
+                                  self._error_log)
+
+    def assert_(self, etree):
+        """assert_(self, etree)
+
+        Raises `AssertionError` if the document does not comply with the schema.
+        """
+        if not self(etree):
+            raise AssertionError, self._error_log._buildExceptionMessage(
+                "Document does not comply with schema")
+
+    cpdef _append_log_message(self, int domain, int type, int level, int line,
+                              message, filename):
+        # Adds a single message with the given details to the error log.
+        self._error_log._receiveGeneric(domain, type, level, line, message,
+                                        filename)
+
+    cpdef _clear_error_log(self):
+        # Clears the error log.
+        self._error_log.clear()
+
+    @property
+    def error_log(self):
+        """The log of validation errors and warnings."""
+        # The message names the validator; it previously referred to an
+        # XPath evaluator, apparently copied from elsewhere.
+        assert self._error_log is not None, "Validator not initialised"
+        return self._error_log.copy()
+
+include "dtd.pxi" # DTD
+include "relaxng.pxi" # RelaxNG
+include "xmlschema.pxi" # XMLSchema
+include "schematron.pxi" # Schematron (requires libxml2 2.6.21+)
+
+################################################################################
+# Public C API
+
+include "public-api.pxi"
+
+################################################################################
+# Other stuff
+
+include "debug.pxi"