two version of R2R are here HEAD master

author: S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/lxml/parser.pxi
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-master.tar.gz
1 files changed, 2000 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/lxml/parser.pxi b/.venv/lib/python3.12/site-packages/lxml/parser.pxi
new file mode 100644
index 00000000..70337d87
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/lxml/parser.pxi
@@ -0,0 +1,2000 @@
+# Parsers for XML and HTML
+
+from lxml.includes cimport xmlparser
+from lxml.includes cimport htmlparser
+
+
+class ParseError(LxmlSyntaxError):
+    """Syntax error while parsing an XML document.
+
+    For compatibility with ElementTree 1.3 and later.
+    """
+    def __init__(self, message, code, line, column, filename=None):
+        super(_ParseError, self).__init__(message)
+        self.lineno, self.offset = (line, column - 1)
+        self.code = code
+        self.filename = filename
+
+    @property
+    def position(self):
+        return self.lineno, self.offset + 1
+
+    @position.setter
+    def position(self, new_pos):
+        self.lineno, column = new_pos
+        self.offset = column - 1
+
+cdef object _ParseError = ParseError
+
+
+class XMLSyntaxError(ParseError):
+    """Syntax error while parsing an XML document.
+    """
+
+cdef class ParserError(LxmlError):
+    """Internal lxml parser error.
+    """
+
+
+@cython.final
+@cython.internal
+cdef class _ParserDictionaryContext:
+    # Global parser context to share the string dictionary.
+    #
+    # This class is a delegate singleton!
+    #
+    # It creates _ParserDictionaryContext objects for each thread to keep thread state,
+    # but those must never be used directly.  Always stick to using the static
+    # __GLOBAL_PARSER_CONTEXT as defined below the class.
+    #
+
+    cdef tree.xmlDict* _c_dict
+    cdef _BaseParser _default_parser
+    cdef list _implied_parser_contexts
+
+    def __cinit__(self):
+        self._c_dict = NULL
+        self._implied_parser_contexts = []
+
+    def __dealloc__(self):
+        if self._c_dict is not NULL:
+            xmlparser.xmlDictFree(self._c_dict)
+
+    cdef int initMainParserContext(self) except -1:
+        """Put the global context into the thread dictionary of the main
+        thread.  To be called once and only in the main thread."""
+        thread_dict = python.PyThreadState_GetDict()
+        if thread_dict is not NULL:
+            (<dict>thread_dict)["_ParserDictionaryContext"] = self
+
+    cdef _ParserDictionaryContext _findThreadParserContext(self):
+        "Find (or create) the _ParserDictionaryContext object for the current thread"
+        cdef _ParserDictionaryContext context
+        thread_dict = python.PyThreadState_GetDict()
+        if thread_dict is NULL:
+            return self
+        d = <dict>thread_dict
+        result = python.PyDict_GetItem(d, "_ParserDictionaryContext")
+        if result is not NULL:
+            return <object>result
+        context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
+        d["_ParserDictionaryContext"] = context
+        return context
+
+    cdef int setDefaultParser(self, _BaseParser parser) except -1:
+        "Set the default parser for the current thread"
+        cdef _ParserDictionaryContext context
+        context = self._findThreadParserContext()
+        context._default_parser = parser
+
+    cdef _BaseParser getDefaultParser(self):
+        "Return (or create) the default parser of the current thread"
+        cdef _ParserDictionaryContext context
+        context = self._findThreadParserContext()
+        if context._default_parser is None:
+            if self._default_parser is None:
+                self._default_parser = __DEFAULT_XML_PARSER._copy()
+            if context is not self:
+                context._default_parser = self._default_parser._copy()
+        return context._default_parser
+
+    cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
+        "Return the thread-local dict or create a new one if necessary."
+        cdef _ParserDictionaryContext context
+        context = self._findThreadParserContext()
+        if context._c_dict is NULL:
+            # thread dict not yet set up => use default or create a new one
+            if default is not NULL:
+                context._c_dict = default
+                xmlparser.xmlDictReference(default)
+                return default
+            if self._c_dict is NULL:
+                self._c_dict = xmlparser.xmlDictCreate()
+            if context is not self:
+                context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
+        return context._c_dict
+
+    cdef int initThreadDictRef(self, tree.xmlDict** c_dict_ref) except -1:
+        c_dict = c_dict_ref[0]
+        c_thread_dict = self._getThreadDict(c_dict)
+        if c_dict is c_thread_dict:
+            return 0
+        if c_dict is not NULL:
+            xmlparser.xmlDictFree(c_dict)
+        c_dict_ref[0] = c_thread_dict
+        xmlparser.xmlDictReference(c_thread_dict)
+
+    cdef int initParserDict(self, xmlparser.xmlParserCtxt* pctxt) except -1:
+        "Assure we always use the same string dictionary."
+        self.initThreadDictRef(&pctxt.dict)
+        pctxt.dictNames = 1
+
+    cdef int initXPathParserDict(self, xpath.xmlXPathContext* pctxt) except -1:
+        "Assure we always use the same string dictionary."
+        self.initThreadDictRef(&pctxt.dict)
+
+    cdef int initDocDict(self, xmlDoc* result) except -1:
+        "Store dict of last object parsed if no shared dict yet"
+        # XXX We also free the result dict here if there already was one.
+        # This case should only occur for new documents with empty dicts,
+        # otherwise we'd free data that's in use => segfault
+        self.initThreadDictRef(&result.dict)
+
+    cdef _ParserContext findImpliedContext(self):
+        """Return any current implied xml parser context for the current
+        thread.  This is used when the resolver functions are called
+        with an xmlParserCtxt that was generated from within libxml2
+        (i.e. without a _ParserContext) - which happens when parsing
+        schema and xinclude external references."""
+        cdef _ParserDictionaryContext context
+        cdef _ParserContext implied_context
+
+        # see if we have a current implied parser
+        context = self._findThreadParserContext()
+        if context._implied_parser_contexts:
+            implied_context = context._implied_parser_contexts[-1]
+            return implied_context
+        return None
+
+    cdef int pushImpliedContextFromParser(self, _BaseParser parser) except -1:
+        "Push a new implied context object taken from the parser."
+        if parser is not None:
+            self.pushImpliedContext(parser._getParserContext())
+        else:
+            self.pushImpliedContext(None)
+
+    cdef int pushImpliedContext(self, _ParserContext parser_context) except -1:
+        "Push a new implied context object."
+        cdef _ParserDictionaryContext context
+        context = self._findThreadParserContext()
+        context._implied_parser_contexts.append(parser_context)
+
+    cdef int popImpliedContext(self) except -1:
+        "Pop the current implied context object."
+        cdef _ParserDictionaryContext context
+        context = self._findThreadParserContext()
+        context._implied_parser_contexts.pop()
+
+cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
+__GLOBAL_PARSER_CONTEXT.initMainParserContext()
+
+############################################################
+## support for Python unicode I/O
+############################################################
+
+# name of Python Py_UNICODE encoding as known to libxml2
+cdef const_char* _PY_UNICODE_ENCODING = NULL
+
+cdef int _setupPythonUnicode() except -1:
+    """Sets _PY_UNICODE_ENCODING to the internal encoding name of Python unicode
+    strings if libxml2 supports reading native Python unicode.  This depends
+    on iconv and the local Python installation, so we simply check if we find
+    a matching encoding handler.
+    """
+    cdef tree.xmlCharEncodingHandler* enchandler
+    cdef Py_ssize_t l
+    cdef const_char* enc
+    cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
+    cdef const_xmlChar* buffer = <const_xmlChar*>uchars
+    # apparently, libxml2 can't detect UTF-16 on some systems
+    if (buffer[0] == c'<' and buffer[1] == c'\0' and
+            buffer[2] == c't' and buffer[3] == c'\0'):
+        enc = "UTF-16LE"
+    elif (buffer[0] == c'\0' and buffer[1] == c'<' and
+            buffer[2] == c'\0' and buffer[3] == c't'):
+        enc = "UTF-16BE"
+    else:
+        # let libxml2 give it a try
+        enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
+        if enc is NULL:
+            # not my fault, it's YOUR broken system :)
+            return 0
+    enchandler = tree.xmlFindCharEncodingHandler(enc)
+    if enchandler is not NULL:
+        global _PY_UNICODE_ENCODING
+        tree.xmlCharEncCloseFunc(enchandler)
+        _PY_UNICODE_ENCODING = enc
+    return 0
+
+cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
+    "Work around bug in libxml2: find iconv name of encoding on our own."
+    cdef tree.xmlCharEncoding enc
+    enc = tree.xmlDetectCharEncoding(buffer, size)
+    if enc == tree.XML_CHAR_ENCODING_UTF16LE:
+        if size >= 4 and (buffer[0] == <const_xmlChar> b'\xFF' and
+                          buffer[1] == <const_xmlChar> b'\xFE' and
+                          buffer[2] == 0 and buffer[3] == 0):
+            return "UTF-32LE"  # according to BOM
+        else:
+            return "UTF-16LE"
+    elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
+        return "UTF-16BE"
+    elif enc == tree.XML_CHAR_ENCODING_UCS4LE:
+        return "UCS-4LE"
+    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
+        return "UCS-4BE"
+    elif enc == tree.XML_CHAR_ENCODING_NONE:
+        return NULL
+    else:
+        # returns a constant char*, no need to free it
+        return tree.xmlGetCharEncodingName(enc)
+
+# Python 3.12 removed support for "Py_UNICODE".
+if python.PY_VERSION_HEX < 0x030C0000:
+    _setupPythonUnicode()
+
+
+cdef unicode _find_PyUCS4EncodingName():
+    """
+    Find a suitable encoding for Py_UCS4 PyUnicode strings in libxml2.
+    """
+    ustring = "<xml>\U0001F92A</xml>"
+    cdef const xmlChar* buffer = <const xmlChar*> python.PyUnicode_DATA(ustring)
+    cdef Py_ssize_t py_buffer_len = python.PyUnicode_GET_LENGTH(ustring)
+
+    encoding_name = ''
+    cdef tree.xmlCharEncoding enc = tree.xmlDetectCharEncoding(buffer, py_buffer_len)
+    enchandler = tree.xmlGetCharEncodingHandler(enc)
+    if enchandler is not NULL:
+        try:
+            if enchandler.name:
+                encoding_name = enchandler.name.decode('UTF-8')
+        finally:
+            tree.xmlCharEncCloseFunc(enchandler)
+    else:
+        c_name = tree.xmlGetCharEncodingName(enc)
+        if c_name:
+            encoding_name = c_name.decode('UTF-8')
+
+
+    if encoding_name and not encoding_name.endswith('LE') and not encoding_name.endswith('BE'):
+        encoding_name += 'BE' if python.PY_BIG_ENDIAN else 'LE'
+    return encoding_name or None
+
+_pyucs4_encoding_name = _find_PyUCS4EncodingName()
+
+
+############################################################
+## support for file-like objects
+############################################################
+
+@cython.final
+@cython.internal
+cdef class _FileReaderContext:
+    cdef object _filelike
+    cdef object _encoding
+    cdef object _url
+    cdef object _bytes
+    cdef _ExceptionContext _exc_context
+    cdef Py_ssize_t _bytes_read
+    cdef char* _c_url
+    cdef bint _close_file_after_read
+
+    def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
+        self._exc_context = exc_context
+        self._filelike = filelike
+        self._close_file_after_read = close_file
+        self._encoding = encoding
+        if url is None:
+            self._c_url = NULL
+        else:
+            url = _encodeFilename(url)
+            self._c_url = _cstr(url)
+        self._url = url
+        self._bytes  = b''
+        self._bytes_read = 0
+
+    cdef _close_file(self):
+        if self._filelike is None or not self._close_file_after_read:
+            return
+        try:
+            close = self._filelike.close
+        except AttributeError:
+            close = None
+        finally:
+            self._filelike = None
+        if close is not None:
+            close()
+
+    cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self) noexcept:
+        cdef xmlparser.xmlParserInputBuffer* c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
+        if c_buffer:
+            c_buffer.readcallback  = _readFilelikeParser
+            c_buffer.context = <python.PyObject*> self
+        return c_buffer
+
+    cdef xmlparser.xmlParserInput* _createParserInput(
+            self, xmlparser.xmlParserCtxt* ctxt) noexcept:
+        cdef xmlparser.xmlParserInputBuffer* c_buffer = self._createParserInputBuffer()
+        if not c_buffer:
+            return NULL
+        return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)
+
+    cdef tree.xmlDtd* _readDtd(self) noexcept:
+        cdef xmlparser.xmlParserInputBuffer* c_buffer = self._createParserInputBuffer()
+        if not c_buffer:
+            return NULL
+        with nogil:
+            return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)
+
+    cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options) noexcept:
+        cdef xmlDoc* result
+        cdef void* c_callback_context = <python.PyObject*> self
+        cdef char* c_encoding = _cstr(self._encoding) if self._encoding is not None else NULL
+
+        orig_options = ctxt.options
+        with nogil:
+            if ctxt.html:
+                result = htmlparser.htmlCtxtReadIO(
+                        ctxt, _readFilelikeParser, NULL, c_callback_context,
+                        self._c_url, c_encoding, options)
+                if result is not NULL:
+                    if _fixHtmlDictNames(ctxt.dict, result) < 0:
+                        tree.xmlFreeDoc(result)
+                        result = NULL
+            else:
+                result = xmlparser.xmlCtxtReadIO(
+                    ctxt, _readFilelikeParser, NULL, c_callback_context,
+                    self._c_url, c_encoding, options)
+        ctxt.options = orig_options # work around libxml2 problem
+
+        try:
+            self._close_file()
+        except:
+            self._exc_context._store_raised()
+        finally:
+            return result  # swallow any exceptions
+
+    cdef int copyToBuffer(self, char* c_buffer, int c_requested) noexcept:
+        cdef int c_byte_count = 0
+        cdef char* c_start
+        cdef Py_ssize_t byte_count, remaining
+        if self._bytes_read < 0:
+            return 0
+        try:
+            byte_count = python.PyBytes_GET_SIZE(self._bytes)
+            remaining  = byte_count - self._bytes_read
+            while c_requested > remaining:
+                c_start = _cstr(self._bytes) + self._bytes_read
+                cstring_h.memcpy(c_buffer, c_start, remaining)
+                c_byte_count += remaining
+                c_buffer += remaining
+                c_requested -= remaining
+
+                self._bytes = self._filelike.read(c_requested)
+                if not isinstance(self._bytes, bytes):
+                    if isinstance(self._bytes, unicode):
+                        if self._encoding is None:
+                            self._bytes = (<unicode>self._bytes).encode('utf8')
+                        else:
+                            self._bytes = python.PyUnicode_AsEncodedString(
+                                self._bytes, _cstr(self._encoding), NULL)
+                    else:
+                        self._close_file()
+                        raise TypeError, \
+                            "reading from file-like objects must return byte strings or unicode strings"
+
+                remaining = python.PyBytes_GET_SIZE(self._bytes)
+                if remaining == 0:
+                    self._bytes_read = -1
+                    self._close_file()
+                    return c_byte_count
+                self._bytes_read = 0
+
+            if c_requested > 0:
+                c_start = _cstr(self._bytes) + self._bytes_read
+                cstring_h.memcpy(c_buffer, c_start, c_requested)
+                c_byte_count += c_requested
+                self._bytes_read += c_requested
+        except:
+            c_byte_count = -1
+            self._exc_context._store_raised()
+            try:
+                self._close_file()
+            except:
+                self._exc_context._store_raised()
+        finally:
+            return c_byte_count  # swallow any exceptions
+
+cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) noexcept with gil:
+    return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
+
+cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) noexcept nogil:
+    return stdio.fread(c_buffer, 1,  c_size, <stdio.FILE*>ctxt)
+
+############################################################
+## support for custom document loaders
+############################################################
+
+cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
+                                               xmlparser.xmlParserCtxt* c_context) noexcept with gil:
+    cdef _ResolverContext context
+    cdef xmlparser.xmlParserInput* c_input
+    cdef _InputDocument doc_ref
+    cdef _FileReaderContext file_context
+    # if there is no _ParserContext associated with the xmlParserCtxt
+    # passed, check to see if the thread state object has an implied
+    # context.
+    if c_context._private is not NULL:
+        context = <_ResolverContext>c_context._private
+    else:
+        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
+
+    if context is None:
+        if __DEFAULT_ENTITY_LOADER is NULL:
+            return NULL
+        with nogil:
+            # free the GIL as we might do serious I/O here (e.g. HTTP)
+            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
+        return c_input
+
+    try:
+        if c_url is NULL:
+            url = None
+        else:
+            # parsing a related document (DTD etc.) => UTF-8 encoded URL?
+            url = _decodeFilename(<const_xmlChar*>c_url)
+        if c_pubid is NULL:
+            pubid = None
+        else:
+            pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8
+
+        doc_ref = context._resolvers.resolve(url, pubid, context)
+    except:
+        context._store_raised()
+        return NULL
+
+    if doc_ref is not None:
+        if doc_ref._type == PARSER_DATA_STRING:
+            data = doc_ref._data_bytes
+            filename = doc_ref._filename
+            if not filename:
+                filename = None
+            elif not isinstance(filename, bytes):
+                # most likely a text URL
+                filename = filename.encode('utf8')
+                if not isinstance(filename, bytes):
+                    filename = None
+
+            c_input = xmlparser.xmlNewInputStream(c_context)
+            if c_input is not NULL:
+                if filename is not None:
+                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
+                c_input.base = _xcstr(data)
+                c_input.length = python.PyBytes_GET_SIZE(data)
+                c_input.cur = c_input.base
+                c_input.end = c_input.base + c_input.length
+        elif doc_ref._type == PARSER_DATA_FILENAME:
+            data = None
+            c_filename = _cstr(doc_ref._filename)
+            with nogil:
+                # free the GIL as we might do serious I/O here
+                c_input = xmlparser.xmlNewInputFromFile(
+                    c_context, c_filename)
+        elif doc_ref._type == PARSER_DATA_FILE:
+            file_context = _FileReaderContext(doc_ref._file, context, url,
+                                              None, doc_ref._close_file)
+            c_input = file_context._createParserInput(c_context)
+            data = file_context
+        else:
+            data = None
+            c_input = NULL
+
+        if data is not None:
+            context._storage.add(data)
+        if c_input is not NULL:
+            return c_input
+
+    if __DEFAULT_ENTITY_LOADER is NULL:
+        return NULL
+
+    with nogil:
+        # free the GIL as we might do serious I/O here (e.g. HTTP)
+        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
+    return c_input
+
+cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
+__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
+
+
+cdef xmlparser.xmlExternalEntityLoader _register_document_loader() noexcept nogil:
+    cdef xmlparser.xmlExternalEntityLoader old = xmlparser.xmlGetExternalEntityLoader()
+    xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
+    return old
+
+cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) noexcept nogil:
+    xmlparser.xmlSetExternalEntityLoader(old)
+
+
+############################################################
+## Parsers
+############################################################
+
+@cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
+@cython.internal
+cdef class _ParserContext(_ResolverContext):
+    cdef _ErrorLog _error_log
+    cdef _ParserSchemaValidationContext _validator
+    cdef xmlparser.xmlParserCtxt* _c_ctxt
+    cdef xmlparser.xmlExternalEntityLoader _orig_loader
+    cdef python.PyThread_type_lock _lock
+    cdef _Document _doc
+    cdef bint _collect_ids
+
+    def __cinit__(self):
+        self._c_ctxt = NULL
+        self._collect_ids = True
+        if not config.ENABLE_THREADING:
+            self._lock = NULL
+        else:
+            self._lock = python.PyThread_allocate_lock()
+        self._error_log = _ErrorLog()
+
+    def __dealloc__(self):
+        if config.ENABLE_THREADING and self._lock is not NULL:
+            python.PyThread_free_lock(self._lock)
+            self._lock = NULL
+        if self._c_ctxt is not NULL:
+            if <void*>self._validator is not NULL and self._validator is not None:
+                # If the parser was not closed correctly (e.g. interrupted iterparse()),
+                # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
+                # validator plug might still be in place, which will make xmlFreeParserCtxt()
+                # crash when trying to xmlFree() a static SAX handler.
+                # Thus, make sure we disconnect the handler interceptor here at the latest.
+                self._validator.disconnect()
+            xmlparser.xmlFreeParserCtxt(self._c_ctxt)
+
+    cdef _ParserContext _copy(self):
+        cdef _ParserContext context
+        context = self.__class__()
+        context._collect_ids = self._collect_ids
+        context._validator = self._validator.copy()
+        _initParserContext(context, self._resolvers._copy(), NULL)
+        return context
+
+    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
+        self._c_ctxt = c_ctxt
+        c_ctxt._private = <void*>self
+
+    cdef void _resetParserContext(self) noexcept:
+        if self._c_ctxt is not NULL:
+            if self._c_ctxt.html:
+                htmlparser.htmlCtxtReset(self._c_ctxt)
+                self._c_ctxt.disableSAX = 0 # work around bug in libxml2
+            else:
+                xmlparser.xmlClearParserCtxt(self._c_ctxt)
+                # work around bug in libxml2 [2.9.10 .. 2.9.14]:
+                # https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
+                self._c_ctxt.nsNr = 0
+
+    cdef int prepare(self, bint set_document_loader=True) except -1:
+        cdef int result
+        if config.ENABLE_THREADING and self._lock is not NULL:
+            with nogil:
+                result = python.PyThread_acquire_lock(
+                    self._lock, python.WAIT_LOCK)
+            if result == 0:
+                raise ParserError, "parser locking failed"
+        self._error_log.clear()
+        self._doc = None
+        # Need a cast here because older libxml2 releases do not use 'const' in the functype.
+        self._c_ctxt.sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
+        self._orig_loader = _register_document_loader() if set_document_loader else NULL
+        if self._validator is not None:
+            self._validator.connect(self._c_ctxt, self._error_log)
+        return 0
+
+    cdef int cleanup(self) except -1:
+        if self._orig_loader is not NULL:
+            _reset_document_loader(self._orig_loader)
+        try:
+            if self._validator is not None:
+                self._validator.disconnect()
+            self._resetParserContext()
+            self.clear()
+            self._doc = None
+            self._c_ctxt.sax.serror = NULL
+        finally:
+            if config.ENABLE_THREADING and self._lock is not NULL:
+                python.PyThread_release_lock(self._lock)
+        return 0
+
+    cdef object _handleParseResult(self, _BaseParser parser,
+                                   xmlDoc* result, filename):
+        c_doc = self._handleParseResultDoc(parser, result, filename)
+        if self._doc is not None and self._doc._c_doc is c_doc:
+            return self._doc
+        else:
+            return _documentFactory(c_doc, parser)
+
+    cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
+                                       xmlDoc* result, filename) except NULL:
+        recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
+        return _handleParseResult(self, self._c_ctxt, result,
+                                  filename, recover,
+                                  free_doc=self._doc is None)
+
+cdef _initParserContext(_ParserContext context,
+                        _ResolverRegistry resolvers,
+                        xmlparser.xmlParserCtxt* c_ctxt):
+    _initResolverContext(context, resolvers)
+    if c_ctxt is not NULL:
+        context._initParserContext(c_ctxt)
+
+cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, const xmlerror.xmlError* error) noexcept with gil:
+    (<_ParserContext>_parser_context._private)._error_log._receive(error)
+
+cdef void _receiveParserError(void* c_context, const xmlerror.xmlError* error) noexcept nogil:
+    if __DEBUG:
+        if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL:
+            _forwardError(NULL, error)
+        else:
+            _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)
+
+cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
+                          _ErrorLog error_log) except -1:
+    if filename is not None and \
+           ctxt.lastError.domain == xmlerror.XML_FROM_IO:
+        if isinstance(filename, bytes):
+            filename = _decodeFilenameWithLength(
+                <bytes>filename, len(<bytes>filename))
+        if ctxt.lastError.message is not NULL:
+            try:
+                message = ctxt.lastError.message.decode('utf-8')
+            except UnicodeDecodeError:
+                # the filename may be in there => play it safe
+                message = ctxt.lastError.message.decode('iso8859-1')
+            message = f"Error reading file '{filename}': {message.strip()}"
+        else:
+            message = f"Error reading '{filename}'"
+        raise IOError, message
+    elif error_log:
+        raise error_log._buildParseException(
+            XMLSyntaxError, "Document is not well formed")
+    elif ctxt.lastError.message is not NULL:
+        message = ctxt.lastError.message.strip()
+        code = ctxt.lastError.code
+        line = ctxt.lastError.line
+        column = ctxt.lastError.int2
+        if ctxt.lastError.line > 0:
+            message = f"line {line}: {message}"
+        raise XMLSyntaxError(message, code, line, column, filename)
+    else:
+        raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
+                             filename)
+
+cdef xmlDoc* _handleParseResult(_ParserContext context,
+                                xmlparser.xmlParserCtxt* c_ctxt,
+                                xmlDoc* result, filename,
+                                bint recover, bint free_doc) except NULL:
+    cdef bint well_formed
+    if result is not NULL:
+        __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+
+    if c_ctxt.myDoc is not NULL:
+        if c_ctxt.myDoc is not result:
+            __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
+            tree.xmlFreeDoc(c_ctxt.myDoc)
+        c_ctxt.myDoc = NULL
+
+    if result is not NULL:
+        if (context._validator is not None and
+                not context._validator.isvalid()):
+            well_formed = 0  # actually not 'valid', but anyway ...
+        elif (not c_ctxt.wellFormed and not c_ctxt.html and
+                c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
+                [1 for error in context._error_log
+                 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
+            # An encoding error occurred and libxml2 switched from UTF-8
+            # input to (undecoded) Latin-1, at some arbitrary point in the
+            # document.  Better raise an error than allowing for a broken
+            # tree with mixed encodings. This is fixed in libxml2 2.12.
+            well_formed = 0
+        elif recover or (c_ctxt.wellFormed and
+                         c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
+            well_formed = 1
+        elif not c_ctxt.replaceEntities and not c_ctxt.validate \
+                 and context is not None:
+            # in this mode, we ignore errors about undefined entities
+            for error in context._error_log.filter_from_errors():
+                if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
+                       error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
+                    well_formed = 0
+                    break
+            else:
+                well_formed = 1
+        else:
+            well_formed = 0
+
+        if not well_formed:
+            if free_doc:
+                tree.xmlFreeDoc(result)
+            result = NULL
+
+    if context is not None and context._has_raised():
+        if result is not NULL:
+            if free_doc:
+                tree.xmlFreeDoc(result)
+            result = NULL
+        context._raise_if_stored()
+
+    if result is NULL:
+        if context is not None:
+            _raiseParseError(c_ctxt, filename, context._error_log)
+        else:
+            _raiseParseError(c_ctxt, filename, None)
+    else:
+        if result.URL is NULL and filename is not None:
+            result.URL = tree.xmlStrdup(_xcstr(filename))
+        if result.encoding is NULL:
+            result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
+
+    if context._validator is not None and \
+           context._validator._add_default_attributes:
+        # we currently need to do this here as libxml2 does not
+        # support inserting default attributes during parse-time
+        # validation
+        context._validator.inject_default_attributes(result)
+
+    return result
+
+cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) noexcept nogil:
+    cdef xmlNode* c_node
+    if c_doc is NULL:
+        return 0
+    c_node = c_doc.children
+    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
+    if c_node.type == tree.XML_ELEMENT_NODE:
+        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
+            return -1
+    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+    return 0
+
+cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
+                                  xmlNode* c_start_node) noexcept nogil:
+    """
+    Move names to the dict, iterating in document order, starting at
+    c_start_node. This is used in incremental parsing after each chunk.
+    """
+    cdef xmlNode* c_node
+    if not c_doc:
+        return 0
+    if not c_start_node:
+        return _fixHtmlDictNames(c_dict, c_doc)
+    c_node = c_start_node
+    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
+    if c_node.type == tree.XML_ELEMENT_NODE:
+        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
+            return -1
+    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+    return 0
+
+cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
+                                      xmlNode* c_node) noexcept nogil:
+    cdef xmlNode* c_attr
+    c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
+    if c_name is NULL:
+        return -1
+    if c_name is not c_node.name:
+        tree.xmlFree(<char*>c_node.name)
+        c_node.name = c_name
+    c_attr = <xmlNode*>c_node.properties
+    while c_attr is not NULL:
+        c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
+        if c_name is NULL:
+            return -1
+        if c_name is not c_attr.name:
+            tree.xmlFree(<char*>c_attr.name)
+            c_attr.name = c_name
+        c_attr = c_attr.next
+    return 0
+
+
+@cython.internal
+cdef class _BaseParser:
+    cdef ElementClassLookup _class_lookup
+    cdef _ResolverRegistry _resolvers
+    cdef _ParserContext _parser_context
+    cdef _ParserContext _push_parser_context
+    cdef int _parse_options
+    cdef bint _for_html
+    cdef bint _remove_comments
+    cdef bint _remove_pis
+    cdef bint _strip_cdata
+    cdef bint _collect_ids
+    cdef bint _resolve_external_entities
+    cdef XMLSchema _schema
+    cdef bytes _filename
+    cdef readonly object target
+    cdef object _default_encoding
+    cdef tuple _events_to_collect  # (event_types, tag)
+
+    def __init__(self, int parse_options, bint for_html, XMLSchema schema,
+                 remove_comments, remove_pis, strip_cdata, collect_ids,
+                 target, encoding, bint resolve_external_entities=True):
+        cdef tree.xmlCharEncodingHandler* enchandler
+        cdef int c_encoding
+        if not isinstance(self, (XMLParser, HTMLParser)):
+            raise TypeError, "This class cannot be instantiated"
+
+        self._parse_options = parse_options
+        self.target = target
+        self._for_html = for_html
+        self._remove_comments = remove_comments
+        self._remove_pis = remove_pis
+        self._strip_cdata = strip_cdata
+        self._collect_ids = collect_ids
+        self._resolve_external_entities = resolve_external_entities
+        self._schema = schema
+
+        self._resolvers = _ResolverRegistry()
+
+        if encoding is None:
+            self._default_encoding = None
+        else:
+            encoding = _utf8(encoding)
+            enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
+            if enchandler is NULL:
+                raise LookupError, f"unknown encoding: '{encoding}'"
+            tree.xmlCharEncCloseFunc(enchandler)
+            self._default_encoding = encoding
+
+    cdef _setBaseURL(self, base_url):
+        self._filename = _encodeFilename(base_url)
+
+    cdef _collectEvents(self, event_types, tag):
+        if event_types is None:
+            event_types = ()
+        else:
+            event_types = tuple(set(event_types))
+            _buildParseEventFilter(event_types)  # purely for validation
+        self._events_to_collect = (event_types, tag)
+
+    cdef _ParserContext _getParserContext(self):
+        cdef xmlparser.xmlParserCtxt* pctxt
+        if self._parser_context is None:
+            self._parser_context = self._createContext(self.target, None)
+            self._parser_context._collect_ids = self._collect_ids
+            if self._schema is not None:
+                self._parser_context._validator = \
+                    self._schema._newSaxValidator(
+                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
+            pctxt = self._newParserCtxt()
+            _initParserContext(self._parser_context, self._resolvers, pctxt)
+            self._configureSaxContext(pctxt)
+        return self._parser_context
+
+    cdef _ParserContext _getPushParserContext(self):
+        cdef xmlparser.xmlParserCtxt* pctxt
+        if self._push_parser_context is None:
+            self._push_parser_context = self._createContext(
+                self.target, self._events_to_collect)
+            self._push_parser_context._collect_ids = self._collect_ids
+            if self._schema is not None:
+                self._push_parser_context._validator = \
+                    self._schema._newSaxValidator(
+                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
+            pctxt = self._newPushParserCtxt()
+            _initParserContext(
+                self._push_parser_context, self._resolvers, pctxt)
+            self._configureSaxContext(pctxt)
+        return self._push_parser_context
+
+    cdef _ParserContext _createContext(self, target, events_to_collect):
+        cdef _SaxParserContext sax_context
+        if target is not None:
+            sax_context = _TargetParserContext(self)
+            (<_TargetParserContext>sax_context)._setTarget(target)
+        elif events_to_collect:
+            sax_context = _SaxParserContext(self)
+        else:
+            # nothing special to configure
+            return _ParserContext()
+        if events_to_collect:
+            events, tag = events_to_collect
+            sax_context._setEventFilter(events, tag)
+        return sax_context
+
+    @cython.final
+    cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
+        if self._remove_comments:
+            pctxt.sax.comment = NULL
+        if self._remove_pis:
+            pctxt.sax.processingInstruction = NULL
+        if self._strip_cdata:
+            # hard switch-off for CDATA nodes => makes them plain text
+            pctxt.sax.cdataBlock = NULL
+        if not self._resolve_external_entities:
+            pctxt.sax.getEntity = _getInternalEntityOnly
+
+    cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
+        cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
+        if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
+            # need to extend SAX1 context to SAX2 to get proper error reports
+            if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
+                sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
+                if sax is NULL:
+                    raise MemoryError()
+                cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
+                                 sizeof(htmlparser.htmlDefaultSAXHandler))
+                c_ctxt.sax = sax
+            sax.initialized = xmlparser.XML_SAX2_MAGIC
+            # Need a cast here because older libxml2 releases do not use 'const' in the functype.
+            sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
+            sax.startElementNs = NULL
+            sax.endElementNs = NULL
+            sax._private = NULL
+        return 0
+
+    cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
+        cdef xmlparser.xmlParserCtxt* c_ctxt
+        if self._for_html:
+            c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
+            if c_ctxt is not NULL:
+                self._registerHtmlErrorHandler(c_ctxt)
+        else:
+            c_ctxt = xmlparser.xmlNewParserCtxt()
+        if c_ctxt is NULL:
+            raise MemoryError
+        c_ctxt.sax.startDocument = _initSaxDocument
+        return c_ctxt
+
+    cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
+        cdef xmlparser.xmlParserCtxt* c_ctxt
+        cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
+        if self._for_html:
+            c_ctxt = htmlparser.htmlCreatePushParserCtxt(
+                NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
+            if c_ctxt is not NULL:
+                self._registerHtmlErrorHandler(c_ctxt)
+                htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
+        else:
+            c_ctxt = xmlparser.xmlCreatePushParserCtxt(
+                NULL, NULL, NULL, 0, c_filename)
+            if c_ctxt is not NULL:
+                xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
+        if c_ctxt is NULL:
+            raise MemoryError()
+        c_ctxt.sax.startDocument = _initSaxDocument
+        return c_ctxt
+
+    @property
+    def error_log(self):
+        """The error log of the last parser run.
+        """
+        cdef _ParserContext context
+        context = self._getParserContext()
+        return context._error_log.copy()
+
+    @property
+    def resolvers(self):
+        """The custom resolver registry of this parser."""
+        return self._resolvers
+
+    @property
+    def version(self):
+        """The version of the underlying XML parser."""
+        return "libxml2 %d.%d.%d" % LIBXML_VERSION
+
+    def set_element_class_lookup(self, ElementClassLookup lookup = None):
+        """set_element_class_lookup(self, lookup = None)
+
+        Set a lookup scheme for element classes generated from this parser.
+
+        Reset it by passing None or nothing.
+        """
+        self._class_lookup = lookup
+
+    cdef _BaseParser _copy(self):
+        "Create a new parser with the same configuration."
+        cdef _BaseParser parser
+        parser = self.__class__()
+        parser._parse_options = self._parse_options
+        parser._for_html = self._for_html
+        parser._remove_comments = self._remove_comments
+        parser._remove_pis = self._remove_pis
+        parser._strip_cdata = self._strip_cdata
+        parser._filename = self._filename
+        parser._resolvers = self._resolvers
+        parser.target = self.target
+        parser._class_lookup  = self._class_lookup
+        parser._default_encoding = self._default_encoding
+        parser._schema = self._schema
+        parser._events_to_collect = self._events_to_collect
+        return parser
+
+    def copy(self):
+        """copy(self)
+
+        Create a new parser with the same configuration.
+        """
+        return self._copy()
+
+    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
+        """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
+
+        Creates a new element associated with this parser.
+        """
+        return _makeElement(_tag, NULL, None, self, None, None,
+                            attrib, nsmap, _extra)
+
+    # internal parser methods
+
+    cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
+        """Parse unicode document, share dictionary if possible.
+        """
+        cdef _ParserContext context
+        cdef xmlDoc* result
+        cdef xmlparser.xmlParserCtxt* pctxt
+        cdef Py_ssize_t py_buffer_len
+        cdef int buffer_len, c_kind
+        cdef const_char* c_text
+        cdef const_char* c_encoding = _PY_UNICODE_ENCODING
+        if python.PyUnicode_IS_READY(utext):
+            # PEP-393 string
+            c_text = <const_char*>python.PyUnicode_DATA(utext)
+            py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
+            c_kind = python.PyUnicode_KIND(utext)
+            if c_kind == 1:
+                if python.PyUnicode_MAX_CHAR_VALUE(utext) <= 127:
+                    c_encoding = 'UTF-8'
+                else:
+                    c_encoding = 'ISO-8859-1'
+            elif c_kind == 2:
+                py_buffer_len *= 2
+                if python.PY_BIG_ENDIAN:
+                    c_encoding = 'UTF-16BE'  # actually UCS-2
+                else:
+                    c_encoding = 'UTF-16LE'  # actually UCS-2
+            elif c_kind == 4:
+                py_buffer_len *= 4
+                if python.PY_BIG_ENDIAN:
+                    c_encoding = 'UTF-32BE'  # actually UCS-4
+                else:
+                    c_encoding = 'UTF-32LE'  # actually UCS-4
+            else:
+                assert False, f"Illegal Unicode kind {c_kind}"
+        else:
+            # old Py_UNICODE string
+            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
+            c_text = python.PyUnicode_AS_DATA(utext)
+        assert 0 <= py_buffer_len <= limits.INT_MAX
+        buffer_len = py_buffer_len
+
+        context = self._getParserContext()
+        context.prepare()
+        try:
+            pctxt = context._c_ctxt
+            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+            orig_options = pctxt.options
+            with nogil:
+                if self._for_html:
+                    result = htmlparser.htmlCtxtReadMemory(
+                        pctxt, c_text, buffer_len, c_filename, c_encoding,
+                        self._parse_options)
+                    if result is not NULL:
+                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
+                            tree.xmlFreeDoc(result)
+                            result = NULL
+                else:
+                    result = xmlparser.xmlCtxtReadMemory(
+                        pctxt, c_text, buffer_len, c_filename, c_encoding,
+                        self._parse_options)
+            pctxt.options = orig_options # work around libxml2 problem
+
+            return context._handleParseResultDoc(self, result, None)
+        finally:
+            context.cleanup()
+
+    cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
+                           char* c_filename) except NULL:
+        """Parse document, share dictionary if possible.
+        """
+        cdef _ParserContext context
+        cdef xmlDoc* result
+        cdef xmlparser.xmlParserCtxt* pctxt
+        cdef char* c_encoding
+        cdef tree.xmlCharEncoding enc
+        context = self._getParserContext()
+        context.prepare()
+        try:
+            pctxt = context._c_ctxt
+            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+
+            if self._default_encoding is None:
+                c_encoding = NULL
+                # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
+                # NOTE: limit to problematic cases because it changes character offsets
+                if c_len >= 4 and (c_text[0] == b'\xFF' and c_text[1] == b'\xFE' and
+                                   c_text[2] == 0 and c_text[3] == 0):
+                    c_encoding = "UTF-32LE"
+                    c_text += 4
+                    c_len -= 4
+                elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
+                                     c_text[2] == b'\xFE' and c_text[3] == b'\xFF'):
+                    c_encoding = "UTF-32BE"
+                    c_text += 4
+                    c_len -= 4
+                else:
+                    # no BOM => try to determine encoding
+                    enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
+                    if enc == tree.XML_CHAR_ENCODING_UCS4LE:
+                        c_encoding = 'UTF-32LE'
+                    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
+                        c_encoding = 'UTF-32BE'
+            else:
+                c_encoding = _cstr(self._default_encoding)
+
+            orig_options = pctxt.options
+            with nogil:
+                if self._for_html:
+                    result = htmlparser.htmlCtxtReadMemory(
+                        pctxt, c_text, c_len, c_filename,
+                        c_encoding, self._parse_options)
+                    if result is not NULL:
+                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
+                            tree.xmlFreeDoc(result)
+                            result = NULL
+                else:
+                    result = xmlparser.xmlCtxtReadMemory(
+                        pctxt, c_text, c_len, c_filename,
+                        c_encoding, self._parse_options)
+            pctxt.options = orig_options # work around libxml2 problem
+
+            return context._handleParseResultDoc(self, result, None)
+        finally:
+            context.cleanup()
+
+    cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
+        cdef _ParserContext context
+        cdef xmlDoc* result
+        cdef xmlparser.xmlParserCtxt* pctxt
+        cdef char* c_encoding
+        result = NULL
+
+        context = self._getParserContext()
+        context.prepare()
+        try:
+            pctxt = context._c_ctxt
+            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+
+            if self._default_encoding is None:
+                c_encoding = NULL
+            else:
+                c_encoding = _cstr(self._default_encoding)
+
+            orig_options = pctxt.options
+            with nogil:
+                if self._for_html:
+                    result = htmlparser.htmlCtxtReadFile(
+                        pctxt, c_filename, c_encoding, self._parse_options)
+                    if result is not NULL:
+                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
+                            tree.xmlFreeDoc(result)
+                            result = NULL
+                else:
+                    result = xmlparser.xmlCtxtReadFile(
+                        pctxt, c_filename, c_encoding, self._parse_options)
+            pctxt.options = orig_options # work around libxml2 problem
+
+            return context._handleParseResultDoc(self, result, c_filename)
+        finally:
+            context.cleanup()
+
+    cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
+                                       encoding) except NULL:
+        cdef _ParserContext context
+        cdef _FileReaderContext file_context
+        cdef xmlDoc* result
+        cdef xmlparser.xmlParserCtxt* pctxt
+        cdef char* c_filename
+        if not filename:
+            filename = None
+
+        context = self._getParserContext()
+        context.prepare()
+        try:
+            pctxt = context._c_ctxt
+            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+            file_context = _FileReaderContext(
+                filelike, context, filename,
+                encoding or self._default_encoding)
+            result = file_context._readDoc(pctxt, self._parse_options)
+
+            return context._handleParseResultDoc(
+                self, result, filename)
+        finally:
+            context.cleanup()
+
+
+cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name) noexcept nogil:
+    """
+    Callback function to intercept the entity resolution when external entity loading is disabled.
+    """
+    cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name)
+    if not entity:
+        return NULL
+    if entity.etype not in (
+            tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY,
+            tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY,
+            tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY):
+        return entity
+
+    # Reject all external entities and fail the parsing instead. There is currently
+    # no way in libxml2 to just prevent the entity resolution in this case.
+    cdef xmlerror.xmlError c_error
+    cdef xmlerror.xmlStructuredErrorFunc err_func
+    cdef xmlparser.xmlParserInput* parser_input
+    cdef void* err_context
+
+    c_ctxt = <xmlparser.xmlParserCtxt *> ctxt
+    err_func = xmlerror.xmlStructuredError
+    if err_func:
+        parser_input = c_ctxt.input
+        # Copied from xmlVErrParser() in libxml2: get current input from stack.
+        if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1:
+            parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2]
+
+        c_error = xmlerror.xmlError(
+            domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER,
+            code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE,
+            level=xmlerror.xmlErrorLevel.XML_ERR_FATAL,
+            message=b"External entity resolution is disabled for security reasons "
+                    b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' "
+                    b"if you consider it safe to enable it.",
+            file=parser_input.filename,
+            node=entity,
+            str1=<char*> name,
+            str2=NULL,
+            str3=NULL,
+            line=parser_input.line if parser_input else 0,
+            int1=0,
+            int2=parser_input.col if parser_input else 0,
+        )
+        err_context = xmlerror.xmlStructuredErrorContext
+        err_func(err_context, &c_error)
+
+    c_ctxt.wellFormed = 0
+    # The entity was looked up and does not need to be freed.
+    return NULL
+
+
+cdef void _initSaxDocument(void* ctxt) noexcept with gil:
+    xmlparser.xmlSAX2StartDocument(ctxt)
+    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+    c_doc = c_ctxt.myDoc
+
+    # set up document dict
+    if c_doc and c_ctxt.dict and not c_doc.dict:
+        # I have no idea why libxml2 disables this - we need it
+        c_ctxt.dictNames = 1
+        c_doc.dict = c_ctxt.dict
+        xmlparser.xmlDictReference(c_ctxt.dict)
+
+    # set up XML ID hash table
+    if c_ctxt._private:
+        context = <_ParserContext>c_ctxt._private
+        if context._collect_ids:
+            # keep the global parser dict from filling up with XML IDs
+            if c_doc and not c_doc.ids:
+                # memory errors are not fatal here
+                c_dict = xmlparser.xmlDictCreate()
+                if c_dict:
+                    c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
+                    xmlparser.xmlDictFree(c_dict)
+                else:
+                    c_doc.ids = tree.xmlHashCreate(0)
+        else:
+            c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
+            if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
+                # already initialised but empty => clear
+                tree.xmlHashFree(c_doc.ids, NULL)
+                c_doc.ids = NULL
+
+
+############################################################
+## ET feed parser
+############################################################
+
+cdef class _FeedParser(_BaseParser):
+    cdef bint _feed_parser_running
+
+    @property
+    def feed_error_log(self):
+        """The error log of the last (or current) run of the feed parser.
+
+        Note that this is local to the feed parser and thus is
+        different from what the ``error_log`` property returns.
+        """
+        return self._getPushParserContext()._error_log.copy()
+
+    cpdef feed(self, data):
+        """feed(self, data)
+
+        Feeds data to the parser.  The argument should be an 8-bit string
+        buffer containing encoded data, although Unicode is supported as long
+        as both string types are not mixed.
+
+        This is the main entry point to the consumer interface of a
+        parser.  The parser will parse as much of the XML stream as it
+        can on each call.  To finish parsing or to reset the parser,
+        call the ``close()`` method.  Both methods may raise
+        ParseError if errors occur in the input data.  If an error is
+        raised, there is no longer a need to call ``close()``.
+
+        The feed parser interface is independent of the normal parser
+        usage.  You can use the same parser as a feed parser and in
+        the ``parse()`` function concurrently.
+        """
+        cdef _ParserContext context
+        cdef bytes bstring
+        cdef xmlparser.xmlParserCtxt* pctxt
+        cdef Py_ssize_t py_buffer_len, ustart
+        cdef const_char* char_data
+        cdef const_char* c_encoding
+        cdef int buffer_len
+        cdef int error
+        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
+
+        if isinstance(data, bytes):
+            if self._default_encoding is None:
+                c_encoding = NULL
+            else:
+                c_encoding = self._default_encoding
+            char_data = _cstr(data)
+            py_buffer_len = python.PyBytes_GET_SIZE(data)
+            ustart = 0
+        elif isinstance(data, unicode):
+            c_encoding = b"UTF-8"
+            char_data = NULL
+            py_buffer_len = len(<unicode> data)
+            ustart = 0
+        else:
+            raise TypeError, "Parsing requires string data"
+
+        context = self._getPushParserContext()
+        pctxt = context._c_ctxt
+        error = 0
+        if not self._feed_parser_running:
+            context.prepare(set_document_loader=False)
+            self._feed_parser_running = 1
+            c_filename = (_cstr(self._filename)
+                          if self._filename is not None else NULL)
+
+            # We have to give *mlCtxtResetPush() enough input to figure
+            # out the character encoding (at least four bytes),
+            # however if we give it all we got, we'll have nothing for
+            # *mlParseChunk() and things go wrong.
+            buffer_len = 0
+            if char_data is not NULL:
+                buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
+            orig_loader = _register_document_loader()
+            if self._for_html:
+                error = _htmlCtxtResetPush(
+                    pctxt, char_data, buffer_len, c_filename, c_encoding,
+                    self._parse_options)
+            else:
+                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
+                error = xmlparser.xmlCtxtResetPush(
+                    pctxt, char_data, buffer_len, c_filename, c_encoding)
+            _reset_document_loader(orig_loader)
+            py_buffer_len -= buffer_len
+            char_data += buffer_len
+            if error:
+                raise MemoryError()
+            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+
+        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding
+
+        fixup_error = 0
+        while py_buffer_len > 0 and (error == 0 or recover):
+            if char_data is NULL:
+                # Unicode parsing by converting chunks to UTF-8
+                buffer_len = 2**19  # len(bytes) <= 4 * (2**19) == 2 MiB
+                bstring = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8')
+                ustart += buffer_len
+                py_buffer_len -= buffer_len  # may end up < 0
+                error, fixup_error = _parse_data_chunk(pctxt, <const char*> bstring, <int> len(bstring))
+            else:
+                # Direct byte string parsing.
+                buffer_len = <int>py_buffer_len if py_buffer_len <= limits.INT_MAX else limits.INT_MAX
+                error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len)
+                py_buffer_len -= buffer_len
+                char_data += buffer_len
+
+            if fixup_error:
+                context.store_exception(MemoryError())
+
+            if context._has_raised():
+                # propagate Python exceptions immediately
+                recover = 0
+                error = 1
+                break
+
+            if error and not pctxt.replaceEntities and not pctxt.validate:
+                # in this mode, we ignore errors about undefined entities
+                for entry in context._error_log.filter_from_errors():
+                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
+                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
+                        break
+                else:
+                    error = 0
+
+        if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
+            # propagate Python exceptions immediately
+            recover = 0
+            error = 1
+
+        if fixup_error or not recover and (error or not pctxt.wellFormed):
+            self._feed_parser_running = 0
+            try:
+                context._handleParseResult(self, pctxt.myDoc, None)
+            finally:
+                context.cleanup()
+
+    cpdef close(self):
+        """close(self)
+
+        Terminates feeding data to this parser.  This tells the parser to
+        process any remaining data in the feed buffer, and then returns the
+        root Element of the tree that was parsed.
+
+        This method must be called after passing the last chunk of data into
+        the ``feed()`` method.  It should only be called when using the feed
+        parser interface, all other usage is undefined.
+        """
+        if not self._feed_parser_running:
+            raise XMLSyntaxError("no element found",
+                                 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
+                                 self._filename)
+
+        context = self._getPushParserContext()
+        pctxt = context._c_ctxt
+
+        self._feed_parser_running = 0
+        if self._for_html:
+            htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
+        else:
+            xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
+
+        if (pctxt.recovery and not pctxt.disableSAX and
+                isinstance(context, _SaxParserContext)):
+            # apply any left-over 'end' events
+            (<_SaxParserContext>context).flushEvents()
+
+        try:
+            result = context._handleParseResult(self, pctxt.myDoc, None)
+        finally:
+            context.cleanup()
+
+        if isinstance(result, _Document):
+            return (<_Document>result).getroot()
+        else:
+            return result
+
+
+cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt,
+                                  const char* char_data, int buffer_len):
+    fixup_error = 0
+    with nogil:
+        if c_ctxt.html:
+            c_node = c_ctxt.node  # last node where the parser stopped
+            orig_loader = _register_document_loader()
+            error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0)
+            _reset_document_loader(orig_loader)
+            # and now for the fun part: move node names to the dict
+            if c_ctxt.myDoc:
+                fixup_error = _fixHtmlDictSubtreeNames(
+                    c_ctxt.dict, c_ctxt.myDoc, c_node)
+                if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict:
+                    xmlparser.xmlDictFree(c_ctxt.myDoc.dict)
+                    c_ctxt.myDoc.dict = c_ctxt.dict
+                    xmlparser.xmlDictReference(c_ctxt.dict)
+        else:
+            orig_loader = _register_document_loader()
+            error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0)
+            _reset_document_loader(orig_loader)
+    return (error, fixup_error)
+
+
+cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
+                             const_char* c_data, int buffer_len,
+                             const_char* c_filename, const_char* c_encoding,
+                             int parse_options) except -1:
+    cdef xmlparser.xmlParserInput* c_input_stream
+    # libxml2 lacks an HTML push parser setup function
+    error = xmlparser.xmlCtxtResetPush(
+        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
+    if error:
+        return error
+
+    # fix libxml2 setup for HTML
+    c_ctxt.progressive = 1
+    c_ctxt.html = 1
+    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
+
+    return 0
+
+
+############################################################
+## XML parser
+############################################################
+
+cdef int _XML_DEFAULT_PARSE_OPTIONS
+_XML_DEFAULT_PARSE_OPTIONS = (
+    xmlparser.XML_PARSE_NOENT   |
+    xmlparser.XML_PARSE_NOCDATA |
+    xmlparser.XML_PARSE_NONET   |
+    xmlparser.XML_PARSE_COMPACT |
+    xmlparser.XML_PARSE_BIG_LINES
+    )
+
+cdef class XMLParser(_FeedParser):
+    """XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)
+
+    The XML parser.
+
+    Parsers can be supplied as additional argument to various parse
+    functions of the lxml API.  A default parser is always available
+    and can be replaced by a call to the global function
+    'set_default_parser'.  New parsers can be created at any time
+    without a major run-time overhead.
+
+    The keyword arguments in the constructor are mainly based on the
+    libxml2 parser configuration.  A DTD will also be loaded if DTD
+    validation or attribute default values are requested (unless you
+    additionally provide an XMLSchema from which the default
+    attributes can be read).
+
+    Available boolean keyword arguments:
+
+    - attribute_defaults - inject default attributes from DTD or XMLSchema
+    - dtd_validation     - validate against a DTD referenced by the document
+    - load_dtd           - use DTD for parsing
+    - no_network         - prevent network access for related files (default: True)
+    - ns_clean           - clean up redundant namespace declarations
+    - recover            - try hard to parse through broken XML
+    - remove_blank_text  - discard blank text nodes that appear ignorable
+    - remove_comments    - discard comments
+    - remove_pis         - discard processing instructions
+    - strip_cdata        - replace CDATA sections by normal text content (default: True)
+    - compact            - save memory for short text content (default: True)
+    - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
+    - huge_tree          - disable security restrictions and support very deep trees
+                           and very long text content (only affects libxml2 2.7+)
+
+    Other keyword arguments:
+
+    - resolve_entities - replace entities by their text value: False for keeping the
+          entity references, True for resolving them, and 'internal' for resolving
+          internal definitions only (no external file/URL access).
+          The default used to be True and was changed to 'internal' in lxml 5.0.
+    - encoding - override the document encoding (note: libiconv encoding name)
+    - target   - a parser target object that will receive the parse events
+    - schema   - an XMLSchema to validate against
+
+    Note that you should avoid sharing parsers between threads.  While this is
+    not harmful, it is more efficient to use separate parsers.  This does not
+    apply to the default parser.
+    """
+    def __init__(self, *, encoding=None, attribute_defaults=False,
+                 dtd_validation=False, load_dtd=False, no_network=True,
+                 ns_clean=False, recover=False, XMLSchema schema=None,
+                 huge_tree=False, remove_blank_text=False, resolve_entities='internal',
+                 remove_comments=False, remove_pis=False, strip_cdata=True,
+                 collect_ids=True, target=None, compact=True):
+        cdef int parse_options
+        cdef bint resolve_external = True
+        parse_options = _XML_DEFAULT_PARSE_OPTIONS
+        if load_dtd:
+            parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
+        if dtd_validation:
+            parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \
+                            xmlparser.XML_PARSE_DTDLOAD
+        if attribute_defaults:
+            parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR
+            if schema is None:
+                parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
+        if ns_clean:
+            parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN
+        if recover:
+            parse_options = parse_options | xmlparser.XML_PARSE_RECOVER
+        if remove_blank_text:
+            parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
+        if huge_tree:
+            parse_options = parse_options | xmlparser.XML_PARSE_HUGE
+        if not no_network:
+            parse_options = parse_options ^ xmlparser.XML_PARSE_NONET
+        if not compact:
+            parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
+        if not resolve_entities:
+            parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
+        elif resolve_entities == 'internal':
+            resolve_external = False
+        if not strip_cdata:
+            parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA
+
+        _BaseParser.__init__(self, parse_options, False, schema,
+                             remove_comments, remove_pis, strip_cdata,
+                             collect_ids, target, encoding, resolve_external)
+
+
+cdef class XMLPullParser(XMLParser):
+    """XMLPullParser(self, events=None, *, tag=None, **kwargs)
+
+    XML parser that collects parse events in an iterator.
+
+    The collected events are the same as for iterparse(), but the
+    parser itself is non-blocking in the sense that it receives
+    data chunks incrementally through its .feed() method, instead
+    of reading them directly from a file(-like) object all by itself.
+
+    By default, it collects Element end events.  To change that,
+    pass any subset of the available events into the ``events``
+    argument: ``'start'``, ``'end'``, ``'start-ns'``,
+    ``'end-ns'``, ``'comment'``, ``'pi'``.
+
+    To support loading external dependencies relative to the input
+    source, you can pass the ``base_url``.
+    """
+    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
+        XMLParser.__init__(self, **kwargs)
+        if events is None:
+            events = ('end',)
+        self._setBaseURL(base_url)
+        self._collectEvents(events, tag)
+
+    def read_events(self):
+        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
+
+
+cdef class ETCompatXMLParser(XMLParser):
+    """ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
+                 dtd_validation=False, load_dtd=False, no_network=True, \
+                 ns_clean=False, recover=False, schema=None, \
+                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
+                 remove_comments=True, remove_pis=True, strip_cdata=True, \
+                 target=None, compact=True)
+
+    An XML parser with an ElementTree compatible default setup.
+
+    See the XMLParser class for details.
+
+    This parser has ``remove_comments`` and ``remove_pis`` enabled by default
+    and thus ignores comments and processing instructions.
+    """
+    def __init__(self, *, encoding=None, attribute_defaults=False,
+                 dtd_validation=False, load_dtd=False, no_network=True,
+                 ns_clean=False, recover=False, schema=None,
+                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
+                 remove_comments=True, remove_pis=True, strip_cdata=True,
+                 target=None, compact=True):
+        XMLParser.__init__(self,
+                           attribute_defaults=attribute_defaults,
+                           dtd_validation=dtd_validation,
+                           load_dtd=load_dtd,
+                           no_network=no_network,
+                           ns_clean=ns_clean,
+                           recover=recover,
+                           remove_blank_text=remove_blank_text,
+                           huge_tree=huge_tree,
+                           compact=compact,
+                           resolve_entities=resolve_entities,
+                           remove_comments=remove_comments,
+                           remove_pis=remove_pis,
+                           strip_cdata=strip_cdata,
+                           target=target,
+                           encoding=encoding,
+                           schema=schema)
+
+# ET 1.2 compatible name
+XMLTreeBuilder = ETCompatXMLParser
+
+
+cdef XMLParser __DEFAULT_XML_PARSER
+__DEFAULT_XML_PARSER = XMLParser()
+
+__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
+
+def set_default_parser(_BaseParser parser=None):
+    """set_default_parser(parser=None)
+
+    Set a default parser for the current thread.  This parser is used
+    globally whenever no parser is supplied to the various parse functions of
+    the lxml API.  If this function is called without a parser (or if it is
+    None), the default parser is reset to the original configuration.
+
+    Note that the pre-installed default parser is not thread-safe.  Avoid the
+    default parser in multi-threaded environments.  You can create a separate
+    parser for each thread explicitly or use a parser pool.
+    """
+    if parser is None:
+        parser = __DEFAULT_XML_PARSER
+    __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
+
+def get_default_parser():
+    "get_default_parser()"
+    return __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+
+############################################################
+## HTML parser
+############################################################
+
+cdef int _HTML_DEFAULT_PARSE_OPTIONS
+_HTML_DEFAULT_PARSE_OPTIONS = (
+    htmlparser.HTML_PARSE_RECOVER |
+    htmlparser.HTML_PARSE_NONET   |
+    htmlparser.HTML_PARSE_COMPACT
+    )
+
+cdef object _UNUSED = object()
+
+cdef class HTMLParser(_FeedParser):
+    """HTMLParser(self, encoding=None, remove_blank_text=False, \
+                   remove_comments=False, remove_pis=False, \
+                   no_network=True, target=None, schema: XMLSchema =None, \
+                   recover=True, compact=True, collect_ids=True, huge_tree=False)
+
+    The HTML parser.
+
+    This parser allows reading HTML into a normal XML tree.  By
+    default, it can read broken (non well-formed) HTML, depending on
+    the capabilities of libxml2.  Use the 'recover' option to switch
+    this off.
+
+    Available boolean keyword arguments:
+
+    - recover            - try hard to parse through broken HTML (default: True)
+    - no_network         - prevent network access for related files (default: True)
+    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
+    - remove_comments    - discard comments
+    - remove_pis         - discard processing instructions
+    - compact            - save memory for short text content (default: True)
+    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
+    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
+    - huge_tree          - disable security restrictions and support very deep trees
+                           and very long text content (only affects libxml2 2.7+)
+
+    Other keyword arguments:
+
+    - encoding - override the document encoding (note: libiconv encoding name)
+    - target   - a parser target object that will receive the parse events
+    - schema   - an XMLSchema to validate against
+
+    Note that you should avoid sharing parsers between threads for performance
+    reasons.
+    """
+    def __init__(self, *, encoding=None, remove_blank_text=False,
+                 remove_comments=False, remove_pis=False, strip_cdata=_UNUSED,
+                 no_network=True, target=None, XMLSchema schema=None,
+                 recover=True, compact=True, default_doctype=True,
+                 collect_ids=True, huge_tree=False):
+        cdef int parse_options
+        parse_options = _HTML_DEFAULT_PARSE_OPTIONS
+        if remove_blank_text:
+            parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS
+        if not recover:
+            parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER
+        if not no_network:
+            parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET
+        if not compact:
+            parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
+        if not default_doctype:
+            parse_options = parse_options ^ htmlparser.HTML_PARSE_NODEFDTD
+        if huge_tree:
+            parse_options = parse_options | xmlparser.XML_PARSE_HUGE
+
+        if strip_cdata is not _UNUSED:
+            import warnings
+            warnings.warn(
+                "The 'strip_cdata' option of HTMLParser() has never done anything and will eventually be removed.",
+                DeprecationWarning)
+        _BaseParser.__init__(self, parse_options, True, schema,
+                             remove_comments, remove_pis, strip_cdata,
+                             collect_ids, target, encoding)
+
+
+cdef HTMLParser __DEFAULT_HTML_PARSER
+__DEFAULT_HTML_PARSER = HTMLParser()
+
+
+cdef class HTMLPullParser(HTMLParser):
+    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)
+
+    HTML parser that collects parse events in an iterator.
+
+    The collected events are the same as for iterparse(), but the
+    parser itself is non-blocking in the sense that it receives
+    data chunks incrementally through its .feed() method, instead
+    of reading them directly from a file(-like) object all by itself.
+
+    By default, it collects Element end events.  To change that,
+    pass any subset of the available events into the ``events``
+    argument: ``'start'``, ``'end'``, ``'start-ns'``,
+    ``'end-ns'``, ``'comment'``, ``'pi'``.
+
+    To support loading external dependencies relative to the input
+    source, you can pass the ``base_url``.
+    """
+    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
+        HTMLParser.__init__(self, **kwargs)
+        if events is None:
+            events = ('end',)
+        self._setBaseURL(base_url)
+        self._collectEvents(events, tag)
+
+    def read_events(self):
+        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
+
+
+############################################################
+## helper functions for document creation
+############################################################
+
+cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
+    cdef char* c_filename
+    cdef char* c_text
+    cdef Py_ssize_t c_len
+    if parser is None:
+        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+    if not filename:
+        c_filename = NULL
+    else:
+        filename_utf = _encodeFilenameUTF8(filename)
+        c_filename = _cstr(filename_utf)
+    if isinstance(text, unicode):
+        if python.PyUnicode_IS_READY(text):
+            # PEP-393 Unicode string
+            c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
+        else:
+            # old Py_UNICODE string
+            c_len = python.PyUnicode_GET_DATA_SIZE(text)
+        if c_len > limits.INT_MAX:
+            return (<_BaseParser>parser)._parseDocFromFilelike(
+                StringIO(text), filename, None)
+        return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
+    else:
+        c_len = python.PyBytes_GET_SIZE(text)
+        if c_len > limits.INT_MAX:
+            return (<_BaseParser>parser)._parseDocFromFilelike(
+                BytesIO(text), filename, None)
+        c_text = _cstr(text)
+        return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
+
+cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
+    if parser is None:
+        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+    return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
+
+cdef xmlDoc* _parseDocFromFilelike(source, filename,
+                                   _BaseParser parser) except NULL:
+    if parser is None:
+        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+    return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None)
+
+cdef xmlDoc* _newXMLDoc() except NULL:
+    cdef xmlDoc* result
+    result = tree.xmlNewDoc(NULL)
+    if result is NULL:
+        raise MemoryError()
+    if result.encoding is NULL:
+        result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
+    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+    return result
+
+cdef xmlDoc* _newHTMLDoc() except NULL:
+    cdef xmlDoc* result
+    result = tree.htmlNewDoc(NULL, NULL)
+    if result is NULL:
+        raise MemoryError()
+    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+    return result
+
+cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
+    cdef xmlDoc* result
+    if recursive:
+        with nogil:
+            result = tree.xmlCopyDoc(c_doc, recursive)
+    else:
+        result = tree.xmlCopyDoc(c_doc, 0)
+    if result is NULL:
+        raise MemoryError()
+    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+    return result
+
+cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
+    "Recursively copy the document and make c_new_root the new root node."
+    cdef xmlDoc* result
+    cdef xmlNode* c_node
+    result = tree.xmlCopyDoc(c_doc, 0) # non recursive
+    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+    with nogil:
+        c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
+    if c_node is NULL:
+        raise MemoryError()
+    tree.xmlDocSetRootElement(result, c_node)
+    _copyTail(c_new_root.next, c_node)
+    return result
+
+cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
+    "Recursively copy the element into the document. c_doc is not modified."
+    cdef xmlNode* c_root
+    c_root = tree.xmlDocCopyNode(c_node, c_doc, 1) # recursive
+    if c_root is NULL:
+        raise MemoryError()
+    _copyTail(c_node.next, c_root)
+    return c_root
+
+
+############################################################
+## API level helper functions for _Document creation
+############################################################
+
+cdef _Document _parseDocument(source, _BaseParser parser, base_url):
+    cdef _Document doc
+    source = _getFSPathOrObject(source)
+    if _isString(source):
+        # parse the file directly from the filesystem
+        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
+        # fix base URL if requested
+        if base_url is not None:
+            base_url = _encodeFilenameUTF8(base_url)
+            if doc._c_doc.URL is not NULL:
+                tree.xmlFree(<char*>doc._c_doc.URL)
+            doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
+        return doc
+
+    if base_url is not None:
+        url = base_url
+    else:
+        url = _getFilenameForFile(source)
+
+    if hasattr(source, 'getvalue') and hasattr(source, 'tell'):
+        # StringIO - reading from start?
+        if source.tell() == 0:
+            return _parseMemoryDocument(source.getvalue(), url, parser)
+
+    # Support for file-like objects (urlgrabber.urlopen, ...)
+    if hasattr(source, 'read'):
+        return _parseFilelikeDocument(source, url, parser)
+
+    raise TypeError, f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'"
+
+cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
+    c_doc = _parseDocFromFile(url, parser)
+    return _documentFactory(c_doc, parser)
+
+cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
+    if isinstance(text, unicode):
+        if _hasEncodingDeclaration(text):
+            raise ValueError(
+                "Unicode strings with encoding declaration are not supported. "
+                "Please use bytes input or XML fragments without declaration.")
+    elif not isinstance(text, bytes):
+        raise ValueError, "can only parse strings"
+    c_doc = _parseDoc(text, url, parser)
+    return _documentFactory(c_doc, parser)
+
+cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
+    c_doc = _parseDocFromFilelike(source, url, parser)
+    return _documentFactory(c_doc, parser)
author	S. Solomon Darnell	2025-03-28 21:52:21 -0500
committer	S. Solomon Darnell	2025-03-28 21:52:21 -0500
commit	4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree	ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/lxml/parser.pxi
parent	cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download	gn-ai-master.tar.gz