diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/lxml/parser.pxi')
-rw-r--r-- | .venv/lib/python3.12/site-packages/lxml/parser.pxi | 2000 |
1 files changed, 2000 insertions, 0 deletions
class ParseError(LxmlSyntaxError):
    """Error raised when an XML document cannot be parsed.

    Provided for compatibility with ElementTree 1.3 and later.

    ``lineno`` holds the line as given; ``offset`` holds ``column - 1``.
    The ``position`` property exposes both as ``(lineno, offset + 1)``.
    """
    def __init__(self, message, code, line, column, filename=None):
        super(_ParseError, self).__init__(message)
        # Compute the offset before touching any attribute, so that a bad
        # 'column' value fails before the instance is partially populated.
        offset = column - 1
        self.lineno = line
        self.offset = offset
        self.code = code
        self.filename = filename

    @property
    def position(self):
        """The error location as a ``(lineno, offset + 1)`` tuple."""
        return (self.lineno, self.offset + 1)

    @position.setter
    def position(self, new_pos):
        line, column = new_pos
        self.lineno = line
        self.offset = column - 1
cdef _BaseParser getDefaultParser(self):
    """Return the default parser of the current thread, creating it on demand.

    The global context (``self``) lazily gets a copy of
    ``__DEFAULT_XML_PARSER``; any other per-thread context gets its own
    copy of the global context's default parser.
    """
    cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
    if thread_context._default_parser is not None:
        return thread_context._default_parser

    if self._default_parser is None:
        self._default_parser = __DEFAULT_XML_PARSER._copy()
    if thread_context is not self:
        thread_context._default_parser = self._default_parser._copy()
    return thread_context._default_parser
cdef int initThreadDictRef(self, tree.xmlDict** c_dict_ref) except -1:
    """Make ``c_dict_ref[0]`` point at the current thread's string dictionary.

    If the referenced dict already is the thread dict, nothing changes.
    Otherwise a non-NULL previous dict is released with xmlDictFree(),
    the thread dict is stored in its place and its reference count is
    incremented with xmlDictReference().
    """
    cdef tree.xmlDict* c_current = c_dict_ref[0]
    cdef tree.xmlDict* c_shared = self._getThreadDict(c_current)
    if c_shared is not c_current:
        if c_current is not NULL:
            xmlparser.xmlDictFree(c_current)
        c_dict_ref[0] = c_shared
        xmlparser.xmlDictReference(c_shared)
    return 0
cdef int pushImpliedContextFromParser(self, _BaseParser parser) except -1:
    """Push the parser's context as the new implied context.

    When ``parser`` is None, a None entry is pushed instead, so that every
    push is matched by exactly one later pop.
    """
    cdef _ParserContext parser_context = None
    if parser is not None:
        parser_context = parser._getParserContext()
    self.pushImpliedContext(parser_context)
    return 0
cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
    """Work around bug in libxml2: find iconv name of encoding on our own.

    Returns a constant C string naming the encoding that
    xmlDetectCharEncoding() reports for ``buffer``, or NULL if it
    reports XML_CHAR_ENCODING_NONE.
    """
    cdef tree.xmlCharEncoding detected = tree.xmlDetectCharEncoding(buffer, size)

    if detected == tree.XML_CHAR_ENCODING_UTF16LE:
        # FF FE 00 00 is reported as UTF-16LE, but the BOM says UTF-32LE.
        if size >= 4 and (buffer[0] == <const_xmlChar> b'\xFF' and
                          buffer[1] == <const_xmlChar> b'\xFE' and
                          buffer[2] == 0 and buffer[3] == 0):
            return "UTF-32LE"
        return "UTF-16LE"
    if detected == tree.XML_CHAR_ENCODING_UTF16BE:
        return "UTF-16BE"
    if detected == tree.XML_CHAR_ENCODING_UCS4LE:
        return "UCS-4LE"
    if detected == tree.XML_CHAR_ENCODING_UCS4BE:
        return "UCS-4BE"
    if detected == tree.XML_CHAR_ENCODING_NONE:
        return NULL
    # returns a constant char*, no need to free it
    return tree.xmlGetCharEncodingName(detected)
cdef _close_file(self):
    """Close the wrapped file-like object if this reader was asked to.

    Does nothing when closing was not requested or the file reference
    was already dropped.  The reference is cleared before ``close()`` is
    called, so the file is closed at most once; objects without a
    ``close`` attribute are silently accepted.
    """
    if not self._close_file_after_read or self._filelike is None:
        return
    closer = None
    try:
        closer = self._filelike.close
    except AttributeError:
        pass
    finally:
        # always drop the reference, even if the attribute lookup failed
        self._filelike = None
    if closer is not None:
        closer()
cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options) noexcept:
    """Parse a document from the wrapped file-like object.

    libxml2 pulls its input through the ``_readFilelikeParser`` callback,
    which receives this object as its callback context.  The HTML or XML
    reader is chosen based on ``ctxt.html``.

    Returns the parsed document, or NULL if libxml2 returned none or if
    fixing up the HTML names failed.  This method never raises: an
    exception from closing the file is stored in ``self._exc_context``.
    """
    cdef xmlDoc* result
    # borrowed pointer to self, handed to libxml2 as the read callback context
    cdef void* c_callback_context = <python.PyObject*> self
    cdef char* c_encoding = _cstr(self._encoding) if self._encoding is not None else NULL

    # remember the options so they can be restored after the parser run
    orig_options = ctxt.options
    with nogil:
        if ctxt.html:
            result = htmlparser.htmlCtxtReadIO(
                ctxt, _readFilelikeParser, NULL, c_callback_context,
                self._c_url, c_encoding, options)
            if result is not NULL:
                # a negative return from the name fix-up discards the document
                if _fixHtmlDictNames(ctxt.dict, result) < 0:
                    tree.xmlFreeDoc(result)
                    result = NULL
        else:
            result = xmlparser.xmlCtxtReadIO(
                ctxt, _readFilelikeParser, NULL, c_callback_context,
                self._c_url, c_encoding, options)
    ctxt.options = orig_options # work around libxml2 problem

    try:
        self._close_file()
    except:
        # keep the exception for later instead of raising from here
        self._exc_context._store_raised()
    finally:
        return result  # swallow any exceptions
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) noexcept with gil:
    """External entity loader that consults the Python-level resolvers.

    Looks up the resolver context (from ``c_context._private`` or, failing
    that, from the implied context of the current thread) and asks its
    resolver registry for an input document.  Depending on the type of the
    resolved document, a libxml2 parser input is created from a byte string,
    a file name or a file-like object.

    Falls back to ``__DEFAULT_ENTITY_LOADER`` (if set) when there is no
    resolver context, when the resolvers return None, or when no input
    could be created.  Returns NULL if resolving raised an exception (the
    exception is stored in the context) or if there is nothing to fall
    back to.
    """
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context
    # if there is no _ParserContext associated with the xmlParserCtxt
    # passed, check to see if the thread state object has an implied
    # context.
    if c_context._private is not NULL:
        context = <_ResolverContext>c_context._private
    else:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()

    if context is None:
        # no Python-level context at all => only the default loader can help
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        if c_url is NULL:
            url = None
        else:
            # parsing a related document (DTD etc.) => UTF-8 encoded URL?
            url = _decodeFilename(<const_xmlChar*>c_url)
        if c_pubid is NULL:
            pubid = None
        else:
            pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8

        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        # cannot raise from a 'noexcept' callback => store for later
        context._store_raised()
        return NULL

    if doc_ref is not None:
        if doc_ref._type == PARSER_DATA_STRING:
            data = doc_ref._data_bytes
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            c_input = xmlparser.xmlNewInputStream(c_context)
            if c_input is not NULL:
                if filename is not None:
                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                # the input points directly into the bytes object 'data',
                # which is added to context._storage below
                c_input.base = _xcstr(data)
                c_input.length = python.PyBytes_GET_SIZE(data)
                c_input.cur = c_input.base
                c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            data = None
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            # the reader object is added to context._storage below
            data = file_context
        else:
            data = None
            c_input = NULL

        if data is not None:
            # presumably keeps 'data' referenced while libxml2 uses the input
            # -- TODO confirm against _ResolverContext
            context._storage.add(data)
        if c_input is not NULL:
            return c_input

    # nothing resolved => let the default loader have a go, if there is one
    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input
cdef _ParserContext _copy(self):
    """Create a new context of the same class with the same configuration.

    Copies the ID collection flag, a copy of the schema validator (if any)
    and a copy of the resolver registry.  The new context is not bound to
    a libxml2 parser context yet (NULL is passed on initialisation).
    """
    cdef _ParserContext context
    context = self.__class__()
    context._collect_ids = self._collect_ids
    # The validator is optional (see prepare() and cleanup(), which both
    # check for None), so only copy it when one is configured.
    if self._validator is not None:
        context._validator = self._validator.copy()
    _initParserContext(context, self._resolvers._copy(), NULL)
    return context
cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
                          _ErrorLog error_log) except -1:
    """Raise the exception that describes the last error of a parser run.

    Always raises, never returns normally:

    - IOError if a filename is given and the last libxml2 error comes
      from the I/O domain,
    - otherwise the exception built from ``error_log`` if it has entries,
    - otherwise XMLSyntaxError built from ``ctxt.lastError``, or a generic
      XMLSyntaxError with XML_ERR_INTERNAL_ERROR if there is no message.
    """
    # Decode the C-level message once, so that every branch below formats
    # text rather than an undecoded byte string.
    message = None
    if ctxt.lastError.message is not NULL:
        try:
            message = ctxt.lastError.message.decode('utf-8')
        except UnicodeDecodeError:
            # the filename may be in there => play it safe
            message = ctxt.lastError.message.decode('iso8859-1')
        message = message.strip()

    if filename is not None and \
            ctxt.lastError.domain == xmlerror.XML_FROM_IO:
        if isinstance(filename, bytes):
            filename = _decodeFilenameWithLength(
                <bytes>filename, len(<bytes>filename))
        if message is not None:
            message = f"Error reading file '{filename}': {message}"
        else:
            message = f"Error reading '{filename}'"
        raise IOError, message
    elif error_log:
        raise error_log._buildParseException(
            XMLSyntaxError, "Document is not well formed")
    elif message is not None:
        code = ctxt.lastError.code
        line = ctxt.lastError.line
        column = ctxt.lastError.int2
        if ctxt.lastError.line > 0:
            message = f"line {line}: {message}"
        raise XMLSyntaxError(message, code, line, column, filename)
    else:
        raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                             filename)
+ elif (not c_ctxt.wellFormed and not c_ctxt.html and + c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and + [1 for error in context._error_log + if error.type == ErrorTypes.ERR_INVALID_CHAR]): + # An encoding error occurred and libxml2 switched from UTF-8 + # input to (undecoded) Latin-1, at some arbitrary point in the + # document. Better raise an error than allowing for a broken + # tree with mixed encodings. This is fixed in libxml2 2.12. + well_formed = 0 + elif recover or (c_ctxt.wellFormed and + c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR): + well_formed = 1 + elif not c_ctxt.replaceEntities and not c_ctxt.validate \ + and context is not None: + # in this mode, we ignore errors about undefined entities + for error in context._error_log.filter_from_errors(): + if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ + error.type != ErrorTypes.ERR_UNDECLARED_ENTITY: + well_formed = 0 + break + else: + well_formed = 1 + else: + well_formed = 0 + + if not well_formed: + if free_doc: + tree.xmlFreeDoc(result) + result = NULL + + if context is not None and context._has_raised(): + if result is not NULL: + if free_doc: + tree.xmlFreeDoc(result) + result = NULL + context._raise_if_stored() + + if result is NULL: + if context is not None: + _raiseParseError(c_ctxt, filename, context._error_log) + else: + _raiseParseError(c_ctxt, filename, None) + else: + if result.URL is NULL and filename is not None: + result.URL = tree.xmlStrdup(_xcstr(filename)) + if result.encoding is NULL: + result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8") + + if context._validator is not None and \ + context._validator._add_default_attributes: + # we currently need to do this here as libxml2 does not + # support inserting default attributes during parse-time + # validation + context._validator.inject_default_attributes(result) + + return result + +cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) noexcept nogil: + cdef xmlNode* c_node + if c_doc is NULL: + return 0 + 
cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
                                      xmlNode* c_node) noexcept nogil:
    """Move the names of an element and of its attributes into ``c_dict``.

    Each name is looked up in (or added to) the dictionary.  If the
    dictionary returns a different pointer, the old name is freed and
    replaced by the dictionary-owned one.

    Returns 0 on success and -1 as soon as a dictionary lookup fails.
    """
    cdef xmlNode* c_attribute

    # the element's own name first
    c_dict_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
    if c_dict_name == NULL:
        return -1
    if c_dict_name != c_node.name:
        tree.xmlFree(<char*>c_node.name)
        c_node.name = c_dict_name

    # then the name of every attribute in the property list
    c_attribute = <xmlNode*>c_node.properties
    while c_attribute != NULL:
        c_dict_name = tree.xmlDictLookup(c_dict, c_attribute.name, -1)
        if c_dict_name == NULL:
            return -1
        if c_dict_name != c_attribute.name:
            tree.xmlFree(<char*>c_attribute.name)
            c_attribute.name = c_dict_name
        c_attribute = c_attribute.next
    return 0
cdef _collectEvents(self, event_types, tag):
    """Remember which parse events (and for which tag) should be collected.

    ``event_types`` may be None, which is stored as an empty tuple;
    otherwise the given names are de-duplicated and validated.  The result
    is stored as ``(event_types, tag)`` in ``self._events_to_collect``.
    """
    unique_events = ()
    if event_types is not None:
        unique_events = tuple(set(event_types))
        # purely for validation, the returned filter is not used here
        _buildParseEventFilter(unique_events)
    self._events_to_collect = (unique_events, tag)
cdef _ParserContext _createContext(self, target, events_to_collect):
    """Create the parser context object that matches the configuration.

    - without a target and without events: a plain _ParserContext
    - with a target: a _TargetParserContext bound to that target
    - with events only: a _SaxParserContext

    If events are requested, the event filter is set on the SAX context.
    """
    cdef _SaxParserContext sax_context
    if target is None and not events_to_collect:
        # nothing special to configure
        return _ParserContext()

    if target is not None:
        sax_context = _TargetParserContext(self)
        (<_TargetParserContext>sax_context)._setTarget(target)
    else:
        sax_context = _SaxParserContext(self)

    if events_to_collect:
        event_types, tag = events_to_collect
        sax_context._setEventFilter(event_types, tag)
    return sax_context
&htmlparser.htmlDefaultSAXHandler, + sizeof(htmlparser.htmlDefaultSAXHandler)) + c_ctxt.sax = sax + sax.initialized = xmlparser.XML_SAX2_MAGIC + # Need a cast here because older libxml2 releases do not use 'const' in the functype. + sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError + sax.startElementNs = NULL + sax.endElementNs = NULL + sax._private = NULL + return 0 + + cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL: + cdef xmlparser.xmlParserCtxt* c_ctxt + if self._for_html: + c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) + if c_ctxt is not NULL: + self._registerHtmlErrorHandler(c_ctxt) + else: + c_ctxt = xmlparser.xmlNewParserCtxt() + if c_ctxt is NULL: + raise MemoryError + c_ctxt.sax.startDocument = _initSaxDocument + return c_ctxt + + cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL: + cdef xmlparser.xmlParserCtxt* c_ctxt + cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL + if self._for_html: + c_ctxt = htmlparser.htmlCreatePushParserCtxt( + NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE) + if c_ctxt is not NULL: + self._registerHtmlErrorHandler(c_ctxt) + htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options) + else: + c_ctxt = xmlparser.xmlCreatePushParserCtxt( + NULL, NULL, NULL, 0, c_filename) + if c_ctxt is not NULL: + xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options) + if c_ctxt is NULL: + raise MemoryError() + c_ctxt.sax.startDocument = _initSaxDocument + return c_ctxt + + @property + def error_log(self): + """The error log of the last parser run. 
+ """ + cdef _ParserContext context + context = self._getParserContext() + return context._error_log.copy() + + @property + def resolvers(self): + """The custom resolver registry of this parser.""" + return self._resolvers + + @property + def version(self): + """The version of the underlying XML parser.""" + return "libxml2 %d.%d.%d" % LIBXML_VERSION + + def set_element_class_lookup(self, ElementClassLookup lookup = None): + """set_element_class_lookup(self, lookup = None) + + Set a lookup scheme for element classes generated from this parser. + + Reset it by passing None or nothing. + """ + self._class_lookup = lookup + + cdef _BaseParser _copy(self): + "Create a new parser with the same configuration." + cdef _BaseParser parser + parser = self.__class__() + parser._parse_options = self._parse_options + parser._for_html = self._for_html + parser._remove_comments = self._remove_comments + parser._remove_pis = self._remove_pis + parser._strip_cdata = self._strip_cdata + parser._filename = self._filename + parser._resolvers = self._resolvers + parser.target = self.target + parser._class_lookup = self._class_lookup + parser._default_encoding = self._default_encoding + parser._schema = self._schema + parser._events_to_collect = self._events_to_collect + return parser + + def copy(self): + """copy(self) + + Create a new parser with the same configuration. + """ + return self._copy() + + def makeelement(self, _tag, attrib=None, nsmap=None, **_extra): + """makeelement(self, _tag, attrib=None, nsmap=None, **_extra) + + Creates a new element associated with this parser. + """ + return _makeElement(_tag, NULL, None, self, None, None, + attrib, nsmap, _extra) + + # internal parser methods + + cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: + """Parse unicode document, share dictionary if possible. 
+ """ + cdef _ParserContext context + cdef xmlDoc* result + cdef xmlparser.xmlParserCtxt* pctxt + cdef Py_ssize_t py_buffer_len + cdef int buffer_len, c_kind + cdef const_char* c_text + cdef const_char* c_encoding = _PY_UNICODE_ENCODING + if python.PyUnicode_IS_READY(utext): + # PEP-393 string + c_text = <const_char*>python.PyUnicode_DATA(utext) + py_buffer_len = python.PyUnicode_GET_LENGTH(utext) + c_kind = python.PyUnicode_KIND(utext) + if c_kind == 1: + if python.PyUnicode_MAX_CHAR_VALUE(utext) <= 127: + c_encoding = 'UTF-8' + else: + c_encoding = 'ISO-8859-1' + elif c_kind == 2: + py_buffer_len *= 2 + if python.PY_BIG_ENDIAN: + c_encoding = 'UTF-16BE' # actually UCS-2 + else: + c_encoding = 'UTF-16LE' # actually UCS-2 + elif c_kind == 4: + py_buffer_len *= 4 + if python.PY_BIG_ENDIAN: + c_encoding = 'UTF-32BE' # actually UCS-4 + else: + c_encoding = 'UTF-32LE' # actually UCS-4 + else: + assert False, f"Illegal Unicode kind {c_kind}" + else: + # old Py_UNICODE string + py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext) + c_text = python.PyUnicode_AS_DATA(utext) + assert 0 <= py_buffer_len <= limits.INT_MAX + buffer_len = py_buffer_len + + context = self._getParserContext() + context.prepare() + try: + pctxt = context._c_ctxt + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + orig_options = pctxt.options + with nogil: + if self._for_html: + result = htmlparser.htmlCtxtReadMemory( + pctxt, c_text, buffer_len, c_filename, c_encoding, + self._parse_options) + if result is not NULL: + if _fixHtmlDictNames(pctxt.dict, result) < 0: + tree.xmlFreeDoc(result) + result = NULL + else: + result = xmlparser.xmlCtxtReadMemory( + pctxt, c_text, buffer_len, c_filename, c_encoding, + self._parse_options) + pctxt.options = orig_options # work around libxml2 problem + + return context._handleParseResultDoc(self, result, None) + finally: + context.cleanup() + + cdef xmlDoc* _parseDoc(self, char* c_text, int c_len, + char* c_filename) except NULL: + """Parse document, share 
dictionary if possible. + """ + cdef _ParserContext context + cdef xmlDoc* result + cdef xmlparser.xmlParserCtxt* pctxt + cdef char* c_encoding + cdef tree.xmlCharEncoding enc + context = self._getParserContext() + context.prepare() + try: + pctxt = context._c_ctxt + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + + if self._default_encoding is None: + c_encoding = NULL + # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs + # NOTE: limit to problematic cases because it changes character offsets + if c_len >= 4 and (c_text[0] == b'\xFF' and c_text[1] == b'\xFE' and + c_text[2] == 0 and c_text[3] == 0): + c_encoding = "UTF-32LE" + c_text += 4 + c_len -= 4 + elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and + c_text[2] == b'\xFE' and c_text[3] == b'\xFF'): + c_encoding = "UTF-32BE" + c_text += 4 + c_len -= 4 + else: + # no BOM => try to determine encoding + enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len) + if enc == tree.XML_CHAR_ENCODING_UCS4LE: + c_encoding = 'UTF-32LE' + elif enc == tree.XML_CHAR_ENCODING_UCS4BE: + c_encoding = 'UTF-32BE' + else: + c_encoding = _cstr(self._default_encoding) + + orig_options = pctxt.options + with nogil: + if self._for_html: + result = htmlparser.htmlCtxtReadMemory( + pctxt, c_text, c_len, c_filename, + c_encoding, self._parse_options) + if result is not NULL: + if _fixHtmlDictNames(pctxt.dict, result) < 0: + tree.xmlFreeDoc(result) + result = NULL + else: + result = xmlparser.xmlCtxtReadMemory( + pctxt, c_text, c_len, c_filename, + c_encoding, self._parse_options) + pctxt.options = orig_options # work around libxml2 problem + + return context._handleParseResultDoc(self, result, None) + finally: + context.cleanup() + + cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: + cdef _ParserContext context + cdef xmlDoc* result + cdef xmlparser.xmlParserCtxt* pctxt + cdef char* c_encoding + result = NULL + + context = self._getParserContext() + context.prepare() + try: + pctxt = 
context._c_ctxt + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + + if self._default_encoding is None: + c_encoding = NULL + else: + c_encoding = _cstr(self._default_encoding) + + orig_options = pctxt.options + with nogil: + if self._for_html: + result = htmlparser.htmlCtxtReadFile( + pctxt, c_filename, c_encoding, self._parse_options) + if result is not NULL: + if _fixHtmlDictNames(pctxt.dict, result) < 0: + tree.xmlFreeDoc(result) + result = NULL + else: + result = xmlparser.xmlCtxtReadFile( + pctxt, c_filename, c_encoding, self._parse_options) + pctxt.options = orig_options # work around libxml2 problem + + return context._handleParseResultDoc(self, result, c_filename) + finally: + context.cleanup() + + cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename, + encoding) except NULL: + cdef _ParserContext context + cdef _FileReaderContext file_context + cdef xmlDoc* result + cdef xmlparser.xmlParserCtxt* pctxt + cdef char* c_filename + if not filename: + filename = None + + context = self._getParserContext() + context.prepare() + try: + pctxt = context._c_ctxt + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + file_context = _FileReaderContext( + filelike, context, filename, + encoding or self._default_encoding) + result = file_context._readDoc(pctxt, self._parse_options) + + return context._handleParseResultDoc( + self, result, filename) + finally: + context.cleanup() + + +cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name) noexcept nogil: + """ + Callback function to intercept the entity resolution when external entity loading is disabled. + """ + cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name) + if not entity: + return NULL + if entity.etype not in ( + tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY, + tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY, + tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY): + return entity + + # Reject all external entities and fail the parsing instead. 
There is currently + # no way in libxml2 to just prevent the entity resolution in this case. + cdef xmlerror.xmlError c_error + cdef xmlerror.xmlStructuredErrorFunc err_func + cdef xmlparser.xmlParserInput* parser_input + cdef void* err_context + + c_ctxt = <xmlparser.xmlParserCtxt *> ctxt + err_func = xmlerror.xmlStructuredError + if err_func: + parser_input = c_ctxt.input + # Copied from xmlVErrParser() in libxml2: get current input from stack. + if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1: + parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2] + + c_error = xmlerror.xmlError( + domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER, + code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE, + level=xmlerror.xmlErrorLevel.XML_ERR_FATAL, + message=b"External entity resolution is disabled for security reasons " + b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' " + b"if you consider it safe to enable it.", + file=parser_input.filename, + node=entity, + str1=<char*> name, + str2=NULL, + str3=NULL, + line=parser_input.line if parser_input else 0, + int1=0, + int2=parser_input.col if parser_input else 0, + ) + err_context = xmlerror.xmlStructuredErrorContext + err_func(err_context, &c_error) + + c_ctxt.wellFormed = 0 + # The entity was looked up and does not need to be freed. 
+ return NULL + + +cdef void _initSaxDocument(void* ctxt) noexcept with gil: + xmlparser.xmlSAX2StartDocument(ctxt) + c_ctxt = <xmlparser.xmlParserCtxt*>ctxt + c_doc = c_ctxt.myDoc + + # set up document dict + if c_doc and c_ctxt.dict and not c_doc.dict: + # I have no idea why libxml2 disables this - we need it + c_ctxt.dictNames = 1 + c_doc.dict = c_ctxt.dict + xmlparser.xmlDictReference(c_ctxt.dict) + + # set up XML ID hash table + if c_ctxt._private: + context = <_ParserContext>c_ctxt._private + if context._collect_ids: + # keep the global parser dict from filling up with XML IDs + if c_doc and not c_doc.ids: + # memory errors are not fatal here + c_dict = xmlparser.xmlDictCreate() + if c_dict: + c_doc.ids = tree.xmlHashCreateDict(0, c_dict) + xmlparser.xmlDictFree(c_dict) + else: + c_doc.ids = tree.xmlHashCreate(0) + else: + c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS + if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids): + # already initialised but empty => clear + tree.xmlHashFree(c_doc.ids, NULL) + c_doc.ids = NULL + + +############################################################ +## ET feed parser +############################################################ + +cdef class _FeedParser(_BaseParser): + cdef bint _feed_parser_running + + @property + def feed_error_log(self): + """The error log of the last (or current) run of the feed parser. + + Note that this is local to the feed parser and thus is + different from what the ``error_log`` property returns. + """ + return self._getPushParserContext()._error_log.copy() + + cpdef feed(self, data): + """feed(self, data) + + Feeds data to the parser. The argument should be an 8-bit string + buffer containing encoded data, although Unicode is supported as long + as both string types are not mixed. + + This is the main entry point to the consumer interface of a + parser. The parser will parse as much of the XML stream as it + can on each call. 
To finish parsing or to reset the parser, + call the ``close()`` method. Both methods may raise + ParseError if errors occur in the input data. If an error is + raised, there is no longer a need to call ``close()``. + + The feed parser interface is independent of the normal parser + usage. You can use the same parser as a feed parser and in + the ``parse()`` function concurrently. + """ + cdef _ParserContext context + cdef bytes bstring + cdef xmlparser.xmlParserCtxt* pctxt + cdef Py_ssize_t py_buffer_len, ustart + cdef const_char* char_data + cdef const_char* c_encoding + cdef int buffer_len + cdef int error + cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + + if isinstance(data, bytes): + if self._default_encoding is None: + c_encoding = NULL + else: + c_encoding = self._default_encoding + char_data = _cstr(data) + py_buffer_len = python.PyBytes_GET_SIZE(data) + ustart = 0 + elif isinstance(data, unicode): + c_encoding = b"UTF-8" + char_data = NULL + py_buffer_len = len(<unicode> data) + ustart = 0 + else: + raise TypeError, "Parsing requires string data" + + context = self._getPushParserContext() + pctxt = context._c_ctxt + error = 0 + if not self._feed_parser_running: + context.prepare(set_document_loader=False) + self._feed_parser_running = 1 + c_filename = (_cstr(self._filename) + if self._filename is not None else NULL) + + # We have to give *mlCtxtResetPush() enough input to figure + # out the character encoding (at least four bytes), + # however if we give it all we got, we'll have nothing for + # *mlParseChunk() and things go wrong. 
+ buffer_len = 0 + if char_data is not NULL: + buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len + orig_loader = _register_document_loader() + if self._for_html: + error = _htmlCtxtResetPush( + pctxt, char_data, buffer_len, c_filename, c_encoding, + self._parse_options) + else: + xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) + error = xmlparser.xmlCtxtResetPush( + pctxt, char_data, buffer_len, c_filename, c_encoding) + _reset_document_loader(orig_loader) + py_buffer_len -= buffer_len + char_data += buffer_len + if error: + raise MemoryError() + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + + #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding + + fixup_error = 0 + while py_buffer_len > 0 and (error == 0 or recover): + if char_data is NULL: + # Unicode parsing by converting chunks to UTF-8 + buffer_len = 2**19 # len(bytes) <= 4 * (2**19) == 2 MiB + bstring = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8') + ustart += buffer_len + py_buffer_len -= buffer_len # may end up < 0 + error, fixup_error = _parse_data_chunk(pctxt, <const char*> bstring, <int> len(bstring)) + else: + # Direct byte string parsing. 
+ buffer_len = <int>py_buffer_len if py_buffer_len <= limits.INT_MAX else limits.INT_MAX + error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len) + py_buffer_len -= buffer_len + char_data += buffer_len + + if fixup_error: + context.store_exception(MemoryError()) + + if context._has_raised(): + # propagate Python exceptions immediately + recover = 0 + error = 1 + break + + if error and not pctxt.replaceEntities and not pctxt.validate: + # in this mode, we ignore errors about undefined entities + for entry in context._error_log.filter_from_errors(): + if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ + entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY: + break + else: + error = 0 + + if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised(): + # propagate Python exceptions immediately + recover = 0 + error = 1 + + if fixup_error or not recover and (error or not pctxt.wellFormed): + self._feed_parser_running = 0 + try: + context._handleParseResult(self, pctxt.myDoc, None) + finally: + context.cleanup() + + cpdef close(self): + """close(self) + + Terminates feeding data to this parser. This tells the parser to + process any remaining data in the feed buffer, and then returns the + root Element of the tree that was parsed. + + This method must be called after passing the last chunk of data into + the ``feed()`` method. It should only be called when using the feed + parser interface, all other usage is undefined. 
+ """ + if not self._feed_parser_running: + raise XMLSyntaxError("no element found", + xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0, + self._filename) + + context = self._getPushParserContext() + pctxt = context._c_ctxt + + self._feed_parser_running = 0 + if self._for_html: + htmlparser.htmlParseChunk(pctxt, NULL, 0, 1) + else: + xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) + + if (pctxt.recovery and not pctxt.disableSAX and + isinstance(context, _SaxParserContext)): + # apply any left-over 'end' events + (<_SaxParserContext>context).flushEvents() + + try: + result = context._handleParseResult(self, pctxt.myDoc, None) + finally: + context.cleanup() + + if isinstance(result, _Document): + return (<_Document>result).getroot() + else: + return result + + +cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt, + const char* char_data, int buffer_len): + fixup_error = 0 + with nogil: + if c_ctxt.html: + c_node = c_ctxt.node # last node where the parser stopped + orig_loader = _register_document_loader() + error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0) + _reset_document_loader(orig_loader) + # and now for the fun part: move node names to the dict + if c_ctxt.myDoc: + fixup_error = _fixHtmlDictSubtreeNames( + c_ctxt.dict, c_ctxt.myDoc, c_node) + if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict: + xmlparser.xmlDictFree(c_ctxt.myDoc.dict) + c_ctxt.myDoc.dict = c_ctxt.dict + xmlparser.xmlDictReference(c_ctxt.dict) + else: + orig_loader = _register_document_loader() + error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0) + _reset_document_loader(orig_loader) + return (error, fixup_error) + + +cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt, + const_char* c_data, int buffer_len, + const_char* c_filename, const_char* c_encoding, + int parse_options) except -1: + cdef xmlparser.xmlParserInput* c_input_stream + # libxml2 lacks an HTML push parser setup function + error = xmlparser.xmlCtxtResetPush( + c_ctxt, c_data, 
buffer_len, c_filename, c_encoding) + if error: + return error + + # fix libxml2 setup for HTML + c_ctxt.progressive = 1 + c_ctxt.html = 1 + htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options) + + return 0 + + +############################################################ +## XML parser +############################################################ + +cdef int _XML_DEFAULT_PARSE_OPTIONS +_XML_DEFAULT_PARSE_OPTIONS = ( + xmlparser.XML_PARSE_NOENT | + xmlparser.XML_PARSE_NOCDATA | + xmlparser.XML_PARSE_NONET | + xmlparser.XML_PARSE_COMPACT | + xmlparser.XML_PARSE_BIG_LINES + ) + +cdef class XMLParser(_FeedParser): + """XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True) + + The XML parser. + + Parsers can be supplied as additional argument to various parse + functions of the lxml API. A default parser is always available + and can be replaced by a call to the global function + 'set_default_parser'. New parsers can be created at any time + without a major run-time overhead. + + The keyword arguments in the constructor are mainly based on the + libxml2 parser configuration. A DTD will also be loaded if DTD + validation or attribute default values are requested (unless you + additionally provide an XMLSchema from which the default + attributes can be read). 
+ + Available boolean keyword arguments: + + - attribute_defaults - inject default attributes from DTD or XMLSchema + - dtd_validation - validate against a DTD referenced by the document + - load_dtd - use DTD for parsing + - no_network - prevent network access for related files (default: True) + - ns_clean - clean up redundant namespace declarations + - recover - try hard to parse through broken XML + - remove_blank_text - discard blank text nodes that appear ignorable + - remove_comments - discard comments + - remove_pis - discard processing instructions + - strip_cdata - replace CDATA sections by normal text content (default: True) + - compact - save memory for short text content (default: True) + - collect_ids - use a hash table of XML IDs for fast access (default: True, always True with DTD validation) + - huge_tree - disable security restrictions and support very deep trees + and very long text content (only affects libxml2 2.7+) + + Other keyword arguments: + + - resolve_entities - replace entities by their text value: False for keeping the + entity references, True for resolving them, and 'internal' for resolving + internal definitions only (no external file/URL access). + The default used to be True and was changed to 'internal' in lxml 5.0. + - encoding - override the document encoding (note: libiconv encoding name) + - target - a parser target object that will receive the parse events + - schema - an XMLSchema to validate against + + Note that you should avoid sharing parsers between threads. While this is + not harmful, it is more efficient to use separate parsers. This does not + apply to the default parser. 
+ """ + def __init__(self, *, encoding=None, attribute_defaults=False, + dtd_validation=False, load_dtd=False, no_network=True, + ns_clean=False, recover=False, XMLSchema schema=None, + huge_tree=False, remove_blank_text=False, resolve_entities='internal', + remove_comments=False, remove_pis=False, strip_cdata=True, + collect_ids=True, target=None, compact=True): + cdef int parse_options + cdef bint resolve_external = True + parse_options = _XML_DEFAULT_PARSE_OPTIONS + if load_dtd: + parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD + if dtd_validation: + parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \ + xmlparser.XML_PARSE_DTDLOAD + if attribute_defaults: + parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR + if schema is None: + parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD + if ns_clean: + parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN + if recover: + parse_options = parse_options | xmlparser.XML_PARSE_RECOVER + if remove_blank_text: + parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS + if huge_tree: + parse_options = parse_options | xmlparser.XML_PARSE_HUGE + if not no_network: + parse_options = parse_options ^ xmlparser.XML_PARSE_NONET + if not compact: + parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT + if not resolve_entities: + parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT + elif resolve_entities == 'internal': + resolve_external = False + if not strip_cdata: + parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA + + _BaseParser.__init__(self, parse_options, False, schema, + remove_comments, remove_pis, strip_cdata, + collect_ids, target, encoding, resolve_external) + + +cdef class XMLPullParser(XMLParser): + """XMLPullParser(self, events=None, *, tag=None, **kwargs) + + XML parser that collects parse events in an iterator. 
+ + The collected events are the same as for iterparse(), but the + parser itself is non-blocking in the sense that it receives + data chunks incrementally through its .feed() method, instead + of reading them directly from a file(-like) object all by itself. + + By default, it collects Element end events. To change that, + pass any subset of the available events into the ``events`` + argument: ``'start'``, ``'end'``, ``'start-ns'``, + ``'end-ns'``, ``'comment'``, ``'pi'``. + + To support loading external dependencies relative to the input + source, you can pass the ``base_url``. + """ + def __init__(self, events=None, *, tag=None, base_url=None, **kwargs): + XMLParser.__init__(self, **kwargs) + if events is None: + events = ('end',) + self._setBaseURL(base_url) + self._collectEvents(events, tag) + + def read_events(self): + return (<_SaxParserContext?>self._getPushParserContext()).events_iterator + + +cdef class ETCompatXMLParser(XMLParser): + """ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \ + dtd_validation=False, load_dtd=False, no_network=True, \ + ns_clean=False, recover=False, schema=None, \ + huge_tree=False, remove_blank_text=False, resolve_entities=True, \ + remove_comments=True, remove_pis=True, strip_cdata=True, \ + target=None, compact=True) + + An XML parser with an ElementTree compatible default setup. + + See the XMLParser class for details. + + This parser has ``remove_comments`` and ``remove_pis`` enabled by default + and thus ignores comments and processing instructions. 
+ """ + def __init__(self, *, encoding=None, attribute_defaults=False, + dtd_validation=False, load_dtd=False, no_network=True, + ns_clean=False, recover=False, schema=None, + huge_tree=False, remove_blank_text=False, resolve_entities=True, + remove_comments=True, remove_pis=True, strip_cdata=True, + target=None, compact=True): + XMLParser.__init__(self, + attribute_defaults=attribute_defaults, + dtd_validation=dtd_validation, + load_dtd=load_dtd, + no_network=no_network, + ns_clean=ns_clean, + recover=recover, + remove_blank_text=remove_blank_text, + huge_tree=huge_tree, + compact=compact, + resolve_entities=resolve_entities, + remove_comments=remove_comments, + remove_pis=remove_pis, + strip_cdata=strip_cdata, + target=target, + encoding=encoding, + schema=schema) + +# ET 1.2 compatible name +XMLTreeBuilder = ETCompatXMLParser + + +cdef XMLParser __DEFAULT_XML_PARSER +__DEFAULT_XML_PARSER = XMLParser() + +__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER) + +def set_default_parser(_BaseParser parser=None): + """set_default_parser(parser=None) + + Set a default parser for the current thread. This parser is used + globally whenever no parser is supplied to the various parse functions of + the lxml API. If this function is called without a parser (or if it is + None), the default parser is reset to the original configuration. + + Note that the pre-installed default parser is not thread-safe. Avoid the + default parser in multi-threaded environments. You can create a separate + parser for each thread explicitly or use a parser pool. 
+ """ + if parser is None: + parser = __DEFAULT_XML_PARSER + __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser) + +def get_default_parser(): + "get_default_parser()" + return __GLOBAL_PARSER_CONTEXT.getDefaultParser() + +############################################################ +## HTML parser +############################################################ + +cdef int _HTML_DEFAULT_PARSE_OPTIONS +_HTML_DEFAULT_PARSE_OPTIONS = ( + htmlparser.HTML_PARSE_RECOVER | + htmlparser.HTML_PARSE_NONET | + htmlparser.HTML_PARSE_COMPACT + ) + +cdef object _UNUSED = object() + +cdef class HTMLParser(_FeedParser): + """HTMLParser(self, encoding=None, remove_blank_text=False, \ + remove_comments=False, remove_pis=False, \ + no_network=True, target=None, schema: XMLSchema =None, \ + recover=True, compact=True, collect_ids=True, huge_tree=False) + + The HTML parser. + + This parser allows reading HTML into a normal XML tree. By + default, it can read broken (non well-formed) HTML, depending on + the capabilities of libxml2. Use the 'recover' option to switch + this off. + + Available boolean keyword arguments: + + - recover - try hard to parse through broken HTML (default: True) + - no_network - prevent network access for related files (default: True) + - remove_blank_text - discard empty text nodes that are ignorable (i.e. 
not actual text content)
    - remove_comments - discard comments
    - remove_pis - discard processing instructions
    - compact - save memory for short text content (default: True)
    - default_doctype - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids - use a hash table of XML IDs for fast access (default: True)
    - huge_tree - disable security restrictions and support very deep trees
      and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding (note: libiconv encoding name)
    - target - a parser target object that will receive the parse events
    - schema - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=_UNUSED,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options
        # Start from the module defaults and adjust individual libxml2 flags.
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS
        if remove_blank_text:
            parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS
        # The XOR operations below *flip* a flag relative to the defaults.
        # NOTE(review): this presumably relies on RECOVER/NONET/COMPACT being
        # set and NODEFDTD being unset in _HTML_DEFAULT_PARSE_OPTIONS, which
        # is defined elsewhere -- confirm before changing the defaults.
        if not recover:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
        if not default_doctype:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            parse_options = parse_options | xmlparser.XML_PARSE_HUGE

        # Any explicitly passed value (even False/None) triggers the warning,
        # since the sentinel default is the only way to detect "not passed".
        if strip_cdata is not _UNUSED:
            import warnings
            warnings.warn(
                "The 'strip_cdata' option of HTMLParser() has never done anything and will eventually be removed.",
                DeprecationWarning)
        # NOTE(review): the literal ``True`` is presumably the "parse as HTML"
        # switch of _BaseParser.__init__ -- confirm against its signature.
        _BaseParser.__init__(self, parse_options, True, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)


cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()


cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events. To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        # Without an explicit selection, only 'end' events are collected.
        if events is None:
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        """Return the events iterator of this parser's push parser context.

        The context is type-checked to be a ``_SaxParserContext`` before its
        ``events_iterator`` attribute is accessed.
        """
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator


############################################################
## helper functions for document creation
############################################################

cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    """Parse an in-memory unicode or bytes string into a new xmlDoc.

    If ``parser`` is None, the default parser of the global parser context
    is used.  A false ``filename`` is passed on as a NULL C string.

    Inputs whose in-memory size exceeds INT_MAX are wrapped in a
    StringIO/BytesIO object and parsed through the file-like code path
    instead of being handed over as a single buffer.
    """
    cdef char* c_filename
    cdef char* c_text
    cdef Py_ssize_t c_len
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if not filename:
        c_filename = NULL
    else:
        # keep the encoded object alive in a local while c_filename points
        # into it
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)
    if isinstance(text, unicode):
        if python.PyUnicode_IS_READY(text):
            # PEP-393 Unicode string
            c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
        else:
            # old Py_UNICODE string
            c_len = python.PyUnicode_GET_DATA_SIZE(text)
        if c_len > limits.INT_MAX:
            return (<_BaseParser>parser)._parseDocFromFilelike(
                StringIO(text), filename, None)
        return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
    else:
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, None)
        c_text = _cstr(text)
        return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)

cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    """Parse the file named by ``filename8`` into a new xmlDoc.

    Uses the default parser of the global parser context if ``parser`` is
    None.  ``filename8`` is converted with ``_cstr()``, so it is presumably
    an already encoded (bytes) file name -- verify against callers.
    """
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))

cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    """Parse a file-like ``source`` into a new xmlDoc.

    Uses the default parser of the global parser context if ``parser`` is
    None.  ``filename`` is passed through to the parser unchanged.
    """
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None)

cdef xmlDoc* _newXMLDoc() except NULL:
    """Create a new, empty XML document.

    The document encoding is set to "UTF-8" if libxml2 did not set one, and
    the document is registered with the global parser context via
    ``initDocDict()``.  Raises MemoryError if the allocation fails.
    """
    cdef xmlDoc* result
    result = tree.xmlNewDoc(NULL)
    if result is NULL:
        raise MemoryError()
    if result.encoding is NULL:
        result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result

cdef xmlDoc* _newHTMLDoc() except NULL:
    """Create a new, empty HTML document.

    The document is registered with the global parser context via
    ``initDocDict()``.  Raises MemoryError if the allocation fails.
    """
    cdef xmlDoc* result
    result = tree.htmlNewDoc(NULL, NULL)
    if result is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result

cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    """Copy ``c_doc``, including its content if ``recursive`` is non-zero.

    The GIL is released only for the recursive copy; the non-recursive copy
    runs with the GIL held.  Raises MemoryError if the copy fails.
    """
    cdef xmlDoc* result
    if recursive:
        with nogil:
            result = tree.xmlCopyDoc(c_doc, recursive)
    else:
        result = tree.xmlCopyDoc(c_doc, 0)
    if result is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result

cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    """Recursively copy the document and make c_new_root the new root node.

    Only the document node itself is copied from ``c_doc``; the subtree
    below ``c_new_root`` is then copied recursively into the new document
    and installed as its root element, followed by a copy of the nodes
    trailing ``c_new_root``.

    Raises MemoryError if either copy fails.  The partially built document
    is freed in that case instead of being leaked.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0)  # non recursive
    if result is NULL:
        # must not pass a NULL document on to initDocDict()/xmlDocCopyNode()
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1)  # recursive
    if c_node is NULL:
        # nothing else references the new document yet => free it here
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result

cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    """Recursively copy the element into the document. c_doc is not modified.

    The copy is created in the context of ``c_doc`` but is not linked into
    it here; the nodes trailing ``c_node`` are copied along via
    ``_copyTail()``.  Raises MemoryError if the copy fails.
    """
    cdef xmlNode* c_root
    c_root = tree.xmlDocCopyNode(c_node, c_doc, 1)  # recursive
    if c_root is NULL:
        raise MemoryError()
    _copyTail(c_node.next, c_root)
    return c_root


############################################################
## API level helper functions for _Document creation
############################################################

cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    """Parse ``source`` into a _Document, dispatching on the kind of source.

    - a string (after ``_getFSPathOrObject()``) is parsed as a file
      name/URL, and ``base_url``, if given, replaces the document URL
    - an object with ``getvalue()`` and ``tell()`` that is positioned at
      offset 0 is parsed from its complete in-memory value
    - any other object with a ``read`` attribute is parsed as a file-like

    Raises TypeError for anything else.
    """
    cdef _Document doc
    source = _getFSPathOrObject(source)
    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        # fix base URL if requested
        if base_url is not None:
            base_url = _encodeFilenameUTF8(base_url)
            # release the URL set by the parser before replacing it
            if doc._c_doc.URL is not NULL:
                tree.xmlFree(<char*>doc._c_doc.URL)
            doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    if base_url is not None:
        url = base_url
    else:
        url = _getFilenameForFile(source)

    if hasattr(source, 'getvalue') and hasattr(source, 'tell'):
        # StringIO - reading from start?
        if source.tell() == 0:
            return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, 'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError, f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'"

cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    """Parse the file at ``url`` and wrap the result in a _Document."""
    c_doc = _parseDocFromFile(url, parser)
    return _documentFactory(c_doc, parser)

cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    """Parse an in-memory string and wrap the result in a _Document.

    Raises ValueError for unicode input that carries an encoding
    declaration and for input that is neither unicode nor bytes.
    """
    if isinstance(text, unicode):
        if _hasEncodingDeclaration(text):
            raise ValueError(
                "Unicode strings with encoding declaration are not supported. "
                "Please use bytes input or XML fragments without declaration.")
    elif not isinstance(text, bytes):
        raise ValueError, "can only parse strings"
    c_doc = _parseDoc(text, url, parser)
    return _documentFactory(c_doc, parser)

cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    """Parse a file-like ``source`` and wrap the result in a _Document."""
    c_doc = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_doc, parser)