diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/lxml/saxparser.pxi | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/lxml/saxparser.pxi')
-rw-r--r-- | .venv/lib/python3.12/site-packages/lxml/saxparser.pxi | 875 |
1 files changed, 875 insertions, 0 deletions
# Source of this listing:
#   diff --git a/.venv/lib/python3.12/site-packages/lxml/saxparser.pxi
#              b/.venv/lib/python3.12/site-packages/lxml/saxparser.pxi
#   new file mode 100644, index 00000000..dc03df9a, hunk @@ -0,0 +1,875 @@

# SAX-like interfaces

class XMLSyntaxAssertionError(XMLSyntaxError, AssertionError):
    """
    An XMLSyntaxError that additionally inherits from AssertionError for
    ElementTree / backwards compatibility reasons.

    This class may get replaced by a plain XMLSyntaxError in a future version.
    """
    def __init__(self, message):
        XMLSyntaxError.__init__(self, message, None, 0, 1)


# Bit flags describing which SAX callbacks a parser target wants to receive
# (stored in ``_SaxParserTarget._sax_event_filter``).
ctypedef enum _SaxParserEvents:
    SAX_EVENT_START    = 1 << 0
    SAX_EVENT_END      = 1 << 1
    SAX_EVENT_DATA     = 1 << 2
    SAX_EVENT_DOCTYPE  = 1 << 3
    SAX_EVENT_PI       = 1 << 4
    SAX_EVENT_COMMENT  = 1 << 5
    SAX_EVENT_START_NS = 1 << 6
    SAX_EVENT_END_NS   = 1 << 7

# Bit flags describing which parse events get collected into the
# events iterator (stored in ``_SaxParserContext._event_filter``).
ctypedef enum _ParseEventFilter:
    PARSE_EVENT_FILTER_START     = 1 << 0
    PARSE_EVENT_FILTER_END       = 1 << 1
    PARSE_EVENT_FILTER_START_NS  = 1 << 2
    PARSE_EVENT_FILTER_END_NS    = 1 << 3
    PARSE_EVENT_FILTER_COMMENT   = 1 << 4
    PARSE_EVENT_FILTER_PI        = 1 << 5


cdef int _buildParseEventFilter(events) except -1:
    """Translate an iterable of event names into a ``_ParseEventFilter`` bit mask.

    Accepted names are 'start', 'end', 'start-ns', 'end-ns', 'comment' and 'pi'.
    Raises ValueError for any other name.
    """
    cdef int event_filter = 0
    for event in events:
        if event == 'start':
            event_filter |= PARSE_EVENT_FILTER_START
        elif event == 'end':
            event_filter |= PARSE_EVENT_FILTER_END
        elif event == 'start-ns':
            event_filter |= PARSE_EVENT_FILTER_START_NS
        elif event == 'end-ns':
            event_filter |= PARSE_EVENT_FILTER_END_NS
        elif event == 'comment':
            event_filter |= PARSE_EVENT_FILTER_COMMENT
        elif event == 'pi':
            event_filter |= PARSE_EVENT_FILTER_PI
        else:
            raise ValueError, f"invalid event name '{event}'"
    return event_filter


cdef class _SaxParserTarget:
    """Base class of parser targets.

    All handlers are no-ops here; subclasses override the ones they need and
    set the matching ``SAX_EVENT_*`` bits in ``_sax_event_filter``.
    """
    cdef int _sax_event_filter

    cdef _handleSaxStart(self, tag, attrib, nsmap):
        return None
    cdef _handleSaxEnd(self, tag):
        return None
    cdef int _handleSaxData(self, data) except -1:
        return 0
    cdef int _handleSaxDoctype(self, root_tag, public_id, system_id) except -1:
        return 0
    cdef _handleSaxPi(self, target, data):
        return None
    cdef _handleSaxComment(self, comment):
        return None
    cdef _handleSaxStartNs(self, prefix, uri):
        return None
    cdef _handleSaxEndNs(self, prefix):
        return None


#@cython.final
@cython.internal
@cython.no_gc_clear  # Required because parent class uses it - Cython bug.
cdef class _SaxParserContext(_ParserContext):
    """This class maps SAX2 events to parser target events.
    """
    cdef _SaxParserTarget _target
    cdef _BaseParser _parser
    # original libxml2 SAX callbacks, saved before being overridden
    cdef xmlparser.startElementNsSAX2Func _origSaxStart
    cdef xmlparser.endElementNsSAX2Func _origSaxEnd
    cdef xmlparser.startElementSAXFunc _origSaxStartNoNs
    cdef xmlparser.endElementSAXFunc _origSaxEndNoNs
    cdef xmlparser.charactersSAXFunc _origSaxData
    cdef xmlparser.cdataBlockSAXFunc _origSaxCData
    cdef xmlparser.internalSubsetSAXFunc _origSaxDoctype
    cdef xmlparser.commentSAXFunc _origSaxComment
    cdef xmlparser.processingInstructionSAXFunc _origSaxPI
    cdef xmlparser.startDocumentSAXFunc _origSaxStartDocument

    # for event collecting
    cdef int _event_filter
    cdef list _ns_stack
    cdef list _node_stack
    cdef _ParseEventsIterator events_iterator

    # for iterparse
    cdef _Element _root
    cdef _MultiTagMatcher _matcher

    def __cinit__(self, _BaseParser parser):
        self._ns_stack = []
        self._node_stack = []
        self._parser = parser
        self.events_iterator = _ParseEventsIterator()

    cdef void _setSaxParserTarget(self, _SaxParserTarget target) noexcept:
        self._target = target

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """Initialise the base context, then hook up either the parser target
        or, if there is none and events were requested, the event collectors.
        """
        _ParserContext._initParserContext(self, c_ctxt)
        if self._target is not None:
            self._connectTarget(c_ctxt)
        elif self._event_filter:
            self._connectEvents(c_ctxt)

    cdef void _connectTarget(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """Wrap original SAX2 callbacks to call into parser target.
        """
        sax = c_ctxt.sax
        self._origSaxStart = sax.startElementNs = NULL
        self._origSaxStartNoNs = sax.startElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_START |
                                             SAX_EVENT_START_NS |
                                             SAX_EVENT_END_NS):
            # intercept => overwrite orig callback
            # FIXME: also intercept on when collecting END events
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.startElementNs = _handleSaxTargetStart
            if self._target._sax_event_filter & SAX_EVENT_START:
                sax.startElement = _handleSaxTargetStartNoNs

        self._origSaxEnd = sax.endElementNs = NULL
        self._origSaxEndNoNs = sax.endElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_END |
                                             SAX_EVENT_END_NS):
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.endElementNs = _handleSaxEnd
            if self._target._sax_event_filter & SAX_EVENT_END:
                sax.endElement = _handleSaxEndNoNs

        self._origSaxData = sax.characters = sax.cdataBlock = NULL
        if self._target._sax_event_filter & SAX_EVENT_DATA:
            sax.characters = sax.cdataBlock = _handleSaxData

        # doctype propagation is always required for entity replacement
        self._origSaxDoctype = sax.internalSubset
        if self._target._sax_event_filter & SAX_EVENT_DOCTYPE:
            sax.internalSubset = _handleSaxTargetDoctype

        self._origSaxPI = sax.processingInstruction = NULL
        if self._target._sax_event_filter & SAX_EVENT_PI:
            sax.processingInstruction = _handleSaxTargetPI

        self._origSaxComment = sax.comment = NULL
        if self._target._sax_event_filter & SAX_EVENT_COMMENT:
            sax.comment = _handleSaxTargetComment

        # enforce entity replacement
        sax.reference = NULL
        c_ctxt.replaceEntities = 1

    cdef void _connectEvents(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """Wrap original SAX2 callbacks to collect parse events without parser target.
        """
        sax = c_ctxt.sax
        self._origSaxStartDocument = sax.startDocument
        sax.startDocument = _handleSaxStartDocument

        # only override "start" event handler if needed
        self._origSaxStart = sax.startElementNs
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_START_NS |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.startElementNs = <xmlparser.startElementNsSAX2Func>_handleSaxStart

        self._origSaxStartNoNs = sax.startElement
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END):
            sax.startElement = <xmlparser.startElementSAXFunc>_handleSaxStartNoNs

        # only override "end" event handler if needed
        self._origSaxEnd = sax.endElementNs
        if self._event_filter == 0 or \
               self._event_filter & (PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.endElementNs = <xmlparser.endElementNsSAX2Func>_handleSaxEnd

        self._origSaxEndNoNs = sax.endElement
        if self._event_filter == 0 or \
               self._event_filter & PARSE_EVENT_FILTER_END:
            sax.endElement = <xmlparser.endElementSAXFunc>_handleSaxEndNoNs

        self._origSaxComment = sax.comment
        if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
            sax.comment = <xmlparser.commentSAXFunc>_handleSaxComment

        self._origSaxPI = sax.processingInstruction
        if self._event_filter & PARSE_EVENT_FILTER_PI:
            sax.processingInstruction = <xmlparser.processingInstructionSAXFunc>_handleSaxPIEvent

    cdef _setEventFilter(self, events, tag):
        """Configure which events to collect and, optionally, a tag matcher.

        No matcher is used when no events are selected, or when ``tag`` is
        None or '*'.
        """
        self._event_filter = _buildParseEventFilter(events)
        if not self._event_filter or tag is None or tag == '*':
            self._matcher = None
        else:
            self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)

    cdef int startDocument(self, xmlDoc* c_doc) except -1:
        """Create the document proxy for ``c_doc`` and prepare the tag matcher."""
        try:
            self._doc = _documentFactory(c_doc, self._parser)
        finally:
            self._parser = None  # clear circular reference ASAP
        if self._matcher is not None:
            self._matcher.cacheTags(self._doc, True)  # force entry in libxml2 dict
        return 0

    cdef int pushEvent(self, event, xmlNode* c_node) except -1:
        """Append an ``(event, node)`` pair for ``c_node`` to the event list."""
        cdef _Element root
        if self._root is None:
            root = self._doc.getroot()
            if root is not None and root._c_node.type == tree.XML_ELEMENT_NODE:
                self._root = root
        node = _elementFactory(self._doc, c_node)
        self.events_iterator._events.append( (event, node) )
        return 0

    cdef int flushEvents(self) except -1:
        """Emit the pending 'end' / 'end-ns' events of all still-open elements."""
        events = self.events_iterator._events
        while self._node_stack:
            events.append( ('end', self._node_stack.pop()) )
            _pushSaxNsEndEvents(self)
        while self._ns_stack:
            _pushSaxNsEndEvents(self)
        # explicit success value, consistent with the other "except -1" functions
        return 0

    cdef void _handleSaxException(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """Record the current exception and make libxml2 stop parsing."""
        if c_ctxt.errNo == xmlerror.XML_ERR_OK:
            c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR
        # stop parsing immediately
        c_ctxt.wellFormed = 0
        c_ctxt.disableSAX = 1
        c_ctxt.instate = xmlparser.XML_PARSER_EOF
        self._store_raised()


@cython.final
@cython.internal
cdef class _ParseEventsIterator:
    """A reusable parse events iterator"""
    cdef list _events
    cdef int _event_index

    def __cinit__(self):
        self._events = []
        self._event_index = 0

    def __iter__(self):
        return self

    def __next__(self):
        cdef int event_index = self._event_index
        events = self._events
        # Drop already consumed events once 2**10 of them piled up or at least
        # half of the list has been consumed (which includes "list exhausted").
        if event_index >= 2**10 or event_index * 2 >= len(events):
            if event_index:
                # clean up from time to time
                del events[:event_index]
                self._event_index = event_index = 0
            if event_index >= len(events):
                raise StopIteration
        item = events[event_index]
        self._event_index = event_index + 1
        return item


cdef list _build_prefix_uri_list(_SaxParserContext context, int c_nb_namespaces,
                                 const_xmlChar** c_namespaces):
    "Build [(prefix, uri)] list of declared namespaces."
    cdef int i
    namespaces = []
    # c_namespaces holds consecutive (prefix, uri) pointer pairs
    for i in xrange(c_nb_namespaces):
        namespaces.append((funicodeOrEmpty(c_namespaces[0]), funicode(c_namespaces[1])))
        c_namespaces += 2
    return namespaces


cdef void _handleSaxStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) noexcept with gil:
    """SAX2 start-element callback used when collecting events without a target.

    Calls the original libxml2 callback and then records 'start-ns' / 'start'
    events as selected by the context's event filter.
    """
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    cdef int event_filter = context._event_filter
    try:
        if (c_nb_namespaces and
                event_filter & (PARSE_EVENT_FILTER_START_NS |
                                PARSE_EVENT_FILTER_END_NS)):
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)
            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))
        else:
            declared_namespaces = None

        context._origSaxStart(c_ctxt, c_localname, c_prefix, c_namespace,
                              c_nb_namespaces, c_namespaces, c_nb_attributes,
                              c_nb_defaulted, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
            # The HTML parser in libxml2 reports the missing opening tags when it finds
            # misplaced ones, but with tag names from C string constants that ignore the
            # parser dict.  Thus, we need to intern the name ourselves.
            c_localname = tree.xmlDictLookup(c_ctxt.dict, c_localname, -1)
            if c_localname is NULL:
                raise MemoryError()

        if event_filter & PARSE_EVENT_FILTER_END_NS:
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace, c_localname, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxTargetStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) noexcept with gil:
    """SAX2 start-element callback used when parsing into a parser target.

    Builds the attribute and namespace mappings, passes them to the target
    and records parse events as selected by the context's event filter.
    """
    cdef int i
    cdef size_t c_len
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private

    cdef int event_filter = context._event_filter
    cdef int sax_event_filter = context._target._sax_event_filter
    try:
        if c_nb_namespaces:
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)

            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))

            if sax_event_filter & SAX_EVENT_START_NS:
                for prefix, uri in declared_namespaces:
                    context._target._handleSaxStartNs(prefix, uri)
        else:
            declared_namespaces = None

        if sax_event_filter & SAX_EVENT_START:
            if c_nb_defaulted > 0:
                # only add default attributes if we asked for them
                if (c_ctxt.loadsubset & xmlparser.XML_COMPLETE_ATTRS) == 0:
                    c_nb_attributes -= c_nb_defaulted
            if c_nb_attributes == 0:
                attrib = IMMUTABLE_EMPTY_MAPPING
            else:
                attrib = {}
                # Each attribute occupies five pointers; as used below:
                # [0] local name, [2] namespace, [3] value start, [4] value end.
                for i in xrange(c_nb_attributes):
                    name = _namespacedNameFromNsName(
                        c_attributes[2], c_attributes[0])
                    if c_attributes[3] is NULL:
                        value = ''
                    else:
                        c_len = c_attributes[4] - c_attributes[3]
                        value = c_attributes[3][:c_len].decode('utf8')
                    attrib[name] = value
                    c_attributes += 5

            nsmap = dict(declared_namespaces) if c_nb_namespaces else IMMUTABLE_EMPTY_MAPPING

            element = _callTargetSaxStart(
                context, c_ctxt,
                _namespacedNameFromNsName(c_namespace, c_localname),
                attrib, nsmap)
        else:
            element = None

        if (event_filter & PARSE_EVENT_FILTER_END_NS or
                sax_event_filter & SAX_EVENT_END_NS):
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace,
                               c_localname, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxStartNoNs(void* ctxt, const_xmlChar* c_name,
                              const_xmlChar** c_attributes) noexcept with gil:
    """Non-namespace start-element callback used when collecting events
    without a target."""
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._origSaxStartNoNs(c_ctxt, c_name, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
            # The HTML parser in libxml2 reports the missing opening tags when it finds
            # misplaced ones, but with tag names from C string constants that ignore the
            # parser dict.  Thus, we need to intern the name ourselves.
            c_name = tree.xmlDictLookup(c_ctxt.dict, c_name, -1)
            if c_name is NULL:
                raise MemoryError()
        if context._event_filter & (PARSE_EVENT_FILTER_END |
                                    PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, NULL, c_name, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxTargetStartNoNs(void* ctxt, const_xmlChar* c_name,
                                    const_xmlChar** c_attributes) noexcept with gil:
    """Non-namespace start-element callback used when parsing into a target."""
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if c_attributes is NULL:
            attrib = IMMUTABLE_EMPTY_MAPPING
        else:
            attrib = {}
            # NULL-terminated sequence of (name, value) pointer pairs
            while c_attributes[0] is not NULL:
                name = funicode(c_attributes[0])
                attrib[name] = funicodeOrEmpty(c_attributes[1])
                c_attributes += 2
        element = _callTargetSaxStart(
            context, c_ctxt, funicode(c_name),
            attrib, IMMUTABLE_EMPTY_MAPPING)
        if context._event_filter & (PARSE_EVENT_FILTER_END |
                                    PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, NULL, c_name, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef _callTargetSaxStart(_SaxParserContext context,
                         xmlparser.xmlParserCtxt* c_ctxt,
                         tag, attrib, nsmap):
    """Call the target's start handler and return whatever it returns.

    If the result is an ``_Element``, its C node gets the current input line
    number, capped at 65535.
    """
    element = context._target._handleSaxStart(tag, attrib, nsmap)
    if element is not None and c_ctxt.input is not NULL:
        if isinstance(element, _Element):
            (<_Element>element)._c_node.line = (
                <unsigned short>c_ctxt.input.line
                if c_ctxt.input.line < 65535 else 65535)
    return element


cdef int _pushSaxStartEvent(_SaxParserContext context,
                            xmlparser.xmlParserCtxt* c_ctxt,
                            const_xmlChar* c_href,
                            const_xmlChar* c_name, node) except -1:
    """Record a 'start' event for a matching tag and, when parsing without a
    target, remember the node for its later 'end' event."""
    if (context._matcher is None or
            context._matcher.matchesNsTag(c_href, c_name)):
        if node is None and context._target is None:
            assert context._doc is not None
            node = _elementFactory(context._doc, c_ctxt.node)
        if context._event_filter & PARSE_EVENT_FILTER_START:
            context.events_iterator._events.append(('start', node))
        if (context._target is None and
                context._event_filter & PARSE_EVENT_FILTER_END):
            context._node_stack.append(node)
    return 0


cdef void _handleSaxEnd(void* ctxt, const_xmlChar* c_localname,
                        const_xmlChar* c_prefix,
                        const_xmlChar* c_namespace) noexcept with gil:
    """SAX2 end-element callback, shared by target parsing and event collection."""
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if context._target is not None:
            if context._target._sax_event_filter & SAX_EVENT_END:
                node = context._target._handleSaxEnd(
                    _namespacedNameFromNsName(c_namespace, c_localname))
            else:
                node = None
        else:
            context._origSaxEnd(c_ctxt, c_localname, c_prefix, c_namespace)
            node = None
        _pushSaxEndEvent(context, c_namespace, c_localname, node)
        _pushSaxNsEndEvents(context)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxEndNoNs(void* ctxt, const_xmlChar* c_name) noexcept with gil:
    """Non-namespace end-element callback, shared by target parsing and
    event collection."""
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if context._target is not None:
            node = context._target._handleSaxEnd(funicode(c_name))
        else:
            context._origSaxEndNoNs(c_ctxt, c_name)
            node = None
        _pushSaxEndEvent(context, NULL, c_name, node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef int _pushSaxNsEndEvents(_SaxParserContext context) except -1:
    """Pop the namespace declarations of the element being closed and report
    them, in reverse order, to the target and/or as 'end-ns' events."""
    cdef bint build_events = context._event_filter & PARSE_EVENT_FILTER_END_NS
    cdef bint call_target = (
        context._target is not None
        and context._target._sax_event_filter & SAX_EVENT_END_NS)
    if not build_events and not call_target:
        return 0

    cdef list declared_namespaces = context._ns_stack.pop()
    if declared_namespaces is None:
        return 0

    cdef tuple prefix_uri
    for prefix_uri in reversed(declared_namespaces):
        if call_target:
            context._target._handleSaxEndNs(prefix_uri[0])
        if build_events:
            context.events_iterator._events.append(('end-ns', None))

    return 0


cdef int _pushSaxEndEvent(_SaxParserContext context,
                          const_xmlChar* c_href,
                          const_xmlChar* c_name, node) except -1:
    """Record an 'end' event for a matching tag if 'end' events are collected.

    Without a target, the node is taken from the context's node stack instead
    of the ``node`` argument.
    """
    if context._event_filter & PARSE_EVENT_FILTER_END:
        if (context._matcher is None or
                context._matcher.matchesNsTag(c_href, c_name)):
            if context._target is None:
                node = context._node_stack.pop()
            context.events_iterator._events.append(('end', node))
    return 0


cdef void _handleSaxData(void* ctxt, const_xmlChar* c_data, int data_len) noexcept with gil:
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._target._handleSaxData(
            c_data[:data_len].decode('utf8'))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxTargetDoctype(void* ctxt, const_xmlChar* c_name,
                                  const_xmlChar* c_public,
                                  const_xmlChar* c_system) noexcept with gil:
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._target._handleSaxDoctype(
            funicodeOrNone(c_name),
            funicodeOrNone(c_public),
            funicodeOrNone(c_system))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxStartDocument(void* ctxt) noexcept with gil:
    """Start-document callback: run the original handler, then let the
    context set up its document proxy."""
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    context._origSaxStartDocument(ctxt)
    c_doc = c_ctxt.myDoc
    try:
        context.startDocument(c_doc)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxTargetPI(void* ctxt, const_xmlChar* c_target,
                             const_xmlChar* c_data) noexcept with gil:
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        pi = context._target._handleSaxPi(
            funicodeOrNone(c_target),
            funicodeOrEmpty(c_data))
        if context._event_filter & PARSE_EVENT_FILTER_PI:
            context.events_iterator._events.append(('pi', pi))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxPIEvent(void* ctxt, const_xmlChar* target,
                            const_xmlChar* data) noexcept with gil:
    # can only be called when collecting pi events
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    context._origSaxPI(ctxt, target, data)
    c_node = _findLastEventNode(c_ctxt)
    if c_node is NULL:
        return
    try:
        context.pushEvent('pi', c_node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxTargetComment(void* ctxt, const_xmlChar* c_data) noexcept with gil:
    # can only be called if parsing with a target
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        comment = context._target._handleSaxComment(funicodeOrEmpty(c_data))
        if context._event_filter & PARSE_EVENT_FILTER_COMMENT:
            context.events_iterator._events.append(('comment', comment))
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef void _handleSaxComment(void* ctxt, const_xmlChar* text) noexcept with gil:
    # can only be called when collecting comment events
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    context._origSaxComment(ctxt, text)
    c_node = _findLastEventNode(c_ctxt)
    if c_node is NULL:
        return
    try:
        context.pushEvent('comment', c_node)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions


cdef inline xmlNode* _findLastEventNode(xmlparser.xmlParserCtxt* c_ctxt):
    # this mimics what libxml2 creates for comments/PIs
    if c_ctxt.inSubset == 1:
        return c_ctxt.myDoc.intSubset.last
    elif c_ctxt.inSubset == 2:
        return c_ctxt.myDoc.extSubset.last
    elif c_ctxt.node is NULL:
        return c_ctxt.myDoc.last
    elif c_ctxt.node.type == tree.XML_ELEMENT_NODE:
        return c_ctxt.node.last
    else:
        return c_ctxt.node.next


############################################################
## ET compatible XML tree builder
############################################################

cdef class TreeBuilder(_SaxParserTarget):
    """TreeBuilder(self, element_factory=None, parser=None,
                   comment_factory=None, pi_factory=None,
                   insert_comments=True, insert_pis=True)

    Parser target that builds a tree from parse event callbacks.

    The factory arguments can be used to influence the creation of
    elements, comments and processing instructions.

    By default, comments and processing instructions are inserted into
    the tree, but they can be ignored by passing the respective flags.

    The final tree is returned by the ``close()`` method.
    """
    cdef _BaseParser _parser
    cdef object _factory
    cdef object _comment_factory
    cdef object _pi_factory
    cdef list _data
    cdef list _element_stack
    cdef object _element_stack_pop
    cdef _Element _last  # may be None
    cdef bint _in_tail
    cdef bint _insert_comments
    cdef bint _insert_pis

    def __init__(self, *, element_factory=None, parser=None,
                 comment_factory=None, pi_factory=None,
                 bint insert_comments=True, bint insert_pis=True):
        self._sax_event_filter = \
            SAX_EVENT_START | SAX_EVENT_END | SAX_EVENT_DATA | \
            SAX_EVENT_PI | SAX_EVENT_COMMENT
        self._data = []  # data collector
        self._element_stack = []  # element stack
        self._element_stack_pop = self._element_stack.pop
        self._last = None  # last element
        self._in_tail = 0  # true if we're after an end tag
        self._factory = element_factory
        self._comment_factory = comment_factory if comment_factory is not None else Comment
        self._pi_factory = pi_factory if pi_factory is not None else ProcessingInstruction
        self._insert_comments = insert_comments
        self._insert_pis = insert_pis
        self._parser = parser

    @cython.final
    cdef int _flush(self) except -1:
        """Attach the collected character data to the last node, as its tail
        if we are behind an end tag and as its text otherwise, then clear
        the data buffer."""
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._in_tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            del self._data[:]
        return 0

    # internal SAX event handlers

    @cython.final
    cdef _handleSaxStart(self, tag, attrib, nsmap):
        self._flush()
        if self._factory is not None:
            # user provided factory: it only receives tag and attributes
            self._last = self._factory(tag, attrib)
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
        elif self._element_stack:
            self._last = _makeSubElement(
                self._element_stack[-1], tag, None, None, attrib, nsmap, None)
        else:
            self._last = _makeElement(
                tag, NULL, None, self._parser, None, None, attrib, nsmap, None)
        self._element_stack.append(self._last)
        self._in_tail = 0
        return self._last

    @cython.final
    cdef _handleSaxEnd(self, tag):
        self._flush()
        self._last = self._element_stack_pop()
        self._in_tail = 1
        return self._last

    @cython.final
    cdef int _handleSaxData(self, data) except -1:
        self._data.append(data)
        # explicit success value, consistent with the other "except -1" functions
        return 0

    @cython.final
    cdef _handleSaxPi(self, target, data):
        elem = self._pi_factory(target, data)
        if self._insert_pis:
            self._flush()
            self._last = elem
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
            self._in_tail = 1
        # Always return the PI that was just created.  Returning self._last
        # here would hand back the previously handled node (or None) when
        # insert_pis is false; _handleSaxComment returns elem in the same way.
        return elem

    @cython.final
    cdef _handleSaxComment(self, comment):
        elem = self._comment_factory(comment)
        if self._insert_comments:
            self._flush()
            self._last = elem
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
            self._in_tail = 1
        return elem

    # Python level event handlers

    def close(self):
        """close(self)

        Flushes the builder buffers, and returns the toplevel document
        element.  Raises XMLSyntaxError on inconsistencies.
        """
        if self._element_stack:
            raise XMLSyntaxAssertionError("missing end tags")
        # TODO: this does not necessarily seem like an error case.  Why not just return None?
        if self._last is None:
            raise XMLSyntaxAssertionError("missing toplevel element")
        return self._last

    def data(self, data):
        """data(self, data)

        Adds text to the current element.  The value should be either an
        8-bit string containing ASCII text, or a Unicode string.
        """
        self._handleSaxData(data)

    def start(self, tag, attrs, nsmap=None):
        """start(self, tag, attrs, nsmap=None)

        Opens a new element.
        """
        if nsmap is None:
            nsmap = IMMUTABLE_EMPTY_MAPPING
        return self._handleSaxStart(tag, attrs, nsmap)

    def end(self, tag):
        """end(self, tag)

        Closes the current element.
        """
        element = self._handleSaxEnd(tag)
        assert self._last.tag == tag,\
               f"end tag mismatch (expected {self._last.tag}, got {tag})"
        return element

    def pi(self, target, data=None):
        """pi(self, target, data=None)

        Creates a processing instruction using the factory, appends it
        (unless disabled) and returns it.
        """
        return self._handleSaxPi(target, data)

    def comment(self, comment):
        """comment(self, comment)

        Creates a comment using the factory, appends it (unless disabled)
        and returns it.
        """
        return self._handleSaxComment(comment)