Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py')
 .venv/lib/python3.12/site-packages/bs4/builder/_lxml.py | 490 +++++++++++++++
 1 file changed, 490 insertions(+), 0 deletions(-)
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py b/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py
new file mode 100644
index 00000000..1f367da3
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py
@@ -0,0 +1,490 @@
+# encoding: utf-8
+from __future__ import annotations
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+__all__ = [
+    "LXMLTreeBuilderForXML",
+    "LXMLTreeBuilder",
+]
+
+
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    TYPE_CHECKING,
+    Union,
+)
+from typing_extensions import TypeAlias
+
+from io import BytesIO
+from io import StringIO
+from lxml import etree
+from bs4.element import (
+    AttributeDict,
+    XMLAttributeDict,
+    Comment,
+    Doctype,
+    NamespacedAttribute,
+    ProcessingInstruction,
+    XMLProcessingInstruction,
+)
+from bs4.builder import (
+    DetectsXMLParsedAsHTML,
+    FAST,
+    HTML,
+    HTMLTreeBuilder,
+    PERMISSIVE,
+    TreeBuilder,
+    XML,
+)
+from bs4.dammit import EncodingDetector
+from bs4.exceptions import ParserRejectedMarkup
+
+if TYPE_CHECKING:
+    from bs4._typing import (
+        _Encoding,
+        _Encodings,
+        _NamespacePrefix,
+        _NamespaceURL,
+        _NamespaceMapping,
+        _InvertedNamespaceMapping,
+        _RawMarkup,
+    )
+    from bs4 import BeautifulSoup
+
+LXML: str = "lxml"
+
+
+def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
+    "Invert a dictionary."
+    return {v: k for k, v in d.items()}
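+# Illustration (comment only, not in the original file): inverting
+# {"xml": "http://www.w3.org/XML/1998/namespace"} gives
+# {"http://www.w3.org/XML/1998/namespace": "xml"}; that is exactly how
+# DEFAULT_NSMAPS_INVERTED is derived from DEFAULT_NSMAPS below.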
+
+
+_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
+_ParserOrParserClass: TypeAlias = Union[
+    _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
+]
+
+
+class LXMLTreeBuilderForXML(TreeBuilder):
+    DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser
+
+    is_xml: bool = True
+
+    processing_instruction_class: Type[ProcessingInstruction]
+
+    NAME: str = "lxml-xml"
+    ALTERNATE_NAMES: Iterable[str] = ["xml"]
+
+    # Well, it's permissive by XML parser standards.
+    features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]
+
+    CHUNK_SIZE: int = 512
+
+    # This namespace mapping is specified in the XML Namespace
+    # standard.
+    DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")
+
+    DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)
+
+    nsmaps: List[Optional[_InvertedNamespaceMapping]]
+    empty_element_tags: Set[str]
+    parser: Any
+    _default_parser: Optional[etree.XMLParser]
+
+    # NOTE: If we parsed Element objects and looked at .sourceline,
+    # we'd be able to see the line numbers from the original document.
+    # But instead we build an XMLParser or HTMLParser object to serve
+    # as the target of parse messages, and those messages don't include
+    # line numbers.
+    # See: https://bugs.launchpad.net/lxml/+bug/1846906
+
+    def initialize_soup(self, soup: BeautifulSoup) -> None:
+        """Let the BeautifulSoup object know about the standard namespace
+        mapping.
+
+        :param soup: A `BeautifulSoup`.
+        """
+        # Beyond this point, self.soup is set, so we can assume (and
+        # assert) it's not None whenever necessary.
+        super().initialize_soup(soup)
+        self._register_namespaces(self.DEFAULT_NSMAPS)
+
+    def _register_namespaces(self, mapping: Dict[str, str]) -> None:
+        """Let the BeautifulSoup object know about namespaces encountered
+        while parsing the document.
+
+        This might be useful later on when creating CSS selectors.
+
+        This will track (almost) all namespaces, even ones that were
+        only in scope for part of the document. If two namespaces have
+        the same prefix, only the first one encountered will be
+        tracked. Un-prefixed namespaces are not tracked.
+
+        :param mapping: A dictionary mapping namespace prefixes to URIs.
+        """
+        assert self.soup is not None
+        for key, value in mapping.items():
+            # This is 'if key' and not 'if key is not None' because we
+            # don't track un-prefixed namespaces. Soupselect will
+            # treat an un-prefixed namespace as the default, which
+            # causes confusion in some cases.
+            if key and key not in self.soup._namespaces:
+                # Let the BeautifulSoup object know about a new namespace.
+                # If there are multiple namespaces defined with the same
+                # prefix, the first one in the document takes precedence.
+                self.soup._namespaces[key] = value
+
+    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
+        """Find the default parser for the given encoding.
+
+        :return: Either a parser object or a class, which
+          will be instantiated with default arguments.
+        """
+        if self._default_parser is not None:
+            return self._default_parser
+        return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)
+
+    def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
+        """Instantiate an appropriate parser for the given encoding.
+
+        :param encoding: A string.
+        :return: A parser object such as an `etree.XMLParser`.
+        """
+        # Use the default parser.
+        parser = self.default_parser(encoding)
+
+        if callable(parser):
+            # Instantiate the parser with default arguments
+            parser = parser(target=self, recover=True, encoding=encoding)
+        return parser
+
+    def __init__(
+        self,
+        parser: Optional[etree.XMLParser] = None,
+        empty_element_tags: Optional[Set[str]] = None,
+        **kwargs: Any,
+    ):
+        # TODO: Issue a warning if parser is present but not a
+        # callable, since that means there's no way to create new
+        # parsers for different encodings.
+        self._default_parser = parser
+        self.soup = None
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
+        if "attribute_dict_class" not in kwargs:
+            kwargs["attribute_dict_class"] = XMLAttributeDict
+        super().__init__(**kwargs)
+
+    def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
+        # Split the namespace URL out of a fully-qualified lxml tag
+        # name. Copied from lxml's src/lxml/sax.py.
+        if tag[0] == "{":
+            namespace, name = tag[1:].split("}", 1)
+            return (namespace, name)
+        else:
+            return (None, tag)
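+
+    # Illustrative examples (not part of the original file):
+    #   _getNsTag("{http://www.w3.org/2000/svg}svg") -> ("http://www.w3.org/2000/svg", "svg")
+    #   _getNsTag("svg") -> (None, "svg")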
+
+    def prepare_markup(
+        self,
+        markup: _RawMarkup,
+        user_specified_encoding: Optional[_Encoding] = None,
+        document_declared_encoding: Optional[_Encoding] = None,
+        exclude_encodings: Optional[_Encodings] = None,
+    ) -> Iterable[
+        Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
+    ]:
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        lxml really wants to get a bytestring and convert it to
+        Unicode itself. So instead of using UnicodeDammit to convert
+        the bytestring to Unicode using different encodings, this
+        implementation uses EncodingDetector to iterate over the
+        encodings, and tell lxml to try to parse the document as each
+        one in turn.
+
+        :param markup: Some markup -- hopefully a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
+            has undergone character replacement)
+
+            Each 4-tuple represents a strategy for converting the
+            document to Unicode and parsing it. Each strategy will be tried
+            in turn.
+        """
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+            # We're in HTML mode, so if we're given XML, that's worth
+            # noting.
+            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
+        if isinstance(markup, str):
+            # We were given Unicode. Maybe lxml can parse Unicode on
+            # this system?
+
+            # TODO: This is a workaround for
+            # https://bugs.launchpad.net/lxml/+bug/1948551.
+            # We can remove it once the upstream issue is fixed.
+            if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
+                markup = markup[1:]
+            yield markup, None, document_declared_encoding, False
+
+            # If lxml can't parse the Unicode directly, convert it to
+            # UTF-8 and tell lxml to parse it as UTF-8.
+            yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)
+
+            # Since the document was Unicode in the first place, there
+            # is no need to try any more strategies; we know this will
+            # work.
+            return
+
+        known_definite_encodings: List[_Encoding] = []
+        if user_specified_encoding:
+            # This was provided by the end-user; treat it as a known
+            # definite encoding per the algorithm laid out in the
+            # HTML5 spec. (See the EncodingDetector class for
+            # details.)
+            known_definite_encodings.append(user_specified_encoding)
+
+        user_encodings: List[_Encoding] = []
+        if document_declared_encoding:
+            # This was found in the document; treat it as a slightly
+            # lower-priority user encoding.
+            user_encodings.append(document_declared_encoding)
+
+        detector = EncodingDetector(
+            markup,
+            known_definite_encodings=known_definite_encodings,
+            user_encodings=user_encodings,
+            is_html=is_html,
+            exclude_encodings=exclude_encodings,
+        )
+        for encoding in detector.encodings:
+            yield (detector.markup, encoding, document_declared_encoding, False)
+
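+    # Sketch of how these strategies get used (illustrative): the
+    # BeautifulSoup constructor iterates over the 4-tuples yielded above
+    # and calls feed() once per strategy, moving on to the next candidate
+    # whenever the parser raises ParserRejectedMarkup.
+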
+    def feed(self, markup: _RawMarkup) -> None:
+        io: Union[BytesIO, StringIO]
+        if isinstance(markup, bytes):
+            io = BytesIO(markup)
+        elif isinstance(markup, str):
+            io = StringIO(markup)
+
+        # initialize_soup is called before feed, so we know this
+        # is not None.
+        assert self.soup is not None
+
+        # Call feed() at least once, even if the markup is empty,
+        # or the parser won't be initialized.
+        data = io.read(self.CHUNK_SIZE)
+        try:
+            self.parser = self.parser_for(self.soup.original_encoding)
+            self.parser.feed(data)
+            while len(data) != 0:
+                # Now call feed() on the rest of the data, chunk by chunk.
+                data = io.read(self.CHUNK_SIZE)
+                if len(data) != 0:
+                    self.parser.feed(data)
+            self.parser.close()
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
+            raise ParserRejectedMarkup(e)
+
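+    # Illustrative arithmetic: with CHUNK_SIZE = 512, a 2048-byte document
+    # is passed to the lxml parser in four feed() calls, followed by one
+    # close() call.
+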
+    def close(self) -> None:
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+
+    def start(
+        self,
+        tag: str | bytes,
+        attrs: Dict[str | bytes, str | bytes],
+        nsmap: _NamespaceMapping = {},
+    ) -> None:
+        # This is called by lxml code as a result of calling
+        # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
+        # is called.
+        assert self.soup is not None
+        assert isinstance(tag, str)
+
+        # We need to recreate the attribute dict for three
+        # reasons. First, for type checking, so we can assert there
+        # are no bytestrings in the keys or values. Second, because we
+        # need a mutable dict--lxml might send us an immutable
+        # dictproxy. Third, so we can handle namespaced attribute
+        # names by converting the keys to NamespacedAttributes.
+        new_attrs: Dict[Union[str, NamespacedAttribute], str] = (
+            self.attribute_dict_class()
+        )
+        for k, v in attrs.items():
+            assert isinstance(k, str)
+            assert isinstance(v, str)
+            new_attrs[k] = v
+
+        nsprefix: Optional[_NamespacePrefix] = None
+        namespace: Optional[_NamespaceURL] = None
+        # Invert each namespace map as it comes in.
+        if len(nsmap) == 0 and len(self.nsmaps) > 1:
+            # There are no new namespaces for this tag, but
+            # non-default namespaces are in play, so we need a
+            # separate tag stack to know when they end.
+            self.nsmaps.append(None)
+        elif len(nsmap) > 0:
+            # A new namespace mapping has come into play.
+
+            # First, let the BeautifulSoup object know about it.
+            self._register_namespaces(nsmap)
+
+            # Then, add it to our running list of inverted namespace
+            # mappings.
+            self.nsmaps.append(_invert(nsmap))
+
+            # The currently active namespace prefixes have
+            # changed. Calculate the new mapping so it can be stored
+            # with all Tag objects created while these prefixes are in
+            # scope.
+            current_mapping = dict(self.active_namespace_prefixes[-1])
+            current_mapping.update(nsmap)
+
+            # We should not track un-prefixed namespaces as we can only hold one
+            # and it will be recognized as the default namespace by soupsieve,
+            # which may be confusing in some situations.
+            if "" in current_mapping:
+                del current_mapping[""]
+            self.active_namespace_prefixes.append(current_mapping)
+
+            # Also treat the namespace mapping as a set of attributes on the
+            # tag, so we can recreate it later.
+            for prefix, namespace in nsmap.items():
+                attribute = NamespacedAttribute(
+                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
+                )
+                new_attrs[attribute] = namespace
+
+        # Namespaces are in play. Find any attributes that came in
+        # from lxml with namespaces attached to their names, and
+        # turn them into NamespacedAttribute objects.
+        final_attrs: AttributeDict = self.attribute_dict_class()
+        for attr, value in new_attrs.items():
+            namespace, attr = self._getNsTag(attr)
+            if namespace is None:
+                final_attrs[attr] = value
+            else:
+                nsprefix = self._prefix_for_namespace(namespace)
+                attr = NamespacedAttribute(nsprefix, attr, namespace)
+                final_attrs[attr] = value
+
+        namespace, tag = self._getNsTag(tag)
+        nsprefix = self._prefix_for_namespace(namespace)
+        self.soup.handle_starttag(
+            tag,
+            namespace,
+            nsprefix,
+            final_attrs,
+            namespaces=self.active_namespace_prefixes[-1],
+        )
+
+    def _prefix_for_namespace(
+        self, namespace: Optional[_NamespaceURL]
+    ) -> Optional[_NamespacePrefix]:
+        """Find the currently active prefix for the given namespace."""
+        if namespace is None:
+            return None
+        for inverted_nsmap in reversed(self.nsmaps):
+            if inverted_nsmap is not None and namespace in inverted_nsmap:
+                return inverted_nsmap[namespace]
+        return None
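+
+    # Illustrative: if self.nsmaps were [DEFAULT_NSMAPS_INVERTED, None,
+    # {"http://www.w3.org/2000/svg": "svg"}], looking up the SVG namespace
+    # URL would return "svg", while an unregistered URL returns None.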
+
+    def end(self, name: str | bytes) -> None:
+        assert self.soup is not None
+        assert isinstance(name, str)
+        self.soup.endData()
+        namespace, name = self._getNsTag(name)
+        nsprefix = self._prefix_for_namespace(namespace)
+        self.soup.handle_endtag(name, nsprefix)
+        if len(self.nsmaps) > 1:
+            # This tag, or one of its parents, introduced a namespace
+            # mapping, so pop it off the stack.
+            out_of_scope_nsmap = self.nsmaps.pop()
+
+            if out_of_scope_nsmap is not None:
+                # This tag introduced a namespace mapping which is no
+                # longer in scope. Recalculate the currently active
+                # namespace prefixes.
+                self.active_namespace_prefixes.pop()
+
+    def pi(self, target: str, data: str) -> None:
+        assert self.soup is not None
+        self.soup.endData()
+        data = target + " " + data
+        self.soup.handle_data(data)
+        self.soup.endData(self.processing_instruction_class)
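+
+    # Illustrative: pi("xml-stylesheet", 'href="style.xsl"') stores the
+    # string 'xml-stylesheet href="style.xsl"' as a ProcessingInstruction
+    # (or XMLProcessingInstruction, depending on the mode set in
+    # prepare_markup()).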
+
+    def data(self, data: str | bytes) -> None:
+        assert self.soup is not None
+        assert isinstance(data, str)
+        self.soup.handle_data(data)
+
+    def doctype(self, name: str, pubid: str, system: str) -> None:
+        assert self.soup is not None
+        self.soup.endData()
+        doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
+        self.soup.handle_data(doctype_string)
+        self.soup.endData(containerClass=Doctype)
+
+    def comment(self, text: str | bytes) -> None:
+        "Handle comments as Comment objects."
+        assert self.soup is not None
+        assert isinstance(text, str)
+        self.soup.endData()
+        self.soup.handle_data(text)
+        self.soup.endData(Comment)
+
+    def test_fragment_to_document(self, fragment: str) -> str:
+        """See `TreeBuilder`."""
+        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+
+
+class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+    NAME: str = LXML
+    ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]
+
+    features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
+    is_xml: bool = False
+
+    def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
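+        # Note (added comment): this returns the parser class itself; the
+        # inherited parser_for() sees that it is callable and instantiates
+        # it with target=self, recover=True and the chosen encoding.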
+        return etree.HTMLParser
+
+    def feed(self, markup: _RawMarkup) -> None:
+        # We know self.soup is set by the time feed() is called.
+        assert self.soup is not None
+        encoding = self.soup.original_encoding
+        try:
+            self.parser = self.parser_for(encoding)
+            self.parser.feed(markup)
+            self.parser.close()
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
+            raise ParserRejectedMarkup(e)
+
+    def test_fragment_to_document(self, fragment: str) -> str:
+        """See `TreeBuilder`."""
+        return "<html><body>%s</body></html>" % fragment