path: root/.venv/lib/python3.12/site-packages/bs4/builder
author    S. Solomon Darnell  2025-03-28 21:52:21 -0500
committer S. Solomon Darnell  2025-03-28 21:52:21 -0500
commit    4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree      ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/builder
parent    cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
two versions of R2R are here (HEAD, master)
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/builder')
-rw-r--r--  .venv/lib/python3.12/site-packages/bs4/builder/__init__.py     848
-rw-r--r--  .venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py    594
-rw-r--r--  .venv/lib/python3.12/site-packages/bs4/builder/_htmlparser.py  474
-rw-r--r--  .venv/lib/python3.12/site-packages/bs4/builder/_lxml.py        490
4 files changed, 2406 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py b/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py
new file mode 100644
index 00000000..5f2b38de
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py
@@ -0,0 +1,848 @@
+from __future__ import annotations
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+from collections import defaultdict
+import re
+from types import ModuleType
+from typing import (
+ Any,
+ cast,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Pattern,
+ Set,
+ Tuple,
+ Type,
+ TYPE_CHECKING,
+)
+import warnings
+import sys
+from bs4.element import (
+ AttributeDict,
+ AttributeValueList,
+ CharsetMetaAttributeValue,
+ ContentMetaAttributeValue,
+ RubyParenthesisString,
+ RubyTextString,
+ Stylesheet,
+ Script,
+ TemplateString,
+ nonwhitespace_re,
+)
+
+# Exceptions were moved to their own module in 4.13. Import here for
+# backwards compatibility.
+from bs4.exceptions import ParserRejectedMarkup
+
+from bs4._typing import (
+ _AttributeValues,
+ _RawAttributeValue,
+)
+
+from bs4._warnings import XMLParsedAsHTMLWarning
+
+if TYPE_CHECKING:
+ from bs4 import BeautifulSoup
+ from bs4.element import (
+ NavigableString,
+ Tag,
+ )
+ from bs4._typing import (
+ _AttributeValue,
+ _Encoding,
+ _Encodings,
+ _RawOrProcessedAttributeValues,
+ _RawMarkup,
+ )
+
+# Some useful features for a TreeBuilder to have.
+FAST = "fast"
+PERMISSIVE = "permissive"
+STRICT = "strict"
+XML = "xml"
+HTML = "html"
+HTML_5 = "html5"
+
+__all__ = [
+ "TreeBuilderRegistry",
+ "TreeBuilder",
+ "HTMLTreeBuilder",
+ "SAXTreeBuilder",
+ "DetectsXMLParsedAsHTML",
+
+ "ParserRejectedMarkup", # backwards compatibility only as of 4.13.0
+]
+
+class TreeBuilderRegistry(object):
+ """A way of looking up TreeBuilder subclasses by their name or by desired
+ features.
+ """
+
+ builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
+ builders: List[Type[TreeBuilder]]
+
+ def __init__(self) -> None:
+ self.builders_for_feature = defaultdict(list)
+ self.builders = []
+
+ def register(self, treebuilder_class: type[TreeBuilder]) -> None:
+ """Register a treebuilder based on its advertised features.
+
+ :param treebuilder_class: A subclass of `TreeBuilder`. Its
+ `TreeBuilder.features` attribute should list its features.
+ """
+ for feature in treebuilder_class.features:
+ self.builders_for_feature[feature].insert(0, treebuilder_class)
+ self.builders.insert(0, treebuilder_class)
+
+ def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
+ """Look up a TreeBuilder subclass with the desired features.
+
+ :param features: A list of features to look for. If none are
+ provided, the most recently registered TreeBuilder subclass
+ will be used.
+ :return: A TreeBuilder subclass, or None if there's no
+ registered subclass with all the requested features.
+ """
+ if len(self.builders) == 0:
+ # There are no builders at all.
+ return None
+
+ if len(features) == 0:
+ # They didn't ask for any features. Give them the most
+ # recently registered builder.
+ return self.builders[0]
+
+ # Go down the list of features in order, and eliminate any builders
+ # that don't match every feature.
+ feature_list = list(features)
+ feature_list.reverse()
+ candidates = None
+ candidate_set = None
+ while len(feature_list) > 0:
+ feature = feature_list.pop()
+ we_have_the_feature = self.builders_for_feature.get(feature, [])
+ if len(we_have_the_feature) > 0:
+ if candidates is None:
+ candidates = we_have_the_feature
+ candidate_set = set(candidates)
+ else:
+ # Eliminate any candidates that don't have this feature.
+ candidate_set = candidate_set.intersection(set(we_have_the_feature))
+
+ # The only valid candidates are the ones in candidate_set.
+ # Go through the original list of candidates and pick the first one
+ # that's in candidate_set.
+ if candidate_set is None or candidates is None:
+ return None
+ for candidate in candidates:
+ if candidate in candidate_set:
+ return candidate
+ return None
+
+
+#: The `BeautifulSoup` constructor will take a list of features
+#: and use it to look up `TreeBuilder` classes in this registry.
+builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()
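+
+# A hedged illustration (not part of the original module): feature lookup
+# against the default registry works roughly like this, assuming the
+# standard builders have been registered at import time:
+#
+#     >>> from bs4.builder import builder_registry
+#     >>> builder_registry.lookup("html") is not None
+#     True
+#     >>> builder_registry.lookup("no-such-feature") is None
+#     True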
+
+
+class TreeBuilder(object):
+ """Turn a textual document into a Beautiful Soup object tree.
+
+ This is an abstract superclass which smooths out the behavior of
+ different parser libraries into a single, unified interface.
+
+ :param multi_valued_attributes: If this is set to None, the
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this to a dictionary will
+ customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
+ for an example.
+
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is ``multi_valued_attributes``.
+
+ :param preserve_whitespace_tags: A set of tags to treat
+ the way <pre> tags are treated in HTML. Tags in this set
+ are immune from pretty-printing; their contents will always be
+ output as-is.
+
+ :param string_containers: A dictionary mapping tag names to
+ the classes that should be instantiated to contain the textual
+ contents of those tags. The default is to use NavigableString
+ for every tag, no matter what the name. You can override the
+ default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.
+
+ :param store_line_numbers: If the parser keeps track of the line
+ numbers and positions of the original markup, that information
+ will, by default, be stored in each corresponding
+ :py:class:`bs4.element.Tag` object. You can turn this off by
+ passing store_line_numbers=False; then Tag.sourcepos and
+ Tag.sourceline will always be None. If the parser you're using
+ doesn't keep track of this information, then store_line_numbers
+ is irrelevant.
+
+ :param attribute_value_list_class: The value of a multi-valued attribute
+ (such as HTML's 'class') will be stored in an instance of this
+ class. The default is Beautiful Soup's built-in
+ `AttributeValueList`, which is a normal Python list, and you
+ will probably never need to change it.
+ """
+
+ USE_DEFAULT: Any = object() #: :meta private:
+
+ def __init__(
+ self,
+ multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
+ preserve_whitespace_tags: Set[str] = USE_DEFAULT,
+ store_line_numbers: bool = USE_DEFAULT,
+ string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
+ empty_element_tags: Set[str] = USE_DEFAULT,
+ attribute_dict_class: Type[AttributeDict] = AttributeDict,
+ attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
+ ):
+ self.soup = None
+ if multi_valued_attributes is self.USE_DEFAULT:
+ multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+ self.cdata_list_attributes = multi_valued_attributes
+ if preserve_whitespace_tags is self.USE_DEFAULT:
+ preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+ if empty_element_tags is self.USE_DEFAULT:
+ self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
+ else:
+ self.empty_element_tags = empty_element_tags
+ # TODO: store_line_numbers is probably irrelevant now that
+ # the behavior of sourceline and sourcepos has been made consistent
+ # everywhere.
+ if store_line_numbers is self.USE_DEFAULT:
+ store_line_numbers = self.TRACKS_LINE_NUMBERS
+ self.store_line_numbers = store_line_numbers
+ if string_containers is self.USE_DEFAULT:
+ string_containers = self.DEFAULT_STRING_CONTAINERS
+ self.string_containers = string_containers
+ self.attribute_dict_class = attribute_dict_class
+ self.attribute_value_list_class = attribute_value_list_class
+
+ NAME: str = "[Unknown tree builder]"
+ ALTERNATE_NAMES: Iterable[str] = []
+ features: Iterable[str] = []
+
+ is_xml: bool = False
+ picklable: bool = False
+
+ soup: Optional[BeautifulSoup] #: :meta private:
+
+ #: A tag will be considered an empty-element
+ #: tag when and only when it has no contents.
+ empty_element_tags: Optional[Set[str]] = None #: :meta private:
+ cdata_list_attributes: Dict[str, Set[str]] #: :meta private:
+ preserve_whitespace_tags: Set[str] #: :meta private:
+ string_containers: Dict[str, Type[NavigableString]] #: :meta private:
+ tracks_line_numbers: bool #: :meta private:
+
+ #: A value for these tag/attribute combinations is a space- or
+ #: comma-separated list of CDATA, rather than a single CDATA.
+ DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)
+
+ #: Whitespace should be preserved inside these tags.
+ DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()
+
+ #: The textual contents of tags with these names should be
+ #: instantiated with some class other than `bs4.element.NavigableString`.
+ DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {}
+
+ #: By default, tags are treated as empty-element tags if they have
+ #: no contents--that is, using XML rules. HTMLTreeBuilder
+ #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
+ #: HTML 4 and HTML5 standards.
+ DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None
+
+ #: Most parsers don't keep track of line numbers.
+ TRACKS_LINE_NUMBERS: bool = False
+
+ def initialize_soup(self, soup: BeautifulSoup) -> None:
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+
+ :param soup: A BeautifulSoup object.
+ """
+ self.soup = soup
+
+ def reset(self) -> None:
+ """Do any work necessary to reset the underlying parser
+ for a new document.
+
+ By default, this does nothing.
+ """
+ pass
+
+ def can_be_empty_element(self, tag_name: str) -> bool:
+ """Might a tag with this name be an empty-element tag?
+
+ The final markup may or may not actually present this tag as
+ self-closing.
+
+ For instance: an HTMLTreeBuilder does not consider a <p> tag to be
+ an empty-element tag (it's not in
+ HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
+ will be presented as "<p></p>", not "<p/>" or "<p>".
+
+ The default implementation has no opinion about which tags are
+ empty-element tags, so a tag will be presented as an
+ empty-element tag if and only if it has no children.
+ "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
+ be left alone.
+
+ :param tag_name: The name of a markup tag.
+ """
+ if self.empty_element_tags is None:
+ return True
+ return tag_name in self.empty_element_tags
+
+ def feed(self, markup: _RawMarkup) -> None:
+ """Run incoming markup through some parsing process."""
+ raise NotImplementedError()
+
+ def prepare_markup(
+ self,
+ markup: _RawMarkup,
+ user_specified_encoding: Optional[_Encoding] = None,
+ document_declared_encoding: Optional[_Encoding] = None,
+ exclude_encodings: Optional[_Encodings] = None,
+ ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: The markup that's about to be parsed.
+ :param user_specified_encoding: The user asked to try this encoding
+ to convert the markup into a Unicode string.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding. NOTE: This argument is not used by the
+ calling code and can probably be removed.
+ :param exclude_encodings: The user asked *not* to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples: (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy that the parser can try
+ to convert the document to Unicode and parse it. Each
+ strategy will be tried in turn.
+
+ By default, the only strategy is to parse the markup
+ as-is. See `LXMLTreeBuilderForXML` and
+ `HTMLParserTreeBuilder` for implementations that take into
+ account the quirks of particular parsers.
+
+ :meta private:
+
+ """
+ yield markup, None, None, False
+
+ def test_fragment_to_document(self, fragment: str) -> str:
+ """Wrap an HTML fragment to make it look like a document.
+
+ Different parsers do this differently. For instance, lxml
+ introduces an empty <head> tag, and html5lib
+ doesn't. Abstracting this away lets us write simple tests
+ which run HTML fragments through the parser and compare the
+ results against other HTML fragments.
+
+ This method should not be used outside of unit tests.
+
+ :param fragment: A fragment of HTML.
+ :return: A full HTML document.
+ :meta private:
+ """
+ return fragment
+
+ def set_up_substitutions(self, tag: Tag) -> bool:
+ """Set up any substitutions that will need to be performed on
+ a `Tag` when it's output as a string.
+
+ By default, this does nothing. See `HTMLTreeBuilder` for a
+ case where this is used.
+
+ :return: Whether or not a substitution was performed.
+ :meta private:
+ """
+ return False
+
+ def _replace_cdata_list_attribute_values(
+ self, tag_name: str, attrs: _RawOrProcessedAttributeValues
+ ) -> _AttributeValues:
+ """When an attribute value is associated with a tag that can
+ have multiple values for that attribute, convert the string
+ value to a list of strings.
+
+ Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+ NOTE: This method modifies its input in place.
+
+ :param tag_name: The name of a tag.
+ :param attrs: A dictionary containing the tag's attributes.
+ Any appropriate attribute values will be modified in place.
+ :return: The modified dictionary that was originally passed in.
+ """
+
+ # First, cast the attrs dict to _AttributeValues. This might
+ # not be accurate yet, but it will be by the time this method
+ # returns.
+ modified_attrs = cast(_AttributeValues, attrs)
+ if not modified_attrs or not self.cdata_list_attributes:
+ # Nothing to do.
+ return modified_attrs
+
+ # There is at least a possibility that we need to modify one of
+ # the attribute values.
+ universal: Set[str] = self.cdata_list_attributes.get("*", set())
+ tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)
+ for attr in list(modified_attrs.keys()):
+ modified_value: _AttributeValue
+ if attr in universal or (tag_specific and attr in tag_specific):
+ # We have a "class"-type attribute whose string
+ # value is a whitespace-separated list of
+ # values. Split it into a list.
+ original_value: _AttributeValue = modified_attrs[attr]
+ if isinstance(original_value, _RawAttributeValue):
+ # This is a _RawAttributeValue (a string) that
+ # needs to be split and converted to a
+ # AttributeValueList so it can be an
+ # _AttributeValue.
+ modified_value = self.attribute_value_list_class(
+ nonwhitespace_re.findall(original_value)
+ )
+ else:
+ # html5lib calls setAttributes twice for the
+ # same tag when rearranging the parse tree. On
+ # the second call the attribute value here is
+ # already a list. This can also happen when a
+ # Tag object is cloned. If this happens, leave
+ # the value alone rather than trying to split
+ # it again.
+ modified_value = original_value
+ modified_attrs[attr] = modified_value
+ return modified_attrs
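+
+ # A hedged illustration (not part of the original module): the effect of
+ # this method is visible through the public API, e.g.:
+ #
+ # >>> from bs4 import BeautifulSoup
+ # >>> BeautifulSoup('<p class="foo bar"></p>', "html.parser").p["class"]
+ # ['foo', 'bar']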
+
+
+class SAXTreeBuilder(TreeBuilder):
+ """A Beautiful Soup treebuilder that listens for SAX events.
+
+ This is not currently used for anything, and it will be removed
+ soon. It was a good idea, but it wasn't properly integrated into the
+ rest of Beautiful Soup, so there have been long stretches where it
+ hasn't worked properly.
+ """
+
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ warnings.warn(
+ "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ super(SAXTreeBuilder, self).__init__(*args, **kwargs)
+
+ def feed(self, markup: _RawMarkup) -> None:
+ raise NotImplementedError()
+
+ def close(self) -> None:
+ pass
+
+ def startElement(self, name: str, attrs: Dict[str, str]) -> None:
+ attrs = AttributeDict((key[1], value) for key, value in list(attrs.items()))
+ # print("Start %s, %r" % (name, attrs))
+ assert self.soup is not None
+ self.soup.handle_starttag(name, None, None, attrs)
+
+ def endElement(self, name: str) -> None:
+ # print("End %s" % name)
+ assert self.soup is not None
+ self.soup.handle_endtag(name)
+
+ def startElementNS(
+ self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
+ ) -> None:
+ # Throw away (ns, nodeName) for now.
+ self.startElement(nodeName, attrs)
+
+ def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
+ # Throw away (ns, nodeName) for now.
+ self.endElement(nodeName)
+ # handler.endElementNS((ns, node.nodeName), node.nodeName)
+
+ def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
+ # Ignore the prefix for now.
+ pass
+
+ def endPrefixMapping(self, prefix: str) -> None:
+ # Ignore the prefix for now.
+ # handler.endPrefixMapping(prefix)
+ pass
+
+ def characters(self, content: str) -> None:
+ assert self.soup is not None
+ self.soup.handle_data(content)
+
+ def startDocument(self) -> None:
+ pass
+
+ def endDocument(self) -> None:
+ pass
+
+
+class HTMLTreeBuilder(TreeBuilder):
+ """This TreeBuilder knows facts about HTML, such as which tags are treated
+ specially by the HTML standard.
+ """
+
+ #: Some HTML tags are defined as having no contents. Beautiful Soup
+ #: treats these specially.
+ DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = set(
+ [
+ # These are from HTML5.
+ "area",
+ "base",
+ "br",
+ "col",
+ "embed",
+ "hr",
+ "img",
+ "input",
+ "keygen",
+ "link",
+ "menuitem",
+ "meta",
+ "param",
+ "source",
+ "track",
+ "wbr",
+ # These are from earlier versions of HTML and are removed in HTML5.
+ "basefont",
+ "bgsound",
+ "command",
+ "frame",
+ "image",
+ "isindex",
+ "nextid",
+ "spacer",
+ ]
+ )
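+
+ # A hedged illustration (not part of the original module): empty-element
+ # tags are rendered as self-closing on output, while an empty <p> is not:
+ #
+ # >>> from bs4 import BeautifulSoup
+ # >>> str(BeautifulSoup("<br><p></p>", "html.parser"))
+ # '<br/><p></p>'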
+
+ #: The HTML standard defines these tags as block-level elements. Beautiful
+ #: Soup does not treat these elements differently from other elements,
+ #: but it may do so eventually, and this information is available if
+ #: you need to use it.
+ DEFAULT_BLOCK_ELEMENTS: Set[str] = set(
+ [
+ "address",
+ "article",
+ "aside",
+ "blockquote",
+ "canvas",
+ "dd",
+ "div",
+ "dl",
+ "dt",
+ "fieldset",
+ "figcaption",
+ "figure",
+ "footer",
+ "form",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "header",
+ "hr",
+ "li",
+ "main",
+ "nav",
+ "noscript",
+ "ol",
+ "output",
+ "p",
+ "pre",
+ "section",
+ "table",
+ "tfoot",
+ "ul",
+ "video",
+ ]
+ )
+
+ #: These HTML tags need special treatment so they can be
+ #: represented by a string class other than `bs4.element.NavigableString`.
+ #:
+ #: For some of these tags, it's because the HTML standard defines
+ #: an unusual content model for them. I made this list by going
+ #: through the HTML spec
+ #: (https://html.spec.whatwg.org/#metadata-content) and looking for
+ #: "metadata content" elements that can contain strings.
+ #:
+ #: The Ruby tags (<rt> and <rp>) are here despite being normal
+ #: "phrasing content" tags, because the content they contain is
+ #: qualitatively different from other text in the document, and it
+ #: can be useful to be able to distinguish it.
+ #:
+ #: TODO: Arguably <noscript> could go here but it seems
+ #: qualitatively different from the other tags.
+ DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {
+ "rt": RubyTextString,
+ "rp": RubyParenthesisString,
+ "style": Stylesheet,
+ "script": Script,
+ "template": TemplateString,
+ }
+
+ #: The HTML standard defines these attributes as containing a
+ #: space-separated list of values, not a single value. That is,
+ #: class="foo bar" means that the 'class' attribute has two values,
+ #: 'foo' and 'bar', not the single value 'foo bar'. When we
+ #: encounter one of these attributes, we will parse its value into
+ #: a list of values if possible. Upon output, the list will be
+ #: converted back into a string.
+ DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
+ "*": {"class", "accesskey", "dropzone"},
+ "a": {"rel", "rev"},
+ "link": {"rel", "rev"},
+ "td": {"headers"},
+ "th": {"headers"},
+ "form": {"accept-charset"},
+ "object": {"archive"},
+ # These are HTML5 specific, as are *.accesskey and *.dropzone above.
+ "area": {"rel"},
+ "icon": {"sizes"},
+ "iframe": {"sandbox"},
+ "output": {"for"},
+ }
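+
+ # A hedged illustration (not part of the original module): these defaults
+ # can be overridden via the BeautifulSoup constructor's
+ # multi_valued_attributes argument:
+ #
+ # >>> from bs4 import BeautifulSoup
+ # >>> soup = BeautifulSoup('<p class="a b"></p>', "html.parser",
+ # ...                      multi_valued_attributes=None)
+ # >>> soup.p["class"]
+ # 'a b'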
+
+ #: By default, whitespace inside these HTML tags will be
+ #: preserved rather than being collapsed.
+ DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set(["pre", "textarea"])
+
+ def set_up_substitutions(self, tag: Tag) -> bool:
+ """Replace the declared encoding in a <meta> tag with a placeholder,
+ to be substituted when the tag is output to a string.
+
+ An HTML document may come in to Beautiful Soup as one
+ encoding, but exit in a different encoding, and the <meta> tag
+ needs to be changed to reflect this.
+
+ :return: Whether or not a substitution was performed.
+
+ :meta private:
+ """
+ # We are only interested in <meta> tags
+ if tag.name != "meta":
+ return False
+
+ # TODO: This cast will fail in the (very unlikely) scenario
+ # that the programmer who instantiates the TreeBuilder
+ # specifies meta['content'] or meta['charset'] as
+ # cdata_list_attributes.
+ content: Optional[str] = cast(Optional[str], tag.get("content"))
+ charset: Optional[str] = cast(Optional[str], tag.get("charset"))
+
+ # But we can accommodate meta['http-equiv'] being made a
+ # cdata_list_attribute (again, very unlikely) without much
+ # trouble.
+ http_equiv: List[str] = tag.get_attribute_list("http-equiv")
+
+ # We are interested in <meta> tags that say what encoding the
+ # document was originally in. This means HTML 5-style <meta>
+ # tags that provide the "charset" attribute. It also means
+ # HTML 4-style <meta> tags that provide the "content"
+ # attribute and have "http-equiv" set to "content-type".
+ #
+ # In both cases we will replace the value of the appropriate
+ # attribute with a standin object that can take on any
+ # encoding.
+ substituted = False
+ if charset is not None:
+ # HTML 5 style:
+ # <meta charset="utf8">
+ tag["charset"] = CharsetMetaAttributeValue(charset)
+ substituted = True
+
+ elif content is not None and any(
+ x.lower() == "content-type" for x in http_equiv
+ ):
+ # HTML 4 style:
+ # <meta http-equiv="content-type" content="text/html; charset=utf8">
+ tag["content"] = ContentMetaAttributeValue(content)
+ substituted = True
+
+ return substituted
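+
+ # A hedged illustration (not part of the original module): the
+ # substitution shows up when the soup is encoded, e.g.:
+ #
+ # >>> from bs4 import BeautifulSoup
+ # >>> doc = '<html><head><meta charset="latin-1"></head></html>'
+ # >>> b'charset="utf-8"' in BeautifulSoup(doc, "html.parser").encode("utf-8")
+ # True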
+
+
+class DetectsXMLParsedAsHTML(object):
+ """A mixin class for any class (a TreeBuilder, or some class used by a
+ TreeBuilder) that's in a position to detect whether an XML
+ document is being incorrectly parsed as HTML, and issue an
+ appropriate warning.
+
+ This requires being able to observe an incoming processing
+ instruction that might be an XML declaration, and also able to
+ observe tags as they're opened. If you can't do that for a given
+ `TreeBuilder`, there's a less reliable implementation based on
+ examining the raw markup.
+ """
+
+ #: Regular expression for seeing if string markup has an <html> tag.
+ LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I)
+
+ #: Regular expression for seeing if byte markup has an <html> tag.
+ LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I)
+
+ #: The start of an XML document string.
+ XML_PREFIX: str = "<?xml"
+
+ #: The start of an XML document bytestring.
+ XML_PREFIX_B: bytes = b"<?xml"
+
+ # This is typed as str, not `ProcessingInstruction`, because this
+ # check may be run before any Beautiful Soup objects are created.
+ _first_processing_instruction: Optional[str] #: :meta private:
+ _root_tag_name: Optional[str] #: :meta private:
+
+ @classmethod
+ def warn_if_markup_looks_like_xml(
+ cls, markup: Optional[_RawMarkup], stacklevel: int = 3
+ ) -> bool:
+ """Perform a check on some markup to see if it looks like XML
+ that's not XHTML. If so, issue a warning.
+
+ This is much less reliable than doing the check while parsing,
+ but some of the tree builders can't do that.
+
+ :param stacklevel: The stacklevel of the code calling this
+ function.
+
+ :return: True if the markup looks like non-XHTML XML, False
+ otherwise.
+ """
+ if markup is None:
+ return False
+ markup = markup[:500]
+ if isinstance(markup, bytes):
+ markup_b: bytes = markup
+ looks_like_xml = markup_b.startswith(
+ cls.XML_PREFIX_B
+ ) and not cls.LOOKS_LIKE_HTML_B.search(markup)
+ else:
+ markup_s: str = markup
+ looks_like_xml = markup_s.startswith(
+ cls.XML_PREFIX
+ ) and not cls.LOOKS_LIKE_HTML.search(markup)
+
+ if looks_like_xml:
+ cls._warn(stacklevel=stacklevel + 2)
+ return True
+ return False
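+
+ # A hedged illustration (not part of the original module): parsing an
+ # XML document with an HTML parser triggers XMLParsedAsHTMLWarning:
+ #
+ # >>> from bs4 import BeautifulSoup
+ # >>> BeautifulSoup('<?xml version="1.0"?><rss/>', "html.parser")
+ # ... # emits XMLParsedAsHTMLWarning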
+
+ @classmethod
+ def _warn(cls, stacklevel: int = 5) -> None:
+ """Issue a warning about XML being parsed as HTML."""
+ warnings.warn(
+ XMLParsedAsHTMLWarning.MESSAGE,
+ XMLParsedAsHTMLWarning,
+ stacklevel=stacklevel,
+ )
+
+ def _initialize_xml_detector(self) -> None:
+ """Call this method before parsing a document."""
+ self._first_processing_instruction = None
+ self._root_tag_name = None
+
+ def _document_might_be_xml(self, processing_instruction: str) -> None:
+ """Call this method when encountering an XML declaration, or a
+ "processing instruction" that might be an XML declaration.
+
+ This helps Beautiful Soup detect potential issues later, if
+ the document turns out to be a non-XHTML XML document that's
+ being parsed as HTML.
+ """
+ if (
+ self._first_processing_instruction is not None
+ or self._root_tag_name is not None
+ ):
+ # The document has already started. Don't bother checking
+ # anymore.
+ return
+
+ self._first_processing_instruction = processing_instruction
+
+ # We won't know until we encounter the first tag whether or
+ # not this is actually a problem.
+
+ def _root_tag_encountered(self, name: str) -> None:
+ """Call this when you encounter the document's root tag.
+
+ This is where we actually check whether an XML document is
+ being incorrectly parsed as HTML, and issue the warning.
+ """
+ if self._root_tag_name is not None:
+ # This method was incorrectly called multiple times. Do
+ # nothing.
+ return
+
+ self._root_tag_name = name
+
+ if (
+ name != "html"
+ and self._first_processing_instruction is not None
+ and self._first_processing_instruction.lower().startswith("xml ")
+ ):
+ # We encountered an XML declaration and then a tag other
+ # than 'html'. This is a reliable indicator that a
+ # non-XHTML XML document is being parsed as HTML.
+ self._warn(stacklevel=10)
+
+
+def register_treebuilders_from(module: ModuleType) -> None:
+ """Copy TreeBuilders from the given module into this module."""
+ this_module = sys.modules[__name__]
+ for name in module.__all__:
+ obj = getattr(module, name)
+
+ if issubclass(obj, TreeBuilder):
+ setattr(this_module, name, obj)
+ this_module.__all__.append(name)
+ # Register the builder while we're at it.
+ this_module.builder_registry.register(obj)
+
+
+# Builders are registered in reverse order of priority, so that custom
+# builder registrations will take precedence. In general, we want lxml
+# to take precedence over html5lib, because it's faster. And we only
+# want to use HTMLParser as a last resort.
+from . import _htmlparser # noqa: E402
+
+register_treebuilders_from(_htmlparser)
+try:
+ from . import _html5lib
+
+ register_treebuilders_from(_html5lib)
+except ImportError:
+ # They don't have html5lib installed.
+ pass
+try:
+ from . import _lxml
+
+ register_treebuilders_from(_lxml)
+except ImportError:
+ # They don't have lxml installed.
+ pass
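+
+# A hedged illustration (not part of the original module): because each
+# register() call inserts at the front of the registry, a plain feature
+# lookup prefers lxml over html5lib over html.parser when all three are
+# installed:
+#
+#     >>> from bs4.builder import builder_registry
+#     >>> builder_registry.lookup("html").NAME
+#     'lxml'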
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py b/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py
new file mode 100644
index 00000000..c13439d0
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py
@@ -0,0 +1,594 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+__all__ = [
+ "HTML5TreeBuilder",
+]
+
+from typing import (
+ Any,
+ cast,
+ Dict,
+ Iterable,
+ Optional,
+ Sequence,
+ TYPE_CHECKING,
+ Tuple,
+ Union,
+)
+from typing_extensions import TypeAlias
+from bs4._typing import (
+ _AttributeValue,
+ _AttributeValues,
+ _Encoding,
+ _Encodings,
+ _NamespaceURL,
+ _RawMarkup,
+)
+
+import warnings
+from bs4.builder import (
+ DetectsXMLParsedAsHTML,
+ PERMISSIVE,
+ HTML,
+ HTML_5,
+ HTMLTreeBuilder,
+)
+from bs4.element import (
+ NamespacedAttribute,
+ PageElement,
+ nonwhitespace_re,
+)
+import html5lib
+from html5lib.constants import (
+ namespaces,
+)
+from bs4.element import (
+ Comment,
+ Doctype,
+ NavigableString,
+ Tag,
+)
+
+if TYPE_CHECKING:
+ from bs4 import BeautifulSoup
+
+from html5lib.treebuilders import base as treebuilder_base
+
+
+class HTML5TreeBuilder(HTMLTreeBuilder):
+ """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
+ build a tree.
+
+ Note that `HTML5TreeBuilder` does not support some common HTML
+ `TreeBuilder` features. Some of these features could theoretically
+ be implemented, but at the very least it's quite difficult,
+ because html5lib moves the parse tree around as it's being built.
+
+ Specifically:
+
+ * This `TreeBuilder` doesn't use different subclasses of
+ `NavigableString` (e.g. `Script`) based on the name of the tag
+ in which the string was found.
+ * You can't use a `SoupStrainer` to parse only part of a document.
+ """
+
+ NAME: str = "html5lib"
+
+ features: Sequence[str] = [NAME, PERMISSIVE, HTML_5, HTML]
+
+ #: html5lib can tell us which line number and position in the
+ #: original file is the source of an element.
+ TRACKS_LINE_NUMBERS: bool = True
+
+ underlying_builder: "TreeBuilderForHtml5lib" #: :meta private:
+ user_specified_encoding: Optional[_Encoding]
+
+ def prepare_markup(
+ self,
+ markup: _RawMarkup,
+ user_specified_encoding: Optional[_Encoding] = None,
+ document_declared_encoding: Optional[_Encoding] = None,
+ exclude_encodings: Optional[_Encodings] = None,
+ ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
+ # Store the user-specified encoding for use later on.
+ self.user_specified_encoding = user_specified_encoding
+
+ # document_declared_encoding and exclude_encodings aren't used
+ # ATM because the html5lib TreeBuilder doesn't use
+ # UnicodeDammit.
+ for variable, name in (
+ (document_declared_encoding, "document_declared_encoding"),
+ (exclude_encodings, "exclude_encodings"),
+ ):
+ if variable:
+ warnings.warn(
+ f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
+ stacklevel=3,
+ )
+
+ # html5lib only parses HTML, so if it's given XML that's worth
+ # noting.
+ DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
+
+ yield (markup, None, None, False)
+
+ # These methods are defined by Beautiful Soup.
+ def feed(self, markup: _RawMarkup) -> None:
+ """Run some incoming markup through some parsing process,
+ populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.
+ """
+ if self.soup is not None and self.soup.parse_only is not None:
+ warnings.warn(
+ "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
+ stacklevel=4,
+ )
+
+ # self.underlying_builder is probably None now, but it'll be set
+ # when html5lib calls self.create_treebuilder().
+ parser = html5lib.HTMLParser(tree=self.create_treebuilder)
+ assert self.underlying_builder is not None
+ self.underlying_builder.parser = parser
+ extra_kwargs = dict()
+ if not isinstance(markup, str):
+ # kwargs, specifically override_encoding, will eventually
+ # be passed in to html5lib's
+ # HTMLBinaryInputStream.__init__.
+ extra_kwargs["override_encoding"] = self.user_specified_encoding
+
+ doc = parser.parse(markup, **extra_kwargs)
+
+ # Set the character encoding detected by the tokenizer.
+ if isinstance(markup, str):
+ # We need to special-case this because html5lib sets
+ # charEncoding to UTF-8 if it gets Unicode input.
+ doc.original_encoding = None
+ else:
+ original_encoding = parser.tokenizer.stream.charEncoding[0]
+ # The encoding is an html5lib Encoding object. We want to
+ # use a string for compatibility with other tree builders.
+ original_encoding = original_encoding.name
+ doc.original_encoding = original_encoding
+ self.underlying_builder.parser = None
+
+ def create_treebuilder(
+ self, namespaceHTMLElements: bool
+ ) -> "TreeBuilderForHtml5lib":
+ """Called by html5lib to instantiate the kind of class it
+ calls a 'TreeBuilder'.
+
+ :param namespaceHTMLElements: Whether or not to namespace HTML elements.
+
+ :meta private:
+ """
+ self.underlying_builder = TreeBuilderForHtml5lib(
+ namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
+ )
+ return self.underlying_builder
+
+ def test_fragment_to_document(self, fragment: str) -> str:
+ """See `TreeBuilder`."""
+ return "<html><head></head><body>%s</body></html>" % fragment
+
+
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
+ soup: "BeautifulSoup" #: :meta private:
+ parser: Optional[html5lib.HTMLParser] #: :meta private:
+
+ def __init__(
+ self,
+ namespaceHTMLElements: bool,
+ soup: Optional["BeautifulSoup"] = None,
+ store_line_numbers: bool = True,
+ **kwargs: Any,
+ ):
+ if soup:
+ self.soup = soup
+ else:
+ warnings.warn(
+ "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ from bs4 import BeautifulSoup
+
+ # TODO: Why is the parser 'html.parser' here? Using
+ # html5lib doesn't cause an infinite loop and is more
+ # accurate. Best to get rid of this entire section, I think.
+ self.soup = BeautifulSoup(
+ "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
+ )
+ # TODO: What are **kwargs exactly? Should they be passed in
+ # here in addition to/instead of being passed to the BeautifulSoup
+ # constructor?
+ super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+
+ # This will be set later to a real html5lib HTMLParser object,
+ # which we can use to track the current line number.
+ self.parser = None
+ self.store_line_numbers = store_line_numbers
+
+ def documentClass(self) -> "Element":
+ self.soup.reset()
+ return Element(self.soup, self.soup, None)
+
+ def insertDoctype(self, token: Dict[str, Any]) -> None:
+ name: str = cast(str, token["name"])
+ publicId: Optional[str] = cast(Optional[str], token["publicId"])
+ systemId: Optional[str] = cast(Optional[str], token["systemId"])
+
+ doctype = Doctype.for_name_and_ids(name, publicId, systemId)
+ self.soup.object_was_parsed(doctype)
+
+ def elementClass(self, name: str, namespace: str) -> "Element":
+ sourceline: Optional[int] = None
+ sourcepos: Optional[int] = None
+ if self.parser is not None and self.store_line_numbers:
+ # This represents the point immediately after the end of the
+ # tag. We don't know when the tag started, but we do know
+ # where it ended -- the character just before this one.
+ sourceline, sourcepos = self.parser.tokenizer.stream.position()
+ assert sourcepos is not None
+ sourcepos = sourcepos - 1
+ tag = self.soup.new_tag(
+ name, namespace, sourceline=sourceline, sourcepos=sourcepos
+ )
+
+ return Element(tag, self.soup, namespace)
+
+ def commentClass(self, data: str) -> "TextNode":
+ return TextNode(Comment(data), self.soup)
+
+ def fragmentClass(self) -> "Element":
+ """This is only used by html5lib HTMLParser.parseFragment(),
+ which is never used by Beautiful Soup, only by the html5lib
+ unit tests. Since we don't currently hook into those tests,
+ the implementation is left blank.
+ """
+ raise NotImplementedError()
+
+ def getFragment(self) -> "Element":
+ """This is only used by the html5lib unit tests. Since we
+ don't currently hook into those tests, the implementation is
+ left blank.
+ """
+ raise NotImplementedError()
+
+ def appendChild(self, node: "Element") -> None:
+ # TODO: This code is not covered by the BS4 tests, and
+ # apparently not triggered by the html5lib test suite either.
+ # But it doesn't seem test-specific and there are calls to it
+ # (or a method with the same name) all over html5lib, so I'm
+ # leaving the implementation in place rather than replacing it
+ # with NotImplementedError()
+ self.soup.append(node.element)
+
+ def getDocument(self) -> "BeautifulSoup":
+ return self.soup
+
+ def testSerializer(self, element: "Element") -> str:
+ """This is only used by the html5lib unit tests. Since we
+ don't currently hook into those tests, the implementation is
+ left blank.
+ """
+ raise NotImplementedError()
+
+
+class AttrList(object):
+ """Represents a Tag's attributes in a way compatible with html5lib."""
+
+ element: Tag
+ attrs: _AttributeValues
+
+ def __init__(self, element: Tag):
+ self.element = element
+ self.attrs = dict(self.element.attrs)
+
+ def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
+ return list(self.attrs.items()).__iter__()
+
+ def __setitem__(self, name: str, value: _AttributeValue) -> None:
+ # If this attribute is a multi-valued attribute for this element,
+ # turn its value into a list.
+ list_attr = self.element.cdata_list_attributes or {}
+ if name in list_attr.get("*", []) or (
+ self.element.name in list_attr
+ and name in list_attr.get(self.element.name, [])
+ ):
+ # A node that is being cloned may have already undergone
+ # this procedure. Check for this and skip it.
+ if not isinstance(value, list):
+ assert isinstance(value, str)
+ value = self.element.attribute_value_list_class(
+ nonwhitespace_re.findall(value)
+ )
+ self.element[name] = value
+
+ def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
+ return list(self.attrs.items())
+
+ def keys(self) -> Iterable[str]:
+ return list(self.attrs.keys())
+
+ def __len__(self) -> int:
+ return len(self.attrs)
+
+ def __getitem__(self, name: str) -> _AttributeValue:
+ return self.attrs[name]
+
+ def __contains__(self, name: str) -> bool:
+ return name in list(self.attrs.keys())
+
+
+class BeautifulSoupNode(treebuilder_base.Node):
+ element: PageElement
+ soup: "BeautifulSoup"
+ namespace: Optional[_NamespaceURL]
+
+ @property
+ def nodeType(self) -> int:
+ """Return the html5lib constant corresponding to the type of
+ the underlying DOM object.
+
+ NOTE: This property is only accessed by the html5lib test
+ suite, not by Beautiful Soup proper.
+ """
+ raise NotImplementedError()
+
+ # TODO-TYPING: typeshed stubs are incorrect about this;
+ # cloneNode returns a new Node, not None.
+ def cloneNode(self) -> treebuilder_base.Node:
+ raise NotImplementedError()
+
+
+class Element(BeautifulSoupNode):
+ element: Tag
+ namespace: Optional[_NamespaceURL]
+
+ def __init__(
+ self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
+ ):
+ treebuilder_base.Node.__init__(self, element.name)
+ self.element = element
+ self.soup = soup
+ self.namespace = namespace
+
+ def appendChild(self, node: "BeautifulSoupNode") -> None:
+ string_child: Optional[NavigableString] = None
+ child: PageElement
+ if type(node.element) is NavigableString:
+ string_child = child = node.element
+ else:
+ child = node.element
+ node.parent = self
+
+ if (
+ child is not None
+ and child.parent is not None
+ and not isinstance(child, str)
+ ):
+ node.element.extract()
+
+ if (
+ string_child is not None
+ and self.element.contents
+ and type(self.element.contents[-1]) is NavigableString
+ ):
+ # We are appending a string onto another string.
+ # TODO This has O(n^2) performance, for input like
+ # "a</a>a</a>a</a>..."
+ old_element = self.element.contents[-1]
+ new_element = self.soup.new_string(old_element + string_child)
+ old_element.replace_with(new_element)
+ self.soup._most_recent_element = new_element
+ else:
+ if isinstance(node, str):
+ # Create a brand new NavigableString from this string.
+ child = self.soup.new_string(node)
+
+ # Tell Beautiful Soup to act as if it parsed this element
+ # immediately after the parent's last descendant. (Or
+ # immediately after the parent, if it has no children.)
+ if self.element.contents:
+ most_recent_element = self.element._last_descendant(False)
+ elif self.element.next_element is not None:
+ # Something from further ahead in the parse tree is
+ # being inserted into this earlier element. This is
+ # very annoying because it means an expensive search
+ # for the last element in the tree.
+ most_recent_element = self.soup._last_descendant()
+ else:
+ most_recent_element = self.element
+
+ self.soup.object_was_parsed(
+ child, parent=self.element, most_recent_element=most_recent_element
+ )
+
+ def getAttributes(self) -> AttrList:
+ if isinstance(self.element, Comment):
+ return {}
+ return AttrList(self.element)
+
+ # An html5lib attribute name may be either a single string
+ # or a tuple (namespace, name).
+ _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
+ # Now we can define the type this method accepts as a dictionary
+ # mapping those attribute names to single string values.
+ _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]
+
+ def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
+ if attributes is not None and len(attributes) > 0:
+ # Replace any namespaced attributes with
+ # NamespacedAttribute objects.
+ for name, value in list(attributes.items()):
+ if isinstance(name, tuple):
+ new_name = NamespacedAttribute(*name)
+ del attributes[name]
+ attributes[new_name] = value
+
+ # We can now cast attributes to the type of Dict
+ # used by Beautiful Soup.
+ normalized_attributes = cast(_AttributeValues, attributes)
+
+ # Values for tags like 'class' came in as single strings;
+ # replace them with lists of strings as appropriate.
+ self.soup.builder._replace_cdata_list_attribute_values(
+ self.name, normalized_attributes
+ )
+
+ # Then set the attributes on the Tag associated with this
+ # BeautifulSoupNode.
+ for name, value_or_values in list(normalized_attributes.items()):
+ self.element[name] = value_or_values
+
+ # The attributes may contain variables that need substitution.
+ # Call set_up_substitutions manually.
+ #
+ # The Tag constructor called this method when the Tag was created,
+ # but we just set/changed the attributes, so call it again.
+ self.soup.builder.set_up_substitutions(self.element)
+
+ attributes = property(getAttributes, setAttributes)
+
+ def insertText(
+ self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
+ ) -> None:
+ text = TextNode(self.soup.new_string(data), self.soup)
+ if insertBefore:
+ self.insertBefore(text, insertBefore)
+ else:
+ self.appendChild(text)
+
+ def insertBefore(
+ self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
+ ) -> None:
+ index = self.element.index(refNode.element)
+ if (
+ type(node.element) is NavigableString
+ and self.element.contents
+ and type(self.element.contents[index - 1]) is NavigableString
+ ):
+ # (See comments in appendChild)
+ old_node = self.element.contents[index - 1]
+ assert type(old_node) is NavigableString
+ new_str = self.soup.new_string(old_node + node.element)
+ old_node.replace_with(new_str)
+ else:
+ self.element.insert(index, node.element)
+ node.parent = self
+
+ def removeChild(self, node: "Element") -> None:
+ node.element.extract()
+
+ def reparentChildren(self, new_parent: "Element") -> None:
+ """Move all of this tag's children into another tag."""
+ # print("MOVE", self.element.contents)
+ # print("FROM", self.element)
+ # print("TO", new_parent.element)
+
+ element = self.element
+ new_parent_element = new_parent.element
+ # Determine what this tag's next_element will be once all the children
+ # are removed.
+ final_next_element = element.next_sibling
+
+ new_parents_last_descendant = new_parent_element._last_descendant(False, False)
+ if len(new_parent_element.contents) > 0:
+ # The new parent already contains children. We will be
+ # appending this tag's children to the end.
+
+ # We can make this assertion since we know new_parent has
+ # children.
+ assert new_parents_last_descendant is not None
+ new_parents_last_child = new_parent_element.contents[-1]
+ new_parents_last_descendant_next_element = (
+ new_parents_last_descendant.next_element
+ )
+ else:
+ # The new parent contains no children.
+ new_parents_last_child = None
+ new_parents_last_descendant_next_element = new_parent_element.next_element
+
+ to_append = element.contents
+ if len(to_append) > 0:
+ # Set the first child's previous_element and previous_sibling
+ # to elements within the new parent
+ first_child = to_append[0]
+ if new_parents_last_descendant is not None:
+ first_child.previous_element = new_parents_last_descendant
+ else:
+ first_child.previous_element = new_parent_element
+ first_child.previous_sibling = new_parents_last_child
+ if new_parents_last_descendant is not None:
+ new_parents_last_descendant.next_element = first_child
+ else:
+ new_parent_element.next_element = first_child
+ if new_parents_last_child is not None:
+ new_parents_last_child.next_sibling = first_child
+
+ # Find the very last element being moved. It is now the
+ # parent's last descendant. It has no .next_sibling and
+ # its .next_element is whatever the previous last
+ # descendant had.
+ last_childs_last_descendant = to_append[-1]._last_descendant(
+ is_initialized=False, accept_self=True
+ )
+
+ # Since we passed accept_self=True into _last_descendant,
+ # there's no possibility that the result is None.
+ assert last_childs_last_descendant is not None
+ last_childs_last_descendant.next_element = (
+ new_parents_last_descendant_next_element
+ )
+ if new_parents_last_descendant_next_element is not None:
+ # TODO-COVERAGE: This code has no test coverage and
+ # I'm not sure how to get html5lib to go through this
+ # path, but it's just the other side of the previous
+ # line.
+ new_parents_last_descendant_next_element.previous_element = (
+ last_childs_last_descendant
+ )
+ last_childs_last_descendant.next_sibling = None
+
+ for child in to_append:
+ child.parent = new_parent_element
+ new_parent_element.contents.append(child)
+
+ # Now that this element has no children, change its .next_element.
+ element.contents = []
+ element.next_element = final_next_element
+
+ # print("DONE WITH MOVE")
+ # print("FROM", self.element)
+ # print("TO", new_parent_element)
+
+ # TODO-TYPING: typeshed stubs are incorrect about this;
+ # hasContent returns a boolean, not None.
+ def hasContent(self) -> bool:
+ return len(self.element.contents) > 0
+
+ # TODO-TYPING: typeshed stubs are incorrect about this;
+ # cloneNode returns a new Node, not None.
+ def cloneNode(self) -> treebuilder_base.Node:
+ tag = self.soup.new_tag(self.element.name, self.namespace)
+ node = Element(tag, self.soup, self.namespace)
+ for key, value in self.attributes:
+ node.attributes[key] = value
+ return node
+
+ def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
+ if self.namespace is None:
+ return namespaces["html"], self.name
+ else:
+ return self.namespace, self.name
+
+ nameTuple = property(getNameTuple)
+
+
+class TextNode(BeautifulSoupNode):
+ element: NavigableString
+
+ def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
+ treebuilder_base.Node.__init__(self, None)
+ self.element = element
+ self.soup = soup
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/_htmlparser.py b/.venv/lib/python3.12/site-packages/bs4/builder/_htmlparser.py
new file mode 100644
index 00000000..417f7dc4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/_htmlparser.py
@@ -0,0 +1,474 @@
+# encoding: utf-8
+"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+from __future__ import annotations
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+__all__ = [
+ "HTMLParserTreeBuilder",
+]
+
+from html.parser import HTMLParser
+
+from typing import (
+ Any,
+ Callable,
+ cast,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ TYPE_CHECKING,
+ Tuple,
+ Type,
+ Union,
+)
+
+from bs4.element import (
+ AttributeDict,
+ CData,
+ Comment,
+ Declaration,
+ Doctype,
+ ProcessingInstruction,
+)
+from bs4.dammit import EntitySubstitution, UnicodeDammit
+
+from bs4.builder import (
+ DetectsXMLParsedAsHTML,
+ HTML,
+ HTMLTreeBuilder,
+ STRICT,
+)
+
+from bs4.exceptions import ParserRejectedMarkup
+
+if TYPE_CHECKING:
+ from bs4 import BeautifulSoup
+ from bs4.element import NavigableString
+ from bs4._typing import (
+ _Encoding,
+ _Encodings,
+ _RawMarkup,
+ )
+
+HTMLPARSER = "html.parser"
+
+_DuplicateAttributeHandler = Callable[[Dict[str, str], str, str], None]
+
+
+class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
+ """A subclass of the Python standard library's HTMLParser class, which
+ listens for HTMLParser events and translates them into calls
+ to Beautiful Soup's tree construction API.
+
+ :param on_duplicate_attribute: A strategy for what to do if a
+ tag includes the same attribute more than once. Accepted
+ values are: REPLACE (replace earlier values with later
+ ones, the default), IGNORE (keep the earliest value
+ encountered), or a callable. A callable must take three
+ arguments: the dictionary of attributes already processed,
+ the name of the duplicate attribute, and the most recent value
+ encountered.
+ """
+
+ #: Constant to handle duplicate attributes by replacing earlier values
+ #: with later ones.
+ REPLACE: str = "replace"
+
+ #: Constant to handle duplicate attributes by ignoring later values
+ #: and keeping the earlier ones.
+ IGNORE: str = "ignore"
+
+ def __init__(
+ self,
+ soup: BeautifulSoup,
+ *args: Any,
+ on_duplicate_attribute: Union[str, _DuplicateAttributeHandler] = REPLACE,
+ **kwargs: Any,
+ ):
+ self.soup = soup
+ self.on_duplicate_attribute = on_duplicate_attribute
+ self.attribute_dict_class = soup.builder.attribute_dict_class
+ HTMLParser.__init__(self, *args, **kwargs)
+
+ # Keep a list of empty-element tags that were encountered
+ # without an explicit closing tag. If we encounter a closing tag
+ # of this type, we'll associate it with one of those entries.
+ #
+ # This isn't a stack because we don't care about the
+ # order. It's a list of closing tags we've already handled and
+ # will ignore, assuming they ever show up.
+ self.already_closed_empty_element = []
+
+ self._initialize_xml_detector()
+
+ on_duplicate_attribute: Union[str, _DuplicateAttributeHandler]
+ already_closed_empty_element: List[str]
+ soup: BeautifulSoup
+
+ def error(self, message: str) -> None:
+ # NOTE: This method is required so long as Python 3.9 is
+ # supported. The corresponding code is removed from HTMLParser
+ # in 3.5, but not removed from ParserBase until 3.10.
+ # https://github.com/python/cpython/issues/76025
+ #
+ # The original implementation turned the error into a warning,
+ # but in every case I discovered, this made HTMLParser
+ # immediately crash with an error message that was less
+ # helpful than the warning. The new implementation makes it
+ # more clear that html.parser just can't parse this
+ # markup. The 3.10 implementation does the same, though it
+ # raises AssertionError rather than calling a method. (We
+ # catch this error and wrap it in a ParserRejectedMarkup.)
+ raise ParserRejectedMarkup(message)
+
+ def handle_startendtag(
+ self, name: str, attrs: List[Tuple[str, Optional[str]]]
+ ) -> None:
+ """Handle an incoming empty-element tag.
+
+ html.parser only calls this method when the markup looks like
+ <tag/>.
+ """
+ # `handle_empty_element` tells handle_starttag not to close the tag
+ # just because its name matches a known empty-element tag. We
+ # know that this is an empty-element tag, and we want to call
+ # handle_endtag ourselves.
+ self.handle_starttag(name, attrs, handle_empty_element=False)
+ self.handle_endtag(name)
+
+ def handle_starttag(
+ self,
+ name: str,
+ attrs: List[Tuple[str, Optional[str]]],
+ handle_empty_element: bool = True,
+ ) -> None:
+ """Handle an opening tag, e.g. '<tag>'
+
+ :param handle_empty_element: True if this tag is known to be
+ an empty-element tag (i.e. there is not expected to be any
+ closing tag).
+ """
+ # TODO: handle namespaces here?
+ attr_dict: AttributeDict = self.attribute_dict_class()
+ for key, value in attrs:
+ # Change None attribute values to the empty string
+ # for consistency with the other tree builders.
+ if value is None:
+ value = ""
+ if key in attr_dict:
+ # A single attribute shows up multiple times in this
+ # tag. How to handle it depends on the
+ # on_duplicate_attribute setting.
+ on_dupe = self.on_duplicate_attribute
+ if on_dupe == self.IGNORE:
+ pass
+ elif on_dupe in (None, self.REPLACE):
+ attr_dict[key] = value
+ else:
+ on_dupe = cast(_DuplicateAttributeHandler, on_dupe)
+ on_dupe(attr_dict, key, value)
+ else:
+ attr_dict[key] = value
+ # print("START", name)
+ sourceline: Optional[int]
+ sourcepos: Optional[int]
+ if self.soup.builder.store_line_numbers:
+ sourceline, sourcepos = self.getpos()
+ else:
+ sourceline = sourcepos = None
+ tag = self.soup.handle_starttag(
+ name, None, None, attr_dict, sourceline=sourceline, sourcepos=sourcepos
+ )
+ if tag and tag.is_empty_element and handle_empty_element:
+ # Unlike other parsers, html.parser doesn't send separate end tag
+ # events for empty-element tags. (It's handled in
+ # handle_startendtag, but only if the original markup looked like
+ # <tag/>.)
+ #
+ # So we need to call handle_endtag() ourselves. Since we
+ # know the start event is identical to the end event, we
+ # don't want handle_endtag() to cross off any previous end
+ # events for tags of this name.
+ self.handle_endtag(name, check_already_closed=False)
+
+ # But we might encounter an explicit closing tag for this tag
+ # later on. If so, we want to ignore it.
+ self.already_closed_empty_element.append(name)
+
+ if self._root_tag_name is None:
+ self._root_tag_encountered(name)
+
+ def handle_endtag(self, name: str, check_already_closed: bool = True) -> None:
+ """Handle a closing tag, e.g. '</tag>'
+
+ :param name: A tag name.
+ :param check_already_closed: True if this tag is expected to
+ be the closing portion of an empty-element tag,
+ e.g. '<tag></tag>'.
+ """
+ # print("END", name)
+ if check_already_closed and name in self.already_closed_empty_element:
+ # This is a redundant end tag for an empty-element tag.
+ # We've already called handle_endtag() for it, so just
+ # check it off the list.
+ # print("ALREADY CLOSED", name)
+ self.already_closed_empty_element.remove(name)
+ else:
+ self.soup.handle_endtag(name)
+
+ def handle_data(self, data: str) -> None:
+ """Handle some textual data that shows up between tags."""
+ self.soup.handle_data(data)
+
+ def handle_charref(self, name: str) -> None:
+ """Handle a numeric character reference by converting it to the
+ corresponding Unicode character and treating it as textual
+ data.
+
+ :param name: Character number, possibly in hexadecimal.
+ """
+ # TODO: This was originally a workaround for a bug in
+ # HTMLParser. (http://bugs.python.org/issue13633) The bug has
+ # been fixed, but removing this code still makes some
+ # Beautiful Soup tests fail. This needs investigation.
+ if name.startswith("x"):
+ real_name = int(name.lstrip("x"), 16)
+ elif name.startswith("X"):
+ real_name = int(name.lstrip("X"), 16)
+ else:
+ real_name = int(name)
+
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
+ for encoding in (self.soup.original_encoding, "windows-1252"):
+ if not encoding:
+ continue
+ try:
+ data = bytearray([real_name]).decode(encoding)
+ except UnicodeDecodeError:
+ pass
+ if not data:
+ try:
+ data = chr(real_name)
+ except (ValueError, OverflowError):
+ pass
+ data = data or "\N{REPLACEMENT CHARACTER}"
+ self.handle_data(data)
+
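+ # Worked examples for the branches above (illustrative values):
+ #
+ # int("x2014".lstrip("x"), 16) # 8212; chr(8212) is an em dash
+ # bytearray([147]).decode("windows-1252") # '\u201c', LEFT DOUBLE
+ # # QUOTATION MARK, the compensation case
+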
+ def handle_entityref(self, name: str) -> None:
+ """Handle a named entity reference by converting it to the
+ corresponding Unicode character(s) and treating it as textual
+ data.
+
+ :param name: Name of the entity reference.
+ """
+ character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+ if character is not None:
+ data = character
+ else:
+ # If this were XML, it would be ambiguous whether "&foo"
+ # was a character entity reference with a missing
+ # semicolon or the literal string "&foo". Since this is
+ # HTML, we have a complete list of all character entity references,
+ # and this one wasn't found, so assume it's the literal string "&foo".
+ data = "&%s" % name
+ self.handle_data(data)
+
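+ # For illustration, using the same table the method consults:
+ #
+ # EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get("eacute") # 'é'
+ # EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get("nosuch") # None;
+ # # handle_entityref() then emits the literal string "&nosuch"
+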
+ def handle_comment(self, data: str) -> None:
+ """Handle an HTML comment.
+
+ :param data: The text of the comment.
+ """
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(Comment)
+
+ def handle_decl(self, data: str) -> None:
+ """Handle a DOCTYPE declaration.
+
+ :param data: The text of the declaration.
+ """
+ self.soup.endData()
+ data = data[len("DOCTYPE ") :]
+ self.soup.handle_data(data)
+ self.soup.endData(Doctype)
+
+ def unknown_decl(self, data: str) -> None:
+ """Handle a declaration of unknown type -- probably a CDATA block.
+
+ :param data: The text of the declaration.
+ """
+ cls: Type[NavigableString]
+ if data.upper().startswith("CDATA["):
+ cls = CData
+ data = data[len("CDATA[") :]
+ else:
+ cls = Declaration
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self.soup.endData(cls)
+
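+ # Illustrative trace (assumed input): html.parser hands
+ # '<![CDATA[hello]]>' to unknown_decl() as "CDATA[hello", so the
+ # prefix test matches, the slice leaves "hello", and the document
+ # gains a CData node containing "hello".
+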
+ def handle_pi(self, data: str) -> None:
+ """Handle a processing instruction.
+
+ :param data: The text of the instruction.
+ """
+ self.soup.endData()
+ self.soup.handle_data(data)
+ self._document_might_be_xml(data)
+ self.soup.endData(ProcessingInstruction)
+
+
+class HTMLParserTreeBuilder(HTMLTreeBuilder):
+ """A Beautiful soup `bs4.builder.TreeBuilder` that uses the
+ :py:class:`html.parser.HTMLParser` parser, found in the Python
+ standard library.
+
+ """
+
+ is_xml: bool = False
+ picklable: bool = True
+ NAME: str = HTMLPARSER
+ features: Iterable[str] = [NAME, HTML, STRICT]
+ parser_args: Tuple[Iterable[Any], Dict[str, Any]]
+
+ #: html.parser keeps track of the line number and position in the
+ #: original file where each element begins.
+ TRACKS_LINE_NUMBERS: bool = True
+
+ def __init__(
+ self,
+ parser_args: Optional[Iterable[Any]] = None,
+ parser_kwargs: Optional[Dict[str, Any]] = None,
+ **kwargs: Any,
+ ):
+ """Constructor.
+
+ :param parser_args: Positional arguments to pass into
+ the BeautifulSoupHTMLParser constructor, once it's
+ invoked.
+ :param parser_kwargs: Keyword arguments to pass into
+ the BeautifulSoupHTMLParser constructor, once it's
+ invoked.
+ :param kwargs: Keyword arguments for the superclass constructor.
+ """
+ # Some keyword arguments will be pulled out of kwargs and placed
+ # into parser_kwargs.
+ extra_parser_kwargs = dict()
+ for arg in ("on_duplicate_attribute",):
+ if arg in kwargs:
+ value = kwargs.pop(arg)
+ extra_parser_kwargs[arg] = value
+ super(HTMLParserTreeBuilder, self).__init__(**kwargs)
+ parser_args = parser_args or []
+ parser_kwargs = parser_kwargs or {}
+ parser_kwargs.update(extra_parser_kwargs)
+ parser_kwargs["convert_charrefs"] = False
+ self.parser_args = (parser_args, parser_kwargs)
+
+ def prepare_markup(
+ self,
+ markup: _RawMarkup,
+ user_specified_encoding: Optional[_Encoding] = None,
+ document_declared_encoding: Optional[_Encoding] = None,
+ exclude_encodings: Optional[_Encodings] = None,
+ ) -> Iterable[Tuple[str, Optional[_Encoding], Optional[_Encoding], bool]]:
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: Some markup -- probably a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples: (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for parsing the document.
+ This TreeBuilder uses Unicode, Dammit to convert the markup
+ into Unicode, so the ``markup`` element of the tuple will
+ always be a string.
+ """
+ if isinstance(markup, str):
+ # Parse Unicode as-is.
+ yield (markup, None, None, False)
+ return
+
+ # Ask UnicodeDammit to sniff the most likely encoding.
+
+ known_definite_encodings: List[_Encoding] = []
+ if user_specified_encoding:
+ # This was provided by the end-user; treat it as a known
+ # definite encoding per the algorithm laid out in the
+ # HTML5 spec. (See the EncodingDetector class for
+ # details.)
+ known_definite_encodings.append(user_specified_encoding)
+
+ user_encodings: List[_Encoding] = []
+ if document_declared_encoding:
+ # This was found in the document; treat it as a slightly
+ # lower-priority user encoding.
+ user_encodings.append(document_declared_encoding)
+
+ dammit = UnicodeDammit(
+ markup,
+ known_definite_encodings=known_definite_encodings,
+ user_encodings=user_encodings,
+ is_html=True,
+ exclude_encodings=exclude_encodings,
+ )
+
+ if dammit.unicode_markup is None:
+ # In every case I've seen, Unicode, Dammit is able to
+ # convert the markup into Unicode, even if it needs to use
+ # REPLACEMENT CHARACTER. But there is a code path that
+ # could result in unicode_markup being None, and
+ # HTMLParser can only parse Unicode, so here we handle
+ # that code path.
+ raise ParserRejectedMarkup(
+ "Could not convert input to Unicode, and html.parser will not accept bytestrings."
+ )
+ else:
+ yield (
+ dammit.unicode_markup,
+ dammit.original_encoding,
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters,
+ )
+
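+ # Illustrative outcomes of prepare_markup() (assumed inputs):
+ #
+ # prepare_markup("<p>déjà</p>") # yields ("<p>déjà</p>", None, None, False)
+ # prepare_markup(b"<p>caf\xe9</p>", user_specified_encoding="latin-1")
+ # # yields one decoded strategy, e.g. ("<p>café</p>", "latin-1", None, False)
+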
+ def feed(self, markup: _RawMarkup) -> None:
+ args, kwargs = self.parser_args
+
+ # HTMLParser.feed will only handle str, but
+ # BeautifulSoup.markup is allowed to be _RawMarkup, because
+ # it's set by the yield value of
+ # TreeBuilder.prepare_markup. Fortunately,
+ # HTMLParserTreeBuilder.prepare_markup always yields a str
+ # (UnicodeDammit.unicode_markup).
+ assert isinstance(markup, str)
+
+ # We know BeautifulSoup calls TreeBuilder.initialize_soup
+ # before calling feed(), so we can assume self.soup
+ # is set.
+ assert self.soup is not None
+ parser = BeautifulSoupHTMLParser(self.soup, *args, **kwargs)
+
+ try:
+ parser.feed(markup)
+ parser.close()
+ except AssertionError as e:
+ # html.parser raises AssertionError in rare cases to
+ # indicate a fatal problem with the markup, especially
+ # when there's an error in the doctype declaration.
+ raise ParserRejectedMarkup(e)
+ parser.already_closed_empty_element = []
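+
+ # Usage sketch (illustrative): this builder backs the "html.parser"
+ # feature string registered above, so it is selected by e.g.:
+ #
+ # from bs4 import BeautifulSoup
+ # soup = BeautifulSoup("<p>Hi<br></p>", "html.parser")
+ # soup.p.sourceline # should be 1, thanks to TRACKS_LINE_NUMBERS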
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py b/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py
new file mode 100644
index 00000000..1f367da3
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py
@@ -0,0 +1,490 @@
+# encoding: utf-8
+from __future__ import annotations
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+__all__ = [
+ "LXMLTreeBuilderForXML",
+ "LXMLTreeBuilder",
+]
+
+
+from typing import (
+ Any,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Set,
+ Tuple,
+ Type,
+ TYPE_CHECKING,
+ Union,
+)
+from typing_extensions import TypeAlias
+
+from io import BytesIO
+from io import StringIO
+from lxml import etree
+from bs4.element import (
+ AttributeDict,
+ XMLAttributeDict,
+ Comment,
+ Doctype,
+ NamespacedAttribute,
+ ProcessingInstruction,
+ XMLProcessingInstruction,
+)
+from bs4.builder import (
+ DetectsXMLParsedAsHTML,
+ FAST,
+ HTML,
+ HTMLTreeBuilder,
+ PERMISSIVE,
+ TreeBuilder,
+ XML,
+)
+from bs4.dammit import EncodingDetector
+from bs4.exceptions import ParserRejectedMarkup
+
+if TYPE_CHECKING:
+ from bs4._typing import (
+ _Encoding,
+ _Encodings,
+ _NamespacePrefix,
+ _NamespaceURL,
+ _NamespaceMapping,
+ _InvertedNamespaceMapping,
+ _RawMarkup,
+ )
+ from bs4 import BeautifulSoup
+
+LXML: str = "lxml"
+
+
+def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
+ "Invert a dictionary."
+ return dict((v, k) for k, v in list(d.items()))
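+ # e.g. (illustrative):
+ # _invert({"xml": "http://www.w3.org/XML/1998/namespace"})
+ # # {"http://www.w3.org/XML/1998/namespace": "xml"}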
+
+
+_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
+_ParserOrParserClass: TypeAlias = Union[
+ _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
+]
+
+
+class LXMLTreeBuilderForXML(TreeBuilder):
+ DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser
+
+ is_xml: bool = True
+
+ processing_instruction_class: Type[ProcessingInstruction]
+
+ NAME: str = "lxml-xml"
+ ALTERNATE_NAMES: Iterable[str] = ["xml"]
+
+ # Well, it's permissive by XML parser standards.
+ features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]
+
+ CHUNK_SIZE: int = 512
+
+ # This namespace mapping is specified in the XML Namespace
+ # standard.
+ DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")
+
+ DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)
+
+ nsmaps: List[Optional[_InvertedNamespaceMapping]]
+ empty_element_tags: Set[str]
+ parser: Any
+ _default_parser: Optional[etree.XMLParser]
+
+ # NOTE: If we parsed Element objects and looked at .sourceline,
+ # we'd be able to see the line numbers from the original document.
+ # But instead we build an XMLParser or HTMLParser object to serve
+ # as the target of parse messages, and those messages don't include
+ # line numbers.
+ # See: https://bugs.launchpad.net/lxml/+bug/1846906
+
+ def initialize_soup(self, soup: BeautifulSoup) -> None:
+ """Let the BeautifulSoup object know about the standard namespace
+ mapping.
+
+ :param soup: A `BeautifulSoup`.
+ """
+ # Beyond this point, self.soup is set, so we can assume (and
+ # assert) it's not None whenever necessary.
+ super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+ self._register_namespaces(self.DEFAULT_NSMAPS)
+
+ def _register_namespaces(self, mapping: Dict[str, str]) -> None:
+ """Let the BeautifulSoup object know about namespaces encountered
+ while parsing the document.
+
+ This might be useful later on when creating CSS selectors.
+
+ This will track (almost) all namespaces, even ones that were
+ only in scope for part of the document. If two namespaces have
+ the same prefix, only the first one encountered will be
+ tracked. Un-prefixed namespaces are not tracked.
+
+ :param mapping: A dictionary mapping namespace prefixes to URIs.
+ """
+ assert self.soup is not None
+ for key, value in list(mapping.items()):
+ # This is 'if key' and not 'if key is not None' because we
+ # don't track un-prefixed namespaces. soupsieve will
+ # treat an un-prefixed namespace as the default, which
+ # causes confusion in some cases.
+ if key and key not in self.soup._namespaces:
+ # Let the BeautifulSoup object know about a new namespace.
+ # If there are multiple namespaces defined with the same
+ # prefix, the first one in the document takes precedence.
+ self.soup._namespaces[key] = value
+
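+ # A sketch of why this is useful (hypothetical document; relies on
+ # the documented soupsieve namespace support):
+ #
+ # xml = ('<doc xmlns:dc="http://purl.org/dc/elements/1.1/">'
+ # '<dc:title>T</dc:title></doc>')
+ # soup = BeautifulSoup(xml, "xml")
+ # soup.select("dc|title") # "dc" resolves through soup._namespaces
+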
+ def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
+ """Find the default parser for the given encoding.
+
+ :return: Either a parser object or a class, which
+ will be instantiated with default arguments.
+ """
+ if self._default_parser is not None:
+ return self._default_parser
+ return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)
+
+ def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
+ """Instantiate an appropriate parser for the given encoding.
+
+ :param encoding: A string.
+ :return: A parser object such as an `etree.XMLParser`.
+ """
+ # Use the default parser.
+ parser = self.default_parser(encoding)
+
+ if callable(parser):
+ # Instantiate the parser with default arguments
+ parser = parser(target=self, recover=True, encoding=encoding)
+ return parser
+
+ def __init__(
+ self,
+ parser: Optional[etree.XMLParser] = None,
+ empty_element_tags: Optional[Set[str]] = None,
+ **kwargs: Any,
+ ):
+ # TODO: Issue a warning if parser is present but not a
+ # callable, since that means there's no way to create new
+ # parsers for different encodings.
+ self._default_parser = parser
+ self.soup = None
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+ self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
+ if "attribute_dict_class" not in kwargs:
+ kwargs["attribute_dict_class"] = XMLAttributeDict
+ super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
+
+ def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
+ # Split the namespace URL out of a fully-qualified lxml tag
+ # name. Copied from lxml's src/lxml/sax.py.
+ if tag[0] == "{":
+ namespace, name = tag[1:].split("}", 1)
+ return (namespace, name)
+ else:
+ return (None, tag)
+
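+ # e.g. (illustrative):
+ # _getNsTag("{http://www.w3.org/1999/xhtml}body")
+ # # ("http://www.w3.org/1999/xhtml", "body")
+ # _getNsTag("body") # (None, "body")
+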
+ def prepare_markup(
+ self,
+ markup: _RawMarkup,
+ user_specified_encoding: Optional[_Encoding] = None,
+ document_declared_encoding: Optional[_Encoding] = None,
+ exclude_encodings: Optional[_Encodings] = None,
+ ) -> Iterable[
+ Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
+ ]:
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ lxml really wants to get a bytestring and convert it to
+ Unicode itself. So instead of using UnicodeDammit to convert
+ the bytestring to Unicode using different encodings, this
+ implementation uses EncodingDetector to iterate over the
+ encodings, and tell lxml to try to parse the document as each
+ one in turn.
+
+ :param markup: Some markup -- hopefully a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples: (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
+ """
+ is_html = not self.is_xml
+ if is_html:
+ self.processing_instruction_class = ProcessingInstruction
+ # We're in HTML mode, so if we're given XML, that's worth
+ # noting.
+ DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
+ else:
+ self.processing_instruction_class = XMLProcessingInstruction
+
+ if isinstance(markup, str):
+ # We were given Unicode. Maybe lxml can parse Unicode on
+ # this system?
+
+ # TODO: This is a workaround for
+ # https://bugs.launchpad.net/lxml/+bug/1948551.
+ # We can remove it once the upstream issue is fixed.
+ if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
+ markup = markup[1:]
+ yield markup, None, document_declared_encoding, False
+
+ if isinstance(markup, str):
+ # No, apparently not. Convert the Unicode to UTF-8 and
+ # tell lxml to parse it as UTF-8.
+ yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)
+
+ # Since the document was Unicode in the first place, there
+ # is no need to try any more strategies; we know this will
+ # work.
+ return
+
+ known_definite_encodings: List[_Encoding] = []
+ if user_specified_encoding:
+ # This was provided by the end-user; treat it as a known
+ # definite encoding per the algorithm laid out in the
+ # HTML5 spec. (See the EncodingDetector class for
+ # details.)
+ known_definite_encodings.append(user_specified_encoding)
+
+ user_encodings: List[_Encoding] = []
+ if document_declared_encoding:
+ # This was found in the document; treat it as a slightly
+ # lower-priority user encoding.
+ user_encodings.append(document_declared_encoding)
+
+ detector = EncodingDetector(
+ markup,
+ known_definite_encodings=known_definite_encodings,
+ user_encodings=user_encodings,
+ is_html=is_html,
+ exclude_encodings=exclude_encodings,
+ )
+ for encoding in detector.encodings:
+ yield (detector.markup, encoding, document_declared_encoding, False)
+
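+ # Illustrative yield sequence (assumed input): for
+ # prepare_markup(b"<a>caf\xe9</a>", user_specified_encoding="latin-1"),
+ # EncodingDetector might propose "latin-1" first and sniffed fallbacks
+ # after it, yielding tuples like (b"<a>caf\xe9</a>", "latin-1", None,
+ # False); the markup stays a bytestring so lxml can decode it itself.
+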
+ def feed(self, markup: _RawMarkup) -> None:
+ io: Union[BytesIO, StringIO]
+ if isinstance(markup, bytes):
+ io = BytesIO(markup)
+ elif isinstance(markup, str):
+ io = StringIO(markup)
+
+ # initialize_soup is called before feed, so we know this
+ # is not None.
+ assert self.soup is not None
+
+ # Call feed() at least once, even if the markup is empty,
+ # or the parser won't be initialized.
+ data = io.read(self.CHUNK_SIZE)
+ try:
+ self.parser = self.parser_for(self.soup.original_encoding)
+ self.parser.feed(data)
+ while len(data) != 0:
+ # Now call feed() on the rest of the data, chunk by chunk.
+ data = io.read(self.CHUNK_SIZE)
+ if len(data) != 0:
+ self.parser.feed(data)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
+ raise ParserRejectedMarkup(e)
+
+ def close(self) -> None:
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+
+ def start(
+ self,
+ tag: str | bytes,
+ attrs: Dict[str | bytes, str | bytes],
+ nsmap: _NamespaceMapping = {},
+ ) -> None:
+ # This is called by lxml code as a result of calling
+ # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
+ # is called.
+ assert self.soup is not None
+ assert isinstance(tag, str)
+
+ # We need to recreate the attribute dict for three
+ # reasons. First, for type checking, so we can assert there
+ # are no bytestrings in the keys or values. Second, because we
+ # need a mutable dict--lxml might send us an immutable
+ # dictproxy. Third, so we can handle namespaced attribute
+ # names by converting the keys to NamespacedAttributes.
+ new_attrs: Dict[Union[str, NamespacedAttribute], str] = (
+ self.attribute_dict_class()
+ )
+ for k, v in attrs.items():
+ assert isinstance(k, str)
+ assert isinstance(v, str)
+ new_attrs[k] = v
+
+ nsprefix: Optional[_NamespacePrefix] = None
+ namespace: Optional[_NamespaceURL] = None
+ # Invert each namespace map as it comes in.
+ if len(nsmap) == 0 and len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
+ self.nsmaps.append(None)
+ elif len(nsmap) > 0:
+ # A new namespace mapping has come into play.
+
+ # First, let the BeautifulSoup object know about it.
+ self._register_namespaces(nsmap)
+
+ # Then, add it to our running list of inverted namespace
+ # mappings.
+ self.nsmaps.append(_invert(nsmap))
+
+ # The currently active namespace prefixes have
+ # changed. Calculate the new mapping so it can be stored
+ # with all Tag objects created while these prefixes are in
+ # scope.
+ current_mapping = dict(self.active_namespace_prefixes[-1])
+ current_mapping.update(nsmap)
+
+ # We should not track un-prefixed namespaces as we can only hold one
+ # and it will be recognized as the default namespace by soupsieve,
+ # which may be confusing in some situations.
+ if "" in current_mapping:
+ del current_mapping[""]
+ self.active_namespace_prefixes.append(current_mapping)
+
+ # Also treat the namespace mapping as a set of attributes on the
+ # tag, so we can recreate it later.
+ for prefix, namespace in list(nsmap.items()):
+ attribute = NamespacedAttribute(
+ "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
+ )
+ new_attrs[attribute] = namespace
+
+ # Namespaces are in play. Find any attributes that came in
+ # from lxml with namespaces attached to their names, and
+ # turn them into NamespacedAttribute objects.
+ final_attrs: AttributeDict = self.attribute_dict_class()
+ for attr, value in list(new_attrs.items()):
+ namespace, attr = self._getNsTag(attr)
+ if namespace is None:
+ final_attrs[attr] = value
+ else:
+ nsprefix = self._prefix_for_namespace(namespace)
+ attr = NamespacedAttribute(nsprefix, attr, namespace)
+ final_attrs[attr] = value
+
+ namespace, tag = self._getNsTag(tag)
+ nsprefix = self._prefix_for_namespace(namespace)
+ self.soup.handle_starttag(
+ tag,
+ namespace,
+ nsprefix,
+ final_attrs,
+ namespaces=self.active_namespace_prefixes[-1],
+ )
+
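+ # Sketch of the bookkeeping above (illustrative input): parsing
+ # '<root xmlns:a="http://A/"><a:x/></root>' as XML pushes the
+ # inverted map {"http://A/": "a"} at <root>, gives the <a:x> Tag
+ # prefix "a" and namespace "http://A/", records the mapping as an
+ # xmlns:a attribute on <root>, and end() pops the map at </root>.
+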
+ def _prefix_for_namespace(
+ self, namespace: Optional[_NamespaceURL]
+ ) -> Optional[_NamespacePrefix]:
+ """Find the currently active prefix for the given namespace."""
+ if namespace is None:
+ return None
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ return inverted_nsmap[namespace]
+ return None
+
+ def end(self, name: str | bytes) -> None:
+ assert self.soup is not None
+ assert isinstance(name, str)
+ self.soup.endData()
+ namespace, name = self._getNsTag(name)
+ nsprefix = None
+ if namespace is not None:
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ nsprefix = inverted_nsmap[namespace]
+ break
+ self.soup.handle_endtag(name, nsprefix)
+ if len(self.nsmaps) > 1:
+ # This tag, or one of its parents, introduced a namespace
+ # mapping, so pop it off the stack.
+ out_of_scope_nsmap = self.nsmaps.pop()
+
+ if out_of_scope_nsmap is not None:
+ # This tag introduced a namespace mapping which is no
+ # longer in scope. Recalculate the currently active
+ # namespace prefixes.
+ self.active_namespace_prefixes.pop()
+
+ def pi(self, target: str, data: str) -> None:
+ assert self.soup is not None
+ self.soup.endData()
+ data = target + " " + data
+ self.soup.handle_data(data)
+ self.soup.endData(self.processing_instruction_class)
+
+ def data(self, data: str | bytes) -> None:
+ assert self.soup is not None
+ assert isinstance(data, str)
+ self.soup.handle_data(data)
+
+ def doctype(self, name: str, pubid: str, system: str) -> None:
+ assert self.soup is not None
+ self.soup.endData()
+ doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
+ self.soup.handle_data(doctype_string)
+ self.soup.endData(containerClass=Doctype)
+
+ def comment(self, text: str | bytes) -> None:
+ "Handle comments as Comment objects."
+ assert self.soup is not None
+ assert isinstance(text, str)
+ self.soup.endData()
+ self.soup.handle_data(text)
+ self.soup.endData(Comment)
+
+ def test_fragment_to_document(self, fragment: str) -> str:
+ """See `TreeBuilder`."""
+ return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+
+
+class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+ NAME: str = LXML
+ ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]
+
+ features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
+ is_xml: bool = False
+
+ def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
+ return etree.HTMLParser
+
+ def feed(self, markup: _RawMarkup) -> None:
+ # We know self.soup is set by the time feed() is called.
+ assert self.soup is not None
+ encoding = self.soup.original_encoding
+ try:
+ self.parser = self.parser_for(encoding)
+ self.parser.feed(markup)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
+ raise ParserRejectedMarkup(e)
+
+ def test_fragment_to_document(self, fragment: str) -> str:
+ """See `TreeBuilder`."""
+ return "<html><body>%s</body></html>" % fragment