two version of R2R are hereHEAD master

author: S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-master.tar.gz
1 files changed, 594 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py b/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py
new file mode 100644
index 00000000..c13439d0
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py
@@ -0,0 +1,594 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+__all__ = [
+    "HTML5TreeBuilder",
+]
+
+from typing import (
+    Any,
+    cast,
+    Dict,
+    Iterable,
+    Optional,
+    Sequence,
+    TYPE_CHECKING,
+    Tuple,
+    Union,
+)
+from typing_extensions import TypeAlias
+from bs4._typing import (
+    _AttributeValue,
+    _AttributeValues,
+    _Encoding,
+    _Encodings,
+    _NamespaceURL,
+    _RawMarkup,
+)
+
+import warnings
+from bs4.builder import (
+    DetectsXMLParsedAsHTML,
+    PERMISSIVE,
+    HTML,
+    HTML_5,
+    HTMLTreeBuilder,
+)
+from bs4.element import (
+    NamespacedAttribute,
+    PageElement,
+    nonwhitespace_re,
+)
+import html5lib
+from html5lib.constants import (
+    namespaces,
+)
+from bs4.element import (
+    Comment,
+    Doctype,
+    NavigableString,
+    Tag,
+)
+
+if TYPE_CHECKING:
+    from bs4 import BeautifulSoup
+
+from html5lib.treebuilders import base as treebuilder_base
+
+
+class HTML5TreeBuilder(HTMLTreeBuilder):
+    """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
+    build a tree.
+
+    Note that `HTML5TreeBuilder` does not support some common HTML
+    `TreeBuilder` features. Some of these features could theoretically
+    be implemented, but at the very least it's quite difficult,
+    because html5lib moves the parse tree around as it's being built.
+
+    Specifically:
+
+    * This `TreeBuilder` doesn't use different subclasses of
+      `NavigableString` (e.g. `Script`) based on the name of the tag
+      in which the string was found.
+    * You can't use a `SoupStrainer` to parse only part of a document.
+    """
+
+    NAME: str = "html5lib"
+
+    features: Sequence[str] = [NAME, PERMISSIVE, HTML_5, HTML]
+
+    #: html5lib can tell us which line number and position in the
+    #: original file is the source of an element.
+    TRACKS_LINE_NUMBERS: bool = True
+
+    underlying_builder: "TreeBuilderForHtml5lib"  #: :meta private:
+    user_specified_encoding: Optional[_Encoding]
+
+    def prepare_markup(
+        self,
+        markup: _RawMarkup,
+        user_specified_encoding: Optional[_Encoding] = None,
+        document_declared_encoding: Optional[_Encoding] = None,
+        exclude_encodings: Optional[_Encodings] = None,
+    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
+        # Store the user-specified encoding for use later on.
+        self.user_specified_encoding = user_specified_encoding
+
+        # document_declared_encoding and exclude_encodings aren't used
+        # ATM because the html5lib TreeBuilder doesn't use
+        # UnicodeDammit.
+        for variable, name in (
+            (document_declared_encoding, "document_declared_encoding"),
+            (exclude_encodings, "exclude_encodings"),
+        ):
+            if variable:
+                warnings.warn(
+                    f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
+                    stacklevel=3,
+                )
+
+        # html5lib only parses HTML, so if it's given XML that's worth
+        # noting.
+        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
+
+        yield (markup, None, None, False)
+
+    # These methods are defined by Beautiful Soup.
+    def feed(self, markup: _RawMarkup) -> None:
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.
+        """
+        if self.soup is not None and self.soup.parse_only is not None:
+            warnings.warn(
+                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
+                stacklevel=4,
+            )
+
+        # self.underlying_builder is probably None now, but it'll be set
+        # when html5lib calls self.create_treebuilder().
+        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
+        assert self.underlying_builder is not None
+        self.underlying_builder.parser = parser
+        extra_kwargs = dict()
+        if not isinstance(markup, str):
+            # kwargs, specifically override_encoding, will eventually
+            # be passed in to html5lib's
+            # HTMLBinaryInputStream.__init__.
+            extra_kwargs["override_encoding"] = self.user_specified_encoding
+
+        doc = parser.parse(markup, **extra_kwargs)
+
+        # Set the character encoding detected by the tokenizer.
+        if isinstance(markup, str):
+            # We need to special-case this because html5lib sets
+            # charEncoding to UTF-8 if it gets Unicode input.
+            doc.original_encoding = None
+        else:
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            # The encoding is an html5lib Encoding object. We want to
+            # use a string for compatibility with other tree builders.
+            original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding
+        self.underlying_builder.parser = None
+
+    def create_treebuilder(
+        self, namespaceHTMLElements: bool
+    ) -> "TreeBuilderForHtml5lib":
+        """Called by html5lib to instantiate the kind of class it
+        calls a 'TreeBuilder'.
+
+        :param namespaceHTMLElements: Whether or not to namespace HTML elements.
+
+        :meta private:
+        """
+        self.underlying_builder = TreeBuilderForHtml5lib(
+            namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
+        )
+        return self.underlying_builder
+
+    def test_fragment_to_document(self, fragment: str) -> str:
+        """See `TreeBuilder`."""
+        return "<html><head></head><body>%s</body></html>" % fragment
+
+
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
+    soup: "BeautifulSoup"  #: :meta private:
+    parser: Optional[html5lib.HTMLParser]  #: :meta private:
+
+    def __init__(
+        self,
+        namespaceHTMLElements: bool,
+        soup: Optional["BeautifulSoup"] = None,
+        store_line_numbers: bool = True,
+        **kwargs: Any,
+    ):
+        if soup:
+            self.soup = soup
+        else:
+            warnings.warn(
+                "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            from bs4 import BeautifulSoup
+
+            # TODO: Why is the parser 'html.parser' here? Using
+            # html5lib doesn't cause an infinite loop and is more
+            # accurate. Best to get rid of this entire section, I think.
+            self.soup = BeautifulSoup(
+                "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
+            )
+        # TODO: What are **kwargs exactly? Should they be passed in
+        # here in addition to/instead of being passed to the BeautifulSoup
+        # constructor?
+        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+
+        # This will be set later to a real html5lib HTMLParser object,
+        # which we can use to track the current line number.
+        self.parser = None
+        self.store_line_numbers = store_line_numbers
+
+    def documentClass(self) -> "Element":
+        self.soup.reset()
+        return Element(self.soup, self.soup, None)
+
+    def insertDoctype(self, token: Dict[str, Any]) -> None:
+        name: str = cast(str, token["name"])
+        publicId: Optional[str] = cast(Optional[str], token["publicId"])
+        systemId: Optional[str] = cast(Optional[str], token["systemId"])
+
+        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
+        self.soup.object_was_parsed(doctype)
+
+    def elementClass(self, name: str, namespace: str) -> "Element":
+        sourceline: Optional[int] = None
+        sourcepos: Optional[int] = None
+        if self.parser is not None and self.store_line_numbers:
+            # This represents the point immediately after the end of the
+            # tag. We don't know when the tag started, but we do know
+            # where it ended -- the character just before this one.
+            sourceline, sourcepos = self.parser.tokenizer.stream.position()
+            assert sourcepos is not None
+            sourcepos = sourcepos - 1
+        tag = self.soup.new_tag(
+            name, namespace, sourceline=sourceline, sourcepos=sourcepos
+        )
+
+        return Element(tag, self.soup, namespace)
+
+    def commentClass(self, data: str) -> "TextNode":
+        return TextNode(Comment(data), self.soup)
+
+    def fragmentClass(self) -> "Element":
+        """This is only used by html5lib HTMLParser.parseFragment(),
+        which is never used by Beautiful Soup, only by the html5lib
+        unit tests. Since we don't currently hook into those tests,
+        the implementation is left blank.
+        """
+        raise NotImplementedError()
+
+    def getFragment(self) -> "Element":
+        """This is only used by the html5lib unit tests. Since we
+        don't currently hook into those tests, the implementation is
+        left blank.
+        """
+        raise NotImplementedError()
+
+    def appendChild(self, node: "Element") -> None:
+        # TODO: This code is not covered by the BS4 tests, and
+        # apparently not triggered by the html5lib test suite either.
+        # But it doesn't seem test-specific and there are calls to it
+        # (or a method with the same name) all over html5lib, so I'm
+        # leaving the implementation in place rather than replacing it
+        # with NotImplementedError()
+        self.soup.append(node.element)
+
+    def getDocument(self) -> "BeautifulSoup":
+        return self.soup
+
+    def testSerializer(self, element: "Element") -> str:
+        """This is only used by the html5lib unit tests. Since we
+        don't currently hook into those tests, the implementation is
+        left blank.
+        """
+        raise NotImplementedError()
+
+
+class AttrList(object):
+    """Represents a Tag's attributes in a way compatible with html5lib."""
+
+    element: Tag
+    attrs: _AttributeValues
+
+    def __init__(self, element: Tag):
+        self.element = element
+        self.attrs = dict(self.element.attrs)
+
+    def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
+        return list(self.attrs.items()).__iter__()
+
+    def __setitem__(self, name: str, value: _AttributeValue) -> None:
+        # If this attribute is a multi-valued attribute for this element,
+        # turn its value into a list.
+        list_attr = self.element.cdata_list_attributes or {}
+        if name in list_attr.get("*", []) or (
+            self.element.name in list_attr
+            and name in list_attr.get(self.element.name, [])
+        ):
+            # A node that is being cloned may have already undergone
+            # this procedure. Check for this and skip it.
+            if not isinstance(value, list):
+                assert isinstance(value, str)
+                value = self.element.attribute_value_list_class(
+                    nonwhitespace_re.findall(value)
+                )
+        self.element[name] = value
+
+    def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
+        return list(self.attrs.items())
+
+    def keys(self) -> Iterable[str]:
+        return list(self.attrs.keys())
+
+    def __len__(self) -> int:
+        return len(self.attrs)
+
+    def __getitem__(self, name: str) -> _AttributeValue:
+        return self.attrs[name]
+
+    def __contains__(self, name: str) -> bool:
+        return name in list(self.attrs.keys())
+
+
+class BeautifulSoupNode(treebuilder_base.Node):
+    element: PageElement
+    soup: "BeautifulSoup"
+    namespace: Optional[_NamespaceURL]
+
+    @property
+    def nodeType(self) -> int:
+        """Return the html5lib constant corresponding to the type of
+        the underlying DOM object.
+
+        NOTE: This property is only accessed by the html5lib test
+        suite, not by Beautiful Soup proper.
+        """
+        raise NotImplementedError()
+
+    # TODO-TYPING: typeshed stubs are incorrect about this;
+    # cloneNode returns a new Node, not None.
+    def cloneNode(self) -> treebuilder_base.Node:
+        raise NotImplementedError()
+
+
+class Element(BeautifulSoupNode):
+    element: Tag
+    namespace: Optional[_NamespaceURL]
+
+    def __init__(
+        self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
+    ):
+        treebuilder_base.Node.__init__(self, element.name)
+        self.element = element
+        self.soup = soup
+        self.namespace = namespace
+
+    def appendChild(self, node: "BeautifulSoupNode") -> None:
+        string_child: Optional[NavigableString] = None
+        child: PageElement
+        if type(node.element) is NavigableString:
+            string_child = child = node.element
+        else:
+            child = node.element
+        node.parent = self
+
+        if (
+            child is not None
+            and child.parent is not None
+            and not isinstance(child, str)
+        ):
+            node.element.extract()
+
+        if (
+            string_child is not None
+            and self.element.contents
+            and type(self.element.contents[-1]) is NavigableString
+        ):
+            # We are appending a string onto another string.
+            # TODO This has O(n^2) performance, for input like
+            # "a</a>a</a>a</a>..."
+            old_element = self.element.contents[-1]
+            new_element = self.soup.new_string(old_element + string_child)
+            old_element.replace_with(new_element)
+            self.soup._most_recent_element = new_element
+        else:
+            if isinstance(node, str):
+                # Create a brand new NavigableString from this string.
+                child = self.soup.new_string(node)
+
+            # Tell Beautiful Soup to act as if it parsed this element
+            # immediately after the parent's last descendant. (Or
+            # immediately after the parent, if it has no children.)
+            if self.element.contents:
+                most_recent_element = self.element._last_descendant(False)
+            elif self.element.next_element is not None:
+                # Something from further ahead in the parse tree is
+                # being inserted into this earlier element. This is
+                # very annoying because it means an expensive search
+                # for the last element in the tree.
+                most_recent_element = self.soup._last_descendant()
+            else:
+                most_recent_element = self.element
+
+            self.soup.object_was_parsed(
+                child, parent=self.element, most_recent_element=most_recent_element
+            )
+
+    def getAttributes(self) -> AttrList:
+        if isinstance(self.element, Comment):
+            return {}
+        return AttrList(self.element)
+
+    # An HTML5lib attribute name may either be a single string,
+    # or a tuple (namespace, name).
+    _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
+    # Now we can define the type this method accepts as a dictionary
+    # mapping those attribute names to single string values.
+    _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]
+
+    def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
+        if attributes is not None and len(attributes) > 0:
+            # Replace any namespaced attributes with
+            # NamespacedAttribute objects.
+            for name, value in list(attributes.items()):
+                if isinstance(name, tuple):
+                    new_name = NamespacedAttribute(*name)
+                    del attributes[name]
+                    attributes[new_name] = value
+
+            # We can now cast attributes to the type of Dict
+            # used by Beautiful Soup.
+            normalized_attributes = cast(_AttributeValues, attributes)
+
+            # Values for tags like 'class' came in as single strings;
+            # replace them with lists of strings as appropriate.
+            self.soup.builder._replace_cdata_list_attribute_values(
+                self.name, normalized_attributes
+            )
+
+            # Then set the attributes on the Tag associated with this
+            # BeautifulSoupNode.
+            for name, value_or_values in list(normalized_attributes.items()):
+                self.element[name] = value_or_values
+
+            # The attributes may contain variables that need substitution.
+            # Call set_up_substitutions manually.
+            #
+            # The Tag constructor called this method when the Tag was created,
+            # but we just set/changed the attributes, so call it again.
+            self.soup.builder.set_up_substitutions(self.element)
+
+    attributes = property(getAttributes, setAttributes)
+
+    def insertText(
+        self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
+    ) -> None:
+        text = TextNode(self.soup.new_string(data), self.soup)
+        if insertBefore:
+            self.insertBefore(text, insertBefore)
+        else:
+            self.appendChild(text)
+
+    def insertBefore(
+        self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
+    ) -> None:
+        index = self.element.index(refNode.element)
+        if (
+            type(node.element) is NavigableString
+            and self.element.contents
+            and type(self.element.contents[index - 1]) is NavigableString
+        ):
+            # (See comments in appendChild)
+            old_node = self.element.contents[index - 1]
+            assert type(old_node) is NavigableString
+            new_str = self.soup.new_string(old_node + node.element)
+            old_node.replace_with(new_str)
+        else:
+            self.element.insert(index, node.element)
+            node.parent = self
+
+    def removeChild(self, node: "Element") -> None:
+        node.element.extract()
+
+    def reparentChildren(self, new_parent: "Element") -> None:
+        """Move all of this tag's children into another tag."""
+        # print("MOVE", self.element.contents)
+        # print("FROM", self.element)
+        # print("TO", new_parent.element)
+
+        element = self.element
+        new_parent_element = new_parent.element
+        # Determine what this tag's next_element will be once all the children
+        # are removed.
+        final_next_element = element.next_sibling
+
+        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
+        if len(new_parent_element.contents) > 0:
+            # The new parent already contains children. We will be
+            # appending this tag's children to the end.
+
+            # We can make this assertion since we know new_parent has
+            # children.
+            assert new_parents_last_descendant is not None
+            new_parents_last_child = new_parent_element.contents[-1]
+            new_parents_last_descendant_next_element = (
+                new_parents_last_descendant.next_element
+            )
+        else:
+            # The new parent contains no children.
+            new_parents_last_child = None
+            new_parents_last_descendant_next_element = new_parent_element.next_element
+
+        to_append = element.contents
+        if len(to_append) > 0:
+            # Set the first child's previous_element and previous_sibling
+            # to elements within the new parent
+            first_child = to_append[0]
+            if new_parents_last_descendant is not None:
+                first_child.previous_element = new_parents_last_descendant
+            else:
+                first_child.previous_element = new_parent_element
+            first_child.previous_sibling = new_parents_last_child
+            if new_parents_last_descendant is not None:
+                new_parents_last_descendant.next_element = first_child
+            else:
+                new_parent_element.next_element = first_child
+            if new_parents_last_child is not None:
+                new_parents_last_child.next_sibling = first_child
+
+            # Find the very last element being moved. It is now the
+            # parent's last descendant. It has no .next_sibling and
+            # its .next_element is whatever the previous last
+            # descendant had.
+            last_childs_last_descendant = to_append[-1]._last_descendant(
+                is_initialized=False, accept_self=True
+            )
+
+            # Since we passed accept_self=True into _last_descendant,
+            # there's no possibility that the result is None.
+            assert last_childs_last_descendant is not None
+            last_childs_last_descendant.next_element = (
+                new_parents_last_descendant_next_element
+            )
+            if new_parents_last_descendant_next_element is not None:
+                # TODO-COVERAGE: This code has no test coverage and
+                # I'm not sure how to get html5lib to go through this
+                # path, but it's just the other side of the previous
+                # line.
+                new_parents_last_descendant_next_element.previous_element = (
+                    last_childs_last_descendant
+                )
+            last_childs_last_descendant.next_sibling = None
+
+        for child in to_append:
+            child.parent = new_parent_element
+            new_parent_element.contents.append(child)
+
+        # Now that this element has no children, change its .next_element.
+        element.contents = []
+        element.next_element = final_next_element
+
+        # print("DONE WITH MOVE")
+        # print("FROM", self.element)
+        # print("TO", new_parent_element)
+
+    # TODO-TYPING: typeshed stubs are incorrect about this;
+    # hasContent returns a boolean, not None.
+    def hasContent(self) -> bool:
+        return len(self.element.contents) > 0
+
+    # TODO-TYPING: typeshed stubs are incorrect about this;
+    # cloneNode returns a new Node, not None.
+    def cloneNode(self) -> treebuilder_base.Node:
+        tag = self.soup.new_tag(self.element.name, self.namespace)
+        node = Element(tag, self.soup, self.namespace)
+        for key, value in self.attributes:
+            node.attributes[key] = value
+        return node
+
+    def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
+        if self.namespace is None:
+            return namespaces["html"], self.name
+        else:
+            return self.namespace, self.name
+
+    nameTuple = property(getNameTuple)
+
+
+class TextNode(BeautifulSoupNode):
+    element: NavigableString
+
+    def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
+        treebuilder_base.Node.__init__(self, None)
+        self.element = element
+        self.soup = soup
author	S. Solomon Darnell	2025-03-28 21:52:21 -0500
committer	S. Solomon Darnell	2025-03-28 21:52:21 -0500
commit	4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree	ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py
parent	cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download	gn-ai-master.tar.gz