diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py | 594 |
1 files changed, 594 insertions, 0 deletions
# File: .venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py
# (Reconstructed from a "new file" unified diff; diff markers removed.)

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    "HTML5TreeBuilder",
]

from typing import (
    Any,
    cast,
    Dict,
    Iterable,
    Optional,
    Sequence,
    TYPE_CHECKING,
    Tuple,
    Union,
)
from typing_extensions import TypeAlias
from bs4._typing import (
    _AttributeValue,
    _AttributeValues,
    _Encoding,
    _Encodings,
    _NamespaceURL,
    _RawMarkup,
)

import warnings
from bs4.builder import (
    DetectsXMLParsedAsHTML,
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
)
from bs4.element import (
    NamespacedAttribute,
    PageElement,
    nonwhitespace_re,
)
import html5lib
from html5lib.constants import (
    namespaces,
)
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
)

if TYPE_CHECKING:
    from bs4 import BeautifulSoup

from html5lib.treebuilders import base as treebuilder_base


class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
    build a tree.

    Note that `HTML5TreeBuilder` does not support some common HTML
    `TreeBuilder` features. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    Specifically:

    * This `TreeBuilder` doesn't use different subclasses of
      `NavigableString` (e.g. `Script`) based on the name of the tag
      in which the string was found.
    * You can't use a `SoupStrainer` to parse only part of a document.
    """

    NAME: str = "html5lib"

    features: Sequence[str] = [NAME, PERMISSIVE, HTML_5, HTML]

    #: html5lib can tell us which line number and position in the
    #: original file is the source of an element.
    TRACKS_LINE_NUMBERS: bool = True

    underlying_builder: "TreeBuilderForHtml5lib"  #: :meta private:
    user_specified_encoding: Optional[_Encoding]

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Yield the single parsing strategy this builder supports.

        The markup is passed through untouched; the only work done here
        is remembering the user-specified encoding and warning about
        arguments this builder ignores.

        :param markup: The markup to be parsed.
        :param user_specified_encoding: Stored on the builder so `feed`
            can hand it to html5lib as ``override_encoding``.
        :param document_declared_encoding: Unsupported; a truthy value
            triggers a warning.
        :param exclude_encodings: Unsupported; a truthy value triggers
            a warning.
        :return: A generator yielding exactly one tuple,
            ``(markup, None, None, False)``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        for variable, name in (
            (document_declared_encoding, "document_declared_encoding"),
            (exclude_encodings, "exclude_encodings"),
        ):
            if variable:
                warnings.warn(
                    f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
                    stacklevel=3,
                )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup: _RawMarkup) -> None:
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.

        :param markup: The markup to parse. When it is not a `str`, the
            stored user-specified encoding is passed to html5lib as
            ``override_encoding``.
        """
        if self.soup is not None and self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4,
            )

        # self.underlying_builder is probably None now, but it'll be set
        # when html5lib calls self.create_treebuilder().
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        assert self.underlying_builder is not None
        self.underlying_builder.parser = parser
        extra_kwargs: Dict[str, Any] = {}
        if not isinstance(markup, str):
            # kwargs, specifically override_encoding, will eventually
            # be passed in to html5lib's
            # HTMLBinaryInputStream.__init__.
            extra_kwargs["override_encoding"] = self.user_specified_encoding

        doc = parser.parse(markup, **extra_kwargs)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, str):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            original_encoding = parser.tokenizer.stream.charEncoding[0]
            # The encoding is an html5lib Encoding object. We want to
            # use a string for compatibility with other tree builders.
            original_encoding = original_encoding.name
            doc.original_encoding = original_encoding
        self.underlying_builder.parser = None

    def create_treebuilder(
        self, namespaceHTMLElements: bool
    ) -> "TreeBuilderForHtml5lib":
        """Called by html5lib to instantiate the kind of class it
        calls a 'TreeBuilder'.

        :param namespaceHTMLElements: Whether or not to namespace HTML elements.

        :meta private:
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><head></head><body>%s</body></html>" % fragment


class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """The html5lib-side tree builder, which records what html5lib
    parses into a `BeautifulSoup` object.
    """

    soup: "BeautifulSoup"  #: :meta private:
    parser: Optional[html5lib.HTMLParser]  #: :meta private:

    def __init__(
        self,
        namespaceHTMLElements: bool,
        soup: Optional["BeautifulSoup"] = None,
        store_line_numbers: bool = True,
        **kwargs: Any,
    ):
        """Set up the builder.

        :param namespaceHTMLElements: Passed through to the html5lib
            base `TreeBuilder` constructor.
        :param soup: The `BeautifulSoup` object to populate. Omitting
            it is deprecated; a fresh empty object is created instead.
        :param store_line_numbers: Whether `elementClass` should record
            the source line and position on each new tag.
        :param kwargs: Only used when ``soup`` is omitted, in which
            case they are passed to the `BeautifulSoup` constructor.
        """
        if soup:
            self.soup = soup
        else:
            warnings.warn(
                "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
                DeprecationWarning,
                stacklevel=2,
            )
            from bs4 import BeautifulSoup

            # TODO: Why is the parser 'html.parser' here? Using
            # html5lib doesn't cause an infinite loop and is more
            # accurate. Best to get rid of this entire section, I think.
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
            )
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super().__init__(namespaceHTMLElements)

        # This will be set later to a real html5lib HTMLParser object,
        # which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self) -> "Element":
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token: Dict[str, Any]) -> None:
        """Build a `Doctype` from an html5lib doctype token and record
        it in the soup.

        :param token: A mapping that has "name", "publicId" and
            "systemId" keys.
        """
        name: str = cast(str, token["name"])
        publicId: Optional[str] = cast(Optional[str], token["publicId"])
        systemId: Optional[str] = cast(Optional[str], token["systemId"])

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name: str, namespace: str) -> "Element":
        """Create a new tag and return it wrapped in an `Element`.

        :param name: The tag name.
        :param namespace: The tag's namespace.
        """
        sourceline: Optional[int] = None
        sourcepos: Optional[int] = None
        if self.parser is not None and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            assert sourcepos is not None
            sourcepos = sourcepos - 1
        tag = self.soup.new_tag(
            name, namespace, sourceline=sourceline, sourcepos=sourcepos
        )

        return Element(tag, self.soup, namespace)

    def commentClass(self, data: str) -> "TextNode":
        """Wrap ``data`` in a `Comment` and return it as a `TextNode`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self) -> "Element":
        """This is only used by html5lib HTMLParser.parseFragment(),
        which is never used by Beautiful Soup, only by the html5lib
        unit tests. Since we don't currently hook into those tests,
        the implementation is left blank.
        """
        raise NotImplementedError()

    def getFragment(self) -> "Element":
        """This is only used by the html5lib unit tests. Since we
        don't currently hook into those tests, the implementation is
        left blank.
        """
        raise NotImplementedError()

    def appendChild(self, node: "Element") -> None:
        """Append the given node's underlying element to the soup."""
        # TODO: This code is not covered by the BS4 tests, and
        # apparently not triggered by the html5lib test suite either.
        # But it doesn't seem test-specific and there are calls to it
        # (or a method with the same name) all over html5lib, so I'm
        # leaving the implementation in place rather than replacing it
        # with NotImplementedError()
        self.soup.append(node.element)

    def getDocument(self) -> "BeautifulSoup":
        """Return the `BeautifulSoup` object being populated."""
        return self.soup

    def testSerializer(self, element: "Element") -> str:
        """This is only used by the html5lib unit tests. Since we
        don't currently hook into those tests, the implementation is
        left blank.
        """
        raise NotImplementedError()


class AttrList:
    """Represents a Tag's attributes in a way compatible with html5lib.

    Reads are served from a copy of the tag's attributes taken at
    construction time; writes go to the tag itself.
    """

    element: Tag
    attrs: _AttributeValues

    def __init__(self, element: Tag):
        self.element = element
        # A copy, so later writes through __setitem__ (which go to the
        # tag) are not reflected in this object's reads.
        self.attrs = dict(self.element.attrs)

    def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Iterate over ``(name, value)`` pairs (not just names)."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name: str, value: _AttributeValue) -> None:
        """Set an attribute on the underlying tag, splitting the value
        of a multi-valued attribute into a list first.
        """
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        if name in list_attr.get("*", []) or (
            self.element.name in list_attr
            and name in list_attr.get(self.element.name, [])
        ):
            # A node that is being cloned may have already undergone
            # this procedure. Check for this and skip it.
            if not isinstance(value, list):
                assert isinstance(value, str)
                value = self.element.attribute_value_list_class(
                    nonwhitespace_re.findall(value)
                )
        self.element[name] = value

    def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Return a list of ``(name, value)`` pairs."""
        return list(self.attrs.items())

    def keys(self) -> Iterable[str]:
        """Return a list of attribute names."""
        return list(self.attrs.keys())

    def __len__(self) -> int:
        return len(self.attrs)

    def __getitem__(self, name: str) -> _AttributeValue:
        return self.attrs[name]

    def __contains__(self, name: str) -> bool:
        # Dict membership; no need to build a list of keys first.
        return name in self.attrs


class BeautifulSoupNode(treebuilder_base.Node):
    """Common base for the html5lib node wrappers around Beautiful
    Soup objects.
    """

    element: PageElement
    soup: "BeautifulSoup"
    namespace: Optional[_NamespaceURL]

    @property
    def nodeType(self) -> int:
        """Return the html5lib constant corresponding to the type of
        the underlying DOM object.

        NOTE: This property is only accessed by the html5lib test
        suite, not by Beautiful Soup proper.
        """
        raise NotImplementedError()

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:
        raise NotImplementedError()


class Element(BeautifulSoupNode):
    """An html5lib node wrapping a Beautiful Soup `Tag`."""

    element: Tag
    namespace: Optional[_NamespaceURL]

    def __init__(
        self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
    ):
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node: "BeautifulSoupNode") -> None:
        """Append a node's element as the last child of this tag.

        A plain `NavigableString` appended right after another plain
        `NavigableString` is merged into it rather than added as a
        separate child.
        """
        string_child: Optional[NavigableString] = None
        child: PageElement
        if type(node.element) is NavigableString:
            string_child = child = node.element
        else:
            child = node.element
        node.parent = self

        if (
            child is not None
            and child.parent is not None
            and not isinstance(child, str)
        ):
            node.element.extract()

        if (
            string_child is not None
            and self.element.contents
            and type(self.element.contents[-1]) is NavigableString
        ):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element, most_recent_element=most_recent_element
            )

    def getAttributes(self) -> AttrList:
        """Return this tag's attributes as an `AttrList` (or an empty
        dict when the wrapped element is a `Comment`).
        """
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    # An HTML5lib attribute name may either be a single string,
    # or a tuple (namespace, name).
    _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
    # Now we can define the type this method accepts as a dictionary
    # mapping those attribute names to single string values.
    _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]

    def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
        """Copy html5lib's attributes onto the wrapped tag.

        Does nothing when ``attributes`` is None or empty. Note that
        the passed-in dictionary is modified in place: tuple keys are
        replaced with `NamespacedAttribute` keys.
        """
        if attributes is not None and len(attributes) > 0:
            # Replace any namespaced attributes with
            # NamespacedAttribute objects.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            # We can now cast attributes to the type of Dict
            # used by Beautiful Soup.
            normalized_attributes = cast(_AttributeValues, attributes)

            # Values for tags like 'class' came in as single strings;
            # replace them with lists of strings as appropriate.
            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, normalized_attributes
            )

            # Then set the attributes on the Tag associated with this
            # BeautifulSoupNode.
            for name, value_or_values in list(normalized_attributes.items()):
                self.element[name] = value_or_values

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)

    attributes = property(getAttributes, setAttributes)

    def insertText(
        self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
    ) -> None:
        """Insert ``data`` as a text node, either before the given
        node or (by default) at the end of this tag.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(
        self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
    ) -> None:
        """Insert ``node`` immediately before ``refNode`` in this tag.

        A plain `NavigableString` inserted right after another plain
        `NavigableString` is merged into that preceding string rather
        than added as a separate child.
        """
        index = self.element.index(refNode.element)
        # Only look at the preceding sibling when there is one. At
        # index 0, contents[index - 1] would wrap around to the *last*
        # child and the text could be merged into the wrong string.
        old_node = self.element.contents[index - 1] if index > 0 else None
        if (
            type(node.element) is NavigableString
            and type(old_node) is NavigableString
        ):
            # (See comments in appendChild)
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node: "Element") -> None:
        """Detach the given node's element from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent: "Element") -> None:
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.

            # We can make this assertion since we know new_parent has
            # children.
            assert new_parents_last_descendant is not None
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = (
                new_parents_last_descendant.next_element
            )
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(
                is_initialized=False, accept_self=True
            )

            # Since we passed accept_self=True into _last_descendant,
            # there's no possibility that the result is None.
            assert last_childs_last_descendant is not None
            last_childs_last_descendant.next_element = (
                new_parents_last_descendant_next_element
            )
            if new_parents_last_descendant_next_element is not None:
                # TODO-COVERAGE: This code has no test coverage and
                # I'm not sure how to get html5lib to go through this
                # path, but it's just the other side of the previous
                # line.
                new_parents_last_descendant_next_element.previous_element = (
                    last_childs_last_descendant
                )
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # hasContent returns a boolean, not None.
    def hasContent(self) -> bool:
        """Return True if the wrapped tag has at least one child."""
        return len(self.element.contents) > 0

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:
        """Return a new `Element` with the same name, namespace and
        attributes as this one, but no children.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
        """Return ``(namespace, name)``, defaulting the namespace to
        html5lib's HTML namespace when none was set.
        """
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)


class TextNode(BeautifulSoupNode):
    """An html5lib node wrapping a Beautiful Soup `NavigableString`."""

    element: NavigableString

    def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
        treebuilder_base.Node.__init__(self, None)
        self.element = element
        self.soup = soup