Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/builder')
4 files changed, 2406 insertions, 0 deletions
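These four files vendor Beautiful Soup's builder package: a feature-keyed registry of TreeBuilder classes (builder/__init__.py) plus the concrete builders for html5lib, html.parser, and lxml. As a reader's aid, here is a minimal usage sketch (illustrative, not part of the commit) of how the registry defined below is typically exercised; it assumes a stock bs4 install, where the "fast" lookup returns None unless lxml is present:

    from bs4 import BeautifulSoup
    from bs4.builder import builder_registry

    # Feature strings such as "html", "fast", "permissive" map to builder classes.
    print(builder_registry.lookup("html"))          # most recently registered HTML builder
    print(builder_registry.lookup("html", "fast"))  # lxml's HTML builder, or None without lxml

    # BeautifulSoup runs the same lookup internally when given a parser name:
    soup = BeautifulSoup("<p class='a b'>hi</p>", "html.parser")
    print(soup.p["class"])  # ['a', 'b'] -- 'class' is a multi-valued "CDATA list" attribute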
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py b/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py
new file mode 100644
index 00000000..5f2b38de
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py
@@ -0,0 +1,848 @@
+from __future__ import annotations
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+from collections import defaultdict
+import re
+from types import ModuleType
+from typing import (
+    Any,
+    cast,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Pattern,
+    Set,
+    Tuple,
+    Type,
+    TYPE_CHECKING,
+)
+import warnings
+import sys
+from bs4.element import (
+    AttributeDict,
+    AttributeValueList,
+    CharsetMetaAttributeValue,
+    ContentMetaAttributeValue,
+    RubyParenthesisString,
+    RubyTextString,
+    Stylesheet,
+    Script,
+    TemplateString,
+    nonwhitespace_re,
+)
+
+# Exceptions were moved to their own module in 4.13. Import here for
+# backwards compatibility.
+from bs4.exceptions import ParserRejectedMarkup
+
+from bs4._typing import (
+    _AttributeValues,
+    _RawAttributeValue,
+)
+
+from bs4._warnings import XMLParsedAsHTMLWarning
+
+if TYPE_CHECKING:
+    from bs4 import BeautifulSoup
+    from bs4.element import (
+        NavigableString,
+        Tag,
+    )
+    from bs4._typing import (
+        _AttributeValue,
+        _Encoding,
+        _Encodings,
+        _RawOrProcessedAttributeValues,
+        _RawMarkup,
+    )
+
+# Some useful features for a TreeBuilder to have.
+FAST = "fast"
+PERMISSIVE = "permissive"
+STRICT = "strict"
+XML = "xml"
+HTML = "html"
+HTML_5 = "html5"
+
+__all__ = [
+    "DetectsXMLParsedAsHTML",
+    "HTMLTreeBuilder",
+    "SAXTreeBuilder",
+    "TreeBuilder",
+    "TreeBuilderRegistry",
+    "ParserRejectedMarkup",  # backwards compatibility only as of 4.13.0
+]
+
+
+class TreeBuilderRegistry(object):
+    """A way of looking up TreeBuilder subclasses by their name or by desired
+    features.
+    """
+
+    builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
+    builders: List[Type[TreeBuilder]]
+
+    def __init__(self) -> None:
+        self.builders_for_feature = defaultdict(list)
+        self.builders = []
+
+    def register(self, treebuilder_class: type[TreeBuilder]) -> None:
+        """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of `TreeBuilder`. Its
+            `TreeBuilder.features` attribute should list its features.
+        """
+        for feature in treebuilder_class.features:
+            self.builders_for_feature[feature].insert(0, treebuilder_class)
+        self.builders.insert(0, treebuilder_class)
+
+    def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+            provided, the most recently registered TreeBuilder subclass
+            will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+            registered subclass with all the requested features.
+        """
+        if len(self.builders) == 0:
+            # There are no builders at all.
+            return None
+
+        if len(features) == 0:
+            # They didn't ask for any features. Give them the most
+            # recently registered builder.
+            return self.builders[0]
+
+        # Go down the list of features in order, and eliminate any builders
+        # that don't match every feature.
+        feature_list = list(features)
+        feature_list.reverse()
+        candidates = None
+        candidate_set = None
+        while len(feature_list) > 0:
+            feature = feature_list.pop()
+            we_have_the_feature = self.builders_for_feature.get(feature, [])
+            if len(we_have_the_feature) > 0:
+                if candidates is None:
+                    candidates = we_have_the_feature
+                    candidate_set = set(candidates)
+                else:
+                    # Eliminate any candidates that don't have this feature.
+                    candidate_set = candidate_set.intersection(set(we_have_the_feature))
+
+        # The only valid candidates are the ones in candidate_set.
+        # Go through the original list of candidates and pick the first one
+        # that's in candidate_set.
+        if candidate_set is None or candidates is None:
+            return None
+        for candidate in candidates:
+            if candidate in candidate_set:
+                return candidate
+        return None
+
+
+#: The `BeautifulSoup` constructor will take a list of features
+#: and use it to look up `TreeBuilder` classes in this registry.
+builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()
+
+
+class TreeBuilder(object):
+    """Turn a textual document into a Beautiful Soup object tree.
+
+    This is an abstract superclass which smooths out the behavior of
+    different parser libraries into a single, unified interface.
+
+    :param multi_valued_attributes: If this is set to None, the
+        TreeBuilder will not turn any values for attributes like
+        'class' into lists. Setting this to a dictionary will
+        customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
+        for an example.
+
+        Internally, these are called "CDATA list attributes", but that
+        probably doesn't make sense to an end-user, so the argument name
+        is ``multi_valued_attributes``.
+
+    :param preserve_whitespace_tags: A set of tags to treat
+        the way <pre> tags are treated in HTML. Tags in this set
+        are immune from pretty-printing; their contents will always be
+        output as-is.
+
+    :param string_containers: A dictionary mapping tag names to
+        the classes that should be instantiated to contain the textual
+        contents of those tags. The default is to use NavigableString
+        for every tag, no matter what the name. You can override the
+        default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.
+
+    :param store_line_numbers: If the parser keeps track of the line
+        numbers and positions of the original markup, that information
+        will, by default, be stored in each corresponding
+        :py:class:`bs4.element.Tag` object. You can turn this off by
+        passing store_line_numbers=False; then Tag.sourcepos and
+        Tag.sourceline will always be None. If the parser you're using
+        doesn't keep track of this information, then store_line_numbers
+        is irrelevant.
+
+    :param attribute_value_list_class: The value of a multi-valued attribute
+        (such as HTML's 'class') will be stored in an instance of this
+        class. The default is Beautiful Soup's built-in
+        `AttributeValueList`, which is a normal Python list, and you
+        will probably never need to change it.
+ """ + + USE_DEFAULT: Any = object() #: :meta private: + + def __init__( + self, + multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT, + preserve_whitespace_tags: Set[str] = USE_DEFAULT, + store_line_numbers: bool = USE_DEFAULT, + string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT, + empty_element_tags: Set[str] = USE_DEFAULT, + attribute_dict_class: Type[AttributeDict] = AttributeDict, + attribute_value_list_class: Type[AttributeValueList] = AttributeValueList, + ): + self.soup = None + if multi_valued_attributes is self.USE_DEFAULT: + multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = multi_valued_attributes + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + if empty_element_tags is self.USE_DEFAULT: + self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS + else: + self.empty_element_tags = empty_element_tags + # TODO: store_line_numbers is probably irrelevant now that + # the behavior of sourceline and sourcepos has been made consistent + # everywhere. + if store_line_numbers == self.USE_DEFAULT: + store_line_numbers = self.TRACKS_LINE_NUMBERS + self.store_line_numbers = store_line_numbers + if string_containers == self.USE_DEFAULT: + string_containers = self.DEFAULT_STRING_CONTAINERS + self.string_containers = string_containers + self.attribute_dict_class = attribute_dict_class + self.attribute_value_list_class = attribute_value_list_class + + NAME: str = "[Unknown tree builder]" + ALTERNATE_NAMES: Iterable[str] = [] + features: Iterable[str] = [] + + is_xml: bool = False + picklable: bool = False + + soup: Optional[BeautifulSoup] #: :meta private: + + #: A tag will be considered an empty-element + #: tag when and only when it has no contents. + empty_element_tags: Optional[Set[str]] = None #: :meta private: + cdata_list_attributes: Dict[str, Set[str]] #: :meta private: + preserve_whitespace_tags: Set[str] #: :meta private: + string_containers: Dict[str, Type[NavigableString]] #: :meta private: + tracks_line_numbers: bool #: :meta private: + + #: A value for these tag/attribute combinations is a space- or + #: comma-separated list of CDATA, rather than a single CDATA. + DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set) + + #: Whitespace should be preserved inside these tags. + DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set() + + #: The textual contents of tags with these names should be + #: instantiated with some class other than `bs4.element.NavigableString`. + DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {} + + #: By default, tags are treated as empty-element tags if they have + #: no contents--that is, using XML rules. HTMLTreeBuilder + #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the + #: HTML 4 and HTML5 standards. + DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None + + #: Most parsers don't keep track of line numbers. + TRACKS_LINE_NUMBERS: bool = False + + def initialize_soup(self, soup: BeautifulSoup) -> None: + """The BeautifulSoup object has been initialized and is now + being associated with the TreeBuilder. + + :param soup: A BeautifulSoup object. + """ + self.soup = soup + + def reset(self) -> None: + """Do any work necessary to reset the underlying parser + for a new document. + + By default, this does nothing. 
+ """ + pass + + def can_be_empty_element(self, tag_name: str) -> bool: + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a <p> tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty <p> tag + will be presented as "<p></p>", not "<p/>" or "<p>". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no children. + "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will + be left alone. + + :param tag_name: The name of a markup tag. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup: _RawMarkup) -> None: + """Run incoming markup through some parsing process.""" + raise NotImplementedError() + + def prepare_markup( + self, + markup: _RawMarkup, + user_specified_encoding: Optional[_Encoding] = None, + document_declared_encoding: Optional[_Encoding] = None, + exclude_encodings: Optional[_Encodings] = None, + ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]: + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: The markup that's about to be parsed. + :param user_specified_encoding: The user asked to try this encoding + to convert the markup into a Unicode string. + :param document_declared_encoding: The markup itself claims to be + in this encoding. NOTE: This argument is not used by the + calling code and can probably be removed. + :param exclude_encodings: The user asked *not* to try any of + these encodings. + + :yield: A series of 4-tuples: (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy that the parser can try + to convert the document to Unicode and parse it. Each + strategy will be tried in turn. + + By default, the only strategy is to parse the markup + as-is. See `LXMLTreeBuilderForXML` and + `HTMLParserTreeBuilder` for implementations that take into + account the quirks of particular parsers. + + :meta private: + + """ + yield markup, None, None, False + + def test_fragment_to_document(self, fragment: str) -> str: + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty <head> tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of unit tests. + + :param fragment: A fragment of HTML. + :return: A full HTML document. + :meta private: + """ + return fragment + + def set_up_substitutions(self, tag: Tag) -> bool: + """Set up any substitutions that will need to be performed on + a `Tag` when it's output as a string. + + By default, this does nothing. See `HTMLTreeBuilder` for a + case where this is used. + + :return: Whether or not a substitution was performed. + :meta private: + """ + return False + + def _replace_cdata_list_attribute_values( + self, tag_name: str, attrs: _RawOrProcessedAttributeValues + ) -> _AttributeValues: + """When an attribute value is associated with a tag that can + have multiple values for that attribute, convert the string + value to a list of strings. 
+ + Basically, replaces class="foo bar" with class=["foo", "bar"] + + NOTE: This method modifies its input in place. + + :param tag_name: The name of a tag. + :param attrs: A dictionary containing the tag's attributes. + Any appropriate attribute values will be modified in place. + :return: The modified dictionary that was originally passed in. + """ + + # First, cast the attrs dict to _AttributeValues. This might + # not be accurate yet, but it will be by the time this method + # returns. + modified_attrs = cast(_AttributeValues, attrs) + if not modified_attrs or not self.cdata_list_attributes: + # Nothing to do. + return modified_attrs + + # There is at least a possibility that we need to modify one of + # the attribute values. + universal: Set[str] = self.cdata_list_attributes.get("*", set()) + tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None) + for attr in list(modified_attrs.keys()): + modified_value: _AttributeValue + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + original_value: _AttributeValue = modified_attrs[attr] + if isinstance(original_value, _RawAttributeValue): + # This is a _RawAttributeValue (a string) that + # needs to be split and converted to a + # AttributeValueList so it can be an + # _AttributeValue. + modified_value = self.attribute_value_list_class( + nonwhitespace_re.findall(original_value) + ) + else: + # html5lib calls setAttributes twice for the + # same tag when rearranging the parse tree. On + # the second call the attribute value here is + # already a list. This can also happen when a + # Tag object is cloned. If this happens, leave + # the value alone rather than trying to split + # it again. + modified_value = original_value + modified_attrs[attr] = modified_value + return modified_attrs + + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events. + + This is not currently used for anything, and it will be removed + soon. It was a good idea, but it wasn't properly integrated into the + rest of Beautiful Soup, so there have been long stretches where it + hasn't worked properly. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + warnings.warn( + "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.", + DeprecationWarning, + stacklevel=2, + ) + super(SAXTreeBuilder, self).__init__(*args, **kwargs) + + def feed(self, markup: _RawMarkup) -> None: + raise NotImplementedError() + + def close(self) -> None: + pass + + def startElement(self, name: str, attrs: Dict[str, str]) -> None: + attrs = AttributeDict((key[1], value) for key, value in list(attrs.items())) + # print("Start %s, %r" % (name, attrs)) + assert self.soup is not None + self.soup.handle_starttag(name, None, None, attrs) + + def endElement(self, name: str) -> None: + # print("End %s" % name) + assert self.soup is not None + self.soup.handle_endtag(name) + + def startElementNS( + self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str] + ) -> None: + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None: + # Throw away (ns, nodeName) for now. 
+ self.endElement(nodeName) + # handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix: str, nodeValue: str) -> None: + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix: str) -> None: + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content: str) -> None: + assert self.soup is not None + self.soup.handle_data(content) + + def startDocument(self) -> None: + pass + + def endDocument(self) -> None: + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML, such as which tags are treated + specially by the HTML standard. + """ + + #: Some HTML tags are defined as having no contents. Beautiful Soup + #: treats these specially. + DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = set( + [ + # These are from HTML5. + "area", + "base", + "br", + "col", + "embed", + "hr", + "img", + "input", + "keygen", + "link", + "menuitem", + "meta", + "param", + "source", + "track", + "wbr", + # These are from earlier versions of HTML and are removed in HTML5. + "basefont", + "bgsound", + "command", + "frame", + "image", + "isindex", + "nextid", + "spacer", + ] + ) + + #: The HTML standard defines these tags as block-level elements. Beautiful + #: Soup does not treat these elements differently from other elements, + #: but it may do so eventually, and this information is available if + #: you need to use it. + DEFAULT_BLOCK_ELEMENTS: Set[str] = set( + [ + "address", + "article", + "aside", + "blockquote", + "canvas", + "dd", + "div", + "dl", + "dt", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "header", + "hr", + "li", + "main", + "nav", + "noscript", + "ol", + "output", + "p", + "pre", + "section", + "table", + "tfoot", + "ul", + "video", + ] + ) + + #: These HTML tags need special treatment so they can be + #: represented by a string class other than `bs4.element.NavigableString`. + #: + #: For some of these tags, it's because the HTML standard defines + #: an unusual content model for them. I made this list by going + #: through the HTML spec + #: (https://html.spec.whatwg.org/#metadata-content) and looking for + #: "metadata content" elements that can contain strings. + #: + #: The Ruby tags (<rt> and <rp>) are here despite being normal + #: "phrasing content" tags, because the content they contain is + #: qualitatively different from other text in the document, and it + #: can be useful to be able to distinguish it. + #: + #: TODO: Arguably <noscript> could go here but it seems + #: qualitatively different from the other tags. + DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = { + "rt": RubyTextString, + "rp": RubyParenthesisString, + "style": Stylesheet, + "script": Script, + "template": TemplateString, + } + + #: The HTML standard defines these attributes as containing a + #: space-separated list of values, not a single value. That is, + #: class="foo bar" means that the 'class' attribute has two values, + #: 'foo' and 'bar', not the single value 'foo bar'. When we + #: encounter one of these attributes, we will parse its value into + #: a list of values if possible. Upon output, the list will be + #: converted back into a string. 
+ DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = { + "*": {"class", "accesskey", "dropzone"}, + "a": {"rel", "rev"}, + "link": {"rel", "rev"}, + "td": {"headers"}, + "th": {"headers"}, + "form": {"accept-charset"}, + "object": {"archive"}, + # These are HTML5 specific, as are *.accesskey and *.dropzone above. + "area": {"rel"}, + "icon": {"sizes"}, + "iframe": {"sandbox"}, + "output": {"for"}, + } + + #: By default, whitespace inside these HTML tags will be + #: preserved rather than being collapsed. + DEFAULT_PRESERVE_WHITESPACE_TAGS: set[str] = set(["pre", "textarea"]) + + def set_up_substitutions(self, tag: Tag) -> bool: + """Replace the declared encoding in a <meta> tag with a placeholder, + to be substituted when the tag is output to a string. + + An HTML document may come in to Beautiful Soup as one + encoding, but exit in a different encoding, and the <meta> tag + needs to be changed to reflect this. + + :return: Whether or not a substitution was performed. + + :meta private: + """ + # We are only interested in <meta> tags + if tag.name != "meta": + return False + + # TODO: This cast will fail in the (very unlikely) scenario + # that the programmer who instantiates the TreeBuilder + # specifies meta['content'] or meta['charset'] as + # cdata_list_attributes. + content: Optional[str] = cast(Optional[str], tag.get("content")) + charset: Optional[str] = cast(Optional[str], tag.get("charset")) + + # But we can accommodate meta['http-equiv'] being made a + # cdata_list_attribute (again, very unlikely) without much + # trouble. + http_equiv: List[str] = tag.get_attribute_list("http-equiv") + + # We are interested in <meta> tags that say what encoding the + # document was originally in. This means HTML 5-style <meta> + # tags that provide the "charset" attribute. It also means + # HTML 4-style <meta> tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. + substituted = False + if charset is not None: + # HTML 5 style: + # <meta charset="utf8"> + tag["charset"] = CharsetMetaAttributeValue(charset) + substituted = True + + elif content is not None and any( + x.lower() == "content-type" for x in http_equiv + ): + # HTML 4 style: + # <meta http-equiv="content-type" content="text/html; charset=utf8"> + tag["content"] = ContentMetaAttributeValue(content) + substituted = True + + return substituted + + +class DetectsXMLParsedAsHTML(object): + """A mixin class for any class (a TreeBuilder, or some class used by a + TreeBuilder) that's in a position to detect whether an XML + document is being incorrectly parsed as HTML, and issue an + appropriate warning. + + This requires being able to observe an incoming processing + instruction that might be an XML declaration, and also able to + observe tags as they're opened. If you can't do that for a given + `TreeBuilder`, there's a less reliable implementation based on + examining the raw markup. + """ + + #: Regular expression for seeing if string markup has an <html> tag. + LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I) + + #: Regular expression for seeing if byte markup has an <html> tag. + LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I) + + #: The start of an XML document string. + XML_PREFIX: str = "<?xml" + + #: The start of an XML document bytestring. 
+ XML_PREFIX_B: bytes = b"<?xml" + + # This is typed as str, not `ProcessingInstruction`, because this + # check may be run before any Beautiful Soup objects are created. + _first_processing_instruction: Optional[str] #: :meta private: + _root_tag_name: Optional[str] #: :meta private: + + @classmethod + def warn_if_markup_looks_like_xml( + cls, markup: Optional[_RawMarkup], stacklevel: int = 3 + ) -> bool: + """Perform a check on some markup to see if it looks like XML + that's not XHTML. If so, issue a warning. + + This is much less reliable than doing the check while parsing, + but some of the tree builders can't do that. + + :param stacklevel: The stacklevel of the code calling this\ + function. + + :return: True if the markup looks like non-XHTML XML, False + otherwise. + """ + if markup is None: + return False + markup = markup[:500] + if isinstance(markup, bytes): + markup_b: bytes = markup + looks_like_xml = markup_b.startswith( + cls.XML_PREFIX_B + ) and not cls.LOOKS_LIKE_HTML_B.search(markup) + else: + markup_s: str = markup + looks_like_xml = markup_s.startswith( + cls.XML_PREFIX + ) and not cls.LOOKS_LIKE_HTML.search(markup) + + if looks_like_xml: + cls._warn(stacklevel=stacklevel + 2) + return True + return False + + @classmethod + def _warn(cls, stacklevel: int = 5) -> None: + """Issue a warning about XML being parsed as HTML.""" + warnings.warn( + XMLParsedAsHTMLWarning.MESSAGE, + XMLParsedAsHTMLWarning, + stacklevel=stacklevel, + ) + + def _initialize_xml_detector(self) -> None: + """Call this method before parsing a document.""" + self._first_processing_instruction = None + self._root_tag_name = None + + def _document_might_be_xml(self, processing_instruction: str) -> None: + """Call this method when encountering an XML declaration, or a + "processing instruction" that might be an XML declaration. + + This helps Beautiful Soup detect potential issues later, if + the XML document turns out to be a non-XHTML document that's + being parsed as XML. + """ + if ( + self._first_processing_instruction is not None + or self._root_tag_name is not None + ): + # The document has already started. Don't bother checking + # anymore. + return + + self._first_processing_instruction = processing_instruction + + # We won't know until we encounter the first tag whether or + # not this is actually a problem. + + def _root_tag_encountered(self, name: str) -> None: + """Call this when you encounter the document's root tag. + + This is where we actually check whether an XML document is + being incorrectly parsed as HTML, and issue the warning. + """ + if self._root_tag_name is not None: + # This method was incorrectly called multiple times. Do + # nothing. + return + + self._root_tag_name = name + + if ( + name != "html" + and self._first_processing_instruction is not None + and self._first_processing_instruction.lower().startswith("xml ") + ): + # We encountered an XML declaration and then a tag other + # than 'html'. This is a reliable indicator that a + # non-XHTML document is being parsed as XML. + self._warn(stacklevel=10) + + +def register_treebuilders_from(module: ModuleType) -> None: + """Copy TreeBuilders from the given module into this module.""" + this_module = sys.modules[__name__] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. 
+ this_module.builder_registry.register(obj) + + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last resort. +from . import _htmlparser # noqa: E402 + +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. + pass diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py b/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py new file mode 100644 index 00000000..c13439d0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/builder/_html5lib.py @@ -0,0 +1,594 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + "HTML5TreeBuilder", +] + +from typing import ( + Any, + cast, + Dict, + Iterable, + Optional, + Sequence, + TYPE_CHECKING, + Tuple, + Union, +) +from typing_extensions import TypeAlias +from bs4._typing import ( + _AttributeValue, + _AttributeValues, + _Encoding, + _Encodings, + _NamespaceURL, + _RawMarkup, +) + +import warnings +from bs4.builder import ( + DetectsXMLParsedAsHTML, + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, +) +from bs4.element import ( + NamespacedAttribute, + PageElement, + nonwhitespace_re, +) +import html5lib +from html5lib.constants import ( + namespaces, +) +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, +) + +if TYPE_CHECKING: + from bs4 import BeautifulSoup + +from html5lib.treebuilders import base as treebuilder_base + + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to + build a tree. + + Note that `HTML5TreeBuilder` does not support some common HTML + `TreeBuilder` features. Some of these features could theoretically + be implemented, but at the very least it's quite difficult, + because html5lib moves the parse tree around as it's being built. + + Specifically: + + * This `TreeBuilder` doesn't use different subclasses of + `NavigableString` (e.g. `Script`) based on the name of the tag + in which the string was found. + * You can't use a `SoupStrainer` to parse only part of a document. + """ + + NAME: str = "html5lib" + + features: Sequence[str] = [NAME, PERMISSIVE, HTML_5, HTML] + + #: html5lib can tell us which line number and position in the + #: original file is the source of an element. + TRACKS_LINE_NUMBERS: bool = True + + underlying_builder: "TreeBuilderForHtml5lib" #: :meta private: + user_specified_encoding: Optional[_Encoding] + + def prepare_markup( + self, + markup: _RawMarkup, + user_specified_encoding: Optional[_Encoding] = None, + document_declared_encoding: Optional[_Encoding] = None, + exclude_encodings: Optional[_Encodings] = None, + ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]: + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. 
+ for variable, name in ( + (document_declared_encoding, "document_declared_encoding"), + (exclude_encodings, "exclude_encodings"), + ): + if variable: + warnings.warn( + f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.", + stacklevel=3, + ) + + # html5lib only parses HTML, so if it's given XML that's worth + # noting. + DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3) + + yield (markup, None, None, False) + + # These methods are defined by Beautiful Soup. + def feed(self, markup: _RawMarkup) -> None: + """Run some incoming markup through some parsing process, + populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`. + """ + if self.soup is not None and self.soup.parse_only is not None: + warnings.warn( + "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.", + stacklevel=4, + ) + + # self.underlying_builder is probably None now, but it'll be set + # when html5lib calls self.create_treebuilder(). + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + assert self.underlying_builder is not None + self.underlying_builder.parser = parser + extra_kwargs = dict() + if not isinstance(markup, str): + # kwargs, specifically override_encoding, will eventually + # be passed in to html5lib's + # HTMLBinaryInputStream.__init__. + extra_kwargs["override_encoding"] = self.user_specified_encoding + + doc = parser.parse(markup, **extra_kwargs) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, str): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + original_encoding = parser.tokenizer.stream.charEncoding[0] + # The encoding is an html5lib Encoding object. We want to + # use a string for compatibility with other tree builders. + original_encoding = original_encoding.name + doc.original_encoding = original_encoding + self.underlying_builder.parser = None + + def create_treebuilder( + self, namespaceHTMLElements: bool + ) -> "TreeBuilderForHtml5lib": + """Called by html5lib to instantiate the kind of class it + calls a 'TreeBuilder'. + + :param namespaceHTMLElements: Whether or not to namespace HTML elements. + + :meta private: + """ + self.underlying_builder = TreeBuilderForHtml5lib( + namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers + ) + return self.underlying_builder + + def test_fragment_to_document(self, fragment: str) -> str: + """See `TreeBuilder`.""" + return "<html><head></head><body>%s</body></html>" % fragment + + +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + soup: "BeautifulSoup" #: :meta private: + parser: Optional[html5lib.HTMLParser] #: :meta private: + + def __init__( + self, + namespaceHTMLElements: bool, + soup: Optional["BeautifulSoup"] = None, + store_line_numbers: bool = True, + **kwargs: Any, + ): + if soup: + self.soup = soup + else: + warnings.warn( + "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.", + DeprecationWarning, + stacklevel=2, + ) + from bs4 import BeautifulSoup + + # TODO: Why is the parser 'html.parser' here? 
Using + # html5lib doesn't cause an infinite loop and is more + # accurate. Best to get rid of this entire section, I think. + self.soup = BeautifulSoup( + "", "html.parser", store_line_numbers=store_line_numbers, **kwargs + ) + # TODO: What are **kwargs exactly? Should they be passed in + # here in addition to/instead of being passed to the BeautifulSoup + # constructor? + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + # This will be set later to a real html5lib HTMLParser object, + # which we can use to track the current line number. + self.parser = None + self.store_line_numbers = store_line_numbers + + def documentClass(self) -> "Element": + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token: Dict[str, Any]) -> None: + name: str = cast(str, token["name"]) + publicId: Optional[str] = cast(Optional[str], token["publicId"]) + systemId: Optional[str] = cast(Optional[str], token["systemId"]) + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name: str, namespace: str) -> "Element": + sourceline: Optional[int] = None + sourcepos: Optional[int] = None + if self.parser is not None and self.store_line_numbers: + # This represents the point immediately after the end of the + # tag. We don't know when the tag started, but we do know + # where it ended -- the character just before this one. + sourceline, sourcepos = self.parser.tokenizer.stream.position() + assert sourcepos is not None + sourcepos = sourcepos - 1 + tag = self.soup.new_tag( + name, namespace, sourceline=sourceline, sourcepos=sourcepos + ) + + return Element(tag, self.soup, namespace) + + def commentClass(self, data: str) -> "TextNode": + return TextNode(Comment(data), self.soup) + + def fragmentClass(self) -> "Element": + """This is only used by html5lib HTMLParser.parseFragment(), + which is never used by Beautiful Soup, only by the html5lib + unit tests. Since we don't currently hook into those tests, + the implementation is left blank. + """ + raise NotImplementedError() + + def getFragment(self) -> "Element": + """This is only used by the html5lib unit tests. Since we + don't currently hook into those tests, the implementation is + left blank. + """ + raise NotImplementedError() + + def appendChild(self, node: "Element") -> None: + # TODO: This code is not covered by the BS4 tests, and + # apparently not triggered by the html5lib test suite either. + # But it doesn't seem test-specific and there are calls to it + # (or a method with the same name) all over html5lib, so I'm + # leaving the implementation in place rather than replacing it + # with NotImplementedError() + self.soup.append(node.element) + + def getDocument(self) -> "BeautifulSoup": + return self.soup + + def testSerializer(self, element: "Element") -> str: + """This is only used by the html5lib unit tests. Since we + don't currently hook into those tests, the implementation is + left blank. 
+ """ + raise NotImplementedError() + + +class AttrList(object): + """Represents a Tag's attributes in a way compatible with html5lib.""" + + element: Tag + attrs: _AttributeValues + + def __init__(self, element: Tag): + self.element = element + self.attrs = dict(self.element.attrs) + + def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]: + return list(self.attrs.items()).__iter__() + + def __setitem__(self, name: str, value: _AttributeValue) -> None: + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = self.element.cdata_list_attributes or {} + if name in list_attr.get("*", []) or ( + self.element.name in list_attr + and name in list_attr.get(self.element.name, []) + ): + # A node that is being cloned may have already undergone + # this procedure. Check for this and skip it. + if not isinstance(value, list): + assert isinstance(value, str) + value = self.element.attribute_value_list_class( + nonwhitespace_re.findall(value) + ) + self.element[name] = value + + def items(self) -> Iterable[Tuple[str, _AttributeValue]]: + return list(self.attrs.items()) + + def keys(self) -> Iterable[str]: + return list(self.attrs.keys()) + + def __len__(self) -> int: + return len(self.attrs) + + def __getitem__(self, name: str) -> _AttributeValue: + return self.attrs[name] + + def __contains__(self, name: str) -> bool: + return name in list(self.attrs.keys()) + + +class BeautifulSoupNode(treebuilder_base.Node): + element: PageElement + soup: "BeautifulSoup" + namespace: Optional[_NamespaceURL] + + @property + def nodeType(self) -> int: + """Return the html5lib constant corresponding to the type of + the underlying DOM object. + + NOTE: This property is only accessed by the html5lib test + suite, not by Beautiful Soup proper. + """ + raise NotImplementedError() + + # TODO-TYPING: typeshed stubs are incorrect about this; + # cloneNode returns a new Node, not None. + def cloneNode(self) -> treebuilder_base.Node: + raise NotImplementedError() + + +class Element(BeautifulSoupNode): + element: Tag + namespace: Optional[_NamespaceURL] + + def __init__( + self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL] + ): + treebuilder_base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node: "BeautifulSoupNode") -> None: + string_child: Optional[NavigableString] = None + child: PageElement + if type(node.element) is NavigableString: + string_child = child = node.element + else: + child = node.element + node.parent = self + + if ( + child is not None + and child.parent is not None + and not isinstance(child, str) + ): + node.element.extract() + + if ( + string_child is not None + and self.element.contents + and type(self.element.contents[-1]) is NavigableString + ): + # We are appending a string onto another string. + # TODO This has O(n^2) performance, for input like + # "a</a>a</a>a</a>..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + string_child) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + if isinstance(node, str): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + + # Tell Beautiful Soup to act as if it parsed this element + # immediately after the parent's last descendant. (Or + # immediately after the parent, if it has no children.) 
+ if self.element.contents: + most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() + else: + most_recent_element = self.element + + self.soup.object_was_parsed( + child, parent=self.element, most_recent_element=most_recent_element + ) + + def getAttributes(self) -> AttrList: + if isinstance(self.element, Comment): + return {} + return AttrList(self.element) + + # An HTML5lib attribute name may either be a single string, + # or a tuple (namespace, name). + _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]] + # Now we can define the type this method accepts as a dictionary + # mapping those attribute names to single string values. + _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str] + + def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None: + if attributes is not None and len(attributes) > 0: + # Replace any namespaced attributes with + # NamespacedAttribute objects. + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + # We can now cast attributes to the type of Dict + # used by Beautiful Soup. + normalized_attributes = cast(_AttributeValues, attributes) + + # Values for tags like 'class' came in as single strings; + # replace them with lists of strings as appropriate. + self.soup.builder._replace_cdata_list_attribute_values( + self.name, normalized_attributes + ) + + # Then set the attributes on the Tag associated with this + # BeautifulSoupNode. + for name, value_or_values in list(normalized_attributes.items()): + self.element[name] = value_or_values + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. 
+ self.soup.builder.set_up_substitutions(self.element) + + attributes = property(getAttributes, setAttributes) + + def insertText( + self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None + ) -> None: + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore( + self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode" + ) -> None: + index = self.element.index(refNode.element) + if ( + type(node.element) is NavigableString + and self.element.contents + and type(self.element.contents[index - 1]) is NavigableString + ): + # (See comments in appendChild) + old_node = self.element.contents[index - 1] + assert type(old_node) is NavigableString + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node: "Element") -> None: + node.element.extract() + + def reparentChildren(self, new_parent: "Element") -> None: + """Move all of this tag's children into another tag.""" + # print("MOVE", self.element.contents) + # print("FROM", self.element) + # print("TO", new_parent.element) + + element = self.element + new_parent_element = new_parent.element + # Determine what this tag's next_element will be once all the children + # are removed. + final_next_element = element.next_sibling + + new_parents_last_descendant = new_parent_element._last_descendant(False, False) + if len(new_parent_element.contents) > 0: + # The new parent already contains children. We will be + # appending this tag's children to the end. + + # We can make this assertion since we know new_parent has + # children. + assert new_parents_last_descendant is not None + new_parents_last_child = new_parent_element.contents[-1] + new_parents_last_descendant_next_element = ( + new_parents_last_descendant.next_element + ) + else: + # The new parent contains no children. + new_parents_last_child = None + new_parents_last_descendant_next_element = new_parent_element.next_element + + to_append = element.contents + if len(to_append) > 0: + # Set the first child's previous_element and previous_sibling + # to elements within the new parent + first_child = to_append[0] + if new_parents_last_descendant is not None: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element + first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant is not None: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child is not None: + new_parents_last_child.next_sibling = first_child + + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant( + is_initialized=False, accept_self=True + ) + + # Since we passed accept_self=True into _last_descendant, + # there's no possibility that the result is None. 
+            assert last_childs_last_descendant is not None
+            last_childs_last_descendant.next_element = (
+                new_parents_last_descendant_next_element
+            )
+            if new_parents_last_descendant_next_element is not None:
+                # TODO-COVERAGE: This code has no test coverage and
+                # I'm not sure how to get html5lib to go through this
+                # path, but it's just the other side of the previous
+                # line.
+                new_parents_last_descendant_next_element.previous_element = (
+                    last_childs_last_descendant
+                )
+            last_childs_last_descendant.next_sibling = None
+
+        for child in to_append:
+            child.parent = new_parent_element
+            new_parent_element.contents.append(child)
+
+        # Now that this element has no children, change its .next_element.
+        element.contents = []
+        element.next_element = final_next_element
+
+        # print("DONE WITH MOVE")
+        # print("FROM", self.element)
+        # print("TO", new_parent_element)
+
+    # TODO-TYPING: typeshed stubs are incorrect about this;
+    # hasContent returns a boolean, not None.
+    def hasContent(self) -> bool:
+        return len(self.element.contents) > 0
+
+    # TODO-TYPING: typeshed stubs are incorrect about this;
+    # cloneNode returns a new Node, not None.
+    def cloneNode(self) -> treebuilder_base.Node:
+        tag = self.soup.new_tag(self.element.name, self.namespace)
+        node = Element(tag, self.soup, self.namespace)
+        for key, value in self.attributes:
+            node.attributes[key] = value
+        return node
+
+    def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
+        if self.namespace is None:
+            return namespaces["html"], self.name
+        else:
+            return self.namespace, self.name
+
+    nameTuple = property(getNameTuple)
+
+
+class TextNode(BeautifulSoupNode):
+    element: NavigableString
+
+    def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
+        treebuilder_base.Node.__init__(self, None)
+        self.element = element
+        self.soup = soup
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/_htmlparser.py b/.venv/lib/python3.12/site-packages/bs4/builder/_htmlparser.py
new file mode 100644
index 00000000..417f7dc4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/_htmlparser.py
@@ -0,0 +1,474 @@
+# encoding: utf-8
+"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+from __future__ import annotations
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+__all__ = [
+    "HTMLParserTreeBuilder",
+]
+
+from html.parser import HTMLParser
+
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    TYPE_CHECKING,
+    Tuple,
+    Type,
+    Union,
+)
+
+from bs4.element import (
+    AttributeDict,
+    CData,
+    Comment,
+    Declaration,
+    Doctype,
+    ProcessingInstruction,
+)
+from bs4.dammit import EntitySubstitution, UnicodeDammit
+
+from bs4.builder import (
+    DetectsXMLParsedAsHTML,
+    HTML,
+    HTMLTreeBuilder,
+    STRICT,
+)
+
+from bs4.exceptions import ParserRejectedMarkup
+
+if TYPE_CHECKING:
+    from bs4 import BeautifulSoup
+    from bs4.element import NavigableString
+    from bs4._typing import (
+        _Encoding,
+        _Encodings,
+        _RawMarkup,
+    )
+
+HTMLPARSER = "html.parser"
+
+_DuplicateAttributeHandler = Callable[[Dict[str, str], str, str], None]
+
+
+class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
+    #: Constant to handle duplicate attributes by replacing earlier values
+    #: with later ones.
+    REPLACE: str = "replace"
+
+    #: Constant to handle duplicate attributes by ignoring later values
+    #: and keeping the earlier ones.
+ IGNORE: str = "ignore" + + """A subclass of the Python standard library's HTMLParser class, which + listens for HTMLParser events and translates them into calls + to Beautiful Soup's tree construction API. + + :param on_duplicate_attribute: A strategy for what to do if a + tag includes the same attribute more than once. Accepted + values are: REPLACE (replace earlier values with later + ones, the default), IGNORE (keep the earliest value + encountered), or a callable. A callable must take three + arguments: the dictionary of attributes already processed, + the name of the duplicate attribute, and the most recent value + encountered. + """ + + def __init__( + self, + soup: BeautifulSoup, + *args: Any, + on_duplicate_attribute: Union[str, _DuplicateAttributeHandler] = REPLACE, + **kwargs: Any, + ): + self.soup = soup + self.on_duplicate_attribute = on_duplicate_attribute + self.attribute_dict_class = soup.builder.attribute_dict_class + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + self._initialize_xml_detector() + + on_duplicate_attribute: Union[str, _DuplicateAttributeHandler] + already_closed_empty_element: List[str] + soup: BeautifulSoup + + def error(self, message: str) -> None: + # NOTE: This method is required so long as Python 3.9 is + # supported. The corresponding code is removed from HTMLParser + # in 3.5, but not removed from ParserBase until 3.10. + # https://github.com/python/cpython/issues/76025 + # + # The original implementation turned the error into a warning, + # but in every case I discovered, this made HTMLParser + # immediately crash with an error message that was less + # helpful than the warning. The new implementation makes it + # more clear that html.parser just can't parse this + # markup. The 3.10 implementation does the same, though it + # raises AssertionError rather than calling a method. (We + # catch this error and wrap it in a ParserRejectedMarkup.) + raise ParserRejectedMarkup(message) + + def handle_startendtag( + self, name: str, attrs: List[Tuple[str, Optional[str]]] + ) -> None: + """Handle an incoming empty-element tag. + + html.parser only calls this method when the markup looks like + <tag/>. + """ + # `handle_empty_element` tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag, and we want to call + # handle_endtag ourselves. + self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag( + self, + name: str, + attrs: List[Tuple[str, Optional[str]]], + handle_empty_element: bool = True, + ) -> None: + """Handle an opening tag, e.g. '<tag>' + + :param handle_empty_element: True if this tag is known to be + an empty-element tag (i.e. there is not expected to be any + closing tag). + """ + # TODO: handle namespaces here? + attr_dict: AttributeDict = self.attribute_dict_class() + for key, value in attrs: + # Change None attribute values to the empty string + # for consistency with the other tree builders. 
+ if value is None: + value = "" + if key in attr_dict: + # A single attribute shows up multiple times in this + # tag. How to handle it depends on the + # on_duplicate_attribute setting. + on_dupe = self.on_duplicate_attribute + if on_dupe == self.IGNORE: + pass + elif on_dupe in (None, self.REPLACE): + attr_dict[key] = value + else: + on_dupe = cast(_DuplicateAttributeHandler, on_dupe) + on_dupe(attr_dict, key, value) + else: + attr_dict[key] = value + # print("START", name) + sourceline: Optional[int] + sourcepos: Optional[int] + if self.soup.builder.store_line_numbers: + sourceline, sourcepos = self.getpos() + else: + sourceline = sourcepos = None + tag = self.soup.handle_starttag( + name, None, None, attr_dict, sourceline=sourceline, sourcepos=sourcepos + ) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # <tag/>.) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) + + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + if self._root_tag_name is None: + self._root_tag_encountered(name) + + def handle_endtag(self, name: str, check_already_closed: bool = True) -> None: + """Handle a closing tag, e.g. '</tag>' + + :param name: A tag name. + :param check_already_closed: True if this tag is expected to + be the closing portion of an empty-element tag, + e.g. '<tag></tag>'. + """ + # print("END", name) + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + # print("ALREADY CLOSED", name) + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) + + def handle_data(self, data: str) -> None: + """Handle some textual data that shows up between tags.""" + self.soup.handle_data(data) + + def handle_charref(self, name: str) -> None: + """Handle a numeric character reference by converting it to the + corresponding Unicode character and treating it as textual + data. + + :param name: Character number, possibly in hexadecimal. + """ + # TODO: This was originally a workaround for a bug in + # HTMLParser. (http://bugs.python.org/issue13633) The bug has + # been fixed, but removing this code still makes some + # Beautiful Soup tests fail. This needs investigation. + if name.startswith("x"): + real_name = int(name.lstrip("x"), 16) + elif name.startswith("X"): + real_name = int(name.lstrip("X"), 16) + else: + real_name = int(name) + + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. 
+            for encoding in (self.soup.original_encoding, "windows-1252"):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError:
+                    pass
+        if not data:
+            try:
+                data = chr(real_name)
+            except (ValueError, OverflowError):
+                pass
+        data = data or "\N{REPLACEMENT CHARACTER}"
+        self.handle_data(data)
+
+    def handle_entityref(self, name: str) -> None:
+        """Handle a named entity reference by converting it to the
+        corresponding Unicode character(s) and treating it as textual
+        data.
+
+        :param name: Name of the entity reference.
+        """
+        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
+        if character is not None:
+            data = character
+        else:
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was a character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
+        self.handle_data(data)
+
+    def handle_comment(self, data: str) -> None:
+        """Handle an HTML comment.
+
+        :param data: The text of the comment.
+        """
+        self.soup.endData()
+        self.soup.handle_data(data)
+        self.soup.endData(Comment)
+
+    def handle_decl(self, data: str) -> None:
+        """Handle a DOCTYPE declaration.
+
+        :param data: The text of the declaration.
+        """
+        self.soup.endData()
+        data = data[len("DOCTYPE ") :]
+        self.soup.handle_data(data)
+        self.soup.endData(Doctype)
+
+    def unknown_decl(self, data: str) -> None:
+        """Handle a declaration of unknown type -- probably a CDATA block.
+
+        :param data: The text of the declaration.
+        """
+        cls: Type[NavigableString]
+        if data.upper().startswith("CDATA["):
+            cls = CData
+            data = data[len("CDATA[") :]
+        else:
+            cls = Declaration
+        self.soup.endData()
+        self.soup.handle_data(data)
+        self.soup.endData(cls)
+
+    def handle_pi(self, data: str) -> None:
+        """Handle a processing instruction.
+
+        :param data: The text of the instruction.
+        """
+        self.soup.endData()
+        self.soup.handle_data(data)
+        self._document_might_be_xml(data)
+        self.soup.endData(ProcessingInstruction)
+
+
+class HTMLParserTreeBuilder(HTMLTreeBuilder):
+    """A Beautiful Soup `bs4.builder.TreeBuilder` that uses the
+    :py:class:`html.parser.HTMLParser` parser, found in the Python
+    standard library.
+
+    """
+
+    is_xml: bool = False
+    picklable: bool = True
+    NAME: str = HTMLPARSER
+    features: Iterable[str] = [NAME, HTML, STRICT]
+    parser_args: Tuple[Iterable[Any], Dict[str, Any]]
+
+    #: html.parser knows which line number and position in the
+    #: original file is the source of an element.
+    TRACKS_LINE_NUMBERS: bool = True
+
+    def __init__(
+        self,
+        parser_args: Optional[Iterable[Any]] = None,
+        parser_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ):
+        """Constructor.
+
+        :param parser_args: Positional arguments to pass into
+            the BeautifulSoupHTMLParser constructor, once it's
+            invoked.
+        :param parser_kwargs: Keyword arguments to pass into
+            the BeautifulSoupHTMLParser constructor, once it's
+            invoked.
+        :param kwargs: Keyword arguments for the superclass constructor.
+        """
+        # Some keyword arguments will be pulled out of kwargs and placed
+        # into parser_kwargs.
+ extra_parser_kwargs = dict() + for arg in ("on_duplicate_attribute",): + if arg in kwargs: + value = kwargs.pop(arg) + extra_parser_kwargs[arg] = value + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} + parser_kwargs.update(extra_parser_kwargs) + parser_kwargs["convert_charrefs"] = False + self.parser_args = (parser_args, parser_kwargs) + + def prepare_markup( + self, + markup: _RawMarkup, + user_specified_encoding: Optional[_Encoding] = None, + document_declared_encoding: Optional[_Encoding] = None, + exclude_encodings: Optional[_Encodings] = None, + ) -> Iterable[Tuple[str, Optional[_Encoding], Optional[_Encoding], bool]]: + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: Some markup -- probably a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. + This TreeBuilder uses Unicode, Dammit to convert the markup + into Unicode, so the ``markup`` element of the tuple will + always be a string. + """ + if isinstance(markup, str): + # Parse Unicode as-is. + yield (markup, None, None, False) + return + + # Ask UnicodeDammit to sniff the most likely encoding. + + known_definite_encodings: List[_Encoding] = [] + if user_specified_encoding: + # This was provided by the end-user; treat it as a known + # definite encoding per the algorithm laid out in the + # HTML5 spec. (See the EncodingDetector class for + # details.) + known_definite_encodings.append(user_specified_encoding) + + user_encodings: List[_Encoding] = [] + if document_declared_encoding: + # This was found in the document; treat it as a slightly + # lower-priority user encoding. + user_encodings.append(document_declared_encoding) + + dammit = UnicodeDammit( + markup, + known_definite_encodings=known_definite_encodings, + user_encodings=user_encodings, + is_html=True, + exclude_encodings=exclude_encodings, + ) + + if dammit.unicode_markup is None: + # In every case I've seen, Unicode, Dammit is able to + # convert the markup into Unicode, even if it needs to use + # REPLACEMENT CHARACTER. But there is a code path that + # could result in unicode_markup being None, and + # HTMLParser can only parse Unicode, so here we handle + # that code path. + raise ParserRejectedMarkup( + "Could not convert input to Unicode, and html.parser will not accept bytestrings." + ) + else: + yield ( + dammit.unicode_markup, + dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters, + ) + + def feed(self, markup: _RawMarkup) -> None: + args, kwargs = self.parser_args + + # HTMLParser.feed will only handle str, but + # BeautifulSoup.markup is allowed to be _RawMarkup, because + # it's set by the yield value of + # TreeBuilder.prepare_markup. Fortunately, + # HTMLParserTreeBuilder.prepare_markup always yields a str + # (UnicodeDammit.unicode_markup). + assert isinstance(markup, str) + + # We know BeautifulSoup calls TreeBuilder.initialize_soup + # before calling feed(), so we can assume self.soup + # is set. 
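+ # (The assert below simply makes that assumption explicit for
+ # type checkers such as mypy.)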
+ assert self.soup is not None + parser = BeautifulSoupHTMLParser(self.soup, *args, **kwargs) + + try: + parser.feed(markup) + parser.close() + except AssertionError as e: + # html.parser raises AssertionError in rare cases to + # indicate a fatal problem with the markup, especially + # when there's an error in the doctype declaration. + raise ParserRejectedMarkup(e) + parser.already_closed_empty_element = [] diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py b/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py new file mode 100644 index 00000000..1f367da3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/builder/_lxml.py @@ -0,0 +1,490 @@ +# encoding: utf-8 +from __future__ import annotations + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + "LXMLTreeBuilderForXML", + "LXMLTreeBuilder", +] + + +from typing import ( + Any, + Dict, + Iterable, + List, + Optional, + Set, + Tuple, + Type, + TYPE_CHECKING, + Union, +) +from typing_extensions import TypeAlias + +from io import BytesIO +from io import StringIO +from lxml import etree +from bs4.element import ( + AttributeDict, + XMLAttributeDict, + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, + XMLProcessingInstruction, +) +from bs4.builder import ( + DetectsXMLParsedAsHTML, + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + TreeBuilder, + XML, +) +from bs4.dammit import EncodingDetector +from bs4.exceptions import ParserRejectedMarkup + +if TYPE_CHECKING: + from bs4._typing import ( + _Encoding, + _Encodings, + _NamespacePrefix, + _NamespaceURL, + _NamespaceMapping, + _InvertedNamespaceMapping, + _RawMarkup, + ) + from bs4 import BeautifulSoup + +LXML: str = "lxml" + + +def _invert(d: dict[Any, Any]) -> dict[Any, Any]: + "Invert a dictionary." + return dict((v, k) for k, v in list(d.items())) + + +_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser] +_ParserOrParserClass: TypeAlias = Union[ + _LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser] +] + + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser + + is_xml: bool = True + + processing_instruction_class: Type[ProcessingInstruction] + + NAME: str = "lxml-xml" + ALTERNATE_NAMES: Iterable[str] = ["xml"] + + # Well, it's permissive by XML parser standards. + features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE: int = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. + DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace") + + DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS) + + nsmaps: List[Optional[_InvertedNamespaceMapping]] + empty_element_tags: Set[str] + parser: Any + _default_parser: Optional[etree.XMLParser] + + # NOTE: If we parsed Element objects and looked at .sourceline, + # we'd be able to see the line numbers from the original document. + # But instead we build an XMLParser or HTMLParser object to serve + # as the target of parse messages, and those messages don't include + # line numbers. + # See: https://bugs.launchpad.net/lxml/+bug/1846906 + + def initialize_soup(self, soup: BeautifulSoup) -> None: + """Let the BeautifulSoup object know about the standard namespace + mapping. + + :param soup: A `BeautifulSoup`. + """ + # Beyond this point, self.soup is set, so we can assume (and + # assert) it's not None whenever necessary. 
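+ # Registering the standard mapping below means the "xml"
+ # prefix is always known to the soup, even before any markup
+ # has been parsed.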
+ super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+ self._register_namespaces(self.DEFAULT_NSMAPS)
+
+ def _register_namespaces(self, mapping: Dict[str, str]) -> None:
+ """Let the BeautifulSoup object know about namespaces encountered
+ while parsing the document.
+
+ This might be useful later on when creating CSS selectors.
+
+ This will track (almost) all namespaces, even ones that were
+ only in scope for part of the document. If two namespaces have
+ the same prefix, only the first one encountered will be
+ tracked. Un-prefixed namespaces are not tracked.
+
+ :param mapping: A dictionary mapping namespace prefixes to URIs.
+ """
+ assert self.soup is not None
+ for key, value in list(mapping.items()):
+ # This is 'if key' and not 'if key is not None' because we
+ # don't track un-prefixed namespaces. Soupsieve will
+ # treat an un-prefixed namespace as the default, which
+ # causes confusion in some cases.
+ if key and key not in self.soup._namespaces:
+ # Let the BeautifulSoup object know about a new namespace.
+ # If there are multiple namespaces defined with the same
+ # prefix, the first one in the document takes precedence.
+ self.soup._namespaces[key] = value
+
+ def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
+ """Find the default parser for the given encoding.
+
+ :return: Either a parser object or a class, which
+ will be instantiated with default arguments.
+ """
+ if self._default_parser is not None:
+ return self._default_parser
+ return self.DEFAULT_PARSER_CLASS(target=self, recover=True, encoding=encoding)
+
+ def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
+ """Instantiate an appropriate parser for the given encoding.
+
+ :param encoding: A string.
+ :return: A parser object such as an `etree.XMLParser`.
+ """
+ # Use the default parser.
+ parser = self.default_parser(encoding)
+
+ if callable(parser):
+ # Instantiate the parser with default arguments
+ parser = parser(target=self, recover=True, encoding=encoding)
+ return parser
+
+ def __init__(
+ self,
+ parser: Optional[etree.XMLParser] = None,
+ empty_element_tags: Optional[Set[str]] = None,
+ **kwargs: Any,
+ ):
+ # TODO: Issue a warning if parser is present but not a
+ # callable, since that means there's no way to create new
+ # parsers for different encodings.
+ self._default_parser = parser
+ self.soup = None
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+ self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
+ if "attribute_dict_class" not in kwargs:
+ kwargs["attribute_dict_class"] = XMLAttributeDict
+ super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
+
+ def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
+ # Split the namespace URL out of a fully-qualified lxml tag
+ # name. Copied from lxml's src/lxml/sax.py.
+ if tag[0] == "{":
+ namespace, name = tag[1:].split("}", 1)
+ return (namespace, name)
+ else:
+ return (None, tag)
+
+ def prepare_markup(
+ self,
+ markup: _RawMarkup,
+ user_specified_encoding: Optional[_Encoding] = None,
+ document_declared_encoding: Optional[_Encoding] = None,
+ exclude_encodings: Optional[_Encodings] = None,
+ ) -> Iterable[
+ Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
+ ]:
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ lxml really wants to get a bytestring and convert it to
+ Unicode itself.
+ So instead of using UnicodeDammit to convert
+ the bytestring to Unicode using different encodings, this
+ implementation uses EncodingDetector to iterate over the
+ encodings, and tells lxml to try to parse the document as each
+ one in turn.
+
+ :param markup: Some markup -- hopefully a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples: (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
+ """
+ is_html = not self.is_xml
+ if is_html:
+ self.processing_instruction_class = ProcessingInstruction
+ # We're in HTML mode, so if we're given XML, that's worth
+ # noting.
+ DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
+ else:
+ self.processing_instruction_class = XMLProcessingInstruction
+
+ if isinstance(markup, str):
+ # We were given Unicode. Maybe lxml can parse Unicode on
+ # this system?
+
+ # TODO: This is a workaround for
+ # https://bugs.launchpad.net/lxml/+bug/1948551.
+ # We can remove it once the upstream issue is fixed.
+ if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
+ markup = markup[1:]
+ yield markup, None, document_declared_encoding, False
+
+ if isinstance(markup, str):
+ # No, apparently not. Convert the Unicode to UTF-8 and
+ # tell lxml to parse it as UTF-8.
+ yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)
+
+ # Since the document was Unicode in the first place, there
+ # is no need to try any more strategies; we know this will
+ # work.
+ return
+
+ known_definite_encodings: List[_Encoding] = []
+ if user_specified_encoding:
+ # This was provided by the end-user; treat it as a known
+ # definite encoding per the algorithm laid out in the
+ # HTML5 spec. (See the EncodingDetector class for
+ # details.)
+ known_definite_encodings.append(user_specified_encoding)
+
+ user_encodings: List[_Encoding] = []
+ if document_declared_encoding:
+ # This was found in the document; treat it as a slightly
+ # lower-priority user encoding.
+ user_encodings.append(document_declared_encoding)
+
+ detector = EncodingDetector(
+ markup,
+ known_definite_encodings=known_definite_encodings,
+ user_encodings=user_encodings,
+ is_html=is_html,
+ exclude_encodings=exclude_encodings,
+ )
+ for encoding in detector.encodings:
+ yield (detector.markup, encoding, document_declared_encoding, False)
+
+ def feed(self, markup: _RawMarkup) -> None:
+ io: Union[BytesIO, StringIO]
+ if isinstance(markup, bytes):
+ io = BytesIO(markup)
+ elif isinstance(markup, str):
+ io = StringIO(markup)
+
+ # initialize_soup is called before feed, so we know this
+ # is not None.
+ assert self.soup is not None
+
+ # Call feed() at least once, even if the markup is empty,
+ # or the parser won't be initialized.
+ data = io.read(self.CHUNK_SIZE)
+ try:
+ self.parser = self.parser_for(self.soup.original_encoding)
+ self.parser.feed(data)
+ while len(data) != 0:
+ # Now call feed() on the rest of the data, chunk by chunk.
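+ # (lxml's feed() interface accepts data incrementally, so
+ # the rest of the document can be handed over in CHUNK_SIZE
+ # pieces.)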
+ data = io.read(self.CHUNK_SIZE)
+ if len(data) != 0:
+ self.parser.feed(data)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
+ raise ParserRejectedMarkup(e)
+
+ def close(self) -> None:
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+
+ def start(
+ self,
+ tag: str | bytes,
+ attrs: Dict[str | bytes, str | bytes],
+ nsmap: _NamespaceMapping = {},
+ ) -> None:
+ # This is called by lxml code as a result of calling
+ # BeautifulSoup.feed(), and we know self.soup is set by the time feed()
+ # is called.
+ assert self.soup is not None
+ assert isinstance(tag, str)
+
+ # We need to recreate the attribute dict for three
+ # reasons. First, for type checking, so we can assert there
+ # are no bytestrings in the keys or values. Second, because we
+ # need a mutable dict--lxml might send us an immutable
+ # dictproxy. Third, so we can handle namespaced attribute
+ # names by converting the keys to NamespacedAttributes.
+ new_attrs: Dict[Union[str, NamespacedAttribute], str] = (
+ self.attribute_dict_class()
+ )
+ for k, v in attrs.items():
+ assert isinstance(k, str)
+ assert isinstance(v, str)
+ new_attrs[k] = v
+
+ nsprefix: Optional[_NamespacePrefix] = None
+ namespace: Optional[_NamespaceURL] = None
+ # Invert each namespace map as it comes in.
+ if len(nsmap) == 0 and len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
+ self.nsmaps.append(None)
+ elif len(nsmap) > 0:
+ # A new namespace mapping has come into play.
+
+ # First, let the BeautifulSoup object know about it.
+ self._register_namespaces(nsmap)
+
+ # Then, add it to our running list of inverted namespace
+ # mappings.
+ self.nsmaps.append(_invert(nsmap))
+
+ # The currently active namespace prefixes have
+ # changed. Calculate the new mapping so it can be stored
+ # with all Tag objects created while these prefixes are in
+ # scope.
+ current_mapping = dict(self.active_namespace_prefixes[-1])
+ current_mapping.update(nsmap)
+
+ # We should not track un-prefixed namespaces as we can only hold one
+ # and it will be recognized as the default namespace by soupsieve,
+ # which may be confusing in some situations.
+ if "" in current_mapping:
+ del current_mapping[""]
+ self.active_namespace_prefixes.append(current_mapping)
+
+ # Also treat the namespace mapping as a set of attributes on the
+ # tag, so we can recreate it later.
+ for prefix, namespace in list(nsmap.items()):
+ attribute = NamespacedAttribute(
+ "xmlns", prefix, "http://www.w3.org/2000/xmlns/"
+ )
+ new_attrs[attribute] = namespace
+
+ # Namespaces are in play. Find any attributes that came in
+ # from lxml with namespaces attached to their names, and
+ # turn them into NamespacedAttribute objects.
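+ # For example, lxml reports a namespaced attribute name in
+ # Clark notation, "{http://www.w3.org/XML/1998/namespace}lang";
+ # _getNsTag() splits off the URL, and the prefix lookup below
+ # turns the pair into the NamespacedAttribute "xml:lang".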
+ final_attrs: AttributeDict = self.attribute_dict_class() + for attr, value in list(new_attrs.items()): + namespace, attr = self._getNsTag(attr) + if namespace is None: + final_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + final_attrs[attr] = value + + namespace, tag = self._getNsTag(tag) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag( + tag, + namespace, + nsprefix, + final_attrs, + namespaces=self.active_namespace_prefixes[-1], + ) + + def _prefix_for_namespace( + self, namespace: Optional[_NamespaceURL] + ) -> Optional[_NamespacePrefix]: + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name: str | bytes) -> None: + assert self.soup is not None + assert isinstance(name, str) + self.soup.endData() + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + out_of_scope_nsmap = self.nsmaps.pop() + + if out_of_scope_nsmap is not None: + # This tag introduced a namespace mapping which is no + # longer in scope. Recalculate the currently active + # namespace prefixes. + self.active_namespace_prefixes.pop() + + def pi(self, target: str, data: str) -> None: + assert self.soup is not None + self.soup.endData() + data = target + " " + data + self.soup.handle_data(data) + self.soup.endData(self.processing_instruction_class) + + def data(self, data: str | bytes) -> None: + assert self.soup is not None + assert isinstance(data, str) + self.soup.handle_data(data) + + def doctype(self, name: str, pubid: str, system: str) -> None: + assert self.soup is not None + self.soup.endData() + doctype_string = Doctype._string_for_name_and_ids(name, pubid, system) + self.soup.handle_data(doctype_string) + self.soup.endData(containerClass=Doctype) + + def comment(self, text: str | bytes) -> None: + "Handle comments as Comment objects." + assert self.soup is not None + assert isinstance(text, str) + self.soup.endData() + self.soup.handle_data(text) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment: str) -> str: + """See `TreeBuilder`.""" + return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + NAME: str = LXML + ALTERNATE_NAMES: Iterable[str] = ["lxml-html"] + + features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE] + is_xml: bool = False + + def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass: + return etree.HTMLParser + + def feed(self, markup: _RawMarkup) -> None: + # We know self.soup is set by the time feed() is called. 
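+ # Unlike the XML builder's chunked feed() above, this hands
+ # the whole document to lxml's HTMLParser in a single call.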
+ assert self.soup is not None
+ encoding = self.soup.original_encoding
+ try:
+ self.parser = self.parser_for(encoding)
+ self.parser.feed(markup)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
+ raise ParserRejectedMarkup(e)
+
+ def test_fragment_to_document(self, fragment: str) -> str:
+ """See `TreeBuilder`."""
+ return "<html><body>%s</body></html>" % fragment
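
A minimal usage sketch (not part of the packaged source above), showing how the builders in this diff are selected by the feature names they advertise. The markup strings are invented for the example; the behavior follows from the code above, assuming lxml is installed.

    from bs4 import BeautifulSoup

    # "html.parser" selects HTMLParserTreeBuilder; the extra keyword
    # argument is routed through parser_kwargs to BeautifulSoupHTMLParser.
    soup = BeautifulSoup(
        '<a class="first" class="second">dup</a>',
        "html.parser",
        on_duplicate_attribute="ignore",
    )
    print(soup.a["class"])  # ['first'] -- the duplicate value was ignored

    # "xml" selects LXMLTreeBuilderForXML, whose _register_namespaces()
    # makes parsed prefixes available to CSS selectors.
    doc = '<doc xmlns:x="http://example.com/"><x:item>hi</x:item></doc>'
    soup = BeautifulSoup(doc, "xml")
    print(soup.select("x|item"))  # [<x:item>hi</x:item>]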