Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/builder/__init__.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/bs4/builder/__init__.py | 848 |
1 file changed, 848 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py b/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py new file mode 100644 index 00000000..5f2b38de --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py @@ -0,0 +1,848 @@ +from __future__ import annotations + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +from collections import defaultdict +import re +from types import ModuleType +from typing import ( + Any, + cast, + Dict, + Iterable, + List, + Optional, + Pattern, + Set, + Tuple, + Type, + TYPE_CHECKING, +) +import warnings +import sys +from bs4.element import ( + AttributeDict, + AttributeValueList, + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + RubyParenthesisString, + RubyTextString, + Stylesheet, + Script, + TemplateString, + nonwhitespace_re, +) + +# Exceptions were moved to their own module in 4.13. Import here for +# backwards compatibility. +from bs4.exceptions import ParserRejectedMarkup + +from bs4._typing import ( + _AttributeValues, + _RawAttributeValue, +) + +from bs4._warnings import XMLParsedAsHTMLWarning + +if TYPE_CHECKING: + from bs4 import BeautifulSoup + from bs4.element import ( + NavigableString, + Tag, + ) + from bs4._typing import ( + _AttributeValue, + _Encoding, + _Encodings, + _RawOrProcessedAttributeValues, + _RawMarkup, + ) + +__all__ = [ + "HTMLTreeBuilder", + "SAXTreeBuilder", + "TreeBuilder", + "TreeBuilderRegistry", +] + +# Some useful features for a TreeBuilder to have. +FAST = "fast" +PERMISSIVE = "permissive" +STRICT = "strict" +XML = "xml" +HTML = "html" +HTML_5 = "html5" + +__all__ = [ + "TreeBuilderRegistry", + "TreeBuilder", + "HTMLTreeBuilder", + "DetectsXMLParsedAsHTML", + + "ParserRejectedMarkup", # backwards compatibility only as of 4.13.0 +] + +class TreeBuilderRegistry(object): + """A way of looking up TreeBuilder subclasses by their name or by desired + features. + """ + + builders_for_feature: Dict[str, List[Type[TreeBuilder]]] + builders: List[Type[TreeBuilder]] + + def __init__(self) -> None: + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class: type[TreeBuilder]) -> None: + """Register a treebuilder based on its advertised features. + + :param treebuilder_class: A subclass of `TreeBuilder`. its + `TreeBuilder.features` attribute should list its features. + """ + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]: + """Look up a TreeBuilder subclass with the desired features. + + :param features: A list of features to look for. If none are + provided, the most recently registered TreeBuilder subclass + will be used. + :return: A TreeBuilder subclass, or None if there's no + registered subclass with all the requested features. + """ + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. 
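# Usage sketch (assuming a bs4 4.13 install): looking up a registered
# TreeBuilder class by feature, as lookup() describes. Which class comes back
# depends on the parser libraries installed in this environment.
from bs4.builder import builder_registry

builder_registry.lookup("permissive", "xml")  # lxml's XML builder, or None if lxml isn't installed
builder_registry.lookup("strict")             # e.g. the stdlib html.parser builder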
+ feature_list = list(features) + feature_list.reverse() + candidates = None + candidate_set = None + while len(feature_list) > 0: + feature = feature_list.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection(set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None or candidates is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + + +#: The `BeautifulSoup` constructor will take a list of features +#: and use it to look up `TreeBuilder` classes in this registry. +builder_registry: TreeBuilderRegistry = TreeBuilderRegistry() + + +class TreeBuilder(object): + """Turn a textual document into a Beautiful Soup object tree. + + This is an abstract superclass which smooths out the behavior of + different parser libraries into a single, unified interface. + + :param multi_valued_attributes: If this is set to None, the + TreeBuilder will not turn any values for attributes like + 'class' into lists. Setting this to a dictionary will + customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES` + for an example. + + Internally, these are called "CDATA list attributes", but that + probably doesn't make sense to an end-user, so the argument name + is ``multi_valued_attributes``. + + :param preserve_whitespace_tags: A set of tags to treat + the way <pre> tags are treated in HTML. Tags in this set + are immune from pretty-printing; their contents will always be + output as-is. + + :param string_containers: A dictionary mapping tag names to + the classes that should be instantiated to contain the textual + contents of those tags. The default is to use NavigableString + for every tag, no matter what the name. You can override the + default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`. + + :param store_line_numbers: If the parser keeps track of the line + numbers and positions of the original markup, that information + will, by default, be stored in each corresponding + :py:class:`bs4.element.Tag` object. You can turn this off by + passing store_line_numbers=False; then Tag.sourcepos and + Tag.sourceline will always be None. If the parser you're using + doesn't keep track of this information, then store_line_numbers + is irrelevant. + + :param attribute_dict_class: The value of a multi-valued attribute + (such as HTML's 'class') willl be stored in an instance of this + class. The default is Beautiful Soup's built-in + `AttributeValueList`, which is a normal Python list, and you + will probably never need to change it. 
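# A minimal sketch of the multi_valued_attributes argument described above;
# keyword arguments passed to BeautifulSoup are forwarded to the TreeBuilder.
from bs4 import BeautifulSoup

plain = BeautifulSoup('<p class="body strikeout"></p>', "html.parser",
                      multi_valued_attributes=None)
plain.p["class"]       # "body strikeout" - kept as a single string

custom = BeautifulSoup('<p data-tags="a b"></p>', "html.parser",
                       multi_valued_attributes={"*": {"data-tags"}})
custom.p["data-tags"]  # ["a", "b"] - split, because the custom map lists it under "*"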
+ """ + + USE_DEFAULT: Any = object() #: :meta private: + + def __init__( + self, + multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT, + preserve_whitespace_tags: Set[str] = USE_DEFAULT, + store_line_numbers: bool = USE_DEFAULT, + string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT, + empty_element_tags: Set[str] = USE_DEFAULT, + attribute_dict_class: Type[AttributeDict] = AttributeDict, + attribute_value_list_class: Type[AttributeValueList] = AttributeValueList, + ): + self.soup = None + if multi_valued_attributes is self.USE_DEFAULT: + multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = multi_valued_attributes + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + if empty_element_tags is self.USE_DEFAULT: + self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS + else: + self.empty_element_tags = empty_element_tags + # TODO: store_line_numbers is probably irrelevant now that + # the behavior of sourceline and sourcepos has been made consistent + # everywhere. + if store_line_numbers == self.USE_DEFAULT: + store_line_numbers = self.TRACKS_LINE_NUMBERS + self.store_line_numbers = store_line_numbers + if string_containers == self.USE_DEFAULT: + string_containers = self.DEFAULT_STRING_CONTAINERS + self.string_containers = string_containers + self.attribute_dict_class = attribute_dict_class + self.attribute_value_list_class = attribute_value_list_class + + NAME: str = "[Unknown tree builder]" + ALTERNATE_NAMES: Iterable[str] = [] + features: Iterable[str] = [] + + is_xml: bool = False + picklable: bool = False + + soup: Optional[BeautifulSoup] #: :meta private: + + #: A tag will be considered an empty-element + #: tag when and only when it has no contents. + empty_element_tags: Optional[Set[str]] = None #: :meta private: + cdata_list_attributes: Dict[str, Set[str]] #: :meta private: + preserve_whitespace_tags: Set[str] #: :meta private: + string_containers: Dict[str, Type[NavigableString]] #: :meta private: + tracks_line_numbers: bool #: :meta private: + + #: A value for these tag/attribute combinations is a space- or + #: comma-separated list of CDATA, rather than a single CDATA. + DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set) + + #: Whitespace should be preserved inside these tags. + DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set() + + #: The textual contents of tags with these names should be + #: instantiated with some class other than `bs4.element.NavigableString`. + DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {} + + #: By default, tags are treated as empty-element tags if they have + #: no contents--that is, using XML rules. HTMLTreeBuilder + #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the + #: HTML 4 and HTML5 standards. + DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None + + #: Most parsers don't keep track of line numbers. + TRACKS_LINE_NUMBERS: bool = False + + def initialize_soup(self, soup: BeautifulSoup) -> None: + """The BeautifulSoup object has been initialized and is now + being associated with the TreeBuilder. + + :param soup: A BeautifulSoup object. + """ + self.soup = soup + + def reset(self) -> None: + """Do any work necessary to reset the underlying parser + for a new document. + + By default, this does nothing. 
+ """ + pass + + def can_be_empty_element(self, tag_name: str) -> bool: + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a <p> tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty <p> tag + will be presented as "<p></p>", not "<p/>" or "<p>". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no children. + "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will + be left alone. + + :param tag_name: The name of a markup tag. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup: _RawMarkup) -> None: + """Run incoming markup through some parsing process.""" + raise NotImplementedError() + + def prepare_markup( + self, + markup: _RawMarkup, + user_specified_encoding: Optional[_Encoding] = None, + document_declared_encoding: Optional[_Encoding] = None, + exclude_encodings: Optional[_Encodings] = None, + ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]: + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + :param markup: The markup that's about to be parsed. + :param user_specified_encoding: The user asked to try this encoding + to convert the markup into a Unicode string. + :param document_declared_encoding: The markup itself claims to be + in this encoding. NOTE: This argument is not used by the + calling code and can probably be removed. + :param exclude_encodings: The user asked *not* to try any of + these encodings. + + :yield: A series of 4-tuples: (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy that the parser can try + to convert the document to Unicode and parse it. Each + strategy will be tried in turn. + + By default, the only strategy is to parse the markup + as-is. See `LXMLTreeBuilderForXML` and + `HTMLParserTreeBuilder` for implementations that take into + account the quirks of particular parsers. + + :meta private: + + """ + yield markup, None, None, False + + def test_fragment_to_document(self, fragment: str) -> str: + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty <head> tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of unit tests. + + :param fragment: A fragment of HTML. + :return: A full HTML document. + :meta private: + """ + return fragment + + def set_up_substitutions(self, tag: Tag) -> bool: + """Set up any substitutions that will need to be performed on + a `Tag` when it's output as a string. + + By default, this does nothing. See `HTMLTreeBuilder` for a + case where this is used. + + :return: Whether or not a substitution was performed. + :meta private: + """ + return False + + def _replace_cdata_list_attribute_values( + self, tag_name: str, attrs: _RawOrProcessedAttributeValues + ) -> _AttributeValues: + """When an attribute value is associated with a tag that can + have multiple values for that attribute, convert the string + value to a list of strings. 
+ + Basically, replaces class="foo bar" with class=["foo", "bar"] + + NOTE: This method modifies its input in place. + + :param tag_name: The name of a tag. + :param attrs: A dictionary containing the tag's attributes. + Any appropriate attribute values will be modified in place. + :return: The modified dictionary that was originally passed in. + """ + + # First, cast the attrs dict to _AttributeValues. This might + # not be accurate yet, but it will be by the time this method + # returns. + modified_attrs = cast(_AttributeValues, attrs) + if not modified_attrs or not self.cdata_list_attributes: + # Nothing to do. + return modified_attrs + + # There is at least a possibility that we need to modify one of + # the attribute values. + universal: Set[str] = self.cdata_list_attributes.get("*", set()) + tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None) + for attr in list(modified_attrs.keys()): + modified_value: _AttributeValue + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + original_value: _AttributeValue = modified_attrs[attr] + if isinstance(original_value, _RawAttributeValue): + # This is a _RawAttributeValue (a string) that + # needs to be split and converted to a + # AttributeValueList so it can be an + # _AttributeValue. + modified_value = self.attribute_value_list_class( + nonwhitespace_re.findall(original_value) + ) + else: + # html5lib calls setAttributes twice for the + # same tag when rearranging the parse tree. On + # the second call the attribute value here is + # already a list. This can also happen when a + # Tag object is cloned. If this happens, leave + # the value alone rather than trying to split + # it again. + modified_value = original_value + modified_attrs[attr] = modified_value + return modified_attrs + + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events. + + This is not currently used for anything, and it will be removed + soon. It was a good idea, but it wasn't properly integrated into the + rest of Beautiful Soup, so there have been long stretches where it + hasn't worked properly. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + warnings.warn( + "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.", + DeprecationWarning, + stacklevel=2, + ) + super(SAXTreeBuilder, self).__init__(*args, **kwargs) + + def feed(self, markup: _RawMarkup) -> None: + raise NotImplementedError() + + def close(self) -> None: + pass + + def startElement(self, name: str, attrs: Dict[str, str]) -> None: + attrs = AttributeDict((key[1], value) for key, value in list(attrs.items())) + # print("Start %s, %r" % (name, attrs)) + assert self.soup is not None + self.soup.handle_starttag(name, None, None, attrs) + + def endElement(self, name: str) -> None: + # print("End %s" % name) + assert self.soup is not None + self.soup.handle_endtag(name) + + def startElementNS( + self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str] + ) -> None: + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None: + # Throw away (ns, nodeName) for now. 
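# A sketch of the default splitting implemented above: "class" appears under
# "*" in DEFAULT_CDATA_LIST_ATTRIBUTES, "id" does not.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p class="foo bar" id="a b"></p>', "html.parser")
soup.p["class"]   # ["foo", "bar"]
soup.p["id"]      # "a b"
str(soup.p)       # '<p class="foo bar" id="a b"></p>' - the list is rejoined on output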
+ self.endElement(nodeName) + # handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix: str, nodeValue: str) -> None: + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix: str) -> None: + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content: str) -> None: + assert self.soup is not None + self.soup.handle_data(content) + + def startDocument(self) -> None: + pass + + def endDocument(self) -> None: + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML, such as which tags are treated + specially by the HTML standard. + """ + + #: Some HTML tags are defined as having no contents. Beautiful Soup + #: treats these specially. + DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = set( + [ + # These are from HTML5. + "area", + "base", + "br", + "col", + "embed", + "hr", + "img", + "input", + "keygen", + "link", + "menuitem", + "meta", + "param", + "source", + "track", + "wbr", + # These are from earlier versions of HTML and are removed in HTML5. + "basefont", + "bgsound", + "command", + "frame", + "image", + "isindex", + "nextid", + "spacer", + ] + ) + + #: The HTML standard defines these tags as block-level elements. Beautiful + #: Soup does not treat these elements differently from other elements, + #: but it may do so eventually, and this information is available if + #: you need to use it. + DEFAULT_BLOCK_ELEMENTS: Set[str] = set( + [ + "address", + "article", + "aside", + "blockquote", + "canvas", + "dd", + "div", + "dl", + "dt", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "header", + "hr", + "li", + "main", + "nav", + "noscript", + "ol", + "output", + "p", + "pre", + "section", + "table", + "tfoot", + "ul", + "video", + ] + ) + + #: These HTML tags need special treatment so they can be + #: represented by a string class other than `bs4.element.NavigableString`. + #: + #: For some of these tags, it's because the HTML standard defines + #: an unusual content model for them. I made this list by going + #: through the HTML spec + #: (https://html.spec.whatwg.org/#metadata-content) and looking for + #: "metadata content" elements that can contain strings. + #: + #: The Ruby tags (<rt> and <rp>) are here despite being normal + #: "phrasing content" tags, because the content they contain is + #: qualitatively different from other text in the document, and it + #: can be useful to be able to distinguish it. + #: + #: TODO: Arguably <noscript> could go here but it seems + #: qualitatively different from the other tags. + DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = { + "rt": RubyTextString, + "rp": RubyParenthesisString, + "style": Stylesheet, + "script": Script, + "template": TemplateString, + } + + #: The HTML standard defines these attributes as containing a + #: space-separated list of values, not a single value. That is, + #: class="foo bar" means that the 'class' attribute has two values, + #: 'foo' and 'bar', not the single value 'foo bar'. When we + #: encounter one of these attributes, we will parse its value into + #: a list of values if possible. Upon output, the list will be + #: converted back into a string. 
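# A sketch of the string containers listed above: strings inside <style> and
# <script> are instantiated as specialized NavigableString subclasses.
from bs4 import BeautifulSoup
from bs4.element import Script, Stylesheet

soup = BeautifulSoup("<style>p {color: red}</style><script>f()</script>", "html.parser")
isinstance(soup.style.string, Stylesheet)  # True
isinstance(soup.script.string, Script)     # True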
+ DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = { + "*": {"class", "accesskey", "dropzone"}, + "a": {"rel", "rev"}, + "link": {"rel", "rev"}, + "td": {"headers"}, + "th": {"headers"}, + "form": {"accept-charset"}, + "object": {"archive"}, + # These are HTML5 specific, as are *.accesskey and *.dropzone above. + "area": {"rel"}, + "icon": {"sizes"}, + "iframe": {"sandbox"}, + "output": {"for"}, + } + + #: By default, whitespace inside these HTML tags will be + #: preserved rather than being collapsed. + DEFAULT_PRESERVE_WHITESPACE_TAGS: set[str] = set(["pre", "textarea"]) + + def set_up_substitutions(self, tag: Tag) -> bool: + """Replace the declared encoding in a <meta> tag with a placeholder, + to be substituted when the tag is output to a string. + + An HTML document may come in to Beautiful Soup as one + encoding, but exit in a different encoding, and the <meta> tag + needs to be changed to reflect this. + + :return: Whether or not a substitution was performed. + + :meta private: + """ + # We are only interested in <meta> tags + if tag.name != "meta": + return False + + # TODO: This cast will fail in the (very unlikely) scenario + # that the programmer who instantiates the TreeBuilder + # specifies meta['content'] or meta['charset'] as + # cdata_list_attributes. + content: Optional[str] = cast(Optional[str], tag.get("content")) + charset: Optional[str] = cast(Optional[str], tag.get("charset")) + + # But we can accommodate meta['http-equiv'] being made a + # cdata_list_attribute (again, very unlikely) without much + # trouble. + http_equiv: List[str] = tag.get_attribute_list("http-equiv") + + # We are interested in <meta> tags that say what encoding the + # document was originally in. This means HTML 5-style <meta> + # tags that provide the "charset" attribute. It also means + # HTML 4-style <meta> tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. + substituted = False + if charset is not None: + # HTML 5 style: + # <meta charset="utf8"> + tag["charset"] = CharsetMetaAttributeValue(charset) + substituted = True + + elif content is not None and any( + x.lower() == "content-type" for x in http_equiv + ): + # HTML 4 style: + # <meta http-equiv="content-type" content="text/html; charset=utf8"> + tag["content"] = ContentMetaAttributeValue(content) + substituted = True + + return substituted + + +class DetectsXMLParsedAsHTML(object): + """A mixin class for any class (a TreeBuilder, or some class used by a + TreeBuilder) that's in a position to detect whether an XML + document is being incorrectly parsed as HTML, and issue an + appropriate warning. + + This requires being able to observe an incoming processing + instruction that might be an XML declaration, and also able to + observe tags as they're opened. If you can't do that for a given + `TreeBuilder`, there's a less reliable implementation based on + examining the raw markup. + """ + + #: Regular expression for seeing if string markup has an <html> tag. + LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I) + + #: Regular expression for seeing if byte markup has an <html> tag. + LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I) + + #: The start of an XML document string. + XML_PREFIX: str = "<?xml" + + #: The start of an XML document bytestring. 
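# A sketch of the <meta> substitution set_up_substitutions performs above:
# when the soup is encoded, the declared charset follows the output encoding.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<meta charset="latin-1"/>', "html.parser")
soup.encode("utf-8")  # b'<meta charset="utf-8"/>' - declared encoding rewritten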
+ XML_PREFIX_B: bytes = b"<?xml" + + # This is typed as str, not `ProcessingInstruction`, because this + # check may be run before any Beautiful Soup objects are created. + _first_processing_instruction: Optional[str] #: :meta private: + _root_tag_name: Optional[str] #: :meta private: + + @classmethod + def warn_if_markup_looks_like_xml( + cls, markup: Optional[_RawMarkup], stacklevel: int = 3 + ) -> bool: + """Perform a check on some markup to see if it looks like XML + that's not XHTML. If so, issue a warning. + + This is much less reliable than doing the check while parsing, + but some of the tree builders can't do that. + + :param stacklevel: The stacklevel of the code calling this\ + function. + + :return: True if the markup looks like non-XHTML XML, False + otherwise. + """ + if markup is None: + return False + markup = markup[:500] + if isinstance(markup, bytes): + markup_b: bytes = markup + looks_like_xml = markup_b.startswith( + cls.XML_PREFIX_B + ) and not cls.LOOKS_LIKE_HTML_B.search(markup) + else: + markup_s: str = markup + looks_like_xml = markup_s.startswith( + cls.XML_PREFIX + ) and not cls.LOOKS_LIKE_HTML.search(markup) + + if looks_like_xml: + cls._warn(stacklevel=stacklevel + 2) + return True + return False + + @classmethod + def _warn(cls, stacklevel: int = 5) -> None: + """Issue a warning about XML being parsed as HTML.""" + warnings.warn( + XMLParsedAsHTMLWarning.MESSAGE, + XMLParsedAsHTMLWarning, + stacklevel=stacklevel, + ) + + def _initialize_xml_detector(self) -> None: + """Call this method before parsing a document.""" + self._first_processing_instruction = None + self._root_tag_name = None + + def _document_might_be_xml(self, processing_instruction: str) -> None: + """Call this method when encountering an XML declaration, or a + "processing instruction" that might be an XML declaration. + + This helps Beautiful Soup detect potential issues later, if + the XML document turns out to be a non-XHTML document that's + being parsed as XML. + """ + if ( + self._first_processing_instruction is not None + or self._root_tag_name is not None + ): + # The document has already started. Don't bother checking + # anymore. + return + + self._first_processing_instruction = processing_instruction + + # We won't know until we encounter the first tag whether or + # not this is actually a problem. + + def _root_tag_encountered(self, name: str) -> None: + """Call this when you encounter the document's root tag. + + This is where we actually check whether an XML document is + being incorrectly parsed as HTML, and issue the warning. + """ + if self._root_tag_name is not None: + # This method was incorrectly called multiple times. Do + # nothing. + return + + self._root_tag_name = name + + if ( + name != "html" + and self._first_processing_instruction is not None + and self._first_processing_instruction.lower().startswith("xml ") + ): + # We encountered an XML declaration and then a tag other + # than 'html'. This is a reliable indicator that a + # non-XHTML document is being parsed as XML. + self._warn(stacklevel=10) + + +def register_treebuilders_from(module: ModuleType) -> None: + """Copy TreeBuilders from the given module into this module.""" + this_module = sys.modules[__name__] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. 
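# A sketch of the warning this mixin issues: parsing XML markup with an HTML
# builder triggers XMLParsedAsHTMLWarning, which callers may silence if intentional.
import warnings

from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
BeautifulSoup('<?xml version="1.0"?><root><a/></root>', "html.parser")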
+ this_module.builder_registry.register(obj) + + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last resort. +from . import _htmlparser # noqa: E402 + +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. + pass |
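# A sketch of the registration order set up above: builders registered later
# win lookup() ties, so lxml's builder (registered last, when installed) is
# preferred for a generic "html" request, with html5lib and then html.parser
# as fallbacks.
from bs4.builder import builder_registry

builder_registry.lookup("html")         # e.g. LXMLTreeBuilder when lxml is installed
builder_registry.lookup("html.parser")  # always the stdlib HTMLParserTreeBuilder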