Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/builder/__init__.py')
 .venv/lib/python3.12/site-packages/bs4/builder/__init__.py | 848
 1 file changed, 848 insertions(+), 0 deletions(-)
diff --git a/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py b/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py
new file mode 100644
index 00000000..5f2b38de
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/bs4/builder/__init__.py
@@ -0,0 +1,848 @@
+from __future__ import annotations
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+from collections import defaultdict
+import re
+from types import ModuleType
+from typing import (
+ Any,
+ cast,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Pattern,
+ Set,
+ Tuple,
+ Type,
+ TYPE_CHECKING,
+)
+import warnings
+import sys
+from bs4.element import (
+ AttributeDict,
+ AttributeValueList,
+ CharsetMetaAttributeValue,
+ ContentMetaAttributeValue,
+ RubyParenthesisString,
+ RubyTextString,
+ Stylesheet,
+ Script,
+ TemplateString,
+ nonwhitespace_re,
+)
+
+# Exceptions were moved to their own module in 4.13. Import here for
+# backwards compatibility.
+from bs4.exceptions import ParserRejectedMarkup
+
+from bs4._typing import (
+ _AttributeValues,
+ _RawAttributeValue,
+)
+
+from bs4._warnings import XMLParsedAsHTMLWarning
+
+if TYPE_CHECKING:
+ from bs4 import BeautifulSoup
+ from bs4.element import (
+ NavigableString,
+ Tag,
+ )
+ from bs4._typing import (
+ _AttributeValue,
+ _Encoding,
+ _Encodings,
+ _RawOrProcessedAttributeValues,
+ _RawMarkup,
+ )
+
+# Some useful features for a TreeBuilder to have.
+FAST = "fast"
+PERMISSIVE = "permissive"
+STRICT = "strict"
+XML = "xml"
+HTML = "html"
+HTML_5 = "html5"
+
+__all__ = [
+ "TreeBuilderRegistry",
+ "TreeBuilder",
+ "HTMLTreeBuilder",
+ "DetectsXMLParsedAsHTML",
+
+ "ParserRejectedMarkup", # backwards compatibility only as of 4.13.0
+]
+
+class TreeBuilderRegistry(object):
+ """A way of looking up TreeBuilder subclasses by their name or by desired
+ features.
+ """
+
+ builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
+ builders: List[Type[TreeBuilder]]
+
+ def __init__(self) -> None:
+ self.builders_for_feature = defaultdict(list)
+ self.builders = []
+
+ def register(self, treebuilder_class: type[TreeBuilder]) -> None:
+ """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of `TreeBuilder`. Its
+ `TreeBuilder.features` attribute should list its features.
+ """
+ for feature in treebuilder_class.features:
+ self.builders_for_feature[feature].insert(0, treebuilder_class)
+ self.builders.insert(0, treebuilder_class)
+
+ def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
+ """Look up a TreeBuilder subclass with the desired features.
+
+ :param features: A list of features to look for. If none are
+ provided, the most recently registered TreeBuilder subclass
+ will be used.
+ :return: A TreeBuilder subclass, or None if there's no
+ registered subclass with all the requested features.
+ """
+ if len(self.builders) == 0:
+ # There are no builders at all.
+ return None
+
+ if len(features) == 0:
+ # They didn't ask for any features. Give them the most
+ # recently registered builder.
+ return self.builders[0]
+
+ # Go down the list of features in order, and eliminate any builders
+ # that don't match every feature.
+ feature_list = list(features)
+ feature_list.reverse()
+ candidates = None
+ candidate_set = None
+ while len(feature_list) > 0:
+ feature = feature_list.pop()
+ we_have_the_feature = self.builders_for_feature.get(feature, [])
+ if len(we_have_the_feature) > 0:
+ if candidates is None:
+ candidates = we_have_the_feature
+ candidate_set = set(candidates)
+ else:
+ # Eliminate any candidates that don't have this feature.
+ candidate_set = candidate_set.intersection(set(we_have_the_feature))
+
+ # The only valid candidates are the ones in candidate_set.
+ # Go through the original list of candidates and pick the first one
+ # that's in candidate_set.
+ if candidate_set is None or candidates is None:
+ return None
+ for candidate in candidates:
+ if candidate in candidate_set:
+ return candidate
+ return None
+
+
+#: The `BeautifulSoup` constructor will take a list of features
+#: and use it to look up `TreeBuilder` classes in this registry.
+builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()
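+
+# Illustrative usage sketch: once the builders registered at the bottom
+# of this module have been imported, the registry can be queried by
+# feature. Exactly which class comes back depends on which parser
+# libraries are installed, so the example name below is only an
+# assumption.
+#
+#   >>> from bs4.builder import builder_registry
+#   >>> builder = builder_registry.lookup("html")  # e.g. LXMLTreeBuilder if lxml is installed
+#   >>> builder_registry.lookup("no-such-feature") is None
+#   True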
+
+
+class TreeBuilder(object):
+ """Turn a textual document into a Beautiful Soup object tree.
+
+ This is an abstract superclass which smooths out the behavior of
+ different parser libraries into a single, unified interface.
+
+ :param multi_valued_attributes: If this is set to None, the
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this to a dictionary will
+ customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
+ for an example.
+
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is ``multi_valued_attributes``.
+
+ :param preserve_whitespace_tags: A set of tags to treat
+ the way <pre> tags are treated in HTML. Tags in this set
+ are immune from pretty-printing; their contents will always be
+ output as-is.
+
+ :param string_containers: A dictionary mapping tag names to
+ the classes that should be instantiated to contain the textual
+ contents of those tags. The default is to use NavigableString
+ for every tag, no matter what the name. You can override the
+ default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.
+
+ :param store_line_numbers: If the parser keeps track of the line
+ numbers and positions of the original markup, that information
+ will, by default, be stored in each corresponding
+ :py:class:`bs4.element.Tag` object. You can turn this off by
+ passing store_line_numbers=False; then Tag.sourcepos and
+ Tag.sourceline will always be None. If the parser you're using
+ doesn't keep track of this information, then store_line_numbers
+ is irrelevant.
+
+    :param attribute_value_list_class: The value of a multi-valued
+      attribute (such as HTML's 'class') will be stored in an instance
+      of this class. The default is Beautiful Soup's built-in
+      `AttributeValueList`, which is a normal Python list, and you
+      will probably never need to change it.
+ """
+
+ USE_DEFAULT: Any = object() #: :meta private:
+
+ def __init__(
+ self,
+ multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
+ preserve_whitespace_tags: Set[str] = USE_DEFAULT,
+ store_line_numbers: bool = USE_DEFAULT,
+ string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
+ empty_element_tags: Set[str] = USE_DEFAULT,
+ attribute_dict_class: Type[AttributeDict] = AttributeDict,
+ attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
+ ):
+ self.soup = None
+ if multi_valued_attributes is self.USE_DEFAULT:
+ multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+ self.cdata_list_attributes = multi_valued_attributes
+ if preserve_whitespace_tags is self.USE_DEFAULT:
+ preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+ if empty_element_tags is self.USE_DEFAULT:
+ self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
+ else:
+ self.empty_element_tags = empty_element_tags
+ # TODO: store_line_numbers is probably irrelevant now that
+ # the behavior of sourceline and sourcepos has been made consistent
+ # everywhere.
+        if store_line_numbers is self.USE_DEFAULT:
+            store_line_numbers = self.TRACKS_LINE_NUMBERS
+        self.store_line_numbers = store_line_numbers
+        if string_containers is self.USE_DEFAULT:
+ string_containers = self.DEFAULT_STRING_CONTAINERS
+ self.string_containers = string_containers
+ self.attribute_dict_class = attribute_dict_class
+ self.attribute_value_list_class = attribute_value_list_class
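+
+    # Illustrative usage sketch: these keyword arguments are normally
+    # supplied through the BeautifulSoup constructor, which forwards
+    # extra keyword arguments to the TreeBuilder it instantiates.
+    #
+    #   >>> from bs4 import BeautifulSoup
+    #   >>> soup = BeautifulSoup('<p class="a b">x</p>', "html.parser",
+    #   ...                      multi_valued_attributes=None)
+    #   >>> soup.p["class"]          # left as a single string
+    #   'a b'
+    #   >>> soup = BeautifulSoup("<p>x</p>", "html.parser",
+    #   ...                      store_line_numbers=False)
+    #   >>> soup.p.sourceline is None
+    #   True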
+
+ NAME: str = "[Unknown tree builder]"
+ ALTERNATE_NAMES: Iterable[str] = []
+ features: Iterable[str] = []
+
+ is_xml: bool = False
+ picklable: bool = False
+
+ soup: Optional[BeautifulSoup] #: :meta private:
+
+ #: A tag will be considered an empty-element
+ #: tag when and only when it has no contents.
+ empty_element_tags: Optional[Set[str]] = None #: :meta private:
+ cdata_list_attributes: Dict[str, Set[str]] #: :meta private:
+ preserve_whitespace_tags: Set[str] #: :meta private:
+ string_containers: Dict[str, Type[NavigableString]] #: :meta private:
+    store_line_numbers: bool #: :meta private:
+
+ #: A value for these tag/attribute combinations is a space- or
+ #: comma-separated list of CDATA, rather than a single CDATA.
+ DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)
+
+ #: Whitespace should be preserved inside these tags.
+ DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()
+
+ #: The textual contents of tags with these names should be
+ #: instantiated with some class other than `bs4.element.NavigableString`.
+    DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {}
+
+ #: By default, tags are treated as empty-element tags if they have
+ #: no contents--that is, using XML rules. HTMLTreeBuilder
+ #: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
+ #: HTML 4 and HTML5 standards.
+ DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None
+
+ #: Most parsers don't keep track of line numbers.
+ TRACKS_LINE_NUMBERS: bool = False
+
+ def initialize_soup(self, soup: BeautifulSoup) -> None:
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+
+ :param soup: A BeautifulSoup object.
+ """
+ self.soup = soup
+
+ def reset(self) -> None:
+ """Do any work necessary to reset the underlying parser
+ for a new document.
+
+ By default, this does nothing.
+ """
+ pass
+
+ def can_be_empty_element(self, tag_name: str) -> bool:
+ """Might a tag with this name be an empty-element tag?
+
+ The final markup may or may not actually present this tag as
+ self-closing.
+
+        For instance: an HTMLTreeBuilder does not consider a <p> tag to be
+        an empty-element tag (it's not in
+        HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
+ will be presented as "<p></p>", not "<p/>" or "<p>".
+
+ The default implementation has no opinion about which tags are
+ empty-element tags, so a tag will be presented as an
+ empty-element tag if and only if it has no children.
+ "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
+ be left alone.
+
+ :param tag_name: The name of a markup tag.
+ """
+ if self.empty_element_tags is None:
+ return True
+ return tag_name in self.empty_element_tags
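+
+    # Illustrative sketch: the base class says yes for any tag name,
+    # while HTMLTreeBuilder (below) consults its fixed set of HTML
+    # void elements.
+    #
+    #   >>> from bs4.builder import TreeBuilder, HTMLTreeBuilder
+    #   >>> TreeBuilder().can_be_empty_element("p")
+    #   True
+    #   >>> HTMLTreeBuilder().can_be_empty_element("br")
+    #   True
+    #   >>> HTMLTreeBuilder().can_be_empty_element("p")
+    #   False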
+
+ def feed(self, markup: _RawMarkup) -> None:
+ """Run incoming markup through some parsing process."""
+ raise NotImplementedError()
+
+ def prepare_markup(
+ self,
+ markup: _RawMarkup,
+ user_specified_encoding: Optional[_Encoding] = None,
+ document_declared_encoding: Optional[_Encoding] = None,
+ exclude_encodings: Optional[_Encodings] = None,
+ ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: The markup that's about to be parsed.
+ :param user_specified_encoding: The user asked to try this encoding
+ to convert the markup into a Unicode string.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding. NOTE: This argument is not used by the
+ calling code and can probably be removed.
+ :param exclude_encodings: The user asked *not* to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples: (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy that the parser can try
+ to convert the document to Unicode and parse it. Each
+ strategy will be tried in turn.
+
+ By default, the only strategy is to parse the markup
+ as-is. See `LXMLTreeBuilderForXML` and
+ `HTMLParserTreeBuilder` for implementations that take into
+ account the quirks of particular parsers.
+
+ :meta private:
+
+ """
+ yield markup, None, None, False
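+
+    # Illustrative sketch (assuming the bundled html.parser builder is
+    # available): concrete subclasses yield one or more decoding
+    # strategies, each a 4-tuple that the caller tries in turn.
+    #
+    #   >>> from bs4.builder import HTMLParserTreeBuilder
+    #   >>> builder = HTMLParserTreeBuilder()
+    #   >>> for markup, enc, declared, replaced in builder.prepare_markup(
+    #   ...         b"<html><body>caf\xc3\xa9</body></html>"):
+    #   ...     print(type(markup).__name__, enc, replaced)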
+
+ def test_fragment_to_document(self, fragment: str) -> str:
+ """Wrap an HTML fragment to make it look like a document.
+
+ Different parsers do this differently. For instance, lxml
+ introduces an empty <head> tag, and html5lib
+ doesn't. Abstracting this away lets us write simple tests
+ which run HTML fragments through the parser and compare the
+ results against other HTML fragments.
+
+ This method should not be used outside of unit tests.
+
+ :param fragment: A fragment of HTML.
+ :return: A full HTML document.
+ :meta private:
+ """
+ return fragment
+
+ def set_up_substitutions(self, tag: Tag) -> bool:
+ """Set up any substitutions that will need to be performed on
+ a `Tag` when it's output as a string.
+
+ By default, this does nothing. See `HTMLTreeBuilder` for a
+ case where this is used.
+
+ :return: Whether or not a substitution was performed.
+ :meta private:
+ """
+ return False
+
+ def _replace_cdata_list_attribute_values(
+ self, tag_name: str, attrs: _RawOrProcessedAttributeValues
+ ) -> _AttributeValues:
+ """When an attribute value is associated with a tag that can
+ have multiple values for that attribute, convert the string
+ value to a list of strings.
+
+ Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+ NOTE: This method modifies its input in place.
+
+ :param tag_name: The name of a tag.
+ :param attrs: A dictionary containing the tag's attributes.
+ Any appropriate attribute values will be modified in place.
+ :return: The modified dictionary that was originally passed in.
+ """
+
+ # First, cast the attrs dict to _AttributeValues. This might
+ # not be accurate yet, but it will be by the time this method
+ # returns.
+ modified_attrs = cast(_AttributeValues, attrs)
+ if not modified_attrs or not self.cdata_list_attributes:
+ # Nothing to do.
+ return modified_attrs
+
+ # There is at least a possibility that we need to modify one of
+ # the attribute values.
+ universal: Set[str] = self.cdata_list_attributes.get("*", set())
+ tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)
+ for attr in list(modified_attrs.keys()):
+ modified_value: _AttributeValue
+ if attr in universal or (tag_specific and attr in tag_specific):
+ # We have a "class"-type attribute whose string
+ # value is a whitespace-separated list of
+ # values. Split it into a list.
+ original_value: _AttributeValue = modified_attrs[attr]
+ if isinstance(original_value, _RawAttributeValue):
+ # This is a _RawAttributeValue (a string) that
+ # needs to be split and converted to a
+ # AttributeValueList so it can be an
+ # _AttributeValue.
+ modified_value = self.attribute_value_list_class(
+ nonwhitespace_re.findall(original_value)
+ )
+ else:
+ # html5lib calls setAttributes twice for the
+ # same tag when rearranging the parse tree. On
+ # the second call the attribute value here is
+ # already a list. This can also happen when a
+ # Tag object is cloned. If this happens, leave
+ # the value alone rather than trying to split
+ # it again.
+ modified_value = original_value
+ modified_attrs[attr] = modified_value
+ return modified_attrs
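+
+    # Illustrative sketch of the end-user-visible effect (the default
+    # HTML builders list 'class' as a multi-valued attribute, but not
+    # 'id'):
+    #
+    #   >>> from bs4 import BeautifulSoup
+    #   >>> soup = BeautifulSoup('<p id="x" class="foo bar">y</p>', "html.parser")
+    #   >>> soup.p["class"]
+    #   ['foo', 'bar']
+    #   >>> soup.p["id"]
+    #   'x'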
+
+
+class SAXTreeBuilder(TreeBuilder):
+ """A Beautiful Soup treebuilder that listens for SAX events.
+
+ This is not currently used for anything, and it will be removed
+ soon. It was a good idea, but it wasn't properly integrated into the
+ rest of Beautiful Soup, so there have been long stretches where it
+ hasn't worked properly.
+ """
+
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ warnings.warn(
+ "The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ super(SAXTreeBuilder, self).__init__(*args, **kwargs)
+
+ def feed(self, markup: _RawMarkup) -> None:
+ raise NotImplementedError()
+
+ def close(self) -> None:
+ pass
+
+ def startElement(self, name: str, attrs: Dict[str, str]) -> None:
+ attrs = AttributeDict((key[1], value) for key, value in list(attrs.items()))
+ # print("Start %s, %r" % (name, attrs))
+ assert self.soup is not None
+ self.soup.handle_starttag(name, None, None, attrs)
+
+ def endElement(self, name: str) -> None:
+ # print("End %s" % name)
+ assert self.soup is not None
+ self.soup.handle_endtag(name)
+
+ def startElementNS(
+ self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
+ ) -> None:
+ # Throw away (ns, nodeName) for now.
+ self.startElement(nodeName, attrs)
+
+ def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
+ # Throw away (ns, nodeName) for now.
+ self.endElement(nodeName)
+ # handler.endElementNS((ns, node.nodeName), node.nodeName)
+
+ def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
+ # Ignore the prefix for now.
+ pass
+
+ def endPrefixMapping(self, prefix: str) -> None:
+ # Ignore the prefix for now.
+ # handler.endPrefixMapping(prefix)
+ pass
+
+ def characters(self, content: str) -> None:
+ assert self.soup is not None
+ self.soup.handle_data(content)
+
+ def startDocument(self) -> None:
+ pass
+
+ def endDocument(self) -> None:
+ pass
+
+
+class HTMLTreeBuilder(TreeBuilder):
+ """This TreeBuilder knows facts about HTML, such as which tags are treated
+ specially by the HTML standard.
+ """
+
+ #: Some HTML tags are defined as having no contents. Beautiful Soup
+ #: treats these specially.
+ DEFAULT_EMPTY_ELEMENT_TAGS: Set[str] = set(
+ [
+ # These are from HTML5.
+ "area",
+ "base",
+ "br",
+ "col",
+ "embed",
+ "hr",
+ "img",
+ "input",
+ "keygen",
+ "link",
+ "menuitem",
+ "meta",
+ "param",
+ "source",
+ "track",
+ "wbr",
+ # These are from earlier versions of HTML and are removed in HTML5.
+ "basefont",
+ "bgsound",
+ "command",
+ "frame",
+ "image",
+ "isindex",
+ "nextid",
+ "spacer",
+ ]
+ )
+
+ #: The HTML standard defines these tags as block-level elements. Beautiful
+ #: Soup does not treat these elements differently from other elements,
+ #: but it may do so eventually, and this information is available if
+ #: you need to use it.
+ DEFAULT_BLOCK_ELEMENTS: Set[str] = set(
+ [
+ "address",
+ "article",
+ "aside",
+ "blockquote",
+ "canvas",
+ "dd",
+ "div",
+ "dl",
+ "dt",
+ "fieldset",
+ "figcaption",
+ "figure",
+ "footer",
+ "form",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "header",
+ "hr",
+ "li",
+ "main",
+ "nav",
+ "noscript",
+ "ol",
+ "output",
+ "p",
+ "pre",
+ "section",
+ "table",
+ "tfoot",
+ "ul",
+ "video",
+ ]
+ )
+
+ #: These HTML tags need special treatment so they can be
+ #: represented by a string class other than `bs4.element.NavigableString`.
+ #:
+ #: For some of these tags, it's because the HTML standard defines
+ #: an unusual content model for them. I made this list by going
+ #: through the HTML spec
+ #: (https://html.spec.whatwg.org/#metadata-content) and looking for
+ #: "metadata content" elements that can contain strings.
+ #:
+ #: The Ruby tags (<rt> and <rp>) are here despite being normal
+ #: "phrasing content" tags, because the content they contain is
+ #: qualitatively different from other text in the document, and it
+ #: can be useful to be able to distinguish it.
+ #:
+ #: TODO: Arguably <noscript> could go here but it seems
+ #: qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS: Dict[str, Type[NavigableString]] = {
+ "rt": RubyTextString,
+ "rp": RubyParenthesisString,
+ "style": Stylesheet,
+ "script": Script,
+ "template": TemplateString,
+ }
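+
+    # Illustrative sketch: with any of the HTML builders, the text
+    # inside a <script> tag is wrapped in the Script class rather than
+    # a plain NavigableString.
+    #
+    #   >>> from bs4 import BeautifulSoup
+    #   >>> from bs4.element import Script
+    #   >>> soup = BeautifulSoup("<script>x = 1;</script>", "html.parser")
+    #   >>> isinstance(soup.script.string, Script)
+    #   True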
+
+ #: The HTML standard defines these attributes as containing a
+ #: space-separated list of values, not a single value. That is,
+ #: class="foo bar" means that the 'class' attribute has two values,
+ #: 'foo' and 'bar', not the single value 'foo bar'. When we
+ #: encounter one of these attributes, we will parse its value into
+ #: a list of values if possible. Upon output, the list will be
+ #: converted back into a string.
+ DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
+ "*": {"class", "accesskey", "dropzone"},
+ "a": {"rel", "rev"},
+ "link": {"rel", "rev"},
+ "td": {"headers"},
+ "th": {"headers"},
+ "form": {"accept-charset"},
+ "object": {"archive"},
+ # These are HTML5 specific, as are *.accesskey and *.dropzone above.
+ "area": {"rel"},
+ "icon": {"sizes"},
+ "iframe": {"sandbox"},
+ "output": {"for"},
+ }
+
+ #: By default, whitespace inside these HTML tags will be
+ #: preserved rather than being collapsed.
+    DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set(["pre", "textarea"])
+
+ def set_up_substitutions(self, tag: Tag) -> bool:
+ """Replace the declared encoding in a <meta> tag with a placeholder,
+ to be substituted when the tag is output to a string.
+
+        An HTML document may come into Beautiful Soup in one
+        encoding, but exit in a different encoding, and the <meta> tag
+        needs to be changed to reflect this.
+
+ :return: Whether or not a substitution was performed.
+
+ :meta private:
+ """
+ # We are only interested in <meta> tags
+ if tag.name != "meta":
+ return False
+
+ # TODO: This cast will fail in the (very unlikely) scenario
+ # that the programmer who instantiates the TreeBuilder
+ # specifies meta['content'] or meta['charset'] as
+ # cdata_list_attributes.
+ content: Optional[str] = cast(Optional[str], tag.get("content"))
+ charset: Optional[str] = cast(Optional[str], tag.get("charset"))
+
+ # But we can accommodate meta['http-equiv'] being made a
+ # cdata_list_attribute (again, very unlikely) without much
+ # trouble.
+ http_equiv: List[str] = tag.get_attribute_list("http-equiv")
+
+ # We are interested in <meta> tags that say what encoding the
+ # document was originally in. This means HTML 5-style <meta>
+ # tags that provide the "charset" attribute. It also means
+ # HTML 4-style <meta> tags that provide the "content"
+ # attribute and have "http-equiv" set to "content-type".
+ #
+ # In both cases we will replace the value of the appropriate
+ # attribute with a standin object that can take on any
+ # encoding.
+ substituted = False
+ if charset is not None:
+ # HTML 5 style:
+ # <meta charset="utf8">
+ tag["charset"] = CharsetMetaAttributeValue(charset)
+ substituted = True
+
+ elif content is not None and any(
+ x.lower() == "content-type" for x in http_equiv
+ ):
+ # HTML 4 style:
+ # <meta http-equiv="content-type" content="text/html; charset=utf8">
+ tag["content"] = ContentMetaAttributeValue(content)
+ substituted = True
+
+ return substituted
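+
+    # Illustrative sketch of the effect (the exact serialization shown
+    # is an assumption about the default formatter): re-encoding the
+    # document rewrites the substituted <meta> value to match.
+    #
+    #   >>> from bs4 import BeautifulSoup
+    #   >>> soup = BeautifulSoup('<meta charset="utf-8">', "html.parser")
+    #   >>> b'charset="latin-1"' in soup.encode("latin-1")
+    #   True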
+
+
+class DetectsXMLParsedAsHTML(object):
+ """A mixin class for any class (a TreeBuilder, or some class used by a
+ TreeBuilder) that's in a position to detect whether an XML
+ document is being incorrectly parsed as HTML, and issue an
+ appropriate warning.
+
+ This requires being able to observe an incoming processing
+ instruction that might be an XML declaration, and also able to
+ observe tags as they're opened. If you can't do that for a given
+ `TreeBuilder`, there's a less reliable implementation based on
+ examining the raw markup.
+ """
+
+ #: Regular expression for seeing if string markup has an <html> tag.
+ LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I)
+
+ #: Regular expression for seeing if byte markup has an <html> tag.
+ LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I)
+
+ #: The start of an XML document string.
+ XML_PREFIX: str = "<?xml"
+
+ #: The start of an XML document bytestring.
+ XML_PREFIX_B: bytes = b"<?xml"
+
+ # This is typed as str, not `ProcessingInstruction`, because this
+ # check may be run before any Beautiful Soup objects are created.
+ _first_processing_instruction: Optional[str] #: :meta private:
+ _root_tag_name: Optional[str] #: :meta private:
+
+ @classmethod
+ def warn_if_markup_looks_like_xml(
+ cls, markup: Optional[_RawMarkup], stacklevel: int = 3
+ ) -> bool:
+ """Perform a check on some markup to see if it looks like XML
+ that's not XHTML. If so, issue a warning.
+
+ This is much less reliable than doing the check while parsing,
+ but some of the tree builders can't do that.
+
+        :param stacklevel: The stacklevel of the code calling this function.
+
+ :return: True if the markup looks like non-XHTML XML, False
+ otherwise.
+ """
+ if markup is None:
+ return False
+ markup = markup[:500]
+ if isinstance(markup, bytes):
+ markup_b: bytes = markup
+ looks_like_xml = markup_b.startswith(
+ cls.XML_PREFIX_B
+ ) and not cls.LOOKS_LIKE_HTML_B.search(markup)
+ else:
+ markup_s: str = markup
+ looks_like_xml = markup_s.startswith(
+ cls.XML_PREFIX
+ ) and not cls.LOOKS_LIKE_HTML.search(markup)
+
+ if looks_like_xml:
+ cls._warn(stacklevel=stacklevel + 2)
+ return True
+ return False
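+
+    # Illustrative sketch: feeding an XML document to an HTML parser
+    # triggers the warning this class exists to issue.
+    #
+    #   >>> import warnings
+    #   >>> from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
+    #   >>> with warnings.catch_warnings(record=True) as caught:
+    #   ...     warnings.simplefilter("always")
+    #   ...     _ = BeautifulSoup('<?xml version="1.0"?><rss/>', "html.parser")
+    #   >>> any(issubclass(w.category, XMLParsedAsHTMLWarning) for w in caught)
+    #   True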
+
+ @classmethod
+ def _warn(cls, stacklevel: int = 5) -> None:
+ """Issue a warning about XML being parsed as HTML."""
+ warnings.warn(
+ XMLParsedAsHTMLWarning.MESSAGE,
+ XMLParsedAsHTMLWarning,
+ stacklevel=stacklevel,
+ )
+
+ def _initialize_xml_detector(self) -> None:
+ """Call this method before parsing a document."""
+ self._first_processing_instruction = None
+ self._root_tag_name = None
+
+ def _document_might_be_xml(self, processing_instruction: str) -> None:
+ """Call this method when encountering an XML declaration, or a
+ "processing instruction" that might be an XML declaration.
+
+ This helps Beautiful Soup detect potential issues later, if
+ the XML document turns out to be a non-XHTML document that's
+ being parsed as XML.
+ """
+ if (
+ self._first_processing_instruction is not None
+ or self._root_tag_name is not None
+ ):
+ # The document has already started. Don't bother checking
+ # anymore.
+ return
+
+ self._first_processing_instruction = processing_instruction
+
+ # We won't know until we encounter the first tag whether or
+ # not this is actually a problem.
+
+ def _root_tag_encountered(self, name: str) -> None:
+ """Call this when you encounter the document's root tag.
+
+ This is where we actually check whether an XML document is
+ being incorrectly parsed as HTML, and issue the warning.
+ """
+ if self._root_tag_name is not None:
+ # This method was incorrectly called multiple times. Do
+ # nothing.
+ return
+
+ self._root_tag_name = name
+
+ if (
+ name != "html"
+ and self._first_processing_instruction is not None
+ and self._first_processing_instruction.lower().startswith("xml ")
+ ):
+ # We encountered an XML declaration and then a tag other
+ # than 'html'. This is a reliable indicator that a
+ # non-XHTML document is being parsed as XML.
+ self._warn(stacklevel=10)
+
+
+def register_treebuilders_from(module: ModuleType) -> None:
+ """Copy TreeBuilders from the given module into this module."""
+ this_module = sys.modules[__name__]
+ for name in module.__all__:
+ obj = getattr(module, name)
+
+ if issubclass(obj, TreeBuilder):
+ setattr(this_module, name, obj)
+ this_module.__all__.append(name)
+ # Register the builder while we're at it.
+ this_module.builder_registry.register(obj)
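+
+# Illustrative sketch: after the registrations below run, the concrete
+# builder classes copied into this module can be imported from
+# bs4.builder directly (the html.parser builder has no third-party
+# dependencies, so it is always present).
+#
+#   >>> from bs4.builder import HTMLParserTreeBuilder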
+
+
+# Builders are registered in reverse order of priority, so that custom
+# builder registrations will take precedence. In general, we want lxml
+# to take precedence over html5lib, because it's faster. And we only
+# want to use HTMLParser as a last resort.
+from . import _htmlparser # noqa: E402
+
+register_treebuilders_from(_htmlparser)
+try:
+ from . import _html5lib
+
+ register_treebuilders_from(_html5lib)
+except ImportError:
+ # They don't have html5lib installed.
+ pass
+try:
+ from . import _lxml
+
+ register_treebuilders_from(_lxml)
+except ImportError:
+ # They don't have lxml installed.
+ pass
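+
+# Illustrative sketch: because each registration is inserted at the
+# front of the list, lookup() with no arguments returns the most
+# recently registered builder that imported successfully: lxml if it is
+# installed, otherwise html5lib, otherwise the bundled html.parser
+# builder. Which one you get on a given system is therefore not
+# guaranteed.
+#
+#   >>> from bs4.builder import builder_registry
+#   >>> builder_registry.lookup().NAME   # e.g. 'lxml' or 'html.parser'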