From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001 From: S. Solomon Darnell Date: Fri, 28 Mar 2025 21:52:21 -0500 Subject: two version of R2R are here --- .venv/lib/python3.12/site-packages/bs4/element.py | 2886 +++++++++++++++++++++ 1 file changed, 2886 insertions(+) create mode 100644 .venv/lib/python3.12/site-packages/bs4/element.py (limited to '.venv/lib/python3.12/site-packages/bs4/element.py') diff --git a/.venv/lib/python3.12/site-packages/bs4/element.py b/.venv/lib/python3.12/site-packages/bs4/element.py new file mode 100644 index 00000000..6276054b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/element.py @@ -0,0 +1,2886 @@ +from __future__ import annotations + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import re +import warnings + +from bs4.css import CSS +from bs4._deprecation import ( + _deprecated, + _deprecated_alias, + _deprecated_function_alias, +) +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) +from bs4._warnings import AttributeResemblesVariableWarning + +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + Iterator, + List, + Mapping, + Optional, + Pattern, + Set, + TYPE_CHECKING, + Tuple, + Type, + TypeVar, + Union, + cast, +) +from typing_extensions import ( + Self, + TypeAlias, +) + +if TYPE_CHECKING: + from bs4 import BeautifulSoup + from bs4.builder import TreeBuilder + from bs4.filter import ElementFilter + from bs4.formatter import ( + _EntitySubstitutionFunction, + _FormatterOrName, + ) + from bs4._typing import ( + _AtMostOneElement, + _AttributeValue, + _AttributeValues, + _Encoding, + _InsertableElement, + _OneElement, + _QueryResults, + _RawOrProcessedAttributeValues, + _StrainableElement, + _StrainableAttribute, + _StrainableAttributes, + _StrainableString, + ) + +_OneOrMoreStringTypes: TypeAlias = Union[ + Type["NavigableString"], Iterable[Type["NavigableString"]] +] + +_FindMethodName: TypeAlias = 
Optional[Union["_StrainableElement", "ElementFilter"]] + +# Deprecated module-level attributes. +# See https://peps.python.org/pep-0562/ +_deprecated_names = dict( + whitespace_re="The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy." +) +#: :meta private: +_deprecated_whitespace_re: Pattern[str] = re.compile(r"\s+") + + +def __getattr__(name: str) -> Any: + if name in _deprecated_names: + message = _deprecated_names[name] + warnings.warn(message.format(name=name), DeprecationWarning, stacklevel=2) + + return globals()[f"_deprecated_{name}"] + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +#: Documents output by Beautiful Soup will be encoded with +#: this encoding unless you specify otherwise. +DEFAULT_OUTPUT_ENCODING: str = "utf-8" + +#: A regular expression that can be used to split on whitespace. +nonwhitespace_re: Pattern[str] = re.compile(r"\S+") + +#: These encodings are recognized by Python (so `Tag.encode` +#: could theoretically support them) but XML and HTML don't recognize +#: them (so they should not show up in an XML or HTML document as that +#: document's encoding). +#: +#: If an XML document is encoded in one of these encodings, no encoding +#: will be mentioned in the XML declaration. If an HTML document is +#: encoded in one of these encodings, and the HTML document has a +#: tag that mentions an encoding, the encoding will be given as +#: the empty string. +#: +#: Source: +#: Python documentation, `Python Specific Encodings `_ +PYTHON_SPECIFIC_ENCODINGS: Set[_Encoding] = set( + [ + "idna", + "mbcs", + "oem", + "palmos", + "punycode", + "raw_unicode_escape", + "undefined", + "unicode_escape", + "raw-unicode-escape", + "unicode-escape", + "string-escape", + "string_escape", + ] +) + + +class NamespacedAttribute(str): + """A namespaced attribute (e.g. the 'xml:lang' in 'xml:lang="en"') + which remembers the namespace prefix ('xml') and the name ('lang') + that were used to create it. 
+ """ + + prefix: Optional[str] + name: Optional[str] + namespace: Optional[str] + + def __new__( + cls, + prefix: Optional[str], + name: Optional[str] = None, + namespace: Optional[str] = None, + ) -> Self: + if not name: + # This is the default namespace. Its name "has no value" + # per https://www.w3.org/TR/xml-names/#defaulting + name = None + + if not name: + obj = str.__new__(cls, prefix) + elif not prefix: + # Not really namespaced. + obj = str.__new__(cls, name) + else: + obj = str.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + + +class AttributeValueWithCharsetSubstitution(str): + """An abstract class standing in for a character encoding specified + inside an HTML ``<meta>`` tag. + + Subclasses exist for each place such a character encoding might be + found: either inside the ``charset`` attribute + (`CharsetMetaAttributeValue`) or inside the ``content`` attribute + (`ContentMetaAttributeValue`) + + This allows Beautiful Soup to replace that part of the HTML file + with a different encoding when outputting a tree as a string. + """ + + # The original, un-encoded value of the ``content`` attribute. + #: :meta private: + original_value: str + + def substitute_encoding(self, eventual_encoding: str) -> str: + """Do whatever's necessary in this implementation-specific + portion of an HTML document to substitute in a specific encoding. + """ + raise NotImplementedError() + + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a ``<meta>`` tag's ``charset`` + attribute. + + When Beautiful Soup parses the markup ``<meta charset="utf8">``, the + value of the ``charset`` attribute will become one of these objects. + + If the document is later encoded to an encoding other than UTF-8, its + ``<meta>`` tag will mention the new encoding instead of ``utf8``. 
+ """ + + def __new__(cls, original_value: str) -> Self: + # We don't need to use the original value for anything, but + # it might be useful for the user to know. + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str: + """When an HTML document is being encoded to a given encoding, the + value of a ``<meta>`` tag's ``charset`` becomes the name of + the encoding. + """ + if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS: + return "" + return eventual_encoding + + +class AttributeValueList(List[str]): + """Class for the list used to hold the values of attributes which + have multiple values (such as HTML's 'class'). It's just a regular + list, but you can subclass it and pass it in to the TreeBuilder + constructor as attribute_value_list_class, to have your subclass + instantiated instead. + """ + + +class AttributeDict(Dict[Any,Any]): + """Superclass for the dictionary used to hold a tag's + attributes. You can use this, but it's just a regular dict with no + special logic. + """ + + +class XMLAttributeDict(AttributeDict): + """A dictionary for holding a Tag's attributes, which processes + incoming values for consistency with the XML spec. + """ + + def __setitem__(self, key: str, value: Any) -> None: + """Set an attribute value, possibly modifying it to comply with + the XML spec. + + This just means converting common non-string values to + strings: XML attributes may have "any literal string as a + value." + """ + if value is None: + value = "" + if isinstance(value, bool): + # XML does not define any rules for boolean attributes. + # Preserve the old Beautiful Soup behavior (a bool that + # gets converted to a string on output) rather than + # guessing what the value should be. 
+ pass + elif isinstance(value, (int, float)): + # It's dangerous to convert _every_ attribute value into a + # plain string, since an attribute value may be a more + # sophisticated string-like object + # (e.g. CharsetMetaAttributeValue). But we can definitely + # convert numeric values and booleans, which are the most common. + value = str(value) + + super().__setitem__(key, value) + + +class HTMLAttributeDict(AttributeDict): + """A dictionary for holding a Tag's attributes, which processes + incoming values for consistency with the HTML spec, which says + 'Attribute values are a mixture of text and character + references...' + + Basically, this means converting common non-string values into + strings, like XMLAttributeDict, though HTML also has some rules + around boolean attributes that XML doesn't have. + """ + + def __setitem__(self, key: str, value: Any) -> None: + """Set an attribute value, possibly modifying it to comply + with the HTML spec, + """ + if value in (False, None): + # 'The values "true" and "false" are not allowed on + # boolean attributes. To represent a false value, the + # attribute has to be omitted altogether.' + if key in self: + del self[key] + return + if isinstance(value, bool): + # 'If the [boolean] attribute is present, its value must + # either be the empty string or a value that is an ASCII + # case-insensitive match for the attribute's canonical + # name, with no leading or trailing whitespace.' + # + # [fixme] It's not clear to me whether "canonical name" + # means fully-qualified name, unqualified name, or + # (probably not) name with namespace prefix. For now I'm + # going with unqualified name. + if isinstance(key, NamespacedAttribute): + value = key.name + else: + value = key + elif isinstance(value, (int, float)): + # See note in XMLAttributeDict for the reasoning why we + # only do this to numbers. 
+ value = str(value) + super().__setitem__(key, value) + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a ``<meta>`` tag's ``content`` + attribute. + + When Beautiful Soup parses the markup: + ``<meta http-equiv="content-type" content="text/html; charset=utf8">`` + + The value of the ``content`` attribute will become one of these objects. + + If the document is later encoded to an encoding other than UTF-8, its + ``<meta>`` tag will mention the new encoding instead of ``utf8``. + """ + + #: Match the 'charset' argument inside the 'content' attribute + #: of a <meta> tag. + #: :meta private: + CHARSET_RE: Pattern[str] = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value: str) -> Self: + cls.CHARSET_RE.search(original_value) + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def substitute_encoding(self, eventual_encoding: _Encoding = "utf-8") -> str: + """When an HTML document is being encoded to a given encoding, the + value of the ``charset=`` in a ``<meta>`` tag's ``content`` becomes + the name of the encoding. + """ + if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS: + return self.CHARSET_RE.sub("", self.original_value) + + def rewrite(match: re.Match[str]) -> str: + return match.group(1) + eventual_encoding + + return self.CHARSET_RE.sub(rewrite, self.original_value) + + +class PageElement(object): + """An abstract class representing a single element in the parse tree. + + `NavigableString`, `Tag`, etc. are all subclasses of + `PageElement`. For this reason you'll see a lot of methods that + return `PageElement`, but you'll never see an actual `PageElement` + object. For the most part you can think of `PageElement` as + meaning "a `Tag` or a `NavigableString`." + """ + + #: In general, we can't tell just by looking at an element whether + #: it's contained in an XML document or an HTML document. But for + #: `Tag` objects (q.v.) we can store this information at parse time. 
+ #: :meta private: + known_xml: Optional[bool] = None + + #: Whether or not this element has been decomposed from the tree + #: it was created in. + _decomposed: bool + + parent: Optional[Tag] + next_element: _AtMostOneElement + previous_element: _AtMostOneElement + next_sibling: _AtMostOneElement + previous_sibling: _AtMostOneElement + + #: Whether or not this element is hidden from generated output. + #: Only the `BeautifulSoup` object itself is hidden. + hidden: bool = False + + def setup( + self, + parent: Optional[Tag] = None, + previous_element: _AtMostOneElement = None, + next_element: _AtMostOneElement = None, + previous_sibling: _AtMostOneElement = None, + next_sibling: _AtMostOneElement = None, + ) -> None: + """Sets up the initial relations between this element and + other elements. + + :param parent: The parent of this element. + + :param previous_element: The element parsed immediately before + this one. + + :param next_element: The element parsed immediately after + this one. + + :param previous_sibling: The most recently encountered element + on the same level of the parse tree as this one. + + :param next_sibling: The next element to be encountered + on the same level of the parse tree as this one. 
+ """ + self.parent = parent + + self.previous_element = previous_element + if self.previous_element is not None: + self.previous_element.next_element = self + + self.next_element = next_element + if self.next_element is not None: + self.next_element.previous_element = self + + self.next_sibling = next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self + + if ( + previous_sibling is None + and self.parent is not None + and self.parent.contents + ): + previous_sibling = self.parent.contents[-1] + + self.previous_sibling = previous_sibling + if self.previous_sibling is not None: + self.previous_sibling.next_sibling = self + + def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str: + """Format the given string using the given formatter. + + :param s: A string. + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ + if formatter is None: + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) + return output + + def formatter_for_name( + self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction] + ) -> Formatter: + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a `Formatter` object (used as-is), a + function (used as the entity substitution hook for an + `bs4.formatter.XMLFormatter` or + `bs4.formatter.HTMLFormatter`), or a string (used to look + up an `bs4.formatter.XMLFormatter` or + `bs4.formatter.HTMLFormatter` in the appropriate registry. 
+ + """ + if isinstance(formatter_name, Formatter): + return formatter_name + c: type[Formatter] + registry: Mapping[Optional[str], Formatter] + if self._is_xml: + c = XMLFormatter + registry = XMLFormatter.REGISTRY + else: + c = HTMLFormatter + registry = HTMLFormatter.REGISTRY + if callable(formatter_name): + return c(entity_substitution=formatter_name) + return registry[formatter_name] + + @property + def _is_xml(self) -> bool: + """Is this element part of an XML tree or an HTML tree? + + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be + inefficient, but it should be called very rarely. + """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. + return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. + if self.parent is None: + # This is the top-level object. It should have .known_xml set + # from tree creation. If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, "is_xml", False) + return self.parent._is_xml + + nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0") + previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0") + + def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self: + raise NotImplementedError() + + def __copy__(self) -> Self: + """A copy of a PageElement can only be a deep copy, because + only one PageElement can occupy a given place in a parse tree. + """ + return self.__deepcopy__({}) + + default: Iterable[type[NavigableString]] = tuple() #: :meta private: + + def _all_strings( + self, strip: bool = False, types: Iterable[type[NavigableString]] = default + ) -> Iterator[str]: + """Yield all strings of certain classes, possibly stripping them. 
+ + This is implemented differently in `Tag` and `NavigableString`. + """ + raise NotImplementedError() + + @property + def stripped_strings(self) -> Iterator[str]: + """Yield all interesting strings in this PageElement, stripping them + first. + + See `Tag` for information on which strings are considered + interesting in a given context. + """ + for string in self._all_strings(True): + yield string + + def get_text( + self, + separator: str = "", + strip: bool = False, + types: Iterable[Type[NavigableString]] = default, + ) -> str: + """Get all child strings of this PageElement, concatenated using the + given separator. + + :param separator: Strings will be concatenated using this separator. + + :param strip: If True, strings will be stripped before being + concatenated. + + :param types: A tuple of NavigableString subclasses. Any + strings of a subclass not found in this list will be + ignored. Although there are exceptions, the default + behavior in most cases is to consider only NavigableString + and CData objects. That means no comments, processing + instructions, etc. + + :return: A string. + """ + return separator.join([s for s in self._all_strings(strip, types=types)]) + + getText = get_text + text = property(get_text) + + def replace_with(self, *args: PageElement) -> Self: + """Replace this `PageElement` with one or more other `PageElement`, + objects, keeping the rest of the tree the same. + + :return: This `PageElement`, no longer part of the tree. + """ + if self.parent is None: + raise ValueError( + "Cannot replace one element with another when the " + "element to be replaced is not part of a tree." + ) + if len(args) == 1 and args[0] is self: + # Replacing an element with itself is a no-op. 
+ return self + if any(x is self.parent for x in args): + raise ValueError("Cannot replace a Tag with its parent.") + old_parent = self.parent + my_index = self.parent.index(self) + self.extract(_self_index=my_index) + for idx, replace_with in enumerate(args, start=my_index): + old_parent.insert(idx, replace_with) + return self + + replaceWith = _deprecated_function_alias("replaceWith", "replace_with", "4.0.0") + + def wrap(self, wrap_inside: Tag) -> Tag: + """Wrap this `PageElement` inside a `Tag`. + + :return: ``wrap_inside``, occupying the position in the tree that used + to be occupied by this object, and with this object now inside it. + """ + me = self.replace_with(wrap_inside) + wrap_inside.append(me) + return wrap_inside + + def extract(self, _self_index: Optional[int] = None) -> Self: + """Destructively rips this element out of the tree. + + :param _self_index: The location of this element in its parent's + .contents, if known. Passing this in allows for a performance + optimization. + + :return: this `PageElement`, no longer part of the tree. + """ + if self.parent is not None: + if _self_index is None: + _self_index = self.parent.index(self) + del self.parent.contents[_self_index] + + # Find the two elements that would be next to each other if + # this element (and any children) hadn't been parsed. Connect + # the two. + last_child = self._last_descendant() + + # last_child can't be None because we passed accept_self=True + # into _last_descendant. Worst case, last_child will be + # self. Making this cast removes several mypy complaints later + # on as we manipulate last_child. 
+ last_child = cast(PageElement, last_child) + next_element = last_child.next_element + + if self.previous_element is not None: + if self.previous_element is not next_element: + self.previous_element.next_element = next_element + if next_element is not None and next_element is not self.previous_element: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if ( + self.previous_sibling is not None + and self.previous_sibling is not self.next_sibling + ): + self.previous_sibling.next_sibling = self.next_sibling + if ( + self.next_sibling is not None + and self.next_sibling is not self.previous_sibling + ): + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def decompose(self) -> None: + """Recursively destroys this `PageElement` and its children. + + The element will be removed from the tree and wiped out; so + will everything beneath it. + + The behavior of a decomposed `PageElement` is undefined and you + should never use one for anything, but if you need to *check* + whether an element has been decomposed, you can use the + `PageElement.decomposed` property. + """ + self.extract() + e: _AtMostOneElement = self + next_up: _AtMostOneElement = None + while e is not None: + next_up = e.next_element + e.__dict__.clear() + if isinstance(e, Tag): + e.contents = [] + e._decomposed = True + e = next_up + + def _last_descendant( + self, is_initialized: bool = True, accept_self: bool = True + ) -> _AtMostOneElement: + """Finds the last element beneath this object to be parsed. + + Special note to help you figure things out if your type + checking is tripped up by the fact that this method returns + _AtMostOneElement instead of PageElement: the only time + this method returns None is if `accept_self` is False and the + `PageElement` has no children--either it's a NavigableString + or an empty Tag. 
+ + :param is_initialized: Has `PageElement.setup` been called on + this `PageElement` yet? + + :param accept_self: Is ``self`` an acceptable answer to the + question? + """ + if is_initialized and self.next_sibling is not None: + last_child = self.next_sibling.previous_element + else: + last_child = self + while isinstance(last_child, Tag) and last_child.contents: + last_child = last_child.contents[-1] + if not accept_self and last_child is self: + last_child = None + return last_child + + _lastRecursiveChild = _deprecated_alias( + "_lastRecursiveChild", "_last_descendant", "4.0.0" + ) + + def insert_before(self, *args: _InsertableElement) -> List[PageElement]: + """Makes the given element(s) the immediate predecessor of this one. + + All the elements will have the same `PageElement.parent` as + this one, and the given elements will occur immediately before + this one. + + :param args: One or more PageElements. + + :return The list of PageElements that were inserted. + """ + parent = self.parent + if parent is None: + raise ValueError("Element has no parent, so 'before' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + results: List[PageElement] = [] + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + results.extend(parent.insert(index, predecessor)) + + return results + + def insert_after(self, *args: _InsertableElement) -> List[PageElement]: + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same `PageElement.parent` as this + one, and the given elements will occur immediately after this + one. + + :param args: One or more PageElements. + + :return The list of PageElements that were inserted. + """ + # Do all error checking before modifying the tree. 
+ parent = self.parent + if parent is None: + raise ValueError("Element has no parent, so 'after' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + results: List[PageElement] = [] + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + results.extend(parent.insert(index + 1 + offset, successor)) + offset += 1 + + return results + + def find_next( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Find the first PageElement that matches the given criteria and + appears later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :kwargs: Additional filters on attribute values. + """ + return self._find_one(self.find_all_next, name, attrs, string, **kwargs) + + findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0") + + def find_all_next( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Find all `PageElement` objects that match the given criteria and + appear later in the document than this `PageElement`. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. 
+ :param string: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :param _stacklevel: Used internally to improve warning messages. + :kwargs: Additional filters on attribute values. + """ + return self._find_all( + name, + attrs, + string, + limit, + self.next_elements, + _stacklevel=_stacklevel + 1, + **kwargs, + ) + + findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0") + + def find_next_sibling( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Find the closest sibling to this PageElement that matches the + given criteria and appears later in the document. + + All find_* methods take a common set of arguments. See the + online documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a `NavigableString` with specific text. + :kwargs: Additional filters on attribute values. + """ + return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs) + + findNextSibling = _deprecated_function_alias( + "findNextSibling", "find_next_sibling", "4.0.0" + ) + + def find_next_siblings( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Find all siblings of this `PageElement` that match the given criteria + and appear later in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a `NavigableString` with specific text. 
+ :param limit: Stop looking after finding this many results. + :param _stacklevel: Used internally to improve warning messages. + :kwargs: Additional filters on attribute values. + """ + return self._find_all( + name, + attrs, + string, + limit, + self.next_siblings, + _stacklevel=_stacklevel + 1, + **kwargs, + ) + + findNextSiblings = _deprecated_function_alias( + "findNextSiblings", "find_next_siblings", "4.0.0" + ) + fetchNextSiblings = _deprecated_function_alias( + "fetchNextSiblings", "find_next_siblings", "3.0.0" + ) + + def find_previous( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Look backwards in the document from this `PageElement` and find the + first `PageElement` that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a `NavigableString` with specific text. + :kwargs: Additional filters on attribute values. + """ + return self._find_one(self.find_all_previous, name, attrs, string, **kwargs) + + findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0") + + def find_all_previous( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Look backwards in the document from this `PageElement` and find all + `PageElement` that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. 
+ :param string: A filter for a `NavigableString` with specific text. + :param limit: Stop looking after finding this many results. + :param _stacklevel: Used internally to improve warning messages. + :kwargs: Additional filters on attribute values. + """ + return self._find_all( + name, + attrs, + string, + limit, + self.previous_elements, + _stacklevel=_stacklevel + 1, + **kwargs, + ) + + findAllPrevious = _deprecated_function_alias( + "findAllPrevious", "find_all_previous", "4.0.0" + ) + fetchAllPrevious = _deprecated_function_alias( + "fetchAllPrevious", "find_all_previous", "3.0.0" + ) + + def find_previous_sibling( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Returns the closest sibling to this `PageElement` that matches the + given criteria and appears earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a `NavigableString` with specific text. + :kwargs: Additional filters on attribute values. + """ + return self._find_one( + self.find_previous_siblings, name, attrs, string, **kwargs + ) + + findPreviousSibling = _deprecated_function_alias( + "findPreviousSibling", "find_previous_sibling", "4.0.0" + ) + + def find_previous_siblings( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Returns all siblings to this PageElement that match the + given criteria and appear earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. 
+ + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :param _stacklevel: Used internally to improve warning messages. + :kwargs: Additional filters on attribute values. + """ + return self._find_all( + name, + attrs, + string, + limit, + self.previous_siblings, + _stacklevel=_stacklevel + 1, + **kwargs, + ) + + findPreviousSiblings = _deprecated_function_alias( + "findPreviousSiblings", "find_previous_siblings", "4.0.0" + ) + fetchPreviousSiblings = _deprecated_function_alias( + "fetchPreviousSiblings", "find_previous_siblings", "3.0.0" + ) + + def find_parent( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Find the closest parent of this PageElement that matches the given + criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param self: Whether the PageElement itself should be considered + as one of its 'parents'. + :kwargs: Additional filters on attribute values. + """ + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. + r = None + results = self.find_parents( + name, attrs, 1, _stacklevel=3, **kwargs + ) + if results: + r = results[0] + return r + + findParent = _deprecated_function_alias("findParent", "find_parent", "4.0.0") + + def find_parents( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Find all parents of this `PageElement` that match the given criteria. + + All find_* methods take a common set of arguments. 
@property
def next(self) -> _AtMostOneElement:
    """Shorthand for `next_element`: the `PageElement`, if any, that was
    parsed just after this one."""
    following = self.next_element
    return following


@property
def previous(self) -> _AtMostOneElement:
    """Shorthand for `previous_element`: the `PageElement`, if any, that
    was parsed just before this one."""
    preceding = self.previous_element
    return preceding
def _find_all(
    self,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Iterates over a generator looking for things that match.

    This is the shared back end of the public find_* methods, each of
    which supplies the generator that determines the search direction.

    :param name: A filter on tag name, or an `ElementFilter` that is
        used directly as the matcher.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results. A falsy
        value (None or 0) does not disable the fast paths below.
    :param generator: The elements to be examined, in order.
    :param _stacklevel: Passed as ``stacklevel`` to `warnings.warn` for
        the warnings issued here.
    :kwargs: Additional filters on attribute values. The deprecated
        ``text`` keyword is honored as ``string`` when ``string`` is None.
    :return: A `ResultSet` from one of the fast paths, or whatever
        ``matcher.find_all(generator, limit)`` returns.
    """

    # Backwards compatibility: 'text' is the old spelling of 'string'.
    # It is only consumed when no explicit string filter was given;
    # otherwise it stays in kwargs and is treated like any other keyword.
    if string is None and "text" in kwargs:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # '_class' looks like a typo for 'class_'. This only warns: the
    # keyword is left in kwargs under its original name.
    if "_class" in kwargs:
        warnings.warn(
            AttributeResemblesVariableWarning.MESSAGE
            % dict(
                original="_class",
                autocorrect="class_",
            ),
            AttributeResemblesVariableWarning,
            stacklevel=_stacklevel,
        )

    # Imported at call time; at module level this name is only imported
    # under TYPE_CHECKING.
    # NOTE(review): presumably this avoids a circular import -- confirm.
    from bs4.filter import ElementFilter

    if isinstance(name, ElementFilter):
        # The caller supplied a ready-made filter; attrs, string and
        # kwargs are not folded into it.
        matcher = name
    else:
        matcher = SoupStrainer(name, attrs, string, **kwargs)

    result: Iterable[_OneElement]
    # The fast paths apply only when the tag name is the sole criterion.
    if string is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Optimization to find all tags.
            result = (element for element in generator if isinstance(element, Tag))
            return ResultSet(matcher, result)
        elif isinstance(name, str):
            # Optimization to find all tags with a given name.
            if name.count(":") == 1:
                # This is a name with a prefix. If this is a namespace-aware document,
                # we need to match the local name against tag.name. If not,
                # we need to match the fully-qualified name against tag.name.
                prefix, local_name = name.split(":", 1)
            else:
                prefix = None
                local_name = name
            result = []
            for element in generator:
                if not isinstance(element, Tag):
                    continue
                # Accept either an exact match on the full name, or a
                # match on the local name with a compatible prefix.
                if element.name == name or (
                    element.name == local_name
                    and (prefix is None or element.prefix == prefix)
                ):
                    result.append(element)
            return ResultSet(matcher, result)
    # General case: let the matcher walk the generator itself.
    return matcher.find_all(generator, limit)
@property
def parents(self) -> Iterator[Tag]:
    """Walk upward from this PageElement through each of its ancestors.

    The element itself is not included; iteration begins at ``.parent``
    and ends once an ancestor with no parent has been yielded.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # Look one level up *before* yielding, so the walk is unaffected
        # if the consumer changes this ancestor's .parent in the meantime.
        next_up = ancestor.parent
        yield ancestor
        ancestor = next_up
class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: A string prepended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '<!--'
    #: in an HTML comment.
    PREFIX: str = ""

    #: A string appended to the body of the 'real' string
    #: when formatting it as part of a document, such as the '-->'
    #: in an HTML comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.

        :param value: The text of the string, either as a `str` or as
            bytes encoded in DEFAULT_OUTPUT_ENCODING.
        """
        if isinstance(value, str):
            u = str.__new__(cls, value)
        else:
            u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        u.hidden = False
        # setup() is inherited, not defined in this class.
        # NOTE(review): presumably it initializes the tree links -- confirm.
        u.setup()
        return u

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.

        :param memo: Unused; accepted for the `copy.deepcopy` protocol.
        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        return type(self)(self)

    def __getnewargs__(self) -> Tuple[str]:
        """Provide the argument `__new__` receives when unpickling."""
        return (str(self),)

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: This property always returns the `NavigableString` it was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Run the string through the provided formatter, making it
        ready for output as part of an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        :return: The formatted string, wrapped in this class's `PREFIX`
            and `SUFFIX`.
        """
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX

    @property
    def name(self) -> None:
        """Since a NavigableString is not a Tag, it has no .name.

        This property is implemented so that code like this doesn't crash
        when run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Prevent NavigableString.name from ever being set.

        :raises AttributeError: Always.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield all strings of certain classes, possibly stripping them.

        This makes it easy for NavigableString to implement methods
        like get_text() as conveniences, creating a consistent
        text-extraction API across all PageElements.

        :param strip: If True, all strings will be stripped before being
            yielded.

        :param types: A tuple of NavigableString subclasses. If this
            NavigableString isn't one of those subclasses, the
            sequence will be empty. By default, the subclasses
            considered are NavigableString and CData objects. That
            means no comments, processing instructions, etc.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # This is kept in Tag because it's full of subclasses of
            # this class, which aren't defined until later in the file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # Do nothing if the caller is looking for specific types of
        # string, and we're of a different type.
        #
        # We check specific types instead of using isinstance(self,
        # types) because all of these classes subclass
        # NavigableString. Anyone who's using this feature probably
        # wants generic NavigableStrings but not other stuff.
        my_type = type(self)
        if types is not None:
            if isinstance(types, type):
                # Looking for a single type.
                if my_type is not types:
                    return
            elif my_type not in types:
                # Looking for one of a list of types.
                return

        final_value = self.strip() if strip else self
        # Never yield an empty string (e.g. whitespace-only text that
        # was just stripped).
        if len(final_value) > 0:
            yield final_value

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        This is defined the way it is for compatibility with
        `Tag.strings`. See `Tag` for information on which strings are
        interesting in a given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
def output_ready(self, formatter: Optional[_FormatterOrName] = None) -> str:
    """Wrap this string in its class-specific prefix and suffix so it
    can be written out as part of a document.

    :param formatter: A `Formatter` object, or a string naming one
        of the standard formatters. When one is given, the string is
        still passed through it, but purely for any side effects: the
        formatter's return value is discarded.

    :return: The unmodified string, with any subclass-specific prefix
        and suffix added on.
    """
    if formatter is not None:
        # Deliberately ignore the result; the text is emitted verbatim.
        self.format_string(self, formatter)
    opening, closing = self.PREFIX, self.SUFFIX
    return opening + self + closing
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' + """ + return Doctype(cls._string_for_name_and_ids(name, pub_id, system_id)) + + @classmethod + def _string_for_name_and_ids( + self, name: str, pub_id: Optional[str], system_id: Optional[str] + ) -> str: + """Generate a string to be used as the basis of a Doctype object. + + This is a separate method from for_name_and_ids() because the lxml + TreeBuilder needs to call it. + """ + value = name or "" + if pub_id is not None: + value += ' PUBLIC "%s"' % pub_id + if system_id is not None: + value += ' "%s"' % system_id + elif system_id is not None: + value += ' SYSTEM "%s"' % system_id + return value + + PREFIX: str = "\n" + + +class Stylesheet(NavigableString): + """A `NavigableString` representing the contents of a `