diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/element.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/bs4/element.py | 2886 |
1 files changed, 2886 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/element.py b/.venv/lib/python3.12/site-packages/bs4/element.py new file mode 100644 index 00000000..6276054b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/element.py @@ -0,0 +1,2886 @@ +from __future__ import annotations + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import re +import warnings + +from bs4.css import CSS +from bs4._deprecation import ( + _deprecated, + _deprecated_alias, + _deprecated_function_alias, +) +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) +from bs4._warnings import AttributeResemblesVariableWarning + +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + Iterator, + List, + Mapping, + Optional, + Pattern, + Set, + TYPE_CHECKING, + Tuple, + Type, + TypeVar, + Union, + cast, +) +from typing_extensions import ( + Self, + TypeAlias, +) + +if TYPE_CHECKING: + from bs4 import BeautifulSoup + from bs4.builder import TreeBuilder + from bs4.filter import ElementFilter + from bs4.formatter import ( + _EntitySubstitutionFunction, + _FormatterOrName, + ) + from bs4._typing import ( + _AtMostOneElement, + _AttributeValue, + _AttributeValues, + _Encoding, + _InsertableElement, + _OneElement, + _QueryResults, + _RawOrProcessedAttributeValues, + _StrainableElement, + _StrainableAttribute, + _StrainableAttributes, + _StrainableString, + ) + +_OneOrMoreStringTypes: TypeAlias = Union[ + Type["NavigableString"], Iterable[Type["NavigableString"]] +] + +_FindMethodName: TypeAlias = Optional[Union["_StrainableElement", "ElementFilter"]] + +# Deprecated module-level attributes. +# See https://peps.python.org/pep-0562/ +_deprecated_names = dict( + whitespace_re="The {name} attribute was deprecated in version 4.7.0. If you need it, make your own copy." 
class NamespacedAttribute(str):
    """A namespaced attribute (e.g. the 'xml:lang' in 'xml:lang="en"')
    which remembers the namespace prefix ('xml') and the name ('lang')
    that were used to create it.

    The string value of the object is:

    * ``prefix:name`` when both a prefix and a name are given;
    * ``name`` when there is no prefix (not really namespaced);
    * ``prefix`` when there is no name (the default namespace);
    * the empty string when neither is given.

    :param prefix: The namespace prefix, if any.
    :param name: The local name. An empty name is normalized to None.
    :param namespace: The namespace URI associated with the prefix, if known.
    """

    prefix: Optional[str]
    name: Optional[str]
    namespace: Optional[str]

    def __new__(
        cls,
        prefix: Optional[str],
        name: Optional[str] = None,
        namespace: Optional[str] = None,
    ) -> Self:
        if not name:
            # This is the default namespace. Its name "has no value"
            # per https://www.w3.org/TR/xml-names/#defaulting
            name = None

        if name is None:
            # str.__new__(cls, None) would produce the literal string
            # "None", so a missing prefix is rendered as the empty string.
            obj = str.__new__(cls, prefix if prefix is not None else "")
        elif not prefix:
            # Not really namespaced.
            obj = str.__new__(cls, name)
        else:
            obj = str.__new__(cls, f"{prefix}:{name}")

        # The original arguments are kept as given (apart from the
        # empty-name normalization above) so callers can inspect them.
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj
class AttributeValueList(List[str]):
    """Class for the list used to hold the values of attributes which
    have multiple values (such as HTML's 'class'). It's just a regular
    list, but you can subclass it and pass it in to the TreeBuilder
    constructor as attribute_value_list_class, to have your subclass
    instantiated instead.
    """


class AttributeDict(Dict[Any, Any]):
    """Superclass for the dictionary used to hold a tag's
    attributes. You can use this, but it's just a regular dict with no
    special logic.
    """


class XMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the XML spec.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply with
        the XML spec.

        This just means converting common non-string values to
        strings: XML attributes may have "any literal string as a
        value."

        :param key: The attribute name.
        :param value: The attribute value. None is stored as the empty
            string, numbers are stored as their string form, and
            everything else (including booleans) is stored unchanged.
        """
        if value is None:
            value = ""
        if isinstance(value, bool):
            # XML does not define any rules for boolean attributes.
            # Preserve the old Beautiful Soup behavior (a bool that
            # gets converted to a string on output) rather than
            # guessing what the value should be.
            pass
        elif isinstance(value, (int, float)):
            # It's dangerous to convert _every_ attribute value into a
            # plain string, since an attribute value may be a more
            # sophisticated string-like object
            # (e.g. CharsetMetaAttributeValue). But we can definitely
            # convert numeric values, which are the most common
            # non-string values.
            value = str(value)

        super().__setitem__(key, value)


class HTMLAttributeDict(AttributeDict):
    """A dictionary for holding a Tag's attributes, which processes
    incoming values for consistency with the HTML spec, which says
    'Attribute values are a mixture of text and character
    references...'

    Basically, this means converting common non-string values into
    strings, like XMLAttributeDict, though HTML also has some rules
    around boolean attributes that XML doesn't have.
    """

    def __setitem__(self, key: str, value: Any) -> None:
        """Set an attribute value, possibly modifying it to comply
        with the HTML spec.

        :param key: The attribute name.
        :param value: The attribute value. ``False`` and ``None`` remove
            the attribute, ``True`` stores the attribute's own name,
            numbers are stored as their string form, and everything
            else is stored unchanged.
        """
        # Identity checks are deliberate: ``value in (False, None)``
        # compares by equality, and since ``0 == False`` it would
        # silently drop numeric zero instead of storing "0".
        if value is None or value is False:
            # 'The values "true" and "false" are not allowed on
            # boolean attributes. To represent a false value, the
            # attribute has to be omitted altogether.'
            if key in self:
                del self[key]
            return
        if isinstance(value, bool):
            # 'If the [boolean] attribute is present, its value must
            # either be the empty string or a value that is an ASCII
            # case-insensitive match for the attribute's canonical
            # name, with no leading or trailing whitespace.'
            #
            # [fixme] It's not clear to me whether "canonical name"
            # means fully-qualified name, unqualified name, or
            # (probably not) name with namespace prefix. For now I'm
            # going with unqualified name.
            if isinstance(key, NamespacedAttribute):
                value = key.name
            else:
                value = key
        elif isinstance(value, (int, float)):
            # See note in XMLAttributeDict for the reasoning why we
            # only do this to numbers.
            value = str(value)
        super().__setitem__(key, value)
But for + #: `Tag` objects (q.v.) we can store this information at parse time. + #: :meta private: + known_xml: Optional[bool] = None + + #: Whether or not this element has been decomposed from the tree + #: it was created in. + _decomposed: bool + + parent: Optional[Tag] + next_element: _AtMostOneElement + previous_element: _AtMostOneElement + next_sibling: _AtMostOneElement + previous_sibling: _AtMostOneElement + + #: Whether or not this element is hidden from generated output. + #: Only the `BeautifulSoup` object itself is hidden. + hidden: bool = False + + def setup( + self, + parent: Optional[Tag] = None, + previous_element: _AtMostOneElement = None, + next_element: _AtMostOneElement = None, + previous_sibling: _AtMostOneElement = None, + next_sibling: _AtMostOneElement = None, + ) -> None: + """Sets up the initial relations between this element and + other elements. + + :param parent: The parent of this element. + + :param previous_element: The element parsed immediately before + this one. + + :param next_element: The element parsed immediately before + this one. + + :param previous_sibling: The most recently encountered element + on the same level of the parse tree as this one. + + :param previous_sibling: The next element to be encountered + on the same level of the parse tree as this one. 
+ """ + self.parent = parent + + self.previous_element = previous_element + if self.previous_element is not None: + self.previous_element.next_element = self + + self.next_element = next_element + if self.next_element is not None: + self.next_element.previous_element = self + + self.next_sibling = next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self + + if ( + previous_sibling is None + and self.parent is not None + and self.parent.contents + ): + previous_sibling = self.parent.contents[-1] + + self.previous_sibling = previous_sibling + if self.previous_sibling is not None: + self.previous_sibling.next_sibling = self + + def format_string(self, s: str, formatter: Optional[_FormatterOrName]) -> str: + """Format the given string using the given formatter. + + :param s: A string. + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ + if formatter is None: + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) + return output + + def formatter_for_name( + self, formatter_name: Union[_FormatterOrName, _EntitySubstitutionFunction] + ) -> Formatter: + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a `Formatter` object (used as-is), a + function (used as the entity substitution hook for an + `bs4.formatter.XMLFormatter` or + `bs4.formatter.HTMLFormatter`), or a string (used to look + up an `bs4.formatter.XMLFormatter` or + `bs4.formatter.HTMLFormatter` in the appropriate registry. 
+ + """ + if isinstance(formatter_name, Formatter): + return formatter_name + c: type[Formatter] + registry: Mapping[Optional[str], Formatter] + if self._is_xml: + c = XMLFormatter + registry = XMLFormatter.REGISTRY + else: + c = HTMLFormatter + registry = HTMLFormatter.REGISTRY + if callable(formatter_name): + return c(entity_substitution=formatter_name) + return registry[formatter_name] + + @property + def _is_xml(self) -> bool: + """Is this element part of an XML tree or an HTML tree? + + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be + inefficient, but it should be called very rarely. + """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. + return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. + if self.parent is None: + # This is the top-level object. It should have .known_xml set + # from tree creation. If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, "is_xml", False) + return self.parent._is_xml + + nextSibling = _deprecated_alias("nextSibling", "next_sibling", "4.0.0") + previousSibling = _deprecated_alias("previousSibling", "previous_sibling", "4.0.0") + + def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self: + raise NotImplementedError() + + def __copy__(self) -> Self: + """A copy of a PageElement can only be a deep copy, because + only one PageElement can occupy a given place in a parse tree. + """ + return self.__deepcopy__({}) + + default: Iterable[type[NavigableString]] = tuple() #: :meta private: + + def _all_strings( + self, strip: bool = False, types: Iterable[type[NavigableString]] = default + ) -> Iterator[str]: + """Yield all strings of certain classes, possibly stripping them. 
+ + This is implemented differently in `Tag` and `NavigableString`. + """ + raise NotImplementedError() + + @property + def stripped_strings(self) -> Iterator[str]: + """Yield all interesting strings in this PageElement, stripping them + first. + + See `Tag` for information on which strings are considered + interesting in a given context. + """ + for string in self._all_strings(True): + yield string + + def get_text( + self, + separator: str = "", + strip: bool = False, + types: Iterable[Type[NavigableString]] = default, + ) -> str: + """Get all child strings of this PageElement, concatenated using the + given separator. + + :param separator: Strings will be concatenated using this separator. + + :param strip: If True, strings will be stripped before being + concatenated. + + :param types: A tuple of NavigableString subclasses. Any + strings of a subclass not found in this list will be + ignored. Although there are exceptions, the default + behavior in most cases is to consider only NavigableString + and CData objects. That means no comments, processing + instructions, etc. + + :return: A string. + """ + return separator.join([s for s in self._all_strings(strip, types=types)]) + + getText = get_text + text = property(get_text) + + def replace_with(self, *args: PageElement) -> Self: + """Replace this `PageElement` with one or more other `PageElement`, + objects, keeping the rest of the tree the same. + + :return: This `PageElement`, no longer part of the tree. + """ + if self.parent is None: + raise ValueError( + "Cannot replace one element with another when the " + "element to be replaced is not part of a tree." + ) + if len(args) == 1 and args[0] is self: + # Replacing an element with itself is a no-op. 
def wrap(self, wrap_inside: Tag) -> Tag:
    """Put this `PageElement` inside the given `Tag`.

    The tag takes over this element's position via `replace_with`,
    and this element is then appended to the tag.

    :param wrap_inside: The `Tag` that will contain this element.

    :return: ``wrap_inside``, occupying the position in the tree that used
       to be occupied by this object, and with this object now inside it.
    """
    # replace_with() returns the element it removed, which is then
    # appended to the wrapper.
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside
+ last_child = cast(PageElement, last_child) + next_element = last_child.next_element + + if self.previous_element is not None: + if self.previous_element is not next_element: + self.previous_element.next_element = next_element + if next_element is not None and next_element is not self.previous_element: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if ( + self.previous_sibling is not None + and self.previous_sibling is not self.next_sibling + ): + self.previous_sibling.next_sibling = self.next_sibling + if ( + self.next_sibling is not None + and self.next_sibling is not self.previous_sibling + ): + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def decompose(self) -> None: + """Recursively destroys this `PageElement` and its children. + + The element will be removed from the tree and wiped out; so + will everything beneath it. + + The behavior of a decomposed `PageElement` is undefined and you + should never use one for anything, but if you need to *check* + whether an element has been decomposed, you can use the + `PageElement.decomposed` property. + """ + self.extract() + e: _AtMostOneElement = self + next_up: _AtMostOneElement = None + while e is not None: + next_up = e.next_element + e.__dict__.clear() + if isinstance(e, Tag): + e.contents = [] + e._decomposed = True + e = next_up + + def _last_descendant( + self, is_initialized: bool = True, accept_self: bool = True + ) -> _AtMostOneElement: + """Finds the last element beneath this object to be parsed. + + Special note to help you figure things out if your type + checking is tripped up by the fact that this method returns + _AtMostOneElement instead of PageElement: the only time + this method returns None is if `accept_self` is False and the + `PageElement` has no children--either it's a NavigableString + or an empty Tag. 
+ + :param is_initialized: Has `PageElement.setup` been called on + this `PageElement` yet? + + :param accept_self: Is ``self`` an acceptable answer to the + question? + """ + if is_initialized and self.next_sibling is not None: + last_child = self.next_sibling.previous_element + else: + last_child = self + while isinstance(last_child, Tag) and last_child.contents: + last_child = last_child.contents[-1] + if not accept_self and last_child is self: + last_child = None + return last_child + + _lastRecursiveChild = _deprecated_alias( + "_lastRecursiveChild", "_last_descendant", "4.0.0" + ) + + def insert_before(self, *args: _InsertableElement) -> List[PageElement]: + """Makes the given element(s) the immediate predecessor of this one. + + All the elements will have the same `PageElement.parent` as + this one, and the given elements will occur immediately before + this one. + + :param args: One or more PageElements. + + :return The list of PageElements that were inserted. + """ + parent = self.parent + if parent is None: + raise ValueError("Element has no parent, so 'before' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + results: List[PageElement] = [] + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + results.extend(parent.insert(index, predecessor)) + + return results + + def insert_after(self, *args: _InsertableElement) -> List[PageElement]: + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same `PageElement.parent` as this + one, and the given elements will occur immediately after this + one. + + :param args: One or more PageElements. + + :return The list of PageElements that were inserted. + """ + # Do all error checking before modifying the tree. 
+ parent = self.parent + if parent is None: + raise ValueError("Element has no parent, so 'after' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + results: List[PageElement] = [] + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + results.extend(parent.insert(index + 1 + offset, successor)) + offset += 1 + + return results + + def find_next( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Find the first PageElement that matches the given criteria and + appears later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :kwargs: Additional filters on attribute values. + """ + return self._find_one(self.find_all_next, name, attrs, string, **kwargs) + + findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0") + + def find_all_next( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Find all `PageElement` objects that match the given criteria and + appear later in the document than this `PageElement`. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. 
+ :param string: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :param _stacklevel: Used internally to improve warning messages. + :kwargs: Additional filters on attribute values. + """ + return self._find_all( + name, + attrs, + string, + limit, + self.next_elements, + _stacklevel=_stacklevel + 1, + **kwargs, + ) + + findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0") + + def find_next_sibling( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Find the closest sibling to this PageElement that matches the + given criteria and appears later in the document. + + All find_* methods take a common set of arguments. See the + online documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a `NavigableString` with specific text. + :kwargs: Additional filters on attribute values. + """ + return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs) + + findNextSibling = _deprecated_function_alias( + "findNextSibling", "find_next_sibling", "4.0.0" + ) + + def find_next_siblings( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Find all siblings of this `PageElement` that match the given criteria + and appear later in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a `NavigableString` with specific text. 
+ :param limit: Stop looking after finding this many results. + :param _stacklevel: Used internally to improve warning messages. + :kwargs: Additional filters on attribute values. + """ + return self._find_all( + name, + attrs, + string, + limit, + self.next_siblings, + _stacklevel=_stacklevel + 1, + **kwargs, + ) + + findNextSiblings = _deprecated_function_alias( + "findNextSiblings", "find_next_siblings", "4.0.0" + ) + fetchNextSiblings = _deprecated_function_alias( + "fetchNextSiblings", "find_next_siblings", "3.0.0" + ) + + def find_previous( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Look backwards in the document from this `PageElement` and find the + first `PageElement` that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a `NavigableString` with specific text. + :kwargs: Additional filters on attribute values. + """ + return self._find_one(self.find_all_previous, name, attrs, string, **kwargs) + + findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0") + + def find_all_previous( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Look backwards in the document from this `PageElement` and find all + `PageElement` that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. 
+ :param string: A filter for a `NavigableString` with specific text. + :param limit: Stop looking after finding this many results. + :param _stacklevel: Used internally to improve warning messages. + :kwargs: Additional filters on attribute values. + """ + return self._find_all( + name, + attrs, + string, + limit, + self.previous_elements, + _stacklevel=_stacklevel + 1, + **kwargs, + ) + + findAllPrevious = _deprecated_function_alias( + "findAllPrevious", "find_all_previous", "4.0.0" + ) + fetchAllPrevious = _deprecated_function_alias( + "fetchAllPrevious", "find_all_previous", "3.0.0" + ) + + def find_previous_sibling( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ) -> _AtMostOneElement: + """Returns the closest sibling to this `PageElement` that matches the + given criteria and appears earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param string: A filter for a `NavigableString` with specific text. + :kwargs: Additional filters on attribute values. + """ + return self._find_one( + self.find_previous_siblings, name, attrs, string, **kwargs + ) + + findPreviousSibling = _deprecated_function_alias( + "findPreviousSibling", "find_previous_sibling", "4.0.0" + ) + + def find_previous_siblings( + self, + name: _FindMethodName = None, + attrs: _StrainableAttributes = {}, + string: Optional[_StrainableString] = None, + limit: Optional[int] = None, + _stacklevel: int = 2, + **kwargs: _StrainableAttribute, + ) -> _QueryResults: + """Returns all siblings to this PageElement that match the + given criteria and appear earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. 
def find_parent(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Find the closest parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :kwargs: Additional filters on attribute values.

    :return: The first result of `find_parents` with a limit of one, or
        None if there were no results.
    """
    # NOTE: We can't use _find_one because find_parents takes a different
    # set of arguments (it has no ``string`` parameter).
    results = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return results[0] if results else None
See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: Additional filters on attribute values. + :param limit: Stop looking after finding this many results. + :param _stacklevel: Used internally to improve warning messages. + :kwargs: Additional filters on attribute values. + """ + iterator = self.parents + return self._find_all( + name, attrs, None, limit, iterator, _stacklevel=_stacklevel + 1, **kwargs + ) + + findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0") + fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0") + + @property + def next(self) -> _AtMostOneElement: + """The `PageElement`, if any, that was parsed just after this one.""" + return self.next_element + + @property + def previous(self) -> _AtMostOneElement: + """The `PageElement`, if any, that was parsed just before this one.""" + return self.previous_element + + # These methods do the real heavy lifting. + + def _find_one( + self, + # TODO-TYPING: "There is no syntax to indicate optional or + # keyword arguments; such function types are rarely used + # as callback types." - So, not sure how to get more + # specific here. 
def _find_all(
    self,
    name: _FindMethodName,
    attrs: _StrainableAttributes,
    string: Optional[_StrainableString],
    limit: Optional[int],
    generator: Iterator[PageElement],
    _stacklevel: int = 3,
    **kwargs: _StrainableAttribute,
) -> _QueryResults:
    """Walk ``generator`` and collect the elements that match.

    :param name: A filter on tag name, or an `ElementFilter` to use
        as the matcher directly.
    :param attrs: Additional filters on attribute values.
    :param string: A filter for a `NavigableString` with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: The elements to examine, in order.
    :param _stacklevel: The ``stacklevel`` used for any warning issued here.
    :kwargs: Additional filters on attribute values. The deprecated
        ``text`` keyword is accepted as a stand-in for ``string``.
    """
    # Honor the legacy 'text' keyword, but only when 'string' itself
    # was not supplied.
    if "text" in kwargs and string is None:
        string = kwargs.pop("text")
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning,
            stacklevel=_stacklevel,
        )

    # '_class' is almost certainly a typo for 'class_'; warn, but still
    # treat it as an ordinary attribute filter.
    if "_class" in kwargs:
        message = AttributeResemblesVariableWarning.MESSAGE % {
            "original": "_class",
            "autocorrect": "class_",
        }
        warnings.warn(
            message, AttributeResemblesVariableWarning, stacklevel=_stacklevel
        )

    from bs4.filter import ElementFilter

    matcher = (
        name
        if isinstance(name, ElementFilter)
        else SoupStrainer(name, attrs, string, **kwargs)
    )

    # The shortcuts below only apply to a bare tag-name query; anything
    # else goes through the matcher.
    if string is not None or limit or attrs or kwargs:
        return matcher.find_all(generator, limit)

    result: Iterable[_OneElement]
    if name is True or name is None:
        # Optimization to find all tags.
        result = (candidate for candidate in generator if isinstance(candidate, Tag))
        return ResultSet(matcher, result)

    if isinstance(name, str):
        # Optimization to find all tags with a given name.
        if name.count(":") == 1:
            # This is a name with a prefix. In a namespace-aware
            # document the local name is compared against tag.name;
            # otherwise the fully-qualified name is.
            prefix, local_name = name.split(":", 1)
        else:
            prefix, local_name = None, name
        result = [
            candidate
            for candidate in generator
            if isinstance(candidate, Tag)
            and (
                candidate.name == name
                or (
                    candidate.name == local_name
                    and (prefix is None or candidate.prefix == prefix)
                )
            )
        ]
        return ResultSet(matcher, result)

    return matcher.find_all(generator, limit)
@property
def parents(self) -> Iterator[Tag]:
    """Every ancestor of this PageElement, nearest first.

    :yield: A sequence of Tags, ending with a BeautifulSoup object.
    """
    ancestor = self.parent
    while ancestor is not None:
        # Read the next link before yielding, so the walk continues
        # along the original chain even if the caller re-parents the
        # element it was just handed.
        current, ancestor = ancestor, ancestor.parent
        yield current
class NavigableString(str, PageElement):
    """A Python string that is part of a parse tree.

    When Beautiful Soup parses the markup ``<b>penguin</b>``, it will
    create a `NavigableString` for the string "penguin".
    """

    #: Text placed in front of the 'real' string when it is formatted
    #: as part of a document, such as the '<!--' that opens an HTML
    #: comment.
    PREFIX: str = ""

    #: Text placed after the 'real' string when it is formatted as
    #: part of a document, such as the '-->' that closes an HTML
    #: comment.
    SUFFIX: str = ""

    def __new__(cls, value: Union[str, bytes]) -> Self:
        """Create a new NavigableString.

        When a NavigableString is unpickled, this method receives the
        string encoded in DEFAULT_OUTPUT_ENCODING. Bytes are therefore
        decoded with that encoding, since the superclass otherwise
        would not know how to handle non-ASCII characters.
        """
        # Only a bytes value needs the encoding argument.
        extra_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *extra_args)
        instance.hidden = False
        instance.setup()
        return instance

    def __deepcopy__(self, memo: Dict[Any, Any], recursive: bool = False) -> Self:
        """Copy this string: same contents and class as the original,
        but not connected to the parse tree.

        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self) -> Tuple[str]:
        plain_text = str(self)
        return (plain_text,)

    @property
    def string(self) -> str:
        """Convenience property defined to match `Tag.string`.

        :return: Always the `NavigableString` this property was
           called on.

        :meta private:
        """
        return self

    def output_ready(self, formatter: _FormatterOrName = "minimal") -> str:
        """Format this string for inclusion in an HTML or XML document.

        :param formatter: A `Formatter` object, or a string naming one
            of the standard formatters.
        :return: The formatted string wrapped in `PREFIX` and `SUFFIX`.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self) -> None:
        """A NavigableString is not a Tag, so it has no .name.

        The property exists so that code like this doesn't crash when
        run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]

        :meta private:
        """
        return None

    @name.setter
    def name(self, name: str) -> None:
        """Refuse every attempt to set NavigableString.name.

        :meta private:
        """
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(
        self, strip: bool = False, types: _OneOrMoreStringTypes = PageElement.default
    ) -> Iterator[str]:
        """Yield this string if it is of a wanted class, possibly
        stripped.

        Having this on NavigableString lets it offer the same
        text-extraction conveniences (such as get_text()) as every
        other PageElement.

        :param strip: If True, the string is stripped before being
            yielded.

        :param types: A NavigableString subclass, or a collection of
            them. If this string is not exactly one of those classes,
            nothing is yielded. The default is
            `Tag.MAIN_CONTENT_STRING_TYPES`.

        :yield: A sequence that either contains this string, or is empty.
        """
        if types is self.default:
            # The default set lives on Tag because it names subclasses
            # of this class, which aren't defined until later in the
            # file.
            types = Tag.MAIN_CONTENT_STRING_TYPES

        # The comparison is on the exact class rather than via
        # isinstance(), because every candidate class subclasses
        # NavigableString. A caller asking for plain NavigableStrings
        # most likely does not want the specialized kinds.
        if types is not None:
            own_class = type(self)
            if isinstance(types, type):
                # Looking for a single type.
                wanted = own_class is types
            else:
                # Looking for one of several types.
                wanted = own_class in types
            if not wanted:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text

    @property
    def strings(self) -> Iterator[str]:
        """Yield this string, but only if it is interesting.

        Defined this way for compatibility with `Tag.strings`. See
        `Tag` for information on which strings are interesting in a
        given context.

        :yield: A sequence that either contains this string, or is empty.
        """
        return self._all_strings()
class Doctype(PreformattedString):
    """A `document type declaration <https://www.w3.org/TR/REC-xml/#dt-doctype>`_."""

    PREFIX: str = "<!DOCTYPE "
    SUFFIX: str = ">\n"

    @classmethod
    def for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> Doctype:
        """Generate an appropriate document type declaration for a given
        public ID and system ID.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        :return: A new object of the class this method was called on.
        """
        # Build through cls rather than a hard-coded Doctype so that a
        # subclass gets an instance of itself back.
        return cls(cls._string_for_name_and_ids(name, pub_id, system_id))

    @classmethod
    def _string_for_name_and_ids(
        cls, name: str, pub_id: Optional[str], system_id: Optional[str]
    ) -> str:
        """Generate a string to be used as the basis of a Doctype object.

        This is a separate method from for_name_and_ids() because the lxml
        TreeBuilder needs to call it.

        :param name: The name of the document's root element; a false
            value is treated as the empty string.
        :param pub_id: The public identifier, or None.
        :param system_id: The system identifier, or None.
        :return: The declaration text, without the surrounding
            `PREFIX` and `SUFFIX`.
        """
        value = name or ""
        if pub_id is not None:
            value += f' PUBLIC "{pub_id}"'
            # With a public ID present, the system ID follows it bare.
            if system_id is not None:
                value += f' "{system_id}"'
        elif system_id is not None:
            # A system ID on its own needs the SYSTEM keyword.
            value += f' SYSTEM "{system_id}"'
        return value
+ """ + + +class RubyTextString(NavigableString): + """A NavigableString representing the contents of an `<rt> HTML + tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element>`_. + + Can be used to distinguish such strings from the strings they're + annotating. + """ + + +class RubyParenthesisString(NavigableString): + """A NavigableString representing the contents of an `<rp> HTML + tag <https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element>`_. + """ + + +class Tag(PageElement): + """An HTML or XML tag that is part of a parse tree, along with its + attributes, contents, and relationships to other parts of the tree. + + When Beautiful Soup parses the markup ``<b>penguin</b>``, it will + create a `Tag` object representing the ``<b>`` tag. You can + instantiate `Tag` objects directly, but it's not necessary unless + you're adding entirely new markup to a parsed document. Most of + the constructor arguments are intended for use by the `TreeBuilder` + that's parsing a document. + + :param parser: A `BeautifulSoup` object representing the parse tree this + `Tag` will be part of. + :param builder: The `TreeBuilder` being used to build the tree. + :param name: The name of the tag. + :param namespace: The URI of this tag's XML namespace, if any. + :param prefix: The prefix for this tag's XML namespace, if any. + :param attrs: A dictionary of attribute values. + :param parent: The `Tag` to use as the parent of this `Tag`. May be + the `BeautifulSoup` object itself. + :param previous: The `PageElement` that was parsed immediately before + parsing this tag. + :param is_xml: If True, this is an XML tag. Otherwise, this is an + HTML tag. + :param sourceline: The line number where this tag was found in its + source document. + :param sourcepos: The character position within ``sourceline`` where this + tag was found. + :param can_be_empty_element: If True, this tag should be + represented as <tag/>. 
def __init__(
    self,
    parser: Optional[BeautifulSoup] = None,
    builder: Optional[TreeBuilder] = None,
    name: Optional[str] = None,
    namespace: Optional[str] = None,
    prefix: Optional[str] = None,
    attrs: Optional[_RawOrProcessedAttributeValues] = None,
    parent: Optional[Union[BeautifulSoup, Tag]] = None,
    previous: _AtMostOneElement = None,
    is_xml: Optional[bool] = None,
    sourceline: Optional[int] = None,
    sourcepos: Optional[int] = None,
    can_be_empty_element: Optional[bool] = None,
    cdata_list_attributes: Optional[Dict[str, Set[str]]] = None,
    preserve_whitespace_tags: Optional[Set[str]] = None,
    interesting_string_types: Optional[Set[Type[NavigableString]]] = None,
    namespaces: Optional[Dict[str, str]] = None,
    # NOTE: Any new arguments here need to be mirrored in
    # Tag.copy_self, and potentially BeautifulSoup.new_tag
    # as well.
):
    """Initialize the tag, taking its configuration from ``builder``
    when one is given and from the explicit arguments otherwise.

    The individual parameters are described in the class docstring.

    :raise ValueError: If ``name`` is None.
    """
    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self._namespaces = namespaces or {}
    self.prefix = prefix

    # The source position is recorded exactly as given, so both
    # attributes always exist (as None when the caller supplied
    # nothing).
    self.sourceline = sourceline
    self.sourcepos = sourcepos

    # Pick the container classes for attributes: the builder's when
    # there is one, otherwise defaults based on is_xml.
    attr_dict_class: type[AttributeDict]
    attribute_value_list_class: type[AttributeValueList]
    if builder is None:
        if is_xml:
            attr_dict_class = XMLAttributeDict
        else:
            attr_dict_class = HTMLAttributeDict
        attribute_value_list_class = AttributeValueList
    else:
        attr_dict_class = builder.attribute_dict_class
        attribute_value_list_class = builder.attribute_value_list_class
    self.attribute_value_list_class = attribute_value_list_class

    if attrs is None:
        self.attrs = attr_dict_class()
    elif builder is not None and builder.cdata_list_attributes:
        self.attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs
        )
    else:
        self.attrs = attr_dict_class()
        # Make sure that the values of any multi-valued
        # attributes (e.g. when a Tag is copied) are stored in
        # new lists.
        for k, v in attrs.items():
            if isinstance(v, list):
                v = v.__class__(v)
            self.attrs[k] = v

    # If possible, determine ahead of time whether this tag is an
    # XML tag.
    if builder:
        self.known_xml = builder.is_xml
    else:
        self.known_xml = is_xml
    self.contents: List[PageElement] = []
    self.setup(parent, previous)
    self.hidden = False

    if builder is None:
        # In the absence of a TreeBuilder, use whatever values were
        # passed in here. They're probably None, unless this is a copy of some
        # other tag.
        self.can_be_empty_element = can_be_empty_element
        self.cdata_list_attributes = cdata_list_attributes
        self.preserve_whitespace_tags = preserve_whitespace_tags
        self.interesting_string_types = interesting_string_types
    else:
        # Set up any substitutions for this tag, such as the charset in a META tag.
        # (attribute_value_list_class was already taken from the
        # builder above.)
        builder.set_up_substitutions(self)

        # Ask the TreeBuilder whether this tag might be an empty-element tag.
        self.can_be_empty_element = builder.can_be_empty_element(name)

        # Keep track of the list of attributes of this tag that
        # might need to be treated as a list.
        #
        # For performance reasons, we store the whole data structure
        # rather than asking the question of every tag. Asking would
        # require building a new data structure every time, and
        # (unlike can_be_empty_element), we almost never need
        # to check this.
        self.cdata_list_attributes = builder.cdata_list_attributes

        # Keep track of the names that might cause this tag to be treated as a
        # whitespace-preserved tag.
        self.preserve_whitespace_tags = builder.preserve_whitespace_tags

        if self.name in builder.string_containers:
            # This sort of tag uses a special string container
            # subclass for most of its strings. We need to be able
            # to look up the proper container subclass.
            self.interesting_string_types = {builder.string_containers[self.name]}
        else:
            self.interesting_string_types = self.MAIN_CONTENT_STRING_TYPES
@property
def is_empty_element(self) -> bool:
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag with contents is never an empty-element tag.

    For a tag without contents, the answer depends on the
    `TreeBuilder` that created it. When the builder has a designated
    list of empty-element tags, only tags named in that list qualify;
    this is usually the case for HTML documents.

    When the builder has no such list, any tag with no contents is an
    empty-element tag; this is usually the case for XML documents.
    """
    if len(self.contents) != 0:
        return False
    # Only the literal True counts; None or any other value means "no".
    return self.can_be_empty_element is True
@string.setter
def string(self, string: str) -> None:
    """Replace the `Tag.contents` of this `Tag` with a single string.

    :param string: The new text. A `NavigableString` (or subclass) is
        re-created with its own class; any other string becomes a
        plain `NavigableString`.
    """
    self.clear()
    # Keep specialized string classes intact when one is passed in.
    container = (
        string.__class__ if isinstance(string, NavigableString) else NavigableString
    )
    self.append(container(string))
def insert(self, position: int, *new_children: _InsertableElement) -> List[PageElement]:
    """Insert one or more new PageElements as a child of this `Tag`.

    This works similarly to :py:meth:`list.insert`, except you can insert
    multiple elements at once.

    :param position: The numeric position that should be occupied
       in this Tag's `Tag.children` by the first new `PageElement`.

    :param new_children: The PageElements to insert.

    :return: The newly inserted PageElements.
    """
    inserted: List[PageElement] = []
    for new_child in new_children:
        placed = self._insert(position, new_child)
        inserted.extend(placed)
        # One argument can turn into several elements, or none: a
        # BeautifulSoup object is replaced by its children. Advance
        # by the number actually placed so the next argument lands
        # right after them instead of inside or before the run.
        position += len(placed)
    return inserted
+ position -= 1 + elif current_index == position: + # We're 'inserting' an element into its current location. + # This is a no-op. + return [new_child] + new_child.extract() + + new_child.parent = self + previous_child = None + if position == 0: + new_child.previous_sibling = None + new_child.previous_element = self + else: + previous_child = self.contents[position - 1] + new_child.previous_sibling = previous_child + new_child.previous_sibling.next_sibling = new_child + new_child.previous_element = previous_child._last_descendant(False) + if new_child.previous_element is not None: + new_child.previous_element.next_element = new_child + + new_childs_last_element = new_child._last_descendant( + is_initialized=False, accept_self=True + ) + # new_childs_last_element can't be None because we passed + # accept_self=True into _last_descendant. Worst case, + # new_childs_last_element will be new_child itself. Making + # this cast removes several mypy complaints later on as we + # manipulate new_childs_last_element. + new_childs_last_element = cast(PageElement, new_childs_last_element) + + if position >= len(self.contents): + new_child.next_sibling = None + + parent: Optional[Tag] = self + parents_next_sibling = None + while parents_next_sibling is None and parent is not None: + parents_next_sibling = parent.next_sibling + parent = parent.parent + if parents_next_sibling is not None: + # We found the element that comes next in the document. + break + if parents_next_sibling is not None: + new_childs_last_element.next_element = parents_next_sibling + else: + # The last element of this tag is the last element in + # the document. 
def unwrap(self) -> Self:
    """Remove this element from the tree, leaving its children in its place.

    :return: This object, no longer part of the tree.
    :raises ValueError: If this element has no parent.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree."
        )

    slot = parent.index(self)
    self.extract(_self_index=slot)

    # Every child is inserted at the same slot, so walk a reversed
    # snapshot: the last child goes in first and each earlier child
    # pushes it along, which reproduces the original order.
    for child in self.contents[::-1]:
        parent.insert(slot, child)
    return self
def clear(self, decompose: bool = False) -> None:
    """Remove every child of this `Tag`.

    :param decompose: If this is True, each child is removed with
        `PageElement.decompose` (a more destructive method) rather
        than with `PageElement.extract`.
    """
    # Work from a copy of the child list so the loop is unaffected if
    # removing a child changes self.contents.
    for child in list(self.contents):
        remove = child.decompose if decompose else child.extract
        remove()
def get_attribute_list(
    self, key: str, default: Optional[AttributeValueList] = None
) -> AttributeValueList:
    """Look up an attribute like get(), but always hand back a list.

    :param key: The attribute to look for.
    :param default: Use this value if the attribute is not present
        on this `Tag`.
    :return: The attribute value itself when it is already a list;
        an empty list when there is no value; otherwise a
        single-item list wrapping the value.
    """
    value = self.get(key, default)
    if isinstance(value, list):
        # Already list-shaped: returned as-is, not copied.
        return value

    make_list = self.attribute_value_list_class
    if value is None:
        return make_list()
    return make_list([cast(str, value)])
def __eq__(self, other: Any) -> bool:
    """Compare this `Tag` with another object by structure.

    Two tags are equal when they have the same name, the same
    attributes, and the same contents, compared pairwise in order
    (and therefore recursively for nested tags).

    :param other: The object to compare against.
    :return: True if `other` is this very object or a structurally
        equal `Tag`; False otherwise, including whenever `other` is
        not a `Tag`.
    """
    if other is self:
        return True
    if not isinstance(other, Tag):
        return False

    for required in ("name", "attrs", "contents"):
        if not hasattr(other, required):
            return False

    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False

    # The cheap checks passed; now compare the children position by
    # position, stopping at the first mismatch.
    return not any(
        mine != other.contents[position]
        for position, mine in enumerate(self.contents)
    )
def decode(
    self,
    indent_level: Optional[int] = None,
    eventual_encoding: _Encoding = DEFAULT_OUTPUT_ENCODING,
    formatter: _FormatterOrName = "minimal",
    iterator: Optional[Iterator[PageElement]] = None,
) -> str:
    """Render this `Tag` and its contents as a Unicode string.

    The tree is walked as a flat stream of events (see
    `_event_stream`), so rendering does not recurse once per
    nesting level.

    :param indent_level: Each line of the rendering will be
       indented this many levels. (The ``formatter`` decides what a
       'level' means, in terms of spaces or other characters
       output.) When this is None, no pretty-printing whitespace is
       added at all.
    :param eventual_encoding: The encoding you intend to use when
       converting the string to a bytestring. decode() is *not*
       responsible for performing that encoding. This information
       is needed so that a real encoding can be substituted in if
       the document contains an encoding declaration (e.g. in a
       <meta> tag).
    :param formatter: Either a `Formatter` object, or a string
       naming one of the standard formatters.
    :param iterator: The iterator to use when navigating over the
       parse tree. This is only used by `Tag.decode_contents` and
       you probably won't need to use it.
    :return: The concatenation of every rendered tag and string.
    """
    pieces = []
    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    # A literal True is treated as "pretty-print starting at level 0".
    # NOTE(review): presumably tolerates callers passing a boolean
    # pretty-print flag here -- confirm.
    if indent_level is True:
        indent_level = 0

    # The currently active tag that put us into string literal
    # mode. Until this element is closed, children will be treated
    # as string literals and not pretty-printed. String literal
    # mode is turned on immediately after this tag begins, and
    # turned off immediately before it's closed. This means there
    # will be whitespace before and after the tag itself.
    string_literal_tag = None

    for event, element in self._event_stream(iterator):
        if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=True)
        elif event is Tag.END_ELEMENT_EVENT:
            element = cast(Tag, element)
            piece = element._format_tag(eventual_encoding, formatter, opening=False)
            # A closing tag is rendered one level shallower than the
            # children it encloses, so step back out before
            # indenting it.
            if indent_level is not None:
                indent_level -= 1
        else:
            element = cast(NavigableString, element)
            piece = element.output_ready(formatter)

        # Now we need to apply the 'prettiness' -- extra
        # whitespace before and/or after this tag. This can get
        # complicated because certain tags, like <pre> and
        # <script>, can't be prettified, since adding whitespace would
        # change the meaning of the content.

        # The default behavior is to add whitespace before and
        # after an element when string literal mode is off, and to
        # leave things as they are when string literal mode is on.
        if string_literal_tag:
            indent_before = indent_after = False
        else:
            indent_before = indent_after = True

        # The only time the behavior is more complex than that is
        # when we encounter an opening or closing tag that might
        # put us into or out of string literal mode.
        if (
            event is Tag.START_ELEMENT_EVENT
            and not string_literal_tag
            and not cast(Tag, element)._should_pretty_print()
        ):
            # We are about to enter string literal mode. Add
            # whitespace before this tag, but not after. We
            # will stay in string literal mode until this tag
            # is closed.
            indent_before = True
            indent_after = False
            string_literal_tag = element
        elif event is Tag.END_ELEMENT_EVENT and element is string_literal_tag:
            # We are about to exit string literal mode by closing
            # the tag that sent us into that mode. Add whitespace
            # after this tag, but not before.
            indent_before = False
            indent_after = True
            string_literal_tag = None

        # Now we know whether to add whitespace before and/or
        # after this element.
        if indent_level is not None:
            if indent_before or indent_after:
                if isinstance(element, NavigableString):
                    piece = piece.strip()
                # An empty piece (for instance a string that was
                # nothing but whitespace) gets no indentation or
                # newline of its own.
                if piece:
                    piece = self._indent_string(
                        piece, indent_level, formatter, indent_before, indent_after
                    )
            # Everything up to the matching end tag is one level
            # deeper than the tag that was just opened.
            if event == Tag.START_ELEMENT_EVENT:
                indent_level += 1
        pieces.append(piece)
    return "".join(pieces)
def _event_stream(
    self, iterator: Optional[Iterator[PageElement]] = None
) -> Iterator[Tuple[_TreeTraversalEvent, PageElement]]:
    """Flatten this element's subtree into a stream of (event, element) pairs.

    A consumer (for example, the code that formats a tag as a string)
    can rebuild the nested structure from these pairs without making
    recursive method calls. The idea resembles the SAX API, but the
    events are internal to Beautiful Soup and carry Tags and other
    Beautiful Soup objects.

    Each non-empty `Tag` produces a START_ELEMENT_EVENT and, later, a
    matching END_ELEMENT_EVENT; an empty-element `Tag` produces a
    single EMPTY_ELEMENT_EVENT; anything else produces a
    STRING_ELEMENT_EVENT.

    :param iterator: An alternate iterator to use when traversing
        the tree. Defaults to `self_and_descendants`.
    """
    # Tags that have been started but not yet ended, innermost last.
    open_tags: List[Tag] = []

    if not iterator:
        iterator = self.self_and_descendants

    for node in iterator:
        # If this node's parent isn't the innermost open tag, that
        # tag (and possibly some of its ancestors) ended before this
        # node appeared. End them now.
        while open_tags and node.parent != open_tags[-1]:
            yield Tag.END_ELEMENT_EVENT, open_tags.pop()

        if not isinstance(node, Tag):
            yield Tag.STRING_ELEMENT_EVENT, node
        elif node.is_empty_element:
            yield Tag.EMPTY_ELEMENT_EVENT, node
        else:
            yield Tag.START_ELEMENT_EVENT, node
            open_tags.append(node)

    # The traversal is over; end whatever is still open, innermost
    # first.
    while open_tags:
        yield Tag.END_ELEMENT_EVENT, open_tags.pop()
def _format_tag(
    self, eventual_encoding: str, formatter: Formatter, opening: bool
) -> str:
    """Render this tag's own markup: its opening tag or its closing tag.

    :param eventual_encoding: The encoding the document will
        eventually be converted to; substituted into attribute values
        that carry a charset placeholder.
    :param formatter: Supplies the attributes to output, escapes and
        quotes their values, and provides the void-element closing
        prefix.
    :param opening: True to render the opening tag (with attributes),
        False to render the closing tag.
    :return: The markup, or the empty string for a hidden tag.
    """
    if self.hidden:
        # A hidden tag is invisible, although its contents
        # are visible.
        return ""

    # Pieces of the tag, in output order: "<", an optional "/", an
    # optional namespace prefix, the name, the attributes, an optional
    # void-element slash, ">".
    slash = "" if opening else "/"
    namespace = (self.prefix + ":") if self.prefix else ""

    rendered_attributes = ""
    if opening:
        parts = []
        for key, val in formatter.attributes(self):
            if val is None:
                # A valueless attribute is output as its bare name.
                parts.append(key)
                continue

            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.substitute_encoding(eventual_encoding)

            escaped = formatter.attribute_value(val)
            parts.append(
                str(key) + "=" + formatter.quoted_attribute_value(escaped)
            )
        if parts:
            rendered_attributes = " " + " ".join(parts)

    # An optional closing slash (for a void element in an XML
    # document).
    void_slash = ""
    if self.is_empty_element:
        void_slash = formatter.void_element_close_prefix or ""

    return "".join(
        ("<", slash, namespace, self.name, rendered_attributes, void_slash, ">")
    )
def find(
    self,
    name: _FindMethodName = None,
    attrs: _StrainableAttributes = {},
    recursive: bool = True,
    string: Optional[_StrainableString] = None,
    **kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
    """Return the first `PageElement` beneath this one that matches
    the given criteria, or None if nothing matches.

    This is `find_all` with a limit of 1. All find_* methods take a
    common set of arguments. See the online documentation for
    detailed explanations.

    :param name: A filter on tag name.
    :param attrs: Additional filters on attribute values.
    :param recursive: If this is True, find() will perform a
        recursive search of this Tag's children. Otherwise,
        only the direct children will be considered.
    :param string: A filter on the `Tag.string` attribute.
    :kwargs: Additional filters on attribute values.
    """
    # NOTE(review): _stacklevel=3 presumably makes any warning point
    # at the caller of find() rather than at this frame -- confirm.
    matches = self.find_all(
        name, attrs, recursive, string, 1, _stacklevel=3, **kwargs
    )
    return matches[0] if matches else None
@property
def descendants(self) -> Iterator[PageElement]:
    """Iterate over everything beneath this `Tag`, in document order.

    The walk follows `next_element` links, starting at the first
    child and stopping at whatever element follows this tag's last
    descendant. That makes it a depth-first traversal: an element is
    yielded before its own children, and its whole subtree is yielded
    before its next sibling. This tag itself is not included.

    :return: An iterator over the descendants. It yields nothing if
        this tag has no contents.
    """
    if not self.contents:
        return
    # _last_descendant() can't return None here because
    # accept_self is True. Worst case, last_descendant will end up
    # as self.
    last_descendant = cast("PageElement", self._last_descendant(accept_self=True))
    stop_node = last_descendant.next_element
    current: _AtMostOneElement = self.contents[0]
    while current is not stop_node and current is not None:
        # Look up the successor before yielding, so the walk carries
        # on from the original link even if the consumer changes
        # `current.next_element` while holding the element.
        successor = current.next_element
        yield current
        current = successor
class ResultSet(List[_PageElementT], Generic[_PageElementT]):
    """A list of `PageElement` objects: the search results gathered by
    matching an :py:class:`ElementFilter` against a parse tree.

    It behaves as an ordinary list. The only addition is that looking
    up an attribute that does not exist fails with a message
    explaining the usual cause: treating a list of results as though
    it were a single element.
    """

    # The filter handed to the constructor, or None.
    # NOTE(review): presumably the filter that produced these
    # results -- confirm against callers.
    source: Optional[ElementFilter]

    def __init__(
        self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
    ) -> None:
        """Create a result set.

        :param source: Stored unchanged as `source`.
        :param result: The elements that make up the list.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key: str) -> None:
        """Raise a helpful exception to explain a common code fix.

        :param key: The name of the missing attribute.
        :raises AttributeError: Always.
        """
        explanation = f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
        raise AttributeError(explanation)