diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/bs4/filter.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/bs4/filter.py | 755 |
1 files changed, 755 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/bs4/filter.py b/.venv/lib/python3.12/site-packages/bs4/filter.py new file mode 100644 index 00000000..e3ce2e2f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/bs4/filter.py @@ -0,0 +1,755 @@ +from __future__ import annotations +from collections import defaultdict +import re +from typing import ( + Any, + Callable, + cast, + Dict, + Iterator, + Iterable, + List, + Optional, + Sequence, + Type, + Union, +) +import warnings + +from bs4._deprecation import _deprecated +from bs4.element import ( + AttributeDict, + NavigableString, + PageElement, + ResultSet, + Tag, +) +from bs4._typing import ( + _AtMostOneElement, + _AttributeValue, + _OneElement, + _PageElementMatchFunction, + _QueryResults, + _RawAttributeValues, + _RegularExpressionProtocol, + _StrainableAttribute, + _StrainableElement, + _StrainableString, + _StringMatchFunction, + _TagMatchFunction, +) + + +class ElementFilter(object): + """`ElementFilter` encapsulates the logic necessary to decide: + + 1. whether a `PageElement` (a `Tag` or a `NavigableString`) matches a + user-specified query. + + 2. whether a given sequence of markup found during initial parsing + should be turned into a `PageElement` at all, or simply discarded. + + The base class is the simplest `ElementFilter`. By default, it + matches everything and allows all markup to become `PageElement` + objects. You can make it more selective by passing in a + user-defined match function, or defining a subclass. + + Most users of Beautiful Soup will never need to use + `ElementFilter`, or its more capable subclass + `SoupStrainer`. Instead, they will use methods like + :py:meth:`Tag.find`, which will convert their arguments into + `SoupStrainer` objects and run them against the tree. + + However, if you find yourself wanting to treat the arguments to + Beautiful Soup's find_*() methods as first-class objects, those + objects will be `SoupStrainer` objects. You can create them + yourself and then make use of functions like + `ElementFilter.filter()`. + """ + + match_function: Optional[_PageElementMatchFunction] + + def __init__(self, match_function: Optional[_PageElementMatchFunction] = None): + """Pass in a match function to easily customize the behavior of + `ElementFilter.match` without needing to subclass. + + :param match_function: A function that takes a `PageElement` + and returns `True` if that `PageElement` matches some criteria. + """ + self.match_function = match_function + + @property + def includes_everything(self) -> bool: + """Does this `ElementFilter` obviously include everything? If so, + the filter process can be made much faster. + + The `ElementFilter` might turn out to include everything even + if this returns `False`, but it won't include everything in an + obvious way. + + The base `ElementFilter` implementation includes things based on + the match function, so includes_everything is only true if + there is no match function. + """ + return not self.match_function + + @property + def excludes_everything(self) -> bool: + """Does this `ElementFilter` obviously exclude everything? If + so, Beautiful Soup will issue a warning if you try to use it + when parsing a document. + + The `ElementFilter` might turn out to exclude everything even + if this returns `False`, but it won't exclude everything in an + obvious way. + + The base `ElementFilter` implementation excludes things based + on a match function we can't inspect, so excludes_everything + is always false. + """ + return False + + def match(self, element: PageElement, _known_rules:bool=False) -> bool: + """Does the given PageElement match the rules set down by this + ElementFilter? + + The base implementation delegates to the function passed in to + the constructor. + + :param _known_rules: Defined for compatibility with + SoupStrainer._match(). Used more for consistency than because + we need the performance optimization. + """ + if not _known_rules and self.includes_everything: + return True + if not self.match_function: + return True + return self.match_function(element) + + def filter(self, generator: Iterator[PageElement]) -> Iterator[_OneElement]: + """The most generic search method offered by Beautiful Soup. + + Acts like Python's built-in `filter`, using + `ElementFilter.match` as the filtering function. + """ + # If there are no rules at all, don't bother filtering. Let + # anything through. + if self.includes_everything: + for i in generator: + yield i + while True: + try: + i = next(generator) + except StopIteration: + break + if i: + if self.match(i, _known_rules=True): + yield cast("_OneElement", i) + + def find(self, generator: Iterator[PageElement]) -> _AtMostOneElement: + """A lower-level equivalent of :py:meth:`Tag.find`. + + You can pass in your own generator for iterating over + `PageElement` objects. The first one that matches this + `ElementFilter` will be returned. + + :param generator: A way of iterating over `PageElement` + objects. + """ + for match in self.filter(generator): + return match + return None + + def find_all( + self, generator: Iterator[PageElement], limit: Optional[int] = None + ) -> _QueryResults: + """A lower-level equivalent of :py:meth:`Tag.find_all`. + + You can pass in your own generator for iterating over + `PageElement` objects. Only elements that match this + `ElementFilter` will be returned in the :py:class:`ResultSet`. + + :param generator: A way of iterating over `PageElement` + objects. + + :param limit: Stop looking after finding this many results. + """ + results: _QueryResults = ResultSet(self) + for match in self.filter(generator): + results.append(match) + if limit is not None and len(results) >= limit: + break + return results + + def allow_tag_creation( + self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues] + ) -> bool: + """Based on the name and attributes of a tag, see whether this + `ElementFilter` will allow a `Tag` object to even be created. + + By default, all tags are parsed. To change this, subclass + `ElementFilter`. + + :param name: The name of the prospective tag. + :param attrs: The attributes of the prospective tag. + """ + return True + + def allow_string_creation(self, string: str) -> bool: + """Based on the content of a string, see whether this + `ElementFilter` will allow a `NavigableString` object based on + this string to be added to the parse tree. + + By default, all strings are processed into `NavigableString` + objects. To change this, subclass `ElementFilter`. + + :param str: The string under consideration. + """ + return True + + +class MatchRule(object): + """Each MatchRule encapsulates the logic behind a single argument + passed in to one of the Beautiful Soup find* methods. + """ + + string: Optional[str] + pattern: Optional[_RegularExpressionProtocol] + present: Optional[bool] + exclude_everything: Optional[bool] + # TODO-TYPING: All MatchRule objects also have an attribute + # ``function``, but the type of the function depends on the + # subclass. + + def __init__( + self, + string: Optional[Union[str, bytes]] = None, + pattern: Optional[_RegularExpressionProtocol] = None, + function: Optional[Callable] = None, + present: Optional[bool] = None, + exclude_everything: Optional[bool] = None + ): + if isinstance(string, bytes): + string = string.decode("utf8") + self.string = string + if isinstance(pattern, bytes): + self.pattern = re.compile(pattern.decode("utf8")) + elif isinstance(pattern, str): + self.pattern = re.compile(pattern) + else: + self.pattern = pattern + self.function = function + self.present = present + self.exclude_everything = exclude_everything + + values = [ + x + for x in (self.string, self.pattern, self.function, self.present, self.exclude_everything) + if x is not None + ] + if len(values) == 0: + raise ValueError( + "Either string, pattern, function, present, or exclude_everything must be provided." + ) + if len(values) > 1: + raise ValueError( + "At most one of string, pattern, function, present, and exclude_everything must be provided." + ) + + def _base_match(self, string: Optional[str]) -> Optional[bool]: + """Run the 'cheap' portion of a match, trying to get an answer without + calling a potentially expensive custom function. + + :return: True or False if we have a (positive or negative) + match; None if we need to keep trying. + """ + # self.exclude_everything matches nothing. + if self.exclude_everything: + return False + + # self.present==True matches everything except None. + if self.present is True: + return string is not None + + # self.present==False matches _only_ None. + if self.present is False: + return string is None + + # self.string does an exact string match. + if self.string is not None: + # print(f"{self.string} ?= {string}") + return self.string == string + + # self.pattern does a regular expression search. + if self.pattern is not None: + # print(f"{self.pattern} ?~ {string}") + if string is None: + return False + return self.pattern.search(string) is not None + + return None + + def matches_string(self, string: Optional[str]) -> bool: + _base_result = self._base_match(string) + if _base_result is not None: + # No need to invoke the test function. + return _base_result + if self.function is not None and not self.function(string): + # print(f"{self.function}({string}) == False") + return False + return True + + def __repr__(self) -> str: + cls = type(self).__name__ + return f"<{cls} string={self.string} pattern={self.pattern} function={self.function} present={self.present}>" + + def __eq__(self, other: Any) -> bool: + return ( + isinstance(other, MatchRule) + and self.string == other.string + and self.pattern == other.pattern + and self.function == other.function + and self.present == other.present + ) + + +class TagNameMatchRule(MatchRule): + """A MatchRule implementing the rules for matches against tag name.""" + + function: Optional[_TagMatchFunction] + + def matches_tag(self, tag: Tag) -> bool: + base_value = self._base_match(tag.name) + if base_value is not None: + return base_value + + # The only remaining possibility is that the match is determined + # by a function call. Call the function. + function = cast(_TagMatchFunction, self.function) + if function(tag): + return True + return False + + +class AttributeValueMatchRule(MatchRule): + """A MatchRule implementing the rules for matches against attribute value.""" + + function: Optional[_StringMatchFunction] + + +class StringMatchRule(MatchRule): + """A MatchRule implementing the rules for matches against a NavigableString.""" + + function: Optional[_StringMatchFunction] + + +class SoupStrainer(ElementFilter): + """The `ElementFilter` subclass used internally by Beautiful Soup. + + A `SoupStrainer` encapsulates the logic necessary to perform the + kind of matches supported by methods such as + :py:meth:`Tag.find`. `SoupStrainer` objects are primarily created + internally, but you can create one yourself and pass it in as + ``parse_only`` to the `BeautifulSoup` constructor, to parse a + subset of a large document. + + Internally, `SoupStrainer` objects work by converting the + constructor arguments into `MatchRule` objects. Incoming + tags/markup are matched against those rules. + + :param name: One or more restrictions on the tags found in a document. + + :param attrs: A dictionary that maps attribute names to + restrictions on tags that use those attributes. + + :param string: One or more restrictions on the strings found in a + document. + + :param kwargs: A dictionary that maps attribute names to restrictions + on tags that use those attributes. These restrictions are additive to + any specified in ``attrs``. + + """ + + name_rules: List[TagNameMatchRule] + attribute_rules: Dict[str, List[AttributeValueMatchRule]] + string_rules: List[StringMatchRule] + + def __init__( + self, + name: Optional[_StrainableElement] = None, + attrs: Dict[str, _StrainableAttribute] = {}, + string: Optional[_StrainableString] = None, + **kwargs: _StrainableAttribute, + ): + if string is None and "text" in kwargs: + string = cast(Optional[_StrainableString], kwargs.pop("text")) + warnings.warn( + "As of version 4.11.0, the 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.", + DeprecationWarning, + stacklevel=2, + ) + + if name is None and not attrs and not string and not kwargs: + # Special case for backwards compatibility. Instantiating + # a SoupStrainer with no arguments whatsoever gets you one + # that matches all Tags, and only Tags. + self.name_rules = [TagNameMatchRule(present=True)] + else: + self.name_rules = cast( + List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule)) + ) + self.attribute_rules = defaultdict(list) + + if not isinstance(attrs, dict): + # Passing something other than a dictionary as attrs is + # sugar for matching that thing against the 'class' + # attribute. + attrs = {"class": attrs} + + for attrdict in attrs, kwargs: + for attr, value in attrdict.items(): + if attr == "class_" and attrdict is kwargs: + # If you pass in 'class_' as part of kwargs, it's + # because class is a Python reserved word. If you + # pass it in as part of the attrs dict, it's + # because you really are looking for an attribute + # called 'class_'. + attr = "class" + + if value is None: + value = False + for rule_obj in self._make_match_rules(value, AttributeValueMatchRule): + self.attribute_rules[attr].append( + cast(AttributeValueMatchRule, rule_obj) + ) + + self.string_rules = cast( + List[StringMatchRule], list(self._make_match_rules(string, StringMatchRule)) + ) + + #: DEPRECATED 4.13.0: You shouldn't need to check this under + #: any name (.string or .text), and if you do, you're probably + #: not taking into account all of the types of values this + #: variable might have. Look at the .string_rules list instead. + self.__string = string + + @property + def includes_everything(self) -> bool: + """Check whether the provided rules will obviously include + everything. (They might include everything even if this returns `False`, + but not in an obvious way.) + """ + return not self.name_rules and not self.string_rules and not self.attribute_rules + + @property + def excludes_everything(self) -> bool: + """Check whether the provided rules will obviously exclude + everything. (They might exclude everything even if this returns `False`, + but not in an obvious way.) + """ + if (self.string_rules and (self.name_rules or self.attribute_rules)): + # This is self-contradictory, so the rules exclude everything. + return True + + # If there's a rule that ended up treated as an "exclude everything" + # rule due to creating a logical inconsistency, then the rules + # exclude everything. + if any(x.exclude_everything for x in self.string_rules): + return True + if any(x.exclude_everything for x in self.name_rules): + return True + for ruleset in self.attribute_rules.values(): + if any(x.exclude_everything for x in ruleset): + return True + return False + + @property + def string(self) -> Optional[_StrainableString]: + ":meta private:" + warnings.warn( + "Access to deprecated property string. (Look at .string_rules instead) -- Deprecated since version 4.13.0.", + DeprecationWarning, + stacklevel=2, + ) + return self.__string + + @property + def text(self) -> Optional[_StrainableString]: + ":meta private:" + warnings.warn( + "Access to deprecated property text. (Look at .string_rules instead) -- Deprecated since version 4.13.0.", + DeprecationWarning, + stacklevel=2, + ) + return self.__string + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} name={self.name_rules} attrs={self.attribute_rules} string={self.string_rules}>" + + @classmethod + def _make_match_rules( + cls, + obj: Optional[Union[_StrainableElement, _StrainableAttribute]], + rule_class: Type[MatchRule], + ) -> Iterator[MatchRule]: + """Convert a vaguely-specific 'object' into one or more well-defined + `MatchRule` objects. + + :param obj: Some kind of object that corresponds to one or more + matching rules. + :param rule_class: Create instances of this `MatchRule` subclass. + """ + if obj is None: + return + if isinstance(obj, (str, bytes)): + yield rule_class(string=obj) + elif isinstance(obj, bool): + yield rule_class(present=obj) + elif callable(obj): + yield rule_class(function=obj) + elif isinstance(obj, _RegularExpressionProtocol): + yield rule_class(pattern=obj) + elif hasattr(obj, "__iter__"): + if not obj: + # The attribute is being matched against the null set, + # which means it should exclude everything. + yield rule_class(exclude_everything=True) + for o in obj: + if not isinstance(o, (bytes, str)) and hasattr(o, "__iter__"): + # This is almost certainly the user's + # mistake. This list contains another list, which + # opens up the possibility of infinite + # self-reference. In the interests of avoiding + # infinite recursion, we'll treat this as an + # impossible match and issue a rule that excludes + # everything, rather than looking inside. + warnings.warn( + f"Ignoring nested list {o} to avoid the possibility of infinite recursion.", + stacklevel=5, + ) + yield rule_class(exclude_everything=True) + continue + for x in cls._make_match_rules(o, rule_class): + yield x + else: + yield rule_class(string=str(obj)) + + def matches_tag(self, tag: Tag) -> bool: + """Do the rules of this `SoupStrainer` trigger a match against the + given `Tag`? + + If the `SoupStrainer` has any `TagNameMatchRule`, at least one + must match the `Tag` or its `Tag.name`. + + If there are any `AttributeValueMatchRule` for a given + attribute, at least one of them must match the attribute + value. + + If there are any `StringMatchRule`, at least one must match, + but a `SoupStrainer` that *only* contains `StringMatchRule` + cannot match a `Tag`, only a `NavigableString`. + """ + # If there are no rules at all, let anything through. + #if self.includes_everything: + # return True + + # String rules cannot not match a Tag on their own. + if not self.name_rules and not self.attribute_rules: + return False + + # Optimization for a very common case where the user is + # searching for a tag with one specific name, and we're + # looking at a tag with a different name. + if ( + not tag.prefix + and len(self.name_rules) == 1 + and self.name_rules[0].string is not None + and tag.name != self.name_rules[0].string + ): + return False + + # If there are name rules, at least one must match. It can + # match either the Tag object itself or the prefixed name of + # the tag. + prefixed_name = None + if tag.prefix: + prefixed_name = f"{tag.prefix}:{tag.name}" + if self.name_rules: + name_matches = False + for rule in self.name_rules: + # attrs = " ".join( + # [f"{k}={v}" for k, v in sorted(tag.attrs.items())] + # ) + # print(f"Testing <{tag.name} {attrs}>{tag.string}</{tag.name}> against {rule}") + if rule.matches_tag(tag) or ( + prefixed_name is not None and rule.matches_string(prefixed_name) + ): + name_matches = True + break + + if not name_matches: + return False + + # If there are attribute rules for a given attribute, at least + # one of them must match. If there are rules for multiple + # attributes, each attribute must have at least one match. + for attr, rules in self.attribute_rules.items(): + attr_value = tag.get(attr, None) + this_attr_match = self._attribute_match(attr_value, rules) + if not this_attr_match: + return False + + # If there are string rules, at least one must match. + if self.string_rules: + _str = tag.string + if _str is None: + return False + if not self.matches_any_string_rule(_str): + return False + return True + + def _attribute_match( + self, + attr_value: Optional[_AttributeValue], + rules: Iterable[AttributeValueMatchRule], + ) -> bool: + attr_values: Sequence[Optional[str]] + if isinstance(attr_value, list): + attr_values = attr_value + else: + attr_values = [cast(str, attr_value)] + + def _match_attribute_value_helper(attr_values: Sequence[Optional[str]]) -> bool: + for rule in rules: + for attr_value in attr_values: + if rule.matches_string(attr_value): + return True + return False + + this_attr_match = _match_attribute_value_helper(attr_values) + if not this_attr_match and len(attr_values) > 1: + # This cast converts Optional[str] to plain str. + # + # We know if there's more than one value, there can't be + # any None in the list, because Beautiful Soup never uses + # None as a value of a multi-valued attribute, and if None + # is passed in as attr_value, it's turned into a list with + # a single element (thus len(attr_values) > 1 fails). + attr_values = cast(Sequence[str], attr_values) + + # Try again but treat the attribute value + # as a single string. + joined_attr_value = " ".join(attr_values) + this_attr_match = _match_attribute_value_helper([joined_attr_value]) + return this_attr_match + + def allow_tag_creation( + self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues] + ) -> bool: + """Based on the name and attributes of a tag, see whether this + `SoupStrainer` will allow a `Tag` object to even be created. + + :param name: The name of the prospective tag. + :param attrs: The attributes of the prospective tag. + """ + if self.string_rules: + # A SoupStrainer that has string rules can't be used to + # manage tag creation, because the string rule can't be + # evaluated until after the tag and all of its contents + # have been parsed. + return False + prefixed_name = None + if nsprefix: + prefixed_name = f"{nsprefix}:{name}" + if self.name_rules: + # At least one name rule must match. + name_match = False + for rule in self.name_rules: + for x in name, prefixed_name: + if x is not None: + if rule.matches_string(x): + name_match = True + break + if not name_match: + return False + + # For each attribute that has rules, at least one rule must + # match. + if attrs is None: + attrs = AttributeDict() + for attr, rules in self.attribute_rules.items(): + attr_value = attrs.get(attr) + if not self._attribute_match(attr_value, rules): + return False + + return True + + def allow_string_creation(self, string: str) -> bool: + """Based on the content of a markup string, see whether this + `SoupStrainer` will allow it to be instantiated as a + `NavigableString` object, or whether it should be ignored. + """ + if self.name_rules or self.attribute_rules: + # A SoupStrainer that has name or attribute rules won't + # match any strings; it's designed to match tags with + # certain properties. + return False + if not self.string_rules: + # A SoupStrainer with no string rules will match + # all strings. + return True + if not self.matches_any_string_rule(string): + return False + return True + + def matches_any_string_rule(self, string: str) -> bool: + """See whether the content of a string matches any of + this `SoupStrainer`'s string rules. + """ + if not self.string_rules: + return True + for string_rule in self.string_rules: + if string_rule.matches_string(string): + return True + return False + + def match(self, element: PageElement, _known_rules: bool=False) -> bool: + """Does the given `PageElement` match the rules set down by this + `SoupStrainer`? + + The find_* methods rely heavily on this method to find matches. + + :param element: A `PageElement`. + :param _known_rules: Set to true in the common case where + we already checked and found at least one rule in this SoupStrainer + that might exclude a PageElement. Without this, we need + to check .includes_everything every time, just to be safe. + :return: `True` if the element matches this `SoupStrainer`'s rules; `False` otherwise. + """ + # If there are no rules at all, let anything through. + if not _known_rules and self.includes_everything: + return True + if isinstance(element, Tag): + return self.matches_tag(element) + assert isinstance(element, NavigableString) + if not (self.name_rules or self.attribute_rules): + # A NavigableString can only match a SoupStrainer that + # does not define any name or attribute rules. + # Then it comes down to the string rules. + return self.matches_any_string_rule(element) + return False + + @_deprecated("allow_tag_creation", "4.13.0") + def search_tag(self, name: str, attrs: Optional[_RawAttributeValues]) -> bool: + """A less elegant version of `allow_tag_creation`. Deprecated as of 4.13.0""" + ":meta private:" + return self.allow_tag_creation(None, name, attrs) + + @_deprecated("match", "4.13.0") + def search(self, element: PageElement) -> Optional[PageElement]: + """A less elegant version of match(). Deprecated as of 4.13.0. + + :meta private: + """ + return element if self.match(element) else None |