from __future__ import annotations
from collections import defaultdict
import re
from typing import (
Any,
Callable,
cast,
Dict,
Iterator,
Iterable,
List,
Optional,
Sequence,
Type,
Union,
)
import warnings
from bs4._deprecation import _deprecated
from bs4.element import (
AttributeDict,
NavigableString,
PageElement,
ResultSet,
Tag,
)
from bs4._typing import (
_AtMostOneElement,
_AttributeValue,
_OneElement,
_PageElementMatchFunction,
_QueryResults,
_RawAttributeValues,
_RegularExpressionProtocol,
_StrainableAttribute,
_StrainableElement,
_StrainableString,
_StringMatchFunction,
_TagMatchFunction,
)
class ElementFilter(object):
    """`ElementFilter` encapsulates the logic necessary to decide:

    1. whether a `PageElement` (a `Tag` or a `NavigableString`) matches a
       user-specified query.

    2. whether a given sequence of markup found during initial parsing
       should be turned into a `PageElement` at all, or simply discarded.

    The base class is the simplest `ElementFilter`. By default, it
    matches everything and allows all markup to become `PageElement`
    objects. You can make it more selective by passing in a
    user-defined match function, or defining a subclass.

    Most users of Beautiful Soup will never need to use
    `ElementFilter`, or its more capable subclass
    `SoupStrainer`. Instead, they will use methods like
    :py:meth:`Tag.find`, which will convert their arguments into
    `SoupStrainer` objects and run them against the tree.

    However, if you find yourself wanting to treat the arguments to
    Beautiful Soup's find_*() methods as first-class objects, those
    objects will be `SoupStrainer` objects. You can create them
    yourself and then make use of functions like
    `ElementFilter.filter()`.
    """

    match_function: Optional[_PageElementMatchFunction]

    def __init__(self, match_function: Optional[_PageElementMatchFunction] = None):
        """Pass in a match function to easily customize the behavior of
        `ElementFilter.match` without needing to subclass.

        :param match_function: A function that takes a `PageElement`
           and returns `True` if that `PageElement` matches some criteria.
        """
        self.match_function = match_function

    @property
    def includes_everything(self) -> bool:
        """Does this `ElementFilter` obviously include everything? If so,
        the filter process can be made much faster.

        The `ElementFilter` might turn out to include everything even
        if this returns `False`, but it won't include everything in an
        obvious way.

        The base `ElementFilter` implementation includes things based on
        the match function, so includes_everything is only true if
        there is no match function.
        """
        return not self.match_function

    @property
    def excludes_everything(self) -> bool:
        """Does this `ElementFilter` obviously exclude everything? If
        so, Beautiful Soup will issue a warning if you try to use it
        when parsing a document.

        The `ElementFilter` might turn out to exclude everything even
        if this returns `False`, but it won't exclude everything in an
        obvious way.

        The base `ElementFilter` implementation excludes things based
        on a match function we can't inspect, so excludes_everything
        is always false.
        """
        return False

    def match(self, element: PageElement, _known_rules: bool = False) -> bool:
        """Does the given PageElement match the rules set down by this
        ElementFilter?

        The base implementation delegates to the function passed in to
        the constructor.

        :param element: The `PageElement` under consideration.
        :param _known_rules: Defined for compatibility with
            SoupStrainer._match(). Used more for consistency than because
            we need the performance optimization.
        :return: `True` if there is no match function; otherwise
            whatever the match function returns for ``element``.
        """
        if not _known_rules and self.includes_everything:
            return True
        if not self.match_function:
            return True
        return self.match_function(element)

    def filter(self, generator: Iterator[PageElement]) -> Iterator[_OneElement]:
        """The most generic search method offered by Beautiful Soup.

        Acts like Python's built-in `filter`, using
        `ElementFilter.match` as the filtering function.

        :param generator: A way of iterating over `PageElement`
            objects. Any iterable is accepted, not only an iterator.
        :return: A generator over the elements that match. When
            `includes_everything` is true every element is passed
            through untouched; otherwise falsy elements are skipped and
            the rest are tested with `match`.
        """
        # If there are no rules at all, don't bother filtering. Let
        # anything through -- and stop there, rather than falling
        # through to the filtering loop below.
        if self.includes_everything:
            for element in generator:
                yield element
            return

        # A plain for loop works for iterators and for ordinary
        # iterables alike; calling next() directly on the argument
        # would raise TypeError for e.g. a list.
        for element in generator:
            if element and self.match(element, _known_rules=True):
                yield cast("_OneElement", element)

    def find(self, generator: Iterator[PageElement]) -> _AtMostOneElement:
        """A lower-level equivalent of :py:meth:`Tag.find`.

        You can pass in your own generator for iterating over
        `PageElement` objects. The first one that matches this
        `ElementFilter` will be returned.

        :param generator: A way of iterating over `PageElement`
            objects.
        :return: The first match, or `None` if nothing matched.
        """
        for match in self.filter(generator):
            return match
        return None

    def find_all(
        self, generator: Iterator[PageElement], limit: Optional[int] = None
    ) -> _QueryResults:
        """A lower-level equivalent of :py:meth:`Tag.find_all`.

        You can pass in your own generator for iterating over
        `PageElement` objects. Only elements that match this
        `ElementFilter` will be returned in the :py:class:`ResultSet`.

        :param generator: A way of iterating over `PageElement`
            objects.
        :param limit: Stop looking after finding this many results.
        """
        results: _QueryResults = ResultSet(self)
        for match in self.filter(generator):
            results.append(match)
            if limit is not None and len(results) >= limit:
                break
        return results

    def allow_tag_creation(
        self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues]
    ) -> bool:
        """Based on the name and attributes of a tag, see whether this
        `ElementFilter` will allow a `Tag` object to even be created.

        By default, all tags are parsed. To change this, subclass
        `ElementFilter`.

        :param nsprefix: The namespace prefix of the prospective tag,
            if any. Unused by the base implementation.
        :param name: The name of the prospective tag.
        :param attrs: The attributes of the prospective tag.
        """
        return True

    def allow_string_creation(self, string: str) -> bool:
        """Based on the content of a string, see whether this
        `ElementFilter` will allow a `NavigableString` object based on
        this string to be added to the parse tree.

        By default, all strings are processed into `NavigableString`
        objects. To change this, subclass `ElementFilter`.

        :param string: The string under consideration.
        """
        return True
class MatchRule(object):
    """Each MatchRule encapsulates the logic behind a single argument
    passed in to one of the Beautiful Soup find* methods.

    Exactly one of the five criteria (``string``, ``pattern``,
    ``function``, ``present``, ``exclude_everything``) is set on any
    given rule; the constructor enforces this.
    """

    string: Optional[str]
    pattern: Optional[_RegularExpressionProtocol]
    present: Optional[bool]
    exclude_everything: Optional[bool]
    # TODO-TYPING: All MatchRule objects also have an attribute
    # ``function``, but the type of the function depends on the
    # subclass.

    def __init__(
        self,
        string: Optional[Union[str, bytes]] = None,
        pattern: Optional[_RegularExpressionProtocol] = None,
        function: Optional[Callable] = None,
        present: Optional[bool] = None,
        exclude_everything: Optional[bool] = None
    ):
        """Build a rule from exactly one criterion.

        :param string: Match only this exact string. Bytes are decoded
            as UTF-8.
        :param pattern: Match strings in which this regular expression
            is found. A `str` or `bytes` value is compiled first
            (bytes are decoded as UTF-8).
        :param function: Match whatever this callable returns a truthy
            value for.
        :param present: `True` matches any value other than `None`;
            `False` matches only `None`.
        :param exclude_everything: If true, the rule matches nothing.
        :raise ValueError: If none, or more than one, of the criteria
            is provided (i.e. is not `None`).
        """
        if isinstance(string, bytes):
            string = string.decode("utf8")
        self.string = string

        if isinstance(pattern, bytes):
            self.pattern = re.compile(pattern.decode("utf8"))
        elif isinstance(pattern, str):
            self.pattern = re.compile(pattern)
        else:
            self.pattern = pattern

        self.function = function
        self.present = present
        self.exclude_everything = exclude_everything

        # Count the criteria that were actually supplied. False counts
        # as supplied; only None means "not given".
        provided = sum(
            1
            for x in (
                self.string,
                self.pattern,
                self.function,
                self.present,
                self.exclude_everything,
            )
            if x is not None
        )
        if provided == 0:
            raise ValueError(
                "Either string, pattern, function, present, or exclude_everything must be provided."
            )
        if provided > 1:
            raise ValueError(
                "At most one of string, pattern, function, present, and exclude_everything must be provided."
            )

    def _base_match(self, string: Optional[str]) -> Optional[bool]:
        """Run the 'cheap' portion of a match, trying to get an answer without
        calling a potentially expensive custom function.

        :return: True or False if we have a (positive or negative)
          match; None if we need to keep trying.
        """
        # self.exclude_everything matches nothing.
        if self.exclude_everything:
            return False

        # self.present==True matches everything except None.
        if self.present is True:
            return string is not None

        # self.present==False matches _only_ None.
        if self.present is False:
            return string is None

        # self.string does an exact string match.
        if self.string is not None:
            return self.string == string

        # self.pattern does a regular expression search.
        if self.pattern is not None:
            if string is None:
                return False
            return self.pattern.search(string) is not None

        return None

    def matches_string(self, string: Optional[str]) -> bool:
        """Does this rule match the given string (or `None`)?

        The cheap checks in `_base_match` are tried first; the custom
        function is consulted only if they give no answer. If there is
        no function either, the rule matches.
        """
        _base_result = self._base_match(string)
        if _base_result is not None:
            # No need to invoke the test function.
            return _base_result
        if self.function is not None and not self.function(string):
            return False
        return True

    def __repr__(self) -> str:
        cls = type(self).__name__
        return f"<{cls} string={self.string} pattern={self.pattern} function={self.function} present={self.present}>"

    def __eq__(self, other: Any) -> bool:
        """Two rules are equal when every criterion agrees.

        ``exclude_everything`` takes part in the comparison: a rule
        that matches nothing must not compare equal to one that
        doesn't exclude everything.
        """
        return (
            isinstance(other, MatchRule)
            and self.string == other.string
            and self.pattern == other.pattern
            and self.function == other.function
            and self.present == other.present
            and self.exclude_everything == other.exclude_everything
        )
class TagNameMatchRule(MatchRule):
    """A `MatchRule` that is evaluated against the name of a tag."""

    function: Optional[_TagMatchFunction]

    def matches_tag(self, tag: Tag) -> bool:
        """Decide whether this rule accepts ``tag``.

        The cheap checks are run against ``tag.name`` first. Only when
        they are inconclusive is the rule's function invoked, and it
        receives the `Tag` object itself rather than its name.
        """
        verdict = self._base_match(tag.name)
        if verdict is None:
            # Nothing cheap could settle it, so the outcome is whatever
            # the function says about the tag.
            tag_test = cast(_TagMatchFunction, self.function)
            return bool(tag_test(tag))
        return verdict
class AttributeValueMatchRule(MatchRule):
    """A `MatchRule` that is evaluated against the value of a tag attribute."""

    # Narrows the type of the inherited ``function`` attribute; no
    # behavior is added on top of MatchRule.
    function: Optional[_StringMatchFunction]
class StringMatchRule(MatchRule):
    """A `MatchRule` that is evaluated against a `NavigableString`."""

    # Narrows the type of the inherited ``function`` attribute; no
    # behavior is added on top of MatchRule.
    function: Optional[_StringMatchFunction]
class SoupStrainer(ElementFilter):
    """The `ElementFilter` subclass used internally by Beautiful Soup.

    A `SoupStrainer` encapsulates the logic necessary to perform the
    kind of matches supported by methods such as
    :py:meth:`Tag.find`. `SoupStrainer` objects are primarily created
    internally, but you can create one yourself and pass it in as
    ``parse_only`` to the `BeautifulSoup` constructor, to parse a
    subset of a large document.

    Internally, `SoupStrainer` objects work by converting the
    constructor arguments into `MatchRule` objects. Incoming
    tags/markup are matched against those rules.

    :param name: One or more restrictions on the tags found in a document.

    :param attrs: A dictionary that maps attribute names to
      restrictions on tags that use those attributes.

    :param string: One or more restrictions on the strings found in a
      document.

    :param kwargs: A dictionary that maps attribute names to restrictions
      on tags that use those attributes. These restrictions are additive to
      any specified in ``attrs``.
    """

    name_rules: List[TagNameMatchRule]
    attribute_rules: Dict[str, List[AttributeValueMatchRule]]
    string_rules: List[StringMatchRule]

    def __init__(
        self,
        name: Optional[_StrainableElement] = None,
        # NOTE: the mutable default is kept on purpose. It is only read,
        # never mutated, and an explicit ``attrs=None`` already means
        # something different (see the non-dict handling below).
        attrs: Dict[str, _StrainableAttribute] = {},
        string: Optional[_StrainableString] = None,
        **kwargs: _StrainableAttribute,
    ):
        if string is None and "text" in kwargs:
            string = cast(Optional[_StrainableString], kwargs.pop("text"))
            warnings.warn(
                "As of version 4.11.0, the 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
                DeprecationWarning,
                stacklevel=2,
            )

        if name is None and not attrs and not string and not kwargs:
            # Special case for backwards compatibility. Instantiating
            # a SoupStrainer with no arguments whatsoever gets you one
            # that matches all Tags, and only Tags.
            self.name_rules = [TagNameMatchRule(present=True)]
        else:
            self.name_rules = cast(
                List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
            )
        self.attribute_rules = defaultdict(list)

        if not isinstance(attrs, dict):
            # Passing something other than a dictionary as attrs is
            # sugar for matching that thing against the 'class'
            # attribute.
            attrs = {"class": attrs}

        for attrdict in attrs, kwargs:
            for attr, value in attrdict.items():
                if attr == "class_" and attrdict is kwargs:
                    # If you pass in 'class_' as part of kwargs, it's
                    # because class is a Python reserved word. If you
                    # pass it in as part of the attrs dict, it's
                    # because you really are looking for an attribute
                    # called 'class_'.
                    attr = "class"

                if value is None:
                    # None is shorthand for "this attribute must be
                    # absent", which is what present=False expresses.
                    value = False
                for rule_obj in self._make_match_rules(value, AttributeValueMatchRule):
                    self.attribute_rules[attr].append(
                        cast(AttributeValueMatchRule, rule_obj)
                    )

        self.string_rules = cast(
            List[StringMatchRule], list(self._make_match_rules(string, StringMatchRule))
        )

        #: DEPRECATED 4.13.0: You shouldn't need to check this under
        #: any name (.string or .text), and if you do, you're probably
        #: not taking into account all of the types of values this
        #: variable might have. Look at the .string_rules list instead.
        self.__string = string

    @property
    def includes_everything(self) -> bool:
        """Check whether the provided rules will obviously include
        everything. (They might include everything even if this returns `False`,
        but not in an obvious way.)
        """
        return not self.name_rules and not self.string_rules and not self.attribute_rules

    @property
    def excludes_everything(self) -> bool:
        """Check whether the provided rules will obviously exclude
        everything. (They might exclude everything even if this returns `False`,
        but not in an obvious way.)
        """
        if (self.string_rules and (self.name_rules or self.attribute_rules)):
            # This is self-contradictory, so the rules exclude everything.
            return True

        # If there's a rule that ended up treated as an "exclude everything"
        # rule due to creating a logical inconsistency, then the rules
        # exclude everything.
        if any(x.exclude_everything for x in self.string_rules):
            return True
        if any(x.exclude_everything for x in self.name_rules):
            return True
        for ruleset in self.attribute_rules.values():
            if any(x.exclude_everything for x in ruleset):
                return True
        return False

    @property
    def string(self) -> Optional[_StrainableString]:
        ":meta private:"
        warnings.warn(
            "Access to deprecated property string. (Look at .string_rules instead) -- Deprecated since version 4.13.0.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.__string

    @property
    def text(self) -> Optional[_StrainableString]:
        ":meta private:"
        warnings.warn(
            "Access to deprecated property text. (Look at .string_rules instead) -- Deprecated since version 4.13.0.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.__string

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} name={self.name_rules} attrs={self.attribute_rules} string={self.string_rules}>"

    @classmethod
    def _make_match_rules(
        cls,
        obj: Optional[Union[_StrainableElement, _StrainableAttribute]],
        rule_class: Type[MatchRule],
    ) -> Iterator[MatchRule]:
        """Convert a vaguely-specific 'object' into one or more well-defined
        `MatchRule` objects.

        :param obj: Some kind of object that corresponds to one or more
           matching rules.
        :param rule_class: Create instances of this `MatchRule` subclass.
        :return: A generator of rules. Nothing is yielded for `None`.
        """
        if obj is None:
            return
        if isinstance(obj, (str, bytes)):
            yield rule_class(string=obj)
        elif isinstance(obj, bool):
            yield rule_class(present=obj)
        elif callable(obj):
            yield rule_class(function=obj)
        elif isinstance(obj, _RegularExpressionProtocol):
            yield rule_class(pattern=obj)
        elif hasattr(obj, "__iter__"):
            if not obj:
                # The attribute is being matched against the null set,
                # which means it should exclude everything.
                yield rule_class(exclude_everything=True)
            for o in obj:
                if not isinstance(o, (bytes, str)) and hasattr(o, "__iter__"):
                    # This is almost certainly the user's
                    # mistake. This list contains another list, which
                    # opens up the possibility of infinite
                    # self-reference. In the interests of avoiding
                    # infinite recursion, we'll treat this as an
                    # impossible match and issue a rule that excludes
                    # everything, rather than looking inside.
                    warnings.warn(
                        f"Ignoring nested list {o} to avoid the possibility of infinite recursion.",
                        stacklevel=5,
                    )
                    yield rule_class(exclude_everything=True)
                    continue
                for x in cls._make_match_rules(o, rule_class):
                    yield x
        else:
            # Anything else is matched by its string representation.
            yield rule_class(string=str(obj))

    def matches_tag(self, tag: Tag) -> bool:
        """Do the rules of this `SoupStrainer` trigger a match against the
        given `Tag`?

        If the `SoupStrainer` has any `TagNameMatchRule`, at least one
        must match the `Tag` or its `Tag.name`.

        If there are any `AttributeValueMatchRule` for a given
        attribute, at least one of them must match the attribute
        value.

        If there are any `StringMatchRule`, at least one must match,
        but a `SoupStrainer` that *only* contains `StringMatchRule`
        cannot match a `Tag`, only a `NavigableString`.
        """
        # String rules cannot match a Tag on their own.
        if not self.name_rules and not self.attribute_rules:
            return False

        # Optimization for a very common case where the user is
        # searching for a tag with one specific name, and we're
        # looking at a tag with a different name.
        if (
            not tag.prefix
            and len(self.name_rules) == 1
            and self.name_rules[0].string is not None
            and tag.name != self.name_rules[0].string
        ):
            return False

        # If there are name rules, at least one must match. It can
        # match either the Tag object itself or the prefixed name of
        # the tag.
        prefixed_name = None
        if tag.prefix:
            prefixed_name = f"{tag.prefix}:{tag.name}"
        if self.name_rules:
            name_matches = False
            for rule in self.name_rules:
                if rule.matches_tag(tag) or (
                    prefixed_name is not None and rule.matches_string(prefixed_name)
                ):
                    name_matches = True
                    break

            if not name_matches:
                return False

        # If there are attribute rules for a given attribute, at least
        # one of them must match. If there are rules for multiple
        # attributes, each attribute must have at least one match.
        for attr, rules in self.attribute_rules.items():
            attr_value = tag.get(attr, None)
            this_attr_match = self._attribute_match(attr_value, rules)
            if not this_attr_match:
                return False

        # If there are string rules, at least one must match.
        if self.string_rules:
            _str = tag.string
            if _str is None:
                return False
            if not self.matches_any_string_rule(_str):
                return False
        return True

    def _attribute_match(
        self,
        attr_value: Optional[_AttributeValue],
        rules: Iterable[AttributeValueMatchRule],
    ) -> bool:
        """Does at least one of ``rules`` match ``attr_value``?

        A list value is tested element by element; if that fails and
        the list has more than one element, the elements are joined
        with spaces and tested once more as a single string.
        """
        attr_values: Sequence[Optional[str]]
        if isinstance(attr_value, list):
            attr_values = attr_value
        else:
            attr_values = [cast(str, attr_value)]

        def _match_attribute_value_helper(attr_values: Sequence[Optional[str]]) -> bool:
            for rule in rules:
                for attr_value in attr_values:
                    if rule.matches_string(attr_value):
                        return True
            return False

        this_attr_match = _match_attribute_value_helper(attr_values)
        if not this_attr_match and len(attr_values) > 1:
            # This cast converts Optional[str] to plain str.
            #
            # We know if there's more than one value, there can't be
            # any None in the list, because Beautiful Soup never uses
            # None as a value of a multi-valued attribute, and if None
            # is passed in as attr_value, it's turned into a list with
            # a single element (thus len(attr_values) > 1 fails).
            attr_values = cast(Sequence[str], attr_values)

            # Try again but treat the attribute value
            # as a single string.
            joined_attr_value = " ".join(attr_values)
            this_attr_match = _match_attribute_value_helper([joined_attr_value])
        return this_attr_match

    def allow_tag_creation(
        self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues]
    ) -> bool:
        """Based on the name and attributes of a tag, see whether this
        `SoupStrainer` will allow a `Tag` object to even be created.

        :param nsprefix: The namespace prefix of the prospective tag,
            if any. When present, name rules are also tried against
            ``"prefix:name"``.
        :param name: The name of the prospective tag.
        :param attrs: The attributes of the prospective tag.
        """
        if self.string_rules:
            # A SoupStrainer that has string rules can't be used to
            # manage tag creation, because the string rule can't be
            # evaluated until after the tag and all of its contents
            # have been parsed.
            return False
        prefixed_name = None
        if nsprefix:
            prefixed_name = f"{nsprefix}:{name}"
        if self.name_rules:
            # At least one name rule must match, against either the
            # bare or the prefixed name. any() stops at the first hit,
            # so later rules (and any user-supplied functions behind
            # them) are not evaluated once the answer is known.
            candidates = [x for x in (name, prefixed_name) if x is not None]
            if not any(
                rule.matches_string(candidate)
                for rule in self.name_rules
                for candidate in candidates
            ):
                return False

        # For each attribute that has rules, at least one rule must
        # match.
        if attrs is None:
            attrs = AttributeDict()
        for attr, rules in self.attribute_rules.items():
            attr_value = attrs.get(attr)
            if not self._attribute_match(attr_value, rules):
                return False

        return True

    def allow_string_creation(self, string: str) -> bool:
        """Based on the content of a markup string, see whether this
        `SoupStrainer` will allow it to be instantiated as a
        `NavigableString` object, or whether it should be ignored.
        """
        if self.name_rules or self.attribute_rules:
            # A SoupStrainer that has name or attribute rules won't
            # match any strings; it's designed to match tags with
            # certain properties.
            return False
        if not self.string_rules:
            # A SoupStrainer with no string rules will match
            # all strings.
            return True
        if not self.matches_any_string_rule(string):
            return False
        return True

    def matches_any_string_rule(self, string: str) -> bool:
        """See whether the content of a string matches any of
        this `SoupStrainer`'s string rules.

        With no string rules at all, every string matches.
        """
        if not self.string_rules:
            return True
        for string_rule in self.string_rules:
            if string_rule.matches_string(string):
                return True
        return False

    def match(self, element: PageElement, _known_rules: bool = False) -> bool:
        """Does the given `PageElement` match the rules set down by this
        `SoupStrainer`?

        The find_* methods rely heavily on this method to find matches.

        :param element: A `PageElement`.
        :param _known_rules: Set to true in the common case where
           we already checked and found at least one rule in this SoupStrainer
           that might exclude a PageElement. Without this, we need
           to check .includes_everything every time, just to be safe.
        :return: `True` if the element matches this `SoupStrainer`'s rules; `False` otherwise.
        """
        # If there are no rules at all, let anything through.
        if not _known_rules and self.includes_everything:
            return True
        if isinstance(element, Tag):
            return self.matches_tag(element)
        assert isinstance(element, NavigableString)
        if not (self.name_rules or self.attribute_rules):
            # A NavigableString can only match a SoupStrainer that
            # does not define any name or attribute rules.
            # Then it comes down to the string rules.
            return self.matches_any_string_rule(element)
        return False

    @_deprecated("allow_tag_creation", "4.13.0")
    def search_tag(self, name: str, attrs: Optional[_RawAttributeValues]) -> bool:
        """A less elegant version of `allow_tag_creation`. Deprecated as of 4.13.0

        :meta private:
        """
        return self.allow_tag_creation(None, name, attrs)

    @_deprecated("match", "4.13.0")
    def search(self, element: PageElement) -> Optional[PageElement]:
        """A less elegant version of match(). Deprecated as of 4.13.0.

        :meta private:
        """
        return element if self.match(element) else None