diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/soupsieve | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/soupsieve')
-rw-r--r-- | .venv/lib/python3.12/site-packages/soupsieve/__init__.py | 168 | ||||
-rw-r--r-- | .venv/lib/python3.12/site-packages/soupsieve/__meta__.py | 197 | ||||
-rw-r--r-- | .venv/lib/python3.12/site-packages/soupsieve/css_match.py | 1582 | ||||
-rw-r--r-- | .venv/lib/python3.12/site-packages/soupsieve/css_parser.py | 1289 | ||||
-rw-r--r-- | .venv/lib/python3.12/site-packages/soupsieve/css_types.py | 407 | ||||
-rw-r--r-- | .venv/lib/python3.12/site-packages/soupsieve/pretty.py | 139 | ||||
-rw-r--r-- | .venv/lib/python3.12/site-packages/soupsieve/py.typed | 0 | ||||
-rw-r--r-- | .venv/lib/python3.12/site-packages/soupsieve/util.py | 117 |
8 files changed, 3899 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/soupsieve/__init__.py b/.venv/lib/python3.12/site-packages/soupsieve/__init__.py new file mode 100644 index 00000000..45730dfa --- /dev/null +++ b/.venv/lib/python3.12/site-packages/soupsieve/__init__.py @@ -0,0 +1,168 @@ +""" +Soup Sieve. + +A CSS selector filter for BeautifulSoup4. + +MIT License + +Copyright (c) 2018 Isaac Muse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +from __future__ import annotations +from .__meta__ import __version__, __version_info__ # noqa: F401 +from . import css_parser as cp +from . import css_match as cm +from . 
import css_types as ct +from .util import DEBUG, SelectorSyntaxError # noqa: F401 +import bs4 # type: ignore[import-untyped] +from typing import Any, Iterator, Iterable + +__all__ = ( + 'DEBUG', 'SelectorSyntaxError', 'SoupSieve', + 'closest', 'compile', 'filter', 'iselect', + 'match', 'select', 'select_one' +) + +SoupSieve = cm.SoupSieve + + +def compile( # noqa: A001 + pattern: str, + namespaces: dict[str, str] | None = None, + flags: int = 0, + *, + custom: dict[str, str] | None = None, + **kwargs: Any +) -> cm.SoupSieve: + """Compile CSS pattern.""" + + if isinstance(pattern, SoupSieve): + if flags: + raise ValueError("Cannot process 'flags' argument on a compiled selector list") + elif namespaces is not None: + raise ValueError("Cannot process 'namespaces' argument on a compiled selector list") + elif custom is not None: + raise ValueError("Cannot process 'custom' argument on a compiled selector list") + return pattern + + return cp._cached_css_compile( + pattern, + ct.Namespaces(namespaces) if namespaces is not None else namespaces, + ct.CustomSelectors(custom) if custom is not None else custom, + flags + ) + + +def purge() -> None: + """Purge cached patterns.""" + + cp._purge_cache() + + +def closest( + select: str, + tag: bs4.Tag, + namespaces: dict[str, str] | None = None, + flags: int = 0, + *, + custom: dict[str, str] | None = None, + **kwargs: Any +) -> bs4.Tag: + """Match closest ancestor.""" + + return compile(select, namespaces, flags, **kwargs).closest(tag) + + +def match( + select: str, + tag: bs4.Tag, + namespaces: dict[str, str] | None = None, + flags: int = 0, + *, + custom: dict[str, str] | None = None, + **kwargs: Any +) -> bool: + """Match node.""" + + return compile(select, namespaces, flags, **kwargs).match(tag) + + +def filter( # noqa: A001 + select: str, + iterable: Iterable[bs4.Tag], + namespaces: dict[str, str] | None = None, + flags: int = 0, + *, + custom: dict[str, str] | None = None, + **kwargs: Any +) -> list[bs4.Tag]: + """Filter 
list of nodes.""" + + return compile(select, namespaces, flags, **kwargs).filter(iterable) + + +def select_one( + select: str, + tag: bs4.Tag, + namespaces: dict[str, str] | None = None, + flags: int = 0, + *, + custom: dict[str, str] | None = None, + **kwargs: Any +) -> bs4.Tag: + """Select a single tag.""" + + return compile(select, namespaces, flags, **kwargs).select_one(tag) + + +def select( + select: str, + tag: bs4.Tag, + namespaces: dict[str, str] | None = None, + limit: int = 0, + flags: int = 0, + *, + custom: dict[str, str] | None = None, + **kwargs: Any +) -> list[bs4.Tag]: + """Select the specified tags.""" + + return compile(select, namespaces, flags, **kwargs).select(tag, limit) + + +def iselect( + select: str, + tag: bs4.Tag, + namespaces: dict[str, str] | None = None, + limit: int = 0, + flags: int = 0, + *, + custom: dict[str, str] | None = None, + **kwargs: Any +) -> Iterator[bs4.Tag]: + """Iterate the specified tags.""" + + yield from compile(select, namespaces, flags, **kwargs).iselect(tag, limit) + + +def escape(ident: str) -> str: + """Escape identifier.""" + + return cp.escape(ident) diff --git a/.venv/lib/python3.12/site-packages/soupsieve/__meta__.py b/.venv/lib/python3.12/site-packages/soupsieve/__meta__.py new file mode 100644 index 00000000..0fbf71b0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/soupsieve/__meta__.py @@ -0,0 +1,197 @@ +"""Meta related things.""" +from __future__ import annotations +from collections import namedtuple +import re + +RE_VER = re.compile( + r'''(?x) + (?P<major>\d+)(?:\.(?P<minor>\d+))?(?:\.(?P<micro>\d+))? + (?:(?P<type>a|b|rc)(?P<pre>\d+))? + (?:\.post(?P<post>\d+))? + (?:\.dev(?P<dev>\d+))? 
+ ''' +) + +REL_MAP = { + ".dev": "", + ".dev-alpha": "a", + ".dev-beta": "b", + ".dev-candidate": "rc", + "alpha": "a", + "beta": "b", + "candidate": "rc", + "final": "" +} + +DEV_STATUS = { + ".dev": "2 - Pre-Alpha", + ".dev-alpha": "2 - Pre-Alpha", + ".dev-beta": "2 - Pre-Alpha", + ".dev-candidate": "2 - Pre-Alpha", + "alpha": "3 - Alpha", + "beta": "4 - Beta", + "candidate": "4 - Beta", + "final": "5 - Production/Stable" +} + +PRE_REL_MAP = {"a": 'alpha', "b": 'beta', "rc": 'candidate'} + + +class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre", "post", "dev"])): + """ + Get the version (PEP 440). + + A biased approach to the PEP 440 semantic version. + + Provides a tuple structure which is sorted for comparisons `v1 > v2` etc. + (major, minor, micro, release type, pre-release build, post-release build, development release build) + Release types are named in is such a way they are comparable with ease. + Accessors to check if a development, pre-release, or post-release build. Also provides accessor to get + development status for setup files. + + How it works (currently): + + - You must specify a release type as either `final`, `alpha`, `beta`, or `candidate`. + - To define a development release, you can use either `.dev`, `.dev-alpha`, `.dev-beta`, or `.dev-candidate`. + The dot is used to ensure all development specifiers are sorted before `alpha`. + You can specify a `dev` number for development builds, but do not have to as implicit development releases + are allowed. + - You must specify a `pre` value greater than zero if using a prerelease as this project (not PEP 440) does not + allow implicit prereleases. + - You can optionally set `post` to a value greater than zero to make the build a post release. While post releases + are technically allowed in prereleases, it is strongly discouraged, so we are rejecting them. It should be + noted that we do not allow `post0` even though PEP 440 does not restrict this. 
This project specifically + does not allow implicit post releases. + - It should be noted that we do not support epochs `1!` or local versions `+some-custom.version-1`. + + Acceptable version releases: + + ``` + Version(1, 0, 0, "final") 1.0 + Version(1, 2, 0, "final") 1.2 + Version(1, 2, 3, "final") 1.2.3 + Version(1, 2, 0, ".dev-alpha", pre=4) 1.2a4 + Version(1, 2, 0, ".dev-beta", pre=4) 1.2b4 + Version(1, 2, 0, ".dev-candidate", pre=4) 1.2rc4 + Version(1, 2, 0, "final", post=1) 1.2.post1 + Version(1, 2, 3, ".dev") 1.2.3.dev0 + Version(1, 2, 3, ".dev", dev=1) 1.2.3.dev1 + ``` + + """ + + def __new__( + cls, + major: int, minor: int, micro: int, release: str = "final", + pre: int = 0, post: int = 0, dev: int = 0 + ) -> Version: + """Validate version info.""" + + # Ensure all parts are positive integers. + for value in (major, minor, micro, pre, post): + if not (isinstance(value, int) and value >= 0): + raise ValueError("All version parts except 'release' should be integers.") + + if release not in REL_MAP: + raise ValueError(f"'{release}' is not a valid release type.") + + # Ensure valid pre-release (we do not allow implicit pre-releases). 
+ if ".dev-candidate" < release < "final": + if pre == 0: + raise ValueError("Implicit pre-releases not allowed.") + elif dev: + raise ValueError("Version is not a development release.") + elif post: + raise ValueError("Post-releases are not allowed with pre-releases.") + + # Ensure valid development or development/pre release + elif release < "alpha": + if release > ".dev" and pre == 0: + raise ValueError("Implicit pre-release not allowed.") + elif post: + raise ValueError("Post-releases are not allowed with pre-releases.") + + # Ensure a valid normal release + else: + if pre: + raise ValueError("Version is not a pre-release.") + elif dev: + raise ValueError("Version is not a development release.") + + return super().__new__(cls, major, minor, micro, release, pre, post, dev) + + def _is_pre(self) -> bool: + """Is prerelease.""" + + return bool(self.pre > 0) + + def _is_dev(self) -> bool: + """Is development.""" + + return bool(self.release < "alpha") + + def _is_post(self) -> bool: + """Is post.""" + + return bool(self.post > 0) + + def _get_dev_status(self) -> str: # pragma: no cover + """Get development status string.""" + + return DEV_STATUS[self.release] + + def _get_canonical(self) -> str: + """Get the canonical output string.""" + + # Assemble major, minor, micro version and append `pre`, `post`, or `dev` if needed.. 
+ if self.micro == 0: + ver = f"{self.major}.{self.minor}" + else: + ver = f"{self.major}.{self.minor}.{self.micro}" + if self._is_pre(): + ver += f'{REL_MAP[self.release]}{self.pre}' + if self._is_post(): + ver += f".post{self.post}" + if self._is_dev(): + ver += f".dev{self.dev}" + + return ver + + +def parse_version(ver: str) -> Version: + """Parse version into a comparable Version tuple.""" + + m = RE_VER.match(ver) + + if m is None: + raise ValueError(f"'{ver}' is not a valid version") + + # Handle major, minor, micro + major = int(m.group('major')) + minor = int(m.group('minor')) if m.group('minor') else 0 + micro = int(m.group('micro')) if m.group('micro') else 0 + + # Handle pre releases + if m.group('type'): + release = PRE_REL_MAP[m.group('type')] + pre = int(m.group('pre')) + else: + release = "final" + pre = 0 + + # Handle development releases + dev = m.group('dev') if m.group('dev') else 0 + if m.group('dev'): + dev = int(m.group('dev')) + release = '.dev-' + release if pre else '.dev' + else: + dev = 0 + + # Handle post + post = int(m.group('post')) if m.group('post') else 0 + + return Version(major, minor, micro, release, pre, post, dev) + + +__version_info__ = Version(2, 6, 0, "final") +__version__ = __version_info__._get_canonical() diff --git a/.venv/lib/python3.12/site-packages/soupsieve/css_match.py b/.venv/lib/python3.12/site-packages/soupsieve/css_match.py new file mode 100644 index 00000000..e52e42d5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/soupsieve/css_match.py @@ -0,0 +1,1582 @@ +"""CSS matcher.""" +from __future__ import annotations +from datetime import datetime +from . import util +import re +from . 
import css_types as ct +import unicodedata +import bs4 # type: ignore[import-untyped] +from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401 + +# Empty tag pattern (whitespace okay) +RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') + +RE_NOT_WS = re.compile('[^ \t\r\n\f]+') + +# Relationships +REL_PARENT = ' ' +REL_CLOSE_PARENT = '>' +REL_SIBLING = '~' +REL_CLOSE_SIBLING = '+' + +# Relationships for :has() (forward looking) +REL_HAS_PARENT = ': ' +REL_HAS_CLOSE_PARENT = ':>' +REL_HAS_SIBLING = ':~' +REL_HAS_CLOSE_SIBLING = ':+' + +NS_XHTML = 'http://www.w3.org/1999/xhtml' +NS_XML = 'http://www.w3.org/XML/1998/namespace' + +DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL +RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE + +DIR_MAP = { + 'ltr': ct.SEL_DIR_LTR, + 'rtl': ct.SEL_DIR_RTL, + 'auto': 0 +} + +RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$") +RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$') +RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$') +RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$') +RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$') +RE_DATETIME = re.compile( + r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' +) +RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)') + +MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November +FEB = 2 +SHORT_MONTH = 30 +LONG_MONTH = 31 +FEB_MONTH = 28 +FEB_LEAP_MONTH = 29 +DAYS_IN_WEEK = 7 + + +class _FakeParent: + """ + Fake parent class. + + When we have a fragment with no `BeautifulSoup` document object, + we can't evaluate `nth` selectors properly. Create a temporary + fake parent so we can traverse the root element as a child. 
+ """ + + def __init__(self, element: bs4.Tag) -> None: + """Initialize.""" + + self.contents = [element] + + def __len__(self) -> bs4.PageElement: + """Length.""" + + return len(self.contents) + + +class _DocumentNav: + """Navigate a Beautiful Soup document.""" + + @classmethod + def assert_valid_input(cls, tag: Any) -> None: + """Check if valid input tag or document.""" + + # Fail on unexpected types. + if not cls.is_tag(tag): + raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}") + + @staticmethod + def is_doc(obj: bs4.Tag) -> bool: + """Is `BeautifulSoup` object.""" + return isinstance(obj, bs4.BeautifulSoup) + + @staticmethod + def is_tag(obj: bs4.PageElement) -> bool: + """Is tag.""" + return isinstance(obj, bs4.Tag) + + @staticmethod + def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover + """Is declaration.""" + return isinstance(obj, bs4.Declaration) + + @staticmethod + def is_cdata(obj: bs4.PageElement) -> bool: + """Is CDATA.""" + return isinstance(obj, bs4.CData) + + @staticmethod + def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover + """Is processing instruction.""" + return isinstance(obj, bs4.ProcessingInstruction) + + @staticmethod + def is_navigable_string(obj: bs4.PageElement) -> bool: + """Is navigable string.""" + return isinstance(obj, bs4.NavigableString) + + @staticmethod + def is_special_string(obj: bs4.PageElement) -> bool: + """Is special string.""" + return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) + + @classmethod + def is_content_string(cls, obj: bs4.PageElement) -> bool: + """Check if node is content string.""" + + return cls.is_navigable_string(obj) and not cls.is_special_string(obj) + + @staticmethod + def create_fake_parent(el: bs4.Tag) -> _FakeParent: + """Create fake parent for a given element.""" + + return _FakeParent(el) + + @staticmethod + def is_xml_tree(el: bs4.Tag) -> bool: + """Check if 
element (or document) is from a XML tree.""" + + return bool(el._is_xml) + + def is_iframe(self, el: bs4.Tag) -> bool: + """Check if element is an `iframe`.""" + + return bool( + ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and + self.is_html_tag(el) # type: ignore[attr-defined] + ) + + def is_root(self, el: bs4.Tag) -> bool: + """ + Return whether element is a root element. + + We check that the element is the root of the tree (which we have already pre-calculated), + and we check if it is the root element under an `iframe`. + """ + + root = self.root and self.root is el # type: ignore[attr-defined] + if not root: + parent = self.get_parent(el) + root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined] + return root + + def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]: + """Get contents or contents in reverse.""" + if not no_iframe or not self.is_iframe(el): + yield from el.contents + + def get_children( + self, + el: bs4.Tag, + start: int | None = None, + reverse: bool = False, + tags: bool = True, + no_iframe: bool = False + ) -> Iterator[bs4.PageElement]: + """Get children.""" + + if not no_iframe or not self.is_iframe(el): + last = len(el.contents) - 1 + if start is None: + index = last if reverse else 0 + else: + index = start + end = -1 if reverse else last + 1 + incr = -1 if reverse else 1 + + if 0 <= index <= last: + while index != end: + node = el.contents[index] + index += incr + if not tags or self.is_tag(node): + yield node + + def get_descendants( + self, + el: bs4.Tag, + tags: bool = True, + no_iframe: bool = False + ) -> Iterator[bs4.PageElement]: + """Get descendants.""" + + if not no_iframe or not self.is_iframe(el): + next_good = None + for child in el.descendants: + + if next_good is not None: + if child is not next_good: + continue + next_good = None + + is_tag = self.is_tag(child) + + if no_iframe and is_tag and self.is_iframe(child): 
+ if child.next_sibling is not None: + next_good = child.next_sibling + else: + last_child = child + while self.is_tag(last_child) and last_child.contents: + last_child = last_child.contents[-1] + next_good = last_child.next_element + yield child + if next_good is None: + break + # Coverage isn't seeing this even though it's executed + continue # pragma: no cover + + if not tags or is_tag: + yield child + + def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag: + """Get parent.""" + + parent = el.parent + if no_iframe and parent is not None and self.is_iframe(parent): + parent = None + return parent + + @staticmethod + def get_tag_name(el: bs4.Tag) -> str | None: + """Get tag.""" + + return cast('str | None', el.name) + + @staticmethod + def get_prefix_name(el: bs4.Tag) -> str | None: + """Get prefix.""" + + return cast('str | None', el.prefix) + + @staticmethod + def get_uri(el: bs4.Tag) -> str | None: + """Get namespace `URI`.""" + + return cast('str | None', el.namespace) + + @classmethod + def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: + """Get next sibling tag.""" + + sibling = el.next_sibling + while tags and not cls.is_tag(sibling) and sibling is not None: + sibling = sibling.next_sibling + return sibling + + @classmethod + def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: + """Get previous sibling tag.""" + + sibling = el.previous_sibling + while tags and not cls.is_tag(sibling) and sibling is not None: + sibling = sibling.previous_sibling + return sibling + + @staticmethod + def has_html_ns(el: bs4.Tag) -> bool: + """ + Check if element has an HTML namespace. + + This is a bit different than whether a element is treated as having an HTML namespace, + like we do in the case of `is_html_tag`. 
+ """ + + ns = getattr(el, 'namespace') if el else None # noqa: B009 + return bool(ns and ns == NS_XHTML) + + @staticmethod + def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]: + """Return namespace and attribute name without the prefix.""" + + return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) + + @classmethod + def normalize_value(cls, value: Any) -> str | Sequence[str]: + """Normalize the value to be a string or list of strings.""" + + # Treat `None` as empty string. + if value is None: + return '' + + # Pass through strings + if (isinstance(value, str)): + return value + + # If it's a byte string, convert it to Unicode, treating it as UTF-8. + if isinstance(value, bytes): + return value.decode("utf8") + + # BeautifulSoup supports sequences of attribute values, so make sure the children are strings. + if isinstance(value, Sequence): + new_value = [] + for v in value: + if not isinstance(v, (str, bytes)) and isinstance(v, Sequence): + # This is most certainly a user error and will crash and burn later. + # To keep things working, we'll do what we do with all objects, + # And convert them to strings. 
+ new_value.append(str(v)) + else: + # Convert the child to a string + new_value.append(cast(str, cls.normalize_value(v))) + return new_value + + # Try and make anything else a string + return str(value) + + @classmethod + def get_attribute_by_name( + cls, + el: bs4.Tag, + name: str, + default: str | Sequence[str] | None = None + ) -> str | Sequence[str] | None: + """Get attribute by name.""" + + value = default + if el._is_xml: + try: + value = cls.normalize_value(el.attrs[name]) + except KeyError: + pass + else: + for k, v in el.attrs.items(): + if util.lower(k) == name: + value = cls.normalize_value(v) + break + return value + + @classmethod + def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]: + """Iterate attributes.""" + + for k, v in el.attrs.items(): + yield k, cls.normalize_value(v) + + @classmethod + def get_classes(cls, el: bs4.Tag) -> Sequence[str]: + """Get classes.""" + + classes = cls.get_attribute_by_name(el, 'class', []) + if isinstance(classes, str): + classes = RE_NOT_WS.findall(classes) + return cast(Sequence[str], classes) + + def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str: + """Get text.""" + + return ''.join( + [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)] + ) + + def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]: + """Get Own Text.""" + + return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)] + + +class Inputs: + """Class for parsing and validating input items.""" + + @staticmethod + def validate_day(year: int, month: int, day: int) -> bool: + """Validate day.""" + + max_days = LONG_MONTH + if month == FEB: + max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH + elif month in MONTHS_30: + max_days = SHORT_MONTH + return 1 <= day <= max_days + + @staticmethod + def validate_week(year: int, week: int) -> 
bool: + """Validate week.""" + + max_week = datetime.strptime(f"{12}-{31}-{year}", "%m-%d-%Y").isocalendar()[1] + if max_week == 1: + max_week = 53 + return 1 <= week <= max_week + + @staticmethod + def validate_month(month: int) -> bool: + """Validate month.""" + + return 1 <= month <= 12 + + @staticmethod + def validate_year(year: int) -> bool: + """Validate year.""" + + return 1 <= year + + @staticmethod + def validate_hour(hour: int) -> bool: + """Validate hour.""" + + return 0 <= hour <= 23 + + @staticmethod + def validate_minutes(minutes: int) -> bool: + """Validate minutes.""" + + return 0 <= minutes <= 59 + + @classmethod + def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None: + """Parse the input value.""" + + parsed = None # type: tuple[float, ...] | None + if value is None: + return value + if itype == "date": + m = RE_DATE.match(value) + if m: + year = int(m.group('year'), 10) + month = int(m.group('month'), 10) + day = int(m.group('day'), 10) + if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day): + parsed = (year, month, day) + elif itype == "month": + m = RE_MONTH.match(value) + if m: + year = int(m.group('year'), 10) + month = int(m.group('month'), 10) + if cls.validate_year(year) and cls.validate_month(month): + parsed = (year, month) + elif itype == "week": + m = RE_WEEK.match(value) + if m: + year = int(m.group('year'), 10) + week = int(m.group('week'), 10) + if cls.validate_year(year) and cls.validate_week(year, week): + parsed = (year, week) + elif itype == "time": + m = RE_TIME.match(value) + if m: + hour = int(m.group('hour'), 10) + minutes = int(m.group('minutes'), 10) + if cls.validate_hour(hour) and cls.validate_minutes(minutes): + parsed = (hour, minutes) + elif itype == "datetime-local": + m = RE_DATETIME.match(value) + if m: + year = int(m.group('year'), 10) + month = int(m.group('month'), 10) + day = int(m.group('day'), 10) + hour = int(m.group('hour'), 10) + minutes 
= int(m.group('minutes'), 10) + if ( + cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and + cls.validate_hour(hour) and cls.validate_minutes(minutes) + ): + parsed = (year, month, day, hour, minutes) + elif itype in ("number", "range"): + m = RE_NUM.match(value) + if m: + parsed = (float(m.group('value')),) + return parsed + + +class CSSMatch(_DocumentNav): + """Perform CSS matching.""" + + def __init__( + self, + selectors: ct.SelectorList, + scope: bs4.Tag, + namespaces: ct.Namespaces | None, + flags: int + ) -> None: + """Initialize.""" + + self.assert_valid_input(scope) + self.tag = scope + self.cached_meta_lang = [] # type: list[tuple[str, str]] + self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]] + self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]] + self.selectors = selectors + self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str] + self.flags = flags + self.iframe_restrict = False + + # Find the root element for the whole tree + doc = scope + parent = self.get_parent(doc) + while parent: + doc = parent + parent = self.get_parent(doc) + root = None + if not self.is_doc(doc): + root = doc + else: + for child in self.get_children(doc): + root = child + break + + self.root = root + self.scope = scope if scope is not doc else root + self.has_html_namespace = self.has_html_ns(root) + + # A document can be both XML and HTML (XHTML) + self.is_xml = self.is_xml_tree(doc) + self.is_html = not self.is_xml or self.has_html_namespace + + def supports_namespaces(self) -> bool: + """Check if namespaces are supported in the HTML type.""" + + return self.is_xml or self.has_html_namespace + + def get_tag_ns(self, el: bs4.Tag) -> str: + """Get tag namespace.""" + + if self.supports_namespaces(): + namespace = '' + ns = self.get_uri(el) + if ns: + namespace = ns + else: + namespace = NS_XHTML + return namespace + + def is_html_tag(self, el: 
bs4.Tag) -> bool: + """Check if tag is in HTML namespace.""" + + return self.get_tag_ns(el) == NS_XHTML + + def get_tag(self, el: bs4.Tag) -> str | None: + """Get tag.""" + + name = self.get_tag_name(el) + return util.lower(name) if name is not None and not self.is_xml else name + + def get_prefix(self, el: bs4.Tag) -> str | None: + """Get prefix.""" + + prefix = self.get_prefix_name(el) + return util.lower(prefix) if prefix is not None and not self.is_xml else prefix + + def find_bidi(self, el: bs4.Tag) -> int | None: + """Get directionality from element text.""" + + for node in self.get_children(el, tags=False): + + # Analyze child text nodes + if self.is_tag(node): + + # Avoid analyzing certain elements specified in the specification. + direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) + if ( + self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or + not self.is_html_tag(node) or + direction is not None + ): + continue # pragma: no cover + + # Check directionality of this node's text + value = self.find_bidi(node) + if value is not None: + return value + + # Direction could not be determined + continue # pragma: no cover + + # Skip `doctype` comments, etc. + if self.is_special_string(node): + continue + + # Analyze text nodes for directionality. 
+ for c in node: + bidi = unicodedata.bidirectional(c) + if bidi in ('AL', 'R', 'L'): + return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL + return None + + def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool: + """Filter the language tags.""" + + match = True + lang_range = RE_WILD_STRIP.sub('-', lang_range).lower() + ranges = lang_range.split('-') + subtags = lang_tag.lower().split('-') + length = len(ranges) + slength = len(subtags) + rindex = 0 + sindex = 0 + r = ranges[rindex] + s = subtags[sindex] + + # Empty specified language should match unspecified language attributes + if length == 1 and slength == 1 and not r and r == s: + return True + + # Primary tag needs to match + if (r != '*' and r != s) or (r == '*' and slength == 1 and not s): + match = False + + rindex += 1 + sindex += 1 + + # Match until we run out of ranges + while match and rindex < length: + r = ranges[rindex] + try: + s = subtags[sindex] + except IndexError: + # Ran out of subtags, + # but we still have ranges + match = False + continue + + # Empty range + if not r: + match = False + continue + + # Matched range + elif s == r: + rindex += 1 + + # Implicit wildcard cannot match + # singletons + elif len(s) == 1: + match = False + continue + + # Implicitly matched, so grab next subtag + sindex += 1 + + return match + + def match_attribute_name( + self, + el: bs4.Tag, + attr: str, + prefix: str | None + ) -> str | Sequence[str] | None: + """Match attribute name and return value if it exists.""" + + value = None + if self.supports_namespaces(): + value = None + # If we have not defined namespaces, we can't very well find them, so don't bother trying. 
+ if prefix: + ns = self.namespaces.get(prefix) + if ns is None and prefix != '*': + return None + else: + ns = None + + for k, v in self.iter_attributes(el): + + # Get attribute parts + namespace, name = self.split_namespace(el, k) + + # Can't match a prefix attribute as we haven't specified one to match + # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`. + if ns is None: + if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)): + value = v + break + # Coverage is not finding this even though it is executed. + # Adding a print statement before this (and erasing coverage) causes coverage to find the line. + # Ignore the false positive message. + continue # pragma: no cover + + # We can't match our desired prefix attribute as the attribute doesn't have a prefix + if namespace is None or ns != namespace and prefix != '*': + continue + + # The attribute doesn't match. + if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name): + continue + + value = v + break + else: + for k, v in self.iter_attributes(el): + if util.lower(attr) != util.lower(k): + continue + value = v + break + return value + + def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: + """Match the namespace of the element.""" + + match = True + namespace = self.get_tag_ns(el) + default_namespace = self.namespaces.get('') + tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix) + # We must match the default namespace if one is not provided + if tag.prefix is None and (default_namespace is not None and namespace != default_namespace): + match = False + # If we specified `|tag`, we must not have a namespace. 
def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
    """
    Match the tag portion (namespace and name) of a selector against `el`.

    A selector without a tag component (`tag is None`) matches every element.
    Otherwise both the namespace and the tag name must match. The name is only
    inspected when the namespace already matched, since a failed namespace
    check decides the result on its own.
    """

    if tag is None:
        return True

    # Verify namespace first; `and` short-circuits so a namespace mismatch
    # skips the tag-name comparison entirely.
    return self.match_namespace(el, tag) and self.match_tagname(el, tag)
def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
    """
    Report whether any child of `parent` satisfies `relation`.

    With `recursive` set, every descendant is considered instead of only the
    direct children. Iteration stops at the first node that matches.
    """

    # Pick the traversal once; both accept the same `no_iframe` keyword.
    if recursive:
        walker = self.get_descendants  # type: Callable[..., Iterator[bs4.Tag]]
    else:
        walker = self.get_children

    return any(
        self.match_selectors(node, relation)
        for node in walker(parent, no_iframe=self.iframe_restrict)
    )
def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
    """
    Match the element's `id` attribute against every id in `ids`.

    All listed ids must equal the element's `id` (a missing attribute is
    treated as the empty string). An empty `ids` tuple matches trivially
    without touching the element.
    """

    if not ids:
        return True

    # The attribute does not change while we compare, so read it once instead
    # of once per requested id.
    el_id = self.get_attribute_by_name(el, 'id', '')
    return all(i == el_id for i in ids)
+ last = n.last + last_index = len(parent) - 1 + index = last_index if last else 0 + relative_index = 0 + a = n.a + b = n.b + var = n.n + count = 0 + count_incr = 1 + factor = -1 if last else 1 + idx = last_idx = a * count + b if var else a + + # We can only adjust bounds within a variable index + if var: + # Abort if our nth index is out of bounds and only getting further out of bounds as we increment. + # Otherwise, increment to try to get in bounds. + adjust = None + while idx < 1 or idx > last_index: + if idx < 0: + diff_low = 0 - idx + if adjust is not None and adjust == 1: + break + adjust = -1 + count += count_incr + idx = last_idx = a * count + b if var else a + diff = 0 - idx + if diff >= diff_low: + break + else: + diff_high = idx - last_index + if adjust is not None and adjust == -1: + break + adjust = 1 + count += count_incr + idx = last_idx = a * count + b if var else a + diff = idx - last_index + if diff >= diff_high: + break + diff_high = diff + + # If a < 0, our count is working backwards, so floor the index by increasing the count. + # Find the count that yields the lowest, in bound value and use that. + # Lastly reverse count increment so that we'll increase our index. + lowest = count + if a < 0: + while idx >= 1: + lowest = count + count += count_incr + idx = last_idx = a * count + b if var else a + count_incr = -1 + count = lowest + idx = last_idx = a * count + b if var else a + + # Evaluate elements while our calculated nth index is still in range + while 1 <= idx <= last_index + 1: + child = None + # Evaluate while our child index is still in range. 
def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
    """
    Match element if it contains text.

    Every entry of `contains` must be satisfied. An entry is satisfied when at
    least one of its `text` candidates is a substring of the element's text:

    - `own` entries are checked against each string returned by
      `get_own_text` individually.
    - other entries are checked against the single string returned by
      `get_text`.

    The two kinds of text are fetched lazily and cached separately, so a
    selector that combines both kinds compares each entry against the right
    content (a shared cache would hand a string to the `own` branch, or a
    sequence to the non-`own` branch).
    """

    own_text = None  # type: Sequence[str] | None
    full_text = None  # type: str | None

    for contain_list in contains:
        if contain_list.own:
            if own_text is None:
                own_text = self.get_own_text(el, no_iframe=self.is_html)
            found = any(text in chunk for text in contain_list.text for chunk in own_text)
        else:
            if full_text is None:
                full_text = self.get_text(el, no_iframe=self.is_html)
            found = any(text in full_text for text in contain_list.text)

        # One unsatisfied entry decides the result; no need to look further.
        if not found:
            return False

    return True
def match_indeterminate(self, el: bs4.Tag) -> bool:
    """
    Match indeterminate.

    `el` matches when no *other* `input` of type `radio` carrying the same
    `name` and a `checked` attribute is found under the same form. The verdict
    is cached per `(form, name)` pair in `self.cached_indeterminate_forms`.
    """

    match = False
    name = cast(str, self.get_attribute_by_name(el, 'name'))

    def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
        """
        Find this input's form.

        Walks up the ancestors until an HTML `form` is found. If none exists,
        the top-most ancestor reached is used in place of a form.
        """
        form = None
        parent = self.get_parent(el, no_iframe=True)
        # NOTE(review): if `el` has no parent at all, `parent` is `None` on the
        # first pass - confirm `get_tag` tolerates that.
        while form is None:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                form = parent
                break
            last_parent = parent
            parent = self.get_parent(parent, no_iframe=True)
            if parent is None:
                # Ran out of ancestors: fall back to the last one we saw.
                form = last_parent
                break
        return form

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    found_form = False
    for f, n, i in self.cached_indeterminate_forms:
        if f is form and n == name:
            found_form = True
            if i is True:
                match = True
            break

    # We didn't have the form cached, so validate that the radio button is indeterminate
    if not found_form:
        checked = False
        for child in self.get_descendants(form, no_iframe=True):
            # Only the *other* radio buttons of the group are of interest.
            if child is el:
                continue
            tag_name = self.get_tag(child)
            if tag_name == 'input':
                is_radio = False
                check = False
                has_name = False
                for k, v in self.iter_attributes(child):
                    if util.lower(k) == 'type' and util.lower(v) == 'radio':
                        is_radio = True
                    elif util.lower(k) == 'name' and v == name:
                        has_name = True
                    elif util.lower(k) == 'checked':
                        check = True
                    # A checked radio with our name only counts if it belongs to the same form.
                    if is_radio and check and has_name and get_parent_form(child) is form:
                        checked = True
                        break
            if checked:
                break
        if not checked:
            match = True
        # Remember the verdict for this form/name pair.
        self.cached_indeterminate_forms.append((form, name, match))

    return match
def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
    """
    Check directionality.

    Returns whether the direction resolved for `el` equals `directionality`.
    The direction comes from the element's own `dir` attribute when it maps to
    an explicit direction; otherwise it is derived from the element's text
    (the "auto" branches below) or from the parent element, recursing upwards.
    Elements that are `None` or not HTML tags never match, and the root
    defaults to left to right.
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    # Also terminates the upward recursion once we run out of parents.
    if el is None or not self.is_html_tag(el):
        return False

    # Element has defined direction of left to right or right to left
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=tel]` and no direction is assigned, assume left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node))
        else:
            value = cast(str, self.get_attribute_by_name(el, 'value', ''))
        if value:
            # The first character with a strong bidi class decides the direction.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        # Empty value: inherit from the parent.
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.find_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

    # Match parents direction
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
def match_defined(self, el: bs4.Tag) -> bool:
    """
    Match defined.

    `:defined` is related to custom elements in a browser. Here an element is
    reported as defined when it has a tag name and any of the following holds:

    - The name is not a custom element name (it contains no hyphen).
    - The name contains a colon.
    - The element has a prefix.

    This of course requires the parser to provide us with the proper prefix
    info; if it doesn't, there is nothing we can do.
    """

    tag_name = self.get_tag(el)
    if tag_name is None:
        return False

    # Ordinary (non-custom) names are always defined.
    if '-' not in tag_name:
        return True

    # Hyphenated name: only a colon in the name or an explicit prefix saves it.
    if ':' in tag_name:
        return True
    return self.get_prefix(el) is not None
def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
    """
    Check if element matches one of the selectors.

    Each selector in the list is tried in turn; the first one whose every
    component accepts `el` decides the result. For a negated list
    (`selectors.is_not`) the result is inverted: `el` matches only if at least
    one selector was tried and none accepted it.

    Lists flagged `is_html` are evaluated with the `html` namespace and iframe
    restriction forced on, and never match when the document is not HTML. The
    matcher's own `namespaces` and `iframe_restrict` are restored afterwards,
    even if one of the component matchers raises.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    try:
        if not is_html or self.is_html:
            for selector in selectors:
                match = is_not
                # We have a un-matchable situation (like `:focus` as you cannot focus an element in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if selector.contains and not self.match_contains(el, selector.contains):
                    continue
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists.
        # Done in `finally` so an exception in any matcher cannot leave the
        # temporary HTML-only settings installed on this object.
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match
class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector matching object.

    Bundles a parsed selector list with the namespaces, custom selectors and
    flags it was compiled with. Every matching method builds a `CSSMatch` from
    these values for the tag it is given.
    """

    # Original selector pattern text.
    pattern: str
    # Parsed selector list handed to `CSSMatch`.
    selectors: ct.SelectorList
    # Namespace prefix mapping, if any was supplied.
    namespaces: ct.Namespaces | None
    # Custom selectors supplied at compile time.
    # NOTE(review): `__init__` accepts `None` for this - confirm whether the annotation should allow it.
    custom: dict[str, str]
    # Flags forwarded to `CSSMatch`.
    flags: int

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(
        self,
        pattern: str,
        selectors: ct.SelectorList,
        namespaces: ct.Namespaces | None,
        custom: ct.CustomSelectors | None,
        flags: int
    ):
        """Initialize by handing every field to the immutable base class."""

        super().__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def match(self, tag: bs4.Tag) -> bool:
        """Return whether `tag` itself matches the compiled selector."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)

    def closest(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Match closest ancestor; the result may be `None` when nothing matches."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()

    def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]:  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        if CSSMatch.is_tag(iterable):
            return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
        else:
            return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]

    def select_one(self, tag: bs4.Tag) -> bs4.Tag | None:
        """Select a single tag: the first match under `tag`, or `None` if there is none."""

        tags = self.select(tag, limit=1)
        return tags[0] if tags else None

    def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
        """Select the specified tags, returned as a list (see `iselect`)."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
        """Iterate the specified tags, forwarding `limit` to `CSSMatch.select`."""

        yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)

    def __repr__(self) -> str:  # pragma: no cover
        """Representation."""

        return (
            f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
            f"custom={self.custom!r}, flags={self.flags!r})"
        )

    __str__ = __repr__
import css_types as ct +from .util import SelectorSyntaxError +import warnings +from typing import Match, Any, Iterator, cast + +UNICODE_REPLACEMENT_CHAR = 0xFFFD + +# Simple pseudo classes that take no parameters +PSEUDO_SIMPLE = { + ":any-link", + ":empty", + ":first-child", + ":first-of-type", + ":in-range", + ":out-of-range", + ":last-child", + ":last-of-type", + ":link", + ":only-child", + ":only-of-type", + ":root", + ':checked', + ':default', + ':disabled', + ':enabled', + ':indeterminate', + ':optional', + ':placeholder-shown', + ':read-only', + ':read-write', + ':required', + ':scope', + ':defined' +} + +# Supported, simple pseudo classes that match nothing in the Soup Sieve environment +PSEUDO_SIMPLE_NO_MATCH = { + ':active', + ':current', + ':focus', + ':focus-visible', + ':focus-within', + ':future', + ':host', + ':hover', + ':local-link', + ':past', + ':paused', + ':playing', + ':target', + ':target-within', + ':user-invalid', + ':visited' +} + +# Complex pseudo classes that take selector lists +PSEUDO_COMPLEX = { + ':contains', + ':-soup-contains', + ':-soup-contains-own', + ':has', + ':is', + ':matches', + ':not', + ':where' +} + +PSEUDO_COMPLEX_NO_MATCH = { + ':current', + ':host', + ':host-context' +} + +# Complex pseudo classes that take very specific parameters and are handled special +PSEUDO_SPECIAL = { + ':dir', + ':lang', + ':nth-child', + ':nth-last-child', + ':nth-last-of-type', + ':nth-of-type' +} + +PSEUDO_SUPPORTED = PSEUDO_SIMPLE | PSEUDO_SIMPLE_NO_MATCH | PSEUDO_COMPLEX | PSEUDO_COMPLEX_NO_MATCH | PSEUDO_SPECIAL + +# Sub-patterns parts +# Whitespace +NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])' +WS = fr'(?:[ \t]|{NEWLINE})' +# Comments +COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)' +# Whitespace with comments included +WSC = fr'(?:{WS}|{COMMENTS})' +# CSS escapes +CSS_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$))' +CSS_STRING_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$|{NEWLINE}))' +# CSS Identifier +IDENTIFIER = fr''' 
+(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})+|--) +(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})*) +''' +# `nth` content +NTH = fr'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){WSC}*(?:[-+]){WSC}*(?:[0-9]+))?' +# Value: quoted string or identifier +VALUE = fr'''(?:"(?:\\(?:.|{NEWLINE})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{NEWLINE})|[^\\'\r\n\f]+)*?'|{IDENTIFIER}+)''' +# Attribute value comparison. `!=` is handled special as it is non-standard. +ATTR = fr'(?:{WSC}*(?P<cmp>[!~^|*$]?=){WSC}*(?P<value>{VALUE})(?:{WSC}*(?P<case>[is]))?)?{WSC}*\]' + +# Selector patterns +# IDs (`#id`) +PAT_ID = fr'\#{IDENTIFIER}' +# Classes (`.class`) +PAT_CLASS = fr'\.{IDENTIFIER}' +# Prefix:Tag (`prefix|tag`) +PAT_TAG = fr'(?P<tag_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<tag_name>{IDENTIFIER}|\*)' +# Attributes (`[attr]`, `[attr=value]`, etc.) +PAT_ATTR = fr'\[{WSC}*(?P<attr_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<attr_name>{IDENTIFIER}){ATTR}' +# Pseudo class (`:pseudo-class`, `:pseudo-class(`) +PAT_PSEUDO_CLASS = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)?' +# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes. +PAT_PSEUDO_CLASS_SPECIAL = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)' +# Custom pseudo class (`:--custom-pseudo`) +PAT_PSEUDO_CLASS_CUSTOM = fr'(?P<name>:(?=--){IDENTIFIER})' +# Nesting ampersand selector. Matches `&` +PAT_AMP = r'&' +# Closing pseudo group (`)`) +PAT_PSEUDO_CLOSE = fr'{WSC}*\)' +# Pseudo element (`::pseudo-element`) +PAT_PSEUDO_ELEMENT = fr':{PAT_PSEUDO_CLASS}' +# At rule (`@page`, etc.) (not supported) +PAT_AT_RULE = fr'@P{IDENTIFIER}' +# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.) +PAT_PSEUDO_NTH_CHILD = fr''' +(?P<pseudo_nth_child>{PAT_PSEUDO_CLASS_SPECIAL} +(?P<nth_child>{NTH}|even|odd))(?:{WSC}*\)|(?P<of>{COMMENTS}*{WS}{WSC}*of{COMMENTS}*{WS}{WSC}*)) +''' +# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.) 
PAT_PSEUDO_NTH_TYPE = fr'''
(?P<pseudo_nth_type>{PAT_PSEUDO_CLASS_SPECIAL}
(?P<nth_type>{NTH}|even|odd)){WSC}*\)
'''
# Pseudo class language (`:lang("*-de", en)`)
PAT_PSEUDO_LANG = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)'
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<dir>ltr|rtl){WSC}*\)'
# Combining characters (`>`, `~`, ` `, `+`, `,`)
PAT_COMBINE = fr'{WSC}*?(?P<relation>[,+>~]|{WS}(?![,+>~])){WSC}*'
# Extra: Contains (`:contains(text)`)
PAT_PSEUDO_CONTAINS = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)'

# Regular expressions
# CSS escape pattern
# Groups: (1) hex escape, (2) escaped single character, (3) backslash at end of input.
# A hex escape is terminated by at most one whitespace character (`WS`), the same rule
# `CSS_ESCAPES` and `RE_CSS_STR_ESC` use. It must not be `WSC`: that would also capture a
# trailing comment in group 1, and `css_unescape` feeds group 1 to `int(..., 16)`, which
# tolerates surrounding whitespace but raises `ValueError` on comment text.
RE_CSS_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WS}?)|(\\[^\r\n\f])|(\\$))', re.I)
# Same as `RE_CSS_ESC`, plus group (4): an escaped newline (line continuation inside a string).
RE_CSS_STR_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WS}?)|(\\[^\r\n\f])|(\\$)|(\\{NEWLINE}))', re.I)
# Pattern to break up `nth` specifiers
# Named groups: `s1` (sign of `a`), `a` (coefficient, optionally ending in `n`),
# `s2` (sign of `b`), `b` (offset). The offset part is only allowed directly after an `n`.
RE_NTH = re.compile(fr'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){WSC}*(?P<s2>[-+]){WSC}*(?P<b>[0-9]+))?', re.I)
# Pattern to iterate multiple values.
+RE_VALUES = re.compile(fr'(?:(?P<value>{VALUE})|(?P<split>{WSC}*,{WSC}*))', re.X) +# Whitespace checks +RE_WS = re.compile(WS) +RE_WS_BEGIN = re.compile(fr'^{WSC}*') +RE_WS_END = re.compile(fr'{WSC}*$') +RE_CUSTOM = re.compile(fr'^{PAT_PSEUDO_CLASS_CUSTOM}$', re.X) + +# Constants +# List split token +COMMA_COMBINATOR = ',' +# Relation token for descendant +WS_COMBINATOR = " " + +# Parse flags +FLG_PSEUDO = 0x01 +FLG_NOT = 0x02 +FLG_RELATIVE = 0x04 +FLG_DEFAULT = 0x08 +FLG_HTML = 0x10 +FLG_INDETERMINATE = 0x20 +FLG_OPEN = 0x40 +FLG_IN_RANGE = 0x80 +FLG_OUT_OF_RANGE = 0x100 +FLG_PLACEHOLDER_SHOWN = 0x200 +FLG_FORGIVE = 0x400 + +# Maximum cached patterns to store +_MAXCACHE = 500 + + +@lru_cache(maxsize=_MAXCACHE) +def _cached_css_compile( + pattern: str, + namespaces: ct.Namespaces | None, + custom: ct.CustomSelectors | None, + flags: int +) -> cm.SoupSieve: + """Cached CSS compile.""" + + custom_selectors = process_custom(custom) + return cm.SoupSieve( + pattern, + CSSParser( + pattern, + custom=custom_selectors, + flags=flags + ).process_selectors(), + namespaces, + custom, + flags + ) + + +def _purge_cache() -> None: + """Purge the cache.""" + + _cached_css_compile.cache_clear() + + +def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]: + """Process custom.""" + + custom_selectors = {} + if custom is not None: + for key, value in custom.items(): + name = util.lower(key) + if RE_CUSTOM.match(name) is None: + raise SelectorSyntaxError(f"The name '{name}' is not a valid custom pseudo-class name") + if name in custom_selectors: + raise KeyError(f"The custom selector '{name}' has already been registered") + custom_selectors[css_unescape(name)] = value + return custom_selectors + + +def css_unescape(content: str, string: bool = False) -> str: + """ + Unescape CSS value. + + Strings allow for spanning the value on multiple strings by escaping a new line. 
+ """ + + def replace(m: Match[str]) -> str: + """Replace with the appropriate substitute.""" + + if m.group(1): + codepoint = int(m.group(1)[1:], 16) + if codepoint == 0: + codepoint = UNICODE_REPLACEMENT_CHAR + value = chr(codepoint) + elif m.group(2): + value = m.group(2)[1:] + elif m.group(3): + value = '\ufffd' + else: + value = '' + + return value + + return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content) + + +def escape(ident: str) -> str: + """Escape identifier.""" + + string = [] + length = len(ident) + start_dash = length > 0 and ident[0] == '-' + if length == 1 and start_dash: + # Need to escape identifier that is a single `-` with no other characters + string.append(f'\\{ident}') + else: + for index, c in enumerate(ident): + codepoint = ord(c) + if codepoint == 0x00: + string.append('\ufffd') + elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F: + string.append(f'\\{codepoint:x} ') + elif (index == 0 or (start_dash and index == 1)) and (0x30 <= codepoint <= 0x39): + string.append(f'\\{codepoint:x} ') + elif ( + codepoint in (0x2D, 0x5F) or codepoint >= 0x80 or (0x30 <= codepoint <= 0x39) or + (0x30 <= codepoint <= 0x39) or (0x41 <= codepoint <= 0x5A) or (0x61 <= codepoint <= 0x7A) + ): + string.append(c) + else: + string.append(f'\\{c}') + return ''.join(string) + + +class SelectorPattern: + """Selector pattern.""" + + def __init__(self, name: str, pattern: str) -> None: + """Initialize.""" + + self.name = name + self.re_pattern = re.compile(pattern, re.I | re.X | re.U) + + def get_name(self) -> str: + """Get name.""" + + return self.name + + def match(self, selector: str, index: int, flags: int) -> Match[str] | None: + """Match the selector.""" + + return self.re_pattern.match(selector, index) + + +class SpecialPseudoPattern(SelectorPattern): + """Selector pattern.""" + + def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None: + """Initialize.""" + + self.patterns = {} + for p 
in patterns: + name = p[0] + pattern = p[3](name, p[2]) + for pseudo in p[1]: + self.patterns[pseudo] = pattern + + self.matched_name = None # type: SelectorPattern | None + self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U) + + def get_name(self) -> str: + """Get name.""" + + return '' if self.matched_name is None else self.matched_name.get_name() + + def match(self, selector: str, index: int, flags: int) -> Match[str] | None: + """Match the selector.""" + + pseudo = None + m = self.re_pseudo_name.match(selector, index) + if m: + name = util.lower(css_unescape(m.group('name'))) + pattern = self.patterns.get(name) + if pattern: + pseudo = pattern.match(selector, index, flags) + if pseudo: + self.matched_name = pattern + + return pseudo + + +class _Selector: + """ + Intermediate selector class. + + This stores selector data for a compound selector as we are acquiring them. + Once we are done collecting the data for a compound selector, we freeze + the data in an object that can be pickled and hashed. 
+ """ + + def __init__(self, **kwargs: Any) -> None: + """Initialize.""" + + self.tag = kwargs.get('tag', None) # type: ct.SelectorTag | None + self.ids = kwargs.get('ids', []) # type: list[str] + self.classes = kwargs.get('classes', []) # type: list[str] + self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute] + self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth] + self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList] + self.relations = kwargs.get('relations', []) # type: list[_Selector] + self.rel_type = kwargs.get('rel_type', None) # type: str | None + self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains] + self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang] + self.flags = kwargs.get('flags', 0) # type: int + self.no_match = kwargs.get('no_match', False) # type: bool + + def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList: + """Freeze relation.""" + + if relations: + sel = relations[0] + sel.relations.extend(relations[1:]) + return ct.SelectorList([sel.freeze()]) + else: + return ct.SelectorList() + + def freeze(self) -> ct.Selector | ct.SelectorNull: + """Freeze self.""" + + if self.no_match: + return ct.SelectorNull() + else: + return ct.Selector( + self.tag, + tuple(self.ids), + tuple(self.classes), + tuple(self.attributes), + tuple(self.nth), + tuple(self.selectors), + self._freeze_relations(self.relations), + self.rel_type, + tuple(self.contains), + tuple(self.lang), + self.flags + ) + + def __str__(self) -> str: # pragma: no cover + """String representation.""" + + return ( + f'_Selector(tag={self.tag!r}, ids={self.ids!r}, classes={self.classes!r}, attributes={self.attributes!r}, ' + f'nth={self.nth!r}, selectors={self.selectors!r}, relations={self.relations!r}, ' + f'rel_type={self.rel_type!r}, contains={self.contains!r}, lang={self.lang!r}, flags={self.flags!r}, ' + f'no_match={self.no_match!r})' + ) + + __repr__ = __str__ + + +class 
CSSParser: + """Parse CSS selectors.""" + + css_tokens = ( + SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE), + SpecialPseudoPattern( + ( + ( + "pseudo_contains", + (':contains', ':-soup-contains', ':-soup-contains-own'), + PAT_PSEUDO_CONTAINS, + SelectorPattern + ), + ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern), + ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern), + ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern), + ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern) + ) + ), + SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM), + SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS), + SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT), + SelectorPattern("amp", PAT_AMP), + SelectorPattern("at_rule", PAT_AT_RULE), + SelectorPattern("id", PAT_ID), + SelectorPattern("class", PAT_CLASS), + SelectorPattern("tag", PAT_TAG), + SelectorPattern("attribute", PAT_ATTR), + SelectorPattern("combine", PAT_COMBINE) + ) + + def __init__( + self, + selector: str, + custom: dict[str, str | ct.SelectorList] | None = None, + flags: int = 0 + ) -> None: + """Initialize.""" + + self.pattern = selector.replace('\x00', '\ufffd') + self.flags = flags + self.debug = self.flags & util.DEBUG + self.custom = {} if custom is None else custom + + def parse_attribute_selector(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: + """Create attribute selector from the returned regex match.""" + + inverse = False + op = m.group('cmp') + case = util.lower(m.group('case')) if m.group('case') else None + ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else '' + attr = css_unescape(m.group('attr_name')) + is_type = False + pattern2 = None + value = '' + + if case: + flags = (re.I if case == 'i' else 0) | re.DOTALL + elif util.lower(attr) == 'type': + flags = re.I | re.DOTALL + is_type = True + else: + flags = re.DOTALL + + if op: + if 
m.group('value').startswith(('"', "'")): + value = css_unescape(m.group('value')[1:-1], True) + else: + value = css_unescape(m.group('value')) + + if not op: + # Attribute name + pattern = None + elif op.startswith('^'): + # Value start with + pattern = re.compile(r'^%s.*' % re.escape(value), flags) + elif op.startswith('$'): + # Value ends with + pattern = re.compile(r'.*?%s$' % re.escape(value), flags) + elif op.startswith('*'): + # Value contains + pattern = re.compile(r'.*?%s.*' % re.escape(value), flags) + elif op.startswith('~'): + # Value contains word within space separated list + # `~=` should match nothing if it is empty or contains whitespace, + # so if either of these cases is present, use `[^\s\S]` which cannot be matched. + value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value) + pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags) + elif op.startswith('|'): + # Value starts with word in dash separated list + pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags) + else: + # Value matches + pattern = re.compile(r'^%s$' % re.escape(value), flags) + if op.startswith('!'): + # Equivalent to `:not([attr=value])` + inverse = True + if is_type and pattern: + pattern2 = re.compile(pattern.pattern) + + # Append the attribute selector + sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2) + if inverse: + # If we are using `!=`, we need to nest the pattern under a `:not()`. 
+ sub_sel = _Selector() + sub_sel.attributes.append(sel_attr) + not_list = ct.SelectorList([sub_sel.freeze()], True, False) + sel.selectors.append(not_list) + else: + sel.attributes.append(sel_attr) + + has_selector = True + return has_selector + + def parse_tag_pattern(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: + """Parse tag pattern from regex match.""" + + prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None + tag = css_unescape(m.group('tag_name')) + sel.tag = ct.SelectorTag(tag, prefix) + has_selector = True + return has_selector + + def parse_pseudo_class_custom(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: + """ + Parse custom pseudo class alias. + + Compile custom selectors as we need them. When compiling a custom selector, + set it to `None` in the dictionary so we can avoid an infinite loop. + """ + + pseudo = util.lower(css_unescape(m.group('name'))) + selector = self.custom.get(pseudo) + if selector is None: + raise SelectorSyntaxError( + f"Undefined custom selector '{pseudo}' found at position {m.end(0)}", + self.pattern, + m.end(0) + ) + + if not isinstance(selector, ct.SelectorList): + del self.custom[pseudo] + selector = CSSParser( + selector, custom=self.custom, flags=self.flags + ).process_selectors(flags=FLG_PSEUDO) + self.custom[pseudo] = selector + + sel.selectors.append(selector) + has_selector = True + return has_selector + + def parse_pseudo_class( + self, + sel: _Selector, + m: Match[str], + has_selector: bool, + iselector: Iterator[tuple[str, Match[str]]], + is_html: bool + ) -> tuple[bool, bool]: + """Parse pseudo class.""" + + complex_pseudo = False + pseudo = util.lower(css_unescape(m.group('name'))) + if m.group('open'): + complex_pseudo = True + if complex_pseudo and pseudo in PSEUDO_COMPLEX: + has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0)) + elif not complex_pseudo and pseudo in PSEUDO_SIMPLE: + if pseudo == ':root': + sel.flags 
|= ct.SEL_ROOT + elif pseudo == ':defined': + sel.flags |= ct.SEL_DEFINED + is_html = True + elif pseudo == ':scope': + sel.flags |= ct.SEL_SCOPE + elif pseudo == ':empty': + sel.flags |= ct.SEL_EMPTY + elif pseudo in (':link', ':any-link'): + sel.selectors.append(CSS_LINK) + elif pseudo == ':checked': + sel.selectors.append(CSS_CHECKED) + elif pseudo == ':default': + sel.selectors.append(CSS_DEFAULT) + elif pseudo == ':indeterminate': + sel.selectors.append(CSS_INDETERMINATE) + elif pseudo == ":disabled": + sel.selectors.append(CSS_DISABLED) + elif pseudo == ":enabled": + sel.selectors.append(CSS_ENABLED) + elif pseudo == ":required": + sel.selectors.append(CSS_REQUIRED) + elif pseudo == ":optional": + sel.selectors.append(CSS_OPTIONAL) + elif pseudo == ":read-only": + sel.selectors.append(CSS_READ_ONLY) + elif pseudo == ":read-write": + sel.selectors.append(CSS_READ_WRITE) + elif pseudo == ":in-range": + sel.selectors.append(CSS_IN_RANGE) + elif pseudo == ":out-of-range": + sel.selectors.append(CSS_OUT_OF_RANGE) + elif pseudo == ":placeholder-shown": + sel.selectors.append(CSS_PLACEHOLDER_SHOWN) + elif pseudo == ':first-child': + sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList())) + elif pseudo == ':last-child': + sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())) + elif pseudo == ':first-of-type': + sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList())) + elif pseudo == ':last-of-type': + sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())) + elif pseudo == ':only-child': + sel.nth.extend( + [ + ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()), + ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()) + ] + ) + elif pseudo == ':only-of-type': + sel.nth.extend( + [ + ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()), + ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()) + ] + ) + has_selector = True + elif complex_pseudo and pseudo in 
PSEUDO_COMPLEX_NO_MATCH: + self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN) + sel.no_match = True + has_selector = True + elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH: + sel.no_match = True + has_selector = True + elif pseudo in PSEUDO_SUPPORTED: + raise SelectorSyntaxError( + f"Invalid syntax for pseudo class '{pseudo}'", + self.pattern, + m.start(0) + ) + else: + raise SelectorSyntaxError( + f"'{pseudo}' was detected as a pseudo-class and is either unsupported or invalid. " + "If the syntax was not intended to be recognized as a pseudo-class, please escape the colon.", + self.pattern, + m.start(0) + ) + + return has_selector, is_html + + def parse_pseudo_nth( + self, + sel: _Selector, + m: Match[str], + has_selector: bool, + iselector: Iterator[tuple[str, Match[str]]] + ) -> bool: + """Parse `nth` pseudo.""" + + mdict = m.groupdict() + if mdict.get('pseudo_nth_child'): + postfix = '_child' + else: + postfix = '_type' + mdict['name'] = util.lower(css_unescape(mdict['name'])) + content = util.lower(mdict.get('nth' + postfix)) + if content == 'even': + # 2n + s1 = 2 + s2 = 0 + var = True + elif content == 'odd': + # 2n+1 + s1 = 2 + s2 = 1 + var = True + else: + nth_parts = cast(Match[str], RE_NTH.match(content)) + _s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else '' + a = nth_parts.group('a') + var = a.endswith('n') + if a.startswith('n'): + _s1 += '1' + elif var: + _s1 += a[:-1] + else: + _s1 += a + _s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else '' + if nth_parts.group('b'): + _s2 += nth_parts.group('b') + else: + _s2 = '0' + s1 = int(_s1, 10) + s2 = int(_s2, 10) + + pseudo_sel = mdict['name'] + if postfix == '_child': + if m.group('of'): + # Parse the rest of `of S`. + nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN) + else: + # Use default `*|*` for `of S`. 
+ nth_sel = CSS_NTH_OF_S_DEFAULT + if pseudo_sel == ':nth-child': + sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel)) + elif pseudo_sel == ':nth-last-child': + sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel)) + else: + if pseudo_sel == ':nth-of-type': + sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList())) + elif pseudo_sel == ':nth-last-of-type': + sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList())) + has_selector = True + return has_selector + + def parse_pseudo_open( + self, + sel: _Selector, + name: str, + has_selector: bool, + iselector: Iterator[tuple[str, Match[str]]], + index: int + ) -> bool: + """Parse pseudo with opening bracket.""" + + flags = FLG_PSEUDO | FLG_OPEN + if name == ':not': + flags |= FLG_NOT + elif name == ':has': + flags |= FLG_RELATIVE + elif name in (':where', ':is'): + flags |= FLG_FORGIVE + + sel.selectors.append(self.parse_selectors(iselector, index, flags)) + has_selector = True + + return has_selector + + def parse_has_combinator( + self, + sel: _Selector, + m: Match[str], + has_selector: bool, + selectors: list[_Selector], + rel_type: str, + index: int + ) -> tuple[bool, _Selector, str]: + """Parse combinator tokens.""" + + combinator = m.group('relation').strip() + if not combinator: + combinator = WS_COMBINATOR + if combinator == COMMA_COMBINATOR: + sel.rel_type = rel_type + selectors[-1].relations.append(sel) + rel_type = ":" + WS_COMBINATOR + selectors.append(_Selector()) + else: + if has_selector: + # End the current selector and associate the leading combinator with this selector. + sel.rel_type = rel_type + selectors[-1].relations.append(sel) + elif rel_type[1:] != WS_COMBINATOR: + # It's impossible to have two whitespace combinators after each other as the patterns + # will gobble up trailing whitespace. It is also impossible to have a whitespace + # combinator after any other kind for the same reason. 
But we could have + # multiple non-whitespace combinators. So if the current combinator is not a whitespace, + # then we've hit the multiple combinator case, so we should fail. + raise SelectorSyntaxError( + f'The multiple combinators at position {index}', + self.pattern, + index + ) + + # Set the leading combinator for the next selector. + rel_type = ':' + combinator + + sel = _Selector() + has_selector = False + return has_selector, sel, rel_type + + def parse_combinator( + self, + sel: _Selector, + m: Match[str], + has_selector: bool, + selectors: list[_Selector], + relations: list[_Selector], + is_pseudo: bool, + is_forgive: bool, + index: int + ) -> tuple[bool, _Selector]: + """Parse combinator tokens.""" + + combinator = m.group('relation').strip() + if not combinator: + combinator = WS_COMBINATOR + if not has_selector: + if not is_forgive or combinator != COMMA_COMBINATOR: + raise SelectorSyntaxError( + f"The combinator '{combinator}' at position {index}, must have a selector before it", + self.pattern, + index + ) + + # If we are in a forgiving pseudo class, just make the selector a "no match" + if combinator == COMMA_COMBINATOR: + sel.no_match = True + del relations[:] + selectors.append(sel) + else: + if combinator == COMMA_COMBINATOR: + if not sel.tag and not is_pseudo: + # Implied `*` + sel.tag = ct.SelectorTag('*', None) + sel.relations.extend(relations) + selectors.append(sel) + del relations[:] + else: + sel.relations.extend(relations) + sel.rel_type = combinator + del relations[:] + relations.append(sel) + + sel = _Selector() + has_selector = False + + return has_selector, sel + + def parse_class_id(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: + """Parse HTML classes and ids.""" + + selector = m.group(0) + if selector.startswith('.'): + sel.classes.append(css_unescape(selector[1:])) + else: + sel.ids.append(css_unescape(selector[1:])) + has_selector = True + return has_selector + + def parse_pseudo_contains(self, sel: _Selector, 
m: Match[str], has_selector: bool) -> bool: + """Parse contains.""" + + pseudo = util.lower(css_unescape(m.group('name'))) + if pseudo == ":contains": + warnings.warn( # noqa: B028 + "The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.", + FutureWarning + ) + contains_own = pseudo == ":-soup-contains-own" + values = css_unescape(m.group('values')) + patterns = [] + for token in RE_VALUES.finditer(values): + if token.group('split'): + continue + value = token.group('value') + if value.startswith(("'", '"')): + value = css_unescape(value[1:-1], True) + else: + value = css_unescape(value) + patterns.append(value) + sel.contains.append(ct.SelectorContains(patterns, contains_own)) + has_selector = True + return has_selector + + def parse_pseudo_lang(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: + """Parse pseudo language.""" + + values = m.group('values') + patterns = [] + for token in RE_VALUES.finditer(values): + if token.group('split'): + continue + value = token.group('value') + if value.startswith(('"', "'")): + value = css_unescape(value[1:-1], True) + else: + value = css_unescape(value) + + patterns.append(value) + + sel.lang.append(ct.SelectorLang(patterns)) + has_selector = True + + return has_selector + + def parse_pseudo_dir(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: + """Parse pseudo direction.""" + + value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL + sel.flags |= value + has_selector = True + return has_selector + + def parse_selectors( + self, + iselector: Iterator[tuple[str, Match[str]]], + index: int = 0, + flags: int = 0 + ) -> ct.SelectorList: + """Parse selectors.""" + + # Initialize important variables + sel = _Selector() + selectors = [] + has_selector = False + closed = False + relations = [] # type: list[_Selector] + rel_type = ":" + WS_COMBINATOR + + # Setup various flags + is_open = bool(flags & FLG_OPEN) + is_pseudo = bool(flags 
& FLG_PSEUDO) + is_relative = bool(flags & FLG_RELATIVE) + is_not = bool(flags & FLG_NOT) + is_html = bool(flags & FLG_HTML) + is_default = bool(flags & FLG_DEFAULT) + is_indeterminate = bool(flags & FLG_INDETERMINATE) + is_in_range = bool(flags & FLG_IN_RANGE) + is_out_of_range = bool(flags & FLG_OUT_OF_RANGE) + is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN) + is_forgive = bool(flags & FLG_FORGIVE) + + # Print out useful debug stuff + if self.debug: # pragma: no cover + if is_pseudo: + print(' is_pseudo: True') + if is_open: + print(' is_open: True') + if is_relative: + print(' is_relative: True') + if is_not: + print(' is_not: True') + if is_html: + print(' is_html: True') + if is_default: + print(' is_default: True') + if is_indeterminate: + print(' is_indeterminate: True') + if is_in_range: + print(' is_in_range: True') + if is_out_of_range: + print(' is_out_of_range: True') + if is_placeholder_shown: + print(' is_placeholder_shown: True') + if is_forgive: + print(' is_forgive: True') + + # The algorithm for relative selectors require an initial selector in the selector list + if is_relative: + selectors.append(_Selector()) + + try: + while True: + key, m = next(iselector) + + # Handle parts + if key == "at_rule": + raise NotImplementedError(f"At-rules found at position {m.start(0)}") + elif key == "amp": + sel.flags |= ct.SEL_SCOPE + has_selector = True + elif key == 'pseudo_class_custom': + has_selector = self.parse_pseudo_class_custom(sel, m, has_selector) + elif key == 'pseudo_class': + has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html) + elif key == 'pseudo_element': + raise NotImplementedError(f"Pseudo-element found at position {m.start(0)}") + elif key == 'pseudo_contains': + has_selector = self.parse_pseudo_contains(sel, m, has_selector) + elif key in ('pseudo_nth_type', 'pseudo_nth_child'): + has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector) + elif key == 'pseudo_lang': + 
has_selector = self.parse_pseudo_lang(sel, m, has_selector) + elif key == 'pseudo_dir': + has_selector = self.parse_pseudo_dir(sel, m, has_selector) + # Currently only supports HTML + is_html = True + elif key == 'pseudo_close': + if not has_selector: + if not is_forgive: + raise SelectorSyntaxError( + f"Expected a selector at position {m.start(0)}", + self.pattern, + m.start(0) + ) + sel.no_match = True + if is_open: + closed = True + break + else: + raise SelectorSyntaxError( + f"Unmatched pseudo-class close at position {m.start(0)}", + self.pattern, + m.start(0) + ) + elif key == 'combine': + if is_relative: + has_selector, sel, rel_type = self.parse_has_combinator( + sel, m, has_selector, selectors, rel_type, index + ) + else: + has_selector, sel = self.parse_combinator( + sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index + ) + elif key == 'attribute': + has_selector = self.parse_attribute_selector(sel, m, has_selector) + elif key == 'tag': + if has_selector: + raise SelectorSyntaxError( + f"Tag name found at position {m.start(0)} instead of at the start", + self.pattern, + m.start(0) + ) + has_selector = self.parse_tag_pattern(sel, m, has_selector) + elif key in ('class', 'id'): + has_selector = self.parse_class_id(sel, m, has_selector) + + index = m.end(0) + except StopIteration: + pass + + # Handle selectors that are not closed + if is_open and not closed: + raise SelectorSyntaxError( + f"Unclosed pseudo-class at position {index}", + self.pattern, + index + ) + + # Cleanup completed selector piece + if has_selector: + if not sel.tag and not is_pseudo: + # Implied `*` + sel.tag = ct.SelectorTag('*', None) + if is_relative: + sel.rel_type = rel_type + selectors[-1].relations.append(sel) + else: + sel.relations.extend(relations) + del relations[:] + selectors.append(sel) + + # Forgive empty slots in pseudo-classes that have lists (and are forgiving) + elif is_forgive and (not selectors or not relations): + # Handle normal pseudo-classes 
def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
    """
    Tokenize `pattern` and yield `(token_name, match)` pairs in order.

    At each position the patterns in `self.css_tokens` are tried in sequence and
    the first one that matches is consumed. If none matches, a
    `SelectorSyntaxError` is raised for the character at that position.
    """

    # Trim the scan window: whitespace and comments at either end are ignored.
    lead = RE_WS_BEGIN.search(pattern)
    trail = RE_WS_END.search(pattern)
    pos = lead.end(0) if lead else 0
    stop = (trail.start(0) if trail else len(pattern)) - 1

    if self.debug:  # pragma: no cover
        print(f'## PARSING: {pattern!r}')

    while pos <= stop:
        for token in self.css_tokens:
            found = token.match(pattern, pos, self.flags)
            if not found:
                continue
            kind = token.get_name()
            if self.debug:  # pragma: no cover
                print(f"TOKEN: '{kind}' --> {found.group(0)!r} at position {found.start(0)}")
            pos = found.end(0)
            yield kind, found
            break
        else:
            # Nothing matched here. If the character opens a known selector type,
            # report that selector type as malformed; otherwise report the character.
            bad = pattern[pos]
            selector_kind = {
                '[': 'attribute',
                '.': 'class',
                '#': 'id',
                ':': 'pseudo-class'
            }.get(bad)
            if selector_kind is not None:
                msg = f"Malformed {selector_kind} selector at position {pos}"
            else:
                msg = f"Invalid character {bad!r} position {pos}"
            raise SelectorSyntaxError(msg, self.pattern, pos)

    if self.debug:  # pragma: no cover
        print('## END PARSING')
+ */ + html|input[type="radio"][name]:not([name='']):not([checked]) + ''' +).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE) +# CSS pattern for `:disabled` +CSS_DISABLED = CSSParser( + ''' + html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled], + html|optgroup[disabled] > html|option, + html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset), + html|fieldset[disabled] > + html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset) + ''' +).process_selectors(flags=FLG_PSEUDO | FLG_HTML) +# CSS pattern for `:enabled` +CSS_ENABLED = CSSParser( + ''' + html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled) + ''' +).process_selectors(flags=FLG_PSEUDO | FLG_HTML) +# CSS pattern for `:required` +CSS_REQUIRED = CSSParser( + 'html|*:is(input, textarea, select)[required]' +).process_selectors(flags=FLG_PSEUDO | FLG_HTML) +# CSS pattern for `:optional` +CSS_OPTIONAL = CSSParser( + 'html|*:is(input, textarea, select):not([required])' +).process_selectors(flags=FLG_PSEUDO | FLG_HTML) +# CSS pattern for `:placeholder-shown` +CSS_PLACEHOLDER_SHOWN = CSSParser( + ''' + html|input:is( + :not([type]), + [type=""], + [type=text], + [type=search], + [type=url], + [type=tel], + [type=email], + [type=password], + [type=number] + )[placeholder]:not([placeholder='']):is(:not([value]), [value=""]), + html|textarea[placeholder]:not([placeholder='']) + ''' +).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN) +# CSS pattern default for `:nth-child` "of S" feature +CSS_NTH_OF_S_DEFAULT = CSSParser( + '*|*' +).process_selectors(flags=FLG_PSEUDO) +# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first) +CSS_READ_WRITE = CSSParser( + ''' + html|*:is( + textarea, + input:is( + :not([type]), + [type=""], + [type=text], + [type=search], + 
[type=url], + [type=tel], + [type=email], + [type=number], + [type=password], + [type=date], + [type=datetime-local], + [type=month], + [type=time], + [type=week] + ) + ):not([readonly], :disabled), + html|*:is([contenteditable=""], [contenteditable="true" i]) + ''' +).process_selectors(flags=FLG_PSEUDO | FLG_HTML) +# CSS pattern for `:read-only` +CSS_READ_ONLY = CSSParser( + ''' + html|*:not(:read-write) + ''' +).process_selectors(flags=FLG_PSEUDO | FLG_HTML) +# CSS pattern for `:in-range` +CSS_IN_RANGE = CSSParser( + ''' + html|input:is( + [type="date"], + [type="month"], + [type="week"], + [type="time"], + [type="datetime-local"], + [type="number"], + [type="range"] + ):is( + [min], + [max] + ) + ''' +).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML) +# CSS pattern for `:out-of-range` +CSS_OUT_OF_RANGE = CSSParser( + ''' + html|input:is( + [type="date"], + [type="month"], + [type="week"], + [type="time"], + [type="datetime-local"], + [type="number"], + [type="range"] + ):is( + [min], + [max] + ) + ''' +).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML) diff --git a/.venv/lib/python3.12/site-packages/soupsieve/css_types.py b/.venv/lib/python3.12/site-packages/soupsieve/css_types.py new file mode 100644 index 00000000..71a6519b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/soupsieve/css_types.py @@ -0,0 +1,407 @@ +"""CSS selector structure items.""" +from __future__ import annotations +import copyreg +from .pretty import pretty +from typing import Any, Iterator, Hashable, Pattern, Iterable, Mapping + +__all__ = ( + 'Selector', + 'SelectorNull', + 'SelectorTag', + 'SelectorAttribute', + 'SelectorContains', + 'SelectorNth', + 'SelectorLang', + 'SelectorList', + 'Namespaces', + 'CustomSelectors' +) + + +SEL_EMPTY = 0x1 +SEL_ROOT = 0x2 +SEL_DEFAULT = 0x4 +SEL_INDETERMINATE = 0x8 +SEL_SCOPE = 0x10 +SEL_DIR_LTR = 0x20 +SEL_DIR_RTL = 0x40 +SEL_IN_RANGE = 0x80 +SEL_OUT_OF_RANGE = 0x100 +SEL_DEFINED = 0x200 +SEL_PLACEHOLDER_SHOWN = 
class Immutable:
    """
    Base class for hashable objects whose attributes are fixed at construction.

    Subclasses list their fields in `__slots__` with `'_hash'` as the last entry;
    `__repr__` depends on that ordering because it drops the final slot.
    """

    __slots__: tuple[str, ...] = ('_hash',)

    _hash: int

    def __init__(self, **kwargs: Any) -> None:
        """Store each keyword argument as an attribute and precompute the hash."""

        temp = []
        for k, v in kwargs.items():
            # Both the value and its type contribute to the hash.
            temp.append(type(v))
            temp.append(v)
            # `__setattr__` is disabled on this class, so use the inherited one.
            super().__setattr__(k, v)
        super().__setattr__('_hash', hash(tuple(temp)))

    @classmethod
    def __base__(cls) -> type[Immutable]:
        """Return the class used for `isinstance` checks in comparisons."""

        return cls

    def __eq__(self, other: Any) -> bool:
        """Equal: same base class and every slot except `_hash` compares equal."""

        return (
            isinstance(other, self.__base__()) and
            all(getattr(other, key) == getattr(self, key) for key in self.__slots__ if key != '_hash')
        )

    def __ne__(self, other: Any) -> bool:
        """Not equal: different base class or any slot except `_hash` differs."""

        return (
            not isinstance(other, self.__base__()) or
            any(getattr(other, key) != getattr(self, key) for key in self.__slots__ if key != '_hash')
        )

    def __hash__(self) -> int:
        """Return the hash computed at construction."""

        return self._hash

    def __setattr__(self, name: str, value: Any) -> None:
        """Prevent mutability by rejecting every attribute assignment."""

        raise AttributeError(f"'{self.__class__.__name__}' is immutable")

    def __repr__(self) -> str:  # pragma: no cover
        """Representation listing every slot except the trailing `_hash`."""

        r = ', '.join([f"{k}={getattr(self, k)!r}" for k in self.__slots__[:-1]])
        return f"{self.__class__.__name__}({r})"

    __str__ = __repr__

    def pretty(self) -> None:  # pragma: no cover
        """Pretty print."""

        print(pretty(self))


class ImmutableDict(Mapping[Any, Any]):
    """Hashable, immutable dictionary."""

    def __init__(
        self,
        arg: dict[Any, Any] | Iterable[tuple[Any, Any]]
    ) -> None:
        """
        Initialize from a `dict` or an iterable of `(key, value)` pairs.

        Raises `TypeError` when `_validate` rejects the items.
        """

        # `_validate` walks a non-dict argument. Materialize it first so that a
        # one-shot iterator or generator is not already exhausted when `dict()`
        # consumes it below (which used to produce an empty mapping).
        if not isinstance(arg, dict):
            arg = list(arg)

        self._validate(arg)
        self._d = dict(arg)
        # The hash covers the sorted items and the types of keys and values.
        self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))

    def _validate(self, arg: dict[Any, Any] | Iterable[tuple[Any, Any]]) -> None:
        """Validate arguments: values (and keys of pair iterables) must be hashable."""

        if isinstance(arg, dict):
            if not all(isinstance(v, Hashable) for v in arg.values()):
                raise TypeError(f'{self.__class__.__name__} values must be hashable')
        elif not all(isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg):
            raise TypeError(f'{self.__class__.__name__} keys and values must be hashable')

    def __iter__(self) -> Iterator[Any]:
        """Iterator."""

        return iter(self._d)

    def __len__(self) -> int:
        """Length."""

        return len(self._d)

    def __getitem__(self, key: Any) -> Any:
        """Get item: `namespace['key']`."""

        return self._d[key]

    def __hash__(self) -> int:
        """Hash."""

        return self._hash

    def __repr__(self) -> str:  # pragma: no cover
        """Representation."""

        return f"{self._d!r}"

    __str__ = __repr__


class Namespaces(ImmutableDict):
    """Namespaces: an `ImmutableDict` whose items must be strings."""

    def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
        """Initialize."""

        super().__init__(arg)

    def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
        """
        Validate arguments.

        For a `dict` only the values are checked; for an iterable of pairs both
        keys and values are checked.
        """

        if isinstance(arg, dict):
            if not all(isinstance(v, str) for v in arg.values()):
                raise TypeError(f'{self.__class__.__name__} values must be Unicode strings')
        elif not all(isinstance(k, str) and isinstance(v, str) for k, v in arg):
            raise TypeError(f'{self.__class__.__name__} keys and values must be Unicode strings')


class CustomSelectors(ImmutableDict):
    """Custom selectors: an `ImmutableDict` whose items must be strings."""

    def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
        """Initialize."""

        super().__init__(arg)

    def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
        """
        Validate arguments.

        For a `dict` only the values are checked; for an iterable of pairs both
        keys and values are checked.
        """

        if isinstance(arg, dict):
            if not all(isinstance(v, str) for v in arg.values()):
                raise TypeError(f'{self.__class__.__name__} values must be Unicode strings')
        elif not all(isinstance(k, str) and isinstance(v, str) for k, v in arg):
            raise TypeError(f'{self.__class__.__name__} keys and values must be Unicode strings')
'contains', 'lang', 'flags', '_hash' + ) + + tag: SelectorTag | None + ids: tuple[str, ...] + classes: tuple[str, ...] + attributes: tuple[SelectorAttribute, ...] + nth: tuple[SelectorNth, ...] + selectors: tuple[SelectorList, ...] + relation: SelectorList + rel_type: str | None + contains: tuple[SelectorContains, ...] + lang: tuple[SelectorLang, ...] + flags: int + + def __init__( + self, + tag: SelectorTag | None, + ids: tuple[str, ...], + classes: tuple[str, ...], + attributes: tuple[SelectorAttribute, ...], + nth: tuple[SelectorNth, ...], + selectors: tuple[SelectorList, ...], + relation: SelectorList, + rel_type: str | None, + contains: tuple[SelectorContains, ...], + lang: tuple[SelectorLang, ...], + flags: int + ): + """Initialize.""" + + super().__init__( + tag=tag, + ids=ids, + classes=classes, + attributes=attributes, + nth=nth, + selectors=selectors, + relation=relation, + rel_type=rel_type, + contains=contains, + lang=lang, + flags=flags + ) + + +class SelectorNull(Immutable): + """Null Selector.""" + + def __init__(self) -> None: + """Initialize.""" + + super().__init__() + + +class SelectorTag(Immutable): + """Selector tag.""" + + __slots__ = ("name", "prefix", "_hash") + + name: str + prefix: str | None + + def __init__(self, name: str, prefix: str | None) -> None: + """Initialize.""" + + super().__init__(name=name, prefix=prefix) + + +class SelectorAttribute(Immutable): + """Selector attribute rule.""" + + __slots__ = ("attribute", "prefix", "pattern", "xml_type_pattern", "_hash") + + attribute: str + prefix: str + pattern: Pattern[str] | None + xml_type_pattern: Pattern[str] | None + + def __init__( + self, + attribute: str, + prefix: str, + pattern: Pattern[str] | None, + xml_type_pattern: Pattern[str] | None + ) -> None: + """Initialize.""" + + super().__init__( + attribute=attribute, + prefix=prefix, + pattern=pattern, + xml_type_pattern=xml_type_pattern + ) + + +class SelectorContains(Immutable): + """Selector contains rule.""" + + __slots__ = 
("text", "own", "_hash") + + text: tuple[str, ...] + own: bool + + def __init__(self, text: Iterable[str], own: bool) -> None: + """Initialize.""" + + super().__init__(text=tuple(text), own=own) + + +class SelectorNth(Immutable): + """Selector nth type.""" + + __slots__ = ("a", "n", "b", "of_type", "last", "selectors", "_hash") + + a: int + n: bool + b: int + of_type: bool + last: bool + selectors: SelectorList + + def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: SelectorList) -> None: + """Initialize.""" + + super().__init__( + a=a, + n=n, + b=b, + of_type=of_type, + last=last, + selectors=selectors + ) + + +class SelectorLang(Immutable): + """Selector language rules.""" + + __slots__ = ("languages", "_hash",) + + languages: tuple[str, ...] + + def __init__(self, languages: Iterable[str]): + """Initialize.""" + + super().__init__(languages=tuple(languages)) + + def __iter__(self) -> Iterator[str]: + """Iterator.""" + + return iter(self.languages) + + def __len__(self) -> int: # pragma: no cover + """Length.""" + + return len(self.languages) + + def __getitem__(self, index: int) -> str: # pragma: no cover + """Get item.""" + + return self.languages[index] + + +class SelectorList(Immutable): + """Selector list.""" + + __slots__ = ("selectors", "is_not", "is_html", "_hash") + + selectors: tuple[Selector | SelectorNull, ...] 
def _pickle(p: Any) -> Any:
    """Reduce `p` to `(factory, args)`: its base class and every slot value but the last."""

    # The final slot is skipped, matching `__slots__[:-1]` used by `__repr__`.
    field_names = p.__slots__[:-1]
    return p.__base__(), tuple(getattr(p, name) for name in field_names)


def pickle_register(obj: Any) -> None:
    """Allow object to be pickled by registering `_pickle` as its reducer."""

    copyreg.pickle(obj, _pickle)
RE_CLASS = re.compile(r'(?i)[a-z_][_a-z\d\.]+\(')
RE_PARAM = re.compile(r'(?i)[_a-z][_a-z\d]+=')
RE_EMPTY = re.compile(r'\(\)|\[\]|\{\}')
RE_LSTRT = re.compile(r'\[')
RE_DSTRT = re.compile(r'\{')
RE_TSTRT = re.compile(r'\(')
RE_LEND = re.compile(r'\]')
RE_DEND = re.compile(r'\}')
RE_TEND = re.compile(r'\)')
RE_INT = re.compile(r'\d+')
RE_KWORD = re.compile(r'(?i)[_a-z][_a-z\d]+')
RE_DQSTR = re.compile(r'"(?:\\.|[^"\\])*"')
RE_SQSTR = re.compile(r"'(?:\\.|[^'\\])*'")
RE_SEP = re.compile(r'\s*(,)\s*')
RE_DSEP = re.compile(r'\s*(:)\s*')

# Tried in insertion order at every position; the first pattern that matches wins.
TOKENS = {
    'class': RE_CLASS,
    'param': RE_PARAM,
    'empty': RE_EMPTY,
    'lstrt': RE_LSTRT,
    'dstrt': RE_DSTRT,
    'tstrt': RE_TSTRT,
    'lend': RE_LEND,
    'dend': RE_DEND,
    'tend': RE_TEND,
    'sqstr': RE_SQSTR,
    'sep': RE_SEP,
    'dsep': RE_DSEP,
    'int': RE_INT,
    'kword': RE_KWORD,
    'dqstr': RE_DQSTR
}


def pretty(obj: Any) -> str:  # pragma: no cover
    """
    Make the object output string pretty.

    `str(obj)` is tokenized with `TOKENS` and re-emitted with a newline and four
    more spaces of indentation after each opening token, and a newline after
    each comma. Text that no token recognizes is copied through unchanged.
    """

    sel = str(obj)
    index = 0
    end = len(sel) - 1
    indent = 0
    output = []

    while index <= end:
        for name, pattern in TOKENS.items():
            m = pattern.match(sel, index)
            if m is None:
                continue

            index = m.end(0)
            if name in ('class', 'lstrt', 'dstrt', 'tstrt'):
                indent += 4
                output.append(f'{m.group(0)}\n{" " * indent}')
            elif name in ('param', 'int', 'kword', 'sqstr', 'dqstr', 'empty'):
                output.append(m.group(0))
            elif name in ('lend', 'dend', 'tend'):
                indent -= 4
                output.append(m.group(0))
            elif name in ('sep',):
                output.append(f'{m.group(1)}\n{" " * indent}')
            elif name in ('dsep',):
                output.append(f'{m.group(1)} ')
            break
        else:
            # No token matches here, e.g. a one-letter name such as `a=` (the
            # identifier patterns need at least two characters), a `-` sign, or
            # the `.` in `re.IGNORECASE`. Emit the character as-is and move on;
            # without this the loop would never advance and would spin forever.
            output.append(sel[index])
            index += 1

    return ''.join(output)
def deprecated(message: str, stacklevel: int = 2) -> Callable[..., Any]:  # pragma: no cover
    """
    Build a decorator that emits a `DeprecationWarning` on every call.

    The warning text is the wrapped callable's name followed by `message`.

    Usage:

        @deprecated("This method will be removed in version X; use Y instead.")
        def some_method():
            pass
    """

    def decorate(target: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(target)
        def shim(*args: Any, **kwargs: Any) -> Any:
            # Warn from this frame directly so `stacklevel` keeps its meaning.
            text = f"'{target.__name__}' is deprecated. {message}"
            warnings.warn(text, category=DeprecationWarning, stacklevel=stacklevel)
            return target(*args, **kwargs)

        return shim

    return decorate


def warn_deprecated(message: str, stacklevel: int = 2) -> None:  # pragma: no cover
    """Emit `message` as a `DeprecationWarning` at the given stack level."""

    warnings.warn(message, category=DeprecationWarning, stacklevel=stacklevel)
+ text.append('\n') + text.append(f'{indent}{linetext}') + if offset is not None: + text.append('\n') + text.append(' ' * (col + offset) + '^') + line = current_line + + current_line += 1 + last = m.end(0) + + return ''.join(text), line, col |