diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/yarl/_parse.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/yarl/_parse.py | 189 |
1 files changed, 189 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/yarl/_parse.py b/.venv/lib/python3.12/site-packages/yarl/_parse.py new file mode 100644 index 00000000..cc259ea8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/yarl/_parse.py @@ -0,0 +1,189 @@ +"""URL parsing utilities.""" + +import re +import unicodedata +from functools import lru_cache +from typing import Union +from urllib.parse import scheme_chars, uses_netloc + +from ._quoters import QUOTER + +# Leading and trailing C0 control and space to be stripped per WHATWG spec. +# == "".join([chr(i) for i in range(0, 0x20 + 1)]) +WHATWG_C0_CONTROL_OR_SPACE = ( + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10" + "\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f " +) + +# Unsafe bytes to be removed per WHATWG spec +UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"] +USES_AUTHORITY = frozenset(uses_netloc) + +SplitURLType = tuple[str, str, str, str, str] + + +def split_url(url: str) -> SplitURLType: + """Split URL into parts.""" + # Adapted from urllib.parse.urlsplit + # Only lstrip url as some applications rely on preserving trailing space. + # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both) + url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE) + for b in UNSAFE_URL_BYTES_TO_REMOVE: + if b in url: + url = url.replace(b, "") + + scheme = netloc = query = fragment = "" + i = url.find(":") + if i > 0 and url[0] in scheme_chars: + for c in url[1:i]: + if c not in scheme_chars: + break + else: + scheme, url = url[:i].lower(), url[i + 1 :] + has_hash = "#" in url + has_question_mark = "?" in url + if url[:2] == "//": + delim = len(url) # position of end of domain part of url, default is end + if has_hash and has_question_mark: + delim_chars = "/?#" + elif has_question_mark: + delim_chars = "/?" + elif has_hash: + delim_chars = "/#" + else: + delim_chars = "/" + for c in delim_chars: # look for delimiters; the order is NOT important + wdelim = url.find(c, 2) # find first of this delim + if wdelim >= 0 and wdelim < delim: # if found + delim = wdelim # use earliest delim position + netloc = url[2:delim] + url = url[delim:] + has_left_bracket = "[" in netloc + has_right_bracket = "]" in netloc + if (has_left_bracket and not has_right_bracket) or ( + has_right_bracket and not has_left_bracket + ): + raise ValueError("Invalid IPv6 URL") + if has_left_bracket: + bracketed_host = netloc.partition("[")[2].partition("]")[0] + # Valid bracketed hosts are defined in + # https://www.rfc-editor.org/rfc/rfc3986#page-49 + # https://url.spec.whatwg.org/ + if bracketed_host[0] == "v": + if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host): + raise ValueError("IPvFuture address is invalid") + elif ":" not in bracketed_host: + raise ValueError("An IPv4 address cannot be in brackets") + if has_hash: + url, _, fragment = url.partition("#") + if has_question_mark: + url, _, query = url.partition("?") + if netloc and not netloc.isascii(): + _check_netloc(netloc) + return scheme, netloc, url, query, fragment + + +def _check_netloc(netloc: str) -> None: + # Adapted from urllib.parse._checknetloc + # looking for characters like \u2100 that expand to 'a/c' + # IDNA uses NFKC equivalence, so normalize for this check + + # ignore characters already included + # but not the surrounding text + n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "") + normalized_netloc = unicodedata.normalize("NFKC", n) + if n == normalized_netloc: + return + # Note that there are no unicode decompositions for the character '@' so + # its currently impossible to have test coverage for this branch, however if the + # one should be added in the future we want to make sure its still checked. + for c in "/?#@:": # pragma: no branch + if c in normalized_netloc: + raise ValueError( + f"netloc '{netloc}' contains invalid " + "characters under NFKC normalization" + ) + + +@lru_cache # match the same size as urlsplit +def split_netloc( + netloc: str, +) -> tuple[Union[str, None], Union[str, None], Union[str, None], Union[int, None]]: + """Split netloc into username, password, host and port.""" + if "@" not in netloc: + username: Union[str, None] = None + password: Union[str, None] = None + hostinfo = netloc + else: + userinfo, _, hostinfo = netloc.rpartition("@") + username, have_password, password = userinfo.partition(":") + if not have_password: + password = None + + if "[" in hostinfo: + _, _, bracketed = hostinfo.partition("[") + hostname, _, port_str = bracketed.partition("]") + _, _, port_str = port_str.partition(":") + else: + hostname, _, port_str = hostinfo.partition(":") + + if not port_str: + return username or None, password, hostname or None, None + + try: + port = int(port_str) + except ValueError: + raise ValueError("Invalid URL: port can't be converted to integer") + if not (0 <= port <= 65535): + raise ValueError("Port out of range 0-65535") + return username or None, password, hostname or None, port + + +def unsplit_result( + scheme: str, netloc: str, url: str, query: str, fragment: str +) -> str: + """Unsplit a URL without any normalization.""" + if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//": + if url and url[:1] != "/": + url = f"{scheme}://{netloc}/{url}" if scheme else f"{scheme}:{url}" + else: + url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}" + elif scheme: + url = f"{scheme}:{url}" + if query: + url = f"{url}?{query}" + return f"{url}#{fragment}" if fragment else url + + +@lru_cache # match the same size as urlsplit +def make_netloc( + user: Union[str, None], + password: Union[str, None], + host: Union[str, None], + port: Union[int, None], + encode: bool = False, +) -> str: + """Make netloc from parts. + + The user and password are encoded if encode is True. + + The host must already be encoded with _encode_host. + """ + if host is None: + return "" + ret = host + if port is not None: + ret = f"{ret}:{port}" + if user is None and password is None: + return ret + if password is not None: + if not user: + user = "" + elif encode: + user = QUOTER(user) + if encode: + password = QUOTER(password) + user = f"{user}:{password}" + elif user and encode: + user = QUOTER(user) + return f"{user}@{ret}" if user else ret |