aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/charset_normalizer/utils.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/charset_normalizer/utils.py')
-rw-r--r--.venv/lib/python3.12/site-packages/charset_normalizer/utils.py408
1 files changed, 408 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/charset_normalizer/utils.py b/.venv/lib/python3.12/site-packages/charset_normalizer/utils.py
new file mode 100644
index 00000000..0175e0a9
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/charset_normalizer/utils.py
@@ -0,0 +1,408 @@
+from __future__ import annotations
+
+import importlib
+import logging
+import unicodedata
+from codecs import IncrementalDecoder
+from encodings.aliases import aliases
+from functools import lru_cache
+from re import findall
+from typing import Generator
+
+from _multibytecodec import ( # type: ignore[import-not-found,import]
+ MultibyteIncrementalDecoder,
+)
+
+from .constant import (
+ ENCODING_MARKS,
+ IANA_SUPPORTED_SIMILAR,
+ RE_POSSIBLE_ENCODING_INDICATION,
+ UNICODE_RANGES_COMBINED,
+ UNICODE_SECONDARY_RANGE_KEYWORD,
+ UTF8_MAXIMAL_ALLOCATION,
+)
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """
    Tell whether the Unicode name of a character mentions a known diacritic.

    A character that has no name in the Unicode database is reported as
    not accentuated.
    """
    # An empty default replaces the ValueError raised for unnamed characters.
    name: str = unicodedata.name(character, "")
    diacritic_markers: tuple[str, ...] = (
        "WITH GRAVE",
        "WITH ACUTE",
        "WITH CEDILLA",
        "WITH DIAERESIS",
        "WITH CIRCUMFLEX",
        "WITH TILDE",
        "WITH MACRON",
        "WITH RING ABOVE",
    )
    return any(marker in name for marker in diacritic_markers)
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the base character of an accentuated character.

    The base is the first code point listed in the character's Unicode
    decomposition mapping (one decomposition level only). A character
    without any decomposition is returned unchanged.

    Compatibility mappings start with a formatting tag such as ``<compat>``
    (for instance U+00A8 DIAERESIS maps to ``<compat> 0020 0308``). The tag
    is skipped rather than parsed as a hexadecimal code point, which would
    raise ``ValueError``.
    """
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    for token in decomposed.split():
        if token.startswith("<"):
            # Formatting tag of a compatibility mapping, not a code point.
            continue
        return chr(int(token, 16))

    # Defensive: the mapping held nothing but a formatting tag.
    return character
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> str | None:
    """
    Look up the name of the Unicode range that contains a single character.

    Ranges are scanned in the iteration order of UNICODE_RANGES_COMBINED and
    the first one holding the character's code point wins. None is returned
    when no range matches.
    """
    code_point: int = ord(character)

    return next(
        (
            label
            for label, span in UNICODE_RANGES_COMBINED.items()
            if code_point in span
        ),
        None,
    )
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """
    Tell whether the Unicode name of a character contains "LATIN".

    Characters without a name in the Unicode database yield False.
    """
    # The empty default covers characters unknown to the Unicode database.
    return "LATIN" in unicodedata.name(character, "")
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    Tell whether a character is punctuation.

    A character qualifies when its Unicode general category contains "P",
    or, failing that, when the name of its Unicode range contains
    "Punctuation".
    """
    if "P" in unicodedata.category(character):
        return True

    block_name: str | None = unicode_range(character)

    return block_name is not None and "Punctuation" in block_name
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    Tell whether a character is considered a symbol.

    Any general category containing "S" or "N" qualifies directly.
    Otherwise the character qualifies when its Unicode range name contains
    "Forms" and its general category is not "Lo".
    """
    category: str = unicodedata.category(character)

    if any(flag in category for flag in ("S", "N")):
        return True

    block_name: str | None = unicode_range(character)

    if block_name is None:
        return False

    return category != "Lo" and "Forms" in block_name
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """
    Tell whether a character sits in a Unicode range whose name contains
    "Emoticons" or "Pictographs".

    Characters outside every known range yield False.
    """
    block_name: str | None = unicode_range(character)

    if block_name is None:
        return False

    return any(keyword in block_name for keyword in ("Emoticons", "Pictographs"))
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Tell whether a character acts as a separator.

    Whitespace, the characters "|", "+", "<" and ">", any general category
    containing "Z", and the punctuation categories "Po", "Pd" and "Pc" all
    count as separators.
    """
    if character.isspace():
        return True

    if character in ("|", "+", "<", ">"):
        return True

    category: str = unicodedata.category(character)

    if "Z" in category:
        return True

    return category in ("Po", "Pd", "Pc")
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """
    Tell whether a character is cased, i.e. exactly one of
    ``str.islower`` / ``str.isupper`` holds for it.
    """
    lowercase: bool = character.islower()
    uppercase: bool = character.isupper()

    # Exclusive or: true when the two answers disagree.
    return lowercase ^ uppercase
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """
    Tell whether the Unicode name of a character contains "CJK".

    Characters without a name in the Unicode database yield False.
    """
    # The empty default covers characters unknown to the Unicode database.
    return "CJK" in unicodedata.name(character, "")
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """
    Tell whether the Unicode name of a character contains "HIRAGANA".

    Characters without a name in the Unicode database yield False.
    """
    # The empty default covers characters unknown to the Unicode database.
    return "HIRAGANA" in unicodedata.name(character, "")
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """
    Tell whether the Unicode name of a character contains "KATAKANA".

    Characters without a name in the Unicode database yield False.
    """
    # The empty default covers characters unknown to the Unicode database.
    return "KATAKANA" in unicodedata.name(character, "")
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """
    Tell whether the Unicode name of a character contains "HANGUL".

    Characters without a name in the Unicode database yield False.
    """
    # The empty default covers characters unknown to the Unicode database.
    return "HANGUL" in unicodedata.name(character, "")
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """
    Tell whether the Unicode name of a character contains "THAI".

    Characters without a name in the Unicode database yield False.
    """
    # The empty default covers characters unknown to the Unicode database.
    return "THAI" in unicodedata.name(character, "")
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """
    Tell whether the Unicode name of a character contains "ARABIC".

    Characters without a name in the Unicode database yield False.
    """
    # The empty default covers characters unknown to the Unicode database.
    return "ARABIC" in unicodedata.name(character, "")
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """
    Tell whether the Unicode name of a character contains both "ARABIC"
    and "ISOLATED FORM".

    Characters without a name in the Unicode database yield False.
    """
    # The empty default covers characters unknown to the Unicode database.
    name: str = unicodedata.name(character, "")

    return all(marker in name for marker in ("ARABIC", "ISOLATED FORM"))
+
+
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """
    Tell whether a Unicode range name contains at least one of the keywords
    listed in UNICODE_SECONDARY_RANGE_KEYWORD.
    """
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True

    return False
+
+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    Tell whether a character is neither whitespace nor printable.

    Two characters are deliberately never reported as unprintable:
    "\\x1a" (the ASCII substitute character) and "\\ufeff" (Zero Width
    No-Break Space, which Python does not acknowledge as a space).
    """
    # Whitespace (including \n \t \r \v) and printable characters are fine.
    if character.isspace() or character.isprintable():
        return False

    return character not in ("\x1a", "\ufeff")
+
+
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
    """
    Search the first ``search_zone`` bytes for an explicit encoding declaration.

    The leading bytes are decoded as ASCII (undecodable bytes are dropped)
    and scanned with RE_POSSIBLE_ENCODING_INDICATION. Each candidate is
    lowercased, has "-" replaced by "_", and is then compared against both
    the keys and the values of ``encodings.aliases.aliases``. The first hit
    returns the corresponding value of that table; None is returned when
    nothing matches.

    Raises TypeError when ``sequence`` is not a ``bytes`` instance.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    head: bytes = sequence[: min(len(sequence), search_zone)]

    candidates: list[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        head.decode("ascii", errors="ignore"),
    )

    for candidate in candidates:
        normalized: str = candidate.lower().replace("-", "_")

        for alias, canonical in aliases.items():
            if normalized in (alias, canonical):
                return canonical

    return None
+
+
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Tell whether an encoding, given by its Python codec name, is multi-byte.

    The UTF-7/8/16/32 family is recognised by name. Any other codec is
    multi-byte when the IncrementalDecoder of its ``encodings.<name>``
    module derives from MultibyteIncrementalDecoder.
    """
    unicode_transformation_formats = {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }

    if name in unicode_transformation_formats:
        return True

    decoder_class = importlib.import_module(f"encodings.{name}").IncrementalDecoder

    return issubclass(decoder_class, MultibyteIncrementalDecoder)
+
+
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
    """
    Detect a leading SIG/BOM in the given sequence.

    Returns the encoding name together with the matched mark for the first
    entry of ENCODING_MARKS that the sequence starts with, or
    ``(None, b"")`` when no mark is found.
    """
    for encoding_name, declared in ENCODING_MARKS.items():
        # An entry holds either a single mark or a list of marks.
        candidates: list[bytes] = (
            [declared] if isinstance(declared, bytes) else declared
        )

        for candidate in candidates:
            if sequence.startswith(candidate):
                return encoding_name, candidate

    return None, b""
+
+
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """
    Tell whether the SIG/BOM should be stripped for the given encoding.

    Returns False only for "utf_16" and "utf_32".
    """
    return iana_encoding != "utf_16" and iana_encoding != "utf_32"
+
+
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Return the Python normalized encoding name (not the official IANA name).

    The given name is lowercased and has "-" replaced by "_" before being
    compared against the keys and values of ``encodings.aliases.aliases``.
    When nothing matches, ValueError is raised if ``strict`` is true;
    otherwise the normalized input is returned as is.
    """
    normalized: str = cp_name.lower().replace("-", "_")

    for alias, canonical in aliases.items():
        if normalized == alias or normalized == canonical:
            return canonical

    if not strict:
        return normalized

    raise ValueError(f"Unable to retrieve IANA for '{normalized}'")
+
+
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Measure how similar two single-byte code pages are.

    Every possible byte value (0x00 through 0xFF) is decoded with both
    codecs, and the share of values producing the same output is returned
    as a ratio between 0.0 and 1.0.

    If either encoding is multi-byte the comparison is meaningless and 0.0
    is returned.
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder

    # Undecodable bytes decode to an empty string instead of raising.
    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    # A byte has 256 possible values. Both the loop and the divisor must use
    # that number so 0xFF is tested and identical code pages score exactly 1.0.
    byte_value_count: int = 256
    character_match_count: int = 0

    for i in range(byte_value_count):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / byte_value_count
+
+
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Tell whether two code pages are at least 80% similar.

    The answer is read from IANA_SUPPORTED_SIMILAR, a dict that was
    generated using the function cp_similarity.
    """
    if iana_name_a not in IANA_SUPPORTED_SIMILAR:
        return False

    return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
+
+
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Set the level of the named logger and attach a stream handler to it.

    A new ``logging.StreamHandler`` using ``format_string`` is added on
    every call; existing handlers are left in place.
    """
    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)

    formatter = logging.Formatter(format_string)
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)

    target_logger.addHandler(stream_handler)
+
+
def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: str | None = None,
) -> Generator[str, None, None]:
    """
    Yield decoded text chunks of ``chunk_size`` taken at each given offset.

    When ``decoded_payload`` is available and the decoder is not multi-byte,
    chunks are sliced straight from the decoded text and iteration stops at
    the first empty slice.

    Otherwise chunks are sliced from the raw bytes and decoded one by one:
    - offsets whose chunk would end more than 8 bytes past the end of
      ``sequences`` are skipped;
    - ``sig_payload`` is put back in front of each slice when a SIG/BOM is
      available and must not be stripped;
    - multi-byte decoders ignore decoding errors, other decoders are strict,
      so a decoding error may propagate to the caller.

    For multi-byte decoders, a chunk (other than one at offset 0) whose
    first characters cannot be found in ``decoded_payload`` is assumed to
    be cut in the middle of a character. The slice start is then moved
    back, up to three bytes, until the beginning of the chunk is found in
    ``decoded_payload``.
    """
    if decoded_payload and is_multi_byte_decoder is False:
        for start in offsets:
            piece = decoded_payload[start : start + chunk_size]
            if not piece:
                return
            yield piece
        return

    keep_mark = bom_or_sig_available and strip_sig_or_bom is False
    error_policy: str = "ignore" if is_multi_byte_decoder else "strict"
    # Number of leading characters used to check the chunk against the payload.
    probe_length: int = min(chunk_size, 16)

    for start in offsets:
        stop = start + chunk_size
        if stop > len(sequences) + 8:
            continue

        raw = sequences[start:stop]
        if keep_mark:
            raw = sig_payload + raw

        piece = raw.decode(encoding_iana, errors=error_policy)

        # Multi-byte bad cutting detector and adjustment.
        if (
            is_multi_byte_decoder
            and start > 0
            and decoded_payload
            and piece[:probe_length] not in decoded_payload
        ):
            for shifted_start in range(start, start - 4, -1):
                raw = sequences[shifted_start:stop]
                if keep_mark:
                    raw = sig_payload + raw

                piece = raw.decode(encoding_iana, errors="ignore")

                if piece[:probe_length] in decoded_payload:
                    break

        yield piece