Diffstat (limited to '.venv/lib/python3.12/site-packages/charset_normalizer/cd.py')
-rw-r--r--  .venv/lib/python3.12/site-packages/charset_normalizer/cd.py  395
1 file changed, 395 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/charset_normalizer/cd.py b/.venv/lib/python3.12/site-packages/charset_normalizer/cd.py
new file mode 100644
index 00000000..71a3ed51
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/charset_normalizer/cd.py
@@ -0,0 +1,395 @@
+from __future__ import annotations
+
+import importlib
+from codecs import IncrementalDecoder
+from collections import Counter
+from functools import lru_cache
+from typing import Counter as TypeCounter
+
+from .constant import (
+ FREQUENCIES,
+ KO_NAMES,
+ LANGUAGE_SUPPORTED_COUNT,
+ TOO_SMALL_SEQUENCE,
+ ZH_NAMES,
+)
+from .md import is_suspiciously_successive_range
+from .models import CoherenceMatches
+from .utils import (
+ is_accentuated,
+ is_latin,
+ is_multi_byte_encoding,
+ is_unicode_range_secondary,
+ unicode_range,
+)
+
+
+def encoding_unicode_range(iana_name: str) -> list[str]:
+ """
+    Return the Unicode ranges associated with a single-byte code page.
+ """
+ if is_multi_byte_encoding(iana_name):
+ raise OSError("Function not supported on multi-byte code page")
+
+ decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
+
+ p: IncrementalDecoder = decoder(errors="ignore")
+ seen_ranges: dict[str, int] = {}
+ character_count: int = 0
+
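+    # Decode every byte value from 0x40 to 0xFE one at a time and tally the
+    # non-secondary Unicode ranges the resulting characters fall into.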
+ for i in range(0x40, 0xFF):
+ chunk: str = p.decode(bytes([i]))
+
+ if chunk:
+ character_range: str | None = unicode_range(chunk)
+
+ if character_range is None:
+ continue
+
+ if is_unicode_range_secondary(character_range) is False:
+ if character_range not in seen_ranges:
+ seen_ranges[character_range] = 0
+ seen_ranges[character_range] += 1
+ character_count += 1
+
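+    # Only report ranges that cover at least 15% of the characters counted above.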
+ return sorted(
+ [
+ character_range
+ for character_range in seen_ranges
+ if seen_ranges[character_range] / character_count >= 0.15
+ ]
+ )
+
+
+def unicode_range_languages(primary_range: str) -> list[str]:
+ """
+ Return inferred languages used with a unicode range.
+ """
+ languages: list[str] = []
+
+ for language, characters in FREQUENCIES.items():
+ for character in characters:
+ if unicode_range(character) == primary_range:
+ languages.append(language)
+ break
+
+ return languages
+
+
+@lru_cache()
+def encoding_languages(iana_name: str) -> list[str]:
+ """
+    Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
+    This function establishes that correspondence.
+ """
+ unicode_ranges: list[str] = encoding_unicode_range(iana_name)
+ primary_range: str | None = None
+
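+    # The first non-Latin range found is taken as the primary hint for the code
+    # page; if every range is Latin, fall back to the generic "Latin Based" tag.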
+ for specified_range in unicode_ranges:
+ if "Latin" not in specified_range:
+ primary_range = specified_range
+ break
+
+ if primary_range is None:
+ return ["Latin Based"]
+
+ return unicode_range_languages(primary_range)
+
+
+@lru_cache()
+def mb_encoding_languages(iana_name: str) -> list[str]:
+ """
+    Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
+    This function establishes that correspondence.
+ """
+ if (
+ iana_name.startswith("shift_")
+ or iana_name.startswith("iso2022_jp")
+ or iana_name.startswith("euc_j")
+ or iana_name == "cp932"
+ ):
+ return ["Japanese"]
+ if iana_name.startswith("gb") or iana_name in ZH_NAMES:
+ return ["Chinese"]
+ if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
+ return ["Korean"]
+
+ return []
+
+
+@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
+def get_target_features(language: str) -> tuple[bool, bool]:
+ """
+    Determine the main aspects of a supported language: whether it contains accents and whether it is pure Latin.
+ """
+ target_have_accents: bool = False
+ target_pure_latin: bool = True
+
+ for character in FREQUENCIES[language]:
+ if not target_have_accents and is_accentuated(character):
+ target_have_accents = True
+ if target_pure_latin and is_latin(character) is False:
+ target_pure_latin = False
+
+ return target_have_accents, target_pure_latin
+
+
+def alphabet_languages(
+ characters: list[str], ignore_non_latin: bool = False
+) -> list[str]:
+ """
+    Return the languages associated with the given characters.
+ """
+ languages: list[tuple[str, float]] = []
+
+ source_have_accents = any(is_accentuated(character) for character in characters)
+
+ for language, language_characters in FREQUENCIES.items():
+ target_have_accents, target_pure_latin = get_target_features(language)
+
+ if ignore_non_latin and target_pure_latin is False:
+ continue
+
+ if target_have_accents is False and source_have_accents:
+ continue
+
+ character_count: int = len(language_characters)
+
+ character_match_count: int = len(
+ [c for c in language_characters if c in characters]
+ )
+
+ ratio: float = character_match_count / character_count
+
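+        # Keep the language only if at least 20% of its frequency table appears
+        # in the given character set.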
+ if ratio >= 0.2:
+ languages.append((language, ratio))
+
+ languages = sorted(languages, key=lambda x: x[1], reverse=True)
+
+ return [compatible_language[0] for compatible_language in languages]
+
+
+def characters_popularity_compare(
+ language: str, ordered_characters: list[str]
+) -> float:
+ """
+    Determine if an ordered character list (by occurrence, from most frequent to rarest) matches a particular language.
+    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
+    Beware that this function is not strict on the match, in order to ease detection. (Meaning a close match counts as 1.)
+ """
+ if language not in FREQUENCIES:
+ raise ValueError(f"{language} not available")
+
+ character_approved_count: int = 0
+ FREQUENCIES_language_set = set(FREQUENCIES[language])
+
+ ordered_characters_count: int = len(ordered_characters)
+ target_language_characters_count: int = len(FREQUENCIES[language])
+
+ large_alphabet: bool = target_language_characters_count > 26
+
+ for character, character_rank in zip(
+ ordered_characters, range(0, ordered_characters_count)
+ ):
+ if character not in FREQUENCIES_language_set:
+ continue
+
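+        # Project the character's observed rank onto the scale of the target
+        # language's frequency table and compare it to its expected rank there.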
+ character_rank_in_language: int = FREQUENCIES[language].index(character)
+ expected_projection_ratio: float = (
+ target_language_characters_count / ordered_characters_count
+ )
+ character_rank_projection: int = int(character_rank * expected_projection_ratio)
+
+ if (
+ large_alphabet is False
+ and abs(character_rank_projection - character_rank_in_language) > 4
+ ):
+ continue
+
+ if (
+ large_alphabet is True
+ and abs(character_rank_projection - character_rank_in_language)
+ < target_language_characters_count / 3
+ ):
+ character_approved_count += 1
+ continue
+
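+        # The rank check was inconclusive: compare the sets of characters seen
+        # before/after this one with those listed before/after it in the
+        # language frequency table.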
+ characters_before_source: list[str] = FREQUENCIES[language][
+ 0:character_rank_in_language
+ ]
+ characters_after_source: list[str] = FREQUENCIES[language][
+ character_rank_in_language:
+ ]
+ characters_before: list[str] = ordered_characters[0:character_rank]
+ characters_after: list[str] = ordered_characters[character_rank:]
+
+ before_match_count: int = len(
+ set(characters_before) & set(characters_before_source)
+ )
+
+ after_match_count: int = len(
+ set(characters_after) & set(characters_after_source)
+ )
+
+ if len(characters_before_source) == 0 and before_match_count <= 4:
+ character_approved_count += 1
+ continue
+
+ if len(characters_after_source) == 0 and after_match_count <= 4:
+ character_approved_count += 1
+ continue
+
+ if (
+ before_match_count / len(characters_before_source) >= 0.4
+ or after_match_count / len(characters_after_source) >= 0.4
+ ):
+ character_approved_count += 1
+ continue
+
+ return character_approved_count / len(ordered_characters)
+
+
+def alpha_unicode_split(decoded_sequence: str) -> list[str]:
+ """
+ Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
+    Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
+    one containing the Latin letters and the other the Hebrew ones.
+ """
+ layers: dict[str, str] = {}
+
+ for character in decoded_sequence:
+ if character.isalpha() is False:
+ continue
+
+ character_range: str | None = unicode_range(character)
+
+ if character_range is None:
+ continue
+
+ layer_target_range: str | None = None
+
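+        # Attach the character to an existing layer whose Unicode range is not a
+        # suspicious neighbor of its own range; otherwise a new layer is opened.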
+ for discovered_range in layers:
+ if (
+ is_suspiciously_successive_range(discovered_range, character_range)
+ is False
+ ):
+ layer_target_range = discovered_range
+ break
+
+ if layer_target_range is None:
+ layer_target_range = character_range
+
+ if layer_target_range not in layers:
+ layers[layer_target_range] = character.lower()
+ continue
+
+ layers[layer_target_range] += character.lower()
+
+ return list(layers.values())
+
+
+def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
+ """
+    This function merges results previously produced by the coherence_ratio function.
+ The return type is the same as coherence_ratio.
+ """
+ per_language_ratios: dict[str, list[float]] = {}
+ for result in results:
+ for sub_result in result:
+ language, ratio = sub_result
+ if language not in per_language_ratios:
+ per_language_ratios[language] = [ratio]
+ continue
+ per_language_ratios[language].append(ratio)
+
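+    # A language's merged score is the average of its ratios across all layers,
+    # rounded to 4 decimal places.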
+ merge = [
+ (
+ language,
+ round(
+ sum(per_language_ratios[language]) / len(per_language_ratios[language]),
+ 4,
+ ),
+ )
+ for language in per_language_ratios
+ ]
+
+ return sorted(merge, key=lambda x: x[1], reverse=True)
+
+
+def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
+ """
+ We shall NOT return "English—" in CoherenceMatches because it is an alternative
+ of "English". This function only keeps the best match and remove the em-dash in it.
+ """
+ index_results: dict[str, list[float]] = dict()
+
+ for result in results:
+ language, ratio = result
+ no_em_name: str = language.replace("—", "")
+
+ if no_em_name not in index_results:
+ index_results[no_em_name] = []
+
+ index_results[no_em_name].append(ratio)
+
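+    # When a language was scored both with and without its em-dash variant,
+    # keep only the best ratio under the merged name.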
+ if any(len(index_results[e]) > 1 for e in index_results):
+ filtered_results: CoherenceMatches = []
+
+ for language in index_results:
+ filtered_results.append((language, max(index_results[language])))
+
+ return filtered_results
+
+ return results
+
+
+@lru_cache(maxsize=2048)
+def coherence_ratio(
+ decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
+) -> CoherenceMatches:
+ """
+    Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
+ A layer = Character extraction by alphabets/ranges.
+ """
+
+ results: list[tuple[str, float]] = []
+ ignore_non_latin: bool = False
+
+ sufficient_match_count: int = 0
+
+ lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
+ if "Latin Based" in lg_inclusion_list:
+ ignore_non_latin = True
+ lg_inclusion_list.remove("Latin Based")
+
+ for layer in alpha_unicode_split(decoded_sequence):
+ sequence_frequencies: TypeCounter[str] = Counter(layer)
+ most_common = sequence_frequencies.most_common()
+
+ character_count: int = sum(o for c, o in most_common)
+
+ if character_count <= TOO_SMALL_SEQUENCE:
+ continue
+
+ popular_character_ordered: list[str] = [c for c, o in most_common]
+
+ for language in lg_inclusion_list or alphabet_languages(
+ popular_character_ordered, ignore_non_latin
+ ):
+ ratio: float = characters_popularity_compare(
+ language, popular_character_ordered
+ )
+
+ if ratio < threshold:
+ continue
+ elif ratio >= 0.8:
+ sufficient_match_count += 1
+
+ results.append((language, round(ratio, 4)))
+
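+            # Three strong matches (ratio >= 0.8) are considered conclusive;
+            # stop testing additional languages for this layer.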
+            if sufficient_match_count >= 3:
+                break
+
+ return sorted(
+ filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
+ )