diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/charset_normalizer/legacy.py | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py b/.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py new file mode 100644 index 00000000..a2f53451 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any +from warnings import warn + +from .api import from_bytes +from .constant import CHARDET_CORRESPONDENCE + +# TODO: remove this check when dropping Python 3.7 support +if TYPE_CHECKING: + from typing_extensions import TypedDict + + class ResultDict(TypedDict): + encoding: str | None + language: str + confidence: float | None + + +def detect( + byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any +) -> ResultDict: + """ + chardet legacy method + Detect the encoding of the given byte string. It should be mostly backward-compatible. + Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it) + This function is deprecated and should be used to migrate your project easily, consult the documentation for + further information. Not planned for removal. + + :param byte_str: The byte sequence to examine. + :param should_rename_legacy: Should we rename legacy encodings + to their more modern equivalents? + """ + if len(kwargs): + warn( + f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()" + ) + + if not isinstance(byte_str, (bytearray, bytes)): + raise TypeError( # pragma: nocover + "Expected object of type bytes or bytearray, got: " "{}".format( + type(byte_str) + ) + ) + + if isinstance(byte_str, bytearray): + byte_str = bytes(byte_str) + + r = from_bytes(byte_str).best() + + encoding = r.encoding if r is not None else None + language = r.language if r is not None and r.language != "Unknown" else "" + confidence = 1.0 - r.chaos if r is not None else None + + # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process + # but chardet does return 'utf-8-sig' and it is a valid codec name. + if r is not None and encoding == "utf_8" and r.bom: + encoding += "_sig" + + if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE: + encoding = CHARDET_CORRESPONDENCE[encoding] + + return { + "encoding": encoding, + "language": language, + "confidence": confidence, + } |