aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py')
-rw-r--r--.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py321
1 files changed, 321 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py b/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py
new file mode 100644
index 00000000..64a290f2
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py
@@ -0,0 +1,321 @@
+from __future__ import annotations
+
+import argparse
+import sys
+from json import dumps
+from os.path import abspath, basename, dirname, join, realpath
+from platform import python_version
+from unicodedata import unidata_version
+
+import charset_normalizer.md as md_module
+from charset_normalizer import from_fp
+from charset_normalizer.models import CliDetectionResult
+from charset_normalizer.version import __version__
+
+
+def query_yes_no(question: str, default: str = "yes") -> bool:
+ """Ask a yes/no question via input() and return their answer.
+
+ "question" is a string that is presented to the user.
+ "default" is the presumed answer if the user just hits <Enter>.
+ It must be "yes" (the default), "no" or None (meaning
+ an answer is required of the user).
+
+ The "answer" return value is True for "yes" or False for "no".
+
+ Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
+ """
+ valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
+ if default is None:
+ prompt = " [y/n] "
+ elif default == "yes":
+ prompt = " [Y/n] "
+ elif default == "no":
+ prompt = " [y/N] "
+ else:
+ raise ValueError("invalid default answer: '%s'" % default)
+
+ while True:
+ sys.stdout.write(question + prompt)
+ choice = input().lower()
+ if default is not None and choice == "":
+ return valid[default]
+ elif choice in valid:
+ return valid[choice]
+ else:
+ sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
+
+
+def cli_detect(argv: list[str] | None = None) -> int:
+ """
+ CLI assistant using ARGV and ArgumentParser
+ :param argv:
+ :return: 0 if everything is fine, anything else equal trouble
+ """
+ parser = argparse.ArgumentParser(
+ description="The Real First Universal Charset Detector. "
+ "Discover originating encoding used on text file. "
+ "Normalize text to unicode."
+ )
+
+ parser.add_argument(
+ "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
+ )
+ parser.add_argument(
+ "-v",
+ "--verbose",
+ action="store_true",
+ default=False,
+ dest="verbose",
+ help="Display complementary information about file if any. "
+ "Stdout will contain logs about the detection process.",
+ )
+ parser.add_argument(
+ "-a",
+ "--with-alternative",
+ action="store_true",
+ default=False,
+ dest="alternatives",
+ help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
+ )
+ parser.add_argument(
+ "-n",
+ "--normalize",
+ action="store_true",
+ default=False,
+ dest="normalize",
+ help="Permit to normalize input file. If not set, program does not write anything.",
+ )
+ parser.add_argument(
+ "-m",
+ "--minimal",
+ action="store_true",
+ default=False,
+ dest="minimal",
+ help="Only output the charset detected to STDOUT. Disabling JSON output.",
+ )
+ parser.add_argument(
+ "-r",
+ "--replace",
+ action="store_true",
+ default=False,
+ dest="replace",
+ help="Replace file when trying to normalize it instead of creating a new one.",
+ )
+ parser.add_argument(
+ "-f",
+ "--force",
+ action="store_true",
+ default=False,
+ dest="force",
+ help="Replace file without asking if you are sure, use this flag with caution.",
+ )
+ parser.add_argument(
+ "-i",
+ "--no-preemptive",
+ action="store_true",
+ default=False,
+ dest="no_preemptive",
+ help="Disable looking at a charset declaration to hint the detector.",
+ )
+ parser.add_argument(
+ "-t",
+ "--threshold",
+ action="store",
+ default=0.2,
+ type=float,
+ dest="threshold",
+ help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
+ )
+ parser.add_argument(
+ "--version",
+ action="version",
+ version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
+ __version__,
+ python_version(),
+ unidata_version,
+ "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
+ ),
+ help="Show version information and exit.",
+ )
+
+ args = parser.parse_args(argv)
+
+ if args.replace is True and args.normalize is False:
+ if args.files:
+ for my_file in args.files:
+ my_file.close()
+ print("Use --replace in addition of --normalize only.", file=sys.stderr)
+ return 1
+
+ if args.force is True and args.replace is False:
+ if args.files:
+ for my_file in args.files:
+ my_file.close()
+ print("Use --force in addition of --replace only.", file=sys.stderr)
+ return 1
+
+ if args.threshold < 0.0 or args.threshold > 1.0:
+ if args.files:
+ for my_file in args.files:
+ my_file.close()
+ print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
+ return 1
+
+ x_ = []
+
+ for my_file in args.files:
+ matches = from_fp(
+ my_file,
+ threshold=args.threshold,
+ explain=args.verbose,
+ preemptive_behaviour=args.no_preemptive is False,
+ )
+
+ best_guess = matches.best()
+
+ if best_guess is None:
+ print(
+ 'Unable to identify originating encoding for "{}". {}'.format(
+ my_file.name,
+ (
+ "Maybe try increasing maximum amount of chaos."
+ if args.threshold < 1.0
+ else ""
+ ),
+ ),
+ file=sys.stderr,
+ )
+ x_.append(
+ CliDetectionResult(
+ abspath(my_file.name),
+ None,
+ [],
+ [],
+ "Unknown",
+ [],
+ False,
+ 1.0,
+ 0.0,
+ None,
+ True,
+ )
+ )
+ else:
+ x_.append(
+ CliDetectionResult(
+ abspath(my_file.name),
+ best_guess.encoding,
+ best_guess.encoding_aliases,
+ [
+ cp
+ for cp in best_guess.could_be_from_charset
+ if cp != best_guess.encoding
+ ],
+ best_guess.language,
+ best_guess.alphabets,
+ best_guess.bom,
+ best_guess.percent_chaos,
+ best_guess.percent_coherence,
+ None,
+ True,
+ )
+ )
+
+ if len(matches) > 1 and args.alternatives:
+ for el in matches:
+ if el != best_guess:
+ x_.append(
+ CliDetectionResult(
+ abspath(my_file.name),
+ el.encoding,
+ el.encoding_aliases,
+ [
+ cp
+ for cp in el.could_be_from_charset
+ if cp != el.encoding
+ ],
+ el.language,
+ el.alphabets,
+ el.bom,
+ el.percent_chaos,
+ el.percent_coherence,
+ None,
+ False,
+ )
+ )
+
+ if args.normalize is True:
+ if best_guess.encoding.startswith("utf") is True:
+ print(
+ '"{}" file does not need to be normalized, as it already came from unicode.'.format(
+ my_file.name
+ ),
+ file=sys.stderr,
+ )
+ if my_file.closed is False:
+ my_file.close()
+ continue
+
+ dir_path = dirname(realpath(my_file.name))
+ file_name = basename(realpath(my_file.name))
+
+ o_: list[str] = file_name.split(".")
+
+ if args.replace is False:
+ o_.insert(-1, best_guess.encoding)
+ if my_file.closed is False:
+ my_file.close()
+ elif (
+ args.force is False
+ and query_yes_no(
+ 'Are you sure to normalize "{}" by replacing it ?'.format(
+ my_file.name
+ ),
+ "no",
+ )
+ is False
+ ):
+ if my_file.closed is False:
+ my_file.close()
+ continue
+
+ try:
+ x_[0].unicode_path = join(dir_path, ".".join(o_))
+
+ with open(x_[0].unicode_path, "wb") as fp:
+ fp.write(best_guess.output())
+ except OSError as e:
+ print(str(e), file=sys.stderr)
+ if my_file.closed is False:
+ my_file.close()
+ return 2
+
+ if my_file.closed is False:
+ my_file.close()
+
+ if args.minimal is False:
+ print(
+ dumps(
+ [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
+ ensure_ascii=True,
+ indent=4,
+ )
+ )
+ else:
+ for my_file in args.files:
+ print(
+ ", ".join(
+ [
+ el.encoding or "undefined"
+ for el in x_
+ if el.path == abspath(my_file.name)
+ ]
+ )
+ )
+
+ return 0
+
+
+if __name__ == "__main__":
+ cli_detect()