From 903af1c0b1f2cc695ea4e0c31438f9205571d15d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 27 Apr 2022 17:37:22 +0300 Subject: Implement command-line interface for QC of files --- README.org | 29 ++++++++++ manifest.scm | 1 + qc.py | 108 ++++++++++++++++++++++++++++++++++++++ quality_control/average.py | 2 +- quality_control/parsing.py | 3 +- quality_control/standard_error.py | 2 +- 6 files changed, 141 insertions(+), 4 deletions(-) create mode 100644 qc.py diff --git a/README.org b/README.org index aef1a08..53ac63c 100644 --- a/README.org +++ b/README.org @@ -53,3 +53,32 @@ To check for correct type usage in the application, run: #+BEGIN_SRC shell mypy --show-error-codes . #+END_SRC + +** Running QC + +*** Command-Line Version + +To run qc against a file, the syntax is: +#+BEGIN_SRC shell + python3 -m qc [--strainsfile <strainsfile-path>] [--verbose] <file-type> <file-path> +#+END_SRC +where +- ~<file-type>~ is one of "*average*" or "*standard-error*" +- ~<file-path>~ is either an absolute path to the file, or a path relative to the + current working directory +- if the ~--strainsfile~ option is not provided, it will default to the one in + the root directory of this repository +- the ~--verbose~ option is a flag, defaulting to ~False~, that controls the + display of optional progress messages + +To view the usage information for the application, run +#+BEGIN_SRC shell + python3 -m qc --help +#+END_SRC + +**** TODO Figure out how to put qc.py in a /bin or /scripts directory and still be able to import the modules in the repo +**** TODO Reduce the command to simply ~qc [--strainsfile <strainsfile-path>] [--verbose] <file-type> <file-path>~ + +*** Web Version + +Coming soon... 
diff --git a/manifest.scm b/manifest.scm
index d93902a..9ec50a6 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -4,6 +4,7 @@
        "python-mypy"
        "python-redis"
        "python-flask"
+       "python-magic"
        "python-pylint"
        "python-pytest"
        "python-hypothesis"))
diff --git a/qc.py b/qc.py
new file mode 100644
index 0000000..fee74cb
--- /dev/null
+++ b/qc.py
@@ -0,0 +1,108 @@
+"""Implements the command-line interface for the qc application"""
+import os
+import sys
+import argparse
+
+import magic
+
+from quality_control.errors import ParseError
+from quality_control.parsing import (
+    FileType,
+    parse_file,
+    strain_names,
+    parse_errors,
+    parse_strains)
+
+
+def is_file_mime(filepath, mimetype):
+    """Check that `filepath` has a mimetype of `mimetype` or `text/plain`"""
+    return magic.from_file(filepath, mime=True) in ("text/plain", mimetype)
+
+def cli_argument_parser():
+    """Create the parser for the CLI arguments"""
+    parser = argparse.ArgumentParser(
+        prog="qc", description=(
+            "Command-Line Interface program for quality control of data files"))
+    parser.add_argument(
+        "filetype",
+        help="The type of file to check",
+        choices=("average", "standard-error"))
+    parser.add_argument(
+        "filepath",
+        help=(
+            "The path to the file to be checked. "
+            "If an absolute path is not provided, then the file will be relative to"
+            f"\t'{os.getcwd()}'"))
+    default_strains_file = os.path.join(
+        os.path.dirname(__file__), "strains.csv")
+    parser.add_argument(
+        "-s", "--strainsfile",
+        help=(
+            "Path to the file containing allowable strains/samples. "
+            f"[default '{default_strains_file}']"),
+        default=default_strains_file)
+
+    parser.add_argument(
+        "-v", "--verbose",
+        help="Controls whether to show extra output",
+        default=False, action="store_true")
+    return parser
+
+def check(filepath, filetype, strains, verbose=False):
+    """Check the file, printing progress if `verbose`, and list any errors found"""
+    try:
+        for line_num, _line in enumerate(parse_file(
+                filepath, filetype, strains), start=1):
+            if verbose:
+                print(f"Checked line: {line_num}")
+
+        print("Successfully checked the file. No errors found.")
+    except ParseError as pe:
+        print("line\terrors")
+        for line_num, error in enumerate(
+                parse_errors(filepath, filetype, strains,
+                             pe.args[0]["line_number"]),
+                start=pe.args[0]["line_number"] + 1):
+            print(f"{line_num}\t{' '.join(error['message'])}")
+
+
+def main():
+    """Entry point: returns a non-zero exit code when the inputs are unusable"""
+    argparser = cli_argument_parser()
+    args = argparser.parse_args()
+    if not os.path.exists(args.filepath):
+        print(f"The file '{args.filepath}' does not exist.", file=sys.stderr)
+        return 1
+
+    if not os.path.exists(args.strainsfile):
+        print(f"The file '{args.strainsfile}' does not exist.", file=sys.stderr)
+        return 2
+
+    if not is_file_mime(args.filepath, "text/tab-separated-values"):
+        print(
+            f"The file '{args.filepath}' MUST be a tab-separated file.",
+            file=sys.stderr)
+        return 3
+
+    if not is_file_mime(args.strainsfile, "text/csv"):
+        print(
+            f"The file '{args.strainsfile}' MUST be a comma-separated file.",
+            file=sys.stderr)
+        return 4
+
+    if args.verbose:
+        print(f"Parsing the strain names from '{args.strainsfile}'")
+
+    strains = strain_names(parse_strains(os.path.realpath(args.strainsfile)))
+
+    filepath = os.path.realpath(args.filepath)
+    if args.verbose:
+        print(f"Checking '{filepath}' for errors")
+
+    check(
+        filepath, (
+            FileType.AVERAGE if args.filetype == "average"
+            else FileType.STANDARD_ERROR), strains, args.verbose)
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/quality_control/average.py b/quality_control/average.py
index 
3261e1c..2907b9c 100644 --- a/quality_control/average.py +++ b/quality_control/average.py @@ -8,6 +8,6 @@ def valid_value(val): if re.search(r"^[0-9]+\.[0-9]{3}$", val): return float(val) raise InvalidCellValue( - f"Invalid value '{val}'.\n" + f"Invalid value '{val}'. " "Expected string representing a number with exactly three decimal " "places.") diff --git a/quality_control/parsing.py b/quality_control/parsing.py index a4edb0f..9fe88f1 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -62,7 +62,7 @@ def parse_file(filepath: str, filetype: FileType, strains: list): with open(filepath, encoding="utf-8") as input_file: for line_number, line in enumerate(input_file): if line_number == 0: - yield __parse_header(line, strains) + yield __parse_header(line, strains), seek_pos + len(line) seek_pos = seek_pos + len(line) continue @@ -83,7 +83,6 @@ def parse_file(filepath: str, filetype: FileType, strains: list): def parse_errors(filepath: str, filetype: FileType, strains: list, seek_pos: int = 0) -> Generator: """Retrieve ALL the parse errors""" - print(f"seek_pos: {seek_pos}, {type(seek_pos)}") assert seek_pos >= 0, "The seek position must be at least zero (0)" def __error_type(error): diff --git a/quality_control/standard_error.py b/quality_control/standard_error.py index 805c30e..f1e33c4 100644 --- a/quality_control/standard_error.py +++ b/quality_control/standard_error.py @@ -8,6 +8,6 @@ def valid_value(val): if re.search(r"^[0-9]+\.[0-9]{6,}$", val): return float(val) raise InvalidCellValue( - f"Invalid value '{val}'.\n" + f"Invalid value '{val}'. " "Expected string representing a number with at least six decimal " "places.") -- cgit v1.2.3