aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.org29
-rw-r--r--manifest.scm1
-rw-r--r--qc.py108
-rw-r--r--quality_control/average.py2
-rw-r--r--quality_control/parsing.py3
-rw-r--r--quality_control/standard_error.py2
6 files changed, 141 insertions, 4 deletions
diff --git a/README.org b/README.org
index aef1a08..53ac63c 100644
--- a/README.org
+++ b/README.org
@@ -53,3 +53,32 @@ To check for correct type usage in the application, run:
#+BEGIN_SRC shell
mypy --show-error-codes .
#+END_SRC
+
+** Running QC
+
+*** Command-Line Version
+
+To run qc against a file, the syntax is:
+#+BEGIN_SRC shell
+ python3 -m qc [--strainsfile <strainsfile-path>] [--verbose] <filetype> <filepath>
+#+END_SRC
+where
+- ~<filetype>~ is one of "*average*" or "*standard-error*"
+- ~<filepath>~ is either an absolute path to the file, or a path relative to the
+ current working directory
+- if the ~--strainsfile~ option is not provided, it will default to the one in
+ the root directory of this repository
+- the ~--verbose~ option is a flag, defaulting to ~False~, that controls the
+ display of optional progress messages
+
+To view the usage information for the application, run:
+#+BEGIN_SRC shell
+ python3 -m qc --help
+#+END_SRC
+
+**** TODO Figure out how to put qc.py in a /bin or /scripts directory and still be able to import the modules in the repo
+**** TODO Reduce the command to simply ~qc [--strainsfile <strainsfile-path>] [--verbose] <filetype> <filepath>~
+
+*** Web Version
+
+Coming soon...
diff --git a/manifest.scm b/manifest.scm
index d93902a..9ec50a6 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -4,6 +4,7 @@
"python-mypy"
"python-redis"
"python-flask"
+ "python-magic"
"python-pylint"
"python-pytest"
"python-hypothesis"))
diff --git a/qc.py b/qc.py
new file mode 100644
index 0000000..fee74cb
--- /dev/null
+++ b/qc.py
@@ -0,0 +1,108 @@
+"""Implements the command-line interface for the qc application"""
+import os
+import sys
+import argparse
+
+import magic
+
+from quality_control.errors import ParseError
+from quality_control.parsing import (
+ FileType,
+ parse_file,
+ strain_names,
+ parse_errors,
+ parse_strains)
+
+
def is_file_mime(filepath, mimetype):
    """Tell whether `filepath` is detected as `mimetype` or as `text/plain`.

    The mimetype is whatever `magic.from_file(..., mime=True)` reports for
    the file; `text/plain` is always accepted in addition to `mimetype`.
    """
    detected = magic.from_file(filepath, mime=True)
    accepted = ("text/plain", mimetype)
    return detected in accepted
+
def cli_argument_parser():
    """Create the parser for the CLI arguments.

    Positional arguments are `filetype` (one of "average" or
    "standard-error") and `filepath`. The optional `-s/--strainsfile`
    defaults to `strains.csv` in the directory containing this module, and
    `-v/--verbose` is a boolean flag that defaults to False.

    Returns:
        The configured `argparse.ArgumentParser`.
    """
    def _help_safe(text):
        # argparse %-formats help strings, so a literal '%' (which can occur
        # in a filesystem path) must be doubled or rendering the help fails.
        return text.replace("%", "%%")

    parser = argparse.ArgumentParser(
        prog="qc",
        description=(
            "Command-Line Interface program for quality control of data "
            "files"))
    parser.add_argument(
        "filetype",
        help="The type of file to check",
        choices=("average", "standard-error"))
    parser.add_argument(
        "filepath",
        help=(
            # Adjacent literals are concatenated verbatim, so each fragment
            # carries its own trailing space.
            "The path to the file to be checked. "
            "If an absolute path is not provided, then the file will be "
            f"relative to '{_help_safe(os.getcwd())}'"))

    default_strains_file = os.path.join(
        os.path.dirname(__file__), "strains.csv")
    parser.add_argument(
        "-s", "--strainsfile",
        help=(
            "Path to the file containing allowable strains/samples. "
            f"[default '{_help_safe(default_strains_file)}']"),
        default=default_strains_file)

    parser.add_argument(
        "-v", "--verbose",
        help="Controls whether to show extra output",
        default=False, action="store_true")
    return parser
+
def check(filepath, filetype, strains, verbose=False):
    """Run the parser over `filepath` and print the outcome.

    Every item produced by `parse_file` is consumed; with `verbose` set, a
    progress message is printed for each one. If a `ParseError` is raised,
    a tab-separated "line / errors" listing is printed instead, built from
    what `parse_errors` yields. Nothing is returned.
    """
    try:
        checked = 0
        for _parsed in parse_file(filepath, filetype, strains):
            checked += 1
            if verbose:
                print(f"Checked line: {checked}")
    except ParseError as parse_error:
        print("line\terrors")
        failed_at = parse_error.args[0]["line_number"]
        # NOTE(review): the failing line number is handed to the parameter
        # that parse_errors names `seek_pos` -- confirm this is intended.
        remaining = parse_errors(filepath, filetype, strains, failed_at)
        for line_num, error in enumerate(remaining, start=failed_at + 1):
            print(f"{line_num}\t{' '.join(error['message'])}")
    else:
        print("Successfully checked the file. No errors found.")
+
+
def main():
    """Entry point function.

    Parses the command-line arguments, verifies that both the data file and
    the strains file exist and have an acceptable mimetype, then runs the
    check on the data file.

    Returns:
        int: 0 once the check has run (the value does not reflect whether
        the file contained errors, since `check` only prints them);
        1 if the data file does not exist; 2 if the strains file does not
        exist; 3 if the data file is neither tab-separated nor plain text;
        4 if the strains file is neither CSV nor plain text.
    """
    args = cli_argument_parser().parse_args()

    if not os.path.exists(args.filepath):
        # The f-prefix is needed for the path to be interpolated.
        print(f"The file '{args.filepath}' does not exist.", file=sys.stderr)
        return 1

    if not os.path.exists(args.strainsfile):
        print(
            f"The file '{args.strainsfile}' does not exist.", file=sys.stderr)
        return 2

    if not is_file_mime(args.filepath, "text/tab-separated-values"):
        print(
            f"The file '{args.filepath}' MUST be a tab-separated file.",
            file=sys.stderr)
        return 3

    if not is_file_mime(args.strainsfile, "text/csv"):
        # The strains file is validated as CSV, so the message says so.
        print(
            f"The file '{args.strainsfile}' MUST be a comma-separated file.",
            file=sys.stderr)
        return 4

    if args.verbose:
        print(f"Parsing the strain names from '{args.strainsfile}'")

    strains = strain_names(parse_strains(os.path.realpath(args.strainsfile)))

    filepath = os.path.realpath(args.filepath)
    if args.verbose:
        print(f"Checking '{filepath}' for errors")

    filetype = (
        FileType.AVERAGE if args.filetype == "average"
        else FileType.STANDARD_ERROR)
    # Forward --verbose so the per-line progress messages are shown.
    check(filepath, filetype, strains, args.verbose)
    return 0
+
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the process exit status;
    # calling main() bare would discard it and always exit with 0.
    sys.exit(main())
diff --git a/quality_control/average.py b/quality_control/average.py
index 3261e1c..2907b9c 100644
--- a/quality_control/average.py
+++ b/quality_control/average.py
@@ -8,6 +8,6 @@ def valid_value(val):
if re.search(r"^[0-9]+\.[0-9]{3}$", val):
return float(val)
raise InvalidCellValue(
- f"Invalid value '{val}'.\n"
+ f"Invalid value '{val}'. "
"Expected string representing a number with exactly three decimal "
"places.")
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index a4edb0f..9fe88f1 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -62,7 +62,7 @@ def parse_file(filepath: str, filetype: FileType, strains: list):
with open(filepath, encoding="utf-8") as input_file:
for line_number, line in enumerate(input_file):
if line_number == 0:
- yield __parse_header(line, strains)
+ yield __parse_header(line, strains), seek_pos + len(line)
seek_pos = seek_pos + len(line)
continue
@@ -83,7 +83,6 @@ def parse_file(filepath: str, filetype: FileType, strains: list):
def parse_errors(filepath: str, filetype: FileType, strains: list,
seek_pos: int = 0) -> Generator:
"""Retrieve ALL the parse errors"""
- print(f"seek_pos: {seek_pos}, {type(seek_pos)}")
assert seek_pos >= 0, "The seek position must be at least zero (0)"
def __error_type(error):
diff --git a/quality_control/standard_error.py b/quality_control/standard_error.py
index 805c30e..f1e33c4 100644
--- a/quality_control/standard_error.py
+++ b/quality_control/standard_error.py
@@ -8,6 +8,6 @@ def valid_value(val):
if re.search(r"^[0-9]+\.[0-9]{6,}$", val):
return float(val)
raise InvalidCellValue(
- f"Invalid value '{val}'.\n"
+ f"Invalid value '{val}'. "
"Expected string representing a number with at least six decimal "
"places.")