about summary refs log tree commit diff
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2024-10-24 16:44:17 -0500
committer Frederick Muriuki Muriithi 2024-10-24 16:44:17 -0500
commit f8e45d54be507d5080e9a6a3a3026907f60507a9 (patch)
tree d3085f6224cfa930dff3496eec0625e1be1e9278
parent da177466dfbdc36c866c153c41babe52b15d8788 (diff)
download gn-uploader-f8e45d54be507d5080e9a6a3a3026907f60507a9.tar.gz
Check pheno, phenose, and phenonum files with logging
Add logging to the `qc_pheno_file` function so that log messages are
pushed to Redis for every file that is checked for errors.
-rw-r--r-- scripts/rqtl2/phenotypes_qc.py 119
1 file changed, 71 insertions, 48 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index 4c02578..d8bf3c2 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -150,7 +150,7 @@ def redis_logger(
 
 
 def qc_phenocovar_file(
-        filename: Path,
+        filepath: Path,
         redisuri,
         fqkey: str,
         separator: str,
@@ -159,20 +159,20 @@ def qc_phenocovar_file(
     with redis_logger(
             redisuri,
             f"{__MODULE__}.qc_phenocovar_file",
-            filename.name,
+            filepath.name,
             fqkey) as logger:
-        logger.info("Running QC on file: %s", filename.name)
-        _csvfile = rqtl2.read_csv_file(filename, separator, comment_char)
+        logger.info("Running QC on file: %s", filepath.name)
+        _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
         _headings = tuple(heading.lower() for heading in next(_csvfile))
         _errors: tuple[InvalidValue, ...] = tuple()
         for heading in ("description", "units"):
             if heading not in _headings:
                 _errors = (InvalidValue(
-                    filename.name,
+                    filepath.name,
                     "header row",
                     "-",
                     "-",
-                    (f"File {filename.name} is missing the {heading} heading "
+                    (f"File {filepath.name} is missing the {heading} heading "
                      "in the header line.")),)
 
         def collect_errors(errors_and_linecount, line):
@@ -180,16 +180,16 @@ def qc_phenocovar_file(
             logger.info("Testing record '%s'", line[0])
             if len(line) != len(_headings):
                 _errs = _errs + (InvalidValue(
-                    filename.name,
+                    filepath.name,
                     line[0],
                     "-",
                     "-",
-                    (f"Record {_lc} in file {filename.name} has a different "
+                    (f"Record {_lc} in file {filepath.name} has a different "
                         "number of columns than the number of headings")),)
             _line = dict(zip(_headings, line))
             if not bool(_line["description"]):
                 _errs = _errs + (
-                    InvalidValue(filename.name,
+                    InvalidValue(filepath.name,
                                  _line[_headings[0]],
                                  "description",
                                  _line["description"],
@@ -198,7 +198,7 @@ def qc_phenocovar_file(
             return _errs, _lc+1
 
         return {
-            filename.name: dict(zip(
+            filepath.name: dict(zip(
                 ("errors", "linecount"),
                 reduce(collect_errors, _csvfile, (_errors, 1))))
         }
@@ -242,6 +242,8 @@ def integer_error(
 
 def qc_pheno_file(# pylint: disable=[too-many-arguments]
         filepath: Path,
+        redisuri: str,
+        fqkey: str,
         samples: tuple[str, ...],
         phenonames: tuple[str, ...],
         separator: str,
@@ -250,49 +252,55 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments]
         error_fn: Callable = decimal_points_error
 ):
     """Run QC/QA on a `pheno` file."""
-    _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
-    _headings: tuple[str, ...] = tuple(
-        heading.lower() for heading in next(_csvfile))
-    _errors: tuple[InvalidValue, ...] = tuple()
-
-    _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames)
-    if len(_absent) > 0:
-        _errors = _errors + (InvalidValue(
-            filepath.name,
-            "header row",
-            "-",
-            ", ".join(_absent),
-            (f"The phenotype names ({', '.join(samples)}) do not exist in any "
-             "of the provided phenocovar files.")),)
-
-    def collect_errors(errors_and_linecount, line):
-        _errs, _lc = errors_and_linecount
-        if line[0] not in samples:
-            _errs = _errs + (InvalidValue(
+    with redis_logger(
+            redisuri,
+            f"{__MODULE__}.qc_pheno_file",
             filepath.name,
-            line[0],
-            _headings[0],
-            line[0],
-            (f"The sample named '{line[0]}' does not exist in the database. "
-             "You will need to upload that first.")),)
-
-        for field, value in zip(_headings[1:], line[1:]):
-            if value in na_strings:
-                continue
-            _err = error_fn(
+            fqkey) as logger:
+        logger.info("Running QC on file: %s", filepath.name)
+        _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
+        _headings: tuple[str, ...] = tuple(
+            heading.lower() for heading in next(_csvfile))
+        _errors: tuple[InvalidValue, ...] = tuple()
+
+        _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames)
+        if len(_absent) > 0:
+            _errors = _errors + (InvalidValue(
                 filepath.name,
+                "header row",
+                "-",
+                ", ".join(_absent),
+                (f"The phenotype names ({', '.join(samples)}) do not exist in any "
+                 "of the provided phenocovar files.")),)
+
+        def collect_errors(errors_and_linecount, line):
+            _errs, _lc = errors_and_linecount
+            if line[0] not in samples:
+                _errs = _errs + (InvalidValue(
+                filepath.name,
+                line[0],
+                _headings[0],
                 line[0],
-                field,
-                value)
-            _errs = _errs + ((_err,) if bool(_err) else tuple())
+                (f"The sample named '{line[0]}' does not exist in the database. "
+                 "You will need to upload that first.")),)
+
+            for field, value in zip(_headings[1:], line[1:]):
+                if value in na_strings:
+                    continue
+                _err = error_fn(
+                    filepath.name,
+                    line[0],
+                    field,
+                    value)
+                _errs = _errs + ((_err,) if bool(_err) else tuple())
 
-        return _errs, _lc+1
+            return _errs, _lc+1
 
-    return {
-        filepath.name: dict(zip(
-            ("errors", "linecount"),
-            reduce(collect_errors, _csvfile, (_errors, 1))))
-    }
+        return {
+            filepath.name: dict(zip(
+                ("errors", "linecount"),
+                reduce(collect_errors, _csvfile, (_errors, 1))))
+        }
 
 
 def phenotype_names(filepath: Path,
@@ -384,6 +392,11 @@ def run_qc(# pylint: disable=[too-many-locals]
         logger.debug("Check for errors in 'pheno' file(s).")
         _pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
             extractiondir.joinpath(_file),
+            args.redisuri,
+            chain(
+                "pheno",
+                fullyqualifiedkey(args.jobid),
+                fullyqualifiedkey(args.redisprefix)),
             samples,
             phenonames,
             cdata["sep"],
@@ -398,6 +411,11 @@ def run_qc(# pylint: disable=[too-many-locals]
         logger.debug("Check for errors in 'phenose' file(s).")
         _phenose_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
             extractiondir.joinpath(_file),
+            args.redisuri,
+            chain(
+                "phenose",
+                fullyqualifiedkey(args.jobid),
+                fullyqualifiedkey(args.redisprefix)),
             samples,
             phenonames,
             cdata["sep"],
@@ -409,6 +427,11 @@ def run_qc(# pylint: disable=[too-many-locals]
         logger.debug("Check for errors in 'phenonum' file(s).")
         _phenonum_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
             extractiondir.joinpath(_file),
+            args.redisuri,
+            chain(
+                "phenonum",
+                fullyqualifiedkey(args.jobid),
+                fullyqualifiedkey(args.redisprefix)),
             samples,
             phenonames,
             cdata["sep"],