From 6bf0801db67a64b574128eb1b561046a4ec1d042 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Tue, 3 Dec 2024 14:23:33 -0600
Subject: Save 'pheno' errors in redis as they are found.

---
 scripts/rqtl2/phenotypes_qc.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'scripts/rqtl2')

diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index bfbfab6..2c09835 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -264,12 +264,14 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments]
         error_fn: Callable = decimal_points_error
 ):
     """Run QC/QA on a `pheno` file."""
-    with redis_logger(
+    with (redis_logger(
             redisuri,
             f"{__MODULE__}.qc_pheno_file",
             filepath.name,
-            fqkey) as logger:
+            f"{fqkey}:logs") as logger,
+          Redis.from_url(redisuri, decode_responses=True) as rconn):
         logger.info("Running QC on file: %s", filepath.name)
+        save_error = partial(push_error, rconn, f"{fqkey}:errors:{filepath.name}")
         _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
         _headings: tuple[str, ...] = tuple(
             heading.lower() for heading in next(_csvfile))
@@ -277,24 +279,25 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments]
 
         _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames)
         if len(_absent) > 0:
-            _errors = _errors + (InvalidValue(
+            _errors = _errors + (save_error(InvalidValue(
                 filepath.name,
                 "header row",
                 "-",
                 ", ".join(_absent),
                 (f"The phenotype names ({', '.join(samples)}) do not exist in any "
-                 "of the provided phenocovar files.")),)
+                 "of the provided phenocovar files."))),)
 
         def collect_errors(errors_and_linecount, line):
             _errs, _lc = errors_and_linecount
+            logger.debug("Checking row %s", line[0])
             if line[0] not in samples:
-                _errs = _errs + (InvalidValue(
+                _errs = _errs + (save_error(InvalidValue(
                 filepath.name,
                 line[0],
                 _headings[0],
                 line[0],
                 (f"The sample named '{line[0]}' does not exist in the database. "
-                 "You will need to upload that first.")),)
+                 "You will need to upload that first."))),)
 
             for field, value in zip(_headings[1:], line[1:]):
                 if value in na_strings:
@@ -304,15 +307,16 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments]
                     line[0],
                     field,
                     value)
-                _errs = _errs + ((_err,) if bool(_err) else tuple())
+                _errs = _errs + ((save_error(_err),) if bool(_err) else tuple())
 
+            rconn.hset(f"{fqkey}:metadata", "linecount", _lc+1)
+            rconn.hset(f"{fqkey}:metadata", "total-errors", len(_errs))
             return _errs, _lc+1
 
-        return {
-            filepath.name: dict(zip(
-                ("errors", "linecount"),
-                reduce(collect_errors, _csvfile, (_errors, 1))))
-        }
+        logger.debug("[%s] Collecting errors", filepath.name)
+        _errors, _linecount = reduce(collect_errors, _csvfile, (_errors, 1))
+        logger.debug("[%s] Finished collecting errors. Returning results …", filepath.name)
+        return {filepath.name: {"errors": _errors, "linecount": _linecount}}
 
 
 def phenotype_names(filepath: Path,
-- 
cgit v1.2.3