From 971d1383aa81947a1d43725150bcfa6eceec24f0 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Tue, 13 Feb 2024 04:09:34 +0300
Subject: Provide nice UI progress indicators.

---
 scripts/qc_on_rqtl2_bundle.py | 64 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 4 deletions(-)

(limited to 'scripts')

diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py
index c3e8b66..b5b2059 100644
--- a/scripts/qc_on_rqtl2_bundle.py
+++ b/scripts/qc_on_rqtl2_bundle.py
@@ -2,18 +2,21 @@
 import sys
 import json
 from zipfile import ZipFile
+from functools import partial
 from argparse import Namespace
-from typing import Union, Sequence
 from logging import Logger, getLogger, StreamHandler
+from typing import Union, Sequence, Callable, Iterator
 
 from redis import Redis
 
 from quality_control.errors import InvalidValue
+from quality_control.checks import decimal_points_error
 
 from qc_app import jobs
 from qc_app.db_utils import database_connection
 from qc_app.check_connections import check_db, check_redis
 
+from r_qtl import errors as rqe
 from r_qtl import r_qtl2 as rqtl2
 from r_qtl import r_qtl2_qc as rqc
 from r_qtl import fileerrors as rqfe
@@ -57,12 +60,61 @@ def qc_missing_files(rconn: Redis,
         return True
     return False
 
+def compute_filesize(zfile: ZipFile, filetype: str) -> int:
+    """Return the combined uncompressed size of the `filetype` file(s)."""
+    listed = rqtl2.control_data(zfile)[filetype]
+    # The control data lists either a single filename or several of them.
+    filenames = (listed,) if isinstance(listed, str) else listed
+
+    return sum(zfile.getinfo(fname).file_size for fname in filenames)
+
+def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals]
+                                  fqjobid: str,
+                                  zfile: ZipFile,
+                                  filetype: str,
+                                  checkers: tuple[Callable]) -> Iterator[Union[
+                                      InvalidValue, rqfe.MissingFile]]:
+    """Yield the errors found in the `filetype` file(s), tracking progress.
+
+    Progress is saved on the redis hash `fqjobid` as `<filetype>-filesize`,
+    `<filetype>-checked` and, after the last row, `<filetype>-linecount`."""
+    assert filetype in rqtl2.FILE_TYPES, f"Invalid file type {filetype}."
+    lineno = checked = 0
+    cdata = rqtl2.control_data(zfile)
+    rconn.hset(fqjobid, f"{filetype}-filesize", compute_filesize(zfile, filetype))
+    def __update_processed__(value):
+        nonlocal checked
+        checked = checked + len(value)
+        rconn.hset(fqjobid, f"{filetype}-checked", checked)
+    try:# pylint: disable=[too-many-nested-blocks]
+        for lineno, row in enumerate(
+                rqtl2.file_data(zfile, filetype, cdata), start=1):
+            for field, value in row.items():
+                if field == "id":
+                    __update_processed__(value)
+                    continue
+                if value is not None:
+                    for checker in checkers:
+                        error = checker(lineno, field, value)
+                        if bool(error):
+                            yield error
+                    # Count once per value, not once per checker.
+                    __update_processed__(value)
+        rconn.hset(fqjobid, f"{filetype}-linecount", lineno)
+    except rqe.MissingFileError:
+        fname = cdata.get(filetype)
+        yield rqfe.MissingFile(filetype, fname, (
+            f"The file '{fname}' does not exist in the bundle despite it being "
+            f"listed under '{filetype}' in the control file."))
+
 def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool:
     """Check for errors in `geno` file(s)."""
     logger.info("Checking for errors in the 'geno' file…")
     cdata = rqtl2.control_data(zfile)
     if "geno" in cdata:
-        gerrs = tuple(rqc.geno_errors(zfile))
+        gerrs = tuple(retrieve_errors_with_progress(
+            rconn, fqjobid, zfile, "geno",
+            (rqc.make_genocode_checker(cdata.get("genotypes", {})),)))
         add_to_errors(rconn, fqjobid, "errors-generic", tuple(
             err for err in gerrs if isinstance(err, rqfe.MissingFile)))
         add_to_errors(rconn, fqjobid, "errors-geno", tuple(
@@ -79,7 +131,9 @@ def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool:
     logger.info("Checking for errors in the 'pheno' file…")
     cdata = rqtl2.control_data(zfile)
     if "pheno" in cdata:
-        perrs = tuple(rqc.pheno_errors(zfile))
+        perrs = tuple(retrieve_errors_with_progress(
+            rconn,fqjobid, zfile, "pheno",
+            (partial(decimal_points_error, mini=3),)))
         add_to_errors(rconn, fqjobid, "errors-generic", tuple(
             err for err in perrs if isinstance(err, rqfe.MissingFile)))
         add_to_errors(rconn, fqjobid, "errors-pheno", tuple(
@@ -96,7 +150,9 @@ def qc_phenose_errors(rconn, fqjobid, zfile, logger) -> bool:
     logger.info("Checking for errors in the 'phenose' file…")
     cdata = rqtl2.control_data(zfile)
     if "phenose" in cdata:
-        perrs = tuple(rqc.phenose_errors(zfile))
+        perrs = tuple(retrieve_errors_with_progress(
+            rconn,fqjobid, zfile, "phenose",
+            (partial(decimal_points_error, mini=6),)))
         add_to_errors(rconn, fqjobid, "errors-generic", tuple(
             err for err in perrs if isinstance(err, rqfe.MissingFile)))
         add_to_errors(rconn, fqjobid, "errors-phenose", tuple(
-- 
cgit v1.2.3