From 12f613cbfb1379c822e5a8831e2ca2becda3bce9 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 21 Oct 2024 17:01:10 -0500
Subject: Check `phenocovar` files for errors.

---
 scripts/rqtl2/phenotypes_qc.py | 55 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

(limited to 'scripts')

diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index 76ef7bf..db990b1 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -13,6 +13,7 @@ import MySQLdb as mdb
 from r_qtl import r_qtl2 as rqtl2
 from r_qtl import r_qtl2_qc as rqc
 from r_qtl import exceptions as rqe
+from r_qtl.fileerrors import InvalidValue
 
 from uploader.files import sha256_digest_over_file
 from uploader.samples.models import samples_by_species_and_population
@@ -117,6 +118,54 @@ def undo_transpose(filetype: str, cdata: dict, extractiondir):
                 build_line_joiner(cdata))
 
 
+def qc_phenocovar_file(phenocovarfile: Path, separator: str, comment_char: str):
+    """Check that a `phenocovar` file is structured correctly.
+
+    Returns `{<filename>: {"errors": <tuple of InvalidValue>, "linecount":
+    <int>}}`, reporting every missing `description`/`units` heading, every
+    record with a wrong column count and every empty description.
+    """
+    _fname = phenocovarfile.name
+    _csvfile = rqtl2.read_csv_file(phenocovarfile, separator, comment_char)
+    _headings = tuple(heading.lower() for heading in next(_csvfile, tuple()))
+    # Accumulate, do not overwrite: every missing heading must be reported.
+    _errors = tuple(
+        InvalidValue(_fname, "header row", "-", "-",
+                     (f"File {_fname} is missing the {heading} heading "
+                      "in the header line."))
+        for heading in ("description", "units") if heading not in _headings)
+
+    def collect_errors(errors_and_linecount, line):
+        """Fold step: append this record's errors, advance the line count."""
+        _errs, _lc = errors_and_linecount
+        _rowid = line[0] if line else "-"
+        if len(line) != len(_headings):
+            _errs = _errs + (InvalidValue(
+                _fname, _rowid, "-", "-",
+                (f"Record {_lc} in file {_fname} has a different "
+                 "number of columns than the number of headings")),)
+        _line = dict(zip(_headings, line))
+        # A missing `description` heading is already reported once above; do
+        # not raise `KeyError` (or repeat that error for each record) here.
+        if "description" in _headings and not _line.get("description"):
+            _errs = _errs + (InvalidValue(
+                _fname, _rowid, "description", _line.get("description"),
+                "The description is not provided!"),)
+
+        return _errs, _lc+1
+
+    return {
+        _fname: dict(zip(
+            ("errors", "linecount"),
+            reduce(collect_errors, _csvfile, (_errors, 1))))
+    }
+
+
+def merge_dicts(*dicts):
+    """Combine `dicts` into one new dict; later dicts win on shared keys."""
+    return {key: value for dct in dicts for key, value in dct.items()}
+
+
 def run_qc(# pylint: disable=[too-many-arguments]
         dbconn: mdb.Connection,
         phenobundle: Path,
@@ -159,7 +208,11 @@ def run_qc(# pylint: disable=[too-many-arguments]
 
     #       - Check that `description` and `units` is present in phenocovar for
     #         all phenotypes
-    # qc_phenocovar_files(…)
+    with mproc.Pool(mproc.cpu_count() - 1) as pool:
+        # This call is way too busy. Maybe just return the errors?
+        qc_results = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
+            (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
+            for _file in cdata.get("phenocovar", []))))
 
     #       - Check all samples in pheno files exist in database
     #       - Check all phenotypes in pheno files exist in phenocovar files
-- 
cgit 1.4.1