Check for errors in `pheno` files.

author: Frederick Muriuki Muriithi 2024-10-22 12:27:05 -0500
committer: Frederick Muriuki Muriithi 2024-10-22 12:27:05 -0500
commit: 84bbf8736ed017b24affb7e931207dafc4ae4780 (patch)
tree: 49208e08c0b6e502b995176a47faaeecebf7af0f
parent: 12f613cbfb1379c822e5a8831e2ca2becda3bce9 (diff)
download: gn-uploader-84bbf8736ed017b24affb7e931207dafc4ae4780.tar.gz
1 files changed, 100 insertions, 6 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index db990b1..668fca0 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from zipfile import ZipFile
 from functools import reduce
 import multiprocessing as mproc
+from typing import Optional, Sequence
 from logging import Logger, getLogger, StreamHandler
 
 import MySQLdb as mdb
@@ -15,6 +16,8 @@ from r_qtl import r_qtl2_qc as rqc
 from r_qtl import exceptions as rqe
 from r_qtl.fileerrors import InvalidValue
 
+from quality_control.checks import decimal_places_pattern
+
 from uploader.files import sha256_digest_over_file
 from uploader.samples.models import samples_by_species_and_population
 
@@ -166,6 +169,84 @@ def merge_dicts(*dicts):
     return reduce(lambda merged, dct: {**merged, **dct}, dicts, {})
 
 
+def decimal_points_error(
+        filename: str, rowtitle: str, coltitle: str, cellvalue: str,
+        message: str, decimal_places: int = 1
+) -> Optional[InvalidValue]:
+    """Return an `InvalidValue` error if `cellvalue` fails the decimal check.
+
+    `cellvalue` is matched against `decimal_places_pattern(decimal_places)`;
+    on a match `None` is returned, otherwise the error carries `message`.
+    """
+    if decimal_places_pattern(decimal_places).match(cellvalue) is None:
+        return InvalidValue(filename, rowtitle, coltitle, cellvalue, message)
+    return None
+
+
+def qc_pheno_file(
+        filepath: Path,
+        samples: tuple[str, ...],
+        phenonames: tuple[str, ...],
+        separator: str,
+        comment_char: str,
+        na_strings: Sequence[str]
+):
+    """Run QC/QA on a `pheno` file.
+
+    Checks that header phenotypes are in `phenonames` (ignoring case), that
+    each row's first field is in `samples`, and that every value not listed
+    in `na_strings` passes `decimal_points_error`. Returns a dict of the form
+    `{filepath.name: {"errors": ..., "linecount": ...}}` (header row counted).
+    """
+    _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
+    _headings = tuple(heading.lower() for heading in next(_csvfile))
+    # Headings are lower-cased above, so compare against lower-cased names.
+    _known = frozenset(name.lower() for name in phenonames)
+    _samples = frozenset(samples)
+    _errors = tuple()
+
+    _absent = tuple(pheno for pheno in _headings[1:] if pheno not in _known)
+    if _absent:
+        _errors = _errors + (InvalidValue(
+            filepath.name,
+            "header row",
+            "-",
+            ", ".join(_absent),
+            (f"The phenotype names ({', '.join(_absent)}) do not exist in any "
+             "of the provided phenocovar files.")),)
+
+    def collect_errors(errors_and_linecount, line):
+        _errs, _lc = errors_and_linecount
+        if line[0] not in _samples:
+            _errs = _errs + (InvalidValue(
+                filepath.name, line[0], _headings[0], line[0],
+                (f"The sample named '{line[0]}' does not exist in the "
+                 "database. You will need to upload that first.")),)
+
+        _cell_errs = (decimal_points_error(
+            filepath.name, line[0], field, value,
+            ("Expected a non-negative number with at least one decimal "
+             "place."))
+            for field, value in zip(_headings[1:], line[1:])
+            if value not in na_strings)
+        return _errs + tuple(e for e in _cell_errs if e is not None), _lc + 1
+
+    return {
+        filepath.name: dict(zip(
+            ("errors", "linecount"),
+            reduce(collect_errors, _csvfile, (_errors, 1))))
+    }
+
+
+def phenotype_names(
+        filepath: Path, separator: str, comment_char: str
+) -> tuple[str, ...]:
+    """Return the first field of every `phenocovar` row after the first."""
+    _rows = rqtl2.read_csv_file(filepath, separator, comment_char)
+    # The leading row's first field is dropped by the trailing slice.
+    return tuple(_row[0] for _row in _rows)[1:]
+
+
 def run_qc(# pylint: disable=[too-many-arguments]
         dbconn: mdb.Connection,
         phenobundle: Path,
@@ -209,15 +290,28 @@ def run_qc(# pylint: disable=[too-many-arguments]
     #       - Check that `description` and `units` is present in phenocovar for
     #         all phenotypes
     with mproc.Pool(mproc.cpu_count() - 1) as pool:
-        # This call is way too busy. Maybe just return the errors?
-        qc_results = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
+        phenocovar_qc_res = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
             (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
             for _file in cdata.get("phenocovar", []))))
 
-    #       - Check all samples in pheno files exist in database
-    #       - Check all phenotypes in pheno files exist in phenocovar files
-    #       - Check all numeric values in pheno files
-    # qc_pheno_files(…)
+        #       - Check all samples in pheno files exist in database
+        #       - Check all phenotypes in pheno files exist in phenocovar files
+        #       - Check all numeric values in pheno files
+        phenonames = tuple(set(
+            name for names in pool.starmap(phenotype_names, tuple(
+            (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
+            for _file in cdata.get("phenocovar", [])))
+            for name in names))
+
+        pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
+            extractiondir.joinpath(_file),
+            samples,
+            phenonames,
+            cdata["sep"],
+            cdata["comment.char"],
+            cdata["na.strings"]
+        ) for _file in cdata.get("pheno", []))))
+
     #       - Check the 3 checks above for phenose and phenonum values too
     # qc_phenose_files(…)
     # qc_phenonum_files(…)
author	Frederick Muriuki Muriithi	2024-10-22 12:27:05 -0500
committer	Frederick Muriuki Muriithi	2024-10-22 12:27:05 -0500
commit	84bbf8736ed017b24affb7e931207dafc4ae4780 (patch)
tree	49208e08c0b6e502b995176a47faaeecebf7af0f
parent	12f613cbfb1379c822e5a8831e2ca2becda3bce9 (diff)
download	gn-uploader-84bbf8736ed017b24affb7e931207dafc4ae4780.tar.gz