about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2024-02-21 09:20:37 +0300
committer Frederick Muriuki Muriithi 2024-02-21 09:20:37 +0300
commit 075b554cdde11f32e73981222a2cede3bb249151 (patch)
tree accd26a54b67c358a4a0983ac6efec59af0ee359 /scripts
parent 6462099372626e11706219a695e8303250359510 (diff)
download gn-uploader-075b554cdde11f32e73981222a2cede3bb249151.tar.gz
Check that samples/cases are consistent
Ensure that **ALL** samples/cases/individuals mentioned in any of the
pheno files actually exist in at least one of the geno files.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/qc_on_rqtl2_bundle.py 43
1 files changed, 43 insertions, 0 deletions
diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py
index 37b8a9e..45be4ae 100644
--- a/scripts/qc_on_rqtl2_bundle.py
+++ b/scripts/qc_on_rqtl2_bundle.py
@@ -2,6 +2,7 @@
 import sys
 import json
 from time import sleep
+from pathlib import Path
 from zipfile import ZipFile
 from functools import partial
 from argparse import Namespace
@@ -128,6 +129,48 @@ def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool:
 
     return False
 
+
+def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[
+        Union[InvalidValue, rqfe.MissingFile], ...]:
+    """Check that samples in the 'pheno' files exist in the 'geno' files.
+
+    Every sample loaded from each member listed under the control file's
+    "pheno" key is looked up among the samples loaded from all the members
+    listed under its "geno" key.
+
+    Parameters:
+        zipfilepath: Path to the zip bundle whose control file is read.
+        logger: Logger used to report progress for each 'pheno' member file.
+
+    Returns:
+        A tuple holding one `InvalidValue` for every 'pheno' sample that is
+        not found among the 'geno' samples, and one `rqfe.MissingFile` for
+        every 'pheno' member whose check raised a `KeyError`.
+    """
+    cdata = rqtl2.read_control_file(zipfilepath)
+    # A set gives constant-time membership tests; the lookup below runs once
+    # for every 'pheno' sample.
+    # NOTE(review): assumes samples are hashable (presumably strings) —
+    # confirm against `rqtl2.load_samples`.
+    genosamples = frozenset(
+        sample
+        for member in cdata["geno"]
+        for sample in rqtl2.load_samples(
+            zipfilepath, member, cdata["geno_transposed"]))
+
+    def __check_file__(member) -> tuple[InvalidValue, ...]:
+        """Return one `InvalidValue` per sample in `member` not in 'geno'."""
+        logger.info("Checking samples/cases in member file '%s' …", member)
+        sampledata = rqtl2.load_samples(
+            zipfilepath, member, cdata["pheno_transposed"])
+        # Build the errors in one pass rather than re-copying a growing tuple
+        # for every missing sample.
+        errors: tuple[InvalidValue, ...] = tuple(
+            InvalidValue(
+                member, "-", "-", sample,
+                f"The individual/case/sample '{sample}' in file "
+                f"{member} does not exist in any of the 'geno' files.")
+            for sample in sampledata
+            if sample not in genosamples)
+
+        logger.info("Found %s missing samples in member file '%s'.",
+                    len(errors),
+                    member)
+        return errors
+
+    allerrors: tuple[Union[InvalidValue, rqfe.MissingFile], ...] = tuple()
+    for afile in cdata["pheno"]:
+        try:
+            allerrors = allerrors + __check_file__(afile)
+        except KeyError:
+            # A `KeyError` here is reported as the member being absent from
+            # the zipfile (presumably raised by the zip member lookup).
+            allerrors = allerrors + (rqfe.MissingFile(
+                "pheno",
+                afile,
+                (f"The file '{afile}' does not exist in the zipfile despite "
+                 "being listed in the control file.")),)
+
+    return allerrors
+
+
 def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool:
     """Check for errors in `pheno` file(s)."""
     cdata = rqtl2.control_data(zfile)