aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-02-21 09:20:37 +0300
committer: Frederick Muriuki Muriithi, 2024-02-21 09:20:37 +0300
commit: 075b554cdde11f32e73981222a2cede3bb249151 (patch)
tree: accd26a54b67c358a4a0983ac6efec59af0ee359 /scripts
parent: 6462099372626e11706219a695e8303250359510 (diff)
download: gn-uploader-075b554cdde11f32e73981222a2cede3bb249151.tar.gz
Check that samples/cases are consistent
Ensure that **ALL** samples/cases/individuals mentioned in any of the pheno files actually exist in at least one of the geno files.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/qc_on_rqtl2_bundle.py 43
1 files changed, 43 insertions, 0 deletions
diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py
index 37b8a9e..45be4ae 100644
--- a/scripts/qc_on_rqtl2_bundle.py
+++ b/scripts/qc_on_rqtl2_bundle.py
@@ -2,6 +2,7 @@
import sys
import json
from time import sleep
+from pathlib import Path
from zipfile import ZipFile
from functools import partial
from argparse import Namespace
@@ -128,6 +129,48 @@ def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool:
return False
+
def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[
        Union[InvalidValue, rqfe.MissingFile], ...]:
    """Check that every sample in the 'pheno' files exists in a 'geno' file.

    The control file of the bundle at `zipfilepath` is read, the samples of
    all the files listed under its "geno" key are collected, and then each
    file listed under its "pheno" key is scanned for samples that are absent
    from that collection.

    Parameters:
        zipfilepath: Path to the zipped bundle to check.
        logger: Logger that receives per-file progress messages.

    Returns:
        A tuple of errors, in the order the 'pheno' files are listed in the
        control file: one `InvalidValue` for each sample that is not found in
        any 'geno' file, and one `rqfe.MissingFile` for each 'pheno' file
        whose processing raised a `KeyError`.
    """
    cdata = rqtl2.read_control_file(zipfilepath)
    # Membership is tested once per 'pheno' sample, so use a hash-based
    # container rather than a tuple: lookups become O(1) instead of O(n).
    # NOTE(review): assumes sample identifiers are hashable (presumably
    # strings) -- confirm against `rqtl2.load_samples`.
    genosamples = frozenset(
        sample
        for member in cdata["geno"]
        for sample in rqtl2.load_samples(
            zipfilepath, member, cdata["geno_transposed"]))

    def __check_file__(member) -> tuple[InvalidValue, ...]:
        """Return an error for each sample in `member` missing from geno."""
        logger.info("Checking samples/cases in member file '%s' …", member)
        errors = tuple(
            InvalidValue(
                member, "-", "-", sample,
                f"The individual/case/sample '{sample}' in file "
                f"{member} does not exist in any of the 'geno' files.")
            for sample in rqtl2.load_samples(
                zipfilepath, member, cdata["pheno_transposed"])
            if sample not in genosamples)

        logger.info("Found %s missing samples in member file '%s'.",
                    len(errors),
                    member)
        return errors

    # Accumulate in a list and convert once at the end; repeated tuple
    # concatenation copies everything gathered so far on every iteration.
    allerrors: list[Union[InvalidValue, rqfe.MissingFile]] = []
    for afile in cdata["pheno"]:
        try:
            allerrors.extend(__check_file__(afile))
        except KeyError:
            # A KeyError while reading the member is reported as the file
            # being absent from the zip bundle (presumably raised by the
            # zip member lookup -- confirm against `rqtl2.load_samples`).
            allerrors.append(rqfe.MissingFile(
                "pheno",
                afile,
                (f"The file '{afile}' does not exist in the zipfile despite "
                 "being listed in the control file.")))

    return tuple(allerrors)
+
+
def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool:
"""Check for errors in `pheno` file(s)."""
cdata = rqtl2.control_data(zfile)