about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2024-02-21 09:20:37 +0300
committer Frederick Muriuki Muriithi 2024-02-21 09:20:37 +0300
commit 075b554cdde11f32e73981222a2cede3bb249151 (patch)
tree accd26a54b67c358a4a0983ac6efec59af0ee359
parent 6462099372626e11706219a695e8303250359510 (diff)
download gn-uploader-075b554cdde11f32e73981222a2cede3bb249151.tar.gz
Check that samples/cases are consistent
Ensure that **ALL** samples/cases/individuals mentioned in any of the pheno files actually exist in at least one of the geno files.
-rw-r--r-- r_qtl/r_qtl2.py 34
-rw-r--r-- scripts/qc_on_rqtl2_bundle.py 43
2 files changed, 57 insertions, 20 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 1e28bc0..87491d0 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -368,27 +368,21 @@ def read_geno_file_data(
replace_genotype_codes, genocodes=cdata.get("genotypes", {})))
-def load_samples(
- zipfilepath: Union[str, Path], filetype: str) -> tuple[str, ...]:
- """Load the samples/cases/individuals from file(s) of type 'filetype'."""
- cdata = read_control_file(zipfilepath)
+def load_samples(zipfilepath: Union[str, Path],
+ member: str,
+ transposed: bool) -> tuple[str, ...]:
+ """Load the samples/cases/individuals from file 'member'."""
+ filedata = read_geno_file_data(zipfilepath, member)
samples: set[str] = set()
- for afile in cdata.get(filetype, []):
- filedata = read_geno_file_data(zipfilepath, afile)
- if cdata.get(f"{filetype}_transposed", False):
+ if transposed:
+ samples.update(
+ item for item in next(filedata)[1:] if item is not None)
+ else:
+ try:
+ next(filedata)# Ignore first row.
samples.update(
- item for item in next(filedata)[1:] if item is not None)
- else:
- try:
- next(filedata)# Ignore first row.
- samples.update(
- line[0] for line in filedata if line[0] is not None)
- except StopIteration:# Empty file.
- pass
+ line[0] for line in filedata if line[0] is not None)
+ except StopIteration:# Empty file.
+ pass
return tuple(samples)
-
-
-load_geno_samples = partial(load_samples, filetype="geno")
-load_founder_geno_samples = partial(load_samples, filetype="founder_geno")
-load_pheno_samples = partial(load_samples, filetype="pheno")
diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py
index 37b8a9e..45be4ae 100644
--- a/scripts/qc_on_rqtl2_bundle.py
+++ b/scripts/qc_on_rqtl2_bundle.py
@@ -2,6 +2,7 @@
import sys
import json
from time import sleep
+from pathlib import Path
from zipfile import ZipFile
from functools import partial
from argparse import Namespace
@@ -128,6 +129,48 @@ def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool:
return False
+
+def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[
+ Union[InvalidValue, rqfe.MissingFile], ...]:
+ """Check that samples in 'pheno' file exist in geno file."""
+ cdata = rqtl2.read_control_file(zipfilepath)
+ genosamples = tuple(
+ sample for perfilesamples in (
+ rqtl2.load_samples(zipfilepath, member, cdata["geno_transposed"])
+ for member in cdata["geno"])
+ for sample in perfilesamples)
+
+ def __check_file__(member) -> tuple[InvalidValue, ...]:
+ logger.info("Checking samples/cases in member file '%s' …", member)
+ sampledata = rqtl2.load_samples(
+ zipfilepath, member, cdata["pheno_transposed"])
+ errors: tuple[InvalidValue, ...] = tuple()
+ for sample in sampledata:
+ if sample not in genosamples:
+ errors = errors + (InvalidValue(
+ member, "-", "-", sample,
+ f"The individual/case/sample '{sample}' in file "
+ f"{member} does not exist in any of the 'geno' files."),)
+
+ logger.info("Found %s missing samples in member file '%s'.",
+ len(errors),
+ member)
+ return errors
+
+ allerrors: tuple[Union[InvalidValue, rqfe.MissingFile], ...] = tuple()
+ for afile in cdata["pheno"]:
+ try:
+ allerrors = allerrors + __check_file__(afile)
+ except KeyError:
+ allerrors = allerrors + (rqfe.MissingFile(
+ "pheno",
+ afile,
+ (f"The file '{afile}' does not exist in the zipfile despite "
+ "being listed in the control file.")),)
+
+ return allerrors
+
+
def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool:
"""Check for errors in `pheno` file(s)."""
cdata = rqtl2.control_data(zfile)