From 075b554cdde11f32e73981222a2cede3bb249151 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 21 Feb 2024 09:20:37 +0300 Subject: Check that samples/cases are consistent Ensure that **ALL** samples/cases/individuals mentioned in any of the pheno files actually exist in at least one of the geno files. --- r_qtl/r_qtl2.py | 34 ++++++++++++++-------------------- scripts/qc_on_rqtl2_bundle.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index 1e28bc0..87491d0 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -368,27 +368,21 @@ def read_geno_file_data( replace_genotype_codes, genocodes=cdata.get("genotypes", {}))) -def load_samples( - zipfilepath: Union[str, Path], filetype: str) -> tuple[str, ...]: - """Load the samples/cases/individuals from file(s) of type 'filetype'.""" - cdata = read_control_file(zipfilepath) +def load_samples(zipfilepath: Union[str, Path], + member: str, + transposed: bool) -> tuple[str, ...]: + """Load the samples/cases/individuals from file 'member'.""" + filedata = read_geno_file_data(zipfilepath, member) samples: set[str] = set() - for afile in cdata.get(filetype, []): - filedata = read_geno_file_data(zipfilepath, afile) - if cdata.get(f"{filetype}_transposed", False): + if transposed: + samples.update( + item for item in next(filedata)[1:] if item is not None) + else: + try: + next(filedata)# Ignore first row. samples.update( - item for item in next(filedata)[1:] if item is not None) - else: - try: - next(filedata)# Ignore first row. - samples.update( - line[0] for line in filedata if line[0] is not None) - except StopIteration:# Empty file. - pass + line[0] for line in filedata if line[0] is not None) + except StopIteration:# Empty file. + pass return tuple(samples) - - -load_geno_samples = partial(load_samples, filetype="geno") -load_founder_geno_samples = partial(load_samples, filetype="founder_geno") -load_pheno_samples = partial(load_samples, filetype="pheno") diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py index 37b8a9e..45be4ae 100644 --- a/scripts/qc_on_rqtl2_bundle.py +++ b/scripts/qc_on_rqtl2_bundle.py @@ -2,6 +2,7 @@ import sys import json from time import sleep +from pathlib import Path from zipfile import ZipFile from functools import partial from argparse import Namespace @@ -128,6 +129,48 @@ def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool: return False + +def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[ + Union[InvalidValue, rqfe.MissingFile], ...]: + """Check that samples in 'pheno' file exist in geno file.""" + cdata = rqtl2.read_control_file(zipfilepath) + genosamples = tuple( + sample for perfilesamples in ( + rqtl2.load_samples(zipfilepath, member, cdata["geno_transposed"]) + for member in cdata["geno"]) + for sample in perfilesamples) + + def __check_file__(member) -> tuple[InvalidValue, ...]: + logger.info("Checking samples/cases in member file '%s' …", member) + sampledata = rqtl2.load_samples( + zipfilepath, member, cdata["pheno_transposed"]) + errors: tuple[InvalidValue, ...] = tuple() + for sample in sampledata: + if sample not in genosamples: + errors = errors + (InvalidValue( + member, "-", "-", sample, + f"The individual/case/sample '{sample}' in file " + f"{member} does not exist in any of the 'geno' files."),) + + logger.info("Found %s missing samples in member file '%s'.", + len(errors), + member) + return errors + + allerrors: tuple[Union[InvalidValue, rqfe.MissingFile], ...] = tuple() + for afile in cdata["pheno"]: + try: + allerrors = allerrors + __check_file__(afile) + except KeyError: + allerrors = allerrors + (rqfe.MissingFile( + "pheno", + afile, + (f"The file '{afile}' does not exist in the zipfile despite " + "being listed in the control file.")),) + + return allerrors + + def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool: """Check for errors in `pheno` file(s).""" cdata = rqtl2.control_data(zfile) -- cgit v1.2.3