From b8299e6c7cdb0819883bfd8e142fc71c43440746 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 14 Oct 2024 13:26:08 -0500 Subject: Initialise background script for running QC on phenotype bundles. --- scripts/rqtl2/phenotypes_qc.py | 134 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 scripts/rqtl2/phenotypes_qc.py (limited to 'scripts') diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py new file mode 100644 index 0000000..6a804df --- /dev/null +++ b/scripts/rqtl2/phenotypes_qc.py @@ -0,0 +1,134 @@ +"""Run quality control on phenotypes-specific files in the bundle.""" +import sys +from pathlib import Path +from argparse import ArgumentParser +from logging import Logger, getLogger, StreamHandler + +import MySQLdb as mdb + +from r_qtl import r_qtl2 as rqtl2 +from r_qtl import r_qtl2_qc as rqc +from r_qtl import exceptions as rqe + +from scripts.cli_parser import init_cli_parser +from scripts.rqtl2.entry import build_main +from scripts.rqtl2.cli_parser import add_bundle_argument + +def validate(phenobundle: Path, logger: Logger) -> dict: + """Check that the bundle is generally valid""" + try: + rqc.validate_bundle(phenobundle) + except rqe.RQTLError as rqtlerr: + # logger.error("Bundle file validation failed!", exc_info=True) + return { + "skip": True, + "logger": logger, + "phenobundle": phenobundle, + "errors": (" ".join(rqtlerr.args),) + } + return { + "errors": tuple(), + "skip": False, + "phenobundle": phenobundle, + "logger": logger + } + + +def check_for_mandatory_pheno_keys( + phenobundle: Path, + logger: Logger, + **kwargs +) -> dict: + """Check that the mandatory keys exist for phenotypes.""" + if kwargs.get("skip", False): + return { + **kwargs, + "logger": logger, + "phenobundle": phenobundle + } + + _mandatory_keys = ("pheno", "phenocovar") + _cdata = rqtl2.read_control_file(phenobundle) + _errors = kwargs.get("errors", tuple()) + tuple( + f"Expected '{key}' file(s) are not declared in the bundle." + for key in _mandatory_keys if key not in _cdata.keys()) + return { + **kwargs, + "logger": logger, + "phenobundle": phenobundle, + "errors": _errors, + "skip": len(_errors) > 0 + } + + +def check_for_averages_files( + phenobundle: Path, + logger: Logger, + **kwargs +) -> dict: + """Check that averages files appear together""" + if kwargs.get("skip", False): + return { + **kwargs, + "logger": logger, + "phenobundle": phenobundle + } + + _together = (("phenose", "phenonum"), ("phenonum", "phenose")) + _cdata = rqtl2.read_control_file(phenobundle) + _errors = kwargs.get("errors", tuple()) + tuple( + f"'{first}' is defined in the control file but there is no " + f"corresponding '{second}'" + for first, second in _together + if ((first in _cdata.keys()) and (second not in _cdata.keys()))) + return { + **kwargs, + "logger": logger, + "phenobundle": phenobundle, + "errors": _errors, + "skip": len(_errors) > 0 + } + + +def run_qc(dbconn: mdb.Connection, phenobundle: Path, logger: Logger) -> int: + """Run quality control checks on the bundle.""" + results = check_for_averages_files( + **check_for_mandatory_pheno_keys( + **validate(phenobundle, logger))) + errors = results.get("errors", tuple()) + if len(errors) > 0: + logger.error("We found the following errors:\n%s", + "\n".join(f" - {error}" for error in errors)) + return 1 + # TODO: Run QC on actual values + # Steps: + # - Extract file to specific directory + # - For every pheno, phenocovar, phenose, phenonum file, undo + # transposition where relevant + # - Check that `description` and `units` is present in phenocovar for + # all phenotypes + # - Check all phenotypes in pheno files exist in phenocovar files + # - Check all numeric values + raise NotImplementedError("WIP!") + + +if __name__ == "__main__": + + def cli_args(): + """Process command-line arguments for `install_phenos`""" + parser = add_bundle_argument(init_cli_parser( + program="PhenotypesQC", + description=( + "Perform Quality Control checks on a phenotypes bundle file"))) + return parser.parse_args() + + _logger = getLogger("phenotypes_qc") + _logger.addHandler(StreamHandler(stream=sys.stderr)) + + main = build_main( + cli_args(), + lambda dbconn, args: run_qc(dbconn, args.rqtl2bundle, _logger), + _logger, + "DEBUG") + sys.exit(main()) + -- cgit v1.2.3