"""Run quality control on phenotypes-specific files in the bundle.""" import sys from pathlib import Path from argparse import ArgumentParser from logging import Logger, getLogger, StreamHandler import MySQLdb as mdb from r_qtl import r_qtl2 as rqtl2 from r_qtl import r_qtl2_qc as rqc from r_qtl import exceptions as rqe from scripts.cli_parser import init_cli_parser from scripts.rqtl2.entry import build_main from scripts.rqtl2.cli_parser import add_bundle_argument def validate(phenobundle: Path, logger: Logger) -> dict: """Check that the bundle is generally valid""" try: rqc.validate_bundle(phenobundle) except rqe.RQTLError as rqtlerr: # logger.error("Bundle file validation failed!", exc_info=True) return { "skip": True, "logger": logger, "phenobundle": phenobundle, "errors": (" ".join(rqtlerr.args),) } return { "errors": tuple(), "skip": False, "phenobundle": phenobundle, "logger": logger } def check_for_mandatory_pheno_keys( phenobundle: Path, logger: Logger, **kwargs ) -> dict: """Check that the mandatory keys exist for phenotypes.""" if kwargs.get("skip", False): return { **kwargs, "logger": logger, "phenobundle": phenobundle } _mandatory_keys = ("pheno", "phenocovar") _cdata = rqtl2.read_control_file(phenobundle) _errors = kwargs.get("errors", tuple()) + tuple( f"Expected '{key}' file(s) are not declared in the bundle." for key in _mandatory_keys if key not in _cdata.keys()) return { **kwargs, "logger": logger, "phenobundle": phenobundle, "errors": _errors, "skip": len(_errors) > 0 } def check_for_averages_files( phenobundle: Path, logger: Logger, **kwargs ) -> dict: """Check that averages files appear together""" if kwargs.get("skip", False): return { **kwargs, "logger": logger, "phenobundle": phenobundle } _together = (("phenose", "phenonum"), ("phenonum", "phenose")) _cdata = rqtl2.read_control_file(phenobundle) _errors = kwargs.get("errors", tuple()) + tuple( f"'{first}' is defined in the control file but there is no " f"corresponding '{second}'" for first, second in _together if ((first in _cdata.keys()) and (second not in _cdata.keys()))) return { **kwargs, "logger": logger, "phenobundle": phenobundle, "errors": _errors, "skip": len(_errors) > 0 } def run_qc(dbconn: mdb.Connection, phenobundle: Path, logger: Logger) -> int: """Run quality control checks on the bundle.""" results = check_for_averages_files( **check_for_mandatory_pheno_keys( **validate(phenobundle, logger))) errors = results.get("errors", tuple()) if len(errors) > 0: logger.error("We found the following errors:\n%s", "\n".join(f" - {error}" for error in errors)) return 1 # TODO: Run QC on actual values # Steps: # - Extract file to specific directory # - For every pheno, phenocovar, phenose, phenonum file, undo # transposition where relevant # - Check that `description` and `units` is present in phenocovar for # all phenotypes # - Check all phenotypes in pheno files exist in phenocovar files # - Check all numeric values raise NotImplementedError("WIP!") if __name__ == "__main__": def cli_args(): """Process command-line arguments for `install_phenos`""" parser = add_bundle_argument(init_cli_parser( program="PhenotypesQC", description=( "Perform Quality Control checks on a phenotypes bundle file"))) return parser.parse_args() _logger = getLogger("phenotypes_qc") _logger.addHandler(StreamHandler(stream=sys.stderr)) main = build_main( cli_args(), lambda dbconn, args: run_qc(dbconn, args.rqtl2bundle, _logger), _logger, "DEBUG") sys.exit(main())