"""Run quality control on phenotypes-specific files in the bundle.""" import sys import tempfile from pathlib import Path from zipfile import ZipFile from logging import Logger, getLogger, StreamHandler import MySQLdb as mdb from r_qtl import r_qtl2 as rqtl2 from r_qtl import r_qtl2_qc as rqc from r_qtl import exceptions as rqe from uploader.files import sha256_digest_over_file from scripts.rqtl2.entry import build_main from scripts.rqtl2.cli_parser import add_bundle_argument from scripts.cli_parser import init_cli_parser, add_global_data_arguments def validate(phenobundle: Path, logger: Logger) -> dict: """Check that the bundle is generally valid""" try: rqc.validate_bundle(phenobundle) except rqe.RQTLError as rqtlerr: # logger.error("Bundle file validation failed!", exc_info=True) return { "skip": True, "logger": logger, "phenobundle": phenobundle, "errors": (" ".join(rqtlerr.args),) } return { "errors": tuple(), "skip": False, "phenobundle": phenobundle, "logger": logger } def check_for_mandatory_pheno_keys( phenobundle: Path, logger: Logger, **kwargs ) -> dict: """Check that the mandatory keys exist for phenotypes.""" if kwargs.get("skip", False): return { **kwargs, "logger": logger, "phenobundle": phenobundle } _mandatory_keys = ("pheno", "phenocovar") _cdata = rqtl2.read_control_file(phenobundle) _errors = kwargs.get("errors", tuple()) + tuple( f"Expected '{key}' file(s) are not declared in the bundle." for key in _mandatory_keys if key not in _cdata.keys()) return { **kwargs, "logger": logger, "phenobundle": phenobundle, "errors": _errors, "skip": len(_errors) > 0 } def check_for_averages_files( phenobundle: Path, logger: Logger, **kwargs ) -> dict: """Check that averages files appear together""" if kwargs.get("skip", False): return { **kwargs, "logger": logger, "phenobundle": phenobundle } _together = (("phenose", "phenonum"), ("phenonum", "phenose")) _cdata = rqtl2.read_control_file(phenobundle) _errors = kwargs.get("errors", tuple()) + tuple( f"'{first}' is defined in the control file but there is no " f"corresponding '{second}'" for first, second in _together if ((first in _cdata.keys()) and (second not in _cdata.keys()))) return { **kwargs, "logger": logger, "phenobundle": phenobundle, "errors": _errors, "skip": len(_errors) > 0 } def extract_bundle(bundle: Path, workdir: Path) -> tuple[Path, tuple[Path, ...]]: """Extract the bundle.""" with ZipFile(bundle) as zfile: extractiondir = workdir.joinpath( f"{sha256_digest_over_file(bundle)}_phenotype_qc_{bundle.name}") return extractiondir, rqtl2.extract(zfile, extractiondir) """Run quality control checks on the bundle.""" results = check_for_averages_files( **check_for_mandatory_pheno_keys( **validate(phenobundle, logger))) errors = results.get("errors", tuple()) if len(errors) > 0: logger.error("We found the following errors:\n%s", "\n".join(f" - {error}" for error in errors)) return 1 # TODO: Run QC on actual values # Steps: # - Extract file to specific directory extractiondir, *_bundlefiles = extract_bundle(phenobundle, workingdir) # - For every pheno, phenocovar, phenose, phenonum file, undo # transposition where relevant # - Check that `description` and `units` is present in phenocovar for # all phenotypes # - Check all phenotypes in pheno files exist in phenocovar files # - Check all numeric values raise NotImplementedError("WIP!") if __name__ == "__main__": def cli_args(): """Process command-line arguments for `install_phenos`""" parser = add_bundle_argument(add_global_data_arguments(init_cli_parser( program="PhenotypesQC", description=( "Perform Quality Control checks on a phenotypes bundle file")))) parser.add_argument( "--working-dir", default=f"{tempfile.gettempdir()}/phenotypes_qc", help=("The directory where this script will put its intermediate " "files."), type=Path) return parser.parse_args() _logger = getLogger("phenotypes_qc") _logger.addHandler(StreamHandler(stream=sys.stderr)) main = build_main( cli_args(), lambda dbconn, args: run_qc(dbconn, args.rqtl2bundle, _logger), _logger, "DEBUG") sys.exit(main())