diff options
Diffstat (limited to 'scripts/qc_on_rqtl2_bundle.py')
-rw-r--r-- | scripts/qc_on_rqtl2_bundle.py | 87 |
1 files changed, 65 insertions, 22 deletions
diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py index deef8fe..fc95d13 100644 --- a/scripts/qc_on_rqtl2_bundle.py +++ b/scripts/qc_on_rqtl2_bundle.py @@ -10,22 +10,24 @@ import multiprocessing as mproc from logging import Logger, getLogger, StreamHandler from typing import Union, Sequence, Callable, Iterator +import MySQLdb as mdb from redis import Redis from quality_control.errors import InvalidValue from quality_control.checks import decimal_points_error -from qc_app import jobs -from qc_app.check_connections import check_db, check_redis +from uploader import jobs +from uploader.db_utils import database_connection +from uploader.check_connections import check_db, check_redis -from r_qtl import errors as rqe from r_qtl import r_qtl2 as rqtl2 from r_qtl import r_qtl2_qc as rqc +from r_qtl import exceptions as rqe from r_qtl import fileerrors as rqfe -from scripts.cli_parser import init_cli_parser from scripts.process_rqtl2_bundle import parse_job from scripts.redis_logger import setup_redis_logger +from scripts.cli_parser import init_cli_parser, add_global_data_arguments def dict2tuple(dct: dict) -> tuple: """Utility to convert items in dicts to pairs of tuples.""" @@ -103,13 +105,13 @@ def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals __update_processed__(value) rconn.hset(fqjobid, f"{filetype}-linecount", count) - except rqe.MissingFileError: + except rqe.MissingFileException: fname = cdata.get(filetype) yield rqfe.MissingFile(filetype, fname, ( f"The file '{fname}' does not exist in the bundle despite it being " f"listed under '{filetype}' in the control file.")) -def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool: +def qc_geno_errors(rconn, fqjobid, _dburi, _speciesid, zfile, logger) -> bool: """Check for errors in `geno` file(s).""" cdata = rqtl2.control_data(zfile) if "geno" in cdata: @@ -129,15 +131,32 @@ def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool: return False -def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[ - Union[InvalidValue, rqfe.MissingFile], ...]: +def fetch_db_geno_samples(conn: mdb.Connection, speciesid: int) -> tuple[str, ...]: + """Fetch samples/cases/individuals from the database.""" + samples = set()# type: ignore[var-annotated] + with conn.cursor() as cursor: + cursor.execute("SELECT Name, Name2 from Strain WHERE SpeciesId=%s", + (speciesid,)) + rows = cursor.fetchall() or tuple() + for row in rows: + samples.update(tuple(row)) + + return tuple(item.strip() for item in samples if bool(item)) + + +def check_pheno_samples( + conn: mdb.Connection, + speciesid: int, + zipfilepath: Union[str, Path], + logger: Logger +) -> tuple[Union[InvalidValue, rqfe.MissingFile], ...]: """Check that samples in 'pheno' file exist in geno file.""" cdata = rqtl2.read_control_file(zipfilepath) genosamples = tuple( sample for perfilesamples in ( rqtl2.load_samples(zipfilepath, member, cdata["geno_transposed"]) for member in cdata["geno"]) - for sample in perfilesamples) + for sample in perfilesamples) + fetch_db_geno_samples(conn, speciesid) def __check_file__(member) -> tuple[InvalidValue, ...]: logger.info("Checking samples/cases in member file '%s' …", member) @@ -149,7 +168,9 @@ def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[ errors = errors + (InvalidValue( member, "-", "-", sample, f"The individual/case/sample '{sample}' in file " - f"{member} does not exist in any of the 'geno' files."),) + f"{member} does not exist in either, any of the 'geno' " + "files provided in the bundle or the GeneNetwork database." + ),) logger.info("Found %s missing samples in member file '%s'.", len(errors), @@ -170,15 +191,21 @@ def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[ return allerrors -def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool: +def qc_pheno_errors(# pylint: disable=[too-many-arguments] + rconn, fqjobid, dburi, speciesid, zfile, logger) -> bool: """Check for errors in `pheno` file(s).""" cdata = rqtl2.control_data(zfile) if "pheno" in cdata: logger.info("Checking for errors in the 'pheno' file…") - perrs = check_pheno_samples(zfile.filename, logger) + tuple( - retrieve_errors_with_progress( - rconn,fqjobid, zfile, "pheno", - (partial(decimal_points_error, filename="pheno", mini=3),))) + perrs = tuple()# type: ignore[var-annotated] + with database_connection(dburi) as dbconn: + perrs = check_pheno_samples( + dbconn, speciesid, zfile.filename, logger) + tuple( + retrieve_errors_with_progress( + rconn,fqjobid, zfile, "pheno", + (partial(decimal_points_error, + filename="pheno", + mini=3),))) add_to_errors(rconn, fqjobid, "errors-generic", tuple( err for err in perrs if isinstance(err, rqfe.MissingFile))) add_to_errors(rconn, fqjobid, "errors-pheno", tuple( @@ -190,7 +217,8 @@ def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool: return False -def qc_phenose_errors(rconn, fqjobid, zfile, logger) -> bool: +def qc_phenose_errors(# pylint: disable=[too-many-arguments] + rconn, fqjobid, _dburi, _speciesid, zfile, logger) -> bool: """Check for errors in `phenose` file(s).""" cdata = rqtl2.control_data(zfile) if "phenose" in cdata: @@ -209,7 +237,14 @@ def qc_phenose_errors(rconn, fqjobid, zfile, logger) -> bool: return False -def qc_phenocovar_errors(_rconn, _fqjobid, _zfile, _logger) -> bool: +def qc_phenocovar_errors( + _rconn, + _fqjobid, + _dburi, + _speciesid, + _zfile, + _logger +) -> bool: """Check for errors in `phenocovar` file(s).""" return False @@ -225,12 +260,20 @@ def run_qc(rconn: Redis, if qc_missing_files(rconn, fqjobid, zfile, logger): return 1 - def with_zipfile(rconn, fqjobid, filename, logger, func): + def with_zipfile(# pylint: disable=[too-many-arguments] + rconn, fqjobid, dbconn, speciesid, filename, logger, func + ): with ZipFile(filename, "r") as zfile: - return func(rconn, fqjobid, zfile, logger) + return func(rconn, fqjobid, dbconn, speciesid, zfile, logger) def buildargs(func): - return (rconn, fqjobid, jobmeta["rqtl2-bundle-file"], logger, func) + return (rconn, + fqjobid, + args.databaseuri, + args.speciesid, + jobmeta["rqtl2-bundle-file"], + logger, + func) processes = [ mproc.Process(target=with_zipfile, args=buildargs(qc_geno_errors,)), mproc.Process(target=with_zipfile, args=buildargs(qc_pheno_errors,)), @@ -263,8 +306,8 @@ def run_qc(rconn: Redis, if __name__ == "__main__": def main(): """Enter R/qtl2 bundle QC runner.""" - args = init_cli_parser( - "qc-on-rqtl2-bundle", "Run QC on R/qtl2 bundle.").parse_args() + args = add_global_data_arguments(init_cli_parser( + "qc-on-rqtl2-bundle", "Run QC on R/qtl2 bundle.")).parse_args() check_redis(args.redisuri) check_db(args.databaseuri) |