From c6437873142ad90c6ce29dcf0e700e0e6a1c658a Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 3 Dec 2024 12:23:16 -0600 Subject: Save phenocovar errors in redis as they are found. --- scripts/rqtl2/phenotypes_qc.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py index 12b1803..26c049b 100644 --- a/scripts/rqtl2/phenotypes_qc.py +++ b/scripts/rqtl2/phenotypes_qc.py @@ -1,6 +1,7 @@ """Run quality control on phenotypes-specific files in the bundle.""" import sys import uuid +import json import shutil import logging import tempfile @@ -152,6 +153,12 @@ def redis_logger( rconn.close() +def push_error(rconn: Redis, fqkey: str, error: InvalidValue) -> InvalidValue: + """Persist the error in redis.""" + rconn.rpush(fqkey, json.dumps(error._asdict())) + return error + + def qc_phenocovar_file( filepath: Path, redisuri, @@ -159,44 +166,46 @@ def qc_phenocovar_file( separator: str, comment_char: str): """Check that `phenocovar` files are structured correctly.""" - with redis_logger( + with (redis_logger( redisuri, f"{__MODULE__}.qc_phenocovar_file", filepath.name, - fqkey) as logger: + f"{fqkey}:logs") as logger, + Redis.from_url(redisuri, decode_responses=True) as rconn): logger.info("Running QC on file: %s", filepath.name) _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char) _headings = tuple(heading.lower() for heading in next(_csvfile)) _errors: tuple[InvalidValue, ...] = tuple() + save_error = partial(push_error, rconn, f"{fqkey}:errors:{filepath.name}") for heading in ("description", "units"): if heading not in _headings: - _errors = (InvalidValue( + _errors = (save_error(InvalidValue( filepath.name, "header row", "-", "-", (f"File {filepath.name} is missing the {heading} heading " - "in the header line.")),) + "in the header line."))),) def collect_errors(errors_and_linecount, line): _errs, _lc = errors_and_linecount logger.info("Testing record '%s'", line[0]) if len(line) != len(_headings): - _errs = _errs + (InvalidValue( + _errs = _errs + (save_error(InvalidValue( filepath.name, line[0], "-", "-", (f"Record {_lc} in file {filepath.name} has a different " - "number of columns than the number of headings")),) + "number of columns than the number of headings"))),) _line = dict(zip(_headings, line)) if not bool(_line["description"]): _errs = _errs + ( - InvalidValue(filepath.name, - _line[_headings[0]], - "description", - _line["description"], - "The description is not provided!"),) + save_error(InvalidValue(filepath.name, + _line[_headings[0]], + "description", + _line["description"], + "The description is not provided!")),) return _errs, _lc+1 @@ -368,15 +377,16 @@ def run_qc(# pylint: disable=[too-many-locals] # - Check that `description` and `units` is present in phenocovar for # all phenotypes + rconn.hset(fullyqualifiedjobid, + "fully-qualified-keys:phenocovar", + json.dumps(tuple(f"{fullyqualifiedjobid}:phenocovar:{_file}" + for _file in cdata.get("phenocovar", [])))) with mproc.Pool(mproc.cpu_count() - 1) as pool: logger.debug("Check for errors in 'phenocovar' file(s).") _phenocovar_qc_res = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple( (extractiondir.joinpath(_file), args.redisuri, - chain( - "phenocovar", - fullyqualifiedkey(args.jobid), - fullyqualifiedkey(args.redisprefix)), + f"{fullyqualifiedjobid}:phenocovar", cdata["sep"], cdata["comment.char"]) for _file in cdata.get("phenocovar", [])))) -- cgit v1.2.3