From f8e45d54be507d5080e9a6a3a3026907f60507a9 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 24 Oct 2024 16:44:17 -0500 Subject: Check pheno, phenose, and phenonum files with logging Add logging to the `qc_pheno_file` function so that we get the messages pushed to redis for every file that is being checked for errors. --- scripts/rqtl2/phenotypes_qc.py | 119 ++++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 48 deletions(-) diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py index 4c02578..d8bf3c2 100644 --- a/scripts/rqtl2/phenotypes_qc.py +++ b/scripts/rqtl2/phenotypes_qc.py @@ -150,7 +150,7 @@ def redis_logger( def qc_phenocovar_file( - filename: Path, + filepath: Path, redisuri, fqkey: str, separator: str, @@ -159,20 +159,20 @@ def qc_phenocovar_file( with redis_logger( redisuri, f"{__MODULE__}.qc_phenocovar_file", - filename.name, + filepath.name, fqkey) as logger: - logger.info("Running QC on file: %s", filename.name) - _csvfile = rqtl2.read_csv_file(filename, separator, comment_char) + logger.info("Running QC on file: %s", filepath.name) + _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char) _headings = tuple(heading.lower() for heading in next(_csvfile)) _errors: tuple[InvalidValue, ...] = tuple() for heading in ("description", "units"): if heading not in _headings: _errors = (InvalidValue( - filename.name, + filepath.name, "header row", "-", "-", - (f"File {filename.name} is missing the {heading} heading " + (f"File {filepath.name} is missing the {heading} heading " "in the header line.")),) def collect_errors(errors_and_linecount, line): @@ -180,16 +180,16 @@ def qc_phenocovar_file( logger.info("Testing record '%s'", line[0]) if len(line) != len(_headings): _errs = _errs + (InvalidValue( - filename.name, + filepath.name, line[0], "-", "-", - (f"Record {_lc} in file {filename.name} has a different " + (f"Record {_lc} in file {filepath.name} has a different " "number of columns than the number of headings")),) _line = dict(zip(_headings, line)) if not bool(_line["description"]): _errs = _errs + ( - InvalidValue(filename.name, + InvalidValue(filepath.name, _line[_headings[0]], "description", _line["description"], @@ -198,7 +198,7 @@ def qc_phenocovar_file( return _errs, _lc+1 return { - filename.name: dict(zip( + filepath.name: dict(zip( ("errors", "linecount"), reduce(collect_errors, _csvfile, (_errors, 1)))) } @@ -242,6 +242,8 @@ def integer_error( def qc_pheno_file(# pylint: disable=[too-many-arguments] filepath: Path, + redisuri: str, + fqkey: str, samples: tuple[str, ...], phenonames: tuple[str, ...], separator: str, @@ -250,49 +252,55 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments] error_fn: Callable = decimal_points_error ): """Run QC/QA on a `pheno` file.""" - _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char) - _headings: tuple[str, ...] = tuple( - heading.lower() for heading in next(_csvfile)) - _errors: tuple[InvalidValue, ...] = tuple() - - _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames) - if len(_absent) > 0: - _errors = _errors + (InvalidValue( - filepath.name, - "header row", - "-", - ", ".join(_absent), - (f"The phenotype names ({', '.join(samples)}) do not exist in any " - "of the provided phenocovar files.")),) - - def collect_errors(errors_and_linecount, line): - _errs, _lc = errors_and_linecount - if line[0] not in samples: - _errs = _errs + (InvalidValue( + with redis_logger( + redisuri, + f"{__MODULE__}.qc_pheno_file", filepath.name, - line[0], - _headings[0], - line[0], - (f"The sample named '{line[0]}' does not exist in the database. " - "You will need to upload that first.")),) - - for field, value in zip(_headings[1:], line[1:]): - if value in na_strings: - continue - _err = error_fn( + fqkey) as logger: + logger.info("Running QC on file: %s", filepath.name) + _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char) + _headings: tuple[str, ...] = tuple( + heading.lower() for heading in next(_csvfile)) + _errors: tuple[InvalidValue, ...] = tuple() + + _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames) + if len(_absent) > 0: + _errors = _errors + (InvalidValue( filepath.name, + "header row", + "-", + ", ".join(_absent), + (f"The phenotype names ({', '.join(samples)}) do not exist in any " + "of the provided phenocovar files.")),) + + def collect_errors(errors_and_linecount, line): + _errs, _lc = errors_and_linecount + if line[0] not in samples: + _errs = _errs + (InvalidValue( + filepath.name, + line[0], + _headings[0], line[0], - field, - value) - _errs = _errs + ((_err,) if bool(_err) else tuple()) + (f"The sample named '{line[0]}' does not exist in the database. " + "You will need to upload that first.")),) + + for field, value in zip(_headings[1:], line[1:]): + if value in na_strings: + continue + _err = error_fn( + filepath.name, + line[0], + field, + value) + _errs = _errs + ((_err,) if bool(_err) else tuple()) - return _errs, _lc+1 + return _errs, _lc+1 - return { - filepath.name: dict(zip( - ("errors", "linecount"), - reduce(collect_errors, _csvfile, (_errors, 1)))) - } + return { + filepath.name: dict(zip( + ("errors", "linecount"), + reduce(collect_errors, _csvfile, (_errors, 1)))) + } def phenotype_names(filepath: Path, @@ -384,6 +392,11 @@ def run_qc(# pylint: disable=[too-many-locals] logger.debug("Check for errors in 'pheno' file(s).") _pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple(( extractiondir.joinpath(_file), + args.redisuri, + chain( + "pheno", + fullyqualifiedkey(args.jobid), + fullyqualifiedkey(args.redisprefix)), samples, phenonames, cdata["sep"], @@ -398,6 +411,11 @@ def run_qc(# pylint: disable=[too-many-locals] logger.debug("Check for errors in 'phenose' file(s).") _phenose_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple(( extractiondir.joinpath(_file), + args.redisuri, + chain( + "phenose", + fullyqualifiedkey(args.jobid), + fullyqualifiedkey(args.redisprefix)), samples, phenonames, cdata["sep"], @@ -409,6 +427,11 @@ def run_qc(# pylint: disable=[too-many-locals] logger.debug("Check for errors in 'phenonum' file(s).") _phenonum_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple(( extractiondir.joinpath(_file), + args.redisuri, + chain( + "phenonum", + fullyqualifiedkey(args.jobid), + fullyqualifiedkey(args.redisprefix)), samples, phenonames, cdata["sep"], -- cgit v1.2.3