diff options
author | Frederick Muriuki Muriithi | 2024-10-22 13:22:17 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2024-10-22 13:22:17 -0500 |
commit | 01dadbc45d3bc4ae184d8b4a5f64c1cc6538b2e9 (patch) | |
tree | 1118c351d212e6310af33fea51155913b67ca44a | |
parent | 84bbf8736ed017b24affb7e931207dafc4ae4780 (diff) | |
download | gn-uploader-01dadbc45d3bc4ae184d8b4a5f64c1cc6538b2e9.tar.gz |
Refactor `qc_pheno_file` and reuse it for different file types.
The QC/QA steps taken by the `qc_pheno_file` function are very
similar across the "pheno", "phenose" and "phenonum" files. This
commit makes `qc_pheno_file` a higher-order function: the file-type
specific check(s) are passed in as a callable (function) to be used
for the QC/QA process.
-rw-r--r-- | scripts/rqtl2/phenotypes_qc.py | 63 |
1 files changed, 52 insertions, 11 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py index 668fca0..ccd2110 100644 --- a/scripts/rqtl2/phenotypes_qc.py +++ b/scripts/rqtl2/phenotypes_qc.py @@ -4,9 +4,9 @@ import shutil import tempfile from pathlib import Path from zipfile import ZipFile -from functools import reduce import multiprocessing as mproc -from typing import Optional, Sequence +from functools import reduce, partial +from typing import Callable, Optional, Sequence from logging import Logger, getLogger, StreamHandler import MySQLdb as mdb @@ -183,13 +183,32 @@ def decimal_points_error( return None +def integer_error( + filename: str, + rowtitle: str, + coltitle: str, + cellvalue: str, + message: str, + decimal_places: int = 1 +) -> Optional[InvalidValue]: + """Returns an error if the value does not meet the checks.""" + try: + value = int(cellvalue) + if value <= 0: + raise ValueError("Must be a non-zero, positive number.") + return None + except ValueError as _verr: + return InvalidValue(filename, rowtitle, coltitle, cellvalue, message) + + def qc_pheno_file( filepath: Path, samples: tuple[str, ...], phenonames: tuple[str, ...], separator: str, comment_char: str, - na_strings: Sequence[str] + na_strings: Sequence[str], + error_fn: Callable = decimal_points_error ): """Run QC/QA on a `pheno` file.""" _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char) @@ -220,13 +239,11 @@ def qc_pheno_file( for field, value in zip(_headings[1:], line[1:]): if value in na_strings: continue - _err = decimal_points_error( + _err = error_fn( filepath.name, line[0], field, - value, - ("Expected a non-negative number with at least one decimal " - "place.")) + value) _errs = _errs + ((_err,) if bool(_err) else tuple()) return _errs, _lc+1 @@ -303,18 +320,42 @@ def run_qc(# pylint: disable=[too-many-arguments] for _file in cdata.get("phenocovar", []))) for name in names)) + dec_err_fn = partial(decimal_points_error, message=( + "Expected a non-negative number with at least 
one decimal " + "place.")) pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple(( extractiondir.joinpath(_file), samples, phenonames, cdata["sep"], cdata["comment.char"], - cdata["na.strings"] + cdata["na.strings"], + dec_err_fn ) for _file in cdata.get("pheno", [])))) - # - Check the 3 checks above for phenose and phenonum values too - # qc_phenose_files(…) - # qc_phenonum_files(…) + # - Check the 3 checks above for phenose and phenonum values too + # qc_phenose_files(…) + # qc_phenonum_files(…) + phenose_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple(( + extractiondir.joinpath(_file), + samples, + phenonames, + cdata["sep"], + cdata["comment.char"], + cdata["na.strings"], + dec_err_fn + ) for _file in cdata.get("phenose", [])))) + + phenonum_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple(( + extractiondir.joinpath(_file), + samples, + phenonames, + cdata["sep"], + cdata["comment.char"], + cdata["na.strings"], + partial(integer_error, message=( + "Expected a non-negative, non-zero integer value.")) + ) for _file in cdata.get("phenonum", [])))) # - Delete all extracted files shutil.rmtree(extractiondir) |