about | summary | refs | log | tree | commit | diff
path: root/scripts
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-10-22 13:22:17 -0500
committer: Frederick Muriuki Muriithi, 2024-10-22 13:22:17 -0500
commit: 01dadbc45d3bc4ae184d8b4a5f64c1cc6538b2e9 (patch)
tree: 1118c351d212e6310af33fea51155913b67ca44a /scripts
parent: 84bbf8736ed017b24affb7e931207dafc4ae4780 (diff)
download: gn-uploader-01dadbc45d3bc4ae184d8b4a5f64c1cc6538b2e9.tar.gz
Refactor `qc_pheno_file` and reuse it for different file types.
The QC/QA steps performed by the `qc_pheno_file` function are very similar across the "pheno", "phenose" and "phenonum" files. This commit turns `qc_pheno_file` into a higher-order function: the file-type-specific check(s) are passed in as a callable (function), which is then used for the QC/QA process.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/rqtl2/phenotypes_qc.py 63
1 files changed, 52 insertions, 11 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index 668fca0..ccd2110 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -4,9 +4,9 @@ import shutil
import tempfile
from pathlib import Path
from zipfile import ZipFile
-from functools import reduce
import multiprocessing as mproc
-from typing import Optional, Sequence
+from functools import reduce, partial
+from typing import Callable, Optional, Sequence
from logging import Logger, getLogger, StreamHandler
import MySQLdb as mdb
@@ -183,13 +183,32 @@ def decimal_points_error(
return None
+def integer_error(
+ filename: str,
+ rowtitle: str,
+ coltitle: str,
+ cellvalue: str,
+ message: str,
+ decimal_places: int = 1
+) -> Optional[InvalidValue]:
+ """Returns an error if the value does not meet the checks."""
+ try:
+ value = int(cellvalue)
+ if value <= 0:
+ raise ValueError("Must be a non-zero, positive number.")
+ return None
+ except ValueError as _verr:
+ return InvalidValue(filename, rowtitle, coltitle, cellvalue, message)
+
+
def qc_pheno_file(
filepath: Path,
samples: tuple[str, ...],
phenonames: tuple[str, ...],
separator: str,
comment_char: str,
- na_strings: Sequence[str]
+ na_strings: Sequence[str],
+ error_fn: Callable = decimal_points_error
):
"""Run QC/QA on a `pheno` file."""
_csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
@@ -220,13 +239,11 @@ def qc_pheno_file(
for field, value in zip(_headings[1:], line[1:]):
if value in na_strings:
continue
- _err = decimal_points_error(
+ _err = error_fn(
filepath.name,
line[0],
field,
- value,
- ("Expected a non-negative number with at least one decimal "
- "place."))
+ value)
_errs = _errs + ((_err,) if bool(_err) else tuple())
return _errs, _lc+1
@@ -303,18 +320,42 @@ def run_qc(# pylint: disable=[too-many-arguments]
for _file in cdata.get("phenocovar", [])))
for name in names))
+ dec_err_fn = partial(decimal_points_error, message=(
+ "Expected a non-negative number with at least one decimal "
+ "place."))
pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
extractiondir.joinpath(_file),
samples,
phenonames,
cdata["sep"],
cdata["comment.char"],
- cdata["na.strings"]
+ cdata["na.strings"],
+ dec_err_fn
) for _file in cdata.get("pheno", []))))
- # - Check the 3 checks above for phenose and phenonum values too
- # qc_phenose_files(…)
- # qc_phenonum_files(…)
+ # - Check the 3 checks above for phenose and phenonum values too
+ # qc_phenose_files(…)
+ # qc_phenonum_files(…)
+ phenose_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
+ extractiondir.joinpath(_file),
+ samples,
+ phenonames,
+ cdata["sep"],
+ cdata["comment.char"],
+ cdata["na.strings"],
+ dec_err_fn
+ ) for _file in cdata.get("phenose", []))))
+
+ phenonum_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
+ extractiondir.joinpath(_file),
+ samples,
+ phenonames,
+ cdata["sep"],
+ cdata["comment.char"],
+ cdata["na.strings"],
+ partial(integer_error, message=(
+ "Expected a non-negative, non-zero integer value."))
+ ) for _file in cdata.get("phenonum", []))))
# - Delete all extracted files
shutil.rmtree(extractiondir)