about | summary | refs | log | tree | commit | diff
path: root/scripts/rqtl2/phenotypes_qc.py
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-12-03 14:23:33 -0600
committer: Frederick Muriuki Muriithi, 2024-12-03 15:11:26 -0600
commit: 6bf0801db67a64b574128eb1b561046a4ec1d042 (patch)
tree: a2b0ef44c08df41b63afaca4d90ea84f92613f1d /scripts/rqtl2/phenotypes_qc.py
parent: b3b85a6241bacecae3997ec063dde007143289f0 (diff)
download: gn-uploader-6bf0801db67a64b574128eb1b561046a4ec1d042.tar.gz
Save 'pheno' errors in redis as they are found.
Diffstat (limited to 'scripts/rqtl2/phenotypes_qc.py')
-rw-r--r-- scripts/rqtl2/phenotypes_qc.py 28
1 files changed, 16 insertions, 12 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index bfbfab6..2c09835 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -264,12 +264,14 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments]
error_fn: Callable = decimal_points_error
):
"""Run QC/QA on a `pheno` file."""
- with redis_logger(
+ with (redis_logger(
redisuri,
f"{__MODULE__}.qc_pheno_file",
filepath.name,
- fqkey) as logger:
+ f"{fqkey}:logs") as logger,
+ Redis.from_url(redisuri, decode_responses=True) as rconn):
logger.info("Running QC on file: %s", filepath.name)
+ save_error = partial(push_error, rconn, f"{fqkey}:errors:{filepath.name}")
_csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
_headings: tuple[str, ...] = tuple(
heading.lower() for heading in next(_csvfile))
@@ -277,24 +279,25 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments]
_absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames)
if len(_absent) > 0:
- _errors = _errors + (InvalidValue(
+ _errors = _errors + (save_error(InvalidValue(
filepath.name,
"header row",
"-",
", ".join(_absent),
(f"The phenotype names ({', '.join(samples)}) do not exist in any "
- "of the provided phenocovar files.")),)
+ "of the provided phenocovar files."))),)
def collect_errors(errors_and_linecount, line):
_errs, _lc = errors_and_linecount
+ logger.debug("Checking row %s", line[0])
if line[0] not in samples:
- _errs = _errs + (InvalidValue(
+ _errs = _errs + (save_error(InvalidValue(
filepath.name,
line[0],
_headings[0],
line[0],
(f"The sample named '{line[0]}' does not exist in the database. "
- "You will need to upload that first.")),)
+ "You will need to upload that first."))),)
for field, value in zip(_headings[1:], line[1:]):
if value in na_strings:
@@ -304,15 +307,16 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments]
line[0],
field,
value)
- _errs = _errs + ((_err,) if bool(_err) else tuple())
+ _errs = _errs + ((save_error(_err),) if bool(_err) else tuple())
+ rconn.hset(f"{fqkey}:metadata", "linecount", _lc+1)
+ rconn.hset(f"{fqkey}:metadata", "total-errors", len(_errs))
return _errs, _lc+1
- return {
- filepath.name: dict(zip(
- ("errors", "linecount"),
- reduce(collect_errors, _csvfile, (_errors, 1))))
- }
+ logger.debug(f"[{filepath.name}] Collecting errors")
+ _errors, _linecount = reduce(collect_errors, _csvfile, (_errors, 1))
+ logger.debug(f"[{filepath.name}] Finished collecting errors. Returning results …")
+ return {filepath.name: {"errors": _errors, "linecount": linecount}}
def phenotype_names(filepath: Path,