about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-10-24 16:44:17 -0500
committer: Frederick Muriuki Muriithi, 2024-10-24 16:44:17 -0500
commit: f8e45d54be507d5080e9a6a3a3026907f60507a9 (patch)
tree: d3085f6224cfa930dff3496eec0625e1be1e9278
parent: da177466dfbdc36c866c153c41babe52b15d8788 (diff)
download: gn-uploader-f8e45d54be507d5080e9a6a3a3026907f60507a9.tar.gz
Check pheno, phenose, and phenonum files with logging
Add logging to the `qc_pheno_file` function so that we get the messages pushed to redis for every file that is being checked for errors.
-rw-r--r-- scripts/rqtl2/phenotypes_qc.py 119
1 file changed, 71 insertions, 48 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index 4c02578..d8bf3c2 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -150,7 +150,7 @@ def redis_logger(
def qc_phenocovar_file(
- filename: Path,
+ filepath: Path,
redisuri,
fqkey: str,
separator: str,
@@ -159,20 +159,20 @@ def qc_phenocovar_file(
with redis_logger(
redisuri,
f"{__MODULE__}.qc_phenocovar_file",
- filename.name,
+ filepath.name,
fqkey) as logger:
- logger.info("Running QC on file: %s", filename.name)
- _csvfile = rqtl2.read_csv_file(filename, separator, comment_char)
+ logger.info("Running QC on file: %s", filepath.name)
+ _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
_headings = tuple(heading.lower() for heading in next(_csvfile))
_errors: tuple[InvalidValue, ...] = tuple()
for heading in ("description", "units"):
if heading not in _headings:
_errors = (InvalidValue(
- filename.name,
+ filepath.name,
"header row",
"-",
"-",
- (f"File {filename.name} is missing the {heading} heading "
+ (f"File {filepath.name} is missing the {heading} heading "
"in the header line.")),)
def collect_errors(errors_and_linecount, line):
@@ -180,16 +180,16 @@ def qc_phenocovar_file(
logger.info("Testing record '%s'", line[0])
if len(line) != len(_headings):
_errs = _errs + (InvalidValue(
- filename.name,
+ filepath.name,
line[0],
"-",
"-",
- (f"Record {_lc} in file {filename.name} has a different "
+ (f"Record {_lc} in file {filepath.name} has a different "
"number of columns than the number of headings")),)
_line = dict(zip(_headings, line))
if not bool(_line["description"]):
_errs = _errs + (
- InvalidValue(filename.name,
+ InvalidValue(filepath.name,
_line[_headings[0]],
"description",
_line["description"],
@@ -198,7 +198,7 @@ def qc_phenocovar_file(
return _errs, _lc+1
return {
- filename.name: dict(zip(
+ filepath.name: dict(zip(
("errors", "linecount"),
reduce(collect_errors, _csvfile, (_errors, 1))))
}
@@ -242,6 +242,8 @@ def integer_error(
def qc_pheno_file(# pylint: disable=[too-many-arguments]
filepath: Path,
+ redisuri: str,
+ fqkey: str,
samples: tuple[str, ...],
phenonames: tuple[str, ...],
separator: str,
@@ -250,49 +252,55 @@ def qc_pheno_file(# pylint: disable=[too-many-arguments]
error_fn: Callable = decimal_points_error
):
"""Run QC/QA on a `pheno` file."""
- _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
- _headings: tuple[str, ...] = tuple(
- heading.lower() for heading in next(_csvfile))
- _errors: tuple[InvalidValue, ...] = tuple()
-
- _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames)
- if len(_absent) > 0:
- _errors = _errors + (InvalidValue(
- filepath.name,
- "header row",
- "-",
- ", ".join(_absent),
- (f"The phenotype names ({', '.join(samples)}) do not exist in any "
- "of the provided phenocovar files.")),)
-
- def collect_errors(errors_and_linecount, line):
- _errs, _lc = errors_and_linecount
- if line[0] not in samples:
- _errs = _errs + (InvalidValue(
+ with redis_logger(
+ redisuri,
+ f"{__MODULE__}.qc_pheno_file",
filepath.name,
- line[0],
- _headings[0],
- line[0],
- (f"The sample named '{line[0]}' does not exist in the database. "
- "You will need to upload that first.")),)
-
- for field, value in zip(_headings[1:], line[1:]):
- if value in na_strings:
- continue
- _err = error_fn(
+ fqkey) as logger:
+ logger.info("Running QC on file: %s", filepath.name)
+ _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
+ _headings: tuple[str, ...] = tuple(
+ heading.lower() for heading in next(_csvfile))
+ _errors: tuple[InvalidValue, ...] = tuple()
+
+ _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames)
+ if len(_absent) > 0:
+ _errors = _errors + (InvalidValue(
filepath.name,
+ "header row",
+ "-",
+ ", ".join(_absent),
+ (f"The phenotype names ({', '.join(samples)}) do not exist in any "
+ "of the provided phenocovar files.")),)
+
+ def collect_errors(errors_and_linecount, line):
+ _errs, _lc = errors_and_linecount
+ if line[0] not in samples:
+ _errs = _errs + (InvalidValue(
+ filepath.name,
+ line[0],
+ _headings[0],
line[0],
- field,
- value)
- _errs = _errs + ((_err,) if bool(_err) else tuple())
+ (f"The sample named '{line[0]}' does not exist in the database. "
+ "You will need to upload that first.")),)
+
+ for field, value in zip(_headings[1:], line[1:]):
+ if value in na_strings:
+ continue
+ _err = error_fn(
+ filepath.name,
+ line[0],
+ field,
+ value)
+ _errs = _errs + ((_err,) if bool(_err) else tuple())
- return _errs, _lc+1
+ return _errs, _lc+1
- return {
- filepath.name: dict(zip(
- ("errors", "linecount"),
- reduce(collect_errors, _csvfile, (_errors, 1))))
- }
+ return {
+ filepath.name: dict(zip(
+ ("errors", "linecount"),
+ reduce(collect_errors, _csvfile, (_errors, 1))))
+ }
def phenotype_names(filepath: Path,
@@ -384,6 +392,11 @@ def run_qc(# pylint: disable=[too-many-locals]
logger.debug("Check for errors in 'pheno' file(s).")
_pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
extractiondir.joinpath(_file),
+ args.redisuri,
+ chain(
+ "pheno",
+ fullyqualifiedkey(args.jobid),
+ fullyqualifiedkey(args.redisprefix)),
samples,
phenonames,
cdata["sep"],
@@ -398,6 +411,11 @@ def run_qc(# pylint: disable=[too-many-locals]
logger.debug("Check for errors in 'phenose' file(s).")
_phenose_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
extractiondir.joinpath(_file),
+ args.redisuri,
+ chain(
+ "phenose",
+ fullyqualifiedkey(args.jobid),
+ fullyqualifiedkey(args.redisprefix)),
samples,
phenonames,
cdata["sep"],
@@ -409,6 +427,11 @@ def run_qc(# pylint: disable=[too-many-locals]
logger.debug("Check for errors in 'phenonum' file(s).")
_phenonum_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
extractiondir.joinpath(_file),
+ args.redisuri,
+ chain(
+ "phenonum",
+ fullyqualifiedkey(args.jobid),
+ fullyqualifiedkey(args.redisprefix)),
samples,
phenonames,
cdata["sep"],