aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2024-10-22 12:27:05 -0500
committerFrederick Muriuki Muriithi2024-10-22 12:27:05 -0500
commit84bbf8736ed017b24affb7e931207dafc4ae4780 (patch)
tree49208e08c0b6e502b995176a47faaeecebf7af0f /scripts
parent12f613cbfb1379c822e5a8831e2ca2becda3bce9 (diff)
downloadgn-uploader-84bbf8736ed017b24affb7e931207dafc4ae4780.tar.gz
Check for errors in `pheno` files.
Diffstat (limited to 'scripts')
-rw-r--r--scripts/rqtl2/phenotypes_qc.py106
1 files changed, 100 insertions, 6 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index db990b1..668fca0 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -6,6 +6,7 @@ from pathlib import Path
from zipfile import ZipFile
from functools import reduce
import multiprocessing as mproc
+from typing import Optional, Sequence
from logging import Logger, getLogger, StreamHandler
import MySQLdb as mdb
@@ -15,6 +16,8 @@ from r_qtl import r_qtl2_qc as rqc
from r_qtl import exceptions as rqe
from r_qtl.fileerrors import InvalidValue
+from quality_control.checks import decimal_places_pattern
+
from uploader.files import sha256_digest_over_file
from uploader.samples.models import samples_by_species_and_population
@@ -166,6 +169,84 @@ def merge_dicts(*dicts):
return reduce(lambda merged, dct: {**merged, **dct}, dicts, {})
def decimal_points_error(
        filename: str,
        rowtitle: str,
        coltitle: str,
        cellvalue: str,
        message: str,
        decimal_places: int = 1
) -> Optional[InvalidValue]:
    """Return an error if `cellvalue` fails the decimal-places check.

    The value is matched against the pattern produced by
    `decimal_places_pattern(decimal_places)`.

    Parameters:
        filename: Name of the file the value was read from.
        rowtitle: Title (first cell) of the row holding the value.
        coltitle: Heading of the column holding the value.
        cellvalue: The raw cell text to check.
        message: Human-readable description attached to the error.
        decimal_places: Passed through to `decimal_places_pattern`; defaults
            to 1. NOTE(review): presumably the minimum number of decimal
            places required -- confirm against `quality_control.checks`.

    Returns:
        An `InvalidValue` built from the arguments when the pattern does not
        match `cellvalue`, otherwise `None`.
    """
    # Honour the caller-supplied `decimal_places` rather than a fixed value,
    # so that the parameter actually affects the check.
    if not decimal_places_pattern(decimal_places).match(cellvalue):
        return InvalidValue(filename, rowtitle, coltitle, cellvalue, message)
    return None
+
+
def qc_pheno_file(
        filepath: Path,
        samples: tuple[str, ...],
        phenonames: tuple[str, ...],
        separator: str,
        comment_char: str,
        na_strings: Sequence[str]
):
    """Run QC/QA on a `pheno` file.

    The first row read from the file is treated as the header: its first cell
    is the heading of the sample column and the remaining cells are phenotype
    names. Every following row is checked as described below.

    Checks performed:
        - every phenotype heading (lower-cased) is present in `phenonames`,
        - the first cell of every data row is present in `samples`,
        - every remaining cell that is not one of `na_strings` passes
          `decimal_points_error`.

    Parameters:
        filepath: Path to the `pheno` file to check.
        samples: Names of the known samples.
        phenonames: Phenotype names gathered from the `phenocovar` files.
        separator: Field separator handed to `rqtl2.read_csv_file`.
        comment_char: Comment character handed to `rqtl2.read_csv_file`.
        na_strings: Cell values to skip when checking numeric values.

    Returns:
        A dict of the form
        `{filepath.name: {"errors": <tuple of InvalidValue>, "linecount": <int>}}`
        where `linecount` includes the header row.
    """
    _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
    _headings = tuple(heading.lower() for heading in next(_csvfile))
    _errors = []

    # Build the lookup sets once; membership is tested for every heading/row.
    _known_phenos = frozenset(phenonames)
    _known_samples = frozenset(samples)

    # NOTE(review): headings are lower-cased here but `phenonames` is compared
    # as provided -- confirm the caller supplies lower-case phenotype names.
    _absent = tuple(
        pheno for pheno in _headings[1:] if pheno not in _known_phenos)
    if _absent:
        _absent_names = ", ".join(_absent)
        _errors.append(InvalidValue(
            filepath.name,
            "header row",
            "-",
            _absent_names,
            # Report the missing phenotype names, not the sample names.
            (f"The phenotype names ({_absent_names}) do not exist in any "
             "of the provided phenocovar files.")))

    _linecount = 1  # The header row has already been consumed above.
    for line in _csvfile:
        _linecount += 1
        if line[0] not in _known_samples:
            _errors.append(InvalidValue(
                filepath.name,
                line[0],
                _headings[0],
                line[0],
                (f"The sample named '{line[0]}' does not exist in the database. "
                 "You will need to upload that first.")))

        for field, value in zip(_headings[1:], line[1:]):
            if value in na_strings:
                # Missing-value markers are not subject to the numeric check.
                continue
            _err = decimal_points_error(
                filepath.name,
                line[0],
                field,
                value,
                ("Expected a non-negative number with at least one decimal "
                 "place."))
            if _err is not None:
                _errors.append(_err)

    return {
        filepath.name: {
            "errors": tuple(_errors),
            "linecount": _linecount
        }
    }
+
+
def phenotype_names(filepath: Path,
                    separator: str,
                    comment_char: str) -> tuple[str, ...]:
    """Read phenotype names from `phenocovar` file.

    Collects the first cell of every row yielded by `rqtl2.read_csv_file` and
    drops the entry that came from the first row (the header).
    """
    rows = rqtl2.read_csv_file(filepath, separator, comment_char)
    first_cells = tuple(row[0] for row in rows)
    # The leading entry comes from the first row read, not from a phenotype.
    return first_cells[1:]
+
+
def run_qc(# pylint: disable=[too-many-arguments]
dbconn: mdb.Connection,
phenobundle: Path,
@@ -209,15 +290,28 @@ def run_qc(# pylint: disable=[too-many-arguments]
# - Check that `description` and `units` is present in phenocovar for
# all phenotypes
with mproc.Pool(mproc.cpu_count() - 1) as pool:
- # This call is way too busy. Maybe just return the errors?
- qc_results = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
+ phenocovar_qc_res = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
(extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
for _file in cdata.get("phenocovar", []))))
- # - Check all samples in pheno files exist in database
- # - Check all phenotypes in pheno files exist in phenocovar files
- # - Check all numeric values in pheno files
- # qc_pheno_files(…)
+ # - Check all samples in pheno files exist in database
+ # - Check all phenotypes in pheno files exist in phenocovar files
+ # - Check all numeric values in pheno files
+ phenonames = tuple(set(
+ name for names in pool.starmap(phenotype_names, tuple(
+ (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
+ for _file in cdata.get("phenocovar", [])))
+ for name in names))
+
+ pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
+ extractiondir.joinpath(_file),
+ samples,
+ phenonames,
+ cdata["sep"],
+ cdata["comment.char"],
+ cdata["na.strings"]
+ ) for _file in cdata.get("pheno", []))))
+
# - Check the 3 checks above for phenose and phenonum values too
# qc_phenose_files(…)
# qc_phenonum_files(…)