diff options
author | Frederick Muriuki Muriithi | 2024-10-22 12:27:05 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2024-10-22 12:27:05 -0500 |
commit | 84bbf8736ed017b24affb7e931207dafc4ae4780 (patch) | |
tree | 49208e08c0b6e502b995176a47faaeecebf7af0f | |
parent | 12f613cbfb1379c822e5a8831e2ca2becda3bce9 (diff) | |
download | gn-uploader-84bbf8736ed017b24affb7e931207dafc4ae4780.tar.gz |
Check for errors in `pheno` files.
-rw-r--r-- | scripts/rqtl2/phenotypes_qc.py | 106 |
1 files changed, 100 insertions, 6 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index db990b1..668fca0 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from zipfile import ZipFile
 from functools import reduce
 import multiprocessing as mproc
+from typing import Optional, Sequence
 from logging import Logger, getLogger, StreamHandler
 
 import MySQLdb as mdb
@@ -15,6 +16,8 @@ from r_qtl import r_qtl2_qc as rqc
 from r_qtl import exceptions as rqe
 from r_qtl.fileerrors import InvalidValue
 
+from quality_control.checks import decimal_places_pattern
+
 from uploader.files import sha256_digest_over_file
 from uploader.samples.models import samples_by_species_and_population
 
@@ -166,6 +169,84 @@ def merge_dicts(*dicts):
     return reduce(lambda merged, dct: {**merged, **dct}, dicts, {})
 
 
+def decimal_points_error(
+        filename: str,
+        rowtitle: str,
+        coltitle: str,
+        cellvalue: str,
+        message: str,
+        decimal_places: int = 1
+) -> Optional[InvalidValue]:
+    """Return `InvalidValue` if `cellvalue` fails the decimal-places check."""
+    if not bool(decimal_places_pattern(decimal_places).match(cellvalue)):
+        return InvalidValue(filename, rowtitle, coltitle, cellvalue, message)
+    return None
+
+
+def qc_pheno_file(
+        filepath: Path,
+        samples: tuple[str, ...],
+        phenonames: tuple[str, ...],
+        separator: str,
+        comment_char: str,
+        na_strings: Sequence[str]
+):
+    """Check one `pheno` file; return `{filename: {errors, linecount}}`."""
+    _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
+    _headings = tuple(heading.lower() for heading in next(_csvfile))
+    _errors = tuple()
+
+    _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames)
+    if len(_absent) > 0:
+        _errors = _errors + (InvalidValue(
+            filepath.name,
+            "header row",
+            "-",
+            ", ".join(_absent),
+            (f"The phenotype names ({', '.join(_absent)}) do not exist in any "
+             "of the provided phenocovar files.")),)
+
+    def collect_errors(errors_and_linecount, line):
+        _errs, _lc = errors_and_linecount
+        if line[0] not in samples:
+            _errs = _errs + (InvalidValue(
+                filepath.name,
+                line[0],
+                _headings[0],
+                line[0],
+                (f"The sample named '{line[0]}' does not exist in the database. "
+                 "You will need to upload that first.")),)
+
+        for field, value in zip(_headings[1:], line[1:]):
+            if value in na_strings:
+                continue
+            _err = decimal_points_error(
+                filepath.name,
+                line[0],
+                field,
+                value,
+                ("Expected a non-negative number with at least one decimal "
+                 "place."))
+            _errs = _errs + ((_err,) if bool(_err) else tuple())
+
+        return _errs, _lc+1
+
+    return {
+        filepath.name: dict(zip(
+            ("errors", "linecount"),
+            reduce(collect_errors, _csvfile, (_errors, 1))))
+    }
+
+
+def phenotype_names(filepath: Path,
+                    separator: str,
+                    comment_char: str) -> tuple[str, ...]:
+    """Return column one of a `phenocovar` file, skipping its first row."""
+    return reduce(lambda tpl, line: tpl + (line[0],),
+                  rqtl2.read_csv_file(filepath, separator, comment_char),
+                  tuple())[1:]
+
+
 def run_qc(# pylint: disable=[too-many-arguments]
         dbconn: mdb.Connection,
         phenobundle: Path,
@@ -209,15 +290,28 @@ def run_qc(# pylint: disable=[too-many-arguments]
     # - Check that `description` and `units` is present in phenocovar for
     # all phenotypes
     with mproc.Pool(mproc.cpu_count() - 1) as pool:
-        # This call is way too busy. Maybe just return the errors?
-        qc_results = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
+        phenocovar_qc_res = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
             (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
             for _file in cdata.get("phenocovar", []))))
 
-    # - Check all samples in pheno files exist in database
-    # - Check all phenotypes in pheno files exist in phenocovar files
-    # - Check all numeric values in pheno files
-    # qc_pheno_files(…)
+        # - Check all samples in pheno files exist in database
+        # - Check all phenotypes in pheno files exist in phenocovar files
+        # - Check all numeric values in pheno files
+        phenonames = tuple(set(
+            name for names in pool.starmap(phenotype_names, tuple(
+                (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
+                for _file in cdata.get("phenocovar", [])))
+            for name in names))
+
+        pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
+            extractiondir.joinpath(_file),
+            samples,
+            phenonames,
+            cdata["sep"],
+            cdata["comment.char"],
+            cdata["na.strings"]
+        ) for _file in cdata.get("pheno", []))))
+
     # - Check the 3 checks above for phenose and phenonum values too
     # qc_phenose_files(…)
     # qc_phenonum_files(…)