From 84bbf8736ed017b24affb7e931207dafc4ae4780 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Tue, 22 Oct 2024 12:27:05 -0500
Subject: Check for errors in `pheno` files.

---
 scripts/rqtl2/phenotypes_qc.py | 125 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 119 insertions(+), 6 deletions(-)

diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index db990b1..668fca0 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from zipfile import ZipFile
 from functools import reduce
 import multiprocessing as mproc
+from typing import Optional, Sequence
 from logging import Logger, getLogger, StreamHandler
 
 import MySQLdb as mdb
@@ -15,6 +16,8 @@ from r_qtl import r_qtl2_qc as rqc
 from r_qtl import exceptions as rqe
 from r_qtl.fileerrors import InvalidValue
 
+from quality_control.checks import decimal_places_pattern
+
 from uploader.files import sha256_digest_over_file
 from uploader.samples.models import samples_by_species_and_population
 
@@ -166,6 +169,103 @@ def merge_dicts(*dicts):
     return reduce(lambda merged, dct: {**merged, **dct}, dicts, {})
 
 
+def decimal_points_error(
+        filename: str,
+        rowtitle: str,
+        coltitle: str,
+        cellvalue: str,
+        message: str,
+        decimal_places: int = 1
+) -> Optional[InvalidValue]:
+    """Check `cellvalue` against the decimal-places pattern.
+
+    Returns an `InvalidValue` built from the given arguments if
+    `cellvalue` does not match `decimal_places_pattern(decimal_places)`,
+    otherwise returns `None`.
+    """
+    if not bool(decimal_places_pattern(decimal_places).match(cellvalue)):
+        return InvalidValue(filename, rowtitle, coltitle, cellvalue, message)
+    return None
+
+
+def qc_pheno_file(
+        filepath: Path,
+        samples: tuple[str, ...],
+        phenonames: tuple[str, ...],
+        separator: str,
+        comment_char: str,
+        na_strings: Sequence[str]
+):
+    """Run QC/QA on a `pheno` file.
+
+    Collects an error for heading names missing from `phenonames`, for row
+    names missing from `samples`, and for every cell value not listed in
+    `na_strings` that fails the decimal-places check.
+
+    Returns a dict keyed by the file's name; its value holds the collected
+    "errors" and the "linecount".
+    """
+    _csvfile = rqtl2.read_csv_file(filepath, separator, comment_char)
+    _headings = tuple(heading.lower() for heading in next(_csvfile))
+    _errors = tuple()
+
+    # NOTE(review): headings are lower-cased above, but `phenonames` is used
+    # as given -- confirm the phenocovar names are lower-case too.
+    _absent = tuple(pheno for pheno in _headings[1:] if pheno not in phenonames)
+    if len(_absent) > 0:
+        _errors = _errors + (InvalidValue(
+            filepath.name,
+            "header row",
+            "-",
+            ", ".join(_absent),
+            (f"The phenotype names ({', '.join(_absent)}) do not exist in any "
+             "of the provided phenocovar files.")),)
+
+    def collect_errors(errors_and_linecount, line):
+        _errs, _lc = errors_and_linecount
+        if line[0] not in samples:
+            _errs = _errs + (InvalidValue(
+                filepath.name,
+                line[0],
+                _headings[0],
+                line[0],
+                (f"The sample named '{line[0]}' does not exist in the database. "
+                 "You will need to upload that first.")),)
+
+        for field, value in zip(_headings[1:], line[1:]):
+            if value in na_strings:
+                continue
+            _err = decimal_points_error(
+                filepath.name,
+                line[0],
+                field,
+                value,
+                ("Expected a non-negative number with at least one decimal "
+                 "place."))
+            _errs = _errs + ((_err,) if bool(_err) else tuple())
+
+        return _errs, _lc+1
+
+    # The line count is seeded with 1; the heading row was consumed above.
+    return {
+        filepath.name: dict(zip(
+            ("errors", "linecount"),
+            reduce(collect_errors, _csvfile, (_errors, 1))))
+    }
+
+
+def phenotype_names(filepath: Path,
+                    separator: str,
+                    comment_char: str) -> tuple[str, ...]:
+    """Read phenotype names from `phenocovar` file.
+
+    Takes the first field of each row, skipping the first row read.
+    """
+    return reduce(lambda tpl, line: tpl + (line[0],),
+                  rqtl2.read_csv_file(filepath, separator, comment_char),
+                  tuple())[1:]
+
+
 def run_qc(# pylint: disable=[too-many-arguments]
         dbconn: mdb.Connection,
         phenobundle: Path,
@@ -209,15 +309,28 @@ def run_qc(# pylint: disable=[too-many-arguments]
     # - Check that `description` and `units` is present in phenocovar for
     #   all phenotypes
     with mproc.Pool(mproc.cpu_count() - 1) as pool:
-        # This call is way too busy. Maybe just return the errors?
-        qc_results = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
+        phenocovar_qc_res = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
             (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
             for _file in cdata.get("phenocovar", []))))
 
-    # - Check all samples in pheno files exist in database
-    # - Check all phenotypes in pheno files exist in phenocovar files
-    # - Check all numeric values in pheno files
-    # qc_pheno_files(…)
+        # - Check all samples in pheno files exist in database
+        # - Check all phenotypes in pheno files exist in phenocovar files
+        # - Check all numeric values in pheno files
+        phenonames = tuple(set(
+            name for names in pool.starmap(phenotype_names, tuple(
+                (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
+                for _file in cdata.get("phenocovar", [])))
+            for name in names))
+
+        pheno_qc_res = merge_dicts(*pool.starmap(qc_pheno_file, tuple((
+            extractiondir.joinpath(_file),
+            samples,
+            phenonames,
+            cdata["sep"],
+            cdata["comment.char"],
+            cdata["na.strings"]
+        ) for _file in cdata.get("pheno", []))))
+
     # - Check the 3 checks above for phenose and phenonum values too
     # qc_phenose_files(…)
     # qc_phenonum_files(…)
-- 
cgit v1.2.3