# Reconstructed from a whitespace-collapsed patch:
#   Commit:  12f613cbfb1379c822e5a8831e2ca2becda3bce9
#   From:    Frederick Muriuki Muriithi
#   Date:    Mon, 21 Oct 2024 17:01:10 -0500
#   Subject: Check `phenocovar` files for errors.
#   File:    scripts/rqtl2/phenotypes_qc.py (54 insertions, 1 deletion)
#
# The patch also adds this module-level import, which the code below needs:
#     from r_qtl.fileerrors import InvalidValue
# `rqtl2` is imported by the module (`from r_qtl import r_qtl2 as rqtl2`);
# `Path` and `reduce` are presumably imported there too — confirm.


def qc_phenocovar_file(phenocovarfile: Path, separator: str, comment_char: str):
    """Check that a `phenocovar` file is structured correctly.

    The first row read from the file is treated as the header row and is
    compared case-insensitively against the required `description` and
    `units` headings. Every following record is checked for:

    * having the same number of columns as the header row, and
    * having a non-empty value in the `description` column.

    Parameters:
        phenocovarfile: Path to the file to check.
        separator: Field separator, passed through to
            `rqtl2.read_csv_file`.
        comment_char: Comment marker, passed through to
            `rqtl2.read_csv_file`.

    Returns:
        A dict with a single key, the file's name, mapped to a dict with:
        `errors` -- a tuple of `InvalidValue` objects, one per problem found;
        `linecount` -- a counter that starts at 1 and is incremented once for
        every record after the header row.
    """
    _csvfile = rqtl2.read_csv_file(phenocovarfile, separator, comment_char)
    # An empty file has no header row: fall back to "no headings" so the
    # missing-heading errors below are reported instead of a StopIteration
    # escaping from the function.
    _headings = tuple(heading.lower() for heading in next(_csvfile, ()))
    _has_description = "description" in _headings
    # Build one error per missing heading. (Assigning a fresh one-element
    # tuple inside a loop would keep only the last missing heading.)
    _errors = tuple(
        InvalidValue(
            phenocovarfile.name,
            "header row",
            "-",
            "-",
            (f"File {phenocovarfile.name} is missing the {heading} heading "
             "in the header line."))
        for heading in ("description", "units")
        if heading not in _headings)

    def collect_errors(errors_and_linecount, line):
        """Fold step: append the errors found in `line`, bump the counter."""
        _errs, _lc = errors_and_linecount
        # The first column identifies the record in error reports; an empty
        # record has no first column, so use the same "-" placeholder that
        # the other "not applicable" fields use.
        _rowid = line[0] if line else "-"
        if len(line) != len(_headings):
            _errs = _errs + (InvalidValue(
                phenocovarfile.name,
                _rowid,
                "-",
                "-",
                (f"Record {_lc} in file {phenocovarfile.name} has a different "
                 "number of columns than the number of headings")),)
        _line = dict(zip(_headings, line))
        # Only check the value when the column exists in the header; a missing
        # heading has already been reported once above. `.get` covers records
        # that are too short to reach the `description` column.
        if _has_description and not _line.get("description"):
            _errs = _errs + (
                InvalidValue(phenocovarfile.name,
                             _rowid,
                             "description",
                             _line.get("description", ""),
                             "The description is not provided!"),)

        return _errs, _lc + 1

    return {
        phenocovarfile.name: dict(zip(
            ("errors", "linecount"),
            reduce(collect_errors, _csvfile, (_errors, 1))))
    }


def merge_dicts(*dicts):
    """Merge multiple dicts into a single new one.

    The dicts are merged left to right, so when a key appears in more than
    one of them the value from the right-most dict wins. Calling with no
    arguments returns an empty dict. The inputs are not modified.
    """
    return reduce(lambda merged, dct: {**merged, **dct}, dicts, {})


# ---------------------------------------------------------------------------
# Hunk applied inside run_qc() by the same patch.
#
# run_qc() is not fully visible in the patch, so its change is recorded here
# verbatim as a comment rather than as runnable code. It replaces the
# `# qc_phenocovar_files(…)` placeholder under the comment
# "Check that `description` and `units` is present in phenocovar for all
# phenotypes":
#
#     with mproc.Pool(mproc.cpu_count() - 1) as pool:
#         # This call is way too busy. Maybe just return the errors?
#         qc_results = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
#             (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
#             for _file in cdata.get("phenocovar", []))))
#
# NOTE(review): assuming `mproc` is the `multiprocessing` module,
# `mproc.cpu_count() - 1` is 0 on a single-CPU host and `Pool(0)` raises
# ValueError -- consider `max(1, mproc.cpu_count() - 1)`.
# ---------------------------------------------------------------------------