aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rw-r--r--scripts/rqtl2/phenotypes_qc.py55
1 files changed, 54 insertions, 1 deletions
diff --git a/scripts/rqtl2/phenotypes_qc.py b/scripts/rqtl2/phenotypes_qc.py
index 76ef7bf..db990b1 100644
--- a/scripts/rqtl2/phenotypes_qc.py
+++ b/scripts/rqtl2/phenotypes_qc.py
@@ -13,6 +13,7 @@ import MySQLdb as mdb
from r_qtl import r_qtl2 as rqtl2
from r_qtl import r_qtl2_qc as rqc
from r_qtl import exceptions as rqe
+from r_qtl.fileerrors import InvalidValue
from uploader.files import sha256_digest_over_file
from uploader.samples.models import samples_by_species_and_population
@@ -117,6 +118,54 @@ def undo_transpose(filetype: str, cdata: dict, extractiondir):
build_line_joiner(cdata))
def qc_phenocovar_file(phenocovarfile: Path, separator: str, comment_char: str):
    """Check that a `phenocovar` file is structured correctly.

    The first row produced by `rqtl2.read_csv_file` is treated as the header
    row and its headings are compared case-insensitively. The checks are:

    - the header contains both a "description" and a "units" heading (one
      error is reported for *each* heading that is missing),
    - every record has as many columns as there are headings,
    - every record has a non-empty "description" value (only checked when
      the "description" heading exists; its absence is already reported
      once at the header level).

    Parameters:
        phenocovarfile: Path to the file to check.
        separator: Field separator, passed through to `read_csv_file`.
        comment_char: Comment marker, passed through to `read_csv_file`.

    Returns:
        A dict with a single key, the file's name, mapping to a dict with
        - "errors": a tuple of `InvalidValue` objects, and
        - "linecount": the number of rows read, header row included
          (0 for a file with no rows at all).
    """
    # NOTE(review): assumes `read_csv_file` returns an iterator of indexable
    # rows (the header is consumed with `next`) -- confirm against r_qtl.
    _csvfile = rqtl2.read_csv_file(phenocovarfile, separator, comment_char)
    # A default avoids an uncaught StopIteration on a file with no rows.
    _header = next(_csvfile, None)
    _headings = tuple(heading.lower() for heading in (_header or ()))
    _has_description = "description" in _headings

    # Accumulate (rather than overwrite) so that every missing heading is
    # reported, not just the last one checked.
    _errors = [
        InvalidValue(
            phenocovarfile.name,
            "header row",
            "-",
            "-",
            (f"File {phenocovarfile.name} is missing the {heading} heading "
             "in the header line."))
        for heading in ("description", "units")
        if heading not in _headings]

    # The header row counts as line 1, so the first record is reported as
    # "Record 1" and the final count includes the header.
    _linecount = 0 if _header is None else 1
    for line in _csvfile:
        # Guard against completely empty records.
        _rowtitle = line[0] if line else "-"
        if len(line) != len(_headings):
            _errors.append(InvalidValue(
                phenocovarfile.name,
                _rowtitle,
                "-",
                "-",
                (f"Record {_linecount} in file {phenocovarfile.name} has a "
                 "different number of columns than the number of headings")))

        if _has_description:
            # `.get` because a short record may not reach the description
            # column even though the heading exists.
            _description = dict(zip(_headings, line)).get("description", "")
            if not _description:
                _errors.append(InvalidValue(
                    phenocovarfile.name,
                    _rowtitle,
                    "description",
                    _description,
                    "The description is not provided!"))

        _linecount += 1

    return {
        phenocovarfile.name: {
            "errors": tuple(_errors),
            "linecount": _linecount
        }
    }
+
+
def merge_dicts(*dicts):
    """Merge multiple dicts into a single, new dict.

    The dicts are merged left to right: when a key occurs in more than one
    dict, the value from the right-most dict wins, while the key keeps the
    position of its first occurrence. The input dicts are not modified.
    Calling with no arguments returns an empty dict.
    """
    merged = {}
    for dct in dicts:
        # Update one accumulator in place instead of rebuilding the whole
        # merged dict for every input.
        merged.update(dct)
    return merged
+
+
def run_qc(# pylint: disable=[too-many-arguments]
dbconn: mdb.Connection,
phenobundle: Path,
@@ -159,7 +208,11 @@ def run_qc(# pylint: disable=[too-many-arguments]
# - Check that `description` and `units` is present in phenocovar for
# all phenotypes
- # qc_phenocovar_files(…)
+ with mproc.Pool(mproc.cpu_count() - 1) as pool:
+ # This call is way too busy. Maybe just return the errors?
+ qc_results = merge_dicts(*pool.starmap(qc_phenocovar_file, tuple(
+ (extractiondir.joinpath(_file), cdata["sep"], cdata["comment.char"])
+ for _file in cdata.get("phenocovar", []))))
# - Check all samples in pheno files exist in database
# - Check all phenotypes in pheno files exist in phenocovar files