diff options
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/qc_on_rqtl2_bundle.py | 64 |
1 files changed, 60 insertions, 4 deletions
diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py index c3e8b66..b5b2059 100644 --- a/scripts/qc_on_rqtl2_bundle.py +++ b/scripts/qc_on_rqtl2_bundle.py @@ -2,18 +2,21 @@ import sys import json from zipfile import ZipFile +from functools import partial from argparse import Namespace -from typing import Union, Sequence from logging import Logger, getLogger, StreamHandler +from typing import Union, Sequence, Callable, Iterator from redis import Redis from quality_control.errors import InvalidValue +from quality_control.checks import decimal_points_error from qc_app import jobs from qc_app.db_utils import database_connection from qc_app.check_connections import check_db, check_redis +from r_qtl import errors as rqe from r_qtl import r_qtl2 as rqtl2 from r_qtl import r_qtl2_qc as rqc from r_qtl import fileerrors as rqfe @@ -57,12 +60,61 @@ def qc_missing_files(rconn: Redis, return True return False +def compute_filesize(zfile: ZipFile, filetype: str) -> int: + """Compute the total file size.""" + cdata = rqtl2.control_data(zfile) + if isinstance(cdata[filetype], str): + return zfile.getinfo(cdata[filetype]).file_size + + return sum(zfile.getinfo(afile).file_size for afile in cdata[filetype]) + +def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals] + fqjobid: str, + zfile: ZipFile, + filetype: str, + checkers: tuple[Callable]) -> Iterator[Union[ + InvalidValue, rqfe.MissingFile]]: + """Filter the errors while also counting the number of lines in the file.""" + assert filetype in rqtl2.FILE_TYPES, f"Invalid file type {filetype}." + count = 0 + checked = 0 + cdata = rqtl2.control_data(zfile) + rconn.hset(fqjobid, f"{filetype}-filesize", compute_filesize(zfile, filetype)) + def __update_processed__(value): + nonlocal checked + checked = checked + len(value) + rconn.hset(fqjobid, f"{filetype}-checked", checked) + + try:# pylint: disable=[too-many-nested-blocks] + for lineno, row in enumerate( + rqtl2.file_data(zfile, filetype, cdata), start=1): + count = count + 1 + for field, value in row.items(): + if field == "id": + __update_processed__(value) + continue + if value is not None: + for checker in checkers: + error = checker(lineno, field, value) + if bool(error): + yield error + __update_processed__(value) + + rconn.hset(fqjobid, f"{filetype}-linecount", count) + except rqe.MissingFileError: + fname = cdata.get(filetype) + yield rqfe.MissingFile(filetype, fname, ( + f"The file '{fname}' does not exist in the bundle despite it being " + f"listed under '{filetype}' in the control file.")) + def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool: """Check for errors in `geno` file(s).""" logger.info("Checking for errors in the 'geno' file…") cdata = rqtl2.control_data(zfile) if "geno" in cdata: - gerrs = tuple(rqc.geno_errors(zfile)) + gerrs = tuple(retrieve_errors_with_progress( + rconn, fqjobid, zfile, "geno", + (rqc.make_genocode_checker(cdata.get("genotypes", {})),))) add_to_errors(rconn, fqjobid, "errors-generic", tuple( err for err in gerrs if isinstance(err, rqfe.MissingFile))) add_to_errors(rconn, fqjobid, "errors-geno", tuple( @@ -79,7 +131,9 @@ def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool: logger.info("Checking for errors in the 'pheno' file…") cdata = rqtl2.control_data(zfile) if "pheno" in cdata: - perrs = tuple(rqc.pheno_errors(zfile)) + perrs = tuple(retrieve_errors_with_progress( + rconn,fqjobid, zfile, "pheno", + (partial(decimal_points_error, mini=3),))) add_to_errors(rconn, fqjobid, "errors-generic", tuple( err for err in perrs if isinstance(err, rqfe.MissingFile))) add_to_errors(rconn, fqjobid, "errors-pheno", tuple( @@ -96,7 +150,9 @@ def qc_phenose_errors(rconn, fqjobid, zfile, logger) -> bool: logger.info("Checking for errors in the 'phenose' file…") cdata = rqtl2.control_data(zfile) if "phenose" in cdata: - perrs = tuple(rqc.phenose_errors(zfile)) + perrs = tuple(retrieve_errors_with_progress( + rconn,fqjobid, zfile, "phenose", + (partial(decimal_points_error, mini=6),))) add_to_errors(rconn, fqjobid, "errors-generic", tuple( err for err in perrs if isinstance(err, rqfe.MissingFile))) add_to_errors(rconn, fqjobid, "errors-phenose", tuple( |