about | summary | refs | log | tree | commit | diff
path: root/scripts/qc_on_rqtl2_bundle.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/qc_on_rqtl2_bundle.py')
-rw-r--r-- scripts/qc_on_rqtl2_bundle.py 64
1 files changed, 60 insertions, 4 deletions
diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py
index c3e8b66..b5b2059 100644
--- a/scripts/qc_on_rqtl2_bundle.py
+++ b/scripts/qc_on_rqtl2_bundle.py
@@ -2,18 +2,21 @@
import sys
import json
from zipfile import ZipFile
+from functools import partial
from argparse import Namespace
-from typing import Union, Sequence
from logging import Logger, getLogger, StreamHandler
+from typing import Union, Sequence, Callable, Iterator
from redis import Redis
from quality_control.errors import InvalidValue
+from quality_control.checks import decimal_points_error
from qc_app import jobs
from qc_app.db_utils import database_connection
from qc_app.check_connections import check_db, check_redis
+from r_qtl import errors as rqe
from r_qtl import r_qtl2 as rqtl2
from r_qtl import r_qtl2_qc as rqc
from r_qtl import fileerrors as rqfe
@@ -57,12 +60,61 @@ def qc_missing_files(rconn: Redis,
return True
return False
+def compute_filesize(zfile: ZipFile, filetype: str) -> int:
+    """Compute the total size of the file(s) listed under `filetype`.
+
+    The control data may list either a single file name (a string) or
+    several file names under `filetype`; the `file_size` values of all of
+    them are added up.
+
+    Names that are listed in the control data but are not members of the
+    bundle are skipped, rather than letting `ZipFile.getinfo` raise
+    `KeyError`, so that callers can go on to report the missing file as a
+    proper error.
+
+    Raises `KeyError` if `filetype` is not present in the control data.
+    """
+    cdata = rqtl2.control_data(zfile)
+    listed = cdata[filetype]
+    fnames = (listed,) if isinstance(listed, str) else tuple(listed)
+    # `ZipFile.getinfo` raises `KeyError` for names not in the archive, so
+    # only look up names the archive actually contains.
+    members = frozenset(zfile.namelist())
+    return sum(
+        zfile.getinfo(fname).file_size for fname in fnames if fname in members)
+
+
+def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals]
+                                  fqjobid: str,
+                                  zfile: ZipFile,
+                                  filetype: str,
+                                  checkers: tuple[Callable, ...]) -> Iterator[Union[
+                                      InvalidValue, rqfe.MissingFile]]:
+    """Yield the errors in the `filetype` file(s) while recording progress.
+
+    Each row's "id" field is only counted towards progress. Every other
+    field whose value is not `None` is passed to each of the `checkers` as
+    `checker(lineno, field, value)` (rows are numbered from 1), and every
+    truthy result is yielded as an error.
+
+    Progress is written to the redis hash `fqjobid` under these keys:
+
+    - `<filetype>-filesize`: result of `compute_filesize`, set up-front,
+    - `<filetype>-checked`: running total of the lengths of values seen,
+    - `<filetype>-linecount`: number of rows, set after the last row.
+
+    If reading the data raises `rqe.MissingFileError`, a `rqfe.MissingFile`
+    error is yielded.
+    """
+    assert filetype in rqtl2.FILE_TYPES, f"Invalid file type {filetype}."
+    count = 0
+    checked = 0
+    cdata = rqtl2.control_data(zfile)
+    rconn.hset(fqjobid, f"{filetype}-filesize", compute_filesize(zfile, filetype))
+    def __update_processed__(value):
+        nonlocal checked
+        if value is None:
+            # A missing value contributes nothing; `len(None)` would raise
+            # `TypeError` and abort the whole check.
+            return
+        checked = checked + len(value)
+        rconn.hset(fqjobid, f"{filetype}-checked", checked)
+
+    try:# pylint: disable=[too-many-nested-blocks]
+        for lineno, row in enumerate(
+                rqtl2.file_data(zfile, filetype, cdata), start=1):
+            count = count + 1
+            for field, value in row.items():
+                if field == "id":
+                    # Identifiers are not checked, only counted.
+                    __update_processed__(value)
+                    continue
+                if value is not None:
+                    for checker in checkers:
+                        error = checker(lineno, field, value)
+                        if bool(error):
+                            yield error
+                __update_processed__(value)
+
+        rconn.hset(fqjobid, f"{filetype}-linecount", count)
+    except rqe.MissingFileError:
+        fname = cdata.get(filetype)
+        yield rqfe.MissingFile(filetype, fname, (
+            f"The file '{fname}' does not exist in the bundle despite it being "
+            f"listed under '{filetype}' in the control file."))
+
def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool:
"""Check for errors in `geno` file(s)."""
logger.info("Checking for errors in the 'geno' file…")
cdata = rqtl2.control_data(zfile)
if "geno" in cdata:
- gerrs = tuple(rqc.geno_errors(zfile))
+ gerrs = tuple(retrieve_errors_with_progress(
+ rconn, fqjobid, zfile, "geno",
+ (rqc.make_genocode_checker(cdata.get("genotypes", {})),)))
add_to_errors(rconn, fqjobid, "errors-generic", tuple(
err for err in gerrs if isinstance(err, rqfe.MissingFile)))
add_to_errors(rconn, fqjobid, "errors-geno", tuple(
@@ -79,7 +131,9 @@ def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool:
logger.info("Checking for errors in the 'pheno' file…")
cdata = rqtl2.control_data(zfile)
if "pheno" in cdata:
- perrs = tuple(rqc.pheno_errors(zfile))
+ perrs = tuple(retrieve_errors_with_progress(
+ rconn,fqjobid, zfile, "pheno",
+ (partial(decimal_points_error, mini=3),)))
add_to_errors(rconn, fqjobid, "errors-generic", tuple(
err for err in perrs if isinstance(err, rqfe.MissingFile)))
add_to_errors(rconn, fqjobid, "errors-pheno", tuple(
@@ -96,7 +150,9 @@ def qc_phenose_errors(rconn, fqjobid, zfile, logger) -> bool:
logger.info("Checking for errors in the 'phenose' file…")
cdata = rqtl2.control_data(zfile)
if "phenose" in cdata:
- perrs = tuple(rqc.phenose_errors(zfile))
+ perrs = tuple(retrieve_errors_with_progress(
+ rconn,fqjobid, zfile, "phenose",
+ (partial(decimal_points_error, mini=6),)))
add_to_errors(rconn, fqjobid, "errors-generic", tuple(
err for err in perrs if isinstance(err, rqfe.MissingFile)))
add_to_errors(rconn, fqjobid, "errors-phenose", tuple(