aboutsummaryrefslogtreecommitdiff
path: root/scripts/qc_on_rqtl2_bundle.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/qc_on_rqtl2_bundle.py')
-rw-r--r--scripts/qc_on_rqtl2_bundle.py87
1 files changed, 65 insertions, 22 deletions
diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py
index deef8fe..fc95d13 100644
--- a/scripts/qc_on_rqtl2_bundle.py
+++ b/scripts/qc_on_rqtl2_bundle.py
@@ -10,22 +10,24 @@ import multiprocessing as mproc
from logging import Logger, getLogger, StreamHandler
from typing import Union, Sequence, Callable, Iterator
+import MySQLdb as mdb
from redis import Redis
from quality_control.errors import InvalidValue
from quality_control.checks import decimal_points_error
-from qc_app import jobs
-from qc_app.check_connections import check_db, check_redis
+from uploader import jobs
+from uploader.db_utils import database_connection
+from uploader.check_connections import check_db, check_redis
-from r_qtl import errors as rqe
from r_qtl import r_qtl2 as rqtl2
from r_qtl import r_qtl2_qc as rqc
+from r_qtl import exceptions as rqe
from r_qtl import fileerrors as rqfe
-from scripts.cli_parser import init_cli_parser
from scripts.process_rqtl2_bundle import parse_job
from scripts.redis_logger import setup_redis_logger
+from scripts.cli_parser import init_cli_parser, add_global_data_arguments
def dict2tuple(dct: dict) -> tuple:
"""Utility to convert items in dicts to pairs of tuples."""
@@ -103,13 +105,13 @@ def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals
__update_processed__(value)
rconn.hset(fqjobid, f"{filetype}-linecount", count)
- except rqe.MissingFileError:
+ except rqe.MissingFileException:
fname = cdata.get(filetype)
yield rqfe.MissingFile(filetype, fname, (
f"The file '{fname}' does not exist in the bundle despite it being "
f"listed under '{filetype}' in the control file."))
-def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool:
+def qc_geno_errors(rconn, fqjobid, _dburi, _speciesid, zfile, logger) -> bool:
"""Check for errors in `geno` file(s)."""
cdata = rqtl2.control_data(zfile)
if "geno" in cdata:
@@ -129,15 +131,32 @@ def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool:
return False
-def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[
- Union[InvalidValue, rqfe.MissingFile], ...]:
def fetch_db_geno_samples(conn: "mdb.Connection", speciesid: int) -> tuple[str, ...]:
    """Fetch samples/cases/individuals from the database.

    Runs a single query against the `Strain` table for the given species and
    collects the values of both the `Name` and `Name2` columns.

    Parameters:
        conn: An open database connection whose `cursor()` result can be used
            as a context manager.
        speciesid: Value bound to the `SpeciesId` placeholder in the query.

    Returns:
        A tuple of the distinct, whitespace-stripped, non-empty names found in
        either column. The order of the items is unspecified.
    """
    with conn.cursor() as cursor:
        cursor.execute("SELECT Name, Name2 from Strain WHERE SpeciesId=%s",
                       (speciesid,))
        # `fetchall()` may hand back a falsy value when there are no rows;
        # normalise that to an empty tuple so the iteration below is safe.
        rows = cursor.fetchall() or tuple()

    # Strip *before* de-duplicating and before the emptiness check: doing it
    # the other way round lets "X" and "X " through as two identical entries
    # and turns whitespace-only names into empty strings in the result.
    # The leading `if name` skips NULL/empty column values, which cannot be
    # stripped (NULL) or are pointless to strip (empty).
    stripped = (name.strip() for row in rows for name in row if name)
    return tuple({name for name in stripped if name})
+
+
+def check_pheno_samples(
+ conn: mdb.Connection,
+ speciesid: int,
+ zipfilepath: Union[str, Path],
+ logger: Logger
+) -> tuple[Union[InvalidValue, rqfe.MissingFile], ...]:
"""Check that samples in 'pheno' file exist in geno file."""
cdata = rqtl2.read_control_file(zipfilepath)
genosamples = tuple(
sample for perfilesamples in (
rqtl2.load_samples(zipfilepath, member, cdata["geno_transposed"])
for member in cdata["geno"])
- for sample in perfilesamples)
+ for sample in perfilesamples) + fetch_db_geno_samples(conn, speciesid)
def __check_file__(member) -> tuple[InvalidValue, ...]:
logger.info("Checking samples/cases in member file '%s' …", member)
@@ -149,7 +168,9 @@ def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[
errors = errors + (InvalidValue(
member, "-", "-", sample,
f"The individual/case/sample '{sample}' in file "
- f"{member} does not exist in any of the 'geno' files."),)
+ f"{member} does not exist in either, any of the 'geno' "
+ "files provided in the bundle or the GeneNetwork database."
+ ),)
logger.info("Found %s missing samples in member file '%s'.",
len(errors),
@@ -170,15 +191,21 @@ def check_pheno_samples(zipfilepath: Union[str, Path], logger: Logger) -> tuple[
return allerrors
-def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool:
+def qc_pheno_errors(# pylint: disable=[too-many-arguments]
+ rconn, fqjobid, dburi, speciesid, zfile, logger) -> bool:
"""Check for errors in `pheno` file(s)."""
cdata = rqtl2.control_data(zfile)
if "pheno" in cdata:
logger.info("Checking for errors in the 'pheno' file…")
- perrs = check_pheno_samples(zfile.filename, logger) + tuple(
- retrieve_errors_with_progress(
- rconn,fqjobid, zfile, "pheno",
- (partial(decimal_points_error, filename="pheno", mini=3),)))
+ perrs = tuple()# type: ignore[var-annotated]
+ with database_connection(dburi) as dbconn:
+ perrs = check_pheno_samples(
+ dbconn, speciesid, zfile.filename, logger) + tuple(
+ retrieve_errors_with_progress(
+ rconn,fqjobid, zfile, "pheno",
+ (partial(decimal_points_error,
+ filename="pheno",
+ mini=3),)))
add_to_errors(rconn, fqjobid, "errors-generic", tuple(
err for err in perrs if isinstance(err, rqfe.MissingFile)))
add_to_errors(rconn, fqjobid, "errors-pheno", tuple(
@@ -190,7 +217,8 @@ def qc_pheno_errors(rconn, fqjobid, zfile, logger) -> bool:
return False
-def qc_phenose_errors(rconn, fqjobid, zfile, logger) -> bool:
+def qc_phenose_errors(# pylint: disable=[too-many-arguments]
+ rconn, fqjobid, _dburi, _speciesid, zfile, logger) -> bool:
"""Check for errors in `phenose` file(s)."""
cdata = rqtl2.control_data(zfile)
if "phenose" in cdata:
@@ -209,7 +237,14 @@ def qc_phenose_errors(rconn, fqjobid, zfile, logger) -> bool:
return False
-def qc_phenocovar_errors(_rconn, _fqjobid, _zfile, _logger) -> bool:
+def qc_phenocovar_errors(
+ _rconn,
+ _fqjobid,
+ _dburi,
+ _speciesid,
+ _zfile,
+ _logger
+) -> bool:
"""Check for errors in `phenocovar` file(s)."""
return False
@@ -225,12 +260,20 @@ def run_qc(rconn: Redis,
if qc_missing_files(rconn, fqjobid, zfile, logger):
return 1
- def with_zipfile(rconn, fqjobid, filename, logger, func):
+ def with_zipfile(# pylint: disable=[too-many-arguments]
+ rconn, fqjobid, dbconn, speciesid, filename, logger, func
+ ):
with ZipFile(filename, "r") as zfile:
- return func(rconn, fqjobid, zfile, logger)
+ return func(rconn, fqjobid, dbconn, speciesid, zfile, logger)
def buildargs(func):
- return (rconn, fqjobid, jobmeta["rqtl2-bundle-file"], logger, func)
+ return (rconn,
+ fqjobid,
+ args.databaseuri,
+ args.speciesid,
+ jobmeta["rqtl2-bundle-file"],
+ logger,
+ func)
processes = [
mproc.Process(target=with_zipfile, args=buildargs(qc_geno_errors,)),
mproc.Process(target=with_zipfile, args=buildargs(qc_pheno_errors,)),
@@ -263,8 +306,8 @@ def run_qc(rconn: Redis,
if __name__ == "__main__":
def main():
"""Enter R/qtl2 bundle QC runner."""
- args = init_cli_parser(
- "qc-on-rqtl2-bundle", "Run QC on R/qtl2 bundle.").parse_args()
+ args = add_global_data_arguments(init_cli_parser(
+ "qc-on-rqtl2-bundle", "Run QC on R/qtl2 bundle.")).parse_args()
check_redis(args.redisuri)
check_db(args.databaseuri)