diff options
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/insert_data.py | 4 | ||||
-rw-r--r-- | scripts/insert_samples.py | 9 | ||||
-rw-r--r-- | scripts/process_rqtl2_bundle.py | 8 | ||||
-rw-r--r-- | scripts/qc.py | 2 | ||||
-rw-r--r-- | scripts/qc_on_rqtl2_bundle.py | 24 | ||||
-rw-r--r-- | scripts/qcapp_wsgi.py | 4 | ||||
-rw-r--r-- | scripts/rqtl2/entry.py | 6 | ||||
-rw-r--r-- | scripts/rqtl2/install_genotypes.py | 119 | ||||
-rw-r--r-- | scripts/validate_file.py | 4 | ||||
-rw-r--r-- | scripts/worker.py | 4 |
10 files changed, 109 insertions, 75 deletions
diff --git a/scripts/insert_data.py b/scripts/insert_data.py index 1465348..4b2e5f3 100644 --- a/scripts/insert_data.py +++ b/scripts/insert_data.py @@ -14,8 +14,8 @@ from MySQLdb.cursors import DictCursor from functional_tools import take from quality_control.file_utils import open_file -from qc_app.db_utils import database_connection -from qc_app.check_connections import check_db, check_redis +from uploader.db_utils import database_connection +from uploader.check_connections import check_db, check_redis # Set up logging stderr_handler = logging.StreamHandler(stream=sys.stderr) diff --git a/scripts/insert_samples.py b/scripts/insert_samples.py index 8431462..e3577b6 100644 --- a/scripts/insert_samples.py +++ b/scripts/insert_samples.py @@ -7,10 +7,11 @@ import argparse import MySQLdb as mdb from redis import Redis -from qc_app.db_utils import database_connection -from qc_app.check_connections import check_db, check_redis -from qc_app.db import species_by_id, population_by_id -from qc_app.samples import ( +from uploader.db_utils import database_connection +from uploader.check_connections import check_db, check_redis +from uploader.species.models import species_by_id +from uploader.population.models import population_by_id +from uploader.samples.models import ( save_samples_data, read_samples_file, cross_reference_samples) diff --git a/scripts/process_rqtl2_bundle.py b/scripts/process_rqtl2_bundle.py index 4da3936..20cfd3b 100644 --- a/scripts/process_rqtl2_bundle.py +++ b/scripts/process_rqtl2_bundle.py @@ -13,13 +13,13 @@ from redis import Redis from functional_tools import take -import r_qtl.errors as rqe import r_qtl.r_qtl2 as rqtl2 import r_qtl.r_qtl2_qc as rqc +import r_qtl.exceptions as rqe -from qc_app import jobs -from qc_app.db_utils import database_connection -from qc_app.check_connections import check_db, check_redis +from uploader import jobs +from uploader.db_utils import database_connection +from uploader.check_connections import check_db, check_redis 
from scripts.cli_parser import init_cli_parser from scripts.redis_logger import setup_redis_logger diff --git a/scripts/qc.py b/scripts/qc.py index e8573a9..6de051f 100644 --- a/scripts/qc.py +++ b/scripts/qc.py @@ -11,7 +11,7 @@ from quality_control.utils import make_progress_calculator from quality_control.errors import InvalidValue, DuplicateHeading from quality_control.parsing import FileType, strain_names, collect_errors -from qc_app.db_utils import database_connection +from uploader.db_utils import database_connection from .cli_parser import init_cli_parser diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py index 40809b7..fc95d13 100644 --- a/scripts/qc_on_rqtl2_bundle.py +++ b/scripts/qc_on_rqtl2_bundle.py @@ -16,13 +16,13 @@ from redis import Redis from quality_control.errors import InvalidValue from quality_control.checks import decimal_points_error -from qc_app import jobs -from qc_app.db_utils import database_connection -from qc_app.check_connections import check_db, check_redis +from uploader import jobs +from uploader.db_utils import database_connection +from uploader.check_connections import check_db, check_redis -from r_qtl import errors as rqe from r_qtl import r_qtl2 as rqtl2 from r_qtl import r_qtl2_qc as rqc +from r_qtl import exceptions as rqe from r_qtl import fileerrors as rqfe from scripts.process_rqtl2_bundle import parse_job @@ -105,7 +105,7 @@ def retrieve_errors_with_progress(rconn: Redis,#pylint: disable=[too-many-locals __update_processed__(value) rconn.hset(fqjobid, f"{filetype}-linecount", count) - except rqe.MissingFileError: + except rqe.MissingFileException: fname = cdata.get(filetype) yield rqfe.MissingFile(filetype, fname, ( f"The file '{fname}' does not exist in the bundle despite it being " @@ -133,7 +133,7 @@ def qc_geno_errors(rconn, fqjobid, _dburi, _speciesid, zfile, logger) -> bool: def fetch_db_geno_samples(conn: mdb.Connection, speciesid: int) -> tuple[str, ...]: """Fetch samples/cases/individuals 
from the database.""" - samples = set() + samples = set()# type: ignore[var-annotated] with conn.cursor() as cursor: cursor.execute("SELECT Name, Name2 from Strain WHERE SpeciesId=%s", (speciesid,)) @@ -191,12 +191,13 @@ def check_pheno_samples( return allerrors -def qc_pheno_errors(rconn, fqjobid, dburi, speciesid, zfile, logger) -> bool: +def qc_pheno_errors(# pylint: disable=[too-many-arguments] + rconn, fqjobid, dburi, speciesid, zfile, logger) -> bool: """Check for errors in `pheno` file(s).""" cdata = rqtl2.control_data(zfile) if "pheno" in cdata: logger.info("Checking for errors in the 'pheno' file…") - perrs = tuple() + perrs = tuple()# type: ignore[var-annotated] with database_connection(dburi) as dbconn: perrs = check_pheno_samples( dbconn, speciesid, zfile.filename, logger) + tuple( @@ -216,7 +217,8 @@ def qc_pheno_errors(rconn, fqjobid, dburi, speciesid, zfile, logger) -> bool: return False -def qc_phenose_errors(rconn, fqjobid, dburi, speciesid, zfile, logger) -> bool: +def qc_phenose_errors(# pylint: disable=[too-many-arguments] + rconn, fqjobid, _dburi, _speciesid, zfile, logger) -> bool: """Check for errors in `phenose` file(s).""" cdata = rqtl2.control_data(zfile) if "phenose" in cdata: @@ -258,7 +260,9 @@ def run_qc(rconn: Redis, if qc_missing_files(rconn, fqjobid, zfile, logger): return 1 - def with_zipfile(rconn, fqjobid, dbconn, speciesid, filename, logger, func): + def with_zipfile(# pylint: disable=[too-many-arguments] + rconn, fqjobid, dbconn, speciesid, filename, logger, func + ): with ZipFile(filename, "r") as zfile: return func(rconn, fqjobid, dbconn, speciesid, zfile, logger) diff --git a/scripts/qcapp_wsgi.py b/scripts/qcapp_wsgi.py index 349c006..fe77031 100644 --- a/scripts/qcapp_wsgi.py +++ b/scripts/qcapp_wsgi.py @@ -5,8 +5,8 @@ from logging import getLogger, StreamHandler from flask import Flask -from qc_app import create_app -from qc_app.check_connections import check_db, check_redis +from uploader import create_app +from 
uploader.check_connections import check_db, check_redis def setup_logging(appl: Flask) -> Flask: """Setup appropriate logging paradigm depending on environment.""" diff --git a/scripts/rqtl2/entry.py b/scripts/rqtl2/entry.py index 93fc130..b7fb68e 100644 --- a/scripts/rqtl2/entry.py +++ b/scripts/rqtl2/entry.py @@ -6,9 +6,9 @@ from argparse import Namespace from redis import Redis from MySQLdb import Connection -from qc_app import jobs -from qc_app.db_utils import database_connection -from qc_app.check_connections import check_db, check_redis +from uploader import jobs +from uploader.db_utils import database_connection +from uploader.check_connections import check_db, check_redis from scripts.redis_logger import setup_redis_logger diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py index 68ae365..6b89142 100644 --- a/scripts/rqtl2/install_genotypes.py +++ b/scripts/rqtl2/install_genotypes.py @@ -19,10 +19,13 @@ from scripts.rqtl2.entry import build_main from scripts.rqtl2.cli_parser import add_common_arguments from scripts.cli_parser import init_cli_parser, add_global_data_arguments -def insert_markers(dbconn: mdb.Connection, - speciesid: int, - markers: tuple[str, ...], - pmapdata: Optional[Iterator[dict]]) -> int: +def insert_markers( + dbconn: mdb.Connection, + speciesid: int, + markers: tuple[str, ...], + pmapdata: Optional[Iterator[dict]], + _logger: Logger +) -> int: """Insert genotype and genotype values into the database.""" mdata = reduce(#type: ignore[var-annotated] lambda acc, row: ({#type: ignore[arg-type, return-value] @@ -45,12 +48,15 @@ def insert_markers(dbconn: mdb.Connection, "marker": marker, "chr": mdata.get(marker, {}).get("chr"), "pos": mdata.get(marker, {}).get("pos") - } for marker in markers}.items())) + } for marker in markers}.values())) return cursor.rowcount -def insert_individuals(dbconn: mdb.Connection, - speciesid: int, - individuals: tuple[str, ...]) -> int: +def insert_individuals( + dbconn: 
mdb.Connection, + speciesid: int, + individuals: tuple[str, ...], + _logger: Logger +) -> int: """Insert individuals/samples into the database.""" with dbconn.cursor() as cursor: cursor.executemany( @@ -61,10 +67,13 @@ def insert_individuals(dbconn: mdb.Connection, for individual in individuals)) return cursor.rowcount -def cross_reference_individuals(dbconn: mdb.Connection, - speciesid: int, - populationid: int, - individuals: tuple[str, ...]) -> int: +def cross_reference_individuals( + dbconn: mdb.Connection, + speciesid: int, + populationid: int, + individuals: tuple[str, ...], + _logger: Logger +) -> int: """Cross reference any inserted individuals.""" with dbconn.cursor(cursorclass=DictCursor) as cursor: paramstr = ", ".join(["%s"] * len(individuals)) @@ -80,11 +89,13 @@ def cross_reference_individuals(dbconn: mdb.Connection, tuple(ids)) return cursor.rowcount -def insert_genotype_data(dbconn: mdb.Connection, - speciesid: int, - genotypes: tuple[dict, ...], - individuals: tuple[str, ...]) -> tuple[ - int, tuple[dict, ...]]: +def insert_genotype_data( + dbconn: mdb.Connection, + speciesid: int, + genotypes: tuple[dict, ...], + individuals: tuple[str, ...], + _logger: Logger +) -> tuple[int, tuple[dict, ...]]: """Insert the genotype data values into the database.""" with dbconn.cursor(cursorclass=DictCursor) as cursor: paramstr = ", ".join(["%s"] * len(individuals)) @@ -120,11 +131,14 @@ def insert_genotype_data(dbconn: mdb.Connection, "markerid": row["markerid"] } for row in data) -def cross_reference_genotypes(dbconn: mdb.Connection, - speciesid: int, - datasetid: int, - dataids: tuple[dict, ...], - gmapdata: Optional[Iterator[dict]]) -> int: +def cross_reference_genotypes( + dbconn: mdb.Connection, + speciesid: int, + datasetid: int, + dataids: tuple[dict, ...], + gmapdata: Optional[Iterator[dict]], + _logger: Logger +) -> int: """Cross-reference the data to the relevant dataset.""" _rows, markers, mdata = reduce(#type: ignore[var-annotated] lambda acc, row: 
(#type: ignore[return-value,arg-type] @@ -140,30 +154,43 @@ def cross_reference_genotypes(dbconn: mdb.Connection, (tuple(), tuple(), {})) with dbconn.cursor(cursorclass=DictCursor) as cursor: - paramstr = ", ".join(["%s"] * len(markers)) - cursor.execute("SELECT Id, Name FROM Geno " - f"WHERE SpeciesId=%s AND Name IN ({paramstr})", - (speciesid,) + markers) - markersdict = {row["Id"]: row["Name"] for row in cursor.fetchall()} - cursor.executemany( + markersdict = {} + if len(markers) > 0: + paramstr = ", ".join(["%s"] * len(markers)) + insertparams = (speciesid,) + markers + selectquery = ("SELECT Id, Name FROM Geno " + f"WHERE SpeciesId=%s AND Name IN ({paramstr})") + _logger.debug( + "The select query was\n\t%s\n\nwith the parameters\n\t%s", + selectquery, + (speciesid,) + markers) + cursor.execute(selectquery, insertparams) + markersdict = {row["Id"]: row["Name"] for row in cursor.fetchall()} + + insertquery = ( "INSERT INTO GenoXRef(GenoFreezeId, GenoId, DataId, cM) " "VALUES(%(datasetid)s, %(markerid)s, %(dataid)s, %(pos)s) " - "ON DUPLICATE KEY UPDATE GenoFreezeId=GenoFreezeId", - tuple({ - **row, - "datasetid": datasetid, - "pos": mdata.get(markersdict.get( - row.get("markerid"), {}), {}).get("pos") - } for row in dataids)) + "ON DUPLICATE KEY UPDATE GenoFreezeId=GenoFreezeId") + insertparams = tuple({ + **row, + "datasetid": datasetid, + "pos": mdata.get(markersdict.get( + row.get("markerid"), "nosuchkey"), {}).get("pos") + } for row in dataids) + _logger.debug( + "The insert query was\n\t%s\n\nwith the parameters\n\t%s", + insertquery, insertparams) + cursor.executemany(insertquery, insertparams) return cursor.rowcount def install_genotypes(#pylint: disable=[too-many-arguments, too-many-locals] dbconn: mdb.Connection, - speciesid: int, - populationid: int, - datasetid: int, - rqtl2bundle: Path, - logger: Logger = getLogger()) -> int: + speciesid: int, + populationid: int, + datasetid: int, + rqtl2bundle: Path, + logger: Logger = getLogger(__name__) +) -> 
int: """Load any existing genotypes into the database.""" count = 0 with ZipFile(str(rqtl2bundle.absolute()), "r") as zfile: @@ -189,20 +216,22 @@ def install_genotypes(#pylint: disable=[too-many-arguments, too-many-locals] speciesid, tuple(key for key in batch[0].keys() if key != "id"), (rqtl2.file_data(zfile, "pmap", cdata) if "pmap" in cdata - else None)) + else None), + logger) individuals = tuple(row["id"] for row in batch) - insert_individuals(dbconn, speciesid, individuals) + insert_individuals(dbconn, speciesid, individuals, logger) cross_reference_individuals( - dbconn, speciesid, populationid, individuals) + dbconn, speciesid, populationid, individuals, logger) _num_rows, dataids = insert_genotype_data( - dbconn, speciesid, batch, individuals) + dbconn, speciesid, batch, individuals, logger) cross_reference_genotypes( dbconn, speciesid, datasetid, dataids, (rqtl2.file_data(zfile, "gmap", cdata) - if "gmap" in cdata else None)) + if "gmap" in cdata else None), + logger) count = count + len(batch) except rqtl2.InvalidFormat as exc: logger.error(str(exc)) diff --git a/scripts/validate_file.py b/scripts/validate_file.py index 0028795..a40d7e7 100644 --- a/scripts/validate_file.py +++ b/scripts/validate_file.py @@ -12,8 +12,8 @@ from redis.exceptions import ConnectionError # pylint: disable=[redefined-builti from quality_control.utils import make_progress_calculator from quality_control.parsing import FileType, strain_names, collect_errors -from qc_app import jobs -from qc_app.db_utils import database_connection +from uploader import jobs +from uploader.db_utils import database_connection from .cli_parser import init_cli_parser from .qc import add_file_validation_arguments diff --git a/scripts/worker.py b/scripts/worker.py index 0eb9ea5..91b0332 100644 --- a/scripts/worker.py +++ b/scripts/worker.py @@ -11,8 +11,8 @@ from tempfile import TemporaryDirectory from redis import Redis -from qc_app import jobs -from qc_app.check_connections import check_redis +from 
uploader import jobs +from uploader.check_connections import check_redis def parse_args(): "Parse the command-line arguments" |