From 21078fecb698972062af3157a9a0f6e84bb8fd0d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 9 Jan 2024 11:24:21 +0300 Subject: WIP: Install genotypes from R/qtl2 bundle Load the genotype information from the R/qtl2 bundle and insert it into the database. --- scripts/rqtl2/__init__.py | 1 + scripts/rqtl2/install_genotypes.py | 152 +++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 scripts/rqtl2/__init__.py create mode 100644 scripts/rqtl2/install_genotypes.py diff --git a/scripts/rqtl2/__init__.py b/scripts/rqtl2/__init__.py new file mode 100644 index 0000000..62bc52a --- /dev/null +++ b/scripts/rqtl2/__init__.py @@ -0,0 +1 @@ +"""A package of scripts for use in parsing R/qtl2 bundles""" diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py new file mode 100644 index 0000000..f5b6eb4 --- /dev/null +++ b/scripts/rqtl2/install_genotypes.py @@ -0,0 +1,152 @@ +"""Load genotypes from R/qtl2 bundle into the database.""" +import sys +import uuid +import logging +import traceback +from pathlib import Path +from zipfile import ZipFile +from argparse import ArgumentParser + +import MySQLdb as mdb +from redis import Redis + +from r_qtl import r_qtl2 as rqtl2 + +from quality_control.parsing import take + +from qc_app.db_utils import database_connection +from qc_app.check_connections import check_db, check_redis + +from scripts.redis_logger import RedisLogger + +stderr_handler = logging.StreamHandler(stream=sys.stderr) +logger = logging.getLogger("install_genotypes") +logger.addHandler(stderr_handler) + +def insert_genotypes(dbconn: mdb.Connection, + speciesid: int, + populationid: int, + genotypes: tuple[dict]) -> int: + """Insert genotype and genotype values into the database.""" + with dbconn.cursor() as cursor: + cursor.executemany( + "INSERT INTO Geno(SpeciesId, Name, Marker_Name) " + "VALUES (%(speciesid)s, %(marker)s, %(marker)s) " + "ON DUPLICATE KEY UPDATE " + "SpeciesId=VALUE(SpeciesId)", + tuple({"speciesid": speciesid, "marker": geno["marker"]} + for geno in genotypes)) + # TODO: Install individuals/samples/strains: Strain + # TODO: Cross-ref samples to population: StrainXRef + # TODO: Install geno data: GenoData + return cursor.rowcount + +def install_genotypes(dbconn: mdb.Connection, + speciesid: int, + populationid: int, + rqtl2bundle: Path) -> int: + """Load any existing genotypes into the database.""" + count = 0 + installed = 0 + with ZipFile(str(rqtl2bundle.absolute()), "r") as zfile: + try: + logger.info("Validating bundle") + rqtl2.validate_bundle(zfile) + logger.info("Bundle validated successfully.") + logger.info(("Loading genotypes. This could take a while. " + "Please be patient.")) + + cdata = rqtl2.control_data(zfile) + genotypes = rqtl2.file_data(zfile, + "geno", + cdata, + rqtl2.make_process_data_geno(cdata)) + while True: + batch = tuple(take(genotypes, 5000)) + if len(batch) == 0: + logger.info("Loading Genotypes complete!") + logger.info( + f"Total genotypes installed: {installed} of {count}") + break + + curr_installed = insert_genotypes( + dbconn, speciesid, populationid, batch) + installed = installed + curr_installed + count = count + len(batch) + logger.info(f"Installed {curr_installed} genotypes") + + if "gmap" in cdata: + logger.info("Loading genetic mapping info.") + # TODO: load gmap files + logger.info("Successfully loaded genetic mapping.") + + if "pmap" in cdata: + logger.info("Loading physical mapping info.") + # TODO: load pmap files + logger.info("Successfully loaded physical mapping.") + except rqtl2.InvalidFormat as exc: + logger.error(str(exc)) + logger.info("There are no genotypes to load.") + except Exception as exc: + logger.error("Failing with exception: %s", traceback.format_exc()) + return 3 + + return 0 + +if __name__ == "__main__": + + def cli_args(): + """Process command-line arguments for install_genotypes""" + parser = ArgumentParser( + prog="install_genotypes", + description="Parse genotypes from R/qtl2 bundle into the database.") + + parser.add_argument("databaseuri", help="URL to MariaDB") + parser.add_argument("redisuri", help="URL to Redis") + parser.add_argument("jobid", + help="Job ID that this belongs to.", + type=uuid.UUID) + + parser.add_argument("speciesid", + help="Species to which bundle relates.") + parser.add_argument("populationid", + help="Population to group data under") + parser.add_argument("rqtl2bundle", + help="Path to R/qtl2 bundle zip file.", + type=Path) + + parser.add_argument("--redisexpiry", + help="How long to keep any redis keys around.", + type=int, + default=86400) + + return parser.parse_args() + + def main(): + """Run `install_genotypes` scripts.""" + args = cli_args() + check_db(args.databaseuri) + check_redis(args.redisuri) + if not args.rqtl2bundle.exists(): + logging.error("File not found: '%s'.", args.rqtl2bundle) + return 2 + + with (Redis.from_url(args.redisuri, decode_responses=True) as rconn, + database_connection(args.databaseuri) as dbconn): + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s: %(message)s") + job_messagelist = f"{str(args.jobid)}:log-messages" + rconn.hset(name=str(args.jobid), + key="log-messagelist", + value=job_messagelist) + redislogger = RedisLogger( + rconn, args.jobid, expiry=args.redisexpiry) + redislogger.setFormatter(formatter) + logger.addHandler(redislogger) + logger.setLevel("INFO") + return install_genotypes(dbconn, + args.speciesid, + args.populationid, + args.rqtl2bundle) + + sys.exit(main()) -- cgit v1.2.3