aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2024-01-09 11:24:21 +0300
committerFrederick Muriuki Muriithi2024-01-09 11:24:21 +0300
commit21078fecb698972062af3157a9a0f6e84bb8fd0d (patch)
tree8319cc872e29091c9fa2f79db81e22ae99701c83 /scripts
parent5491da18dda8c6f55bc2bc5d95f21b86908cd382 (diff)
downloadgn-uploader-21078fecb698972062af3157a9a0f6e84bb8fd0d.tar.gz
WIP: Install genotypes from R/qtl2 bundle
Load the genotype information from the R/qtl2 bundle and insert it into the database.
Diffstat (limited to 'scripts')
-rw-r--r--scripts/rqtl2/__init__.py1
-rw-r--r--scripts/rqtl2/install_genotypes.py152
2 files changed, 153 insertions, 0 deletions
diff --git a/scripts/rqtl2/__init__.py b/scripts/rqtl2/__init__.py
new file mode 100644
index 0000000..62bc52a
--- /dev/null
+++ b/scripts/rqtl2/__init__.py
@@ -0,0 +1 @@
+"""A package of scripts for use in parsing R/qtl2 bundles"""
diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py
new file mode 100644
index 0000000..f5b6eb4
--- /dev/null
+++ b/scripts/rqtl2/install_genotypes.py
@@ -0,0 +1,152 @@
+"""Load genotypes from R/qtl2 bundle into the database."""
+import sys
+import uuid
+import logging
+import traceback
+from pathlib import Path
+from zipfile import ZipFile
+from argparse import ArgumentParser
+
+import MySQLdb as mdb
+from redis import Redis
+
+from r_qtl import r_qtl2 as rqtl2
+
+from quality_control.parsing import take
+
+from qc_app.db_utils import database_connection
+from qc_app.check_connections import check_db, check_redis
+
+from scripts.redis_logger import RedisLogger
+
+stderr_handler = logging.StreamHandler(stream=sys.stderr)
+logger = logging.getLogger("install_genotypes")
+logger.addHandler(stderr_handler)
+
+def insert_genotypes(dbconn: mdb.Connection,
+ speciesid: int,
+ populationid: int,
+ genotypes: tuple[dict]) -> int:
+ """Insert genotype and genotype values into the database."""
+ with dbconn.cursor() as cursor:
+ cursor.executemany(
+ "INSERT INTO Geno(SpeciesId, Name, Marker_Name) "
+ "VALUES (%(speciesid)s, %(marker)s, %(marker)s) "
+ "ON DUPLICATE KEY UPDATE "
+ "SpeciesId=VALUE(SpeciesId)",
+ tuple({"speciesid": speciesid, "marker": geno["marker"]}
+ for geno in genotypes))
+ # TODO: Install individuals/samples/strains: Strain
+ # TODO: Cross-ref samples to population: StrainXRef
+ # TODO: Install geno data: GenoData
+ return cursor.rowcount
+
+def install_genotypes(dbconn: mdb.Connection,
+ speciesid: int,
+ populationid: int,
+ rqtl2bundle: Path) -> int:
+ """Load any existing genotypes into the database."""
+ count = 0
+ installed = 0
+ with ZipFile(str(rqtl2bundle.absolute()), "r") as zfile:
+ try:
+ logger.info("Validating bundle")
+ rqtl2.validate_bundle(zfile)
+ logger.info("Bundle validated successfully.")
+ logger.info(("Loading genotypes. This could take a while. "
+ "Please be patient."))
+
+ cdata = rqtl2.control_data(zfile)
+ genotypes = rqtl2.file_data(zfile,
+ "geno",
+ cdata,
+ rqtl2.make_process_data_geno(cdata))
+ while True:
+ batch = tuple(take(genotypes, 5000))
+ if len(batch) == 0:
+ logger.info("Loading Genotypes complete!")
+ logger.info(
+ f"Total genotypes installed: {installed} of {count}")
+ break
+
+ curr_installed = insert_genotypes(
+ dbconn, speciesid, populationid, batch)
+ installed = installed + curr_installed
+ count = count + len(batch)
+ logger.info(f"Installed {curr_installed} genotypes")
+
+ if "gmap" in cdata:
+ logger.info("Loading genetic mapping info.")
+ # TODO: load gmap files
+ logger.info("Successfully loaded genetic mapping.")
+
+ if "pmap" in cdata:
+ logger.info("Loading physical mapping info.")
+ # TODO: load pmap files
+ logger.info("Successfully loaded physical mapping.")
+ except rqtl2.InvalidFormat as exc:
+ logger.error(str(exc))
+ logger.info("There are no genotypes to load.")
+ except Exception as exc:
+ logger.error("Failing with exception: %s", traceback.format_exc())
+ return 3
+
+ return 0
+
+if __name__ == "__main__":
+
+ def cli_args():
+ """Process command-line arguments for install_genotypes"""
+ parser = ArgumentParser(
+ prog="install_genotypes",
+ description="Parse genotypes from R/qtl2 bundle into the database.")
+
+ parser.add_argument("databaseuri", help="URL to MariaDB")
+ parser.add_argument("redisuri", help="URL to Redis")
+ parser.add_argument("jobid",
+ help="Job ID that this belongs to.",
+ type=uuid.UUID)
+
+ parser.add_argument("speciesid",
+ help="Species to which bundle relates.")
+ parser.add_argument("populationid",
+ help="Population to group data under")
+ parser.add_argument("rqtl2bundle",
+ help="Path to R/qtl2 bundle zip file.",
+ type=Path)
+
+ parser.add_argument("--redisexpiry",
+ help="How long to keep any redis keys around.",
+ type=int,
+ default=86400)
+
+ return parser.parse_args()
+
+ def main():
+ """Run `install_genotypes` scripts."""
+ args = cli_args()
+ check_db(args.databaseuri)
+ check_redis(args.redisuri)
+ if not args.rqtl2bundle.exists():
+ logging.error("File not found: '%s'.", args.rqtl2bundle)
+ return 2
+
+ with (Redis.from_url(args.redisuri, decode_responses=True) as rconn,
+ database_connection(args.databaseuri) as dbconn):
+ formatter = logging.Formatter(
+ "%(asctime)s - %(name)s - %(levelname)s: %(message)s")
+ job_messagelist = f"{str(args.jobid)}:log-messages"
+ rconn.hset(name=str(args.jobid),
+ key="log-messagelist",
+ value=job_messagelist)
+ redislogger = RedisLogger(
+ rconn, args.jobid, expiry=args.redisexpiry)
+ redislogger.setFormatter(formatter)
+ logger.addHandler(redislogger)
+ logger.setLevel("INFO")
+ return install_genotypes(dbconn,
+ args.speciesid,
+ args.populationid,
+ args.rqtl2bundle)
+
+ sys.exit(main())