about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2024-01-09 11:24:21 +0300
committerFrederick Muriuki Muriithi2024-01-09 11:24:21 +0300
commit21078fecb698972062af3157a9a0f6e84bb8fd0d (patch)
tree8319cc872e29091c9fa2f79db81e22ae99701c83 /scripts
parent5491da18dda8c6f55bc2bc5d95f21b86908cd382 (diff)
downloadgn-uploader-21078fecb698972062af3157a9a0f6e84bb8fd0d.tar.gz
WIP: Install genotypes from R/qtl2 bundle
Load the genotype information from the R/qtl2 bundle and insert it
into the database.
Diffstat (limited to 'scripts')
-rw-r--r--scripts/rqtl2/__init__.py1
-rw-r--r--scripts/rqtl2/install_genotypes.py152
2 files changed, 153 insertions, 0 deletions
diff --git a/scripts/rqtl2/__init__.py b/scripts/rqtl2/__init__.py
new file mode 100644
index 0000000..62bc52a
--- /dev/null
+++ b/scripts/rqtl2/__init__.py
@@ -0,0 +1 @@
+"""A package of scripts for use in parsing R/qtl2 bundles"""
diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py
new file mode 100644
index 0000000..f5b6eb4
--- /dev/null
+++ b/scripts/rqtl2/install_genotypes.py
@@ -0,0 +1,152 @@
+"""Load genotypes from R/qtl2 bundle into the database."""
+import sys
+import uuid
+import logging
+import traceback
+from pathlib import Path
+from zipfile import ZipFile
+from argparse import ArgumentParser
+
+import MySQLdb as mdb
+from redis import Redis
+
+from r_qtl import r_qtl2 as rqtl2
+
+from quality_control.parsing import take
+
+from qc_app.db_utils import database_connection
+from qc_app.check_connections import check_db, check_redis
+
+from scripts.redis_logger import RedisLogger
+
+stderr_handler = logging.StreamHandler(stream=sys.stderr)
+logger = logging.getLogger("install_genotypes")
+logger.addHandler(stderr_handler)
+
+def insert_genotypes(dbconn: mdb.Connection,
+                     speciesid: int,
+                     populationid: int,
+                     genotypes: tuple[dict]) -> int:
+    """Insert genotype and genotype values into the database."""
+    with dbconn.cursor() as cursor:
+        cursor.executemany(
+            "INSERT INTO Geno(SpeciesId, Name, Marker_Name) "
+            "VALUES (%(speciesid)s, %(marker)s, %(marker)s) "
+            "ON DUPLICATE KEY UPDATE "
+            "SpeciesId=VALUE(SpeciesId)",
+            tuple({"speciesid": speciesid, "marker": geno["marker"]}
+                  for geno in genotypes))
+        # TODO: Install individuals/samples/strains: Strain
+        # TODO: Cross-ref samples to population: StrainXRef
+        # TODO: Install geno data: GenoData
+        return cursor.rowcount
+
+def install_genotypes(dbconn: mdb.Connection,
+                      speciesid: int,
+                      populationid: int,
+                      rqtl2bundle: Path) -> int:
+    """Load any existing genotypes into the database."""
+    count = 0
+    installed = 0
+    with ZipFile(str(rqtl2bundle.absolute()), "r") as zfile:
+        try:
+            logger.info("Validating bundle")
+            rqtl2.validate_bundle(zfile)
+            logger.info("Bundle validated successfully.")
+            logger.info(("Loading genotypes. This could take a while. "
+                         "Please be patient."))
+            
+            cdata = rqtl2.control_data(zfile)
+            genotypes = rqtl2.file_data(zfile,
+                                        "geno",
+                                        cdata,
+                                        rqtl2.make_process_data_geno(cdata))
+            while True:
+                batch = tuple(take(genotypes, 5000))
+                if len(batch) == 0:
+                    logger.info("Loading Genotypes complete!")
+                    logger.info(
+                        f"Total genotypes installed: {installed} of {count}")
+                    break
+
+                curr_installed = insert_genotypes(
+                    dbconn, speciesid, populationid, batch)
+                installed = installed + curr_installed
+                count = count + len(batch)
+                logger.info(f"Installed {curr_installed} genotypes")
+
+            if "gmap" in cdata:
+                logger.info("Loading genetic mapping info.")
+                # TODO: load gmap files
+                logger.info("Successfully loaded genetic mapping.")
+
+            if "pmap" in cdata:
+                logger.info("Loading physical mapping info.")
+                # TODO: load pmap files
+                logger.info("Successfully loaded physical mapping.")
+        except rqtl2.InvalidFormat as exc:
+            logger.error(str(exc))
+            logger.info("There are no genotypes to load.")
+        except Exception as exc:
+            logger.error("Failing with exception: %s", traceback.format_exc())
+            return 3
+
+    return 0
+
+if __name__ == "__main__":
+
+    def cli_args():
+        """Process command-line arguments for install_genotypes"""
+        parser = ArgumentParser(
+            prog="install_genotypes",
+            description="Parse genotypes from R/qtl2 bundle into the database.")
+
+        parser.add_argument("databaseuri", help="URL to MariaDB")
+        parser.add_argument("redisuri", help="URL to Redis")
+        parser.add_argument("jobid",
+                            help="Job ID that this belongs to.",
+                            type=uuid.UUID)
+
+        parser.add_argument("speciesid",
+                            help="Species to which bundle relates.")
+        parser.add_argument("populationid",
+                            help="Population to group data under")
+        parser.add_argument("rqtl2bundle",
+                            help="Path to R/qtl2 bundle zip file.",
+                            type=Path)
+
+        parser.add_argument("--redisexpiry",
+                            help="How long to keep any redis keys around.",
+                            type=int,
+                            default=86400)
+
+        return parser.parse_args()
+
+    def main():
+        """Run `install_genotypes` scripts."""
+        args = cli_args()
+        check_db(args.databaseuri)
+        check_redis(args.redisuri)
+        if not args.rqtl2bundle.exists():
+            logging.error("File not found: '%s'.", args.rqtl2bundle)
+            return 2
+
+        with (Redis.from_url(args.redisuri, decode_responses=True) as rconn,
+              database_connection(args.databaseuri) as dbconn):
+            formatter = logging.Formatter(
+                "%(asctime)s - %(name)s - %(levelname)s: %(message)s")
+            job_messagelist = f"{str(args.jobid)}:log-messages"
+            rconn.hset(name=str(args.jobid),
+                       key="log-messagelist",
+                       value=job_messagelist)
+            redislogger = RedisLogger(
+                rconn, args.jobid, expiry=args.redisexpiry)
+            redislogger.setFormatter(formatter)
+            logger.addHandler(redislogger)
+            logger.setLevel("INFO")
+            return install_genotypes(dbconn,
+                                     args.speciesid,
+                                     args.populationid,
+                                     args.rqtl2bundle)
+
+    sys.exit(main())