about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-01-11 04:04:36 +0300
committer: Frederick Muriuki Muriithi, 2024-01-11 04:04:36 +0300
commit: 2a29e3f0ed57414490f05790e664f94d89b5fdf9 (patch)
tree: cce41c86eb914b5aeffa53408b5a213bdfc4a71b /scripts
parent: 88a8bc0b458f079e93757057080f1ec0dc842fd8 (diff)
download: gn-uploader-2a29e3f0ed57414490f05790e664f94d89b5fdf9.tar.gz
Update gmap data in the database
Update the genetic positions (in centiMorgans) in the database.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/rqtl2/install_genotypes.py | 52
1 files changed, 40 insertions, 12 deletions
diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py
index 5c3da85..1317c96 100644
--- a/scripts/rqtl2/install_genotypes.py
+++ b/scripts/rqtl2/install_genotypes.py
@@ -5,6 +5,8 @@ import logging
 import traceback
 from pathlib import Path
 from zipfile import ZipFile
+from functools import reduce
+from typing import Union, Iterator
 from argparse import ArgumentParser
 
 import MySQLdb as mdb
@@ -109,15 +111,40 @@ def insert_genotype_data(dbconn: mdb.Connection,
         } for row in data)
 
 def cross_reference_genotypes(dbconn: mdb.Connection,
+                              speciesid: int,
                               datasetid: int,
-                              dataids: tuple[dict, ...]) -> int:
+                              dataids: tuple[dict, ...],
+                              gmapdata: Union[Iterator[dict], None]) -> int:
     """Cross-reference the data to the relevant dataset."""
+    _rows, markers, mdata = reduce(#type: ignore[var-annotated]
+        lambda acc, row: (#type: ignore[return-value,arg-type]
+            (acc[0] + (row,)), (acc[1] + (row["id"],)),
+            {
+                **acc[2], row["id"]: {
+                    key: val
+                    for key,val in row.items()
+                    if key != "id"
+                }
+            }),
+        (gmapdata or tuple()),
+        (tuple(), tuple(), {}))
+
     with dbconn.cursor(cursorclass=DictCursor) as cursor:
-        cursor.execute(
-            "INSERT INTO GenoXRef(GenoFreezeId, GenoId, DataId) "
-            "VALUES(%(datasetid)s, %(markerid)s, %(dataid)s) "
+        paramstr = ", ".join(["%s"] * len(markers))
+        cursor.execute("SELECT Id, Name FROM Geno "
+                       f"WHERE SpeciesId=%s AND Name IN ({paramstr})",
+                       (speciesid,) + markers)
+        markersdict = {row["Id"]: row["Name"] for row in cursor.fetchall()}
+        cursor.executemany(
+            "INSERT INTO GenoXRef(GenoFreezeId, GenoId, DataId, cM) "
+            "VALUES(%(datasetid)s, %(markerid)s, %(dataid)s, %(pos)s) "
             "ON DUPLICATE KEY UPDATE GenoFreezeId=GenoFreezeId",
-            tuple({**row, "datasetid": datasetid} for row in dataids))
+            tuple({
+                **row,
+                "datasetid": datasetid,
+                "pos": mdata.get(markersdict.get(
+                    row.get("markerid"), {}), {}).get("pos")
+            } for row in dataids))
         return cursor.rowcount
 
 def install_genotypes(dbconn: mdb.Connection,
@@ -135,6 +162,7 @@ def install_genotypes(dbconn: mdb.Connection,
             logger.info(("Loading genotypes. This could take a while. "
                          "Please be patient."))
 
+            cdata = rqtl2.control_data(zfile)
             genotypes = rqtl2.genotype_data(zfile)
             while True:
                 batch = tuple(take(genotypes, 5000))
@@ -154,15 +182,15 @@ def install_genotypes(dbconn: mdb.Connection,
                     dbconn, speciesid, populationid, individuals)
                 _num_rows, dataids = insert_genotype_data(
                     dbconn, speciesid, batch, individuals)
-                cross_reference_genotypes(dbconn, datasetid, dataids)
+                cross_reference_genotypes(
+                    dbconn,
+                    speciesid,
+                    datasetid,
+                    dataids,
+                    (rqtl2.file_data(zfile, "gmap", cdata)
+                     if "gmap" in cdata else None))
                 count = count + len(batch)
 
-            cdata = rqtl2.control_data(zfile)
-            if "gmap" in cdata:
-                logger.info("Loading genetic mapping info.")
-                # TODO: load gmap files
-                logger.info("Successfully loaded genetic mapping.")
-
             if "pmap" in cdata:
                 logger.info("Loading physical mapping info.")
                 # TODO: load pmap files