about | summary | refs | log | tree | commit | diff
path: root/scripts/rqtl2
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-01-11 04:04:36 +0300
committer: Frederick Muriuki Muriithi, 2024-01-11 04:04:36 +0300
commit: 2a29e3f0ed57414490f05790e664f94d89b5fdf9 (patch)
tree: cce41c86eb914b5aeffa53408b5a213bdfc4a71b /scripts/rqtl2
parent: 88a8bc0b458f079e93757057080f1ec0dc842fd8 (diff)
download: gn-uploader-2a29e3f0ed57414490f05790e664f94d89b5fdf9.tar.gz
Update gmap data in the database
Update the genetic positions (in centiMorgans) in the database.
Diffstat (limited to 'scripts/rqtl2')
-rw-r--r-- scripts/rqtl2/install_genotypes.py | 52
1 files changed, 40 insertions, 12 deletions
diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py
index 5c3da85..1317c96 100644
--- a/scripts/rqtl2/install_genotypes.py
+++ b/scripts/rqtl2/install_genotypes.py
@@ -5,6 +5,8 @@ import logging
import traceback
from pathlib import Path
from zipfile import ZipFile
+from functools import reduce
+from typing import Union, Iterator
from argparse import ArgumentParser
import MySQLdb as mdb
@@ -109,15 +111,40 @@ def insert_genotype_data(dbconn: mdb.Connection,
} for row in data)
def cross_reference_genotypes(dbconn: mdb.Connection,
+ speciesid: int,
datasetid: int,
- dataids: tuple[dict, ...]) -> int:
+ dataids: tuple[dict, ...],
+ gmapdata: Union[Iterator[dict], None]) -> int:
"""Cross-reference the data to the relevant dataset."""
+ _rows, markers, mdata = reduce(#type: ignore[var-annotated]
+ lambda acc, row: (#type: ignore[return-value,arg-type]
+ (acc[0] + (row,)), (acc[1] + (row["id"],)),
+ {
+ **acc[2], row["id"]: {
+ key: val
+ for key,val in row.items()
+ if key != "id"
+ }
+ }),
+ (gmapdata or tuple()),
+ (tuple(), tuple(), {}))
+
with dbconn.cursor(cursorclass=DictCursor) as cursor:
- cursor.execute(
- "INSERT INTO GenoXRef(GenoFreezeId, GenoId, DataId) "
- "VALUES(%(datasetid)s, %(markerid)s, %(dataid)s) "
+ paramstr = ", ".join(["%s"] * len(markers))
+ cursor.execute("SELECT Id, Name FROM Geno "
+ f"WHERE SpeciesId=%s AND Name IN ({paramstr})",
+ (speciesid,) + markers)
+ markersdict = {row["Id"]: row["Name"] for row in cursor.fetchall()}
+ cursor.executemany(
+ "INSERT INTO GenoXRef(GenoFreezeId, GenoId, DataId, cM) "
+ "VALUES(%(datasetid)s, %(markerid)s, %(dataid)s, %(pos)s) "
"ON DUPLICATE KEY UPDATE GenoFreezeId=GenoFreezeId",
- tuple({**row, "datasetid": datasetid} for row in dataids))
+ tuple({
+ **row,
+ "datasetid": datasetid,
+ "pos": mdata.get(markersdict.get(
+ row.get("markerid"), {}), {}).get("pos")
+ } for row in dataids))
return cursor.rowcount
def install_genotypes(dbconn: mdb.Connection,
@@ -135,6 +162,7 @@ def install_genotypes(dbconn: mdb.Connection,
logger.info(("Loading genotypes. This could take a while. "
"Please be patient."))
+ cdata = rqtl2.control_data(zfile)
genotypes = rqtl2.genotype_data(zfile)
while True:
batch = tuple(take(genotypes, 5000))
@@ -154,15 +182,15 @@ def install_genotypes(dbconn: mdb.Connection,
dbconn, speciesid, populationid, individuals)
_num_rows, dataids = insert_genotype_data(
dbconn, speciesid, batch, individuals)
- cross_reference_genotypes(dbconn, datasetid, dataids)
+ cross_reference_genotypes(
+ dbconn,
+ speciesid,
+ datasetid,
+ dataids,
+ (rqtl2.file_data(zfile, "gmap", cdata)
+ if "gmap" in cdata else None))
count = count + len(batch)
- cdata = rqtl2.control_data(zfile)
- if "gmap" in cdata:
- logger.info("Loading genetic mapping info.")
- # TODO: load gmap files
- logger.info("Successfully loaded genetic mapping.")
-
if "pmap" in cdata:
logger.info("Loading physical mapping info.")
# TODO: load pmap files