diff options
Diffstat (limited to 'scripts/rqtl2/install_genotypes.py')
-rw-r--r-- | scripts/rqtl2/install_genotypes.py | 52 |
1 files changed, 40 insertions, 12 deletions
diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py index 5c3da85..1317c96 100644 --- a/scripts/rqtl2/install_genotypes.py +++ b/scripts/rqtl2/install_genotypes.py @@ -5,6 +5,8 @@ import logging import traceback from pathlib import Path from zipfile import ZipFile +from functools import reduce +from typing import Union, Iterator from argparse import ArgumentParser import MySQLdb as mdb @@ -109,15 +111,40 @@ def insert_genotype_data(dbconn: mdb.Connection, } for row in data) def cross_reference_genotypes(dbconn: mdb.Connection, + speciesid: int, datasetid: int, - dataids: tuple[dict, ...]) -> int: + dataids: tuple[dict, ...], + gmapdata: Union[Iterator[dict], None]) -> int: """Cross-reference the data to the relevant dataset.""" + _rows, markers, mdata = reduce(#type: ignore[var-annotated] + lambda acc, row: (#type: ignore[return-value,arg-type] + (acc[0] + (row,)), (acc[1] + (row["id"],)), + { + **acc[2], row["id"]: { + key: val + for key,val in row.items() + if key != "id" + } + }), + (gmapdata or tuple()), + (tuple(), tuple(), {})) + with dbconn.cursor(cursorclass=DictCursor) as cursor: - cursor.execute( - "INSERT INTO GenoXRef(GenoFreezeId, GenoId, DataId) " - "VALUES(%(datasetid)s, %(markerid)s, %(dataid)s) " + paramstr = ", ".join(["%s"] * len(markers)) + cursor.execute("SELECT Id, Name FROM Geno " + f"WHERE SpeciesId=%s AND Name IN ({paramstr})", + (speciesid,) + markers) + markersdict = {row["Id"]: row["Name"] for row in cursor.fetchall()} + cursor.executemany( + "INSERT INTO GenoXRef(GenoFreezeId, GenoId, DataId, cM) " + "VALUES(%(datasetid)s, %(markerid)s, %(dataid)s, %(pos)s) " "ON DUPLICATE KEY UPDATE GenoFreezeId=GenoFreezeId", - tuple({**row, "datasetid": datasetid} for row in dataids)) + tuple({ + **row, + "datasetid": datasetid, + "pos": mdata.get(markersdict.get( + row.get("markerid"), {}), {}).get("pos") + } for row in dataids)) return cursor.rowcount def install_genotypes(dbconn: mdb.Connection, @@ -135,6 +162,7 @@ def install_genotypes(dbconn: mdb.Connection, logger.info(("Loading genotypes. This could take a while. " "Please be patient.")) + cdata = rqtl2.control_data(zfile) genotypes = rqtl2.genotype_data(zfile) while True: batch = tuple(take(genotypes, 5000)) @@ -154,15 +182,15 @@ def install_genotypes(dbconn: mdb.Connection, dbconn, speciesid, populationid, individuals) _num_rows, dataids = insert_genotype_data( dbconn, speciesid, batch, individuals) - cross_reference_genotypes(dbconn, datasetid, dataids) + cross_reference_genotypes( + dbconn, + speciesid, + datasetid, + dataids, + (rqtl2.file_data(zfile, "gmap", cdata) + if "gmap" in cdata else None)) count = count + len(batch) - cdata = rqtl2.control_data(zfile) - if "gmap" in cdata: - logger.info("Loading genetic mapping info.") - # TODO: load gmap files - logger.info("Successfully loaded genetic mapping.") - if "pmap" in cdata: logger.info("Loading physical mapping info.") # TODO: load pmap files |