diff options
-rw-r--r-- | scripts/rqtl2/install_genotypes.py | 94 |
1 files changed, 60 insertions, 34 deletions
diff --git a/scripts/rqtl2/install_genotypes.py b/scripts/rqtl2/install_genotypes.py index dffd9d8..1eb3b75 100644 --- a/scripts/rqtl2/install_genotypes.py +++ b/scripts/rqtl2/install_genotypes.py @@ -19,10 +19,13 @@ from scripts.rqtl2.entry import build_main from scripts.rqtl2.cli_parser import add_common_arguments from scripts.cli_parser import init_cli_parser, add_global_data_arguments -def insert_markers(dbconn: mdb.Connection, - speciesid: int, - markers: tuple[str, ...], - pmapdata: Optional[Iterator[dict]]) -> int: +def insert_markers( + dbconn: mdb.Connection, + speciesid: int, + markers: tuple[str, ...], + pmapdata: Optional[Iterator[dict]], + _logger: Logger +) -> int: """Insert genotype and genotype values into the database.""" mdata = reduce(#type: ignore[var-annotated] lambda acc, row: ({#type: ignore[arg-type, return-value] @@ -48,9 +51,12 @@ def insert_markers(dbconn: mdb.Connection, } for marker in markers}.values())) return cursor.rowcount -def insert_individuals(dbconn: mdb.Connection, - speciesid: int, - individuals: tuple[str, ...]) -> int: +def insert_individuals( + dbconn: mdb.Connection, + speciesid: int, + individuals: tuple[str, ...], + _logger: Logger +) -> int: """Insert individuals/samples into the database.""" with dbconn.cursor() as cursor: cursor.executemany( @@ -61,10 +67,13 @@ def insert_individuals(dbconn: mdb.Connection, for individual in individuals)) return cursor.rowcount -def cross_reference_individuals(dbconn: mdb.Connection, - speciesid: int, - populationid: int, - individuals: tuple[str, ...]) -> int: +def cross_reference_individuals( + dbconn: mdb.Connection, + speciesid: int, + populationid: int, + individuals: tuple[str, ...], + _logger: Logger +) -> int: """Cross reference any inserted individuals.""" with dbconn.cursor(cursorclass=DictCursor) as cursor: paramstr = ", ".join(["%s"] * len(individuals)) @@ -80,11 +89,13 @@ def cross_reference_individuals(dbconn: mdb.Connection, tuple(ids)) return cursor.rowcount -def insert_genotype_data(dbconn: mdb.Connection, - speciesid: int, - genotypes: tuple[dict, ...], - individuals: tuple[str, ...]) -> tuple[ - int, tuple[dict, ...]]: +def insert_genotype_data( + dbconn: mdb.Connection, + speciesid: int, + genotypes: tuple[dict, ...], + individuals: tuple[str, ...], + _logger: Logger +) -> tuple[int, tuple[dict, ...]]: """Insert the genotype data values into the database.""" with dbconn.cursor(cursorclass=DictCursor) as cursor: paramstr = ", ".join(["%s"] * len(individuals)) @@ -120,11 +131,14 @@ def insert_genotype_data(dbconn: mdb.Connection, "markerid": row["markerid"] } for row in data) -def cross_reference_genotypes(dbconn: mdb.Connection, - speciesid: int, - datasetid: int, - dataids: tuple[dict, ...], - gmapdata: Optional[Iterator[dict]]) -> int: +def cross_reference_genotypes( + dbconn: mdb.Connection, + speciesid: int, + datasetid: int, + dataids: tuple[dict, ...], + gmapdata: Optional[Iterator[dict]], + _logger: Logger +) -> int: """Cross-reference the data to the relevant dataset.""" _rows, markers, mdata = reduce(#type: ignore[var-annotated] lambda acc, row: (#type: ignore[return-value,arg-type] @@ -141,20 +155,30 @@ def cross_reference_genotypes(dbconn: mdb.Connection, with dbconn.cursor(cursorclass=DictCursor) as cursor: paramstr = ", ".join(["%s"] * len(markers)) - cursor.execute("SELECT Id, Name FROM Geno " - f"WHERE SpeciesId=%s AND Name IN ({paramstr})", - (speciesid,) + markers) + insertparams = (speciesid,) + markers + selectquery = ("SELECT Id, Name FROM Geno " + f"WHERE SpeciesId=%s AND Name IN ({paramstr})") + _logger.debug( + "The select query was\n\t%s\n\nwith the parameters\n\t%s", + selectquery, + (speciesid,) + markers) + cursor.execute(query, insertparams) markersdict = {row["Id"]: row["Name"] for row in cursor.fetchall()} - cursor.executemany( + insertquery = ( "INSERT INTO GenoXRef(GenoFreezeId, GenoId, DataId, cM) " "VALUES(%(datasetid)s, %(markerid)s, %(dataid)s, %(pos)s) " - "ON DUPLICATE KEY UPDATE GenoFreezeId=GenoFreezeId", - tuple({ - **row, - "datasetid": datasetid, - "pos": mdata.get(markersdict.get( - row.get("markerid"), "nosuchkey"), {}).get("pos") - } for row in dataids)) + "ON DUPLICATE KEY UPDATE GenoFreezeId=GenoFreezeId") + insertparams = tuple({ + **row, + "datasetid": datasetid, + "pos": mdata.get(markersdict.get( + row.get("markerid"), "nosuchkey"), {}).get("pos") + } for row in dataids) + _logger.debug( + "The insert query was\n\t%s\n\nwith the parameters\n\t%s", + insertquery, + (speciesid,) + markers) + cursor.executemany(insertquery, insertparams) return cursor.rowcount def install_genotypes(#pylint: disable=[too-many-arguments, too-many-locals] @@ -189,7 +213,8 @@ def install_genotypes(#pylint: disable=[too-many-arguments, too-many-locals] speciesid, tuple(key for key in batch[0].keys() if key != "id"), (rqtl2.file_data(zfile, "pmap", cdata) if "pmap" in cdata - else None)) + else None), + logger) individuals = tuple(row["id"] for row in batch) insert_individuals(dbconn, speciesid, individuals) cross_reference_individuals( @@ -202,7 +227,8 @@ def install_genotypes(#pylint: disable=[too-many-arguments, too-many-locals] datasetid, dataids, (rqtl2.file_data(zfile, "gmap", cdata) - if "gmap" in cdata else None)) + if "gmap" in cdata else None), + logger) count = count + len(batch) except rqtl2.InvalidFormat as exc: logger.error(str(exc)) |