diff options
Diffstat (limited to 'uploader/phenotypes/models.py')
-rw-r--r-- | uploader/phenotypes/models.py | 241 |
1 files changed, 202 insertions, 39 deletions
diff --git a/uploader/phenotypes/models.py b/uploader/phenotypes/models.py index 73b1cce..c2aeebf 100644 --- a/uploader/phenotypes/models.py +++ b/uploader/phenotypes/models.py @@ -1,14 +1,30 @@ """Database and utility functions for phenotypes.""" -from typing import Optional +import logging +import tempfile +from pathlib import Path from functools import reduce from datetime import datetime +from typing import Optional, Iterable import MySQLdb as mdb from MySQLdb.cursors import Cursor, DictCursor -from flask import current_app as app +from functional_tools import take from gn_libs.mysqldb import debug_query +logger = logging.getLogger(__name__) + + +__PHENO_DATA_TABLES__ = { + "PublishData": { + "table": "PublishData", "valueCol": "value", "DataIdCol": "Id"}, + "PublishSE": { + "table": "PublishSE", "valueCol": "error", "DataIdCol": "DataId"}, + "NStrain": { + "table": "NStrain", "valueCol": "count", "DataIdCol": "DataId"} +} + + def datasets_by_population( conn: mdb.Connection, species_id: int, @@ -32,10 +48,10 @@ def dataset_by_id(conn: mdb.Connection, """Fetch dataset details by identifier""" with conn.cursor(cursorclass=DictCursor) as cursor: cursor.execute( - "SELECT s.SpeciesId, pf.* FROM Species AS s " - "INNER JOIN InbredSet AS iset ON s.Id=iset.SpeciesId " - "INNER JOIN PublishFreeze AS pf ON iset.Id=pf.InbredSetId " - "WHERE s.Id=%s AND iset.Id=%s AND pf.Id=%s", + "SELECT Species.SpeciesId, PublishFreeze.* FROM Species " + "INNER JOIN InbredSet ON Species.Id=InbredSet.SpeciesId " + "INNER JOIN PublishFreeze ON InbredSet.Id=PublishFreeze.InbredSetId " + "WHERE Species.Id=%s AND InbredSet.Id=%s AND PublishFreeze.Id=%s", (species_id, population_id, dataset_id)) return dict(cursor.fetchone()) @@ -54,6 +70,20 @@ def phenotypes_count(conn: mdb.Connection, return int(cursor.fetchone()["total_phenos"]) +def phenotype_publication_data(conn, phenotype_id) -> Optional[dict]: + """Retrieve the publication data for a phenotype if it exists.""" + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute( + "SELECT DISTINCT pxr.PhenotypeId, pub.* FROM PublishXRef AS pxr " + "INNER JOIN Publication as pub ON pxr.PublicationId=pub.Id " + "WHERE pxr.PhenotypeId=%s", + (phenotype_id,)) + res = cursor.fetchone() + if res is None: + return res + return dict(res) + + def dataset_phenotypes(conn: mdb.Connection, population_id: int, dataset_id: int, @@ -61,7 +91,7 @@ def dataset_phenotypes(conn: mdb.Connection, limit: Optional[int] = None) -> tuple[dict, ...]: """Fetch the actual phenotypes.""" _query = ( - "SELECT pheno.*, pxr.Id, ist.InbredSetCode FROM Phenotype AS pheno " + "SELECT pheno.*, pxr.Id AS xref_id, pxr.InbredSetId, ist.InbredSetCode FROM Phenotype AS pheno " "INNER JOIN PublishXRef AS pxr ON pheno.Id=pxr.PhenotypeId " "INNER JOIN PublishFreeze AS pf ON pxr.InbredSetId=pf.InbredSetId " "INNER JOIN InbredSet AS ist ON pf.InbredSetId=ist.Id " @@ -69,35 +99,45 @@ def dataset_phenotypes(conn: mdb.Connection, f" LIMIT {limit} OFFSET {offset}" if bool(limit) else "") with conn.cursor(cursorclass=DictCursor) as cursor: cursor.execute(_query, (population_id, dataset_id)) - debug_query(cursor, app.logger) + debug_query(cursor, logger) return tuple(dict(row) for row in cursor.fetchall()) -def __phenotype_se__(cursor: Cursor, - species_id: int, - population_id: int, - dataset_id: int, - xref_id: str) -> dict: +def __phenotype_se__(cursor: Cursor, xref_id, dataids_and_strainids): """Fetch standard-error values (if they exist) for a phenotype.""" - _sequery = ( - "SELECT pxr.Id AS xref_id, pxr.DataId, str.Id AS StrainId, pse.error, nst.count " - "FROM Phenotype AS pheno " - "INNER JOIN PublishXRef AS pxr ON pheno.Id=pxr.PhenotypeId " - "INNER JOIN PublishSE AS pse ON pxr.DataId=pse.DataId " - "INNER JOIN NStrain AS nst ON pse.DataId=nst.DataId " - "INNER JOIN Strain AS str ON nst.StrainId=str.Id " - "INNER JOIN StrainXRef AS sxr ON str.Id=sxr.StrainId " - "INNER JOIN PublishFreeze AS pf ON sxr.InbredSetId=pf.InbredSetId " - "INNER JOIN InbredSet AS iset ON pf.InbredSetId=iset.InbredSetId " - "WHERE (str.SpeciesId, pxr.InbredSetId, pf.Id, pxr.Id)=(%s, %s, %s, %s)") - cursor.execute(_sequery, - (species_id, population_id, dataset_id, xref_id)) - return {(row["DataId"], row["StrainId"]): { - "xref_id": row["xref_id"], - "DataId": row["DataId"], - "error": row["error"], - "count": row["count"] - } for row in cursor.fetchall()} + paramstr = ", ".join(["(%s, %s)"] * len(dataids_and_strainids)) + flat = tuple(item for sublist in dataids_and_strainids for item in sublist) + cursor.execute("SELECT * FROM PublishSE WHERE (DataId, StrainId) IN " + f"({paramstr})", + flat) + debug_query(cursor, logger) + _se = { + (row["DataId"], row["StrainId"]): { + "DataId": row["DataId"], + "StrainId": row["StrainId"], + "error": row["error"] + } + for row in cursor.fetchall() + } + + cursor.execute("SELECT * FROM NStrain WHERE (DataId, StrainId) IN " + f"({paramstr})", + flat) + debug_query(cursor, logger) + _n = { + (row["DataId"], row["StrainId"]): { + "DataId": row["DataId"], + "StrainId": row["StrainId"], + "count": row["count"] + } + for row in cursor.fetchall() + } + + keys = set(tuple(_se.keys()) + tuple(_n.keys())) + return { + key: {"xref_id": xref_id, **_se.get(key,{}), **_n.get(key,{})} + for key in keys + } def __organise_by_phenotype__(pheno, row): """Organise disparate data rows into phenotype 'objects'.""" @@ -113,10 +153,12 @@ def __organise_by_phenotype__(pheno, row): "Pre_publication_abbreviation": row["Pre_publication_abbreviation"], "Post_publication_abbreviation": row["Post_publication_abbreviation"], "xref_id": row["pxr.Id"], + "DataId": row["DataId"], "data": { **(_pheno["data"] if bool(_pheno) else {}), (row["DataId"], row["StrainId"]): { "DataId": row["DataId"], + "StrainId": row["StrainId"], "mean": row["mean"], "Locus": row["Locus"], "LRS": row["LRS"], @@ -170,11 +212,9 @@ def phenotype_by_id( **_pheno, "data": tuple(__merge_pheno_data_and_se__( _pheno["data"], - __phenotype_se__(cursor, - species_id, - population_id, - dataset_id, - xref_id)).values()) + __phenotype_se__( + cursor, xref_id, tuple(_pheno["data"].keys())) + ).values()) } if bool(_pheno) and len(_pheno.keys()) > 1: raise Exception( @@ -202,7 +242,7 @@ def phenotypes_data(conn: mdb.Connection, f" LIMIT {limit} OFFSET {offset}" if bool(limit) else "") with conn.cursor(cursorclass=DictCursor) as cursor: cursor.execute(_query, (population_id, dataset_id)) - debug_query(cursor, app.logger) + debug_query(cursor, logger) return tuple(dict(row) for row in cursor.fetchall()) @@ -229,5 +269,128 @@ def save_new_dataset(cursor: Cursor, "%(created)s, %(public)s, %(population_id)s, %(confidentiality)s, " "%(users)s)", params) - debug_query(cursor, app.logger) + debug_query(cursor, logger) return {**params, "Id": cursor.lastrowid} + + +def phenotypes_data_by_ids( + conn: mdb.Connection, + inbred_pheno_xref: dict[str, int] +) -> tuple[dict, ...]: + """Fetch all phenotype data, filtered by the `inbred_pheno_xref` mapping.""" + _paramstr = ",".join(["(%s, %s, %s)"] * len(inbred_pheno_xref)) + _query = ("SELECT " + "pub.PubMed_ID, pheno.*, pxr.*, pd.*, str.*, iset.InbredSetCode " + "FROM Publication AS pub " + "RIGHT JOIN PublishXRef AS pxr0 ON pub.Id=pxr0.PublicationId " + "INNER JOIN Phenotype AS pheno ON pxr0.PhenotypeId=pheno.id " + "INNER JOIN PublishXRef AS pxr ON pheno.Id=pxr.PhenotypeId " + "INNER JOIN PublishData AS pd ON pxr.DataId=pd.Id " + "INNER JOIN Strain AS str ON pd.StrainId=str.Id " + "INNER JOIN StrainXRef AS sxr ON str.Id=sxr.StrainId " + "INNER JOIN PublishFreeze AS pf ON sxr.InbredSetId=pf.InbredSetId " + "INNER JOIN InbredSet AS iset ON pf.InbredSetId=iset.InbredSetId " + f"WHERE (pxr.InbredSetId, pheno.Id, pxr.Id) IN ({_paramstr}) " + "ORDER BY pheno.Id") + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute(_query, tuple(item for row in inbred_pheno_xref + for item in (row["population_id"], + row["phenoid"], + row["xref_id"]))) + debug_query(cursor, logger) + return tuple( + reduce(__organise_by_phenotype__, cursor.fetchall(), {}).values()) + + +def create_new_phenotypes(conn: mdb.Connection, + phenotypes: Iterable[dict]) -> tuple[dict, ...]: + """Add entirely new phenotypes to the database.""" + _phenos = tuple() + with conn.cursor(cursorclass=DictCursor) as cursor: + while True: + batch = take(phenotypes, 1000) + if len(batch) == 0: + break + + cursor.executemany( + ("INSERT INTO " + "Phenotype(Pre_publication_description, Original_description, Units, Authorized_Users) " + "VALUES (%s, %s, %s, 'robwilliams')"), + tuple((row["id"], row["description"], row["units"]) + for row in batch)) + paramstr = ", ".join(["%s"] * len(batch)) + cursor.execute( + "SELECT * FROM Phenotype WHERE Pre_publication_description IN " + f"({paramstr})", + tuple(item["id"] for item in batch)) + _phenos = _phenos + tuple({ + "phenotype_id": row["Id"], + "id": row["Pre_publication_description"], + "description": row["Original_description"], + "units": row["Units"] + } for row in cursor.fetchall()) + + return _phenos + + +def save_phenotypes_data( + conn: mdb.Connection, + table: str, + data: Iterable[dict] +) -> int: + """Save new phenotypes data into the database.""" + _table_details = __PHENO_DATA_TABLES__[table] + with conn.cursor(cursorclass=DictCursor) as cursor: + _count = 0 + while True: + batch = take(data, 100000) + if len(batch) == 0: + logger.warning("Got an empty batch. This needs investigation.") + break + + logger.debug("Saving batch of %s items.", len(batch)) + cursor.executemany( + (f"INSERT INTO {_table_details['table']}" + f"({_table_details['DataIdCol']}, StrainId, {_table_details['valueCol']}) " + "VALUES " + f"(%(data_id)s, %(sample_id)s, %(value)s) "), + tuple(batch)) + debug_query(cursor, logger) + _count = _count + len(batch) + + + logger.debug("Saved a total of %s data rows", _count) + return _count + + +def quick_save_phenotypes_data( + conn: mdb.Connection, + table: str, + dataitems: Iterable[dict], + tmpdir: Path +) -> int: + """Save data items to the database, but using """ + _table_details = __PHENO_DATA_TABLES__[table] + with (tempfile.NamedTemporaryFile( + prefix=f"{table}_data", mode="wt", dir=tmpdir) as tmpfile, + conn.cursor(cursorclass=DictCursor) as cursor): + _count = 0 + logger.debug("Write data rows to text file.") + for row in dataitems: + tmpfile.write( + f'{row["data_id"]}\t{row["sample_id"]}\t{row["value"]}\n') + _count = _count + 1 + tmpfile.flush() + + logger.debug("Load text file into database (table: %s)", + _table_details["table"]) + cursor.execute( + f"LOAD DATA LOCAL INFILE '{tmpfile.name}' " + f"INTO TABLE {_table_details['table']} " + "(" + f"{_table_details['DataIdCol']}, " + "StrainId, " + f"{_table_details['valueCol']}" + ")") + debug_query(cursor, logger) + return _count |