diff options
Diffstat (limited to 'uploader/phenotypes/models.py')
-rw-r--r-- | uploader/phenotypes/models.py | 103 |
1 files changed, 80 insertions, 23 deletions
diff --git a/uploader/phenotypes/models.py b/uploader/phenotypes/models.py index 48e64da..20b8e77 100644 --- a/uploader/phenotypes/models.py +++ b/uploader/phenotypes/models.py @@ -1,5 +1,7 @@ """Database and utility functions for phenotypes.""" import logging +import tempfile +from pathlib import Path from functools import reduce from datetime import datetime from typing import Optional, Iterable @@ -13,6 +15,16 @@ from gn_libs.mysqldb import debug_query logger = logging.getLogger(__name__) +__PHENO_DATA_TABLES__ = { + "PublishData": { + "table": "PublishData", "valueCol": "value", "DataIdCol": "Id"}, + "PublishSE": { + "table": "PublishSE", "valueCol": "error", "DataIdCol": "DataId"}, + "NStrain": { + "table": "NStrain", "valueCol": "count", "DataIdCol": "DataId"} +} + + def datasets_by_population( conn: mdb.Connection, species_id: int, @@ -36,10 +48,10 @@ def dataset_by_id(conn: mdb.Connection, """Fetch dataset details by identifier""" with conn.cursor(cursorclass=DictCursor) as cursor: cursor.execute( - "SELECT s.SpeciesId, pf.* FROM Species AS s " - "INNER JOIN InbredSet AS iset ON s.Id=iset.SpeciesId " - "INNER JOIN PublishFreeze AS pf ON iset.Id=pf.InbredSetId " - "WHERE s.Id=%s AND iset.Id=%s AND pf.Id=%s", + "SELECT Species.SpeciesId, PublishFreeze.* FROM Species " + "INNER JOIN InbredSet ON Species.Id=InbredSet.SpeciesId " + "INNER JOIN PublishFreeze ON InbredSet.Id=PublishFreeze.InbredSetId " + "WHERE Species.Id=%s AND InbredSet.Id=%s AND PublishFreeze.Id=%s", (species_id, population_id, dataset_id)) return dict(cursor.fetchone()) @@ -295,13 +307,21 @@ def create_new_phenotypes(conn: mdb.Connection, """Add entirely new phenotypes to the database.""" _phenos = tuple() with conn.cursor(cursorclass=DictCursor) as cursor: - for batch in take(phenotypes, 1000): + while True: + batch = take(phenotypes, 1000) + if len(batch) == 0: + break + cursor.executemany( ("INSERT INTO " - "Phenotypes(Pre_publication_description, Original_description, Units, Authorized_Users) " - "VALUES (%(id)s, %(description)s, %(units)s, 'robwilliams') " - "RETURNING *"), + "Phenotype(Pre_publication_description, Original_description, Units, Authorized_Users) " + "VALUES (%(id)s, %(description)s, %(units)s, 'robwilliams')"), tuple(batch)) + paramstr = ", ".join(["%s"] * len(batch)) + cursor.execute( + "SELECT * FROM Phenotype WHERE Pre_publication_description IN " + f"({paramstr})", + tuple(item["id"] for item in batch)) _phenos = _phenos + tuple({ "phenotype_id": row["Id"], "id": row["Pre_publication_description"], @@ -316,23 +336,60 @@ def save_phenotypes_data( conn: mdb.Connection, table: str, data: Iterable[dict] -) -> tuple[dict, ...]: +) -> int: """Save new phenotypes data into the database.""" - _table_details = { - "PublishData": {"table": "PublishData", "valueCol": "value"}, - "PublishSE": {"table": "PublishSE", "valueCol": "error"}, - "NStrain": {"table": "PublishData", "valueCol": "count"} - }[table] - saved_data = tuple() + _table_details = __PHENO_DATA_TABLES__[table] with conn.cursor(cursorclass=DictCursor) as cursor: - for batch in take(data, 5000): + _count = 0 + while True: + batch = take(data, 100000) + if len(batch) == 0: + logger.warning("Got an empty batch. This needs investigation.") + break + + logger.debug("Saving batch of %s items.", len(batch)) cursor.executemany( - (f"INSERT INTO {_table_details['table']}" - f"(Id, StrainId, {_table_details['valueCol']}) " - "VALUES " - f"(%(data_id)s, %(sample_id)s, %({_table_details['valueCol']})s) " - "RETURNING *"), + (f"INSERT INTO {_table_details['table']}" + f"({_table_details['DataIdCol']}, StrainId, {_table_details['valueCol']}) " + "VALUES " + f"(%(data_id)s, %(sample_id)s, %(value)s) "), tuple(batch)) - _data = data + tuple(cursor.fetchall()) + debug_query(cursor, logger) + _count = _count + len(batch) - return saved_data + + logger.debug("Saved a total of %s data rows", _count) + return _count + + +def quick_save_phenotypes_data( + conn: mdb.Connection, + table: str, + dataitems: Iterable[dict], + tmpdir: Path +) -> int: + """Save data items to the database, but using """ + _table_details = __PHENO_DATA_TABLES__[table] + with (tempfile.NamedTemporaryFile( + prefix=f"{table}_data", mode="wt", dir=tmpdir) as tmpfile, + conn.cursor(cursorclass=DictCursor) as cursor): + _count = 0 + console.debug("Write data rows to text file.") + for row in dataitems: + tmpfile.write( + f'{row["data_id"]}\t{row["sample_id"]}\t{row["value"]}\n') + _count = _count + 1 + tmpfile.flush() + + console.debug("Load text file into database (table: %s)", + _table_details["table"]) + cursor.execute( + f"LOAD DATA LOCAL INFILE '{tmpfile.name}' " + f"INTO TABLE {_table_details['table']} " + "(" + f"{_table_details['DataIdCol']}, " + "StrainId, " + f"{_table_details['valueCol']}" + ")") + debug_query(cursor, logger) + return _count |