aboutsummaryrefslogtreecommitdiff
path: root/uploader/phenotypes/models.py
diff options
context:
space:
mode:
Diffstat (limited to 'uploader/phenotypes/models.py')
-rw-r--r--uploader/phenotypes/models.py103
1 files changed, 80 insertions, 23 deletions
diff --git a/uploader/phenotypes/models.py b/uploader/phenotypes/models.py
index 48e64da..20b8e77 100644
--- a/uploader/phenotypes/models.py
+++ b/uploader/phenotypes/models.py
@@ -1,5 +1,7 @@
"""Database and utility functions for phenotypes."""
import logging
+import tempfile
+from pathlib import Path
from functools import reduce
from datetime import datetime
from typing import Optional, Iterable
@@ -13,6 +15,16 @@ from gn_libs.mysqldb import debug_query
logger = logging.getLogger(__name__)
+__PHENO_DATA_TABLES__ = {
+ "PublishData": {
+ "table": "PublishData", "valueCol": "value", "DataIdCol": "Id"},
+ "PublishSE": {
+ "table": "PublishSE", "valueCol": "error", "DataIdCol": "DataId"},
+ "NStrain": {
+ "table": "NStrain", "valueCol": "count", "DataIdCol": "DataId"}
+}
+
+
def datasets_by_population(
conn: mdb.Connection,
species_id: int,
@@ -36,10 +48,10 @@ def dataset_by_id(conn: mdb.Connection,
"""Fetch dataset details by identifier"""
with conn.cursor(cursorclass=DictCursor) as cursor:
cursor.execute(
- "SELECT s.SpeciesId, pf.* FROM Species AS s "
- "INNER JOIN InbredSet AS iset ON s.Id=iset.SpeciesId "
- "INNER JOIN PublishFreeze AS pf ON iset.Id=pf.InbredSetId "
- "WHERE s.Id=%s AND iset.Id=%s AND pf.Id=%s",
+ "SELECT Species.SpeciesId, PublishFreeze.* FROM Species "
+ "INNER JOIN InbredSet ON Species.Id=InbredSet.SpeciesId "
+ "INNER JOIN PublishFreeze ON InbredSet.Id=PublishFreeze.InbredSetId "
+ "WHERE Species.Id=%s AND InbredSet.Id=%s AND PublishFreeze.Id=%s",
(species_id, population_id, dataset_id))
return dict(cursor.fetchone())
@@ -295,13 +307,21 @@ def create_new_phenotypes(conn: mdb.Connection,
"""Add entirely new phenotypes to the database."""
_phenos = tuple()
with conn.cursor(cursorclass=DictCursor) as cursor:
- for batch in take(phenotypes, 1000):
+ while True:
+ batch = take(phenotypes, 1000)
+ if len(batch) == 0:
+ break
+
cursor.executemany(
("INSERT INTO "
- "Phenotypes(Pre_publication_description, Original_description, Units, Authorized_Users) "
- "VALUES (%(id)s, %(description)s, %(units)s, 'robwilliams') "
- "RETURNING *"),
+ "Phenotype(Pre_publication_description, Original_description, Units, Authorized_Users) "
+ "VALUES (%(id)s, %(description)s, %(units)s, 'robwilliams')"),
tuple(batch))
+ paramstr = ", ".join(["%s"] * len(batch))
+ cursor.execute(
+ "SELECT * FROM Phenotype WHERE Pre_publication_description IN "
+ f"({paramstr})",
+ tuple(item["id"] for item in batch))
_phenos = _phenos + tuple({
"phenotype_id": row["Id"],
"id": row["Pre_publication_description"],
@@ -316,23 +336,60 @@ def save_phenotypes_data(
conn: mdb.Connection,
table: str,
data: Iterable[dict]
-) -> tuple[dict, ...]:
+) -> int:
"""Save new phenotypes data into the database."""
- _table_details = {
- "PublishData": {"table": "PublishData", "valueCol": "value"},
- "PublishSE": {"table": "PublishSE", "valueCol": "error"},
- "NStrain": {"table": "PublishData", "valueCol": "count"}
- }[table]
- saved_data = tuple()
+ _table_details = __PHENO_DATA_TABLES__[table]
with conn.cursor(cursorclass=DictCursor) as cursor:
- for batch in take(data, 5000):
+ _count = 0
+ while True:
+ batch = take(data, 100000)
+ if len(batch) == 0:
+ logger.warning("Got an empty batch. This needs investigation.")
+ break
+
+ logger.debug("Saving batch of %s items.", len(batch))
cursor.executemany(
- (f"INSERT INTO {_table_details['table']}"
- f"(Id, StrainId, {_table_details['valueCol']}) "
- "VALUES "
- f"(%(data_id)s, %(sample_id)s, %({_table_details['valueCol']})s) "
- "RETURNING *"),
+ (f"INSERT INTO {_table_details['table']}"
+ f"({_table_details['DataIdCol']}, StrainId, {_table_details['valueCol']}) "
+ "VALUES "
+ f"(%(data_id)s, %(sample_id)s, %(value)s) "),
tuple(batch))
- _data = data + tuple(cursor.fetchall())
+ debug_query(cursor, logger)
+ _count = _count + len(batch)
- return saved_data
+
+ logger.debug("Saved a total of %s data rows", _count)
+ return _count
+
+
+def quick_save_phenotypes_data(
+ conn: mdb.Connection,
+ table: str,
+ dataitems: Iterable[dict],
+ tmpdir: Path
+) -> int:
+ """Save data items to the database, but using """
+ _table_details = __PHENO_DATA_TABLES__[table]
+ with (tempfile.NamedTemporaryFile(
+ prefix=f"{table}_data", mode="wt", dir=tmpdir) as tmpfile,
+ conn.cursor(cursorclass=DictCursor) as cursor):
+ _count = 0
+ console.debug("Write data rows to text file.")
+ for row in dataitems:
+ tmpfile.write(
+ f'{row["data_id"]}\t{row["sample_id"]}\t{row["value"]}\n')
+ _count = _count + 1
+ tmpfile.flush()
+
+ console.debug("Load text file into database (table: %s)",
+ _table_details["table"])
+ cursor.execute(
+ f"LOAD DATA LOCAL INFILE '{tmpfile.name}' "
+ f"INTO TABLE {_table_details['table']} "
+ "("
+ f"{_table_details['DataIdCol']}, "
+ "StrainId, "
+ f"{_table_details['valueCol']}"
+ ")")
+ debug_query(cursor, logger)
+ return _count