1 files changed, 112 insertions, 5 deletions
diff --git a/uploader/phenotypes/models.py b/uploader/phenotypes/models.py
index 9ff89ae..c2aeebf 100644
--- a/uploader/phenotypes/models.py
+++ b/uploader/phenotypes/models.py
@@ -1,17 +1,30 @@
 """Database and utility functions for phenotypes."""
 import logging
-from typing import Optional
+import tempfile
+from pathlib import Path
 from functools import reduce
 from datetime import datetime
+from typing import Optional, Iterable
 
 import MySQLdb as mdb
 from MySQLdb.cursors import Cursor, DictCursor
 
+from functional_tools import take
 from gn_libs.mysqldb import debug_query
 
 logger = logging.getLogger(__name__)
 
 
+__PHENO_DATA_TABLES__ = {
+    "PublishData": {
+        "table": "PublishData", "valueCol": "value", "DataIdCol": "Id"},
+    "PublishSE": {
+        "table": "PublishSE", "valueCol": "error", "DataIdCol": "DataId"},
+    "NStrain": {
+        "table": "NStrain", "valueCol": "count", "DataIdCol": "DataId"}
+}
+
+
 def datasets_by_population(
         conn: mdb.Connection,
         species_id: int,
@@ -35,10 +48,10 @@ def dataset_by_id(conn: mdb.Connection,
     """Fetch dataset details by identifier"""
     with conn.cursor(cursorclass=DictCursor) as cursor:
         cursor.execute(
-            "SELECT s.SpeciesId, pf.* FROM Species AS s "
-            "INNER JOIN InbredSet AS iset ON s.Id=iset.SpeciesId "
-            "INNER JOIN PublishFreeze AS pf ON iset.Id=pf.InbredSetId "
-            "WHERE s.Id=%s AND iset.Id=%s AND pf.Id=%s",
+            "SELECT Species.SpeciesId, PublishFreeze.* FROM Species "
+            "INNER JOIN InbredSet ON Species.Id=InbredSet.SpeciesId "
+            "INNER JOIN PublishFreeze ON InbredSet.Id=PublishFreeze.InbredSetId "
+            "WHERE Species.Id=%s AND InbredSet.Id=%s AND PublishFreeze.Id=%s",
             (species_id, population_id, dataset_id))
         return dict(cursor.fetchone())
 
@@ -287,3 +300,97 @@ def phenotypes_data_by_ids(
         debug_query(cursor, logger)
         return tuple(
             reduce(__organise_by_phenotype__, cursor.fetchall(), {}).values())
+
+
+def create_new_phenotypes(conn: mdb.Connection,
+                          phenotypes: Iterable[dict]) -> tuple[dict, ...]:
+    """Add entirely new phenotypes to the database."""
+    _phenos = tuple()
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        while True:
+            batch = take(phenotypes, 1000)
+            if len(batch) == 0:
+                break
+
+            cursor.executemany(
+                ("INSERT INTO "
+                 "Phenotype(Pre_publication_description, Original_description, Units, Authorized_Users) "
+                 "VALUES (%s, %s, %s, 'robwilliams')"),
+                tuple((row["id"], row["description"], row["units"])
+                      for row in batch))
+            paramstr = ", ".join(["%s"] * len(batch))
+            cursor.execute(
+                "SELECT * FROM Phenotype WHERE Pre_publication_description IN "
+                f"({paramstr})",
+                tuple(item["id"] for item in batch))
+            _phenos = _phenos + tuple({
+                "phenotype_id": row["Id"],
+                "id": row["Pre_publication_description"],
+                "description": row["Original_description"],
+                "units": row["Units"]
+            } for row in cursor.fetchall())
+
+    return _phenos
+
+
+def save_phenotypes_data(
+        conn: mdb.Connection,
+        table: str,
+        data: Iterable[dict]
+) -> int:
+    """Save new phenotypes data into the database."""
+    _table_details = __PHENO_DATA_TABLES__[table]
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        _count = 0
+        while True:
+            batch = take(data, 100000)
+            if len(batch) == 0:
+                logger.warning("Got an empty batch. This needs investigation.")
+                break
+
+            logger.debug("Saving batch of %s items.", len(batch))
+            cursor.executemany(
+                (f"INSERT INTO {_table_details['table']}"
+                 f"({_table_details['DataIdCol']}, StrainId, {_table_details['valueCol']}) "
+                 "VALUES "
+                 f"(%(data_id)s, %(sample_id)s, %(value)s) "),
+                tuple(batch))
+            debug_query(cursor, logger)
+            _count = _count + len(batch)
+
+
+    logger.debug("Saved a total of %s data rows", _count)
+    return _count
+
+
+def quick_save_phenotypes_data(
+        conn: mdb.Connection,
+        table: str,
+        dataitems: Iterable[dict],
+        tmpdir: Path
+) -> int:
+    """Save data items to the database, but using """
+    _table_details = __PHENO_DATA_TABLES__[table]
+    with (tempfile.NamedTemporaryFile(
+            prefix=f"{table}_data", mode="wt", dir=tmpdir) as tmpfile,
+          conn.cursor(cursorclass=DictCursor) as cursor):
+        _count = 0
+        logger.debug("Write data rows to text file.")
+        for row in dataitems:
+            tmpfile.write(
+                f'{row["data_id"]}\t{row["sample_id"]}\t{row["value"]}\n')
+            _count = _count + 1
+        tmpfile.flush()
+
+        logger.debug("Load text file into database (table: %s)",
+                      _table_details["table"])
+        cursor.execute(
+            f"LOAD DATA LOCAL INFILE '{tmpfile.name}' "
+            f"INTO TABLE {_table_details['table']} "
+            "("
+            f"{_table_details['DataIdCol']}, "
+            "StrainId, "
+            f"{_table_details['valueCol']}"
+            ")")
+        debug_query(cursor, logger)
+        return _count