From 8191ef600eb54d83b88c4d5cd2f46c6caf568e5e Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 30 Oct 2023 05:38:37 +0300
Subject: Prevent overwrite. Avoid inserting same probeset multiple times.

* Avoid overwriting existing data by adding a random string to the
  name of the ProbeSet
* Read ProbeSet names separately from the means for the strains, to
  avoid entering the same ProbeSet record over and over with varying
  names.
---
 scripts/insert_data.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

(limited to 'scripts')

diff --git a/scripts/insert_data.py b/scripts/insert_data.py
index 614ba4e..e1cc29b 100644
--- a/scripts/insert_data.py
+++ b/scripts/insert_data.py
@@ -1,5 +1,7 @@
 """Insert means/averages or standard-error data into the database."""
 import sys
+import string
+import random
 import argparse
 from typing import Tuple
 from functools import reduce
@@ -13,6 +15,11 @@ from quality_control.file_utils import open_file
 from qc_app.db_utils import database_connection
 from qc_app.check_connections import check_db, check_redis
 
+def random_string(count: int = 10) -> str:
+    """Generate a random, alphanumeric string."""
+    return "".join(random.choices(
+        string.digits + string.ascii_letters, k=count))
+
 def translate_alias(heading):
     "Translate strain aliases into canonical names"
     translations = {"B6": "C57BL/6J", "D2": "DBA/2J"}
@@ -63,6 +70,12 @@ def read_datavalues(filepath, headings, strain_info):
                 "DataValue": float(row[sname])
             }
 
+def read_probesets(filepath, headings):
+    """Read the ProbeSet names."""
+    for row in (dict(zip(headings, line))
+                for line in read_file_contents(filepath)):
+        yield {"Name": int(row["ProbeSetID"])}
+
 def last_data_id(dbconn: mdb.Connection) -> int:
     "Get the last id from the database"
     with dbconn.cursor() as cursor:
@@ -148,8 +161,8 @@ def insert_means(
     strains = strains_info(dbconn, headings[1:], speciesid)
     check_strains(headings[1:], strains),
     probeset_query = (
-        "INSERT IGNORE INTO ProbeSet(ChipId, Name) "
-        "VALUES (%(ChipId)s, %(ProbeSetId)s) ")
+        "INSERT INTO ProbeSet(ChipId, Name) "
+        "VALUES (%(ChipId)s, %(Name)s) ")
     means_query = (
         "INSERT INTO ProbeSetData "
         "VALUES(%(ProbeSetDataId)s, %(StrainId)s, %(DataValue)s)")
@@ -162,17 +175,23 @@ def insert_means(
     } for data_id, mean in enumerate(
         read_datavalues(filepath, headings, strains),
         start=(last_data_id(dbconn)+1)))
+    the_probesets = ({
+        **row,
+        "Name": f"{row['Name']}::RAND_{random_string()}",
+        "ChipId": platform_id
+    } for row in read_probesets(filepath, headings))
     with dbconn.cursor(cursorclass=DictCursor) as cursor:
         while True:
-            means = tuple(take(the_means, 1000))
+            means = tuple(take(the_means, 10000))
+            probeset_params = tuple(take(the_probesets, 10000))
             if not bool(means):
                 break
-            print(__format_query__(probeset_query, means))
+            print(__format_query__(probeset_query, probeset_params))
             print()
             print(__format_query__(means_query, means))
             print()
             print(__format_query__(xref_query, means))
-            cursor.executemany(probeset_query, means)
+            cursor.executemany(probeset_query, probeset_params)
             cursor.executemany(means_query, means)
             cursor.executemany(xref_query, __xref_params__(dbconn, means))
     return 0
-- 
cgit 1.4.1