aboutsummaryrefslogtreecommitdiff
path: root/scripts/insert_data.py
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2023-10-30 05:38:37 +0300
committerFrederick Muriuki Muriithi2023-10-30 05:38:37 +0300
commit8191ef600eb54d83b88c4d5cd2f46c6caf568e5e (patch)
treee16fdf5fe96ae3e91ea6f3e54d99423d73dd20cd /scripts/insert_data.py
parent2fcf052d9369a78e7f6cf99e73d504dcacdbbfe3 (diff)
downloadgn-uploader-8191ef600eb54d83b88c4d5cd2f46c6caf568e5e.tar.gz
Prevent overwrite. Avoid inserting same probeset multiple times.
* Avoid overwriting existing data by adding a random string to the name of the ProbeSet * Read ProbeSet names separately from the means for the strains, to avoid entering the same ProbeSet record over and over with varying names.
Diffstat (limited to 'scripts/insert_data.py')
-rw-r--r--scripts/insert_data.py29
1 files changed, 24 insertions, 5 deletions
diff --git a/scripts/insert_data.py b/scripts/insert_data.py
index 614ba4e..e1cc29b 100644
--- a/scripts/insert_data.py
+++ b/scripts/insert_data.py
@@ -1,5 +1,7 @@
"""Insert means/averages or standard-error data into the database."""
import sys
+import string
+import random
import argparse
from typing import Tuple
from functools import reduce
@@ -13,6 +15,11 @@ from quality_control.file_utils import open_file
from qc_app.db_utils import database_connection
from qc_app.check_connections import check_db, check_redis
+def random_string(count: int = 10) -> str:
+ """Generate a random, alphanumeric string."""
+ return "".join(random.choices(
+ string.digits + string.ascii_letters, k=count))
+
def translate_alias(heading):
"Translate strain aliases into canonical names"
translations = {"B6": "C57BL/6J", "D2": "DBA/2J"}
@@ -63,6 +70,12 @@ def read_datavalues(filepath, headings, strain_info):
"DataValue": float(row[sname])
}
+def read_probesets(filepath, headings):
+ """Read the ProbeSet names."""
+ for row in (dict(zip(headings, line))
+ for line in read_file_contents(filepath)):
+ yield {"Name": int(row["ProbeSetID"])}
+
def last_data_id(dbconn: mdb.Connection) -> int:
"Get the last id from the database"
with dbconn.cursor() as cursor:
@@ -148,8 +161,8 @@ def insert_means(
strains = strains_info(dbconn, headings[1:], speciesid)
check_strains(headings[1:], strains),
probeset_query = (
- "INSERT IGNORE INTO ProbeSet(ChipId, Name) "
- "VALUES (%(ChipId)s, %(ProbeSetId)s) ")
+ "INSERT INTO ProbeSet(ChipId, Name) "
+ "VALUES (%(ChipId)s, %(Name)s) ")
means_query = (
"INSERT INTO ProbeSetData "
"VALUES(%(ProbeSetDataId)s, %(StrainId)s, %(DataValue)s)")
@@ -162,17 +175,23 @@ def insert_means(
} for data_id, mean in enumerate(
read_datavalues(filepath, headings, strains),
start=(last_data_id(dbconn)+1)))
+ the_probesets = ({
+ **row,
+ "Name": f"{row['Name']}::RAND_{random_string()}",
+ "ChipId": platform_id
+ } for row in read_probesets(filepath, headings))
with dbconn.cursor(cursorclass=DictCursor) as cursor:
while True:
- means = tuple(take(the_means, 1000))
+ means = tuple(take(the_means, 10000))
+ probeset_params = tuple(take(the_probesets, 10000))
if not bool(means):
break
- print(__format_query__(probeset_query, means))
+ print(__format_query__(probeset_query, probeset_params))
print()
print(__format_query__(means_query, means))
print()
print(__format_query__(xref_query, means))
- cursor.executemany(probeset_query, means)
+ cursor.executemany(probeset_query, probeset_params)
cursor.executemany(means_query, means)
cursor.executemany(xref_query, __xref_params__(dbconn, means))
return 0