From 8191ef600eb54d83b88c4d5cd2f46c6caf568e5e Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 30 Oct 2023 05:38:37 +0300 Subject: Prevent overwrite. Avoid inserting same probeset multiple times. * Avoid overwriting existing data by adding a random string to the name of the ProbeSet * Read ProbeSet names separately from the means for the strains, to avoid entering the same ProbeSet record over and over with varying names. --- scripts/insert_data.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/scripts/insert_data.py b/scripts/insert_data.py index 614ba4e..e1cc29b 100644 --- a/scripts/insert_data.py +++ b/scripts/insert_data.py @@ -1,5 +1,7 @@ """Insert means/averages or standard-error data into the database.""" import sys +import string +import random import argparse from typing import Tuple from functools import reduce @@ -13,6 +15,11 @@ from quality_control.file_utils import open_file from qc_app.db_utils import database_connection from qc_app.check_connections import check_db, check_redis +def random_string(count: int = 10) -> str: + """Generate a random, alphanumeric string.""" + return "".join(random.choices( + string.digits + string.ascii_letters, k=count)) + def translate_alias(heading): "Translate strain aliases into canonical names" translations = {"B6": "C57BL/6J", "D2": "DBA/2J"} @@ -63,6 +70,12 @@ def read_datavalues(filepath, headings, strain_info): "DataValue": float(row[sname]) } +def read_probesets(filepath, headings): + """Read the ProbeSet names.""" + for row in (dict(zip(headings, line)) + for line in read_file_contents(filepath)): + yield {"Name": int(row["ProbeSetID"])} + def last_data_id(dbconn: mdb.Connection) -> int: "Get the last id from the database" with dbconn.cursor() as cursor: @@ -148,8 +161,8 @@ def insert_means( strains = strains_info(dbconn, headings[1:], speciesid) check_strains(headings[1:], strains), probeset_query = ( - "INSERT IGNORE INTO ProbeSet(ChipId, Name) " - "VALUES (%(ChipId)s, %(ProbeSetId)s) ") + "INSERT INTO ProbeSet(ChipId, Name) " + "VALUES (%(ChipId)s, %(Name)s) ") means_query = ( "INSERT INTO ProbeSetData " "VALUES(%(ProbeSetDataId)s, %(StrainId)s, %(DataValue)s)") @@ -162,17 +175,23 @@ def insert_means( } for data_id, mean in enumerate( read_datavalues(filepath, headings, strains), start=(last_data_id(dbconn)+1))) + the_probesets = ({ + **row, + "Name": f"{row['Name']}::RAND_{random_string()}", + "ChipId": platform_id + } for row in read_probesets(filepath, headings)) with dbconn.cursor(cursorclass=DictCursor) as cursor: while True: - means = tuple(take(the_means, 1000)) + means = tuple(take(the_means, 10000)) + probeset_params = tuple(take(the_probesets, 10000)) if not bool(means): break - print(__format_query__(probeset_query, means)) + print(__format_query__(probeset_query, probeset_params)) print() print(__format_query__(means_query, means)) print() print(__format_query__(xref_query, means)) - cursor.executemany(probeset_query, means) + cursor.executemany(probeset_query, probeset_params) cursor.executemany(means_query, means) cursor.executemany(xref_query, __xref_params__(dbconn, means)) return 0 -- cgit v1.2.3