scripts/insert_expression_data.py - genenetwork2

# !/usr/bin/python3
"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value"""

########################################################################
# Last Updated 3/11/2022 by Zach
########################################################################
import csv
import string
import sys
import MySQLdb
import getpass
import time

########################################################################

def translate_alias(str):
    if str == "B6":
        return "C57BL/6J"
    elif str == "D2":
        return "DBA/2J"
    else:
        return str


########################################################################
#
#  Indicate Data Start Position, ProbeFreezeId, gene_chip_id, DataFile
#
########################################################################

data_start = 1

gene_chip_id = int(input("Enter GeneChipId:"))
probeset_freeze_id = int(input("Enter ProbeSetFreezeId:"))
input_file_name = input("Enter file name with suffix:")

try:
    passwd = getpass.getpass('Please enter mysql password here : ')
    conn = MySQLdb.Connect(db='db_webqtl', host='localhost', user='webqtlout', passwd=passwd)

    db = conn.cursor()

    print
    "You have successfully connected to mysql.\n"
except:
    print
    "You entered incorrect password.\n"
    sys.exit(0)

time0 = time.time()

#########################################################################
#
#  Check if each line have same number of members
#  generate the gene list of expression data here
#
#########################################################################
print
'Checking if each line have same number of members'

gene_list = []
strain_list = []
trait_data = []

with open(input_file_name, "r") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="\t")
    
    kj = 0
    for line in reader:
        trait_data.append(line)

        # Get the strain list; only need to get it once
        if kj == 0:
            strain_list = [item for item in line.keys() if item != "ProbeSetID"]
            print("STRAIN LIST:", strain_list)

        gene_list.append(line['ProbeSetID'])

        if kj % 100000 == 0:
            print(f"checked {kj} lines")
        kj += 1

gene_list.sort()

print(f"used {time.time() - time0} seconds")
#########################################################################
#
#  Check if each strain exist in database
#  generate the string id list of expression data here
#
#########################################################################
print('Checking if each strain exists in database')

strain_list = map(translate_alias, strain_list)

strain_ids = {}
for item in strain_list:
    try:
        db.execute(f'select Id from Strain where Name = "{item}" AND SpeciesId=1')
        strain_ids[item] = db.fetchone()[0]
    except:
        print(f"{item} does not exist, check the if the strain name is correct")
        sys.exit(0)

print(f"Used {time.time() - time0} seconds")

########################################################################
#
# Check if each ProbeSet exist in database
#
########################################################################
print("Check if each ProbeSet exists in database")


# Check whether ProbeSetIDs are Name or TargetId (if not Name, assume to be TargetId)
id_type = "TargetId"
db.execute(f"select Id from ProbeSet where Name='{gene_list[0]}' and ChipId={gene_chip_id}")
if len(db.fetchall()):
    id_type = "Name"

## Get Name/TargetId + ID list from database
db.execute(f"select {id_type}, Id from ProbeSet where ChipId={gene_chip_id} order by {id_type}")
records_from_db = db.fetchall()

record_names = [item[0] for item in records_from_db]
record_names.sort()

# Compare gene_list with gene_names
invalid_records = []
lowercase_records = [name2.lower() for name2 in record_names]
for name in gene_list:
    if name.lower() not in lowercase_records:
        invalid_records.append(name)

if len(invalid_records):
    with open("ProbeSetError.txt", "wb") as error_fh:
        for item in invalid_records:
            error_fh.write(f"{item} doesn't exist, cheeck if the ProbeSet name is correct \n")
    sys.exit(0)

print(f"used {time.time() - time0} seconds")
#########################################################################
#
# Insert data into database
#
#########################################################################
print("getting ProbeSet Name + Id")
record_ids = {}
for record in records_from_db:
    record_ids[record[0]] = record[1]

print(f"used {time.time() - time0} seconds")

print("inserting data")

# Get old max dataId
db.execute('select max(Id) from ProbeSetData')
latest_data_id = int(db.fetchone()[0])
print(f"Latest DataId = {latest_data_id}")

# Insert data
probeset_data_values = []
probeset_xref_values = []
for i, item in enumerate(trait_data):
    latest_data_id += 1


    probeset_id = item['ProbeSetID']
    item.pop('ProbeSetID')
    sample_data = item
    for strain in sample_data:
        probeset_data_values.append(f"({latest_data_id},{strain_ids[strain]},{float(sample_data[strain])})")

    probeset_xref_values.append(f"({probeset_freeze_id},{record_ids[probeset_id]},{latest_data_id})")

    # Insert into tables for every 100 traits
    if i % 100 == 0:
        data_query = f"INSERT INTO ProbeSetData VALUES {','.join(probeset_data_values)}"
        db.execute(data_query)

        xref_query = (
            "INSERT INTO ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) "
            f"VALUES {','.join(probeset_xref_values)}")
        db.execute(xref_query)

        probeset_data_values = []
        probeset_xref_values = []

        print(f"Inserted {i} lines")
        print(f"Used {time.time() - time0} seconds")

# Insert the remainder (since the loop above only inserts every 100 traits)
if len(probeset_data_values):
    data_query = f"INSERT INTO ProbeSetData VALUES {','.join(probeset_data_values)}"
    db.execute(data_query)

    xref_query = (
        "INSERT INTO ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) "
        f"VALUES {','.join(probeset_xref_values)}")
    db.execute(xref_query)

conn.commit()
conn.close()