scripts/insert_expression_data.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203

# !/usr/bin/python3
"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value"""

########################################################################
# Last Updated 3/11/2022 by Zach
########################################################################
import csv
import string
import sys
import MySQLdb
import getpass
import time

########################################################################

def translate_alias(str):
    if str == "B6":
        return "C57BL/6J"
    elif str == "D2":
        return "DBA/2J"
    else:
        return str


########################################################################
#
#  Indicate Data Start Position, ProbeFreezeId, gene_chip_id, DataFile
#
########################################################################

data_start = 1

gene_chip_id = int(input("Enter GeneChipId:"))
probeset_freeze_id = int(input("Enter ProbeSetFreezeId:"))
input_file_name = input("Enter file name with suffix:")

try:
    passwd = getpass.getpass('Please enter mysql password here : ')
    conn = MySQLdb.Connect(db='db_webqtl', host='localhost', user='webqtlout', passwd=passwd)

    db = conn.cursor()

    print
    "You have successfully connected to mysql.\n"
except:
    print
    "You entered incorrect password.\n"
    sys.exit(0)

time0 = time.time()

#########################################################################
#
#  Check if each line have same number of members
#  generate the gene list of expression data here
#
#########################################################################
print
'Checking if each line have same number of members'

gene_list = []
strain_list = []
trait_data = []

with open(input_file_name, "r") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="\t")
    
    kj = 0
    for line in reader:
        trait_data.append(line)

        # Get the strain list; only need to get it once
        if kj == 0:
            strain_list = [item for item in line.keys() if item != "ProbeSetID"]
            print("STRAIN LIST:", strain_list)

        gene_list.append(line['ProbeSetID'])

        if kj % 100000 == 0:
            print(f"checked {kj} lines")
        kj += 1

gene_list.sort()

print(f"used {time.time() - time0} seconds")
#########################################################################
#
#  Check if each strain exist in database
#  generate the string id list of expression data here
#
#########################################################################
print('Checking if each strain exists in database')

strain_list = map(translate_alias, strain_list)

strain_ids = {}
for item in strain_list:
    try:
        db.execute(f'select Id from Strain where Name = "{item}" AND SpeciesId=1')
        strain_ids[item] = db.fetchone()[0]
    except:
        print(f"{item} does not exist, check the if the strain name is correct")
        sys.exit(0)

print(f"Used {time.time() - time0} seconds")

########################################################################
#
# Check if each ProbeSet exist in database
#
########################################################################
print("Check if each ProbeSet exists in database")


# Check whether ProbeSetIDs are Name or TargetId (if not Name, assume to be TargetId)
id_type = "TargetId"
db.execute(f"select Id from ProbeSet where Name='{gene_list[0]}' and ChipId={gene_chip_id}")
if len(db.fetchall()):
    id_type = "Name"

## Get Name/TargetId + ID list from database
db.execute(f"select {id_type}, Id from ProbeSet where ChipId={gene_chip_id} order by {id_type}")
records_from_db = db.fetchall()

record_names = [item[0] for item in records_from_db]
record_names.sort()

# Compare gene_list with gene_names
invalid_records = []
lowercase_records = [name2.lower() for name2 in record_names]
for name in gene_list:
    if name.lower() not in lowercase_records:
        invalid_records.append(name)

if len(invalid_records):
    with open("ProbeSetError.txt", "wb") as error_fh:
        for item in invalid_records:
            error_fh.write(f"{item} doesn't exist, cheeck if the ProbeSet name is correct \n")
    sys.exit(0)

print(f"used {time.time() - time0} seconds")
#########################################################################
#
# Insert data into database
#
#########################################################################
print("getting ProbeSet Name + Id")
record_ids = {}
for record in records_from_db:
    record_ids[record[0]] = record[1]

print(f"used {time.time() - time0} seconds")

print("inserting data")

# Get old max dataId
db.execute('select max(Id) from ProbeSetData')
latest_data_id = int(db.fetchone()[0])
print(f"Latest DataId = {latest_data_id}")

# Insert data
probeset_data_values = []
probeset_xref_values = []
for i, item in enumerate(trait_data):
    latest_data_id += 1


    probeset_id = item['ProbeSetID']
    item.pop('ProbeSetID')
    sample_data = item
    for strain in sample_data:
        probeset_data_values.append(f"({latest_data_id},{strain_ids[strain]},{float(sample_data[strain])})")

    probeset_xref_values.append(f"({probeset_freeze_id},{record_ids[probeset_id]},{latest_data_id})")

    # Insert into tables for every 100 traits
    if i % 100 == 0:
        data_query = f"INSERT INTO ProbeSetData VALUES {','.join(probeset_data_values)}"
        db.execute(data_query)

        xref_query = (
            "INSERT INTO ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) "
            f"VALUES {','.join(probeset_xref_values)}")
        db.execute(xref_query)

        probeset_data_values = []
        probeset_xref_values = []

        print(f"Inserted {i} lines")
        print(f"Used {time.time() - time0} seconds")

# Insert the remainder (since the loop above only inserts every 100 traits)
if len(probeset_data_values):
    data_query = f"INSERT INTO ProbeSetData VALUES {','.join(probeset_data_values)}"
    db.execute(data_query)

    xref_query = (
        "INSERT INTO ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) "
        f"VALUES {','.join(probeset_xref_values)}")
    db.execute(xref_query)

conn.commit()
conn.close()