aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorzsloan2022-03-22 19:02:15 +0000
committerzsloan2022-03-22 19:02:15 +0000
commita49da43ba00245cf23a2b72c314127986f567f28 (patch)
treece64e83370c52add94927bc050febf5d242722db /scripts
parent68ac19153b128f60b660e11365e5fd4304c95300 (diff)
parent32cb57b82db328bc84753af9d25e9aaa1bd31152 (diff)
downloadgenenetwork2-a49da43ba00245cf23a2b72c314127986f567f28.tar.gz
Merge remote-tracking branch 'origin/testing' into feature/add_rqtl_pairscan
Diffstat (limited to 'scripts')
-rw-r--r--scripts/insert_expression_data.py203
-rwxr-xr-xscripts/maintenance/QTL_Reaper_v6.py3
-rw-r--r--scripts/maintenance/Update_Case_Attributes_MySQL_tab.py23
-rwxr-xr-xscripts/maintenance/readProbeSetMean_v7.py1
-rwxr-xr-xscripts/maintenance/readProbeSetSE_v7.py1
-rw-r--r--scripts/maintenance/utilities.py10
6 files changed, 217 insertions, 24 deletions
diff --git a/scripts/insert_expression_data.py b/scripts/insert_expression_data.py
new file mode 100644
index 00000000..3d93c9f4
--- /dev/null
+++ b/scripts/insert_expression_data.py
@@ -0,0 +1,203 @@
+# !/usr/bin/python3
+"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value"""
+
+########################################################################
+# Last Updated 3/11/2022 by Zach
+########################################################################
+import csv
+import string
+import sys
+import MySQLdb
+import getpass
+import time
+
+########################################################################
+
+def translate_alias(str):
+ if str == "B6":
+ return "C57BL/6J"
+ elif str == "D2":
+ return "DBA/2J"
+ else:
+ return str
+
+
+########################################################################
+#
+# Indicate Data Start Position, ProbeFreezeId, gene_chip_id, DataFile
+#
+########################################################################
+
+data_start = 1
+
+gene_chip_id = int(input("Enter GeneChipId:"))
+probeset_freeze_id = int(input("Enter ProbeSetFreezeId:"))
+input_file_name = input("Enter file name with suffix:")
+
+try:
+ passwd = getpass.getpass('Please enter mysql password here : ')
+ conn = MySQLdb.Connect(db='db_webqtl', host='localhost', user='webqtlout', passwd=passwd)
+
+ db = conn.cursor()
+
+ print
+ "You have successfully connected to mysql.\n"
+except:
+ print
+ "You entered incorrect password.\n"
+ sys.exit(0)
+
+time0 = time.time()
+
+#########################################################################
+#
+# Check if each line have same number of members
+# generate the gene list of expression data here
+#
+#########################################################################
+print
+'Checking if each line have same number of members'
+
+gene_list = []
+strain_list = []
+trait_data = []
+
+with open(input_file_name, "r") as csvfile:
+ reader = csv.DictReader(csvfile, delimiter="\t")
+
+ kj = 0
+ for line in reader:
+ trait_data.append(line)
+
+ # Get the strain list; only need to get it once
+ if kj == 0:
+ strain_list = [item for item in line.keys() if item != "ProbeSetID"]
+ print("STRAIN LIST:", strain_list)
+
+ gene_list.append(line['ProbeSetID'])
+
+ if kj % 100000 == 0:
+ print(f"checked {kj} lines")
+ kj += 1
+
+gene_list.sort()
+
+print(f"used {time.time() - time0} seconds")
+#########################################################################
+#
+# Check if each strain exist in database
+# generate the string id list of expression data here
+#
+#########################################################################
+print('Checking if each strain exists in database')
+
+strain_list = map(translate_alias, strain_list)
+
+strain_ids = {}
+for item in strain_list:
+ try:
+ db.execute(f'select Id from Strain where Name = "{item}" AND SpeciesId=1')
+ strain_ids[item] = db.fetchone()[0]
+ except:
+ print(f"{item} does not exist, check the if the strain name is correct")
+ sys.exit(0)
+
+print(f"Used {time.time() - time0} seconds")
+
+########################################################################
+#
+# Check if each ProbeSet exist in database
+#
+########################################################################
+print("Check if each ProbeSet exists in database")
+
+
+# Check whether ProbeSetIDs are Name or TargetId (if not Name, assume to be TargetId)
+id_type = "TargetId"
+db.execute(f"select Id from ProbeSet where Name='{gene_list[0]}' and ChipId={gene_chip_id}")
+if len(db.fetchall()):
+ id_type = "Name"
+
+## Get Name/TargetId + ID list from database
+db.execute(f"select {id_type}, Id from ProbeSet where ChipId={gene_chip_id} order by {id_type}")
+records_from_db = db.fetchall()
+
+record_names = [item[0] for item in records_from_db]
+record_names.sort()
+
+# Compare gene_list with gene_names
+invalid_records = []
+lowercase_records = [name2.lower() for name2 in record_names]
+for name in gene_list:
+ if name.lower() not in lowercase_records:
+ invalid_records.append(name)
+
+if len(invalid_records):
+ with open("ProbeSetError.txt", "wb") as error_fh:
+ for item in invalid_records:
+ error_fh.write(f"{item} doesn't exist, cheeck if the ProbeSet name is correct \n")
+ sys.exit(0)
+
+print(f"used {time.time() - time0} seconds")
+#########################################################################
+#
+# Insert data into database
+#
+#########################################################################
+print("getting ProbeSet Name + Id")
+record_ids = {}
+for record in records_from_db:
+ record_ids[record[0]] = record[1]
+
+print(f"used {time.time() - time0} seconds")
+
+print("inserting data")
+
+# Get old max dataId
+db.execute('select max(Id) from ProbeSetData')
+latest_data_id = int(db.fetchone()[0])
+print(f"Latest DataId = {latest_data_id}")
+
+# Insert data
+probeset_data_values = []
+probeset_xref_values = []
+for i, item in enumerate(trait_data):
+ latest_data_id += 1
+
+
+ probeset_id = item['ProbeSetID']
+ item.pop('ProbeSetID')
+ sample_data = item
+ for strain in sample_data:
+ probeset_data_values.append(f"({latest_data_id},{strain_ids[strain]},{float(sample_data[strain])})")
+
+ probeset_xref_values.append(f"({probeset_freeze_id},{record_ids[probeset_id]},{latest_data_id})")
+
+ # Insert into tables for every 100 traits
+ if i % 100 == 0:
+ data_query = f"INSERT INTO ProbeSetData VALUES {','.join(probeset_data_values)}"
+ db.execute(data_query)
+
+ xref_query = (
+ "INSERT INTO ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) "
+ f"VALUES {','.join(probeset_xref_values)}")
+ db.execute(xref_query)
+
+ probeset_data_values = []
+ probeset_xref_values = []
+
+ print(f"Inserted {i} lines")
+ print(f"Used {time.time() - time0} seconds")
+
+# Insert the remainder (since the loop above only inserts every 100 traits)
+if len(probeset_data_values):
+ data_query = f"INSERT INTO ProbeSetData VALUES {','.join(probeset_data_values)}"
+ db.execute(data_query)
+
+ xref_query = (
+ "INSERT INTO ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) "
+ f"VALUES {','.join(probeset_xref_values)}")
+ db.execute(xref_query)
+
+conn.commit()
+conn.close()
diff --git a/scripts/maintenance/QTL_Reaper_v6.py b/scripts/maintenance/QTL_Reaper_v6.py
index 35f2d1a1..20fd8e3b 100755
--- a/scripts/maintenance/QTL_Reaper_v6.py
+++ b/scripts/maintenance/QTL_Reaper_v6.py
@@ -106,3 +106,6 @@ for ProbeSetFreezeId in ProbeSetFreezeIds:
print(ProbeSetFreezeIds)
+
+cursor.close()
+con.close()
diff --git a/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py b/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py
index bf796df4..a3cd1c35 100644
--- a/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py
+++ b/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py
@@ -10,18 +10,13 @@ import time
import csv
########################################################################
-mydb = MySQLdb.connect(host='localhost',
- user='username',
- passwd='',
- db='db_webqtl')
-cursor = mydb.cursor()
+with MySQLdb.connect(
+ host='localhost', user='username', passwd='', db='db_webqtl') as mydb:
+ with mydb.cursor() as cursor:
-csv_data = csv.reader(file('GN711_pvalues.txt'), delimiter ="\t")
-for row in csv_data:
-
- cursor.execute("""UPDATE ProbeSetXRef SET pValue = %s WHERE ProbeSetFreezeId = %s AND ProbeSetId = %s """,
- (row))
-#close the connection to the database.
-mydb.commit()
-cursor.close()
-print("Done") \ No newline at end of file
+ csv_data = csv.reader(file('GN711_pvalues.txt'), delimiter ="\t")
+ for row in csv_data:
+ cursor.execute(
+ """UPDATE ProbeSetXRef SET pValue = %s WHERE ProbeSetFreezeId = %s AND ProbeSetId = %s """,
+ (row))
+print("Done")
diff --git a/scripts/maintenance/readProbeSetMean_v7.py b/scripts/maintenance/readProbeSetMean_v7.py
index 43f084f4..56adcdfd 100755
--- a/scripts/maintenance/readProbeSetMean_v7.py
+++ b/scripts/maintenance/readProbeSetMean_v7.py
@@ -269,4 +269,5 @@ if len(values1) > 0:
cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd
db.execute(cmd)
+db.close()
con.close()
diff --git a/scripts/maintenance/readProbeSetSE_v7.py b/scripts/maintenance/readProbeSetSE_v7.py
index 2cfe2e07..88a347bf 100755
--- a/scripts/maintenance/readProbeSetSE_v7.py
+++ b/scripts/maintenance/readProbeSetSE_v7.py
@@ -251,4 +251,5 @@ if len(DataValues) > 0:
cmd = 'insert ProbeSetSE values %s' % DataValues
db.execute(cmd)
+db.close()
con.close()
diff --git a/scripts/maintenance/utilities.py b/scripts/maintenance/utilities.py
index 886410c2..1fe14809 100644
--- a/scripts/maintenance/utilities.py
+++ b/scripts/maintenance/utilities.py
@@ -1,16 +1,6 @@
-import MySQLdb
import re
import configparser
-def get_cursor():
- host = 'tux.uthsc.edu'
- user = 'webqtlout'
- passwd = 'webqtlout'
- db = 'db_webqtl'
- con = MySQLdb.Connect(db=db, host=host, user=user, passwd=passwd)
- cursor = con.cursor()
- return cursor, con
-
def clearspaces(s, default=None):
if s:
s = re.sub('\s+', ' ', s)