aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/maintenance/QTL_Reaper_v6.py108
-rw-r--r--scripts/maintenance/README.txt66
-rw-r--r--scripts/maintenance/Update_Case_Attributes_MySQL_tab.py27
-rwxr-xr-xscripts/maintenance/delete_genotypes.py33
-rwxr-xr-xscripts/maintenance/delete_phenotypes.py35
-rwxr-xr-xscripts/maintenance/load_genotypes.py154
-rwxr-xr-xscripts/maintenance/load_phenotypes.py171
-rwxr-xr-xscripts/maintenance/readProbeSetMean_v7.py274
-rwxr-xr-xscripts/maintenance/readProbeSetSE_v7.py254
9 files changed, 1122 insertions, 0 deletions
diff --git a/scripts/maintenance/QTL_Reaper_v6.py b/scripts/maintenance/QTL_Reaper_v6.py
new file mode 100755
index 00000000..e50dbd40
--- /dev/null
+++ b/scripts/maintenance/QTL_Reaper_v6.py
@@ -0,0 +1,108 @@
+####### To run this program do: python QTL_Reaper_v2.py 235
+#Where 163 is the ProbeSetFreeze Id of the database that we want to calculate #the LRS
+#!/usr/bin/python
+import sys
+import os
+import reaper
+import MySQLdb
+import time
+
+con = MySQLdb.Connect(db='db_webqtl',user='username',passwd='', host="localhost")
+cursor = con.cursor()
+
+genotypeDir = '/gnshare/gn/web/genotypes/'
+genotype_1 = reaper.Dataset()
+
+#####get all of the genotypes
+cursor.execute('select Id, Name from InbredSet')
+results = cursor.fetchall()
+InbredSets = {}
+for item in results:
+ InbredSets[item[0]] = genotypeDir+str(item[1])+'.geno'
+
+ProbeSetFreezeIds=sys.argv[1:]
+if ProbeSetFreezeIds:
+ #####convert the Ids to integer
+ ProbeSetFreezeIds=map(int, ProbeSetFreezeIds)
+
+else:
+ #####get all of the dataset that need be updated
+ cursor.execute('select distinct(ProbeSetFreezeId) from ProbeSetXRef where pValue is NULL order by ProbeSetFreezeId desc')
+ results = cursor.fetchall()
+ ProbeSetFreezeIds = []
+ for item in results:
+ ProbeSetFreezeIds.append(item[0])
+
+####human dataset can NOT use this program calculate the LRS, ignore it
+if ProbeSetFreezeIds.__contains__(215):
+ ProbeSetFreezeIds.remove(215)
+
+#output_file = open ('/home/xzhou/work/DatabaseTools/cal_LRS_Additive_result.txt', 'w')
+
+#####update
+for ProbeSetFreezeId in ProbeSetFreezeIds:
+ cursor.execute("""
+ select InbredSetId
+ from ProbeFreeze, ProbeSetFreeze
+ where ProbeFreeze.Id=ProbeSetFreeze.ProbeFreezeId and ProbeSetFreeze.Id=%d
+ """%ProbeSetFreezeId);
+
+ InbredSetId = cursor.fetchone()[0]
+ if InbredSetId==3:
+ InbredSetId=1
+ #if InbredSetId==12:
+ # InbredSetId=2
+
+ print ProbeSetFreezeId, InbredSets[InbredSetId]
+
+ genotype_1.read(InbredSets[InbredSetId])
+ locuses = []
+ for geno in genotype_1:
+ for locus in geno:
+ locuses.append(locus.name)
+
+ cursor.execute('select ProbeSetId, Locus, DataId from ProbeSetXRef where ProbeSetFreezeId=%s'%ProbeSetFreezeId)
+ ProbeSetXRefInfos = cursor.fetchall()
+
+ kj=0
+ for aProbeSetXRef in ProbeSetXRefInfos:
+ ProbeSetId, Locus, DataId = aProbeSetXRef
+ prgy = genotype_1.prgy
+
+ cursor.execute("select Strain.Name, ProbeSetData.value from Strain, ProbeSetData where Strain.Id = ProbeSetData.StrainId and ProbeSetData.Id = %d" % DataId)
+ results = cursor.fetchall()
+ if not results:
+ continue
+ _strains = []
+ _values = []
+ for item2 in results:
+ strain, value = item2
+ if strain in prgy:
+ _strains.append(strain)
+ _values.append(value)
+ if not _strains or not _values:
+ continue
+
+ if len(_strains) < 8:
+ continue
+ qtlresults = genotype_1.regression(strains = _strains, trait = _values)
+ _max = max(qtlresults)
+ _locus = _max.locus.name
+ _additive = _max.additive
+ _max = _max.lrs
+
+ #output_file.write('%s\t%s\t%s\t%s\t%s\n' % (ProbeSetFreezeId, ProbeSetId, _locus, _max, _additive))
+
+ # _max(LRS) maybe is infinite sometimes, so define it as a very big number
+ if _max == float('inf'):
+ _max = 10000
+
+ cursor.execute('update ProbeSetXRef set Locus=%s, LRS=%s, additive=%s where ProbeSetId=%s and ProbeSetFreezeId=%s', \
+ (_locus, _max, _additive, ProbeSetId, ProbeSetFreezeId))
+
+ kj += 1
+ if kj%1000==0:
+ print ProbeSetFreezeId, InbredSets[InbredSetId],kj
+
+
+ print ProbeSetFreezeIds
diff --git a/scripts/maintenance/README.txt b/scripts/maintenance/README.txt
new file mode 100644
index 00000000..8604a5c9
--- /dev/null
+++ b/scripts/maintenance/README.txt
@@ -0,0 +1,66 @@
+Theses are Python2 scripts used for uploading data into the MySQL
+database (current as per April 2018)
+
+
+Last updated by A.Centeno 4-2-18
+==========================
+load_genotypes.py
+==========================
+Mainly used to enter genotype batch records
+Run load_genotypes.py as:
+python delete_genotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Load_Genotypes/config.ini
+
+==========================
+delete_genotypes.py
+==========================
+Mainly used to delete genotype batch records
+Run delete_genotypes.py as:
+python delete_genotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Delete_Genotypes/config.ini
+
+==========================
+load_phenotypes.py
+==========================
+Mainly used to enter phenotype trait batch records
+Run load_phenotypes.py as:
+python load_phenotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Load_Phenotypes/config.ini
+
+==========================
+delete_phenotypes.py
+==========================
+Mainly used to delete phenotype trait full records
+Run delete_phenotypes.py as:
+python delete_phenotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Delete_Phenotypes/config.ini
+
+==========================
+QTL_Reaper_v6.py
+==========================
+Mainly used to perform QTL reaper and obtain Max LRS values.
+Run QTL_Reaper_v6.py as:
+python QTL_Reaper_v6.py (and GN accession number here)
+
+==========================
+Update_Case_Attributes_MySQL_tab.py
+==========================
+Mainly used to enter Case attributes.
+Run Update_Case_Attributes_MySQL_tab.py as:
+python Update_Case_Attributes_MySQL_tab.py
+
+==========================
+readProbeSetMean_v7.py
+==========================
+Mainly used to enter mean expression data.
+Run readProbeSetMean_v7.py as:
+python readProbeSetMean_v7.py
+
+==========================
+readProbeSetSE_v7.py
+==========================
+Mainly used to enter Standard Error values from expression data.
+Run readProbeSetSE_v7.py as:
+python readProbeSetSE_v7.py
+
+==========================
+MYSQL command CALCULATE MEANS NEW Structure:
+==========================
+Mainly used to calculate mean from expression data values.
+update ProbeSetXRef set mean = (select AVG(value) from ProbeSetData where ProbeSetData.Id = ProbeSetXRef.DataId) where ProbeSetXRef.ProbeSetFreezeId = 811;
diff --git a/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py b/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py
new file mode 100644
index 00000000..0f8602c9
--- /dev/null
+++ b/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python2
+########################################################################
+# Last Updated Apr 11, 2016 By Arthur and Zach
+########################################################################
+import string
+import sys
+import MySQLdb
+import getpass
+import time
+import csv
+########################################################################
+
+mydb = MySQLdb.connect(host='localhost',
+ user='username',
+ passwd='',
+ db='db_webqtl')
+cursor = mydb.cursor()
+
+csv_data = csv.reader(file('GN711_pvalues.txt'), delimiter ="\t")
+for row in csv_data:
+
+ cursor.execute("""UPDATE ProbeSetXRef SET pValue = %s WHERE ProbeSetFreezeId = %s AND ProbeSetId = %s """,
+ (row))
+#close the connection to the database.
+mydb.commit()
+cursor.close()
+print "Done" \ No newline at end of file
diff --git a/scripts/maintenance/delete_genotypes.py b/scripts/maintenance/delete_genotypes.py
new file mode 100755
index 00000000..fa693f0f
--- /dev/null
+++ b/scripts/maintenance/delete_genotypes.py
@@ -0,0 +1,33 @@
+import sys
+import csv
+
+import utilities
+import datastructure
+import genotypes
+
+def main(argv):
+ # config
+ config = utilities.get_config(argv[1])
+ print "config:"
+ for item in config.items('config'):
+ print "\t%s" % (str(item))
+ # var
+ print "variable:"
+ inbredsetid = config.get('config', 'inbredsetid')
+ print "\tinbredsetid: %s" % inbredsetid
+ # datafile
+ datafile = open(config.get('config', 'datafile'), 'r')
+ datafile = csv.reader(datafile, delimiter='\t', quotechar='"')
+ datafile.next()
+ delrowcount = 0
+ for row in datafile:
+ if len(row) == 0:
+ continue
+ genoname = row[0]
+ delrowcount += genotypes.delete(genoname, inbredsetid)
+ print "deleted %d genotypes" % (delrowcount)
+
+if __name__ == "__main__":
+ print "command line arguments:\n\t%s" % sys.argv
+ main(sys.argv)
+ print "exit successfully"
diff --git a/scripts/maintenance/delete_phenotypes.py b/scripts/maintenance/delete_phenotypes.py
new file mode 100755
index 00000000..326c466e
--- /dev/null
+++ b/scripts/maintenance/delete_phenotypes.py
@@ -0,0 +1,35 @@
+import sys
+import csv
+
+import utilities
+import datastructure
+import phenotypes
+
+def main(argv):
+ # config
+ config = utilities.get_config(argv[1])
+ print "config:"
+ for item in config.items('config'):
+ print "\t%s" % (str(item))
+ # var
+ print "variable:"
+ inbredsetid = config.get('config', 'inbredsetid')
+ print "\tinbredsetid: %s" % inbredsetid
+ # datafile
+ datafile = open(config.get('config', 'datafile'), 'r')
+ datafile = csv.reader(datafile, delimiter='\t', quotechar='"')
+ delrowcount = 0
+ for row in datafile:
+ if len(row) == 0:
+ continue
+ try:
+ publishxrefid = int(row[0])
+ except:
+ continue
+ delrowcount += phenotypes.delete(publishxrefid=publishxrefid, inbredsetid=inbredsetid)
+ print "deleted %d phenotypes" % (delrowcount)
+
+if __name__ == "__main__":
+ print "command line arguments:\n\t%s" % sys.argv
+ main(sys.argv)
+ print "exit successfully"
diff --git a/scripts/maintenance/load_genotypes.py b/scripts/maintenance/load_genotypes.py
new file mode 100755
index 00000000..338483f4
--- /dev/null
+++ b/scripts/maintenance/load_genotypes.py
@@ -0,0 +1,154 @@
+import sys
+import re
+
+import utilities
+import datastructure
+
+def main(argv):
+ config = utilities.get_config(argv[1])
+ print("config file:")
+ for item in config.items('config'):
+ print("\t%s" % str(item))
+ parse_genofile(config, fetch_parameters(config))
+
+def fetch_parameters(config):
+ config_dic = {}
+ config_dic['inbredsetid'] = config.get('config', 'inbredsetid')
+ config_dic["speciesid"] = datastructure.get_species(config_dic['inbredsetid'])[0]
+ config_dic['genofreezeid'] = datastructure.get_genofreeze_byinbredsetid(config_dic['inbredsetid'])[0]
+ config_dic['dataid'] = datastructure.get_nextdataid_genotype()
+ config_dic['genofile'] = config.get('config', 'genofile')
+ print("config dictionary:")
+ for k, v in config_dic.items():
+ print("\t%s: %s" % (k, v))
+ return config_dic
+
+def parse_genofile(config, config_dic):
+ genofile = open(config_dic['genofile'], 'r')
+ meta_dic = {}
+ for line in genofile:
+ line = line.strip()
+ if len(line) == 0:
+ continue
+ if line.startswith('#'):
+ continue
+ if line.startswith('@'):
+ line = line.strip('@')
+ items = line.split(';')
+ for item in items:
+ kv = re.split(':|=', item)
+ meta_dic[kv[0].strip()] = kv[1].strip()
+ continue
+ if line.lower().startswith("chr"):
+ #
+ print("geno file meta dictionary:")
+ for k, v in meta_dic.items():
+ print("\t%s: %s" % (k, v))
+ #
+ print("geno file head:\n\t%s" % line)
+ strainnames = line.split()[4:]
+ config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes")
+ continue
+ # geno file line, marker
+ marker_dic = parse_marker(line)
+ marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic)
+ if check_genoxref(config_dic, marker_dic):
+ continue
+ insert_genodata(config, config_dic, marker_dic)
+ insert_genoxref(config_dic, marker_dic)
+ config_dic['dataid'] += 1
+ genofile.close()
+
+def parse_marker(line):
+ marker_dic = {}
+ cells = line.split()
+ marker_dic['chromosome'] = cells[0]
+ marker_dic['locus'] = cells[1]
+ marker_dic['cm'] = cells[2]
+ marker_dic['mb'] = cells[3]
+ marker_dic['values'] = cells[4:]
+ return marker_dic
+
+def check_or_insert_geno(config_dic, marker_dic):
+ cursor, con = utilities.get_cursor()
+ sql = """
+ SELECT Geno.`Id`
+ FROM Geno
+ WHERE Geno.`SpeciesId`=%s
+ AND Geno.`Name` like %s
+ """
+ cursor.execute(sql, (config_dic["speciesid"], marker_dic['locus']))
+ result = cursor.fetchone()
+ if result:
+ genoid = result[0]
+ print("get geno record: %d" % genoid)
+ else:
+ sql = """
+ INSERT INTO Geno
+ SET
+ Geno.`SpeciesId`=%s,
+ Geno.`Name`=%s,
+ Geno.`Marker_Name`=%s,
+ Geno.`Chr`=%s,
+ Geno.`Mb`=%s
+ """
+ cursor.execute(sql, (config_dic['speciesid'], marker_dic['locus'], marker_dic['locus'], marker_dic['chromosome'], marker_dic['mb']))
+ rowcount = cursor.rowcount
+ genoid = con.insert_id()
+ print("INSERT INTO Geno: %d record: %d" % (rowcount, genoid))
+ return genoid
+
+def check_genoxref(config_dic, marker_dic):
+ cursor, con = utilities.get_cursor()
+ sql = """
+ select GenoXRef.*
+ from GenoXRef
+ where GenoXRef.`GenoFreezeId`=%s
+ AND GenoXRef.`GenoId`=%s
+ """
+ cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid']))
+ rowcount = cursor.rowcount
+ return rowcount
+
+def insert_genodata(config, config_dic, marker_dic):
+ cursor, con = utilities.get_cursor()
+ for index, strain in enumerate(config_dic['strains']):
+ strainid = strain[0]
+ value = utilities.to_db_string(marker_dic['values'][index], None)
+ if not value:
+ continue
+ value = config.get('config', "genovalue_" + value)
+ try:
+ number = int(value)
+ except:
+ continue
+ if not number in [-1, 0, 1]:
+ continue
+ sql = """
+ INSERT INTO GenoData
+ SET
+ GenoData.`Id`=%s,
+ GenoData.`StrainId`=%s,
+ GenoData.`value`=%s
+ """
+ cursor.execute(sql, (config_dic['dataid'], strainid, number))
+
+def insert_genoxref(config_dic, marker_dic):
+ cursor, con = utilities.get_cursor()
+ sql = """
+ INSERT INTO GenoXRef
+ SET
+ GenoXRef.`GenoFreezeId`=%s,
+ GenoXRef.`GenoId`=%s,
+ GenoXRef.`DataId`=%s,
+ GenoXRef.`cM`=%s,
+ GenoXRef.`Used_for_mapping`=%s
+ """
+ cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid'], config_dic['dataid'], marker_dic['cm'], 'N'))
+ rowcount = cursor.rowcount
+ print("INSERT INTO GenoXRef: %d record" % (rowcount))
+
+if __name__ == "__main__":
+ print("command line arguments:\n\t%s" % sys.argv)
+ main(sys.argv)
+ print("exit successfully")
diff --git a/scripts/maintenance/load_phenotypes.py b/scripts/maintenance/load_phenotypes.py
new file mode 100755
index 00000000..c3c6570b
--- /dev/null
+++ b/scripts/maintenance/load_phenotypes.py
@@ -0,0 +1,171 @@
+import sys
+import csv
+
+import utilities
+import datastructure
+
+def main(argv):
+ # config
+ config = utilities.get_config(argv[1])
+ print "config:"
+ for item in config.items('config'):
+ print "\t%s" % (str(item))
+ # var
+ inbredsetid = config.get('config', 'inbredsetid')
+ print "inbredsetid: %s" % inbredsetid
+ species = datastructure.get_species(inbredsetid)
+ speciesid = species[0]
+ print "speciesid: %s" % speciesid
+ dataid = datastructure.get_nextdataid_phenotype()
+ print "next data id: %s" % dataid
+ cursor, con = utilities.get_cursor()
+ # datafile
+ datafile = open(config.get('config', 'datafile'), 'r')
+ phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"')
+ phenotypedata_head = phenotypedata.next()
+ print "phenotypedata head:\n\t%s" % phenotypedata_head
+ strainnames = phenotypedata_head[1:]
+ strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes")
+ # metafile
+ metafile = open(config.get('config', 'metafile'), 'r')
+ phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"')
+ phenotypemeta_head = phenotypemeta.next()
+ print "phenotypemeta head:\n\t%s" % phenotypemeta_head
+ print
+ # load
+ for metarow in phenotypemeta:
+ #
+ datarow_value = phenotypedata.next()
+ datarow_se = phenotypedata.next()
+ datarow_n = phenotypedata.next()
+ # Phenotype
+ sql = """
+ INSERT INTO Phenotype
+ SET
+ Phenotype.`Pre_publication_description`=%s,
+ Phenotype.`Post_publication_description`=%s,
+ Phenotype.`Original_description`=%s,
+ Phenotype.`Pre_publication_abbreviation`=%s,
+ Phenotype.`Post_publication_abbreviation`=%s,
+ Phenotype.`Lab_code`=%s,
+ Phenotype.`Submitter`=%s,
+ Phenotype.`Owner`=%s,
+ Phenotype.`Authorized_Users`=%s,
+ Phenotype.`Units`=%s
+ """
+ cursor.execute(sql, (
+ utilities.to_db_string(metarow[1], None),
+ utilities.to_db_string(metarow[2], None),
+ utilities.to_db_string(metarow[3], None),
+ utilities.to_db_string(metarow[4], None),
+ utilities.to_db_string(metarow[5], None),
+ utilities.to_db_string(metarow[6], None),
+ utilities.to_db_string(metarow[7], None),
+ utilities.to_db_string(metarow[8], None),
+ utilities.to_db_string(metarow[9], ""),
+ utilities.to_db_string(metarow[18], ""),
+ ))
+ rowcount = cursor.rowcount
+ phenotypeid = con.insert_id()
+ print "INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid)
+ # Publication
+ publicationid = None # reset
+ pubmed_id = utilities.to_db_string(metarow[0], None)
+ if pubmed_id:
+ sql = """
+ SELECT Publication.`Id`
+ FROM Publication
+ WHERE Publication.`PubMed_ID`=%s
+ """
+ cursor.execute(sql, (pubmed_id))
+ re = cursor.fetchone()
+ if re:
+ publicationid = re[0]
+ print "get Publication record: %d" % publicationid
+ if not publicationid:
+ sql = """
+ INSERT INTO Publication
+ SET
+ Publication.`PubMed_ID`=%s,
+ Publication.`Abstract`=%s,
+ Publication.`Authors`=%s,
+ Publication.`Title`=%s,
+ Publication.`Journal`=%s,
+ Publication.`Volume`=%s,
+ Publication.`Pages`=%s,
+ Publication.`Month`=%s,
+ Publication.`Year`=%s
+ """
+ cursor.execute(sql, (
+ utilities.to_db_string(metarow[0], None),
+ utilities.to_db_string(metarow[12], None),
+ utilities.to_db_string(metarow[10], ""),
+ utilities.to_db_string(metarow[11], None),
+ utilities.to_db_string(metarow[13], None),
+ utilities.to_db_string(metarow[14], None),
+ utilities.to_db_string(metarow[15], None),
+ utilities.to_db_string(metarow[16], None),
+ utilities.to_db_string(metarow[17], ""),
+ ))
+ rowcount = cursor.rowcount
+ publicationid = con.insert_id()
+ print "INSERT INTO Publication: %d record: %d" % (rowcount, publicationid)
+ # data
+ for index, strain in enumerate(strains):
+ #
+ strainid = strain[0]
+ value = utilities.to_db_float(datarow_value[index+1], None)
+ se = utilities.to_db_float(datarow_se[index+1], None)
+ n = utilities.to_db_int(datarow_n[index+1], None)
+ #
+ if value is not None:
+ sql = """
+ INSERT INTO PublishData
+ SET
+ PublishData.`Id`=%s,
+ PublishData.`StrainId`=%s,
+ PublishData.`value`=%s
+ """
+ cursor.execute(sql, (dataid, strainid, value))
+ if se is not None:
+ sql = """
+ INSERT INTO PublishSE
+ SET
+ PublishSE.`DataId`=%s,
+ PublishSE.`StrainId`=%s,
+ PublishSE.`error`=%s
+ """
+ cursor.execute(sql, (dataid, strainid, se))
+ if n is not None:
+ sql = """
+ INSERT INTO NStrain
+ SET
+ NStrain.`DataId`=%s,
+ NStrain.`StrainId`=%s,
+ NStrain.`count`=%s
+ """
+ cursor.execute(sql, (dataid, strainid, n))
+ # PublishXRef
+ sql = """
+ INSERT INTO PublishXRef
+ SET
+ PublishXRef.`InbredSetId`=%s,
+ PublishXRef.`PhenotypeId`=%s,
+ PublishXRef.`PublicationId`=%s,
+ PublishXRef.`DataId`=%s,
+ PublishXRef.`comments`=%s
+ """
+ cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, ""))
+ rowcount = cursor.rowcount
+ publishxrefid = con.insert_id()
+ print "INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid)
+ # for loop next
+ dataid += 1
+ print
+ # release
+ con.close()
+
+if __name__ == "__main__":
+ print "command line arguments:\n\t%s" % sys.argv
+ main(sys.argv)
+ print "exit successfully"
diff --git a/scripts/maintenance/readProbeSetMean_v7.py b/scripts/maintenance/readProbeSetMean_v7.py
new file mode 100755
index 00000000..e9c8f25c
--- /dev/null
+++ b/scripts/maintenance/readProbeSetMean_v7.py
@@ -0,0 +1,274 @@
+#!/usr/bin/python2
+"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value"""
+
+########################################################################
+# Last Updated Sep 27, 2011 by Xiaodong
+########################################################################
+import string
+import sys
+import MySQLdb
+import getpass
+import time
+#import pdb
+
+#pdb.set_trace()
+
+########################################################################
+
+def translateAlias(str):
+ if str == "B6":
+ return "C57BL/6J"
+ elif str == "D2":
+ return "DBA/2J"
+ else:
+ return str
+
+########################################################################
+#
+# Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile
+#
+########################################################################
+
+dataStart = 1
+
+GeneChipId = int( raw_input("Enter GeneChipId:") )
+ProbeSetFreezeId = int( raw_input("Enter ProbeSetFreezeId:") )
+input_file_name = raw_input("Enter file name with suffix:")
+
+fp = open("%s" % input_file_name, 'rb')
+
+try:
+ passwd = getpass.getpass('Please enter mysql password here : ')
+ con = MySQLdb.Connect(db='db_webqtl',host='localhost', user='username',passwd=passwd)
+
+ db = con.cursor()
+ print "You have successfully connected to mysql.\n"
+except:
+ print "You entered incorrect password.\n"
+ sys.exit(0)
+
+time0 = time.time()
+
+#########################################################################
+#
+# Check if each line have same number of members
+# generate the gene list of expression data here
+#
+#########################################################################
+print 'Checking if each line have same number of members'
+
+GeneList = []
+isCont = 1
+header = fp.readline()
+header = string.split(string.strip(header),'\t')
+header = map(string.strip, header)
+nfield = len(header)
+line = fp.readline()
+
+kj=0
+while line:
+ line2 = string.split(string.strip(line),'\t')
+ line2 = map(string.strip, line2)
+ if len(line2) != nfield:
+ print "Error : " + line
+ isCont = 0
+
+ GeneList.append(line2[0])
+ line = fp.readline()
+
+ kj+=1
+ if kj%100000 == 0:
+ print 'checked ',kj,' lines'
+
+GeneList = map(string.lower, GeneList)
+GeneList.sort()
+
+if isCont==0:
+ sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+#########################################################################
+#
+# Check if each strain exist in database
+# generate the string id list of expression data here
+#
+#########################################################################
+print 'Checking if each strain exist in database'
+
+isCont = 1
+fp.seek(0)
+header = fp.readline()
+header = string.split(string.strip(header),'\t')
+header = map(string.strip, header)
+header = map(translateAlias, header)
+header = header[dataStart:]
+Ids = []
+for item in header:
+ try:
+ db.execute('select Id from Strain where Name = "%s"' % item)
+ Ids.append(db.fetchall()[0][0])
+ except:
+ print item,'does not exist, check the if the strain name is correct'
+ isCont=0
+
+if isCont==0:
+ sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+########################################################################
+#
+# Check if each ProbeSet exist in database
+#
+########################################################################
+print 'Check if each ProbeSet exist in database'
+
+##---- find PID is name or target ----##
+line = fp.readline()
+line = fp.readline()
+line2 = string.split(string.strip(line),'\t')
+line2 = map(string.strip, line2)
+PId = line2[0]
+
+db.execute('select Id from ProbeSet where Name="%s" and ChipId=%d' % (PId, GeneChipId) )
+results = db.fetchall()
+IdStr = 'TargetId'
+if len(results)>0:
+ IdStr = 'Name'
+
+
+##---- get Name/TargetId list from database ----##
+db.execute('select distinct(%s) from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr))
+results = db.fetchall()
+
+Names = []
+for item in results:
+ Names.append(item[0])
+
+print Names
+
+Names = map(string.lower, Names)
+
+Names.sort() # -- Fixed the lower case problem of ProbeSets affx-mur_b2_at doesn't exist --#
+
+
+##---- compare genelist with names ----##
+x=y=0
+x1=-1
+GeneList2=[]
+while x<len(GeneList) and y<len(Names):
+ if GeneList[x]==Names[y]:
+ x += 1
+ y += 1
+ elif GeneList[x]<Names[y]:
+ if x!=x1:
+ GeneList2.append(GeneList[x])
+ x1 = x
+ x += 1
+ elif GeneList[x]>Names[y]:
+ y += 1
+
+ if x%100000==0:
+ print 'check Name, checked %d lines'%x
+
+while x<len(GeneList):
+ GeneList2.append(GeneList[x])
+ x += 1
+
+isCont=1
+ferror = open("ProbeSetError.txt", "wb")
+for item in GeneList2:
+ ferror.write(item + " doesn't exist \n")
+ print item, " doesn't exist, check if the ProbeSet name is correct"
+ isCont = 0
+
+if isCont==0:
+ sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+#########################################################################
+#
+# Insert data into database
+#
+#########################################################################
+print 'getting ProbeSet/Id'
+
+
+#---- get Name/Id map ----#
+db.execute('select %s, Id from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr))
+results = db.fetchall()
+NameIds = {}
+for item in results:
+ NameIds[item[0]] = item[1]
+print 'used ',time.time()-time0,' seconds'
+
+
+print 'inserting data'
+
+##---- get old max dataId ----##
+db.execute('select max(Id) from ProbeSetData')
+maxDataId = int(db.fetchall()[0][0])
+bmax = maxDataId
+print "old_max = %d\n" % bmax
+
+##---- insert data ----##
+fp.seek(0)
+line = fp.readline()
+line = fp.readline()
+kj = 0
+
+values1 = []
+values2 = []
+while line:
+ line2 = string.split(string.strip(line),'\t')
+ line2 = map(string.strip, line2)
+ PId = line2[0]
+ recordId = NameIds[PId]
+
+ maxDataId += 1
+ datasorig = line2[dataStart:]
+
+ ###### Data Table items ######
+ i=0
+ for item in datasorig:
+ try:
+ values1.append('(%d,%d,%s)' % (maxDataId, Ids[i], float(item)))
+ except:
+ pass
+ i += 1
+
+ values2.append("(%d,%d,%d)" % (ProbeSetFreezeId, recordId, maxDataId))
+
+
+ ##---- insert into table ----##
+ kj += 1
+ if kj % 100 == 0:
+ cmd = ','.join(values1)
+ cmd = 'insert into ProbeSetData values %s' % cmd
+ db.execute(cmd)
+
+ cmd = ','.join(values2)
+ cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd
+ db.execute(cmd)
+
+ values1=[]
+ values2=[]
+ print 'Inserted ', kj,' lines'
+ print 'used ',time.time()-time0,' seconds'
+
+ line = fp.readline()
+
+
+
+if len(values1)>0:
+ cmd = ','.join(values1)
+ cmd = 'insert into ProbeSetData values %s' % cmd
+ db.execute(cmd)
+
+ cmd = ','.join(values2)
+ cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd
+ db.execute(cmd)
+
+con.close()
diff --git a/scripts/maintenance/readProbeSetSE_v7.py b/scripts/maintenance/readProbeSetSE_v7.py
new file mode 100755
index 00000000..fd6f0bb8
--- /dev/null
+++ b/scripts/maintenance/readProbeSetSE_v7.py
@@ -0,0 +1,254 @@
+#!/usr/bin/python2
+"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value"""
+########################################################################
+# Last Updated Sep 27, 2011 by Xiaodong
+# This version fix the bug that incorrectly exclude the first 2 probesetIDs
+########################################################################
+
+import string
+import sys
+import MySQLdb
+import getpass
+import time
+
+
+def translateAlias(str):
+ if str == "B6":
+ return "C57BL/6J"
+ elif str == "D2":
+ return "DBA/2J"
+ else:
+ return str
+
+########################################################################
+#
+# Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile
+#
+########################################################################
+
+dataStart = 1
+
+GeneChipId = int( raw_input("Enter GeneChipId:") )
+ProbeSetFreezeId = int( raw_input("Enter ProbeSetFreezeId:") )
+input_file_name = raw_input("Enter file name with suffix:")
+
+fp = open("%s" % input_file_name, 'rb')
+
+
+try:
+ passwd = getpass.getpass('Please enter mysql password here : ')
+ con = MySQLdb.Connect(db='db_webqtl',host='localhost', user='username',passwd=passwd)
+
+ db = con.cursor()
+ print "You have successfully connected to mysql.\n"
+except:
+ print "You entered incorrect password.\n"
+ sys.exit(0)
+
+time0 = time.time()
+########################################################################
+#
+# Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile
+#
+########################################################################
+
+#GeneChipId = 4
+#dataStart = 1
+#ProbeSetFreezeId = 359 #JAX Liver 6C Affy M430 2.0 (Jul11) MDP
+#fp = open("GSE10493_AllSamples_6C_Z_AvgSE.txt", 'rb')
+
+
+#########################################################################
+#
+# Check if each line have same number of members
+# generate the gene list of expression data here
+#
+#########################################################################
+print 'Checking if each line have same number of members'
+
+GeneList = []
+isCont = 1
+header = fp.readline()
+header = string.split(string.strip(header),'\t')
+header = map(string.strip, header)
+nfield = len(header)
+line = fp.readline()
+
+kj=0
+while line:
+ line2 = string.split(string.strip(line),'\t')
+ line2 = map(string.strip, line2)
+ if len(line2) != nfield:
+ print "Error : " + line
+ isCont = 0
+
+ GeneList.append(line2[0])
+ line = fp.readline()
+
+ kj+=1
+ if kj%100000 == 0:
+ print 'checked ',kj,' lines'
+
+GeneList = map(string.lower, GeneList)
+GeneList.sort()
+
+if isCont==0:
+ sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+#########################################################################
+#
+# Check if each strain exist in database
+# generate the string id list of expression data here
+#
+#########################################################################
+print 'Checking if each strain exist in database'
+
+isCont = 1
+fp.seek(0)
+header = fp.readline()
+header = string.split(string.strip(header),'\t')
+header = map(string.strip, header)
+header = map(translateAlias, header)
+header = header[dataStart:]
+Ids = []
+for item in header:
+ try:
+ db.execute('select Id from Strain where Name = "%s"' % item)
+ Ids.append(db.fetchall()[0][0])
+ except:
+ print item,'does not exist, check the if the strain name is correct'
+ isCont=0
+
+if isCont==0:
+ sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+########################################################################
+#
+# Check if each ProbeSet exist in database
+#
+########################################################################
+print 'Check if each ProbeSet exist in database'
+
+##---- find PID is name or target ----##
+line = fp.readline()
+line = fp.readline()
+line2 = string.split(string.strip(line),'\t')
+line2 = map(string.strip, line2)
+PId = line2[0]
+
+db.execute('select Id from ProbeSet where Name="%s" and ChipId=%d' % (PId, GeneChipId))
+results = db.fetchall()
+IdStr = 'TargetId'
+if len(results)>0:
+ IdStr = 'Name'
+
+
+##---- get Name/TargetId list from database ----##
+db.execute('select distinct(%s) from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr))
+results = db.fetchall()
+
+Names = []
+for item in results:
+ Names.append(item[0])
+Names = map(string.lower, Names)
+Names.sort() # -- Fixed the lower case problem of ProbeSets affx-mur_b2_at doesn't exist --#
+
+##---- compare genelist with names ----##
+x=y=0
+x1=-1
+GeneList2=[]
+while x<len(GeneList) and y<len(Names):
+ if GeneList[x]==Names[y]:
+ x += 1
+ y += 1
+ elif GeneList[x]<Names[y]:
+ if x!=x1:
+ GeneList2.append(GeneList[x])
+ x1 = x
+ x += 1
+ elif GeneList[x]>Names[y]:
+ y += 1
+
+ if x%100000==0:
+ print 'check Name, checked %d lines'%x
+
+while x<len(GeneList):
+ GeneList2.append(GeneList[x])
+ x += 1
+
+isCont=1
+ferror = open("ProbeSetError.txt", "wb")
+for item in GeneList2:
+ ferror.write(item + " doesn't exist \n")
+ print item, " doesn't exist"
+ isCont = 0
+
+if isCont==0:
+ sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+#############################
+#Insert new Data into SE
+############################
+db.execute("""
+ select ProbeSet.%s, ProbeSetXRef.DataId from ProbeSet, ProbeSetXRef
+ where ProbeSet.Id=ProbeSetXRef.ProbeSetId and ProbeSetXRef.ProbeSetFreezeId=%d"""
+ % (IdStr, ProbeSetFreezeId))
+results = db.fetchall()
+
+ProbeNameId = {}
+for Name, Id in results:
+ ProbeNameId[Name] = Id
+
+ferror = open("ProbeError.txt", "wb")
+
+DataValues = []
+
+fp.seek(0) #XZ add this line
+line = fp.readline() #XZ add this line
+line = fp.readline()
+
+kj = 0
+while line:
+ line2 = string.split(string.strip(line),'\t')
+ line2 = map(string.strip, line2)
+
+ CellId = line2[0]
+ if not ProbeNameId.has_key(CellId):
+ ferror.write(CellId + " doesn't exist\n")
+ print CellId, " doesn't exist"
+ else:
+ DataId = ProbeNameId[CellId]
+ datasorig = line2[dataStart:]
+
+ i = 0
+ for item in datasorig:
+ if item != '':
+ value = '('+str(DataId)+','+str(Ids[i])+','+str(item)+')'
+ DataValues.append(value)
+ i += 1
+
+ kj += 1
+ if kj % 100 == 0:
+ Dataitems = ','.join(DataValues)
+ cmd = 'insert ProbeSetSE values %s' % Dataitems
+ db.execute(cmd)
+
+ DataValues = []
+ print 'inserted ',kj,' lines'
+ print 'used ',time.time()-time0,' seconds'
+ line = fp.readline()
+
+if len(DataValues)>0:
+ DataValues = ','.join(DataValues)
+ cmd = 'insert ProbeSetSE values %s' % DataValues
+ db.execute(cmd)
+
+con.close()
+
+