diff options
| author | acenteno | 2020-04-21 17:35:34 -0500 | 
|---|---|---|
| committer | GitHub | 2020-04-21 17:35:34 -0500 | 
| commit | 660589b9c2a507529e8e51ca6ce66ca97ad982c5 (patch) | |
| tree | 27f63957278581bc2fce2b88744bfe20c8a81558 /scripts/maintenance | |
| parent | d97fdc18359233f07c1a1c7b83fe7e88eb225043 (diff) | |
| parent | f2a3ae13231a7d270a5bb6911c248aa713f1ef91 (diff) | |
| download | genenetwork2-660589b9c2a507529e8e51ca6ce66ca97ad982c5.tar.gz | |
Merge pull request #1 from genenetwork/testing
Updating my testing branch
Diffstat (limited to 'scripts/maintenance')
| -rwxr-xr-x | scripts/maintenance/QTL_Reaper_v6.py | 108 | ||||
| -rw-r--r-- | scripts/maintenance/README.txt | 66 | ||||
| -rw-r--r-- | scripts/maintenance/Update_Case_Attributes_MySQL_tab.py | 27 | ||||
| -rwxr-xr-x | scripts/maintenance/delete_genotypes.py | 33 | ||||
| -rwxr-xr-x | scripts/maintenance/delete_phenotypes.py | 35 | ||||
| -rwxr-xr-x | scripts/maintenance/load_genotypes.py | 154 | ||||
| -rwxr-xr-x | scripts/maintenance/load_phenotypes.py | 171 | ||||
| -rwxr-xr-x | scripts/maintenance/readProbeSetMean_v7.py | 274 | ||||
| -rwxr-xr-x | scripts/maintenance/readProbeSetSE_v7.py | 254 | 
9 files changed, 1122 insertions, 0 deletions
| diff --git a/scripts/maintenance/QTL_Reaper_v6.py b/scripts/maintenance/QTL_Reaper_v6.py new file mode 100755 index 00000000..e50dbd40 --- /dev/null +++ b/scripts/maintenance/QTL_Reaper_v6.py @@ -0,0 +1,108 @@ +####### To run this program do: python QTL_Reaper_v2.py 235 +#Where 163 is the ProbeSetFreeze Id of the database that we want to calculate #the LRS +#!/usr/bin/python +import sys +import os +import reaper +import MySQLdb +import time + +con = MySQLdb.Connect(db='db_webqtl',user='username',passwd='', host="localhost") +cursor = con.cursor() + +genotypeDir = '/gnshare/gn/web/genotypes/' +genotype_1 = reaper.Dataset() + +#####get all of the genotypes +cursor.execute('select Id, Name from InbredSet') +results = cursor.fetchall() +InbredSets = {} +for item in results: + InbredSets[item[0]] = genotypeDir+str(item[1])+'.geno' + +ProbeSetFreezeIds=sys.argv[1:] +if ProbeSetFreezeIds: + #####convert the Ids to integer + ProbeSetFreezeIds=map(int, ProbeSetFreezeIds) + +else: + #####get all of the dataset that need be updated + cursor.execute('select distinct(ProbeSetFreezeId) from ProbeSetXRef where pValue is NULL order by ProbeSetFreezeId desc') + results = cursor.fetchall() + ProbeSetFreezeIds = [] + for item in results: + ProbeSetFreezeIds.append(item[0]) + +####human dataset can NOT use this program calculate the LRS, ignore it +if ProbeSetFreezeIds.__contains__(215): + ProbeSetFreezeIds.remove(215) + +#output_file = open ('/home/xzhou/work/DatabaseTools/cal_LRS_Additive_result.txt', 'w') + +#####update +for ProbeSetFreezeId in ProbeSetFreezeIds: + cursor.execute(""" + select InbredSetId + from ProbeFreeze, ProbeSetFreeze + where ProbeFreeze.Id=ProbeSetFreeze.ProbeFreezeId and ProbeSetFreeze.Id=%d + """%ProbeSetFreezeId); + + InbredSetId = cursor.fetchone()[0] + if InbredSetId==3: + InbredSetId=1 + #if InbredSetId==12: + # InbredSetId=2 + + print ProbeSetFreezeId, InbredSets[InbredSetId] + + genotype_1.read(InbredSets[InbredSetId]) + locuses = [] + for geno in genotype_1: + for locus in geno: + locuses.append(locus.name) + + cursor.execute('select ProbeSetId, Locus, DataId from ProbeSetXRef where ProbeSetFreezeId=%s'%ProbeSetFreezeId) + ProbeSetXRefInfos = cursor.fetchall() + + kj=0 + for aProbeSetXRef in ProbeSetXRefInfos: + ProbeSetId, Locus, DataId = aProbeSetXRef + prgy = genotype_1.prgy + + cursor.execute("select Strain.Name, ProbeSetData.value from Strain, ProbeSetData where Strain.Id = ProbeSetData.StrainId and ProbeSetData.Id = %d" % DataId) + results = cursor.fetchall() + if not results: + continue + _strains = [] + _values = [] + for item2 in results: + strain, value = item2 + if strain in prgy: + _strains.append(strain) + _values.append(value) + if not _strains or not _values: + continue + + if len(_strains) < 8: + continue + qtlresults = genotype_1.regression(strains = _strains, trait = _values) + _max = max(qtlresults) + _locus = _max.locus.name + _additive = _max.additive + _max = _max.lrs + + #output_file.write('%s\t%s\t%s\t%s\t%s\n' % (ProbeSetFreezeId, ProbeSetId, _locus, _max, _additive)) + + # _max(LRS) maybe is infinite sometimes, so define it as a very big number + if _max == float('inf'): + _max = 10000 + + cursor.execute('update ProbeSetXRef set Locus=%s, LRS=%s, additive=%s where ProbeSetId=%s and ProbeSetFreezeId=%s', \ + (_locus, _max, _additive, ProbeSetId, ProbeSetFreezeId)) + + kj += 1 + if kj%1000==0: + print ProbeSetFreezeId, InbredSets[InbredSetId],kj + + + print ProbeSetFreezeIds diff --git a/scripts/maintenance/README.txt b/scripts/maintenance/README.txt new file mode 100644 index 00000000..8604a5c9 --- /dev/null +++ b/scripts/maintenance/README.txt @@ -0,0 +1,66 @@ +Theses are Python2 scripts used for uploading data into the MySQL +database (current as per April 2018) + + +Last updated by A.Centeno 4-2-18 +========================== +load_genotypes.py +========================== +Mainly used to enter genotype batch records +Run load_genotypes.py as: +python delete_genotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Load_Genotypes/config.ini + +========================== +delete_genotypes.py +========================== +Mainly used to delete genotype batch records +Run delete_genotypes.py as: +python delete_genotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Delete_Genotypes/config.ini + +========================== +load_phenotypes.py +========================== +Mainly used to enter phenotype trait batch records +Run load_phenotypes.py as: +python load_phenotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Load_Phenotypes/config.ini + +========================== +delete_phenotypes.py +========================== +Mainly used to delete phenotype trait full records +Run delete_phenotypes.py as: +python delete_phenotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Delete_Phenotypes/config.ini + +========================== +QTL_Reaper_v6.py +========================== +Mainly used to perform QTL reaper and obtain Max LRS values. +Run QTL_Reaper_v6.py as: +python QTL_Reaper_v6.py (and GN accession number here) + +========================== +Update_Case_Attributes_MySQL_tab.py +========================== +Mainly used to enter Case attributes. +Run Update_Case_Attributes_MySQL_tab.py as: +python Update_Case_Attributes_MySQL_tab.py + +========================== +readProbeSetMean_v7.py +========================== +Mainly used to enter mean expression data. +Run readProbeSetMean_v7.py as: +python readProbeSetMean_v7.py + +========================== +readProbeSetSE_v7.py +========================== +Mainly used to enter Standard Error values from expression data. +Run readProbeSetSE_v7.py as: +python readProbeSetSE_v7.py + +========================== +MYSQL command CALCULATE MEANS NEW Structure: +========================== +Mainly used to calculate mean from expression data values. +update ProbeSetXRef set mean = (select AVG(value) from ProbeSetData where ProbeSetData.Id = ProbeSetXRef.DataId) where ProbeSetXRef.ProbeSetFreezeId = 811; diff --git a/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py b/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py new file mode 100644 index 00000000..0f8602c9 --- /dev/null +++ b/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py @@ -0,0 +1,27 @@ +#!/usr/bin/python2 +######################################################################## +# Last Updated Apr 11, 2016 By Arthur and Zach +######################################################################## +import string +import sys +import MySQLdb +import getpass +import time +import csv +######################################################################## + +mydb = MySQLdb.connect(host='localhost', + user='username', + passwd='', + db='db_webqtl') +cursor = mydb.cursor() + +csv_data = csv.reader(file('GN711_pvalues.txt'), delimiter ="\t") +for row in csv_data: + + cursor.execute("""UPDATE ProbeSetXRef SET pValue = %s WHERE ProbeSetFreezeId = %s AND ProbeSetId = %s """, + (row)) +#close the connection to the database. +mydb.commit() +cursor.close() +print "Done" \ No newline at end of file diff --git a/scripts/maintenance/delete_genotypes.py b/scripts/maintenance/delete_genotypes.py new file mode 100755 index 00000000..fa693f0f --- /dev/null +++ b/scripts/maintenance/delete_genotypes.py @@ -0,0 +1,33 @@ +import sys +import csv + +import utilities +import datastructure +import genotypes + +def main(argv): + # config + config = utilities.get_config(argv[1]) + print "config:" + for item in config.items('config'): + print "\t%s" % (str(item)) + # var + print "variable:" + inbredsetid = config.get('config', 'inbredsetid') + print "\tinbredsetid: %s" % inbredsetid + # datafile + datafile = open(config.get('config', 'datafile'), 'r') + datafile = csv.reader(datafile, delimiter='\t', quotechar='"') + datafile.next() + delrowcount = 0 + for row in datafile: + if len(row) == 0: + continue + genoname = row[0] + delrowcount += genotypes.delete(genoname, inbredsetid) + print "deleted %d genotypes" % (delrowcount) + +if __name__ == "__main__": + print "command line arguments:\n\t%s" % sys.argv + main(sys.argv) + print "exit successfully" diff --git a/scripts/maintenance/delete_phenotypes.py b/scripts/maintenance/delete_phenotypes.py new file mode 100755 index 00000000..326c466e --- /dev/null +++ b/scripts/maintenance/delete_phenotypes.py @@ -0,0 +1,35 @@ +import sys +import csv + +import utilities +import datastructure +import phenotypes + +def main(argv): + # config + config = utilities.get_config(argv[1]) + print "config:" + for item in config.items('config'): + print "\t%s" % (str(item)) + # var + print "variable:" + inbredsetid = config.get('config', 'inbredsetid') + print "\tinbredsetid: %s" % inbredsetid + # datafile + datafile = open(config.get('config', 'datafile'), 'r') + datafile = csv.reader(datafile, delimiter='\t', quotechar='"') + delrowcount = 0 + for row in datafile: + if len(row) == 0: + continue + try: + publishxrefid = int(row[0]) + except: + continue + delrowcount += phenotypes.delete(publishxrefid=publishxrefid, inbredsetid=inbredsetid) + print "deleted %d phenotypes" % (delrowcount) + +if __name__ == "__main__": + print "command line arguments:\n\t%s" % sys.argv + main(sys.argv) + print "exit successfully" diff --git a/scripts/maintenance/load_genotypes.py b/scripts/maintenance/load_genotypes.py new file mode 100755 index 00000000..338483f4 --- /dev/null +++ b/scripts/maintenance/load_genotypes.py @@ -0,0 +1,154 @@ +import sys +import re + +import utilities +import datastructure + +def main(argv): + config = utilities.get_config(argv[1]) + print("config file:") + for item in config.items('config'): + print("\t%s" % str(item)) + parse_genofile(config, fetch_parameters(config)) + +def fetch_parameters(config): + config_dic = {} + config_dic['inbredsetid'] = config.get('config', 'inbredsetid') + config_dic["speciesid"] = datastructure.get_species(config_dic['inbredsetid'])[0] + config_dic['genofreezeid'] = datastructure.get_genofreeze_byinbredsetid(config_dic['inbredsetid'])[0] + config_dic['dataid'] = datastructure.get_nextdataid_genotype() + config_dic['genofile'] = config.get('config', 'genofile') + print("config dictionary:") + for k, v in config_dic.items(): + print("\t%s: %s" % (k, v)) + return config_dic + +def parse_genofile(config, config_dic): + genofile = open(config_dic['genofile'], 'r') + meta_dic = {} + for line in genofile: + line = line.strip() + if len(line) == 0: + continue + if line.startswith('#'): + continue + if line.startswith('@'): + line = line.strip('@') + items = line.split(';') + for item in items: + kv = re.split(':|=', item) + meta_dic[kv[0].strip()] = kv[1].strip() + continue + if line.lower().startswith("chr"): + # + print("geno file meta dictionary:") + for k, v in meta_dic.items(): + print("\t%s: %s" % (k, v)) + # + print("geno file head:\n\t%s" % line) + strainnames = line.split()[4:] + config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes") + continue + # geno file line, marker + marker_dic = parse_marker(line) + marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic) + if check_genoxref(config_dic, marker_dic): + continue + insert_genodata(config, config_dic, marker_dic) + insert_genoxref(config_dic, marker_dic) + config_dic['dataid'] += 1 + genofile.close() + +def parse_marker(line): + marker_dic = {} + cells = line.split() + marker_dic['chromosome'] = cells[0] + marker_dic['locus'] = cells[1] + marker_dic['cm'] = cells[2] + marker_dic['mb'] = cells[3] + marker_dic['values'] = cells[4:] + return marker_dic + +def check_or_insert_geno(config_dic, marker_dic): + cursor, con = utilities.get_cursor() + sql = """ + SELECT Geno.`Id` + FROM Geno + WHERE Geno.`SpeciesId`=%s + AND Geno.`Name` like %s + """ + cursor.execute(sql, (config_dic["speciesid"], marker_dic['locus'])) + result = cursor.fetchone() + if result: + genoid = result[0] + print("get geno record: %d" % genoid) + else: + sql = """ + INSERT INTO Geno + SET + Geno.`SpeciesId`=%s, + Geno.`Name`=%s, + Geno.`Marker_Name`=%s, + Geno.`Chr`=%s, + Geno.`Mb`=%s + """ + cursor.execute(sql, (config_dic['speciesid'], marker_dic['locus'], marker_dic['locus'], marker_dic['chromosome'], marker_dic['mb'])) + rowcount = cursor.rowcount + genoid = con.insert_id() + print("INSERT INTO Geno: %d record: %d" % (rowcount, genoid)) + return genoid + +def check_genoxref(config_dic, marker_dic): + cursor, con = utilities.get_cursor() + sql = """ + select GenoXRef.* + from GenoXRef + where GenoXRef.`GenoFreezeId`=%s + AND GenoXRef.`GenoId`=%s + """ + cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid'])) + rowcount = cursor.rowcount + return rowcount + +def insert_genodata(config, config_dic, marker_dic): + cursor, con = utilities.get_cursor() + for index, strain in enumerate(config_dic['strains']): + strainid = strain[0] + value = utilities.to_db_string(marker_dic['values'][index], None) + if not value: + continue + value = config.get('config', "genovalue_" + value) + try: + number = int(value) + except: + continue + if not number in [-1, 0, 1]: + continue + sql = """ + INSERT INTO GenoData + SET + GenoData.`Id`=%s, + GenoData.`StrainId`=%s, + GenoData.`value`=%s + """ + cursor.execute(sql, (config_dic['dataid'], strainid, number)) + +def insert_genoxref(config_dic, marker_dic): + cursor, con = utilities.get_cursor() + sql = """ + INSERT INTO GenoXRef + SET + GenoXRef.`GenoFreezeId`=%s, + GenoXRef.`GenoId`=%s, + GenoXRef.`DataId`=%s, + GenoXRef.`cM`=%s, + GenoXRef.`Used_for_mapping`=%s + """ + cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid'], config_dic['dataid'], marker_dic['cm'], 'N')) + rowcount = cursor.rowcount + print("INSERT INTO GenoXRef: %d record" % (rowcount)) + +if __name__ == "__main__": + print("command line arguments:\n\t%s" % sys.argv) + main(sys.argv) + print("exit successfully") diff --git a/scripts/maintenance/load_phenotypes.py b/scripts/maintenance/load_phenotypes.py new file mode 100755 index 00000000..c3c6570b --- /dev/null +++ b/scripts/maintenance/load_phenotypes.py @@ -0,0 +1,171 @@ +import sys +import csv + +import utilities +import datastructure + +def main(argv): + # config + config = utilities.get_config(argv[1]) + print "config:" + for item in config.items('config'): + print "\t%s" % (str(item)) + # var + inbredsetid = config.get('config', 'inbredsetid') + print "inbredsetid: %s" % inbredsetid + species = datastructure.get_species(inbredsetid) + speciesid = species[0] + print "speciesid: %s" % speciesid + dataid = datastructure.get_nextdataid_phenotype() + print "next data id: %s" % dataid + cursor, con = utilities.get_cursor() + # datafile + datafile = open(config.get('config', 'datafile'), 'r') + phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"') + phenotypedata_head = phenotypedata.next() + print "phenotypedata head:\n\t%s" % phenotypedata_head + strainnames = phenotypedata_head[1:] + strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes") + # metafile + metafile = open(config.get('config', 'metafile'), 'r') + phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"') + phenotypemeta_head = phenotypemeta.next() + print "phenotypemeta head:\n\t%s" % phenotypemeta_head + print + # load + for metarow in phenotypemeta: + # + datarow_value = phenotypedata.next() + datarow_se = phenotypedata.next() + datarow_n = phenotypedata.next() + # Phenotype + sql = """ + INSERT INTO Phenotype + SET + Phenotype.`Pre_publication_description`=%s, + Phenotype.`Post_publication_description`=%s, + Phenotype.`Original_description`=%s, + Phenotype.`Pre_publication_abbreviation`=%s, + Phenotype.`Post_publication_abbreviation`=%s, + Phenotype.`Lab_code`=%s, + Phenotype.`Submitter`=%s, + Phenotype.`Owner`=%s, + Phenotype.`Authorized_Users`=%s, + Phenotype.`Units`=%s + """ + cursor.execute(sql, ( + utilities.to_db_string(metarow[1], None), + utilities.to_db_string(metarow[2], None), + utilities.to_db_string(metarow[3], None), + utilities.to_db_string(metarow[4], None), + utilities.to_db_string(metarow[5], None), + utilities.to_db_string(metarow[6], None), + utilities.to_db_string(metarow[7], None), + utilities.to_db_string(metarow[8], None), + utilities.to_db_string(metarow[9], ""), + utilities.to_db_string(metarow[18], ""), + )) + rowcount = cursor.rowcount + phenotypeid = con.insert_id() + print "INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid) + # Publication + publicationid = None # reset + pubmed_id = utilities.to_db_string(metarow[0], None) + if pubmed_id: + sql = """ + SELECT Publication.`Id` + FROM Publication + WHERE Publication.`PubMed_ID`=%s + """ + cursor.execute(sql, (pubmed_id)) + re = cursor.fetchone() + if re: + publicationid = re[0] + print "get Publication record: %d" % publicationid + if not publicationid: + sql = """ + INSERT INTO Publication + SET + Publication.`PubMed_ID`=%s, + Publication.`Abstract`=%s, + Publication.`Authors`=%s, + Publication.`Title`=%s, + Publication.`Journal`=%s, + Publication.`Volume`=%s, + Publication.`Pages`=%s, + Publication.`Month`=%s, + Publication.`Year`=%s + """ + cursor.execute(sql, ( + utilities.to_db_string(metarow[0], None), + utilities.to_db_string(metarow[12], None), + utilities.to_db_string(metarow[10], ""), + utilities.to_db_string(metarow[11], None), + utilities.to_db_string(metarow[13], None), + utilities.to_db_string(metarow[14], None), + utilities.to_db_string(metarow[15], None), + utilities.to_db_string(metarow[16], None), + utilities.to_db_string(metarow[17], ""), + )) + rowcount = cursor.rowcount + publicationid = con.insert_id() + print "INSERT INTO Publication: %d record: %d" % (rowcount, publicationid) + # data + for index, strain in enumerate(strains): + # + strainid = strain[0] + value = utilities.to_db_float(datarow_value[index+1], None) + se = utilities.to_db_float(datarow_se[index+1], None) + n = utilities.to_db_int(datarow_n[index+1], None) + # + if value is not None: + sql = """ + INSERT INTO PublishData + SET + PublishData.`Id`=%s, + PublishData.`StrainId`=%s, + PublishData.`value`=%s + """ + cursor.execute(sql, (dataid, strainid, value)) + if se is not None: + sql = """ + INSERT INTO PublishSE + SET + PublishSE.`DataId`=%s, + PublishSE.`StrainId`=%s, + PublishSE.`error`=%s + """ + cursor.execute(sql, (dataid, strainid, se)) + if n is not None: + sql = """ + INSERT INTO NStrain + SET + NStrain.`DataId`=%s, + NStrain.`StrainId`=%s, + NStrain.`count`=%s + """ + cursor.execute(sql, (dataid, strainid, n)) + # PublishXRef + sql = """ + INSERT INTO PublishXRef + SET + PublishXRef.`InbredSetId`=%s, + PublishXRef.`PhenotypeId`=%s, + PublishXRef.`PublicationId`=%s, + PublishXRef.`DataId`=%s, + PublishXRef.`comments`=%s + """ + cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, "")) + rowcount = cursor.rowcount + publishxrefid = con.insert_id() + print "INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid) + # for loop next + dataid += 1 + print + # release + con.close() + +if __name__ == "__main__": + print "command line arguments:\n\t%s" % sys.argv + main(sys.argv) + print "exit successfully" diff --git a/scripts/maintenance/readProbeSetMean_v7.py b/scripts/maintenance/readProbeSetMean_v7.py new file mode 100755 index 00000000..e9c8f25c --- /dev/null +++ b/scripts/maintenance/readProbeSetMean_v7.py @@ -0,0 +1,274 @@ +#!/usr/bin/python2 +"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value""" + +######################################################################## +# Last Updated Sep 27, 2011 by Xiaodong +######################################################################## +import string +import sys +import MySQLdb +import getpass +import time +#import pdb + +#pdb.set_trace() + +######################################################################## + +def translateAlias(str): + if str == "B6": + return "C57BL/6J" + elif str == "D2": + return "DBA/2J" + else: + return str + +######################################################################## +# +# Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile +# +######################################################################## + +dataStart = 1 + +GeneChipId = int( raw_input("Enter GeneChipId:") ) +ProbeSetFreezeId = int( raw_input("Enter ProbeSetFreezeId:") ) +input_file_name = raw_input("Enter file name with suffix:") + +fp = open("%s" % input_file_name, 'rb') + +try: + passwd = getpass.getpass('Please enter mysql password here : ') + con = MySQLdb.Connect(db='db_webqtl',host='localhost', user='username',passwd=passwd) + + db = con.cursor() + print "You have successfully connected to mysql.\n" +except: + print "You entered incorrect password.\n" + sys.exit(0) + +time0 = time.time() + +######################################################################### +# +# Check if each line have same number of members +# generate the gene list of expression data here +# +######################################################################### +print 'Checking if each line have same number of members' + +GeneList = [] +isCont = 1 +header = fp.readline() +header = string.split(string.strip(header),'\t') +header = map(string.strip, header) +nfield = len(header) +line = fp.readline() + +kj=0 +while line: + line2 = string.split(string.strip(line),'\t') + line2 = map(string.strip, line2) + if len(line2) != nfield: + print "Error : " + line + isCont = 0 + + GeneList.append(line2[0]) + line = fp.readline() + + kj+=1 + if kj%100000 == 0: + print 'checked ',kj,' lines' + +GeneList = map(string.lower, GeneList) +GeneList.sort() + +if isCont==0: + sys.exit(0) + + +print 'used ',time.time()-time0,' seconds' +######################################################################### +# +# Check if each strain exist in database +# generate the string id list of expression data here +# +######################################################################### +print 'Checking if each strain exist in database' + +isCont = 1 +fp.seek(0) +header = fp.readline() +header = string.split(string.strip(header),'\t') +header = map(string.strip, header) +header = map(translateAlias, header) +header = header[dataStart:] +Ids = [] +for item in header: + try: + db.execute('select Id from Strain where Name = "%s"' % item) + Ids.append(db.fetchall()[0][0]) + except: + print item,'does not exist, check the if the strain name is correct' + isCont=0 + +if isCont==0: + sys.exit(0) + + +print 'used ',time.time()-time0,' seconds' +######################################################################## +# +# Check if each ProbeSet exist in database +# +######################################################################## +print 'Check if each ProbeSet exist in database' + +##---- find PID is name or target ----## +line = fp.readline() +line = fp.readline() +line2 = string.split(string.strip(line),'\t') +line2 = map(string.strip, line2) +PId = line2[0] + +db.execute('select Id from ProbeSet where Name="%s" and ChipId=%d' % (PId, GeneChipId) ) +results = db.fetchall() +IdStr = 'TargetId' +if len(results)>0: + IdStr = 'Name' + + +##---- get Name/TargetId list from database ----## +db.execute('select distinct(%s) from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr)) +results = db.fetchall() + +Names = [] +for item in results: + Names.append(item[0]) + +print Names + +Names = map(string.lower, Names) + +Names.sort() # -- Fixed the lower case problem of ProbeSets affx-mur_b2_at doesn't exist --# + + +##---- compare genelist with names ----## +x=y=0 +x1=-1 +GeneList2=[] +while x<len(GeneList) and y<len(Names): + if GeneList[x]==Names[y]: + x += 1 + y += 1 + elif GeneList[x]<Names[y]: + if x!=x1: + GeneList2.append(GeneList[x]) + x1 = x + x += 1 + elif GeneList[x]>Names[y]: + y += 1 + + if x%100000==0: + print 'check Name, checked %d lines'%x + +while x<len(GeneList): + GeneList2.append(GeneList[x]) + x += 1 + +isCont=1 +ferror = open("ProbeSetError.txt", "wb") +for item in GeneList2: + ferror.write(item + " doesn't exist \n") + print item, " doesn't exist, check if the ProbeSet name is correct" + isCont = 0 + +if isCont==0: + sys.exit(0) + + +print 'used ',time.time()-time0,' seconds' +######################################################################### +# +# Insert data into database +# +######################################################################### +print 'getting ProbeSet/Id' + + +#---- get Name/Id map ----# +db.execute('select %s, Id from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr)) +results = db.fetchall() +NameIds = {} +for item in results: + NameIds[item[0]] = item[1] +print 'used ',time.time()-time0,' seconds' + + +print 'inserting data' + +##---- get old max dataId ----## +db.execute('select max(Id) from ProbeSetData') +maxDataId = int(db.fetchall()[0][0]) +bmax = maxDataId +print "old_max = %d\n" % bmax + +##---- insert data ----## +fp.seek(0) +line = fp.readline() +line = fp.readline() +kj = 0 + +values1 = [] +values2 = [] +while line: + line2 = string.split(string.strip(line),'\t') + line2 = map(string.strip, line2) + PId = line2[0] + recordId = NameIds[PId] + + maxDataId += 1 + datasorig = line2[dataStart:] + + ###### Data Table items ###### + i=0 + for item in datasorig: + try: + values1.append('(%d,%d,%s)' % (maxDataId, Ids[i], float(item))) + except: + pass + i += 1 + + values2.append("(%d,%d,%d)" % (ProbeSetFreezeId, recordId, maxDataId)) + + + ##---- insert into table ----## + kj += 1 + if kj % 100 == 0: + cmd = ','.join(values1) + cmd = 'insert into ProbeSetData values %s' % cmd + db.execute(cmd) + + cmd = ','.join(values2) + cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd + db.execute(cmd) + + values1=[] + values2=[] + print 'Inserted ', kj,' lines' + print 'used ',time.time()-time0,' seconds' + + line = fp.readline() + + + +if len(values1)>0: + cmd = ','.join(values1) + cmd = 'insert into ProbeSetData values %s' % cmd + db.execute(cmd) + + cmd = ','.join(values2) + cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd + db.execute(cmd) + +con.close() diff --git a/scripts/maintenance/readProbeSetSE_v7.py b/scripts/maintenance/readProbeSetSE_v7.py new file mode 100755 index 00000000..fd6f0bb8 --- /dev/null +++ b/scripts/maintenance/readProbeSetSE_v7.py @@ -0,0 +1,254 @@ +#!/usr/bin/python2 +"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value""" +######################################################################## +# Last Updated Sep 27, 2011 by Xiaodong +# This version fix the bug that incorrectly exclude the first 2 probesetIDs +######################################################################## + +import string +import sys +import MySQLdb +import getpass +import time + + +def translateAlias(str): + if str == "B6": + return "C57BL/6J" + elif str == "D2": + return "DBA/2J" + else: + return str + +######################################################################## +# +# Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile +# +######################################################################## + +dataStart = 1 + +GeneChipId = int( raw_input("Enter GeneChipId:") ) +ProbeSetFreezeId = int( raw_input("Enter ProbeSetFreezeId:") ) +input_file_name = raw_input("Enter file name with suffix:") + +fp = open("%s" % input_file_name, 'rb') + + +try: + passwd = getpass.getpass('Please enter mysql password here : ') + con = MySQLdb.Connect(db='db_webqtl',host='localhost', user='username',passwd=passwd) + + db = con.cursor() + print "You have successfully connected to mysql.\n" +except: + print "You entered incorrect password.\n" + sys.exit(0) + +time0 = time.time() +######################################################################## +# +# Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile +# +######################################################################## + +#GeneChipId = 4 +#dataStart = 1 +#ProbeSetFreezeId = 359 #JAX Liver 6C Affy M430 2.0 (Jul11) MDP +#fp = open("GSE10493_AllSamples_6C_Z_AvgSE.txt", 'rb') + + +######################################################################### +# +# Check if each line have same number of members +# generate the gene list of expression data here +# +######################################################################### +print 'Checking if each line have same number of members' + +GeneList = [] +isCont = 1 +header = fp.readline() +header = string.split(string.strip(header),'\t') +header = map(string.strip, header) +nfield = len(header) +line = fp.readline() + +kj=0 +while line: + line2 = string.split(string.strip(line),'\t') + line2 = map(string.strip, line2) + if len(line2) != nfield: + print "Error : " + line + isCont = 0 + + GeneList.append(line2[0]) + line = fp.readline() + + kj+=1 + if kj%100000 == 0: + print 'checked ',kj,' lines' + +GeneList = map(string.lower, GeneList) +GeneList.sort() + +if isCont==0: + sys.exit(0) + + +print 'used ',time.time()-time0,' seconds' +######################################################################### +# +# Check if each strain exist in database +# generate the string id list of expression data here +# +######################################################################### +print 'Checking if each strain exist in database' + +isCont = 1 +fp.seek(0) +header = fp.readline() +header = string.split(string.strip(header),'\t') +header = map(string.strip, header) +header = map(translateAlias, header) +header = header[dataStart:] +Ids = [] +for item in header: + try: + db.execute('select Id from Strain where Name = "%s"' % item) + Ids.append(db.fetchall()[0][0]) + except: + print item,'does not exist, check the if the strain name is correct' + isCont=0 + +if isCont==0: + sys.exit(0) + + +print 'used ',time.time()-time0,' seconds' +######################################################################## +# +# Check if each ProbeSet exist in database +# +######################################################################## +print 'Check if each ProbeSet exist in database' + +##---- find PID is name or target ----## +line = fp.readline() +line = fp.readline() +line2 = string.split(string.strip(line),'\t') +line2 = map(string.strip, line2) +PId = line2[0] + +db.execute('select Id from ProbeSet where Name="%s" and ChipId=%d' % (PId, GeneChipId)) +results = db.fetchall() +IdStr = 'TargetId' +if len(results)>0: + IdStr = 'Name' + + +##---- get Name/TargetId list from database ----## +db.execute('select distinct(%s) from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr)) +results = db.fetchall() + +Names = [] +for item in results: + Names.append(item[0]) +Names = map(string.lower, Names) +Names.sort() # -- Fixed the lower case problem of ProbeSets affx-mur_b2_at doesn't exist --# + +##---- compare genelist with names ----## +x=y=0 +x1=-1 +GeneList2=[] +while x<len(GeneList) and y<len(Names): + if GeneList[x]==Names[y]: + x += 1 + y += 1 + elif GeneList[x]<Names[y]: + if x!=x1: + GeneList2.append(GeneList[x]) + x1 = x + x += 1 + elif GeneList[x]>Names[y]: + y += 1 + + if x%100000==0: + print 'check Name, checked %d lines'%x + +while x<len(GeneList): + GeneList2.append(GeneList[x]) + x += 1 + +isCont=1 +ferror = open("ProbeSetError.txt", "wb") +for item in GeneList2: + ferror.write(item + " doesn't exist \n") + print item, " doesn't exist" + isCont = 0 + +if isCont==0: + sys.exit(0) + + +print 'used ',time.time()-time0,' seconds' +############################# +#Insert new Data into SE +############################ +db.execute(""" + select ProbeSet.%s, ProbeSetXRef.DataId from ProbeSet, ProbeSetXRef + where ProbeSet.Id=ProbeSetXRef.ProbeSetId and ProbeSetXRef.ProbeSetFreezeId=%d""" + % (IdStr, ProbeSetFreezeId)) +results = db.fetchall() + +ProbeNameId = {} +for Name, Id in results: + ProbeNameId[Name] = Id + +ferror = open("ProbeError.txt", "wb") + +DataValues = [] + +fp.seek(0) #XZ add this line +line = fp.readline() #XZ add this line +line = fp.readline() + +kj = 0 +while line: + line2 = string.split(string.strip(line),'\t') + line2 = map(string.strip, line2) + + CellId = line2[0] + if not ProbeNameId.has_key(CellId): + ferror.write(CellId + " doesn't exist\n") + print CellId, " doesn't exist" + else: + DataId = ProbeNameId[CellId] + datasorig = line2[dataStart:] + + i = 0 + for item in datasorig: + if item != '': + value = '('+str(DataId)+','+str(Ids[i])+','+str(item)+')' + DataValues.append(value) + i += 1 + + kj += 1 + if kj % 100 == 0: + Dataitems = ','.join(DataValues) + cmd = 'insert ProbeSetSE values %s' % Dataitems + db.execute(cmd) + + DataValues = [] + print 'inserted ',kj,' lines' + print 'used ',time.time()-time0,' seconds' + line = fp.readline() + +if len(DataValues)>0: + DataValues = ','.join(DataValues) + cmd = 'insert ProbeSetSE values %s' % DataValues + db.execute(cmd) + +con.close() + + | 
