From fb57f05083b0512b7bb9f9e15b6cc6efaded5a1f Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 3 Apr 2018 07:50:28 +0000 Subject: @acenteno added data upload scripts into main repo --- scripts/maintenance/readProbeSetMean_v7.py | 274 +++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100755 scripts/maintenance/readProbeSetMean_v7.py (limited to 'scripts/maintenance/readProbeSetMean_v7.py') diff --git a/scripts/maintenance/readProbeSetMean_v7.py b/scripts/maintenance/readProbeSetMean_v7.py new file mode 100755 index 00000000..e9c8f25c --- /dev/null +++ b/scripts/maintenance/readProbeSetMean_v7.py @@ -0,0 +1,274 @@ +#!/usr/bin/python2 +"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value""" + +######################################################################## +# Last Updated Sep 27, 2011 by Xiaodong +######################################################################## +import string +import sys +import MySQLdb +import getpass +import time +#import pdb + +#pdb.set_trace() + +######################################################################## + +def translateAlias(str): + if str == "B6": + return "C57BL/6J" + elif str == "D2": + return "DBA/2J" + else: + return str + +######################################################################## +# +# Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile +# +######################################################################## + +dataStart = 1 + +GeneChipId = int( raw_input("Enter GeneChipId:") ) +ProbeSetFreezeId = int( raw_input("Enter ProbeSetFreezeId:") ) +input_file_name = raw_input("Enter file name with suffix:") + +fp = open("%s" % input_file_name, 'rb') + +try: + passwd = getpass.getpass('Please enter mysql password here : ') + con = MySQLdb.Connect(db='db_webqtl',host='localhost', user='username',passwd=passwd) + + db = con.cursor() + print "You have successfully connected to mysql.\n" +except: + print "You entered incorrect password.\n" + sys.exit(0) + +time0 = time.time() + +######################################################################### +# +# Check if each line have same number of members +# generate the gene list of expression data here +# +######################################################################### +print 'Checking if each line have same number of members' + +GeneList = [] +isCont = 1 +header = fp.readline() +header = string.split(string.strip(header),'\t') +header = map(string.strip, header) +nfield = len(header) +line = fp.readline() + +kj=0 +while line: + line2 = string.split(string.strip(line),'\t') + line2 = map(string.strip, line2) + if len(line2) != nfield: + print "Error : " + line + isCont = 0 + + GeneList.append(line2[0]) + line = fp.readline() + + kj+=1 + if kj%100000 == 0: + print 'checked ',kj,' lines' + +GeneList = map(string.lower, GeneList) +GeneList.sort() + +if isCont==0: + sys.exit(0) + + +print 'used ',time.time()-time0,' seconds' +######################################################################### +# +# Check if each strain exist in database +# generate the string id list of expression data here +# +######################################################################### +print 'Checking if each strain exist in database' + +isCont = 1 +fp.seek(0) +header = fp.readline() +header = string.split(string.strip(header),'\t') +header = map(string.strip, header) +header = map(translateAlias, header) +header = header[dataStart:] +Ids = [] +for item in header: + try: + db.execute('select Id from Strain where Name = "%s"' % item) + Ids.append(db.fetchall()[0][0]) + except: + print item,'does not exist, check the if the strain name is correct' + isCont=0 + +if isCont==0: + sys.exit(0) + + +print 'used ',time.time()-time0,' seconds' +######################################################################## +# +# Check if each ProbeSet exist in database +# +######################################################################## +print 'Check if each ProbeSet exist in database' + +##---- find PID is name or target ----## +line = fp.readline() +line = fp.readline() +line2 = string.split(string.strip(line),'\t') +line2 = map(string.strip, line2) +PId = line2[0] + +db.execute('select Id from ProbeSet where Name="%s" and ChipId=%d' % (PId, GeneChipId) ) +results = db.fetchall() +IdStr = 'TargetId' +if len(results)>0: + IdStr = 'Name' + + +##---- get Name/TargetId list from database ----## +db.execute('select distinct(%s) from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr)) +results = db.fetchall() + +Names = [] +for item in results: + Names.append(item[0]) + +print Names + +Names = map(string.lower, Names) + +Names.sort() # -- Fixed the lower case problem of ProbeSets affx-mur_b2_at doesn't exist --# + + +##---- compare genelist with names ----## +x=y=0 +x1=-1 +GeneList2=[] +while xNames[y]: + y += 1 + + if x%100000==0: + print 'check Name, checked %d lines'%x + +while x0: + cmd = ','.join(values1) + cmd = 'insert into ProbeSetData values %s' % cmd + db.execute(cmd) + + cmd = ','.join(values2) + cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd + db.execute(cmd) + +con.close() -- cgit v1.2.3