aboutsummaryrefslogtreecommitdiff
#!/usr/bin/python2
"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value"""

########################################################################
# Last Updated Sep 27, 2011 by Xiaodong
########################################################################
import string
import sys
import MySQLdb
import getpass
import time


########################################################################

def translateAlias(str):
    if str == "B6":
        return "C57BL/6J"
    elif str == "D2":
        return "DBA/2J"
    else:
        return str

########################################################################
#
#  Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile
#
########################################################################


dataStart = 1

GeneChipId = int(input("Enter GeneChipId:"))
ProbeSetFreezeId = int(input("Enter ProbeSetFreezeId:"))
input_file_name = input("Enter file name with suffix:")

fp = open("%s" % input_file_name, 'rb')

try:
    passwd = getpass.getpass('Please enter mysql password here : ')
    con = MySQLdb.Connect(db='db_webqtl', host='localhost',
                          user='username', passwd=passwd)

    db = con.cursor()
    print("You have successfully connected to mysql.\n")
except:
    print("You entered incorrect password.\n")
    sys.exit(0)

time0 = time.time()

#########################################################################
#
#  Check if each line have same number of members
#  generate the gene list of expression data here
#
#########################################################################
print('Checking if each line have same number of members')

GeneList = []
isCont = 1
header = fp.readline()
header = header.strip().split('\t')
header = [x.strip() for x in header]
nfield = len(header)
line = fp.readline()

kj = 0
while line:
    line2 = line.strip().split('\t')
    line2 = [x.strip() for x in line2]
    if len(line2) != nfield:
        print(("Error : " + line))
        isCont = 0

    GeneList.append(line2[0])
    line = fp.readline()

    kj += 1
    if kj % 100000 == 0:
        print(('checked ', kj, ' lines'))

GeneList = sorted(map(string.lower, GeneList))

if isCont == 0:
    sys.exit(0)


print(('used ', time.time()-time0, ' seconds'))
#########################################################################
#
#  Check if each strain exist in database
#  generate the string id list of expression data here
#
#########################################################################
print('Checking if each strain exist in database')

isCont = 1
fp.seek(0)
header = fp.readline()
header = header.strip().split('\t')
header = [x.strip() for x in header]
header = list(map(translateAlias, header))
header = header[dataStart:]
Ids = []
for item in header:
    try:
        db.execute('select Id from Strain where Name = "%s"' % item)
        Ids.append(db.fetchall()[0][0])
    except:
        print((item, 'does not exist, check the if the strain name is correct'))
        isCont = 0

if isCont == 0:
    sys.exit(0)


print(('used ', time.time()-time0, ' seconds'))
########################################################################
#
# Check if each ProbeSet exist in database
#
########################################################################
print('Check if each ProbeSet exist in database')

##---- find PID is name or target ----##
line = fp.readline()
line = fp.readline()
line2 = line.strip().split('\t')
line2 = [x.strip() for x in line2]
PId = line2[0]

db.execute('select Id from ProbeSet where Name="%s" and ChipId=%d' %
           (PId, GeneChipId))
results = db.fetchall()
IdStr = 'TargetId'
if len(results) > 0:
    IdStr = 'Name'


##---- get Name/TargetId list from database ----##
db.execute('select distinct(%s) from ProbeSet where ChipId=%d order by %s' % (
    IdStr, GeneChipId, IdStr))
results = db.fetchall()

Names = []
for item in results:
    Names.append(item[0])

print(Names)

Names = sorted(map(string.lower, Names))


##---- compare genelist with names ----##
x = y = 0
x1 = -1
GeneList2 = []
while x < len(GeneList) and y < len(Names):
    if GeneList[x] == Names[y]:
        x += 1
        y += 1
    elif GeneList[x] < Names[y]:
        if x != x1:
            GeneList2.append(GeneList[x])
            x1 = x
        x += 1
    elif GeneList[x] > Names[y]:
        y += 1

    if x % 100000 == 0:
        print(('check Name, checked %d lines' % x))

while x < len(GeneList):
    GeneList2.append(GeneList[x])
    x += 1

isCont = 1
ferror = open("ProbeSetError.txt", "wb")
for item in GeneList2:
    ferror.write(item + " doesn't exist \n")
    print((item, " doesn't exist, check if the ProbeSet name is correct"))
    isCont = 0

if isCont == 0:
    sys.exit(0)


print(('used ', time.time()-time0, ' seconds'))
#########################################################################
#
# Insert data into database
#
#########################################################################
print('getting ProbeSet/Id')


#---- get Name/Id map ----#
db.execute('select %s, Id from ProbeSet where ChipId=%d order by %s' %
           (IdStr, GeneChipId, IdStr))
results = db.fetchall()
NameIds = {}
for item in results:
    NameIds[item[0]] = item[1]
print(('used ', time.time()-time0, ' seconds'))


print('inserting data')

##---- get old max dataId ----##
db.execute('select max(Id) from ProbeSetData')
maxDataId = int(db.fetchall()[0][0])
bmax = maxDataId
print(("old_max = %d\n" % bmax))

##---- insert data ----##
fp.seek(0)
line = fp.readline()
line = fp.readline()
kj = 0

values1 = []
values2 = []
while line:
    line2 = line.strip().split('\t')
    line2 = [x.strip() for x in line2]
    PId = line2[0]
    recordId = NameIds[PId]

    maxDataId += 1
    datasorig = line2[dataStart:]

    ###### Data Table items ######
    i = 0
    for item in datasorig:
        try:
            values1.append('(%d,%d,%s)' % (maxDataId, Ids[i], float(item)))
        except:
            pass
        i += 1

    values2.append("(%d,%d,%d)" % (ProbeSetFreezeId, recordId, maxDataId))

    ##---- insert into table ----##
    kj += 1
    if kj % 100 == 0:
        cmd = ','.join(values1)
        cmd = 'insert into ProbeSetData values %s' % cmd
        db.execute(cmd)

        cmd = ','.join(values2)
        cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd
        db.execute(cmd)

        values1 = []
        values2 = []
        print(('Inserted ', kj, ' lines'))
        print(('used ', time.time()-time0, ' seconds'))

    line = fp.readline()


if len(values1) > 0:
    cmd = ','.join(values1)
    cmd = 'insert into ProbeSetData values %s' % cmd
    db.execute(cmd)

    cmd = ','.join(values2)
    cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd
    db.execute(cmd)

db.close()
con.close()