scripts/maintenance/QTL_Reaper_v6.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

####### To run this program do: python QTL_Reaper_v2.py 235
#Where 163 is the ProbeSetFreeze Id of the database that we want to calculate #the LRS
#!/usr/bin/python
import sys
import os
import reaper
import MySQLdb
import time

con = MySQLdb.Connect(db='db_webqtl', user='username', passwd='', host="localhost")
cursor = con.cursor()

genotypeDir = '/gnshare/gn/web/genotypes/'
genotype_1 = reaper.Dataset()

#####get all of the genotypes
cursor.execute('select Id, Name from InbredSet')
results = cursor.fetchall()
InbredSets = {}
for item in results:
	InbredSets[item[0]] = genotypeDir+str(item[1])+'.geno'

ProbeSetFreezeIds=sys.argv[1:]
if ProbeSetFreezeIds:
	#####convert the Ids to integer
	ProbeSetFreezeIds=list(map(int, ProbeSetFreezeIds))

else:
	#####get all of the dataset that need be updated
	cursor.execute('select distinct(ProbeSetFreezeId) from ProbeSetXRef where pValue is NULL order by ProbeSetFreezeId desc')
	results = cursor.fetchall()
	ProbeSetFreezeIds = []
	for item in results:
		ProbeSetFreezeIds.append(item[0])

####human dataset can NOT use this program calculate the LRS, ignore it
if ProbeSetFreezeIds.__contains__(215):
	ProbeSetFreezeIds.remove(215)

#output_file = open ('/home/xzhou/work/DatabaseTools/cal_LRS_Additive_result.txt', 'w')

#####update 
for ProbeSetFreezeId in ProbeSetFreezeIds:
	cursor.execute("""
		select InbredSetId 
		from ProbeFreeze, ProbeSetFreeze 
		where ProbeFreeze.Id=ProbeSetFreeze.ProbeFreezeId and ProbeSetFreeze.Id=%d
	"""%ProbeSetFreezeId);

	InbredSetId = cursor.fetchone()[0]
	if InbredSetId==3:
		InbredSetId=1
	#if InbredSetId==12:
	#	InbredSetId=2

	print((ProbeSetFreezeId, InbredSets[InbredSetId]))

	genotype_1.read(InbredSets[InbredSetId])
	locuses = []
	for geno in genotype_1:
		for locus in geno:
			locuses.append(locus.name)

	cursor.execute('select ProbeSetId, Locus, DataId from ProbeSetXRef where ProbeSetFreezeId=%s'%ProbeSetFreezeId)
	ProbeSetXRefInfos = cursor.fetchall()

	kj=0
	for aProbeSetXRef in ProbeSetXRefInfos:
		ProbeSetId, Locus, DataId = aProbeSetXRef
		prgy = genotype_1.prgy

		cursor.execute("select Strain.Name, ProbeSetData.value from Strain, ProbeSetData where Strain.Id = ProbeSetData.StrainId and ProbeSetData.Id = %d" % DataId)
		results = cursor.fetchall()
		if not results:
			continue
		_strains = []
		_values = []
		for item2 in results:
			strain, value = item2
			if strain in prgy:
				_strains.append(strain)
				_values.append(value)
		if not _strains or not _values:
			continue
		
		if len(_strains) < 8:
			continue
		qtlresults = genotype_1.regression(strains = _strains, trait = _values)
		_max = max(qtlresults)
		_locus = _max.locus.name
		_additive = _max.additive
		_max = _max.lrs

		#output_file.write('%s\t%s\t%s\t%s\t%s\n' % (ProbeSetFreezeId, ProbeSetId, _locus, _max, _additive))
		
		# _max(LRS) maybe is infinite sometimes, so define it as a very big number
		if _max == float('inf'):
			_max = 10000

		cursor.execute('update ProbeSetXRef set Locus=%s, LRS=%s, additive=%s where ProbeSetId=%s and ProbeSetFreezeId=%s', \
				(_locus, _max, _additive, ProbeSetId, ProbeSetFreezeId))

		kj += 1
		if kj%1000==0:
			print((ProbeSetFreezeId, InbredSets[InbredSetId], kj))


	print(ProbeSetFreezeIds)

cursor.close()
con.close()