aboutsummaryrefslogtreecommitdiff
path: root/web/webqtl/correlation/correlationFunction.py
diff options
context:
space:
mode:
authorroot2012-05-08 18:39:56 -0500
committerroot2012-05-08 18:39:56 -0500
commitea46f42ee640928b92947bfb204c41a482d80937 (patch)
tree9b27a4eb852d12539b543c3efee9d2a47ef470f3 /web/webqtl/correlation/correlationFunction.py
parent056b5253fc3857b0444382aa39944f6344dc1ceb (diff)
downloadgenenetwork2-ea46f42ee640928b92947bfb204c41a482d80937.tar.gz
Add all the source codes into the github.
Diffstat (limited to 'web/webqtl/correlation/correlationFunction.py')
-rwxr-xr-xweb/webqtl/correlation/correlationFunction.py923
1 files changed, 923 insertions, 0 deletions
diff --git a/web/webqtl/correlation/correlationFunction.py b/web/webqtl/correlation/correlationFunction.py
new file mode 100755
index 00000000..cc19f54e
--- /dev/null
+++ b/web/webqtl/correlation/correlationFunction.py
@@ -0,0 +1,923 @@
+# Copyright (C) University of Tennessee Health Science Center, Memphis, TN.
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License
+# as published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU Affero General Public License for more details.
+#
+# This program is available from Source Forge: at GeneNetwork Project
+# (sourceforge.net/projects/genenetwork/).
+#
+# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010)
+# at rwilliams@uthsc.edu and xzhou15@uthsc.edu
+#
+#
+#
+# This module is used by GeneNetwork project (www.genenetwork.org)
+#
+# Created by GeneNetwork Core Team 2010/08/10
+#
+# Last updated by NL 2011/03/23
+
+
+import math
+import rpy2.robjects
+import pp
+import string
+
+from utility import webqtlUtil
+from base.webqtlTrait import webqtlTrait
+from dbFunction import webqtlDatabaseFunction
+
+
+
+#XZ: The input 'controls' is String. It contains the full name of control traits.
+#XZ: The input variable 'strainlst' is List. It contains the strain names of primary trait.
+#XZ: The returned tcstrains is the list of list [[],[]...]. So are tcvals and tcvars. The last returned parameter is list of numbers.
+#XZ, 03/29/2010: For each returned control trait, there is no None value in it.
+def controlStrains(controls, strainlst):
+
+ controls = controls.split(',')
+
+ cvals = {}
+ for oneTraitName in controls:
+ oneTrait = webqtlTrait(fullname=oneTraitName, cursor=webqtlDatabaseFunction.getCursor() )
+ oneTrait.retrieveData()
+ cvals[oneTraitName] = oneTrait.data
+
+ tcstrains = []
+ tcvals = []
+ tcvars = []
+
+ for oneTraitName in controls:
+ strains = []
+ vals = []
+ vars = []
+
+ for _strain in strainlst:
+ if cvals[oneTraitName].has_key(_strain):
+ _val = cvals[oneTraitName][_strain].val
+ if _val != None:
+ strains.append(_strain)
+ vals.append(_val)
+ vars.append(None)
+
+ tcstrains.append(strains)
+ tcvals.append(vals)
+ tcvars.append(vars)
+
+ return tcstrains, tcvals, tcvars, [len(x) for x in tcstrains]
+
+
+
+#XZ, 03/29/2010: After execution of functon "controlStrains" and "fixStrains", primary trait and control traits have the same strains and in the same order. There is no 'None' value in them.
+def fixStrains(_strains,_controlstrains,_vals,_controlvals,_vars,_controlvars):
+ """Corrects strains, vals, and vars so that all contrain only those strains common
+ to the reference trait and all control traits."""
+
+ def dictify(strains,vals,vars):
+ subdict = {}
+ for i in xrange(len(strains)):
+ subdict[strains[i]] = (vals[i],vars[i])
+ return subdict
+
+ #XZ: The 'dicts' is a list of dictionary. The first element is the dictionary of reference trait. The rest elements are for control traits.
+ dicts = []
+ dicts.append(dictify(_strains,_vals,_vars))
+
+ nCstrains = len(_controlstrains)
+ for i in xrange(nCstrains):
+ dicts.append(dictify(_controlstrains[i],_controlvals[i],_controlvars[i]))
+
+ _newstrains = []
+ _vals = []
+ _vars = []
+ _controlvals = [[] for x in xrange(nCstrains)]
+ _controlvars = [[] for x in xrange(nCstrains)]
+
+ for strain in _strains:
+ inall = True
+ for d in dicts:
+ if strain not in d:
+ inall = False
+ break
+ if inall:
+ _newstrains.append(strain)
+ _vals.append(dicts[0][strain][0])
+ _vars.append(dicts[0][strain][1])
+ for i in xrange(nCstrains):
+ _controlvals[i].append(dicts[i+1][strain][0])
+ _controlvars[i].append(dicts[i+1][strain][1])
+
+ return _newstrains, _vals, _controlvals, _vars, _controlvars
+
+
+#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty.
+#else, the returned list has two elements of control trait name.
+def findIdenticalControlTraits ( controlVals, controlNames ):
+ nameOfIdenticalTraits = []
+
+ controlTraitNumber = len(controlVals)
+
+ if controlTraitNumber > 1:
+
+ #XZ: reset the precision of values and convert to string type
+ for oneTraitVal in controlVals:
+ for oneStrainVal in oneTraitVal:
+ oneStrainVal = '%.3f' % oneStrainVal
+
+ for i, oneTraitVal in enumerate( controlVals ):
+ for j in range(i+1, controlTraitNumber):
+ if oneTraitVal == controlVals[j]:
+ nameOfIdenticalTraits.append(controlNames[i])
+ nameOfIdenticalTraits.append(controlNames[j])
+
+ return nameOfIdenticalTraits
+
+#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty.
+#else, the returned list has two elements of control trait name.
+#primaryVal is of list type. It contains value of primary trait.
+#primaryName is of string type.
+#controlVals is of list type. Each element is list too. Each element contain value of one control trait.
+#controlNames is of list type.
+def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ):
+ nameOfIdenticalTraits = []
+
+ #XZ: reset the precision of values and convert to string type
+ for oneStrainVal in primaryVal:
+ oneStrainVal = '%.3f' % oneStrainVal
+
+ for oneTraitVal in controlVals:
+ for oneStrainVal in oneTraitVal:
+ oneStrainVal = '%.3f' % oneStrainVal
+
+ controlTraitNumber = len(controlVals)
+
+ if controlTraitNumber > 1:
+ for i, oneTraitVal in enumerate( controlVals ):
+ for j in range(i+1, controlTraitNumber):
+ if oneTraitVal == controlVals[j]:
+ nameOfIdenticalTraits.append(controlNames[i])
+ nameOfIdenticalTraits.append(controlNames[j])
+ break
+
+ if len(nameOfIdenticalTraits) == 0:
+ for i, oneTraitVal in enumerate( controlVals ):
+ if primaryVal == oneTraitVal:
+ nameOfIdenticalTraits.append(primaryName)
+ nameOfIdenticalTraits.append(controlNames[i])
+ break
+
+ return nameOfIdenticalTraits
+
+
+
+#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order.
+#XZ: No value in primaryVal and controlVals could be None.
+
+def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'):
+
+ def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ):
+
+ rpy2.robjects.r("""
+pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){
+ # The partial correlation coefficient between x and y given z
+ #
+ # pcor.test is free and comes with ABSOLUTELY NO WARRANTY.
+ #
+ # x and y should be vectors
+ #
+ # z can be either a vector or a matrix
+ #
+ # use: There are two methods to calculate the partial correlation coefficient.
+ # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec").
+ # Default is "mat".
+ #
+ # method: There are three ways to calculate the correlation coefficient,
+ # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods.
+ # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis.
+ # Default is "p".
+ #
+ # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z).
+ # If not, the missing samples will be removed just when the correlation coefficient is calculated.
+ # However, the number of samples for the p-value is the number of samples after removing
+ # all the missing samples from the whole dataset.
+ # Default is "T".
+
+ x <- c(x)
+ y <- c(y)
+ z <- as.data.frame(z)
+
+ if(use == "mat"){
+ p.use <- "Var-Cov matrix"
+ pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm)
+ }else if(use == "rec"){
+ p.use <- "Recursive formula"
+ pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm)
+ }else{
+ stop("use should be either rec or mat!\n")
+ }
+
+ # print the method
+ if(gregexpr("p",method)[[1]][1] == 1){
+ p.method <- "Pearson"
+ }else if(gregexpr("s",method)[[1]][1] == 1){
+ p.method <- "Spearman"
+ }else if(gregexpr("k",method)[[1]][1] == 1){
+ p.method <- "Kendall"
+ }else{
+ stop("method should be pearson or spearman or kendall!\n")
+ }
+
+ # sample number
+ n <- dim(na.omit(data.frame(x,y,z)))[1]
+
+ # given variables' number
+ gn <- dim(z)[2]
+
+ # p-value
+ if(p.method == "Kendall"){
+ statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn)))
+ p.value <- 2*pnorm(-abs(statistic))
+
+ }else{
+ statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2))
+ p.value <- 2*pnorm(-abs(statistic))
+ }
+
+ data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use)
+}
+
+# By using var-cov matrix
+pcor.mat <- function(x,y,z,method="p",na.rm=T){
+
+ x <- c(x)
+ y <- c(y)
+ z <- as.data.frame(z)
+
+ if(dim(z)[2] == 0){
+ stop("There should be given data\n")
+ }
+
+ data <- data.frame(x,y,z)
+
+ if(na.rm == T){
+ data = na.omit(data)
+ }
+
+ xdata <- na.omit(data.frame(data[,c(1,2)]))
+ Sxx <- cov(xdata,xdata,m=method)
+
+ xzdata <- na.omit(data)
+ xdata <- data.frame(xzdata[,c(1,2)])
+ zdata <- data.frame(xzdata[,-c(1,2)])
+ Sxz <- cov(xdata,zdata,m=method)
+
+ zdata <- na.omit(data.frame(data[,-c(1,2)]))
+ Szz <- cov(zdata,zdata,m=method)
+
+ # is Szz positive definite?
+ zz.ev <- eigen(Szz)$values
+ if(min(zz.ev)[1]<0){
+ stop("\'Szz\' is not positive definite!\n")
+ }
+
+ # partial correlation
+ Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz)
+
+ rxx.z <- cov2cor(Sxx.z)[1,2]
+
+ rxx.z
+}
+
+# By using recursive formula
+pcor.rec <- function(x,y,z,method="p",na.rm=T){
+ #
+
+ x <- c(x)
+ y <- c(y)
+ z <- as.data.frame(z)
+
+ if(dim(z)[2] == 0){
+ stop("There should be given data\n")
+ }
+
+ data <- data.frame(x,y,z)
+
+ if(na.rm == T){
+ data = na.omit(data)
+ }
+
+ # recursive formula
+ if(dim(z)[2] == 1){
+ tdata <- na.omit(data.frame(data[,1],data[,2]))
+ rxy <- cor(tdata[,1],tdata[,2],m=method)
+
+ tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)]))
+ rxz <- cor(tdata[,1],tdata[,2],m=method)
+
+ tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)]))
+ ryz <- cor(tdata[,1],tdata[,2],m=method)
+
+ rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) )
+
+ return(rxy.z)
+ }else{
+ x <- c(data[,1])
+ y <- c(data[,2])
+ z0 <- c(data[,3])
+ zc <- as.data.frame(data[,-c(1,2,3)])
+
+ rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm)
+ rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm)
+ ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm)
+
+ rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) )
+ return(rxy.z)
+ }
+}
+""")
+
+ R_pcorr_function = rpy2.robjects.r['pcor.test']
+ R_corr_test = rpy2.robjects.r['cor.test']
+
+ primary = rpy2.robjects.FloatVector(range(len(primaryVal)))
+ for i in range(len(primaryVal)):
+ primary[i] = primaryVal[i]
+
+ control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals))
+ for i in range(len(controlVals)):
+ for j in range(len(controlVals[0])):
+ control[i*len(controlVals[0]) + j] = controlVals[i][j]
+
+ allcorrelations = []
+
+ for targetIndex, oneTargetVals in enumerate(targetVals):
+
+ this_primary = None
+ this_control = None
+ this_target = None
+
+ if None in oneTargetVals:
+
+ goodIndex = []
+ for i in range(len(oneTargetVals)):
+ if oneTargetVals[i] != None:
+ goodIndex.append(i)
+
+ this_primary = rpy2.robjects.FloatVector(range(len(goodIndex)))
+ for i in range(len(goodIndex)):
+ this_primary[i] = primaryVal[goodIndex[i]]
+
+ this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals))
+ for i in range(len(controlVals)):
+ for j in range(len(goodIndex)):
+ this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]]
+
+ this_target = rpy2.robjects.FloatVector(range(len(goodIndex)))
+ for i in range(len(goodIndex)):
+ this_target[i] = oneTargetVals[goodIndex[i]]
+
+ else:
+ this_primary = primary
+ this_control = control
+ this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals)))
+ for i in range(len(oneTargetVals)):
+ this_target[i] = oneTargetVals[i]
+
+ one_name = targetNames[targetIndex]
+ one_N = len(this_primary)
+
+ #calculate partial correlation
+ one_pc_coefficient = 'NA'
+ one_pc_p = 1
+
+ try:
+ if method == 's':
+ result = R_pcorr_function(this_primary, this_target, this_control, method='s')
+ else:
+ result = R_pcorr_function(this_primary, this_target, this_control)
+
+ #XZ: In very few cases, the returned coefficient is nan.
+ #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN
+ if result[0][0] == result[0][0]:
+ one_pc_coefficient = result[0][0]
+ #XZ: when the coefficient value is 1 (primary trait and target trait are the same),
+ #XZ: occationally, the returned p value is nan instead of 0.
+ if result[1][0] == result[1][0]:
+ one_pc_p = result[1][0]
+ elif abs(one_pc_coefficient - 1) < 0.0000001:
+ one_pc_p = 0
+ except:
+ pass
+
+ #calculate zero order correlation
+ one_corr_coefficient = 0
+ one_corr_p = 1
+
+ try:
+ if method == 's':
+ R_result = R_corr_test(this_primary, this_target, method='spearman')
+ else:
+ R_result = R_corr_test(this_primary, this_target)
+
+ one_corr_coefficient = R_result[3][0]
+ one_corr_p = R_result[2][0]
+ except:
+ pass
+
+ traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ]
+
+ allcorrelations.append(traitinfo)
+
+ return allcorrelations
+ #End of function compute_partial
+
+
+ allcorrelations = []
+
+ target_trait_number = len(targetVals)
+
+ if target_trait_number < 1000:
+ allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method )
+ else:
+ step = 1000
+ job_number = math.ceil( float(target_trait_number)/step )
+
+ job_targetVals_lists = []
+ job_targetNames_lists = []
+
+ for job_index in range( int(job_number) ):
+ starti = job_index*step
+ endi = min((job_index+1)*step, target_trait_number)
+
+ one_job_targetVals_list = []
+ one_job_targetNames_list = []
+
+ for i in range( starti, endi ):
+ one_job_targetVals_list.append( targetVals[i] )
+ one_job_targetNames_list.append( targetNames[i] )
+
+ job_targetVals_lists.append( one_job_targetVals_list )
+ job_targetNames_lists.append( one_job_targetNames_list )
+
+ ppservers = ()
+ # Creates jobserver with automatically detected number of workers
+ job_server = pp.Server(ppservers=ppservers)
+
+ jobs = []
+ results = []
+
+ for i, one_job_targetVals_list in enumerate( job_targetVals_lists ):
+ one_job_targetNames_list = job_targetNames_lists[i]
+ #pay attention to modules from outside
+ jobs.append( job_server.submit(func=compute_partial, args=( primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) )
+
+ for one_job in jobs:
+ one_result = one_job()
+ results.append( one_result )
+
+ for one_result in results:
+ for one_traitinfo in one_result:
+ allcorrelations.append( one_traitinfo )
+
+ return allcorrelations
+
+
+
+#XZ, April 30, 2010: The input primaryTrait and targetTrait are instance of webqtlTrait
+#XZ: The primaryTrait and targetTrait should have executed retrieveData function
+def calZeroOrderCorr (primaryTrait, targetTrait, method='pearson'):
+
+ #primaryTrait.retrieveData()
+
+ #there is no None value in primary_val
+ primary_strain, primary_val, primary_var = primaryTrait.exportInformative()
+
+ #targetTrait.retrieveData()
+
+ #there might be None value in target_val
+ target_val = targetTrait.exportData(primary_strain, type="val")
+
+ R_primary = rpy2.robjects.FloatVector(range(len(primary_val)))
+ for i in range(len(primary_val)):
+ R_primary[i] = primary_val[i]
+
+ N = len(target_val)
+
+ if None in target_val:
+ goodIndex = []
+ for i in range(len(target_val)):
+ if target_val[i] != None:
+ goodIndex.append(i)
+
+ N = len(goodIndex)
+
+ R_primary = rpy2.robjects.FloatVector(range(len(goodIndex)))
+ for i in range(len(goodIndex)):
+ R_primary[i] = primary_val[goodIndex[i]]
+
+ R_target = rpy2.robjects.FloatVector(range(len(goodIndex)))
+ for i in range(len(goodIndex)):
+ R_target[i] = target_val[goodIndex[i]]
+
+ else:
+ R_target = rpy2.robjects.FloatVector(range(len(target_val)))
+ for i in range(len(target_val)):
+ R_target[i] = target_val[i]
+
+ R_corr_test = rpy2.robjects.r['cor.test']
+
+ if method == 'spearman':
+ R_result = R_corr_test(R_primary, R_target, method='spearman')
+ else:
+ R_result = R_corr_test(R_primary, R_target)
+
+ corr_result = []
+ corr_result.append( R_result[3][0] )
+ corr_result.append( N )
+ corr_result.append( R_result[2][0] )
+
+ return corr_result
+
+#####################################################################################
+#Input: primaryValue(list): one list of expression values of one probeSet,
+# targetValue(list): one list of expression values of one probeSet,
+# method(string): indicate correlation method ('pearson' or 'spearman')
+#Output: corr_result(list): first item is Correlation Value, second item is tissue number,
+# third item is PValue
+#Function: get correlation value,Tissue quantity ,p value result by using R;
+#Note : This function is special case since both primaryValue and targetValue are from
+#the same dataset. So the length of these two parameters is the same. They are pairs.
+#Also, in the datatable TissueProbeSetData, all Tissue values are loaded based on
+#the same tissue order
+#####################################################################################
+
+def calZeroOrderCorrForTiss (primaryValue=[], targetValue=[], method='pearson'):
+
+ R_primary = rpy2.robjects.FloatVector(range(len(primaryValue)))
+ N = len(primaryValue)
+ for i in range(len(primaryValue)):
+ R_primary[i] = primaryValue[i]
+
+ R_target = rpy2.robjects.FloatVector(range(len(targetValue)))
+ for i in range(len(targetValue)):
+ R_target[i]=targetValue[i]
+
+ R_corr_test = rpy2.robjects.r['cor.test']
+ if method =='spearman':
+ R_result = R_corr_test(R_primary, R_target, method='spearman')
+ else:
+ R_result = R_corr_test(R_primary, R_target)
+
+ corr_result =[]
+ corr_result.append( R_result[3][0])
+ corr_result.append( N )
+ corr_result.append( R_result[2][0])
+
+ return corr_result
+
+
+
+
+def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'):
+
+ def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ):
+
+ oneSymbolCorrDict = {}
+ oneSymbolPvalueDict = {}
+
+ R_corr_test = rpy2.robjects.r['cor.test']
+
+ R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue)))
+
+ for i in range(len(primaryTraitValue)):
+ R_primary[i] = primaryTraitValue[i]
+
+ for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems():
+ R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue)))
+ for i in range(len(oneTraitValue)):
+ R_target[i] = oneTraitValue[i]
+
+ if method =='spearman':
+ R_result = R_corr_test(R_primary, R_target, method='spearman')
+ else:
+ R_result = R_corr_test(R_primary, R_target)
+
+ oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0]
+ oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0]
+
+ return(oneSymbolCorrDict, oneSymbolPvalueDict)
+
+
+
+ symbolCorrDict = {}
+ symbolPvalueDict = {}
+
+ items_number = len(SymbolValueDict)
+
+ if items_number <= 1000:
+ symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method)
+ else:
+ items_list = SymbolValueDict.items()
+
+ step = 1000
+ job_number = math.ceil( float(items_number)/step )
+
+ job_oneSymbolValueDict_list = []
+
+ for job_index in range( int(job_number) ):
+ starti = job_index*step
+ endi = min((job_index+1)*step, items_number)
+
+ oneSymbolValueDict = {}
+
+ for i in range( starti, endi ):
+ one_item = items_list[i]
+ one_symbol = one_item[0]
+ one_value = one_item[1]
+ oneSymbolValueDict[one_symbol] = one_value
+
+ job_oneSymbolValueDict_list.append( oneSymbolValueDict )
+
+
+ ppservers = ()
+ # Creates jobserver with automatically detected number of workers
+ job_server = pp.Server(ppservers=ppservers)
+
+ jobs = []
+ results = []
+
+ for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ):
+
+ #pay attention to modules from outside
+ jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) )
+
+ for one_job in jobs:
+ one_result = one_job()
+ results.append( one_result )
+
+ for one_result in results:
+ oneSymbolCorrDict, oneSymbolPvalueDict = one_result
+ symbolCorrDict.update( oneSymbolCorrDict )
+ symbolPvalueDict.update( oneSymbolPvalueDict )
+
+ return (symbolCorrDict, symbolPvalueDict)
+
+###########################################################################
+#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId
+#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict)
+#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and
+# getGeneSymbolTissueValueDict to build dict to get CorrPvArray
+#Note: If there are multiple probesets for one gene, select the one with highest mean.
+###########################################################################
+def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0):
+ Symbols =""
+ symbolList =[]
+ geneIdDict ={}
+ dataIdDict = {}
+ ChrDict = {}
+ MbDict = {}
+ descDict = {}
+ pTargetDescDict = {}
+
+ count = len(GeneNameLst)
+
+ # Added by NL 01/06/2011
+ # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value
+ # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable.
+ if count==0:
+ query='''
+ select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description
+ from (
+ select Symbol, max(Mean) as maxmean
+ from TissueProbeSetXRef
+ where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol)
+ as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean;
+ '''%TissueProbeSetFreezeId
+
+ else:
+ for i, item in enumerate(GeneNameLst):
+
+ if i == count-1:
+ Symbols += "'%s'" %item
+ else:
+ Symbols += "'%s'," %item
+
+ Symbols = "("+ Symbols+")"
+ query='''
+ select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description
+ from (
+ select Symbol, max(Mean) as maxmean
+ from TissueProbeSetXRef
+ where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol)
+ as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean;
+ '''% (TissueProbeSetFreezeId,Symbols)
+
+ try:
+
+ cursor.execute(query)
+ results =cursor.fetchall()
+ resultCount = len(results)
+ # Key in all dicts is the lower-cased symbol
+ for i, item in enumerate(results):
+ symbol = item[0]
+ symbolList.append(symbol)
+
+ key =symbol.lower()
+ geneIdDict[key]=item[1]
+ dataIdDict[key]=item[2]
+ ChrDict[key]=item[3]
+ MbDict[key]=item[4]
+ descDict[key]=item[5]
+ pTargetDescDict[key]=item[6]
+
+ except:
+ symbolList = None
+ geneIdDict=None
+ dataIdDict=None
+ ChrDict=None
+ MbDict=None
+ descDict=None
+ pTargetDescDict=None
+
+ return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict
+
+###########################################################################
+#Input: cursor, symbolList (list), dataIdDict(Dict)
+#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair,
+# key is symbol, value is one list of expression values of one probeSet;
+#function: get one dictionary whose key is gene symbol and value is tissue expression data (list type).
+#Attention! All keys are lower case!
+###########################################################################
+def getSymbolValuePairDict(cursor=None,symbolList=None,dataIdDict={}):
+ symbolList = map(string.lower, symbolList)
+ symbolValuepairDict={}
+ valueList=[]
+
+ for key in symbolList:
+ if dataIdDict.has_key(key):
+ DataId = dataIdDict[key]
+
+ valueQuery = "select value from TissueProbeSetData where Id=%s" % DataId
+ try :
+ cursor.execute(valueQuery)
+ valueResults = cursor.fetchall()
+ for item in valueResults:
+ item =item[0]
+ valueList.append(item)
+ symbolValuepairDict[key] = valueList
+ valueList=[]
+ except:
+ symbolValuepairDict[key] = None
+
+ return symbolValuepairDict
+
+
+########################################################################################################
+#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol
+#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair.
+# key is symbol, value is one list of expression values of one probeSet.
+#function: wrapper function for getSymbolValuePairDict function
+# build gene symbol list if necessary, cut it into small lists if necessary,
+# then call getSymbolValuePairDict function and merge the results.
+########################################################################################################
+
+def getGeneSymbolTissueValueDict(cursor=None,symbolList=None,dataIdDict={}):
+ limitNum=1000
+ count = len(symbolList)
+
+ SymbolValuePairDict = {}
+
+ if count !=0 and count <=limitNum:
+ SymbolValuePairDict = getSymbolValuePairDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict)
+
+ elif count >limitNum:
+ SymbolValuePairDict={}
+ n = count/limitNum
+ start =0
+ stop =0
+
+ for i in range(n):
+ stop =limitNum*(i+1)
+ gList1 = symbolList[start:stop]
+ PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict)
+ start =limitNum*(i+1)
+
+ SymbolValuePairDict.update(PairDict1)
+
+ if stop < count:
+ stop = count
+ gList2 = symbolList[start:stop]
+ PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict)
+ SymbolValuePairDict.update(PairDict2)
+
+ return SymbolValuePairDict
+
+########################################################################################################
+#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int)
+#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair.
+# key is symbol, value is one list of expression values of one probeSet.
+#function: wrapper function of getGeneSymbolTissueValueDict function
+# for CorrelationPage.py
+########################################################################################################
+
+def getGeneSymbolTissueValueDictForTrait(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0):
+ SymbolValuePairDict={}
+ symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo(cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId)
+ if symbolList:
+ SymbolValuePairDict = getGeneSymbolTissueValueDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict)
+ return SymbolValuePairDict
+
+########################################################################################################
+#Input: cursor(cursor): MySQL connnection cursor;
+# priGeneSymbolList(list): one list of gene symbol;
+# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair,
+# key is symbol, value is one list of expression values of one probeSet;
+#Output: corrArray(array): array of Correlation Value,
+# pvArray(array): array of PValue;
+#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss
+########################################################################################################
+
+def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}):
+ # setting initial value for corrArray, pvArray equal to 0
+ Num = len(priGeneSymbolList)
+
+ corrArray = [([0] * (Num))[:] for i in range(Num)]
+ pvArray = [([0] * (Num))[:] for i in range(Num)]
+ i = 0
+ for pkey in priGeneSymbolList:
+ j = 0
+ pkey = pkey.strip().lower()# key in symbolValuepairDict is low case
+ if symbolValuepairDict.has_key(pkey):
+ priValue = symbolValuepairDict[pkey]
+ for tkey in priGeneSymbolList:
+ tkey = tkey.strip().lower()# key in symbolValuepairDict is low case
+ if priValue and symbolValuepairDict.has_key(tkey):
+ tarValue = symbolValuepairDict[tkey]
+
+ if tarValue:
+ if i>j:
+ # corrArray stores Pearson Correlation values
+ # pvArray stores Pearson P-Values
+ pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue)
+ corrArray[i][j] =pcorr_result[0]
+ pvArray[i][j] =pcorr_result[2]
+ elif i<j:
+ # corrArray stores Spearman Correlation values
+ # pvArray stores Spearman P-Values
+ scorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue,method='spearman')
+ corrArray[i][j] =scorr_result[0]
+ pvArray[i][j] =scorr_result[2]
+ else:
+ # on the diagonal line, correlation value is 1, P-Values is 0
+ corrArray[i][j] =1
+ pvArray[i][j] =0
+ j+=1
+ else:
+ corrArray[i][j] = None
+ pvArray[i][j] = None
+ j+=1
+ else:
+ corrArray[i][j] = None
+ pvArray[i][j] = None
+ j+=1
+ else:
+ corrArray[i][j] = None
+ pvArray[i][j] = None
+
+ i+=1
+
+ return corrArray, pvArray
+
+########################################################################################################
+#Input: cursor(cursor): MySQL connnection cursor;
+# primaryTraitSymbol(string): one gene symbol;
+# TissueProbeSetFreezeId (int): Id of related TissueProbeSetFreeze
+# method: '0' default value, Pearson Correlation; '1', Spearman Correlation
+#Output: symbolCorrDict(Dict): Dict of Correlation Value, key is symbol
+# symbolPvalueDict(Dict): Dict of PValue,key is symbol ;
+#Function: build symbolCorrDict, symbolPvalueDict for display by calling calculation function:calZeroOrderCorrForTiss
+########################################################################################################
+def calculateCorrOfAllTissueTrait(cursor=None, primaryTraitSymbol=None, TissueProbeSetFreezeId=None,method='0'):
+
+ symbolCorrDict = {}
+ symbolPvalueDict = {}
+
+ primaryTraitSymbolValueDict = getGeneSymbolTissueValueDictForTrait(cursor=cursor, GeneNameLst=[primaryTraitSymbol], TissueProbeSetFreezeId=TissueProbeSetFreezeId)
+ primaryTraitValue = primaryTraitSymbolValueDict.values()[0]
+
+ SymbolValueDict = getGeneSymbolTissueValueDictForTrait(cursor=cursor, GeneNameLst=[], TissueProbeSetFreezeId=TissueProbeSetFreezeId)
+
+ if method =='1':
+ symbolCorrDict, symbolPvalueDict = batchCalTissueCorr(primaryTraitValue,SymbolValueDict,method='spearman')
+ else:
+ symbolCorrDict, symbolPvalueDict = batchCalTissueCorr(primaryTraitValue,SymbolValueDict)
+
+
+ return (symbolCorrDict, symbolPvalueDict)