# Copyright (C) University of Tennessee Health Science Center, Memphis, TN. # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU Affero General Public License # as published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # See the GNU Affero General Public License for more details. # # This program is available from Source Forge: at GeneNetwork Project # (sourceforge.net/projects/genenetwork/). # # Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) # at rwilliams@uthsc.edu and xzhou15@uthsc.edu # # # # This module is used by GeneNetwork project (www.genenetwork.org) # # Created by GeneNetwork Core Team 2010/08/10 # # Last updated by NL 2011/03/23 import math import rpy2.robjects import pp import string from utility import webqtlUtil from base.webqtlTrait import webqtlTrait from dbFunction import webqtlDatabaseFunction #XZ: The input 'controls' is String. It contains the full name of control traits. #XZ: The input variable 'strainlst' is List. It contains the strain names of primary trait. #XZ: The returned tcstrains is the list of list [[],[]...]. So are tcvals and tcvars. The last returned parameter is list of numbers. #XZ, 03/29/2010: For each returned control trait, there is no None value in it. def controlStrains(controls, strainlst): controls = controls.split(',') cvals = {} for oneTraitName in controls: oneTrait = webqtlTrait(fullname=oneTraitName, cursor=webqtlDatabaseFunction.getCursor() ) oneTrait.retrieveData() cvals[oneTraitName] = oneTrait.data tcstrains = [] tcvals = [] tcvars = [] for oneTraitName in controls: strains = [] vals = [] vars = [] for _strain in strainlst: if cvals[oneTraitName].has_key(_strain): _val = cvals[oneTraitName][_strain].val if _val != None: strains.append(_strain) vals.append(_val) vars.append(None) tcstrains.append(strains) tcvals.append(vals) tcvars.append(vars) return tcstrains, tcvals, tcvars, [len(x) for x in tcstrains] #XZ, 03/29/2010: After execution of functon "controlStrains" and "fixStrains", primary trait and control traits have the same strains and in the same order. There is no 'None' value in them. def fixStrains(_strains,_controlstrains,_vals,_controlvals,_vars,_controlvars): """Corrects strains, vals, and vars so that all contrain only those strains common to the reference trait and all control traits.""" def dictify(strains,vals,vars): subdict = {} for i in xrange(len(strains)): subdict[strains[i]] = (vals[i],vars[i]) return subdict #XZ: The 'dicts' is a list of dictionary. The first element is the dictionary of reference trait. The rest elements are for control traits. dicts = [] dicts.append(dictify(_strains,_vals,_vars)) nCstrains = len(_controlstrains) for i in xrange(nCstrains): dicts.append(dictify(_controlstrains[i],_controlvals[i],_controlvars[i])) _newstrains = [] _vals = [] _vars = [] _controlvals = [[] for x in xrange(nCstrains)] _controlvars = [[] for x in xrange(nCstrains)] for strain in _strains: inall = True for d in dicts: if strain not in d: inall = False break if inall: _newstrains.append(strain) _vals.append(dicts[0][strain][0]) _vars.append(dicts[0][strain][1]) for i in xrange(nCstrains): _controlvals[i].append(dicts[i+1][strain][0]) _controlvars[i].append(dicts[i+1][strain][1]) return _newstrains, _vals, _controlvals, _vars, _controlvars #XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. #else, the returned list has two elements of control trait name. def findIdenticalControlTraits ( controlVals, controlNames ): nameOfIdenticalTraits = [] controlTraitNumber = len(controlVals) if controlTraitNumber > 1: #XZ: reset the precision of values and convert to string type for oneTraitVal in controlVals: for oneStrainVal in oneTraitVal: oneStrainVal = '%.3f' % oneStrainVal for i, oneTraitVal in enumerate( controlVals ): for j in range(i+1, controlTraitNumber): if oneTraitVal == controlVals[j]: nameOfIdenticalTraits.append(controlNames[i]) nameOfIdenticalTraits.append(controlNames[j]) return nameOfIdenticalTraits #XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. #else, the returned list has two elements of control trait name. #primaryVal is of list type. It contains value of primary trait. #primaryName is of string type. #controlVals is of list type. Each element is list too. Each element contain value of one control trait. #controlNames is of list type. def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ): nameOfIdenticalTraits = [] #XZ: reset the precision of values and convert to string type for oneStrainVal in primaryVal: oneStrainVal = '%.3f' % oneStrainVal for oneTraitVal in controlVals: for oneStrainVal in oneTraitVal: oneStrainVal = '%.3f' % oneStrainVal controlTraitNumber = len(controlVals) if controlTraitNumber > 1: for i, oneTraitVal in enumerate( controlVals ): for j in range(i+1, controlTraitNumber): if oneTraitVal == controlVals[j]: nameOfIdenticalTraits.append(controlNames[i]) nameOfIdenticalTraits.append(controlNames[j]) break if len(nameOfIdenticalTraits) == 0: for i, oneTraitVal in enumerate( controlVals ): if primaryVal == oneTraitVal: nameOfIdenticalTraits.append(primaryName) nameOfIdenticalTraits.append(controlNames[i]) break return nameOfIdenticalTraits #XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order. #XZ: No value in primaryVal and controlVals could be None. def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'): def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ): rpy2.robjects.r(""" pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){ # The partial correlation coefficient between x and y given z # # pcor.test is free and comes with ABSOLUTELY NO WARRANTY. # # x and y should be vectors # # z can be either a vector or a matrix # # use: There are two methods to calculate the partial correlation coefficient. # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec"). # Default is "mat". # # method: There are three ways to calculate the correlation coefficient, # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods. # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis. # Default is "p". # # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z). # If not, the missing samples will be removed just when the correlation coefficient is calculated. # However, the number of samples for the p-value is the number of samples after removing # all the missing samples from the whole dataset. # Default is "T". x <- c(x) y <- c(y) z <- as.data.frame(z) if(use == "mat"){ p.use <- "Var-Cov matrix" pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm) }else if(use == "rec"){ p.use <- "Recursive formula" pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm) }else{ stop("use should be either rec or mat!\n") } # print the method if(gregexpr("p",method)[[1]][1] == 1){ p.method <- "Pearson" }else if(gregexpr("s",method)[[1]][1] == 1){ p.method <- "Spearman" }else if(gregexpr("k",method)[[1]][1] == 1){ p.method <- "Kendall" }else{ stop("method should be pearson or spearman or kendall!\n") } # sample number n <- dim(na.omit(data.frame(x,y,z)))[1] # given variables' number gn <- dim(z)[2] # p-value if(p.method == "Kendall"){ statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn))) p.value <- 2*pnorm(-abs(statistic)) }else{ statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2)) p.value <- 2*pnorm(-abs(statistic)) } data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use) } # By using var-cov matrix pcor.mat <- function(x,y,z,method="p",na.rm=T){ x <- c(x) y <- c(y) z <- as.data.frame(z) if(dim(z)[2] == 0){ stop("There should be given data\n") } data <- data.frame(x,y,z) if(na.rm == T){ data = na.omit(data) } xdata <- na.omit(data.frame(data[,c(1,2)])) Sxx <- cov(xdata,xdata,m=method) xzdata <- na.omit(data) xdata <- data.frame(xzdata[,c(1,2)]) zdata <- data.frame(xzdata[,-c(1,2)]) Sxz <- cov(xdata,zdata,m=method) zdata <- na.omit(data.frame(data[,-c(1,2)])) Szz <- cov(zdata,zdata,m=method) # is Szz positive definite? zz.ev <- eigen(Szz)$values if(min(zz.ev)[1]<0){ stop("\'Szz\' is not positive definite!\n") } # partial correlation Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz) rxx.z <- cov2cor(Sxx.z)[1,2] rxx.z } # By using recursive formula pcor.rec <- function(x,y,z,method="p",na.rm=T){ # x <- c(x) y <- c(y) z <- as.data.frame(z) if(dim(z)[2] == 0){ stop("There should be given data\n") } data <- data.frame(x,y,z) if(na.rm == T){ data = na.omit(data) } # recursive formula if(dim(z)[2] == 1){ tdata <- na.omit(data.frame(data[,1],data[,2])) rxy <- cor(tdata[,1],tdata[,2],m=method) tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)])) rxz <- cor(tdata[,1],tdata[,2],m=method) tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)])) ryz <- cor(tdata[,1],tdata[,2],m=method) rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) ) return(rxy.z) }else{ x <- c(data[,1]) y <- c(data[,2]) z0 <- c(data[,3]) zc <- as.data.frame(data[,-c(1,2,3)]) rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm) rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm) ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm) rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) ) return(rxy.z) } } """) R_pcorr_function = rpy2.robjects.r['pcor.test'] R_corr_test = rpy2.robjects.r['cor.test'] primary = rpy2.robjects.FloatVector(range(len(primaryVal))) for i in range(len(primaryVal)): primary[i] = primaryVal[i] control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals)) for i in range(len(controlVals)): for j in range(len(controlVals[0])): control[i*len(controlVals[0]) + j] = controlVals[i][j] allcorrelations = [] for targetIndex, oneTargetVals in enumerate(targetVals): this_primary = None this_control = None this_target = None if None in oneTargetVals: goodIndex = [] for i in range(len(oneTargetVals)): if oneTargetVals[i] != None: goodIndex.append(i) this_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) for i in range(len(goodIndex)): this_primary[i] = primaryVal[goodIndex[i]] this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals)) for i in range(len(controlVals)): for j in range(len(goodIndex)): this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]] this_target = rpy2.robjects.FloatVector(range(len(goodIndex))) for i in range(len(goodIndex)): this_target[i] = oneTargetVals[goodIndex[i]] else: this_primary = primary this_control = control this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals))) for i in range(len(oneTargetVals)): this_target[i] = oneTargetVals[i] one_name = targetNames[targetIndex] one_N = len(this_primary) #calculate partial correlation one_pc_coefficient = 'NA' one_pc_p = 1 try: if method == 's': result = R_pcorr_function(this_primary, this_target, this_control, method='s') else: result = R_pcorr_function(this_primary, this_target, this_control) #XZ: In very few cases, the returned coefficient is nan. #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN if result[0][0] == result[0][0]: one_pc_coefficient = result[0][0] #XZ: when the coefficient value is 1 (primary trait and target trait are the same), #XZ: occationally, the returned p value is nan instead of 0. if result[1][0] == result[1][0]: one_pc_p = result[1][0] elif abs(one_pc_coefficient - 1) < 0.0000001: one_pc_p = 0 except: pass #calculate zero order correlation one_corr_coefficient = 0 one_corr_p = 1 try: if method == 's': R_result = R_corr_test(this_primary, this_target, method='spearman') else: R_result = R_corr_test(this_primary, this_target) one_corr_coefficient = R_result[3][0] one_corr_p = R_result[2][0] except: pass traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ] allcorrelations.append(traitinfo) return allcorrelations #End of function compute_partial allcorrelations = [] target_trait_number = len(targetVals) if target_trait_number < 1000: allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ) else: step = 1000 job_number = math.ceil( float(target_trait_number)/step ) job_targetVals_lists = [] job_targetNames_lists = [] for job_index in range( int(job_number) ): starti = job_index*step endi = min((job_index+1)*step, target_trait_number) one_job_targetVals_list = [] one_job_targetNames_list = [] for i in range( starti, endi ): one_job_targetVals_list.append( targetVals[i] ) one_job_targetNames_list.append( targetNames[i] ) job_targetVals_lists.append( one_job_targetVals_list ) job_targetNames_lists.append( one_job_targetNames_list ) ppservers = () # Creates jobserver with automatically detected number of workers job_server = pp.Server(ppservers=ppservers) jobs = [] results = [] for i, one_job_targetVals_list in enumerate( job_targetVals_lists ): one_job_targetNames_list = job_targetNames_lists[i] #pay attention to modules from outside jobs.append( job_server.submit(func=compute_partial, args=( primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) ) for one_job in jobs: one_result = one_job() results.append( one_result ) for one_result in results: for one_traitinfo in one_result: allcorrelations.append( one_traitinfo ) return allcorrelations #XZ, April 30, 2010: The input primaryTrait and targetTrait are instance of webqtlTrait #XZ: The primaryTrait and targetTrait should have executed retrieveData function def calZeroOrderCorr (primaryTrait, targetTrait, method='pearson'): #primaryTrait.retrieveData() #there is no None value in primary_val primary_strain, primary_val, primary_var = primaryTrait.exportInformative() #targetTrait.retrieveData() #there might be None value in target_val target_val = targetTrait.exportData(primary_strain, type="val") R_primary = rpy2.robjects.FloatVector(range(len(primary_val))) for i in range(len(primary_val)): R_primary[i] = primary_val[i] N = len(target_val) if None in target_val: goodIndex = [] for i in range(len(target_val)): if target_val[i] != None: goodIndex.append(i) N = len(goodIndex) R_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) for i in range(len(goodIndex)): R_primary[i] = primary_val[goodIndex[i]] R_target = rpy2.robjects.FloatVector(range(len(goodIndex))) for i in range(len(goodIndex)): R_target[i] = target_val[goodIndex[i]] else: R_target = rpy2.robjects.FloatVector(range(len(target_val))) for i in range(len(target_val)): R_target[i] = target_val[i] R_corr_test = rpy2.robjects.r['cor.test'] if method == 'spearman': R_result = R_corr_test(R_primary, R_target, method='spearman') else: R_result = R_corr_test(R_primary, R_target) corr_result = [] corr_result.append( R_result[3][0] ) corr_result.append( N ) corr_result.append( R_result[2][0] ) return corr_result ##################################################################################### #Input: primaryValue(list): one list of expression values of one probeSet, # targetValue(list): one list of expression values of one probeSet, # method(string): indicate correlation method ('pearson' or 'spearman') #Output: corr_result(list): first item is Correlation Value, second item is tissue number, # third item is PValue #Function: get correlation value,Tissue quantity ,p value result by using R; #Note : This function is special case since both primaryValue and targetValue are from #the same dataset. So the length of these two parameters is the same. They are pairs. #Also, in the datatable TissueProbeSetData, all Tissue values are loaded based on #the same tissue order ##################################################################################### def calZeroOrderCorrForTiss (primaryValue=[], targetValue=[], method='pearson'): R_primary = rpy2.robjects.FloatVector(range(len(primaryValue))) N = len(primaryValue) for i in range(len(primaryValue)): R_primary[i] = primaryValue[i] R_target = rpy2.robjects.FloatVector(range(len(targetValue))) for i in range(len(targetValue)): R_target[i]=targetValue[i] R_corr_test = rpy2.robjects.r['cor.test'] if method =='spearman': R_result = R_corr_test(R_primary, R_target, method='spearman') else: R_result = R_corr_test(R_primary, R_target) corr_result =[] corr_result.append( R_result[3][0]) corr_result.append( N ) corr_result.append( R_result[2][0]) return corr_result def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'): def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ): oneSymbolCorrDict = {} oneSymbolPvalueDict = {} R_corr_test = rpy2.robjects.r['cor.test'] R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue))) for i in range(len(primaryTraitValue)): R_primary[i] = primaryTraitValue[i] for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems(): R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue))) for i in range(len(oneTraitValue)): R_target[i] = oneTraitValue[i] if method =='spearman': R_result = R_corr_test(R_primary, R_target, method='spearman') else: R_result = R_corr_test(R_primary, R_target) oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0] oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0] return(oneSymbolCorrDict, oneSymbolPvalueDict) symbolCorrDict = {} symbolPvalueDict = {} items_number = len(SymbolValueDict) if items_number <= 1000: symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method) else: items_list = SymbolValueDict.items() step = 1000 job_number = math.ceil( float(items_number)/step ) job_oneSymbolValueDict_list = [] for job_index in range( int(job_number) ): starti = job_index*step endi = min((job_index+1)*step, items_number) oneSymbolValueDict = {} for i in range( starti, endi ): one_item = items_list[i] one_symbol = one_item[0] one_value = one_item[1] oneSymbolValueDict[one_symbol] = one_value job_oneSymbolValueDict_list.append( oneSymbolValueDict ) ppservers = () # Creates jobserver with automatically detected number of workers job_server = pp.Server(ppservers=ppservers) jobs = [] results = [] for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ): #pay attention to modules from outside jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) ) for one_job in jobs: one_result = one_job() results.append( one_result ) for one_result in results: oneSymbolCorrDict, oneSymbolPvalueDict = one_result symbolCorrDict.update( oneSymbolCorrDict ) symbolPvalueDict.update( oneSymbolPvalueDict ) return (symbolCorrDict, symbolPvalueDict) ########################################################################### #Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId #output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict) #function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and # getGeneSymbolTissueValueDict to build dict to get CorrPvArray #Note: If there are multiple probesets for one gene, select the one with highest mean. ########################################################################### def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): Symbols ="" symbolList =[] geneIdDict ={} dataIdDict = {} ChrDict = {} MbDict = {} descDict = {} pTargetDescDict = {} count = len(GeneNameLst) # Added by NL 01/06/2011 # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable. if count==0: query=''' select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description from ( select Symbol, max(Mean) as maxmean from TissueProbeSetXRef where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol) as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; '''%TissueProbeSetFreezeId else: for i, item in enumerate(GeneNameLst): if i == count-1: Symbols += "'%s'" %item else: Symbols += "'%s'," %item Symbols = "("+ Symbols+")" query=''' select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description from ( select Symbol, max(Mean) as maxmean from TissueProbeSetXRef where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol) as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; '''% (TissueProbeSetFreezeId,Symbols) try: cursor.execute(query) results =cursor.fetchall() resultCount = len(results) # Key in all dicts is the lower-cased symbol for i, item in enumerate(results): symbol = item[0] symbolList.append(symbol) key =symbol.lower() geneIdDict[key]=item[1] dataIdDict[key]=item[2] ChrDict[key]=item[3] MbDict[key]=item[4] descDict[key]=item[5] pTargetDescDict[key]=item[6] except: symbolList = None geneIdDict=None dataIdDict=None ChrDict=None MbDict=None descDict=None pTargetDescDict=None return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict ########################################################################### #Input: cursor, symbolList (list), dataIdDict(Dict) #output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, # key is symbol, value is one list of expression values of one probeSet; #function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). #Attention! All keys are lower case! ########################################################################### def getSymbolValuePairDict(cursor=None,symbolList=None,dataIdDict={}): symbolList = map(string.lower, symbolList) symbolValuepairDict={} valueList=[] for key in symbolList: if dataIdDict.has_key(key): DataId = dataIdDict[key] valueQuery = "select value from TissueProbeSetData where Id=%s" % DataId try : cursor.execute(valueQuery) valueResults = cursor.fetchall() for item in valueResults: item =item[0] valueList.append(item) symbolValuepairDict[key] = valueList valueList=[] except: symbolValuepairDict[key] = None return symbolValuepairDict ######################################################################################################## #input: cursor, symbolList (list), dataIdDict(Dict): key is symbol #output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. # key is symbol, value is one list of expression values of one probeSet. #function: wrapper function for getSymbolValuePairDict function # build gene symbol list if necessary, cut it into small lists if necessary, # then call getSymbolValuePairDict function and merge the results. ######################################################################################################## def getGeneSymbolTissueValueDict(cursor=None,symbolList=None,dataIdDict={}): limitNum=1000 count = len(symbolList) SymbolValuePairDict = {} if count !=0 and count <=limitNum: SymbolValuePairDict = getSymbolValuePairDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) elif count >limitNum: SymbolValuePairDict={} n = count/limitNum start =0 stop =0 for i in range(n): stop =limitNum*(i+1) gList1 = symbolList[start:stop] PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) start =limitNum*(i+1) SymbolValuePairDict.update(PairDict1) if stop < count: stop = count gList2 = symbolList[start:stop] PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) SymbolValuePairDict.update(PairDict2) return SymbolValuePairDict ######################################################################################################## #input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) #output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. # key is symbol, value is one list of expression values of one probeSet. #function: wrapper function of getGeneSymbolTissueValueDict function # for CorrelationPage.py ######################################################################################################## def getGeneSymbolTissueValueDictForTrait(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): SymbolValuePairDict={} symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo(cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) if symbolList: SymbolValuePairDict = getGeneSymbolTissueValueDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) return SymbolValuePairDict ######################################################################################################## #Input: cursor(cursor): MySQL connnection cursor; # priGeneSymbolList(list): one list of gene symbol; # symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair, # key is symbol, value is one list of expression values of one probeSet; #Output: corrArray(array): array of Correlation Value, # pvArray(array): array of PValue; #Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss ######################################################################################################## def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}): # setting initial value for corrArray, pvArray equal to 0 Num = len(priGeneSymbolList) corrArray = [([0] * (Num))[:] for i in range(Num)] pvArray = [([0] * (Num))[:] for i in range(Num)] i = 0 for pkey in priGeneSymbolList: j = 0 pkey = pkey.strip().lower()# key in symbolValuepairDict is low case if symbolValuepairDict.has_key(pkey): priValue = symbolValuepairDict[pkey] for tkey in priGeneSymbolList: tkey = tkey.strip().lower()# key in symbolValuepairDict is low case if priValue and symbolValuepairDict.has_key(tkey): tarValue = symbolValuepairDict[tkey] if tarValue: if i>j: # corrArray stores Pearson Correlation values # pvArray stores Pearson P-Values pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue) corrArray[i][j] =pcorr_result[0] pvArray[i][j] =pcorr_result[2] elif i