From 155e2997613c0750de30b734686f8977524956f9 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Fri, 12 Jul 2013 22:58:39 +0000 Subject: Rewrote code related to getting the tissue correlation column to display Created new files for mrna assay tissue data and commonly used db query related functions --- wqflask/base/mrna_assay_tissue_data.py | 134 +++ wqflask/utility/db_tools.py | 15 + wqflask/wqflask/correlation/correlationFunction.py | 923 -------------------- .../wqflask/correlation/correlation_functions.py | 959 +++++++++++++++++++++ wqflask/wqflask/correlation/show_corr_results.py | 76 +- 5 files changed, 1151 insertions(+), 956 deletions(-) create mode 100644 wqflask/base/mrna_assay_tissue_data.py create mode 100644 wqflask/utility/db_tools.py delete mode 100644 wqflask/wqflask/correlation/correlationFunction.py create mode 100644 wqflask/wqflask/correlation/correlation_functions.py (limited to 'wqflask') diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py new file mode 100644 index 00000000..8ae71858 --- /dev/null +++ b/wqflask/base/mrna_assay_tissue_data.py @@ -0,0 +1,134 @@ +from __future__ import absolute_import, print_function, division + +import collections + +from flask import g + +from utility import dbtools +from uitility import Bunch + +from MySQLdb import escape_string as escape + +class MrnaAssayTissueData(object): + + def __init__(self, gene_symbols=None): + self.gene_symbols = gene_symbols + self.have_data = False + if self.gene_symbols == None: + self.gene_symbols = [] + + self.data = collections.defaultdict(Bunch) + + #self.gene_id_dict ={} + #self.data_id_dict = {} + #self.chr_dict = {} + #self.mb_dict = {} + #self.desc_dict = {} + #self.probe_target_desc_dict = {} + + query = '''select t.Symbol, t.GeneId, t.DataId,t.Chr, t.Mb, t.description, t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=1 and ''' + + # Note that inner join is necessary in this query to get distinct record in one symbol group + # with highest mean value + # Due to the limit size of TissueProbeSetFreezeId table in DB, + # performance of inner join is acceptable. + if len(gene_symbols) == 0: + query += '''Symbol!='' and Symbol Is Not Null group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol + and t.Mean = x.maxmean; + ''' + else: + in_clause = dbtools.create_in_clause(gene_symbols) + + query += ''' Symbol in {} group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol + and t.Mean = x.maxmean; + '''.format(in_clause) + + results = g.db.execute(query).fetchall() + for result in results: + symbol = item[0] + gene_symbols.append(symbol) + symbol = symbol.lower() + + self.data[symbol].gene_id = result.GeneId + self.data[symbol].data_id = result.DataId + self.data[symbol].chr = result.Chr + self.data[symbol].mb = result.Mb + self.data[symbol].description = result.description + self.data[symbol].probe_target_description = result.Probe_Target_Description + + + ########################################################################### + #Input: cursor, symbolList (list), dataIdDict(Dict) + #output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, + # key is symbol, value is one list of expression values of one probeSet; + #function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). + #Attention! All keys are lower case! + ########################################################################### + def get_symbol_value_pairs(self): + + id_list = [self.tissue_data[symbol.lower()].data_id for item in self.tissue_data] + + symbol_value_pairs = {} + value_list=[] + + query = """SELECT value, id + FROM TissueProbeSetData + WHERE Id IN {}""".format(create_in_clause(id_list)) + + try : + results = g.db.execute(query).fetchall() + for result in results: + value_list.append(result.value) + symbol_value_pairs[symbol] = value_list + except: + symbol_value_pairs[symbol] = None + + #for symbol in symbol_list: + # if tissue_data.has_key(symbol): + # data_id = tissue_data[symbol].data_id + # + # query = """select value, id + # from TissueProbeSetData + # where Id={}""".format(escape(data_id)) + # try : + # results = g.db.execute(query).fetchall() + # for item in results: + # item = item[0] + # value_list.append(item) + # symbol_value_pairs[symbol] = value_list + # value_list=[] + # except: + # symbol_value_pairs[symbol] = None + + return symbol_value_pairs + + ######################################################################################################## + #input: cursor, symbolList (list), dataIdDict(Dict): key is symbol + #output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. + # key is symbol, value is one list of expression values of one probeSet. + #function: wrapper function for getSymbolValuePairDict function + # build gene symbol list if necessary, cut it into small lists if necessary, + # then call getSymbolValuePairDict function and merge the results. + ######################################################################################################## + + def get_trait_symbol_and_tissue_values(symbol_list=None): + tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list) + + #symbolList, + #geneIdDict, + #dataIdDict, + #ChrDict, + #MbDict, + #descDict, + #pTargetDescDict = getTissueProbeSetXRefInfo( + # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) + + if len(tissue_data.gene_symbols): + return get_symbol_value_pairs(tissue_data) + diff --git a/wqflask/utility/db_tools.py b/wqflask/utility/db_tools.py new file mode 100644 index 00000000..4034f39c --- /dev/null +++ b/wqflask/utility/db_tools.py @@ -0,0 +1,15 @@ +from __future__ import absolute_import, print_function, division + +from MySQLdb import escape_string as escape + +def create_in_clause(items): + """Create an in clause for mysql""" + in_clause = ', '.join("'{}'".format(x) for x in mescape(*items)) + in_clause = '( {} )'.format(in_clause) + return in_clause + +def mescape(*items): + """Multiple escape""" + escaped = [escape(str(item)) for item in items] + #print("escaped is:", escaped) + return escaped diff --git a/wqflask/wqflask/correlation/correlationFunction.py b/wqflask/wqflask/correlation/correlationFunction.py deleted file mode 100644 index 7d4b58a9..00000000 --- a/wqflask/wqflask/correlation/correlationFunction.py +++ /dev/null @@ -1,923 +0,0 @@ -# Copyright (C) University of Tennessee Health Science Center, Memphis, TN. -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU Affero General Public License -# as published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# See the GNU Affero General Public License for more details. -# -# This program is available from Source Forge: at GeneNetwork Project -# (sourceforge.net/projects/genenetwork/). -# -# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) -# at rwilliams@uthsc.edu and xzhou15@uthsc.edu -# -# -# -# This module is used by GeneNetwork project (www.genenetwork.org) -# -# Created by GeneNetwork Core Team 2010/08/10 -# -# Last updated by NL 2011/03/23 - - -import math -#import rpy2.robjects -import pp -import string - -from utility import webqtlUtil -from base.trait import GeneralTrait -from dbFunction import webqtlDatabaseFunction - - - -#XZ: The input 'controls' is String. It contains the full name of control traits. -#XZ: The input variable 'strainlst' is List. It contains the strain names of primary trait. -#XZ: The returned tcstrains is the list of list [[],[]...]. So are tcvals and tcvars. The last returned parameter is list of numbers. -#XZ, 03/29/2010: For each returned control trait, there is no None value in it. -def controlStrains(controls, strainlst): - - controls = controls.split(',') - - cvals = {} - for oneTraitName in controls: - oneTrait = webqtlTrait(fullname=oneTraitName, cursor=webqtlDatabaseFunction.getCursor() ) - oneTrait.retrieveData() - cvals[oneTraitName] = oneTrait.data - - tcstrains = [] - tcvals = [] - tcvars = [] - - for oneTraitName in controls: - strains = [] - vals = [] - vars = [] - - for _strain in strainlst: - if cvals[oneTraitName].has_key(_strain): - _val = cvals[oneTraitName][_strain].val - if _val != None: - strains.append(_strain) - vals.append(_val) - vars.append(None) - - tcstrains.append(strains) - tcvals.append(vals) - tcvars.append(vars) - - return tcstrains, tcvals, tcvars, [len(x) for x in tcstrains] - - - -#XZ, 03/29/2010: After execution of functon "controlStrains" and "fixStrains", primary trait and control traits have the same strains and in the same order. There is no 'None' value in them. -def fixStrains(_strains,_controlstrains,_vals,_controlvals,_vars,_controlvars): - """Corrects strains, vals, and vars so that all contrain only those strains common - to the reference trait and all control traits.""" - - def dictify(strains,vals,vars): - subdict = {} - for i in xrange(len(strains)): - subdict[strains[i]] = (vals[i],vars[i]) - return subdict - - #XZ: The 'dicts' is a list of dictionary. The first element is the dictionary of reference trait. The rest elements are for control traits. - dicts = [] - dicts.append(dictify(_strains,_vals,_vars)) - - nCstrains = len(_controlstrains) - for i in xrange(nCstrains): - dicts.append(dictify(_controlstrains[i],_controlvals[i],_controlvars[i])) - - _newstrains = [] - _vals = [] - _vars = [] - _controlvals = [[] for x in xrange(nCstrains)] - _controlvars = [[] for x in xrange(nCstrains)] - - for strain in _strains: - inall = True - for d in dicts: - if strain not in d: - inall = False - break - if inall: - _newstrains.append(strain) - _vals.append(dicts[0][strain][0]) - _vars.append(dicts[0][strain][1]) - for i in xrange(nCstrains): - _controlvals[i].append(dicts[i+1][strain][0]) - _controlvars[i].append(dicts[i+1][strain][1]) - - return _newstrains, _vals, _controlvals, _vars, _controlvars - - -#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. -#else, the returned list has two elements of control trait name. -def findIdenticalControlTraits ( controlVals, controlNames ): - nameOfIdenticalTraits = [] - - controlTraitNumber = len(controlVals) - - if controlTraitNumber > 1: - - #XZ: reset the precision of values and convert to string type - for oneTraitVal in controlVals: - for oneStrainVal in oneTraitVal: - oneStrainVal = '%.3f' % oneStrainVal - - for i, oneTraitVal in enumerate( controlVals ): - for j in range(i+1, controlTraitNumber): - if oneTraitVal == controlVals[j]: - nameOfIdenticalTraits.append(controlNames[i]) - nameOfIdenticalTraits.append(controlNames[j]) - - return nameOfIdenticalTraits - -#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. -#else, the returned list has two elements of control trait name. -#primaryVal is of list type. It contains value of primary trait. -#primaryName is of string type. -#controlVals is of list type. Each element is list too. Each element contain value of one control trait. -#controlNames is of list type. -def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ): - nameOfIdenticalTraits = [] - - #XZ: reset the precision of values and convert to string type - for oneStrainVal in primaryVal: - oneStrainVal = '%.3f' % oneStrainVal - - for oneTraitVal in controlVals: - for oneStrainVal in oneTraitVal: - oneStrainVal = '%.3f' % oneStrainVal - - controlTraitNumber = len(controlVals) - - if controlTraitNumber > 1: - for i, oneTraitVal in enumerate( controlVals ): - for j in range(i+1, controlTraitNumber): - if oneTraitVal == controlVals[j]: - nameOfIdenticalTraits.append(controlNames[i]) - nameOfIdenticalTraits.append(controlNames[j]) - break - - if len(nameOfIdenticalTraits) == 0: - for i, oneTraitVal in enumerate( controlVals ): - if primaryVal == oneTraitVal: - nameOfIdenticalTraits.append(primaryName) - nameOfIdenticalTraits.append(controlNames[i]) - break - - return nameOfIdenticalTraits - - - -#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order. -#XZ: No value in primaryVal and controlVals could be None. - -def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'): - - def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ): - - rpy2.robjects.r(""" -pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){ - # The partial correlation coefficient between x and y given z - # - # pcor.test is free and comes with ABSOLUTELY NO WARRANTY. - # - # x and y should be vectors - # - # z can be either a vector or a matrix - # - # use: There are two methods to calculate the partial correlation coefficient. - # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec"). - # Default is "mat". - # - # method: There are three ways to calculate the correlation coefficient, - # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods. - # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis. - # Default is "p". - # - # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z). - # If not, the missing samples will be removed just when the correlation coefficient is calculated. - # However, the number of samples for the p-value is the number of samples after removing - # all the missing samples from the whole dataset. - # Default is "T". - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(use == "mat"){ - p.use <- "Var-Cov matrix" - pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm) - }else if(use == "rec"){ - p.use <- "Recursive formula" - pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm) - }else{ - stop("use should be either rec or mat!\n") - } - - # print the method - if(gregexpr("p",method)[[1]][1] == 1){ - p.method <- "Pearson" - }else if(gregexpr("s",method)[[1]][1] == 1){ - p.method <- "Spearman" - }else if(gregexpr("k",method)[[1]][1] == 1){ - p.method <- "Kendall" - }else{ - stop("method should be pearson or spearman or kendall!\n") - } - - # sample number - n <- dim(na.omit(data.frame(x,y,z)))[1] - - # given variables' number - gn <- dim(z)[2] - - # p-value - if(p.method == "Kendall"){ - statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn))) - p.value <- 2*pnorm(-abs(statistic)) - - }else{ - statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2)) - p.value <- 2*pnorm(-abs(statistic)) - } - - data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use) -} - -# By using var-cov matrix -pcor.mat <- function(x,y,z,method="p",na.rm=T){ - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(dim(z)[2] == 0){ - stop("There should be given data\n") - } - - data <- data.frame(x,y,z) - - if(na.rm == T){ - data = na.omit(data) - } - - xdata <- na.omit(data.frame(data[,c(1,2)])) - Sxx <- cov(xdata,xdata,m=method) - - xzdata <- na.omit(data) - xdata <- data.frame(xzdata[,c(1,2)]) - zdata <- data.frame(xzdata[,-c(1,2)]) - Sxz <- cov(xdata,zdata,m=method) - - zdata <- na.omit(data.frame(data[,-c(1,2)])) - Szz <- cov(zdata,zdata,m=method) - - # is Szz positive definite? - zz.ev <- eigen(Szz)$values - if(min(zz.ev)[1]<0){ - stop("\'Szz\' is not positive definite!\n") - } - - # partial correlation - Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz) - - rxx.z <- cov2cor(Sxx.z)[1,2] - - rxx.z -} - -# By using recursive formula -pcor.rec <- function(x,y,z,method="p",na.rm=T){ - # - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(dim(z)[2] == 0){ - stop("There should be given data\n") - } - - data <- data.frame(x,y,z) - - if(na.rm == T){ - data = na.omit(data) - } - - # recursive formula - if(dim(z)[2] == 1){ - tdata <- na.omit(data.frame(data[,1],data[,2])) - rxy <- cor(tdata[,1],tdata[,2],m=method) - - tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)])) - rxz <- cor(tdata[,1],tdata[,2],m=method) - - tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)])) - ryz <- cor(tdata[,1],tdata[,2],m=method) - - rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) ) - - return(rxy.z) - }else{ - x <- c(data[,1]) - y <- c(data[,2]) - z0 <- c(data[,3]) - zc <- as.data.frame(data[,-c(1,2,3)]) - - rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm) - rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm) - ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm) - - rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) ) - return(rxy.z) - } -} -""") - - R_pcorr_function = rpy2.robjects.r['pcor.test'] - R_corr_test = rpy2.robjects.r['cor.test'] - - primary = rpy2.robjects.FloatVector(range(len(primaryVal))) - for i in range(len(primaryVal)): - primary[i] = primaryVal[i] - - control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals)) - for i in range(len(controlVals)): - for j in range(len(controlVals[0])): - control[i*len(controlVals[0]) + j] = controlVals[i][j] - - allcorrelations = [] - - for targetIndex, oneTargetVals in enumerate(targetVals): - - this_primary = None - this_control = None - this_target = None - - if None in oneTargetVals: - - goodIndex = [] - for i in range(len(oneTargetVals)): - if oneTargetVals[i] != None: - goodIndex.append(i) - - this_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - this_primary[i] = primaryVal[goodIndex[i]] - - this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals)) - for i in range(len(controlVals)): - for j in range(len(goodIndex)): - this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]] - - this_target = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - this_target[i] = oneTargetVals[goodIndex[i]] - - else: - this_primary = primary - this_control = control - this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals))) - for i in range(len(oneTargetVals)): - this_target[i] = oneTargetVals[i] - - one_name = targetNames[targetIndex] - one_N = len(this_primary) - - #calculate partial correlation - one_pc_coefficient = 'NA' - one_pc_p = 1 - - try: - if method == 's': - result = R_pcorr_function(this_primary, this_target, this_control, method='s') - else: - result = R_pcorr_function(this_primary, this_target, this_control) - - #XZ: In very few cases, the returned coefficient is nan. - #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN - if result[0][0] == result[0][0]: - one_pc_coefficient = result[0][0] - #XZ: when the coefficient value is 1 (primary trait and target trait are the same), - #XZ: occationally, the returned p value is nan instead of 0. - if result[1][0] == result[1][0]: - one_pc_p = result[1][0] - elif abs(one_pc_coefficient - 1) < 0.0000001: - one_pc_p = 0 - except: - pass - - #calculate zero order correlation - one_corr_coefficient = 0 - one_corr_p = 1 - - try: - if method == 's': - R_result = R_corr_test(this_primary, this_target, method='spearman') - else: - R_result = R_corr_test(this_primary, this_target) - - one_corr_coefficient = R_result[3][0] - one_corr_p = R_result[2][0] - except: - pass - - traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ] - - allcorrelations.append(traitinfo) - - return allcorrelations - #End of function compute_partial - - - allcorrelations = [] - - target_trait_number = len(targetVals) - - if target_trait_number < 1000: - allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ) - else: - step = 1000 - job_number = math.ceil( float(target_trait_number)/step ) - - job_targetVals_lists = [] - job_targetNames_lists = [] - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, target_trait_number) - - one_job_targetVals_list = [] - one_job_targetNames_list = [] - - for i in range( starti, endi ): - one_job_targetVals_list.append( targetVals[i] ) - one_job_targetNames_list.append( targetNames[i] ) - - job_targetVals_lists.append( one_job_targetVals_list ) - job_targetNames_lists.append( one_job_targetNames_list ) - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - jobs = [] - results = [] - - for i, one_job_targetVals_list in enumerate( job_targetVals_lists ): - one_job_targetNames_list = job_targetNames_lists[i] - #pay attention to modules from outside - jobs.append( job_server.submit(func=compute_partial, args=( primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) ) - - for one_job in jobs: - one_result = one_job() - results.append( one_result ) - - for one_result in results: - for one_traitinfo in one_result: - allcorrelations.append( one_traitinfo ) - - return allcorrelations - - - -#XZ, April 30, 2010: The input primaryTrait and targetTrait are instance of webqtlTrait -#XZ: The primaryTrait and targetTrait should have executed retrieveData function -def calZeroOrderCorr (primaryTrait, targetTrait, method='pearson'): - - #primaryTrait.retrieveData() - - #there is no None value in primary_val - primary_strain, primary_val, primary_var = primaryTrait.exportInformative() - - #targetTrait.retrieveData() - - #there might be None value in target_val - target_val = targetTrait.exportData(primary_strain, type="val") - - R_primary = rpy2.robjects.FloatVector(range(len(primary_val))) - for i in range(len(primary_val)): - R_primary[i] = primary_val[i] - - N = len(target_val) - - if None in target_val: - goodIndex = [] - for i in range(len(target_val)): - if target_val[i] != None: - goodIndex.append(i) - - N = len(goodIndex) - - R_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - R_primary[i] = primary_val[goodIndex[i]] - - R_target = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - R_target[i] = target_val[goodIndex[i]] - - else: - R_target = rpy2.robjects.FloatVector(range(len(target_val))) - for i in range(len(target_val)): - R_target[i] = target_val[i] - - R_corr_test = rpy2.robjects.r['cor.test'] - - if method == 'spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - corr_result = [] - corr_result.append( R_result[3][0] ) - corr_result.append( N ) - corr_result.append( R_result[2][0] ) - - return corr_result - -##################################################################################### -#Input: primaryValue(list): one list of expression values of one probeSet, -# targetValue(list): one list of expression values of one probeSet, -# method(string): indicate correlation method ('pearson' or 'spearman') -#Output: corr_result(list): first item is Correlation Value, second item is tissue number, -# third item is PValue -#Function: get correlation value,Tissue quantity ,p value result by using R; -#Note : This function is special case since both primaryValue and targetValue are from -#the same dataset. So the length of these two parameters is the same. They are pairs. -#Also, in the datatable TissueProbeSetData, all Tissue values are loaded based on -#the same tissue order -##################################################################################### - -def calZeroOrderCorrForTiss (primaryValue=[], targetValue=[], method='pearson'): - - R_primary = rpy2.robjects.FloatVector(range(len(primaryValue))) - N = len(primaryValue) - for i in range(len(primaryValue)): - R_primary[i] = primaryValue[i] - - R_target = rpy2.robjects.FloatVector(range(len(targetValue))) - for i in range(len(targetValue)): - R_target[i]=targetValue[i] - - R_corr_test = rpy2.robjects.r['cor.test'] - if method =='spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - corr_result =[] - corr_result.append( R_result[3][0]) - corr_result.append( N ) - corr_result.append( R_result[2][0]) - - return corr_result - - - - -def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'): - - def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ): - - oneSymbolCorrDict = {} - oneSymbolPvalueDict = {} - - R_corr_test = rpy2.robjects.r['cor.test'] - - R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue))) - - for i in range(len(primaryTraitValue)): - R_primary[i] = primaryTraitValue[i] - - for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems(): - R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue))) - for i in range(len(oneTraitValue)): - R_target[i] = oneTraitValue[i] - - if method =='spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0] - oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0] - - return(oneSymbolCorrDict, oneSymbolPvalueDict) - - - - symbolCorrDict = {} - symbolPvalueDict = {} - - items_number = len(SymbolValueDict) - - if items_number <= 1000: - symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method) - else: - items_list = SymbolValueDict.items() - - step = 1000 - job_number = math.ceil( float(items_number)/step ) - - job_oneSymbolValueDict_list = [] - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, items_number) - - oneSymbolValueDict = {} - - for i in range( starti, endi ): - one_item = items_list[i] - one_symbol = one_item[0] - one_value = one_item[1] - oneSymbolValueDict[one_symbol] = one_value - - job_oneSymbolValueDict_list.append( oneSymbolValueDict ) - - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - jobs = [] - results = [] - - for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ): - - #pay attention to modules from outside - jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) ) - - for one_job in jobs: - one_result = one_job() - results.append( one_result ) - - for one_result in results: - oneSymbolCorrDict, oneSymbolPvalueDict = one_result - symbolCorrDict.update( oneSymbolCorrDict ) - symbolPvalueDict.update( oneSymbolPvalueDict ) - - return (symbolCorrDict, symbolPvalueDict) - -########################################################################### -#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId -#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict) -#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and -# getGeneSymbolTissueValueDict to build dict to get CorrPvArray -#Note: If there are multiple probesets for one gene, select the one with highest mean. -########################################################################### -def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): - Symbols ="" - symbolList =[] - geneIdDict ={} - dataIdDict = {} - ChrDict = {} - MbDict = {} - descDict = {} - pTargetDescDict = {} - - count = len(GeneNameLst) - - # Added by NL 01/06/2011 - # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value - # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable. - if count==0: - query=''' - select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description - from ( - select Symbol, max(Mean) as maxmean - from TissueProbeSetXRef - where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol) - as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; - '''%TissueProbeSetFreezeId - - else: - for i, item in enumerate(GeneNameLst): - - if i == count-1: - Symbols += "'%s'" %item - else: - Symbols += "'%s'," %item - - Symbols = "("+ Symbols+")" - query=''' - select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description - from ( - select Symbol, max(Mean) as maxmean - from TissueProbeSetXRef - where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol) - as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; - '''% (TissueProbeSetFreezeId,Symbols) - - try: - - cursor.execute(query) - results =cursor.fetchall() - resultCount = len(results) - # Key in all dicts is the lower-cased symbol - for i, item in enumerate(results): - symbol = item[0] - symbolList.append(symbol) - - key =symbol.lower() - geneIdDict[key]=item[1] - dataIdDict[key]=item[2] - ChrDict[key]=item[3] - MbDict[key]=item[4] - descDict[key]=item[5] - pTargetDescDict[key]=item[6] - - except: - symbolList = None - geneIdDict=None - dataIdDict=None - ChrDict=None - MbDict=None - descDict=None - pTargetDescDict=None - - return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict - -########################################################################### -#Input: cursor, symbolList (list), dataIdDict(Dict) -#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, -# key is symbol, value is one list of expression values of one probeSet; -#function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). -#Attention! All keys are lower case! -########################################################################### -def getSymbolValuePairDict(cursor=None,symbolList=None,dataIdDict={}): - symbolList = map(string.lower, symbolList) - symbolValuepairDict={} - valueList=[] - - for key in symbolList: - if dataIdDict.has_key(key): - DataId = dataIdDict[key] - - valueQuery = "select value from TissueProbeSetData where Id=%s" % DataId - try : - cursor.execute(valueQuery) - valueResults = cursor.fetchall() - for item in valueResults: - item =item[0] - valueList.append(item) - symbolValuepairDict[key] = valueList - valueList=[] - except: - symbolValuepairDict[key] = None - - return symbolValuepairDict - - -######################################################################################################## -#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol -#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. -# key is symbol, value is one list of expression values of one probeSet. -#function: wrapper function for getSymbolValuePairDict function -# build gene symbol list if necessary, cut it into small lists if necessary, -# then call getSymbolValuePairDict function and merge the results. -######################################################################################################## - -def getGeneSymbolTissueValueDict(cursor=None,symbolList=None,dataIdDict={}): - limitNum=1000 - count = len(symbolList) - - SymbolValuePairDict = {} - - if count !=0 and count <=limitNum: - SymbolValuePairDict = getSymbolValuePairDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) - - elif count >limitNum: - SymbolValuePairDict={} - n = count/limitNum - start =0 - stop =0 - - for i in range(n): - stop =limitNum*(i+1) - gList1 = symbolList[start:stop] - PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) - start =limitNum*(i+1) - - SymbolValuePairDict.update(PairDict1) - - if stop < count: - stop = count - gList2 = symbolList[start:stop] - PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) - SymbolValuePairDict.update(PairDict2) - - return SymbolValuePairDict - -######################################################################################################## -#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) -#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. -# key is symbol, value is one list of expression values of one probeSet. -#function: wrapper function of getGeneSymbolTissueValueDict function -# for CorrelationPage.py -######################################################################################################## - -def getGeneSymbolTissueValueDictForTrait(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): - SymbolValuePairDict={} - symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo(cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) - if symbolList: - SymbolValuePairDict = getGeneSymbolTissueValueDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) - return SymbolValuePairDict - -######################################################################################################## -#Input: cursor(cursor): MySQL connnection cursor; -# priGeneSymbolList(list): one list of gene symbol; -# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair, -# key is symbol, value is one list of expression values of one probeSet; -#Output: corrArray(array): array of Correlation Value, -# pvArray(array): array of PValue; -#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss -######################################################################################################## - -def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}): - # setting initial value for corrArray, pvArray equal to 0 - Num = len(priGeneSymbolList) - - corrArray = [([0] * (Num))[:] for i in range(Num)] - pvArray = [([0] * (Num))[:] for i in range(Num)] - i = 0 - for pkey in priGeneSymbolList: - j = 0 - pkey = pkey.strip().lower()# key in symbolValuepairDict is low case - if symbolValuepairDict.has_key(pkey): - priValue = symbolValuepairDict[pkey] - for tkey in priGeneSymbolList: - tkey = tkey.strip().lower()# key in symbolValuepairDict is low case - if priValue and symbolValuepairDict.has_key(tkey): - tarValue = symbolValuepairDict[tkey] - - if tarValue: - if i>j: - # corrArray stores Pearson Correlation values - # pvArray stores Pearson P-Values - pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue) - corrArray[i][j] =pcorr_result[0] - pvArray[i][j] =pcorr_result[2] - elif i 1: + + #XZ: reset the precision of values and convert to string type + for oneTraitVal in controlVals: + for oneStrainVal in oneTraitVal: + oneStrainVal = '%.3f' % oneStrainVal + + for i, oneTraitVal in enumerate( controlVals ): + for j in range(i+1, controlTraitNumber): + if oneTraitVal == controlVals[j]: + nameOfIdenticalTraits.append(controlNames[i]) + nameOfIdenticalTraits.append(controlNames[j]) + + return nameOfIdenticalTraits + +#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. +#else, the returned list has two elements of control trait name. +#primaryVal is of list type. It contains value of primary trait. +#primaryName is of string type. +#controlVals is of list type. Each element is list too. Each element contain value of one control trait. +#controlNames is of list type. +def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ): + nameOfIdenticalTraits = [] + + #XZ: reset the precision of values and convert to string type + for oneStrainVal in primaryVal: + oneStrainVal = '%.3f' % oneStrainVal + + for oneTraitVal in controlVals: + for oneStrainVal in oneTraitVal: + oneStrainVal = '%.3f' % oneStrainVal + + controlTraitNumber = len(controlVals) + + if controlTraitNumber > 1: + for i, oneTraitVal in enumerate( controlVals ): + for j in range(i+1, controlTraitNumber): + if oneTraitVal == controlVals[j]: + nameOfIdenticalTraits.append(controlNames[i]) + nameOfIdenticalTraits.append(controlNames[j]) + break + + if len(nameOfIdenticalTraits) == 0: + for i, oneTraitVal in enumerate( controlVals ): + if primaryVal == oneTraitVal: + nameOfIdenticalTraits.append(primaryName) + nameOfIdenticalTraits.append(controlNames[i]) + break + + return nameOfIdenticalTraits + + + +#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order. +#XZ: No value in primaryVal and controlVals could be None. + +def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'): + + def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ): + + rpy2.robjects.r(""" +pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){ + # The partial correlation coefficient between x and y given z + # + # pcor.test is free and comes with ABSOLUTELY NO WARRANTY. + # + # x and y should be vectors + # + # z can be either a vector or a matrix + # + # use: There are two methods to calculate the partial correlation coefficient. + # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec"). + # Default is "mat". + # + # method: There are three ways to calculate the correlation coefficient, + # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods. + # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis. + # Default is "p". + # + # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z). + # If not, the missing samples will be removed just when the correlation coefficient is calculated. + # However, the number of samples for the p-value is the number of samples after removing + # all the missing samples from the whole dataset. + # Default is "T". + + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(use == "mat"){ + p.use <- "Var-Cov matrix" + pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm) + }else if(use == "rec"){ + p.use <- "Recursive formula" + pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm) + }else{ + stop("use should be either rec or mat!\n") + } + + # print the method + if(gregexpr("p",method)[[1]][1] == 1){ + p.method <- "Pearson" + }else if(gregexpr("s",method)[[1]][1] == 1){ + p.method <- "Spearman" + }else if(gregexpr("k",method)[[1]][1] == 1){ + p.method <- "Kendall" + }else{ + stop("method should be pearson or spearman or kendall!\n") + } + + # sample number + n <- dim(na.omit(data.frame(x,y,z)))[1] + + # given variables' number + gn <- dim(z)[2] + + # p-value + if(p.method == "Kendall"){ + statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn))) + p.value <- 2*pnorm(-abs(statistic)) + + }else{ + statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2)) + p.value <- 2*pnorm(-abs(statistic)) + } + + data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use) +} + +# By using var-cov matrix +pcor.mat <- function(x,y,z,method="p",na.rm=T){ + + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(dim(z)[2] == 0){ + stop("There should be given data\n") + } + + data <- data.frame(x,y,z) + + if(na.rm == T){ + data = na.omit(data) + } + + xdata <- na.omit(data.frame(data[,c(1,2)])) + Sxx <- cov(xdata,xdata,m=method) + + xzdata <- na.omit(data) + xdata <- data.frame(xzdata[,c(1,2)]) + zdata <- data.frame(xzdata[,-c(1,2)]) + Sxz <- cov(xdata,zdata,m=method) + + zdata <- na.omit(data.frame(data[,-c(1,2)])) + Szz <- cov(zdata,zdata,m=method) + + # is Szz positive definite? + zz.ev <- eigen(Szz)$values + if(min(zz.ev)[1]<0){ + stop("\'Szz\' is not positive definite!\n") + } + + # partial correlation + Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz) + + rxx.z <- cov2cor(Sxx.z)[1,2] + + rxx.z +} + +# By using recursive formula +pcor.rec <- function(x,y,z,method="p",na.rm=T){ + # + + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(dim(z)[2] == 0){ + stop("There should be given data\n") + } + + data <- data.frame(x,y,z) + + if(na.rm == T){ + data = na.omit(data) + } + + # recursive formula + if(dim(z)[2] == 1){ + tdata <- na.omit(data.frame(data[,1],data[,2])) + rxy <- cor(tdata[,1],tdata[,2],m=method) + + tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)])) + rxz <- cor(tdata[,1],tdata[,2],m=method) + + tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)])) + ryz <- cor(tdata[,1],tdata[,2],m=method) + + rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) ) + + return(rxy.z) + }else{ + x <- c(data[,1]) + y <- c(data[,2]) + z0 <- c(data[,3]) + zc <- as.data.frame(data[,-c(1,2,3)]) + + rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm) + rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm) + ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm) + + rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) ) + return(rxy.z) + } +} +""") + + R_pcorr_function = rpy2.robjects.r['pcor.test'] + R_corr_test = rpy2.robjects.r['cor.test'] + + primary = rpy2.robjects.FloatVector(range(len(primaryVal))) + for i in range(len(primaryVal)): + primary[i] = primaryVal[i] + + control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals)) + for i in range(len(controlVals)): + for j in range(len(controlVals[0])): + control[i*len(controlVals[0]) + j] = controlVals[i][j] + + allcorrelations = [] + + for targetIndex, oneTargetVals in enumerate(targetVals): + + this_primary = None + this_control = None + this_target = None + + if None in oneTargetVals: + + goodIndex = [] + for i in range(len(oneTargetVals)): + if oneTargetVals[i] != None: + goodIndex.append(i) + + this_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + this_primary[i] = primaryVal[goodIndex[i]] + + this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals)) + for i in range(len(controlVals)): + for j in range(len(goodIndex)): + this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]] + + this_target = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + this_target[i] = oneTargetVals[goodIndex[i]] + + else: + this_primary = primary + this_control = control + this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals))) + for i in range(len(oneTargetVals)): + this_target[i] = oneTargetVals[i] + + one_name = targetNames[targetIndex] + one_N = len(this_primary) + + #calculate partial correlation + one_pc_coefficient = 'NA' + one_pc_p = 1 + + try: + if method == 's': + result = R_pcorr_function(this_primary, this_target, this_control, method='s') + else: + result = R_pcorr_function(this_primary, this_target, this_control) + + #XZ: In very few cases, the returned coefficient is nan. + #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN + if result[0][0] == result[0][0]: + one_pc_coefficient = result[0][0] + #XZ: when the coefficient value is 1 (primary trait and target trait are the same), + #XZ: occationally, the returned p value is nan instead of 0. + if result[1][0] == result[1][0]: + one_pc_p = result[1][0] + elif abs(one_pc_coefficient - 1) < 0.0000001: + one_pc_p = 0 + except: + pass + + #calculate zero order correlation + one_corr_coefficient = 0 + one_corr_p = 1 + + try: + if method == 's': + R_result = R_corr_test(this_primary, this_target, method='spearman') + else: + R_result = R_corr_test(this_primary, this_target) + + one_corr_coefficient = R_result[3][0] + one_corr_p = R_result[2][0] + except: + pass + + traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ] + + allcorrelations.append(traitinfo) + + return allcorrelations + #End of function compute_partial + + + allcorrelations = [] + + target_trait_number = len(targetVals) + + if target_trait_number < 1000: + allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ) + else: + step = 1000 + job_number = math.ceil( float(target_trait_number)/step ) + + job_targetVals_lists = [] + job_targetNames_lists = [] + + for job_index in range( int(job_number) ): + starti = job_index*step + endi = min((job_index+1)*step, target_trait_number) + + one_job_targetVals_list = [] + one_job_targetNames_list = [] + + for i in range( starti, endi ): + one_job_targetVals_list.append( targetVals[i] ) + one_job_targetNames_list.append( targetNames[i] ) + + job_targetVals_lists.append( one_job_targetVals_list ) + job_targetNames_lists.append( one_job_targetNames_list ) + + ppservers = () + # Creates jobserver with automatically detected number of workers + job_server = pp.Server(ppservers=ppservers) + + jobs = [] + results = [] + + for i, one_job_targetVals_list in enumerate( job_targetVals_lists ): + one_job_targetNames_list = job_targetNames_lists[i] + #pay attention to modules from outside + jobs.append( job_server.submit(func=compute_partial, args=( primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) ) + + for one_job in jobs: + one_result = one_job() + results.append( one_result ) + + for one_result in results: + for one_traitinfo in one_result: + allcorrelations.append( one_traitinfo ) + + return allcorrelations + + + +#XZ, April 30, 2010: The input primaryTrait and targetTrait are instance of webqtlTrait +#XZ: The primaryTrait and targetTrait should have executed retrieveData function +def calZeroOrderCorr (primaryTrait, targetTrait, method='pearson'): + + #primaryTrait.retrieveData() + + #there is no None value in primary_val + primary_strain, primary_val, primary_var = primaryTrait.exportInformative() + + #targetTrait.retrieveData() + + #there might be None value in target_val + target_val = targetTrait.exportData(primary_strain, type="val") + + R_primary = rpy2.robjects.FloatVector(range(len(primary_val))) + for i in range(len(primary_val)): + R_primary[i] = primary_val[i] + + N = len(target_val) + + if None in target_val: + goodIndex = [] + for i in range(len(target_val)): + if target_val[i] != None: + goodIndex.append(i) + + N = len(goodIndex) + + R_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + R_primary[i] = primary_val[goodIndex[i]] + + R_target = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + R_target[i] = target_val[goodIndex[i]] + + else: + R_target = rpy2.robjects.FloatVector(range(len(target_val))) + for i in range(len(target_val)): + R_target[i] = target_val[i] + + R_corr_test = rpy2.robjects.r['cor.test'] + + if method == 'spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + corr_result = [] + corr_result.append( R_result[3][0] ) + corr_result.append( N ) + corr_result.append( R_result[2][0] ) + + return corr_result + +##################################################################################### +#Input: primaryValue(list): one list of expression values of one probeSet, +# targetValue(list): one list of expression values of one probeSet, +# method(string): indicate correlation method ('pearson' or 'spearman') +#Output: corr_result(list): first item is Correlation Value, second item is tissue number, +# third item is PValue +#Function: get correlation value,Tissue quantity ,p value result by using R; +#Note : This function is special case since both primaryValue and targetValue are from +#the same dataset. So the length of these two parameters is the same. They are pairs. +#Also, in the datatable TissueProbeSetData, all Tissue values are loaded based on +#the same tissue order +##################################################################################### + +def calZeroOrderCorrForTiss (primaryValue=[], targetValue=[], method='pearson'): + + R_primary = rpy2.robjects.FloatVector(range(len(primaryValue))) + N = len(primaryValue) + for i in range(len(primaryValue)): + R_primary[i] = primaryValue[i] + + R_target = rpy2.robjects.FloatVector(range(len(targetValue))) + for i in range(len(targetValue)): + R_target[i]=targetValue[i] + + R_corr_test = rpy2.robjects.r['cor.test'] + if method =='spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + corr_result =[] + corr_result.append( R_result[3][0]) + corr_result.append( N ) + corr_result.append( R_result[2][0]) + + return corr_result + + + + +def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'): + + def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ): + + oneSymbolCorrDict = {} + oneSymbolPvalueDict = {} + + R_corr_test = rpy2.robjects.r['cor.test'] + + R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue))) + + for i in range(len(primaryTraitValue)): + R_primary[i] = primaryTraitValue[i] + + for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems(): + R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue))) + for i in range(len(oneTraitValue)): + R_target[i] = oneTraitValue[i] + + if method =='spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0] + oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0] + + return(oneSymbolCorrDict, oneSymbolPvalueDict) + + + + symbolCorrDict = {} + symbolPvalueDict = {} + + items_number = len(SymbolValueDict) + + if items_number <= 1000: + symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method) + else: + items_list = SymbolValueDict.items() + + step = 1000 + job_number = math.ceil( float(items_number)/step ) + + job_oneSymbolValueDict_list = [] + + for job_index in range( int(job_number) ): + starti = job_index*step + endi = min((job_index+1)*step, items_number) + + oneSymbolValueDict = {} + + for i in range( starti, endi ): + one_item = items_list[i] + one_symbol = one_item[0] + one_value = one_item[1] + oneSymbolValueDict[one_symbol] = one_value + + job_oneSymbolValueDict_list.append( oneSymbolValueDict ) + + + ppservers = () + # Creates jobserver with automatically detected number of workers + job_server = pp.Server(ppservers=ppservers) + + jobs = [] + results = [] + + for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ): + + #pay attention to modules from outside + jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) ) + + for one_job in jobs: + one_result = one_job() + results.append( one_result ) + + for one_result in results: + oneSymbolCorrDict, oneSymbolPvalueDict = one_result + symbolCorrDict.update( oneSymbolCorrDict ) + symbolPvalueDict.update( oneSymbolPvalueDict ) + + return (symbolCorrDict, symbolPvalueDict) + +########################################################################### +#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId +#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict) +#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and +# getGeneSymbolTissueValueDict to build dict to get CorrPvArray +#Note: If there are multiple probesets for one gene, select the one with highest mean. +########################################################################### +def getTissueProbeSetXRefInfo(GeneNameLst=[],TissueProbeSetFreezeId=0): + Symbols ="" + symbolList =[] + geneIdDict ={} + dataIdDict = {} + ChrDict = {} + MbDict = {} + descDict = {} + pTargetDescDict = {} + + count = len(GeneNameLst) + + # Added by NL 01/06/2011 + # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value + # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable. + if count==0: + query=''' + select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; + '''%TissueProbeSetFreezeId + + else: + for i, item in enumerate(GeneNameLst): + + if i == count-1: + Symbols += "'%s'" %item + else: + Symbols += "'%s'," %item + + Symbols = "("+ Symbols+")" + query=''' + select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; + '''% (TissueProbeSetFreezeId,Symbols) + + try: + cursor.execute(query) + results =cursor.fetchall() + resultCount = len(results) + # Key in all dicts is the lower-cased symbol + for i, item in enumerate(results): + symbol = item[0] + symbolList.append(symbol) + + key =symbol.lower() + geneIdDict[key]=item[1] + dataIdDict[key]=item[2] + ChrDict[key]=item[3] + MbDict[key]=item[4] + descDict[key]=item[5] + pTargetDescDict[key]=item[6] + + except: + symbolList = None + geneIdDict=None + dataIdDict=None + ChrDict=None + MbDict=None + descDict=None + pTargetDescDict=None + + return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict + +########################################################################### +#Input: cursor, symbolList (list), dataIdDict(Dict) +#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, +# key is symbol, value is one list of expression values of one probeSet; +#function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). +#Attention! All keys are lower case! +########################################################################### +def get_symbol_value_pairs(tissue_data): + + id_list = [tissue_data[symbol.lower()].data_id for item in tissue_data] + + symbol_value_pairs = {} + value_list=[] + + query = """SELECT value, id + FROM TissueProbeSetData + WHERE Id IN {}""".format(create_in_clause(id_list)) + + try : + results = g.db.execute(query).fetchall() + for result in results: + value_list.append(result.value) + symbol_value_pairs[symbol] = value_list + except: + symbol_value_pairs[symbol] = None + + #for symbol in symbol_list: + # if tissue_data.has_key(symbol): + # data_id = tissue_data[symbol].data_id + # + # query = """select value, id + # from TissueProbeSetData + # where Id={}""".format(escape(data_id)) + # try : + # results = g.db.execute(query).fetchall() + # for item in results: + # item = item[0] + # value_list.append(item) + # symbol_value_pairs[symbol] = value_list + # value_list=[] + # except: + # symbol_value_pairs[symbol] = None + + return symbol_value_pairs + + +######################################################################################################## +#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol +#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. +# key is symbol, value is one list of expression values of one probeSet. +#function: wrapper function for getSymbolValuePairDict function +# build gene symbol list if necessary, cut it into small lists if necessary, +# then call getSymbolValuePairDict function and merge the results. +######################################################################################################## + +def get_trait_symbol_and_tissue_values(symbol_list=None): + SymbolValuePairDict={} + + tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list) + + #symbolList, + #geneIdDict, + #dataIdDict, + #ChrDict, + #MbDict, + #descDict, + #pTargetDescDict = getTissueProbeSetXRefInfo( + # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) + + if len(tissue_data.gene_symbols): + return get_symbol_value_pairs(tissue_data) + + #limit_num=1000 + #count = len(symbol_list) + # + #symbol_value_pairs = {} + # + #if count !=0 and count <= limit_num: + # symbol_value_pairs = getSymbolValuePairDict(cursor=cursor,symbolList=symbol_list,dataIdDict=dataIdDict) + # + #elif count > limit_num: + # n = count/limit_num + # start = 0 + # stop = 0 + # + # for i in range(n): + # stop =limit_num*(i+1) + # gList1 = symbolList[start:stop] + # PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) + # start =limit_num*(i+1) + # + # SymbolValuePairDict.update(PairDict1) + # + # if stop < count: + # stop = count + # gList2 = symbolList[start:stop] + # PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) + # SymbolValuePairDict.update(PairDict2) + # + #return SymbolValuePairDict + +######################################################################################################## +#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) +#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. +# key is symbol, value is one list of expression values of one probeSet. +#function: wrapper function of getGeneSymbolTissueValueDict function +# for CorrelationPage.py +######################################################################################################## + +#def get_trait_symbol_and_tissue_values(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): +# SymbolValuePairDict={} +# +# symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo( +# cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) +# +# if symbolList: +# SymbolValuePairDict = get_gene_symbol_and_tissue_values(symbolList=symbolList, +# dataIdDict=dataIdDict) +# +# return SymbolValuePairDict + +######################################################################################################## +#Input: cursor(cursor): MySQL connnection cursor; +# priGeneSymbolList(list): one list of gene symbol; +# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair, +# key is symbol, value is one list of expression values of one probeSet; +#Output: corrArray(array): array of Correlation Value, +# pvArray(array): array of PValue; +#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss +######################################################################################################## + +def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}): + # setting initial value for corrArray, pvArray equal to 0 + Num = len(priGeneSymbolList) + + corrArray = [([0] * (Num))[:] for i in range(Num)] + pvArray = [([0] * (Num))[:] for i in range(Num)] + i = 0 + for pkey in priGeneSymbolList: + j = 0 + pkey = pkey.strip().lower()# key in symbolValuepairDict is low case + if symbolValuepairDict.has_key(pkey): + priValue = symbolValuepairDict[pkey] + for tkey in priGeneSymbolList: + tkey = tkey.strip().lower()# key in symbolValuepairDict is low case + if priValue and symbolValuepairDict.has_key(tkey): + tarValue = symbolValuepairDict[tkey] + + if tarValue: + if i>j: + # corrArray stores Pearson Correlation values + # pvArray stores Pearson P-Values + pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue) + corrArray[i][j] =pcorr_result[0] + pvArray[i][j] =pcorr_result[2] + elif i