diff options
author | zsloan | 2018-04-11 20:39:24 +0000 |
---|---|---|
committer | zsloan | 2018-04-11 20:39:24 +0000 |
commit | 9750e63d64849d7fa9e1e681f56b73cae96905df (patch) | |
tree | daa743957497d92b8a3573b00405593d3454cdd0 | |
parent | d8cec0ef94b7683f42946ce182a937484ad1034a (diff) | |
download | genenetwork2-9750e63d64849d7fa9e1e681f56b73cae96905df.tar.gz |
Added tissue correlation p value to correlation page results, since it was missing before
Removed a bunch of unused coded from all correlation-related files and the ctl analysis code
-rw-r--r-- | wqflask/base/mrna_assay_tissue_data.py | 65 | ||||
-rw-r--r-- | wqflask/wqflask/collect.py | 8 | ||||
-rw-r--r-- | wqflask/wqflask/correlation/corr_scatter_plot.py | 16 | ||||
-rw-r--r-- | wqflask/wqflask/correlation/correlation_functions.py | 786 | ||||
-rw-r--r-- | wqflask/wqflask/correlation/show_corr_results.py | 844 | ||||
-rw-r--r-- | wqflask/wqflask/correlation_matrix/show_corr_matrix.py | 1 | ||||
-rw-r--r-- | wqflask/wqflask/ctl/ctl_analysis.py | 63 | ||||
-rw-r--r-- | wqflask/wqflask/templates/correlation_page.html | 9 |
8 files changed, 33 insertions, 1759 deletions
diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py index 53f7c16a..6fec5dcd 100644 --- a/wqflask/base/mrna_assay_tissue_data.py +++ b/wqflask/base/mrna_assay_tissue_data.py @@ -21,17 +21,8 @@ class MrnaAssayTissueData(object): if self.gene_symbols == None: self.gene_symbols = [] - #print("self.gene_symbols:", self.gene_symbols) - self.data = collections.defaultdict(Bunch) - #self.gene_id_dict ={} - #self.data_id_dict = {} - #self.chr_dict = {} - #self.mb_dict = {} - #self.desc_dict = {} - #self.probe_target_desc_dict = {} - query = '''select t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, t.description, t.Probe_Target_Description from ( select Symbol, max(Mean) as maxmean @@ -51,7 +42,6 @@ class MrnaAssayTissueData(object): in_clause = db_tools.create_in_clause(gene_symbols) #ZS: This was in the query, not sure why: http://docs.python.org/2/library/string.html?highlight=lower#string.lower - query += ''' Symbol in {} group by Symbol) as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; @@ -66,9 +56,7 @@ class MrnaAssayTissueData(object): for result in results: symbol = result[0] - #if symbol.lower() in [gene_symbol.lower() for gene_symbol in gene_symbols]: if symbol.lower() in lower_symbols: - #gene_symbols.append(symbol) symbol = symbol.lower() self.data[symbol].gene_id = result.GeneId @@ -78,8 +66,6 @@ class MrnaAssayTissueData(object): self.data[symbol].description = result.description self.data[symbol].probe_target_description = result.Probe_Target_Description - print("self.data: ", pf(self.data)) - ########################################################################### #Input: cursor, symbolList (list), dataIdDict(Dict) #output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, @@ -106,53 +92,4 @@ class MrnaAssayTissueData(object): else: symbol_values_dict[result.Symbol.lower()].append(result.value) - #for symbol in self.data: - # data_id = self.data[symbol].data_id - # symbol_values_dict[symbol] = self.get_tissue_values(data_id) - - - return symbol_values_dict - - - #def get_tissue_values(self, data_id): - # """Gets the tissue values for a particular gene""" - # - # tissue_values=[] - # - # query = """SELECT value, id - # FROM TissueProbeSetData - # WHERE Id IN {}""".format(db_tools.create_in_clause(data_id)) - # - # #try : - # results = g.db.execute(query).fetchall() - # for result in results: - # tissue_values.append(result.value) - # #symbol_values_dict[symbol] = value_list - # #except: - # # symbol_values_pairs[symbol] = None - # - # return tissue_values - -######################################################################################################## -#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol -#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. -# key is symbol, value is one list of expression values of one probeSet. -#function: wrapper function for getSymbolValuePairDict function -# build gene symbol list if necessary, cut it into small lists if necessary, -# then call getSymbolValuePairDict function and merge the results. -######################################################################################################## - -#def get_trait_symbol_and_tissue_values(symbol_list=None): -# tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list) -# -# #symbolList, -# #geneIdDict, -# #dataIdDict, -# #ChrDict, -# #MbDict, -# #descDict, -# #pTargetDescDict = getTissueProbeSetXRefInfo( -# # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) -# -# if len(tissue_data.gene_symbols): -# return get_symbol_values_pairs(tissue_data) + return symbol_values_dict
\ No newline at end of file diff --git a/wqflask/wqflask/collect.py b/wqflask/wqflask/collect.py index ffc698de..04035e96 100644 --- a/wqflask/wqflask/collect.py +++ b/wqflask/wqflask/collect.py @@ -40,13 +40,6 @@ import logging from utility.logger import getLogger logger = getLogger(__name__) -def get_collection(): - if g.user_session.logged_in: - return UserCollection() - else: - return AnonCollection() - #else: - # CauseError class AnonCollection(object): """User is not logged in""" @@ -282,7 +275,6 @@ def create_new(collection_name): db_session.commit() return redirect(url_for('view_collection', uc_id=uc.id)) else: - current_collections = user_manager.AnonUser().get_collections() ac = AnonCollection(collection_name) ac.changed_timestamp = datetime.datetime.utcnow().strftime('%b %d %Y %I:%M%p') ac.add_traits(params) diff --git a/wqflask/wqflask/correlation/corr_scatter_plot.py b/wqflask/wqflask/correlation/corr_scatter_plot.py index a08cd759..94711c67 100644 --- a/wqflask/wqflask/correlation/corr_scatter_plot.py +++ b/wqflask/wqflask/correlation/corr_scatter_plot.py @@ -19,37 +19,31 @@ class CorrScatterPlot(object): width = int(params['width']) except: width = 800 - self.width = width try: height = int(params['height']) except: height = 600 - self.height = height try: circle_color = params['circle_color'] except: circle_color = '#3D85C6' - self.circle_color = circle_color try: circle_radius = int(params['circle_radius']) except: circle_radius = 5 - self.circle_radius = circle_radius try: line_color = params['line_color'] except: line_color = '#FF0000' - self.line_color = line_color try: line_width = int(params['line_width']) except: line_width = 1 - self.line_width = line_width samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(self.trait_1.data, self.trait_2.data) @@ -66,14 +60,14 @@ class CorrScatterPlot(object): x = np.array(vals_1) y = np.array(vals_2) - slope, intercept, r_value, p_value, std_err = stats.linregress(x, y) + slope, intercept, r_value, p_value, _std_err = stats.linregress(x, y) rx = stats.rankdata(x) ry = stats.rankdata(y) self.rdata = [] self.rdata.append(rx.tolist()) self.rdata.append(ry.tolist()) - srslope, srintercept, srr_value, srp_value, srstd_err = stats.linregress(rx, ry) + srslope, srintercept, srr_value, srp_value, _srstd_err = stats.linregress(rx, ry) self.js_data = dict( data = self.data, @@ -86,17 +80,17 @@ class CorrScatterPlot(object): num_overlap = num_overlap, vals_1 = vals_1, vals_2 = vals_2, - + slope = slope, intercept = intercept, r_value = r_value, p_value = p_value, - + srslope = srslope, srintercept = srintercept, srr_value = srr_value, srp_value = srp_value, - + width = width, height = height, circle_color = circle_color, diff --git a/wqflask/wqflask/correlation/correlation_functions.py b/wqflask/wqflask/correlation/correlation_functions.py index 1ee9b558..06dec795 100644 --- a/wqflask/wqflask/correlation/correlation_functions.py +++ b/wqflask/wqflask/correlation/correlation_functions.py @@ -28,468 +28,12 @@ from __future__ import absolute_import, print_function, division import math import rpy2.robjects -import pp import string -from utility import webqtlUtil from base.mrna_assay_tissue_data import MrnaAssayTissueData -from base.trait import GeneralTrait -from db import webqtlDatabaseFunction from flask import Flask, g -#XZ: The input 'controls' is String. It contains the full name of control traits. -#XZ: The input variable 'strainlst' is List. It contains the strain names of primary trait. -#XZ: The returned tcstrains is the list of list [[],[]...]. So are tcvals and tcvars. The last returned parameter is list of numbers. -#XZ, 03/29/2010: For each returned control trait, there is no None value in it. -def controlStrains(controls, strainlst): - - controls = controls.split(',') - - cvals = {} - for oneTraitName in controls: - oneTrait = webqtlTrait(fullname=oneTraitName, cursor=webqtlDatabaseFunction.getCursor() ) - oneTrait.retrieveData() - cvals[oneTraitName] = oneTrait.data - - tcstrains = [] - tcvals = [] - tcvars = [] - - for oneTraitName in controls: - strains = [] - vals = [] - vars = [] - - for _strain in strainlst: - if cvals[oneTraitName].has_key(_strain): - _val = cvals[oneTraitName][_strain].val - if _val != None: - strains.append(_strain) - vals.append(_val) - vars.append(None) - - tcstrains.append(strains) - tcvals.append(vals) - tcvars.append(vars) - - return tcstrains, tcvals, tcvars, [len(x) for x in tcstrains] - - - -#XZ, 03/29/2010: After execution of functon "controlStrains" and "fixStrains", primary trait and control traits have the same strains and in the same order. There is no 'None' value in them. -def fixStrains(_strains,_controlstrains,_vals,_controlvals,_vars,_controlvars): - """Corrects strains, vals, and vars so that all contrain only those strains common - to the reference trait and all control traits.""" - - def dictify(strains,vals,vars): - subdict = {} - for i in xrange(len(strains)): - subdict[strains[i]] = (vals[i],vars[i]) - return subdict - - #XZ: The 'dicts' is a list of dictionary. The first element is the dictionary of reference trait. The rest elements are for control traits. - dicts = [] - dicts.append(dictify(_strains,_vals,_vars)) - - nCstrains = len(_controlstrains) - for i in xrange(nCstrains): - dicts.append(dictify(_controlstrains[i],_controlvals[i],_controlvars[i])) - - _newstrains = [] - _vals = [] - _vars = [] - _controlvals = [[] for x in xrange(nCstrains)] - _controlvars = [[] for x in xrange(nCstrains)] - - for strain in _strains: - inall = True - for d in dicts: - if strain not in d: - inall = False - break - if inall: - _newstrains.append(strain) - _vals.append(dicts[0][strain][0]) - _vars.append(dicts[0][strain][1]) - for i in xrange(nCstrains): - _controlvals[i].append(dicts[i+1][strain][0]) - _controlvars[i].append(dicts[i+1][strain][1]) - - return _newstrains, _vals, _controlvals, _vars, _controlvars - - -#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. -#else, the returned list has two elements of control trait name. -def findIdenticalControlTraits ( controlVals, controlNames ): - nameOfIdenticalTraits = [] - - controlTraitNumber = len(controlVals) - - if controlTraitNumber > 1: - - #XZ: reset the precision of values and convert to string type - for oneTraitVal in controlVals: - for oneStrainVal in oneTraitVal: - oneStrainVal = '%.3f' % oneStrainVal - - for i, oneTraitVal in enumerate( controlVals ): - for j in range(i+1, controlTraitNumber): - if oneTraitVal == controlVals[j]: - nameOfIdenticalTraits.append(controlNames[i]) - nameOfIdenticalTraits.append(controlNames[j]) - - return nameOfIdenticalTraits - -#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. -#else, the returned list has two elements of control trait name. -#primaryVal is of list type. It contains value of primary trait. -#primaryName is of string type. -#controlVals is of list type. Each element is list too. Each element contain value of one control trait. -#controlNames is of list type. -def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ): - nameOfIdenticalTraits = [] - - #XZ: reset the precision of values and convert to string type - for oneStrainVal in primaryVal: - oneStrainVal = '%.3f' % oneStrainVal - - for oneTraitVal in controlVals: - for oneStrainVal in oneTraitVal: - oneStrainVal = '%.3f' % oneStrainVal - - controlTraitNumber = len(controlVals) - - if controlTraitNumber > 1: - for i, oneTraitVal in enumerate( controlVals ): - for j in range(i+1, controlTraitNumber): - if oneTraitVal == controlVals[j]: - nameOfIdenticalTraits.append(controlNames[i]) - nameOfIdenticalTraits.append(controlNames[j]) - break - - if len(nameOfIdenticalTraits) == 0: - for i, oneTraitVal in enumerate( controlVals ): - if primaryVal == oneTraitVal: - nameOfIdenticalTraits.append(primaryName) - nameOfIdenticalTraits.append(controlNames[i]) - break - - return nameOfIdenticalTraits - - - -#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order. -#XZ: No value in primaryVal and controlVals could be None. - -def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'): - - def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ): - - rpy2.robjects.r(""" -pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){ - # The partial correlation coefficient between x and y given z - # - # pcor.test is free and comes with ABSOLUTELY NO WARRANTY. - # - # x and y should be vectors - # - # z can be either a vector or a matrix - # - # use: There are two methods to calculate the partial correlation coefficient. - # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec"). - # Default is "mat". - # - # method: There are three ways to calculate the correlation coefficient, - # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods. - # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis. - # Default is "p". - # - # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z). - # If not, the missing samples will be removed just when the correlation coefficient is calculated. - # However, the number of samples for the p-value is the number of samples after removing - # all the missing samples from the whole dataset. - # Default is "T". - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(use == "mat"){ - p.use <- "Var-Cov matrix" - pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm) - }else if(use == "rec"){ - p.use <- "Recursive formula" - pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm) - }else{ - stop("use should be either rec or mat!\n") - } - - # print the method - if(gregexpr("p",method)[[1]][1] == 1){ - p.method <- "Pearson" - }else if(gregexpr("s",method)[[1]][1] == 1){ - p.method <- "Spearman" - }else if(gregexpr("k",method)[[1]][1] == 1){ - p.method <- "Kendall" - }else{ - stop("method should be pearson or spearman or kendall!\n") - } - - # sample number - n <- dim(na.omit(data.frame(x,y,z)))[1] - - # given variables' number - gn <- dim(z)[2] - - # p-value - if(p.method == "Kendall"){ - statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn))) - p.value <- 2*pnorm(-abs(statistic)) - - }else{ - statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2)) - p.value <- 2*pnorm(-abs(statistic)) - } - - data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use) -} - -# By using var-cov matrix -pcor.mat <- function(x,y,z,method="p",na.rm=T){ - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(dim(z)[2] == 0){ - stop("There should be given data\n") - } - - data <- data.frame(x,y,z) - - if(na.rm == T){ - data = na.omit(data) - } - - xdata <- na.omit(data.frame(data[,c(1,2)])) - Sxx <- cov(xdata,xdata,m=method) - - xzdata <- na.omit(data) - xdata <- data.frame(xzdata[,c(1,2)]) - zdata <- data.frame(xzdata[,-c(1,2)]) - Sxz <- cov(xdata,zdata,m=method) - - zdata <- na.omit(data.frame(data[,-c(1,2)])) - Szz <- cov(zdata,zdata,m=method) - - # is Szz positive definite? - zz.ev <- eigen(Szz)$values - if(min(zz.ev)[1]<0){ - stop("\'Szz\' is not positive definite!\n") - } - - # partial correlation - Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz) - - rxx.z <- cov2cor(Sxx.z)[1,2] - - rxx.z -} - -# By using recursive formula -pcor.rec <- function(x,y,z,method="p",na.rm=T){ - # - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(dim(z)[2] == 0){ - stop("There should be given data\n") - } - - data <- data.frame(x,y,z) - - if(na.rm == T){ - data = na.omit(data) - } - - # recursive formula - if(dim(z)[2] == 1){ - tdata <- na.omit(data.frame(data[,1],data[,2])) - rxy <- cor(tdata[,1],tdata[,2],m=method) - - tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)])) - rxz <- cor(tdata[,1],tdata[,2],m=method) - - tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)])) - ryz <- cor(tdata[,1],tdata[,2],m=method) - - rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) ) - - return(rxy.z) - }else{ - x <- c(data[,1]) - y <- c(data[,2]) - z0 <- c(data[,3]) - zc <- as.data.frame(data[,-c(1,2,3)]) - - rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm) - rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm) - ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm) - - rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) ) - return(rxy.z) - } -} -""") - - R_pcorr_function = rpy2.robjects.r['pcor.test'] - R_corr_test = rpy2.robjects.r['cor.test'] - - primary = rpy2.robjects.FloatVector(range(len(primaryVal))) - for i in range(len(primaryVal)): - primary[i] = primaryVal[i] - - control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals)) - for i in range(len(controlVals)): - for j in range(len(controlVals[0])): - control[i*len(controlVals[0]) + j] = controlVals[i][j] - - allcorrelations = [] - - for targetIndex, oneTargetVals in enumerate(targetVals): - - this_primary = None - this_control = None - this_target = None - - if None in oneTargetVals: - - goodIndex = [] - for i in range(len(oneTargetVals)): - if oneTargetVals[i] != None: - goodIndex.append(i) - - this_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - this_primary[i] = primaryVal[goodIndex[i]] - - this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals)) - for i in range(len(controlVals)): - for j in range(len(goodIndex)): - this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]] - - this_target = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - this_target[i] = oneTargetVals[goodIndex[i]] - - else: - this_primary = primary - this_control = control - this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals))) - for i in range(len(oneTargetVals)): - this_target[i] = oneTargetVals[i] - - one_name = targetNames[targetIndex] - one_N = len(this_primary) - - #calculate partial correlation - one_pc_coefficient = 'NA' - one_pc_p = 1 - - try: - if method == 's': - result = R_pcorr_function(this_primary, this_target, this_control, method='s') - else: - result = R_pcorr_function(this_primary, this_target, this_control) - - #XZ: In very few cases, the returned coefficient is nan. - #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN - if result[0][0] == result[0][0]: - one_pc_coefficient = result[0][0] - #XZ: when the coefficient value is 1 (primary trait and target trait are the same), - #XZ: occationally, the returned p value is nan instead of 0. - if result[1][0] == result[1][0]: - one_pc_p = result[1][0] - elif abs(one_pc_coefficient - 1) < 0.0000001: - one_pc_p = 0 - except: - pass - - #calculate zero order correlation - one_corr_coefficient = 0 - one_corr_p = 1 - - try: - if method == 's': - R_result = R_corr_test(this_primary, this_target, method='spearman') - else: - R_result = R_corr_test(this_primary, this_target) - - one_corr_coefficient = R_result[3][0] - one_corr_p = R_result[2][0] - except: - pass - - traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ] - - allcorrelations.append(traitinfo) - - return allcorrelations - #End of function compute_partial - - - allcorrelations = [] - - target_trait_number = len(targetVals) - - if target_trait_number < 1000: - allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ) - else: - step = 1000 - job_number = math.ceil( float(target_trait_number)/step ) - - job_targetVals_lists = [] - job_targetNames_lists = [] - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, target_trait_number) - - one_job_targetVals_list = [] - one_job_targetNames_list = [] - - for i in range( starti, endi ): - one_job_targetVals_list.append( targetVals[i] ) - one_job_targetNames_list.append( targetNames[i] ) - - job_targetVals_lists.append( one_job_targetVals_list ) - job_targetNames_lists.append( one_job_targetNames_list ) - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - jobs = [] - results = [] - - for i, one_job_targetVals_list in enumerate( job_targetVals_lists ): - one_job_targetNames_list = job_targetNames_lists[i] - #pay attention to modules from outside - jobs.append( job_server.submit(func=compute_partial, args=( primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) ) - - for one_job in jobs: - one_result = one_job() - results.append( one_result ) - - for one_result in results: - for one_traitinfo in one_result: - allcorrelations.append( one_traitinfo ) - - return allcorrelations - ##################################################################################### #Input: primaryValue(list): one list of expression values of one probeSet, @@ -529,170 +73,6 @@ def cal_zero_order_corr_for_tiss (primaryValue=[], targetValue=[], method='pears return corr_result - - -def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'): - - def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ): - - oneSymbolCorrDict = {} - oneSymbolPvalueDict = {} - - R_corr_test = rpy2.robjects.r['cor.test'] - - R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue))) - - for i in range(len(primaryTraitValue)): - R_primary[i] = primaryTraitValue[i] - - for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems(): - R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue))) - for i in range(len(oneTraitValue)): - R_target[i] = oneTraitValue[i] - - if method =='spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0] - oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0] - - return(oneSymbolCorrDict, oneSymbolPvalueDict) - - - - symbolCorrDict = {} - symbolPvalueDict = {} - - items_number = len(SymbolValueDict) - - if items_number <= 1000: - symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method) - else: - items_list = SymbolValueDict.items() - - step = 1000 - job_number = math.ceil( float(items_number)/step ) - - job_oneSymbolValueDict_list = [] - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, items_number) - - oneSymbolValueDict = {} - - for i in range( starti, endi ): - one_item = items_list[i] - one_symbol = one_item[0] - one_value = one_item[1] - oneSymbolValueDict[one_symbol] = one_value - - job_oneSymbolValueDict_list.append( oneSymbolValueDict ) - - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - jobs = [] - results = [] - - for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ): - - #pay attention to modules from outside - jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) ) - - for one_job in jobs: - one_result = one_job() - results.append( one_result ) - - for one_result in results: - oneSymbolCorrDict, oneSymbolPvalueDict = one_result - symbolCorrDict.update( oneSymbolCorrDict ) - symbolPvalueDict.update( oneSymbolPvalueDict ) - - return (symbolCorrDict, symbolPvalueDict) - -########################################################################### -#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId -#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict) -#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and -# getGeneSymbolTissueValueDict to build dict to get CorrPvArray -#Note: If there are multiple probesets for one gene, select the one with highest mean. -########################################################################### -def getTissueProbeSetXRefInfo(GeneNameLst=[],TissueProbeSetFreezeId=0): - Symbols ="" - symbolList =[] - geneIdDict ={} - dataIdDict = {} - ChrDict = {} - MbDict = {} - descDict = {} - pTargetDescDict = {} - - count = len(GeneNameLst) - - # Added by NL 01/06/2011 - # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value - # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable. - if count==0: - query=''' - select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description - from ( - select Symbol, max(Mean) as maxmean - from TissueProbeSetXRef - where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol) - as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; - '''%TissueProbeSetFreezeId - - else: - for i, item in enumerate(GeneNameLst): - - if i == count-1: - Symbols += "'%s'" %item - else: - Symbols += "'%s'," %item - - Symbols = "("+ Symbols+")" - query=''' - select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description - from ( - select Symbol, max(Mean) as maxmean - from TissueProbeSetXRef - where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol) - as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; - '''% (TissueProbeSetFreezeId,Symbols) - - try: - cursor.execute(query) - results =cursor.fetchall() - resultCount = len(results) - # Key in all dicts is the lower-cased symbol - for i, item in enumerate(results): - symbol = item[0] - symbolList.append(symbol) - - key =symbol.lower() - geneIdDict[key]=item[1] - dataIdDict[key]=item[2] - ChrDict[key]=item[3] - MbDict[key]=item[4] - descDict[key]=item[5] - pTargetDescDict[key]=item[6] - - except: - symbolList = None - geneIdDict=None - dataIdDict=None - ChrDict=None - MbDict=None - descDict=None - pTargetDescDict=None - - return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict - ########################################################################### #Input: cursor, symbolList (list), dataIdDict(Dict) #output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, @@ -701,7 +81,6 @@ def getTissueProbeSetXRefInfo(GeneNameLst=[],TissueProbeSetFreezeId=0): #Attention! All keys are lower case! ########################################################################### def get_symbol_value_pairs(tissue_data): - id_list = [tissue_data[symbol.lower()].data_id for item in tissue_data] symbol_value_pairs = {} @@ -719,23 +98,6 @@ def get_symbol_value_pairs(tissue_data): except: symbol_value_pairs[symbol] = None - #for symbol in symbol_list: - # if tissue_data.has_key(symbol): - # data_id = tissue_data[symbol].data_id - # - # query = """select value, id - # from TissueProbeSetData - # where Id={}""".format(escape(data_id)) - # try : - # results = g.db.execute(query).fetchall() - # for item in results: - # item = item[0] - # value_list.append(item) - # symbol_value_pairs[symbol] = value_list - # value_list=[] - # except: - # symbol_value_pairs[symbol] = None - return symbol_value_pairs @@ -752,150 +114,4 @@ def get_trait_symbol_and_tissue_values(symbol_list=None): tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list) if len(tissue_data.gene_symbols): - return tissue_data.get_symbol_values_pairs() - - #symbolList, - #geneIdDict, - #dataIdDict, - #ChrDict, - #MbDict, - #descDict, - #pTargetDescDict = getTissueProbeSetXRefInfo( - # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) - - #limit_num=1000 - #count = len(symbol_list) - # - #symbol_value_pairs = {} - # - #if count !=0 and count <= limit_num: - # symbol_value_pairs = getSymbolValuePairDict(cursor=cursor,symbolList=symbol_list,dataIdDict=dataIdDict) - # - #elif count > limit_num: - # n = count/limit_num - # start = 0 - # stop = 0 - # - # for i in range(n): - # stop =limit_num*(i+1) - # gList1 = symbolList[start:stop] - # PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) - # start =limit_num*(i+1) - # - # SymbolValuePairDict.update(PairDict1) - # - # if stop < count: - # stop = count - # gList2 = symbolList[start:stop] - # PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) - # SymbolValuePairDict.update(PairDict2) - # - #return SymbolValuePairDict - -######################################################################################################## -#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) -#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. -# key is symbol, value is one list of expression values of one probeSet. -#function: wrapper function of getGeneSymbolTissueValueDict function -# for CorrelationPage.py -######################################################################################################## - -#def get_trait_symbol_and_tissue_values(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): -# SymbolValuePairDict={} -# -# symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo( -# cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) -# -# if symbolList: -# SymbolValuePairDict = get_gene_symbol_and_tissue_values(symbolList=symbolList, -# dataIdDict=dataIdDict) -# -# return SymbolValuePairDict - -######################################################################################################## -#Input: cursor(cursor): MySQL connnection cursor; -# priGeneSymbolList(list): one list of gene symbol; -# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair, -# key is symbol, value is one list of expression values of one probeSet; -#Output: corrArray(array): array of Correlation Value, -# pvArray(array): array of PValue; -#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss -######################################################################################################## - -def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}): - # setting initial value for corrArray, pvArray equal to 0 - Num = len(priGeneSymbolList) - - corrArray = [([0] * (Num))[:] for i in range(Num)] - pvArray = [([0] * (Num))[:] for i in range(Num)] - i = 0 - for pkey in priGeneSymbolList: - j = 0 - pkey = pkey.strip().lower()# key in symbolValuepairDict is low case - if symbolValuepairDict.has_key(pkey): - priValue = symbolValuepairDict[pkey] - for tkey in priGeneSymbolList: - tkey = tkey.strip().lower()# key in symbolValuepairDict is low case - if priValue and symbolValuepairDict.has_key(tkey): - tarValue = symbolValuepairDict[tkey] - - if tarValue: - if i>j: - # corrArray stores Pearson Correlation values - # pvArray stores Pearson P-Values - pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue) - corrArray[i][j] =pcorr_result[0] - pvArray[i][j] =pcorr_result[2] - elif i<j: - # corrArray stores Spearman Correlation values - # pvArray stores Spearman P-Values - scorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue,method='spearman') - corrArray[i][j] =scorr_result[0] - pvArray[i][j] =scorr_result[2] - else: - # on the diagonal line, correlation value is 1, P-Values is 0 - corrArray[i][j] =1 - pvArray[i][j] =0 - j+=1 - else: - corrArray[i][j] = None - pvArray[i][j] = None - j+=1 - else: - corrArray[i][j] = None - pvArray[i][j] = None - j+=1 - else: - corrArray[i][j] = None - pvArray[i][j] = None - - i+=1 - - return corrArray, pvArray - -######################################################################################################## -#Input: cursor(cursor): MySQL connnection cursor; -# primaryTraitSymbol(string): one gene symbol; -# TissueProbeSetFreezeId (int): Id of related TissueProbeSetFreeze -# method: '0' default value, Pearson Correlation; '1', Spearman Correlation -#Output: symbolCorrDict(Dict): Dict of Correlation Value, key is symbol -# symbolPvalueDict(Dict): Dict of PValue,key is symbol ; -#Function: build symbolCorrDict, symbolPvalueDict for display by calling calculation function:calZeroOrderCorrForTiss -######################################################################################################## -def calculateCorrOfAllTissueTrait(cursor=None, primaryTraitSymbol=None, TissueProbeSetFreezeId=None,method='0'): - - symbolCorrDict = {} - symbolPvalueDict = {} - - primaryTraitSymbolValueDict = getGeneSymbolTissueValueDictForTrait(cursor=cursor, GeneNameLst=[primaryTraitSymbol], TissueProbeSetFreezeId=TissueProbeSetFreezeId) - primaryTraitValue = primaryTraitSymbolValueDict.values()[0] - - SymbolValueDict = getGeneSymbolTissueValueDictForTrait(cursor=cursor, GeneNameLst=[], TissueProbeSetFreezeId=TissueProbeSetFreezeId) - - if method =='1': - symbolCorrDict, symbolPvalueDict = batchCalTissueCorr(primaryTraitValue,SymbolValueDict,method='spearman') - else: - symbolCorrDict, symbolPvalueDict = batchCalTissueCorr(primaryTraitValue,SymbolValueDict) - - - return (symbolCorrDict, symbolPvalueDict) + return tissue_data.get_symbol_values_pairs()
\ No newline at end of file diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 85a8c0ef..9f3f7982 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -21,9 +21,7 @@ from __future__ import absolute_import, print_function, division import sys -# sys.path.append(".") Never in a running webserver -import gc import string import cPickle import os @@ -64,8 +62,6 @@ from flask import Flask, g import utility.logger logger = utility.logger.getLogger(__name__ ) -METHOD_SAMPLE_PEARSON = "1" -METHOD_SAMPLE_RANK = "2" METHOD_LIT = "3" METHOD_TISSUE_PEARSON = "4" METHOD_TISSUE_RANK = "5" @@ -74,26 +70,7 @@ TISSUE_METHODS = [METHOD_TISSUE_PEARSON, METHOD_TISSUE_RANK] TISSUE_MOUSE_DB = 1 -def print_mem(stage=""): - mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss - #print("{}: {}".format(stage, mem/1024)) - -class AuthException(Exception): - pass - class CorrelationResults(object): - - corr_min_informative = 4 - - #PAGE_HEADING = "Correlation Table" - #CORRELATION_METHODS = {"1" : "Genetic Correlation (Pearson's r)", - # "2" : "Genetic Correlation (Spearman's rho)", - # "3" : "SGO Literature Correlation", - # "4" : "Tissue Correlation (Pearson's r)", - # "5" : "Tissue Correlation (Spearman's rho)"} - # - #RANK_ORDERS = {"1": 0, "2": 1, "3": 0, "4": 0, "5": 1} - def __init__(self, start_vars): # get trait list from db (database name) # calculate correlation with Base vector and targets @@ -167,16 +144,12 @@ class CorrelationResults(object): self.process_samples(start_vars, self.this_trait.data.keys(), primary_samples) self.target_dataset = data_set.create_dataset(start_vars['corr_dataset']) - # print("self.sample_data.keys: %s" % self.sample_data.keys) self.target_dataset.get_trait_data(self.sample_data.keys()) self.correlation_results = [] self.correlation_data = {} - db_filename = self.getFileName(target_db_name = self.target_dataset.name) - cache_available = db_filename in os.listdir(webqtlConfig.GENERATED_TEXT_DIR) - if self.corr_type == "tissue": self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol") @@ -196,24 +169,6 @@ class CorrelationResults(object): self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait]) elif self.corr_type == "sample": - #ZS: Commented out since parallel correlation has issues with gunicorn - # if self.dataset.type == "ProbeSet" and cache_available: - # dataset_file = open(webqtlConfig.GENERATED_TEXT_DIR+db_filename,'r') - - ##XZ, 01/08/2009: read the first line - # line = dataset_file.readline() - # dataset_strains = webqtlUtil.readLineCSV(line)[1:] - - # self.this_trait_vals = [] - # for item in dataset_strains: - # if item in self.sample_data: - # self.this_trait_vals.append(self.sample_data[item]) - # else: - # self.this_trait_vals.append("None") - # num_overlap = len(self.this_trait_vals) - # logger.debug("DOING PARALLEL") - # self.do_parallel_correlation(db_filename, num_overlap) - # else: for trait, values in self.target_dataset.trait_data.iteritems(): self.get_sample_r_and_p_values(trait, values) @@ -256,9 +211,6 @@ class CorrelationResults(object): trait_object.sample_p, trait_object.num_overlap) = self.correlation_data[trait] - #Get symbol for trait and call function that gets each tissue value from the database (tables TissueProbeSetXRef, - #TissueProbeSetData, etc) and calculates the correlation (cal_zero_order_corr_for_tissue in correlation_functions) - # Set some sane defaults trait_object.tissue_corr = 0 trait_object.tissue_pvalue = 0 @@ -274,9 +226,6 @@ class CorrelationResults(object): trait_object.sample_p, trait_object.num_overlap) = self.correlation_data[trait] - #Get symbol for trait and call function that gets each tissue value from the database (tables TissueProbeSetXRef, - #TissueProbeSetData, etc) and calculates the correlation (cal_zero_order_corr_for_tissue in correlation_functions) - # Set some sane defaults trait_object.tissue_corr = 0 trait_object.tissue_pvalue = 0 @@ -323,13 +272,6 @@ class CorrelationResults(object): if self.this_trait.symbol.lower() in primary_trait_tissue_vals_dict: primary_trait_tissue_values = primary_trait_tissue_vals_dict[self.this_trait.symbol.lower()] - - #gene_symbol_list = [] - # - #for trait in self.correlation_results: - # if hasattr(trait, 'symbol'): - # gene_symbol_list.append(trait.symbol) - gene_symbol_list = [trait.symbol for trait in self.correlation_results if trait.symbol] corr_result_tissue_vals_dict= correlation_functions.get_trait_symbol_and_tissue_values( @@ -346,17 +288,6 @@ class CorrelationResults(object): trait.tissue_corr = result[0] trait.tissue_pvalue = result[2] - # else: - # trait.tissue_corr = None - # trait.tissue_pvalue = None - #else: - # for trait in self.correlation_results: - # trait.tissue_corr = None - # trait.tissue_pvalue = None - - #return self.correlation_results - - def do_tissue_correlation_for_all_traits(self, tissue_dataset_id=1): #Gets tissue expression values for the primary trait primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values( @@ -377,7 +308,6 @@ class CorrelationResults(object): for trait, symbol in self.trait_symbol_dict.iteritems(): if symbol and symbol.lower() in corr_result_tissue_vals_dict: this_trait_tissue_values = corr_result_tissue_vals_dict[symbol.lower()] - #print("this_trait_tissue_values: ", pf(this_trait_tissue_values)) result = correlation_functions.cal_zero_order_corr_for_tiss(primary_trait_tissue_values, this_trait_tissue_values, @@ -390,7 +320,6 @@ class CorrelationResults(object): return tissue_corr_data - def do_lit_correlation_for_trait_list(self): input_trait_mouse_gene_id = self.convert_to_mouse_gene_id(self.dataset.group.species.lower(), self.this_trait.geneid) @@ -529,219 +458,6 @@ class CorrelationResults(object): if num_overlap > 5: self.correlation_data[trait] = [sample_r, sample_p, num_overlap] - - """ - correlations = [] - - #XZ: Use the fast method only for probeset dataset, and this dataset must have been created. - #XZ: Otherwise, use original method - #print("Entering correlation") - - #db_filename = self.getFileName(target_db_name=self.target_db_name) - # - #cache_available = db_filename in os.listdir(webqtlConfig.GENERATED_TEXT_DIR) - - # If the cache file exists, do a cached correlation for probeset data - if self.dataset.type == "ProbeSet": -# if self.method in [METHOD_SAMPLE_PEARSON, METHOD_SAMPLE_RANK] and cache_available: -# traits = do_parallel_correlation() -# -# else: - - traits = self.get_traits(self.vals) - - for trait in traits: - trait.calculate_correlation(vals, self.method) - - self.record_count = len(traits) #ZS: This isn't a good way to get this value, so I need to change it later - - traits = sortTraitCorrelations(traits, self.method) - - # Strip to the top N correlations - traits = traits[:min(self.returnNumber, len(traits))] - - addLiteratureCorr = False - addTissueCorr = False - - trait_list = [] - for trait in traits: - db_trait = webqtlTrait(db=self.db, name=trait.name, cursor=self.cursor) - db_trait.retrieveInfo( QTL='Yes' ) - - db_trait.Name = trait.name - db_trait.corr = trait.correlation - db_trait.nOverlap = trait.overlap - db_trait.corrPValue = trait.p_value - - # NL, 07/19/2010 - # js function changed, add a new parameter rankOrder for js function 'showTissueCorrPlot' - db_trait.RANK_ORDER = self.RANK_ORDERS[self.method] - - #XZ, 26/09/2008: Method is 4 or 5. Have fetched tissue corr, but no literature correlation yet. - if self.method in TISSUE_METHODS: - db_trait.tissueCorr = trait.tissue_corr - db_trait.tissuePValue = trait.p_tissue - addTissueCorr = True - - - #XZ, 26/09/2008: Method is 3, Have fetched literature corr, but no tissue corr yet. - elif self.method == METHOD_LIT: - db_trait.LCorr = trait.lit_corr - db_trait.mouse_geneid = self.translateToMouseGeneID(self.species, db_trait.geneid) - addLiteratureCorr = True - - #XZ, 26/09/2008: Method is 1 or 2. Have NOT fetched literature corr and tissue corr yet. - # Phenotype data will not have geneid, and neither will some probes - # we need to handle this because we will get an attribute error - else: - if self.input_trait_mouse_gene_id and self.db.type=="ProbeSet": - addLiteratureCorr = True - if self.trait_symbol and self.db.type=="ProbeSet": - addTissueCorr = True - - trait_list.append(db_trait) - - if addLiteratureCorr: - trait_list = self.getLiteratureCorrelationByList(self.input_trait_mouse_gene_id, - self.species, trait_list) - if addTissueCorr: - trait_list = self.getTissueCorrelationByList( - primaryTraitSymbol = self.trait_symbol, - traitList = trait_list, - TissueProbeSetFreezeId = TISSUE_MOUSE_DB, - method=self.method) - - return trait_list - """ - - - def do_tissue_corr_for_all_traits_2(self): - """Comments Possibly Out of Date!!!!! - - Uses get_temp_tissue_corr_table to generate table of tissue correlations - - This function then gathers that data and pairs it with the TraitID string. - Takes as its arguments a formdata instance, and a dataset instance. - Returns a dictionary of 'TraitID':(tissueCorr, tissuePValue) - for the requested correlation - - Used when the user selects the tissue correlation method; i.e. not for the - column that is appended to all probeset trait correlation tables - - """ - - # table name string - temp_table = self.get_temp_tissue_corr_table(tissue_probesetfreeze_id=TISSUE_MOUSE_DB, - method=method) - - query = """SELECT ProbeSet.Name, {}.Correlation, {}.PValue - FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) - LEFT JOIN {} ON {}.Symbol=ProbeSet.Symbol - WHERE ProbeSetFreeze.Name = '{}' - and ProbeSetFreeze.Id=ProbeSetXRef.ProbeSetFreezeId - and ProbeSet.Id = ProbeSetXRef.ProbeSetId - and ProbeSet.Symbol IS NOT NULL - and {}.Correlation IS NOT NULL""".format(dataset.mescape( - temp_table, temp_table, temp_table, temp_table, - self.dataset.name, temp_table)) - - results = g.db.execute(query).fetchall() - - tissue_corr_dict = {} - - for entry in results: - trait_name, tissue_corr, tissue_pvalue = entry - tissue_corr_dict[trait_name] = (tissue_corr, tissue_pvalue) - #symbolList, - #geneIdDict, - #dataIdDict, - #ChrDict, - #MbDict, - #descDict, - #pTargetDescDict = getTissueProbeSetXRefInfo( - # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) - - g.db.execute('DROP TEMPORARY TABLE {}'.format(escape(temp_table))) - - return tissue_corr_dict - - - #XZ, 09/23/2008: In tissue correlation tables, there is no record of GeneId1 == GeneId2 - #XZ, 09/24/2008: Note that the correlation value can be negative. - def get_temp_tissue_corr_table(self, - tissue_probesetfreeze_id=0, - method="", - return_number=0): - - - def cmp_tisscorr_absolute_value(A, B): - try: - if abs(A[1]) < abs(B[1]): return 1 - elif abs(A[1]) == abs(B[1]): - return 0 - else: return -1 - except: - return 0 - - symbol_corr_dict, symbol_pvalue_dict = self.calculate_corr_for_all_tissues( - tissue_dataset_id=TISSUE_MOUSE_DB) - - symbol_corr_list = symbol_corr_dict.items() - - symbol_corr_list.sort(cmp_tisscorr_absolute_value) - symbol_corr_list = symbol_corr_list[0 : 2*return_number] - - tmp_table_name = webqtlUtil.genRandStr(prefix="TOPTISSUE") - - q1 = 'CREATE TEMPORARY TABLE %s (Symbol varchar(100) PRIMARY KEY, Correlation float, PValue float)' % tmp_table_name - self.cursor.execute(q1) - - for one_pair in symbol_corr_list: - one_symbol = one_pair[0] - one_corr = one_pair[1] - one_p_value = symbol_pvalue_dict[one_symbol] - - self.cursor.execute( "INSERT INTO %s (Symbol, Correlation, PValue) VALUES ('%s',%f,%f)" % (tmpTableName, one_symbol, float(one_corr), float(one_p_value)) ) - - return tmp_table_name - - - def calculate_corr_for_all_tissues(self, tissue_dataset_id=None): - - symbol_corr_dict = {} - symbol_pvalue_dict = {} - - primary_trait_symbol_value_dict = correlation_functions.make_gene_tissue_value_dict( - GeneNameLst=[self.this_trait.symbol], - TissueProbeSetFreezeId=tissue_dataset_id) - primary_trait_value = primary_trait_symbol_value_dict.values()[0] - - symbol_value_dict = correlation_functions.make_gene_tissue_value_dict( - gene_name_list=[], - tissue_dataset_id=tissue_dataset_id) - - symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr( - primaryTraitValue, - SymbolValueDict, - method=self.corr_method) - #else: - # symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr( - # primaryTraitValue, - # SymbolValueDict) - - return (symbolCorrDict, symbolPvalueDict) - - ##XZ, 12/16/2008: the input geneid is of mouse type - #def checkSymbolForTissueCorr(self, tissueProbeSetFreezeId=0, symbol=""): - # q = "SELECT 1 FROM TissueProbeSetXRef WHERE TissueProbeSetFreezeId=%s and Symbol='%s' LIMIT 1" % (tissueProbeSetFreezeId,symbol) - # self.cursor.execute(q) - # try: - # x = self.cursor.fetchone() - # if x: return True - # else: raise - # except: return False - - def process_samples(self, start_vars, sample_names, excluded_samples=None): if not excluded_samples: excluded_samples = () @@ -754,566 +470,6 @@ class CorrelationResults(object): if not value.strip().lower() == 'x': self.sample_data[str(sample)] = float(value) - ##XZ, 12/16/2008: the input geneid is of mouse type - #def checkForLitInfo(self,geneId): - # q = 'SELECT 1 FROM LCorrRamin3 WHERE GeneId1=%s LIMIT 1' % geneId - # self.cursor.execute(q) - # try: - # x = self.cursor.fetchone() - # if x: return True - # else: raise - # except: return False - - - - def fetchAllDatabaseData(self, species, GeneId, GeneSymbol, strains, db, method, returnNumber, tissueProbeSetFreezeId): - - StrainIds = [] - for item in strains: - self.cursor.execute('''SELECT Strain.Id FROM Strain, Species WHERE Strain.Name="%s" and Strain.SpeciesId=Species.Id and Species.name = "%s" ''' % (item, species)) - Id = self.cursor.fetchone()[0] - StrainIds.append('%d' % Id) - - # break it into smaller chunks so we don't overload the MySql server - nnn = len(StrainIds) / 25 - if len(StrainIds) % 25: - nnn += 1 - oridata = [] - - #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId - tempTable = None - if GeneId and db.type == "ProbeSet": - if method == "3": - tempTable = self.getTempLiteratureTable(species=species, input_species_geneid=GeneId, returnNumber=returnNumber) - - if method == "4" or method == "5": - tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, TissueProbeSetFreezeId=TISSUE_MOUSE_DB, method=method, returnNumber=returnNumber) - - for step in range(nnn): - temp = [] - StrainIdstep = StrainIds[step*25:min(len(StrainIds), (step+1)*25)] - for item in StrainIdstep: temp.append('T%s.value' % item) - - if db.type == "Publish": - query = "SELECT PublishXRef.Id, " - dataStartPos = 1 - query += string.join(temp,', ') - query += ' FROM (PublishXRef, PublishFreeze)' - #XZ, 03/04/2009: Xiaodong changed Data to PublishData - for item in StrainIdstep: - query += 'left join PublishData as T%s on T%s.Id = PublishXRef.DataId and T%s.StrainId=%s\n' %(item,item,item,item) - query += "WHERE PublishXRef.InbredSetId = PublishFreeze.InbredSetId and PublishFreeze.Name = '%s'" % (db.name, ) - #XZ, 09/20/2008: extract literature correlation value together with gene expression values. - #XZ, 09/20/2008: notice the difference between the code in next block. - #elif tempTable: - # # we can get a little performance out of selecting our LitCorr here - # # but also we need to do this because we are unconcerned with probes that have no geneId associated with them - # # as we would not have litCorr data. - # - # if method == "3": - # query = "SELECT %s.Name, %s.value," % (db.type,tempTable) - # dataStartPos = 2 - # if method == "4" or method == "5": - # query = "SELECT %s.Name, %s.Correlation, %s.PValue," % (db.type,tempTable, tempTable) - # dataStartPos = 3 - # - # query += string.join(temp,', ') - # query += ' FROM (%s, %sXRef, %sFreeze)' % (db.type, db.type, db.type) - # if method == "3": - # query += ' LEFT JOIN %s ON %s.GeneId2=ProbeSet.GeneId ' % (tempTable,tempTable) - # if method == "4" or method == "5": - # query += ' LEFT JOIN %s ON %s.Symbol=ProbeSet.Symbol ' % (tempTable,tempTable) - # #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item) - # for item in StrainIdstep: - # query += 'left join %sData as T%s on T%s.Id = %sXRef.DataId and T%s.StrainId=%s\n' %(db.type, item,item, db.type,item,item) - # - # if method == "3": - # query += "WHERE ProbeSet.GeneId IS NOT NULL AND %s.value IS NOT NULL AND %sXRef.%sFreezeId = %sFreeze.Id and %sFreeze.Name = '%s' and %s.Id = %sXRef.%sId order by %s.Id" % (tempTable,db.type, db.type, db.type, db.type, db.name, db.type, db.type, db.type, db.type) - # if method == "4" or method == "5": - # query += "WHERE ProbeSet.Symbol IS NOT NULL AND %s.Correlation IS NOT NULL AND %sXRef.%sFreezeId = %sFreeze.Id and %sFreeze.Name = '%s' and %s.Id = %sXRef.%sId order by %s.Id" % (tempTable,db.type, db.type, db.type, db.type, db.name, db.type, db.type, db.type, db.type) - else: - query = "SELECT %s.Name," % db.type - dataStartPos = 1 - query += string.join(temp,', ') - query += ' FROM (%s, %sXRef, %sFreeze)' % (db.type, db.type, db.type) - #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item) - for item in StrainIdstep: - query += 'left join %sData as T%s on T%s.Id = %sXRef.DataId and T%s.StrainId=%s\n' %(db.type, item,item, db.type,item,item) - query += "WHERE %sXRef.%sFreezeId = %sFreeze.Id and %sFreeze.Name = '%s' and %s.Id = %sXRef.%sId order by %s.Id" % (db.type, db.type, db.type, db.type, db.name, db.type, db.type, db.type, db.type) - - self.cursor.execute(query) - results = self.cursor.fetchall() - oridata.append(results) - - datasize = len(oridata[0]) - traits = [] - # put all of the separate data together into a huge list of lists - for j in range(datasize): - traitdata = list(oridata[0][j]) - for i in range(1,nnn): - traitdata += list(oridata[i][j][dataStartPos:]) - - trait = Trait(traitdata[0], traitdata[dataStartPos:]) - - if method == METHOD_LIT: - trait.lit_corr = traitdata[1] - - if method in TISSUE_METHODS: - trait.tissue_corr = traitdata[1] - trait.p_tissue = traitdata[2] - - traits.append(trait) - - if tempTable: - self.cursor.execute( 'DROP TEMPORARY TABLE %s' % tempTable ) - - return traits - - - - - # XZ, 09/20/2008: This function creates TEMPORARY TABLE tmpTableName_2 and return its name. - # XZ, 09/20/2008: It stores top literature correlation values associated with the input geneId. - # XZ, 09/20/2008: Attention: In each row, the input geneId is always in column GeneId1. - #XZ, 12/16/2008: the input geneid can be of mouse, rat or human type - def getTempLiteratureTable(self, species, input_species_geneid, returnNumber): - # according to mysql the TEMPORARY TABLE name should not have to be unique because - # it is only available to the current connection. This program will be invoked via command line, but if it - # were to be invoked over mod_python this could cuase problems. mod_python will keep the connection alive - # in its executing threads ( i think) so there is a potential for the table not being dropped between users. - #XZ, 01/29/2009: To prevent the potential risk, I generate random table names and drop the tables after use them. - - - # the 'input_species_geneid' could be rat or human geneid, need to translate it to mouse geneid - translated_mouse_geneid = self.translateToMouseGeneID (species, input_species_geneid) - - tmpTableName_1 = webqtlUtil.genRandStr(prefix="LITERATURE") - - q1 = 'CREATE TEMPORARY TABLE %s (GeneId1 int(12) unsigned, GeneId2 int(12) unsigned PRIMARY KEY, value double)' % tmpTableName_1 - q2 = 'INSERT INTO %s (GeneId1, GeneId2, value) SELECT GeneId1,GeneId2,value FROM LCorrRamin3 WHERE GeneId1=%s' % (tmpTableName_1, translated_mouse_geneid) - q3 = 'INSERT INTO %s (GeneId1, GeneId2, value) SELECT GeneId2,GeneId1,value FROM LCorrRamin3 WHERE GeneId2=%s AND GeneId1!=%s' % (tmpTableName_1, translated_mouse_geneid,translated_mouse_geneid) - for x in [q1,q2,q3]: self.cursor.execute(x) - - #XZ, 09/23/2008: Just use the top records insteard of using all records - tmpTableName_2 = webqtlUtil.genRandStr(prefix="TOPLITERATURE") - - q1 = 'CREATE TEMPORARY TABLE %s (GeneId1 int(12) unsigned, GeneId2 int(12) unsigned PRIMARY KEY, value double)' % tmpTableName_2 - self.cursor.execute(q1) - q2 = 'SELECT GeneId1, GeneId2, value FROM %s ORDER BY value DESC' % tmpTableName_1 - self.cursor.execute(q2) - result = self.cursor.fetchall() - - counter = 0 #this is to count how many records being inserted into table - for one_row in result: - mouse_geneid1, mouse_geneid2, lit_corr_alue = one_row - - #mouse_geneid1 has been tested before, now should test if mouse_geneid2 has corresponding geneid in other species - translated_species_geneid = 0 - if species == 'mouse': - translated_species_geneid = mouse_geneid2 - elif species == 'rat': - self.cursor.execute( "SELECT rat FROM GeneIDXRef WHERE mouse=%d" % int(mouse_geneid2) ) - record = self.cursor.fetchone() - if record: - translated_species_geneid = record[0] - elif species == 'human': - self.cursor.execute( "SELECT human FROM GeneIDXRef WHERE mouse=%d" % int(mouse_geneid2) ) - record = self.cursor.fetchone() - if record: - translated_species_geneid = record[0] - - if translated_species_geneid: - self.cursor.execute( 'INSERT INTO %s (GeneId1, GeneId2, value) VALUES (%d,%d,%f)' % (tmpTableName_2, int(input_species_geneid),int(translated_species_geneid), float(lit_corr_alue)) ) - counter = counter + 1 - - #pay attention to the number - if (counter > 2*returnNumber): - break - - self.cursor.execute('DROP TEMPORARY TABLE %s' % tmpTableName_1) - - return tmpTableName_2 - - - - #XZ, 01/09/2009: This function was created by David Crowell. Xiaodong cleaned up and modified it. - def fetchLitCorrelations(self, species, GeneId, db, returnNumber): ### Used to generate Lit Correlations when calculations are done from text file. dcrowell August 2008 - """Uses getTempLiteratureTable to generate table of literatire correlations. This function then gathers that data and - pairs it with the TraitID string. Takes as its arguments a formdata instance, and a database instance. - Returns a dictionary of 'TraitID':'LitCorr' for the requested correlation""" - - tempTable = self.getTempLiteratureTable(species=species, input_species_geneid=GeneId, returnNumber=returnNumber) - - query = "SELECT %s.Name, %s.value" % (db.type,tempTable) - query += ' FROM (%s, %sXRef, %sFreeze)' % (db.type, db.type, db.type) - query += ' LEFT JOIN %s ON %s.GeneId2=ProbeSet.GeneId ' % (tempTable,tempTable) - query += "WHERE ProbeSet.GeneId IS NOT NULL AND %s.value IS NOT NULL AND %sXRef.%sFreezeId = %sFreeze.Id and %sFreeze.Name = '%s' and %s.Id = %sXRef.%sId order by %s.Id" % (tempTable, db.type, db.type, db.type, db.type, db.name, db.type, db.type, db.type, db.type) - - self.cursor.execute(query) - results = self.cursor.fetchall() - - litCorrDict = {} - - for entry in results: - traitName,litcorr = entry - litCorrDict[traitName] = litcorr - - self.cursor.execute('DROP TEMPORARY TABLE %s' % tempTable) - - return litCorrDict - - - def get_traits(self, vals): - - #Todo: Redo cached stuff using memcached - if False: - lit_corrs = {} - tissue_corrs = {} - use_lit = False - if self.method == METHOD_LIT: - lit_corrs = self.fetchLitCorrelations(species=self.species, GeneId=self.gene_id, db=self.db, returnNumber=self.returnNumber) - use_lit = True - - use_tissue_corr = False - if self.method in TISSUE_METHODS: - tissue_corrs = self.fetch_tissue_correlations(method=self.method, return_number = self.return_number) - use_tissue_corr = True - - DatabaseFileName = self.getFileName( target_db_name=self.target_db_name ) - datasetFile = open(webqtlConfig.CACHEDIR+DatabaseFileName,'r') - - #XZ, 01/08/2009: read the first line - line = datasetFile.readline() - cached_sample_names = webqtlUtil.readLineCSV(line)[1:] - - #XZ, 01/08/2009: This step is critical. It is necessary for this new method. - #XZ: The original function fetchAllDatabaseData uses all strains stored in variable _strains to - #XZ: retrieve the values of each strain from database in real time. - #XZ: The new method uses all strains stored in variable dataset_strains to create a new variable - #XZ: _newvals. _newvals has the same length as dataset_strains. The items in _newvals is in - #XZ: the same order of items in dataset_strains. The value of each item in _newvals is either - #XZ: the value of correspinding strain in _vals or 'None'. - new_vals = [] - for name in cached_sample_names: - if name in self.sample_names: - new_vals.append(float(vals[self.sample_names.index(name)])) - else: - new_vals.append('None') - - nnCorr = len(new_vals) - - #XZ, 01/14/2009: If literature corr or tissue corr is selected, - #XZ: there is no need to use parallel computing. - - traits = [] - data_start = 1 - for line in datasetFile: - raw_trait = webqtlUtil.readLineCSV(line) - trait = Trait.from_csv(raw_trait, data_start) - trait.lit_corr = lit_corrs.get(trait.name) - trait.tissue_corr, trait.p_tissue = tissue_corrs.get(trait.name, (None, None)) - traits.append(trait) - - return traits, new_vals - - else: - traits = self.fetchAllDatabaseData(species=self.dataset.species, - GeneId=self.gene_id, - GeneSymbol=self.trait.symbol, - strains=self.sample_names, - db=self.db, - method=self.method, - returnNumber=self.returnNumber, - tissueProbeSetFreezeId= self.tissue_probeset_freeze_id) - totalTraits = len(traits) #XZ, 09/18/2008: total trait number - - return traits - - def calculate_corr_for_all_tissues(self, tissue_dataset_id=None): - - symbol_corr_dict = {} - symbol_pvalue_dict = {} - - primary_trait_symbol_value_dict = correlation_functions.make_gene_tissue_value_dict( - GeneNameLst=[self.this_trait.symbol], - TissueProbeSetFreezeId=tissue_dataset_id) - primary_trait_value = primary_trait_symbol_value_dict.values()[0] - - symbol_value_dict = correlation_functions.make_gene_tissue_value_dict( - gene_name_list=[], - tissue_dataset_id=tissue_dataset_id) - - symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr( - primaryTraitValue, - SymbolValueDict, - method=self.corr_method) - #else: - # symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr( - # primaryTraitValue, - # SymbolValueDict) - - return (symbolCorrDict, symbolPvalueDict) - - def getFileName(self, target_db_name): ### dcrowell August 2008 - """Returns the name of the reference database file with which correlations are calculated. - Takes argument cursor which is a cursor object of any instance of a subclass of templatePage - Used by correlationPage""" - - dataset_id = str(self.target_dataset.id) - dataset_fullname = self.target_dataset.fullname.replace(' ','_') - dataset_fullname = dataset_fullname.replace('/','_') - - FileName = 'ProbeSetFreezeId_' + dataset_id + '_FullName_' + dataset_fullname + '.txt' - - return FileName - - def do_parallel_correlation(self, db_filename, num_overlap): - - #XZ, 01/14/2009: This method is for parallel computing only. - #XZ: It is supposed to be called when "Genetic Correlation, Pearson's r" (method 1) - #XZ: or "Genetic Correlation, Spearman's rho" (method 2) is selected - def compute_corr(input_nnCorr, input_trait, input_list, corr_method): - - import math - import reaper - - def cmpOrder2(A,B): - try: - if A[-1] < B[-1]: - return -1 - elif A[-1] == B[-1]: - return 0 - else: - return 1 - except: - return 0 - - def calCorrelation(dbdata,userdata,N): - X = [] - Y = [] - for i in range(N): - if (dbdata[i] != None and userdata[i] != None) and (dbdata[i] != "None" and userdata[i] != "None"): - X.append(float(dbdata[i])) - Y.append(float(userdata[i])) - NN = len(X) - if NN <6: - return (0.0,NN) - sx = reduce(lambda x,y:x+y,X,0.0) - sy = reduce(lambda x,y:x+y,Y,0.0) - meanx = sx/NN - meany = sy/NN - xyd = 0.0 - sxd = 0.0 - syd = 0.0 - for i in range(NN): - xyd += (X[i] - meanx)*(Y[i]-meany) - sxd += (X[i] - meanx)*(X[i] - meanx) - syd += (Y[i] - meany)*(Y[i] - meany) - try: - corr = xyd/(math.sqrt(sxd)*math.sqrt(syd)) - except: - corr = 0 - return (corr,NN) - - def calCorrelationRank(xVals,yVals,N): - """ - Calculated Spearman Ranked Correlation. The algorithm works - by setting all tied ranks to the average of those ranks (for - example, if ranks 5-10 all have the same value, each will be set - to rank 7.5). - """ - - XX = [] - YY = [] - j = 0 - - for i in range(len(xVals)): - if (xVals[i]!= None and yVals[i]!= None) and (xVals[i] != "None" and yVals[i] != "None"): - XX.append((j,float(xVals[i]))) - YY.append((j,float(yVals[i]))) - j = j+1 - - NN = len(XX) - if NN <6: - return (0.0,NN) - XX.sort(cmpOrder2) - YY.sort(cmpOrder2) - X = [0]*NN - Y = [0]*NN - - j = 1 - rank = 0.0 - t = 0.0 - sx = 0.0 - - while j < NN: - - if XX[j][1] != XX[j-1][1]: - X[XX[j-1][0]] = j - j = j+1 - - else: - jt = j+1 - ji = j - for jt in range(j+1, NN): - if (XX[jt][1] != XX[j-1][1]): - break - rank = 0.5*(j+jt) - for ji in range(j-1, jt): - X[XX[ji][0]] = rank - t = jt-j - sx = sx + (t*t*t-t) - if (jt == NN-1): - if (XX[jt][1] == XX[j-1][1]): - X[XX[NN-1][0]] = rank - j = jt+1 - - if j == NN: - if X[XX[NN-1][0]] == 0: - X[XX[NN-1][0]] = NN - - j = 1 - rank = 0.0 - t = 0.0 - sy = 0.0 - - while j < NN: - - if YY[j][1] != YY[j-1][1]: - Y[YY[j-1][0]] = j - j = j+1 - else: - jt = j+1 - ji = j - for jt in range(j+1, NN): - if (YY[jt][1] != YY[j-1][1]): - break - rank = 0.5*(j+jt) - for ji in range(j-1, jt): - Y[YY[ji][0]] = rank - t = jt - j - sy = sy + (t*t*t-t) - if (jt == NN-1): - if (YY[jt][1] == YY[j-1][1]): - Y[YY[NN-1][0]] = rank - j = jt+1 - - if j == NN: - if Y[YY[NN-1][0]] == 0: - Y[YY[NN-1][0]] = NN - - D = 0.0 - - for i in range(NN): - D += (X[i]-Y[i])*(X[i]-Y[i]) - - fac = (1.0 -sx/(NN*NN*NN-NN))*(1.0-sy/(NN*NN*NN-NN)) - - return ((1-(6.0/(NN*NN*NN-NN))*(D+(sx+sy)/12.0))/math.sqrt(fac),NN) - - # allcorrelations = [] - - correlation_data = {} - for i, line in enumerate(input_list): - if i == 0: - continue - tokens = line.split('","') - tokens[-1] = tokens[-1][:-2] #remove the last " - tokens[0] = tokens[0][1:] #remove the first " - - traitdataName = tokens[0] - database_trait = tokens[1:] - - #print("database_trait:", database_trait) - - #ZS: 2015 could add biweight correlation, see http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3465711/ - # if corr_method == 'pearson': - # sample_r, sample_p = scipy.stats.pearsonr(input_trait, database_trait) - # else: - # sample_r, sample_p = scipy.stats.spearmanr(input_trait, database_trait) - - if corr_method == "pearson": #XZ: Pearson's r - sample_r, nOverlap = calCorrelation(input_trait, database_trait, input_nnCorr) - else: #XZ: Spearman's rho - sample_r, nOverlap = calCorrelationRank(input_trait, database_trait, input_nnCorr) - - #XZ: calculate corrPValue - if nOverlap < 3: - sample_p = 1.0 - else: - if abs(sample_r) >= 1.0: - sample_p = 0.0 - else: - z_value = 0.5*math.log((1.0+sample_r)/(1.0-sample_r)) - z_value = z_value*math.sqrt(nOverlap-3) - sample_p = 2.0*(1.0 - reaper.normp(abs(z_value))) - - correlation_data[traitdataName] = [sample_r, sample_p, nOverlap] - - # traitinfo = [traitdataName, sample_r, nOverlap] - # allcorrelations.append(traitinfo) - - return correlation_data - # return allcorrelations - - - datasetFile = open(webqtlConfig.GENERATED_TEXT_DIR+db_filename,'r') - - print("Invoking parallel computing") - input_line_list = datasetFile.readlines() - print("Read lines from the file") - all_line_number = len(input_line_list) - - step = 1000 - job_number = math.ceil( float(all_line_number)/step ) - - print("JOB NUMBER", job_number) - - job_input_lists = [] - - print("Configuring jobs") - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, all_line_number) - - one_job_input_list = [] - - for i in range( starti, endi ): - one_job_input_list.append( input_line_list[i] ) - - job_input_lists.append( one_job_input_list ) - - print("Creating pp servers") - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - print("Done creating servers") - - jobs = [] - results = [] - - print("Starting parallel computation, submitting jobs") - for one_job_input_list in job_input_lists: #pay attention to modules from outside - jobs.append( job_server.submit(func=compute_corr, args=(num_overlap, self.this_trait_vals, one_job_input_list, self.corr_method), depfuncs=(), modules=("webqtlUtil",)) ) - print("Done submitting jobs") - - for one_job in jobs: - one_result = one_job() - self.correlation_data.update(one_result) - # one_result = one_job() - # results.append( one_result ) - - #print("CORRELATION DATA:", self.correlation_data) - - # print("Acquiring results") - - # for one_result in results: - # for one_traitinfo in one_result: - # allcorrelations.append( one_traitinfo ) - def generate_corr_json(corr_results, this_trait, dataset, target_dataset): results_list = [] for i, trait in enumerate(corr_results): diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py index 077386a3..4bb4d65d 100644 --- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py +++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py @@ -23,7 +23,6 @@ from __future__ import absolute_import, print_function, division import sys # sys.path.append(".") Never do this in a webserver! -import gc import string import cPickle import os diff --git a/wqflask/wqflask/ctl/ctl_analysis.py b/wqflask/wqflask/ctl/ctl_analysis.py index 9515d23a..6fda02fd 100644 --- a/wqflask/wqflask/ctl/ctl_analysis.py +++ b/wqflask/wqflask/ctl/ctl_analysis.py @@ -2,7 +2,6 @@ # Author / Maintainer: Danny Arends <Danny.Arends@gmail.com> import sys from numpy import * -import scipy as sp # SciPy import rpy2.robjects as ro # R Objects import rpy2.rinterface as ri @@ -24,60 +23,38 @@ from utility import helper_functions from utility.tools import locate from rpy2.robjects.packages import importr -utils = importr("utils") + +import utility.logger +logger = utility.logger.getLogger(__name__ ) ## Get pointers to some common R functions r_library = ro.r["library"] # Map the library function r_options = ro.r["options"] # Map the options function -r_read_csv = ro.r["read.csv"] # Map the read.csv function -r_dim = ro.r["dim"] # Map the dim function -r_c = ro.r["c"] # Map the c function r_t = ro.r["t"] # Map the t function -r_cat = ro.r["cat"] # Map the cat function -r_paste = ro.r["paste"] # Map the paste function -r_unlist = ro.r["unlist"] # Map the unlist function -r_head = ro.r["head"] # Map the unlist function -r_unique = ro.r["unique"] # Map the unique function -r_length = ro.r["length"] # Map the length function r_unlist = ro.r["unlist"] # Map the unlist function r_list = ro.r.list # Map the list function -r_matrix = ro.r.matrix # Map the matrix function -r_seq = ro.r["seq"] # Map the seq function -r_table = ro.r["table"] # Map the table function -r_names = ro.r["names"] # Map the names function -r_sink = ro.r["sink"] # Map the sink function -r_is_NA = ro.r["is.na"] # Map the is.na function -r_file = ro.r["file"] # Map the file function r_png = ro.r["png"] # Map the png function for plotting r_dev_off = ro.r["dev.off"] # Map the dev.off function -r_save_image = ro.r["save.image"] # Map the save.image function -r_class = ro.r["class"] # Map the class function -r_save = ro.r["save"] # Map the save function r_write_table = ro.r["write.table"] # Map the write.table function -r_read_table = ro.r["read.table"] # Map the read.table function -r_as_data_frame = ro.r["as.data.frame"] # Map the write.table function r_data_frame = ro.r["data.frame"] # Map the write.table function r_as_numeric = ro.r["as.numeric"] # Map the write.table function class CTL(object): def __init__(self): - print("Initialization of CTL") + logger.info("Initialization of CTL") #log = r_file("/tmp/genenetwork_ctl.log", open = "wt") - #r_sink(log) # Uncomment the r_sink() commands to log output from stdout/stderr to a file + #r_sink(log) # Uncomment the r_sink() commands to log output from stdout/stderr to a file #r_sink(log, type = "message") - r_library("ctl") # Load CTL - Should only be done once, since it is quite expensive + r_library("ctl") # Load CTL - Should only be done once, since it is quite expensive r_options(stringsAsFactors = False) - print("Initialization of CTL done, package loaded in R session") + logger.info("Initialization of CTL done, package loaded in R session") self.r_CTLscan = ro.r["CTLscan"] # Map the CTLscan function self.r_CTLsignificant = ro.r["CTLsignificant"] # Map the CTLsignificant function self.r_lineplot = ro.r["ctl.lineplot"] # Map the ctl.lineplot function - self.r_CTLsignificant = ro.r["CTLsignificant"] # Map the CTLsignificant function - self.r_CTLnetwork = ro.r["CTLnetwork"] # Map the CTLnetwork function - self.r_CTLprofiles = ro.r["CTLprofiles"] # Map the CTLprofiles function self.r_plotCTLobject = ro.r["plot.CTLobject"] # Map the CTLsignificant function self.nodes_list = [] self.edges_list = [] - print("Obtained pointers to CTL functions") + logger.info("Obtained pointers to CTL functions") def addNode(self, gt): node_dict = { 'data' : {'id' : str(gt.name) + ":" + str(gt.dataset.name), @@ -100,20 +77,20 @@ class CTL(object): self.edges_list.append(edge_dict) def run_analysis(self, requestform): - print("Starting CTL analysis on dataset") + logger.info("Starting CTL analysis on dataset") self.trait_db_list = [trait.strip() for trait in requestform['trait_list'].split(',')] self.trait_db_list = [x for x in self.trait_db_list if x] - print("strategy:", requestform.get("strategy")) + logger.debug("strategy:", requestform.get("strategy")) strategy = requestform.get("strategy") - print("nperm:", requestform.get("nperm")) + logger.debug("nperm:", requestform.get("nperm")) nperm = int(requestform.get("nperm")) - print("parametric:", requestform.get("parametric")) + logger.debug("parametric:", requestform.get("parametric")) parametric = bool(requestform.get("parametric")) - print("significance:", requestform.get("significance")) + logger.debug("significance:", requestform.get("significance")) significance = float(requestform.get("significance")) # Get the name of the .geno file belonging to the first phenotype @@ -123,7 +100,7 @@ class CTL(object): genofilelocation = locate(dataset.group.name + ".geno", "genotype") parser = genofile_parser.ConvertGenoFile(genofilelocation) parser.process_csv() - print(dataset.group) + logger.debug("dataset group: ", dataset.group) # Create a genotype matrix individuals = parser.individuals markers = [] @@ -133,14 +110,14 @@ class CTL(object): markers.append(marker["genotypes"]) genotypes = list(itertools.chain(*markers)) - print(len(genotypes) / len(individuals), "==", len(parser.markers)) + logger.debug(len(genotypes) / len(individuals), "==", len(parser.markers)) rGeno = r_t(ro.r.matrix(r_unlist(genotypes), nrow=len(markernames), ncol=len(individuals), dimnames = r_list(markernames, individuals), byrow=True)) # Create a phenotype matrix traits = [] for trait in self.trait_db_list: - print("retrieving data for", trait) + logger.debug("retrieving data for", trait) if trait != "": ts = trait.split(':') gt = TRAIT.GeneralTrait(name = ts[0], dataset_name = ts[1]) @@ -153,7 +130,7 @@ class CTL(object): rPheno = r_t(ro.r.matrix(r_as_numeric(r_unlist(traits)), nrow=len(self.trait_db_list), ncol=len(individuals), dimnames = r_list(self.trait_db_list, individuals), byrow=True)) - print(rPheno) + logger.debug(rPheno) # Use a data frame to store the objects rPheno = r_data_frame(rPheno, check_names = False) @@ -196,10 +173,9 @@ class CTL(object): sys.stdout.flush() # Create the interactive graph for cytoscape visualization (Nodes and Edges) - print(type(significant)) if not type(significant) == ri.RNULLType: for x in range(len(significant[0])): - print(significant[0][x], significant[1][x], significant[2][x]) # Debug to console + logger.debug(significant[0][x], significant[1][x], significant[2][x]) # Debug to console tsS = significant[0][x].split(':') # Source tsT = significant[2][x].split(':') # Target gtS = TRAIT.GeneralTrait(name = tsS[0], dataset_name = tsS[1]) # Retrieve Source info from the DB @@ -214,7 +190,6 @@ class CTL(object): self.elements = json.dumps(self.nodes_list + self.edges_list) def loadImage(self, path, name): - print("pre-loading imgage results:", self.results[path]) imgfile = open(self.results[path], 'rb') imgdata = imgfile.read() imgB64 = imgdata.encode("base64") @@ -229,7 +204,7 @@ class CTL(object): n = n + 1 def process_results(self, results): - print("Processing CTL output") + logger.info("Processing CTL output") template_vars = {} template_vars["results"] = self.results template_vars["elements"] = self.elements diff --git a/wqflask/wqflask/templates/correlation_page.html b/wqflask/wqflask/templates/correlation_page.html index fb4e19a1..05136ad8 100644 --- a/wqflask/wqflask/templates/correlation_page.html +++ b/wqflask/wqflask/templates/correlation_page.html @@ -94,12 +94,14 @@ <th>Sample p(r)</th> <th>Lit r</th> <th>Tissue r</th> + <th>Tissue p(r)</th> {% else %} <th>Sample rho</th> <th>N</th> <th>Sample p(rho)</th> <th>Lit r</th> <th>Tissue rho</th> + <th>Tissue p(rho)</th> {% endif %} {% elif target_dataset.type == "Publish" %} {% if corr_method == 'pearson' %} @@ -156,8 +158,10 @@ {% endif %} {% if trait.tissue_corr == "" or trait.tissue_corr == 0.000 %} <td align="right">--</td> + <td align="right">--</td> {% else %} <td align="right">{{'%0.3f'|format(trait.tissue_corr)}}</td> + <td align="right">{{'%0.3e'|format(trait.tissue_pvalue)}}</td> {% endif %} {% elif target_dataset.type == "Publish" %} <td>{{ trait.description_display }}</td> @@ -319,7 +323,7 @@ title: 'correlation_results', fieldBoundary: '"', exportOptions: { - columns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] + columns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] } } ], @@ -342,7 +346,8 @@ { "type": "natural" }, { "type": "scientific" }, { "type": "natural" }, - { "type": "natural" } + { "type": "natural" }, + { "type": "scientific" } ], "createdRow": function ( row, data, index ) { $('td', row).eq(4).attr('title', $('td', row).eq(4).text()); |