aboutsummaryrefslogtreecommitdiff
path: root/wqflask
diff options
context:
space:
mode:
authorzsloan2018-04-11 20:39:24 +0000
committerzsloan2018-04-11 20:39:24 +0000
commit9750e63d64849d7fa9e1e681f56b73cae96905df (patch)
treedaa743957497d92b8a3573b00405593d3454cdd0 /wqflask
parentd8cec0ef94b7683f42946ce182a937484ad1034a (diff)
downloadgenenetwork2-9750e63d64849d7fa9e1e681f56b73cae96905df.tar.gz
Added tissue correlation p value to correlation page results, since it was missing before
Removed a bunch of unused coded from all correlation-related files and the ctl analysis code
Diffstat (limited to 'wqflask')
-rw-r--r--wqflask/base/mrna_assay_tissue_data.py65
-rw-r--r--wqflask/wqflask/collect.py8
-rw-r--r--wqflask/wqflask/correlation/corr_scatter_plot.py16
-rw-r--r--wqflask/wqflask/correlation/correlation_functions.py786
-rw-r--r--wqflask/wqflask/correlation/show_corr_results.py844
-rw-r--r--wqflask/wqflask/correlation_matrix/show_corr_matrix.py1
-rw-r--r--wqflask/wqflask/ctl/ctl_analysis.py63
-rw-r--r--wqflask/wqflask/templates/correlation_page.html9
8 files changed, 33 insertions, 1759 deletions
diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py
index 53f7c16a..6fec5dcd 100644
--- a/wqflask/base/mrna_assay_tissue_data.py
+++ b/wqflask/base/mrna_assay_tissue_data.py
@@ -21,17 +21,8 @@ class MrnaAssayTissueData(object):
if self.gene_symbols == None:
self.gene_symbols = []
- #print("self.gene_symbols:", self.gene_symbols)
-
self.data = collections.defaultdict(Bunch)
- #self.gene_id_dict ={}
- #self.data_id_dict = {}
- #self.chr_dict = {}
- #self.mb_dict = {}
- #self.desc_dict = {}
- #self.probe_target_desc_dict = {}
-
query = '''select t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, t.description, t.Probe_Target_Description
from (
select Symbol, max(Mean) as maxmean
@@ -51,7 +42,6 @@ class MrnaAssayTissueData(object):
in_clause = db_tools.create_in_clause(gene_symbols)
#ZS: This was in the query, not sure why: http://docs.python.org/2/library/string.html?highlight=lower#string.lower
-
query += ''' Symbol in {} group by Symbol)
as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
and t.Mean = x.maxmean;
@@ -66,9 +56,7 @@ class MrnaAssayTissueData(object):
for result in results:
symbol = result[0]
- #if symbol.lower() in [gene_symbol.lower() for gene_symbol in gene_symbols]:
if symbol.lower() in lower_symbols:
- #gene_symbols.append(symbol)
symbol = symbol.lower()
self.data[symbol].gene_id = result.GeneId
@@ -78,8 +66,6 @@ class MrnaAssayTissueData(object):
self.data[symbol].description = result.description
self.data[symbol].probe_target_description = result.Probe_Target_Description
- print("self.data: ", pf(self.data))
-
###########################################################################
#Input: cursor, symbolList (list), dataIdDict(Dict)
#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair,
@@ -106,53 +92,4 @@ class MrnaAssayTissueData(object):
else:
symbol_values_dict[result.Symbol.lower()].append(result.value)
- #for symbol in self.data:
- # data_id = self.data[symbol].data_id
- # symbol_values_dict[symbol] = self.get_tissue_values(data_id)
-
-
- return symbol_values_dict
-
-
- #def get_tissue_values(self, data_id):
- # """Gets the tissue values for a particular gene"""
- #
- # tissue_values=[]
- #
- # query = """SELECT value, id
- # FROM TissueProbeSetData
- # WHERE Id IN {}""".format(db_tools.create_in_clause(data_id))
- #
- # #try :
- # results = g.db.execute(query).fetchall()
- # for result in results:
- # tissue_values.append(result.value)
- # #symbol_values_dict[symbol] = value_list
- # #except:
- # # symbol_values_pairs[symbol] = None
- #
- # return tissue_values
-
-########################################################################################################
-#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol
-#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair.
-# key is symbol, value is one list of expression values of one probeSet.
-#function: wrapper function for getSymbolValuePairDict function
-# build gene symbol list if necessary, cut it into small lists if necessary,
-# then call getSymbolValuePairDict function and merge the results.
-########################################################################################################
-
-#def get_trait_symbol_and_tissue_values(symbol_list=None):
-# tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list)
-#
-# #symbolList,
-# #geneIdDict,
-# #dataIdDict,
-# #ChrDict,
-# #MbDict,
-# #descDict,
-# #pTargetDescDict = getTissueProbeSetXRefInfo(
-# # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId)
-#
-# if len(tissue_data.gene_symbols):
-# return get_symbol_values_pairs(tissue_data)
+ return symbol_values_dict \ No newline at end of file
diff --git a/wqflask/wqflask/collect.py b/wqflask/wqflask/collect.py
index ffc698de..04035e96 100644
--- a/wqflask/wqflask/collect.py
+++ b/wqflask/wqflask/collect.py
@@ -40,13 +40,6 @@ import logging
from utility.logger import getLogger
logger = getLogger(__name__)
-def get_collection():
- if g.user_session.logged_in:
- return UserCollection()
- else:
- return AnonCollection()
- #else:
- # CauseError
class AnonCollection(object):
"""User is not logged in"""
@@ -282,7 +275,6 @@ def create_new(collection_name):
db_session.commit()
return redirect(url_for('view_collection', uc_id=uc.id))
else:
- current_collections = user_manager.AnonUser().get_collections()
ac = AnonCollection(collection_name)
ac.changed_timestamp = datetime.datetime.utcnow().strftime('%b %d %Y %I:%M%p')
ac.add_traits(params)
diff --git a/wqflask/wqflask/correlation/corr_scatter_plot.py b/wqflask/wqflask/correlation/corr_scatter_plot.py
index a08cd759..94711c67 100644
--- a/wqflask/wqflask/correlation/corr_scatter_plot.py
+++ b/wqflask/wqflask/correlation/corr_scatter_plot.py
@@ -19,37 +19,31 @@ class CorrScatterPlot(object):
width = int(params['width'])
except:
width = 800
- self.width = width
try:
height = int(params['height'])
except:
height = 600
- self.height = height
try:
circle_color = params['circle_color']
except:
circle_color = '#3D85C6'
- self.circle_color = circle_color
try:
circle_radius = int(params['circle_radius'])
except:
circle_radius = 5
- self.circle_radius = circle_radius
try:
line_color = params['line_color']
except:
line_color = '#FF0000'
- self.line_color = line_color
try:
line_width = int(params['line_width'])
except:
line_width = 1
- self.line_width = line_width
samples_1, samples_2, num_overlap = corr_result_helpers.normalize_values_with_samples(self.trait_1.data, self.trait_2.data)
@@ -66,14 +60,14 @@ class CorrScatterPlot(object):
x = np.array(vals_1)
y = np.array(vals_2)
- slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
+ slope, intercept, r_value, p_value, _std_err = stats.linregress(x, y)
rx = stats.rankdata(x)
ry = stats.rankdata(y)
self.rdata = []
self.rdata.append(rx.tolist())
self.rdata.append(ry.tolist())
- srslope, srintercept, srr_value, srp_value, srstd_err = stats.linregress(rx, ry)
+ srslope, srintercept, srr_value, srp_value, _srstd_err = stats.linregress(rx, ry)
self.js_data = dict(
data = self.data,
@@ -86,17 +80,17 @@ class CorrScatterPlot(object):
num_overlap = num_overlap,
vals_1 = vals_1,
vals_2 = vals_2,
-
+
slope = slope,
intercept = intercept,
r_value = r_value,
p_value = p_value,
-
+
srslope = srslope,
srintercept = srintercept,
srr_value = srr_value,
srp_value = srp_value,
-
+
width = width,
height = height,
circle_color = circle_color,
diff --git a/wqflask/wqflask/correlation/correlation_functions.py b/wqflask/wqflask/correlation/correlation_functions.py
index 1ee9b558..06dec795 100644
--- a/wqflask/wqflask/correlation/correlation_functions.py
+++ b/wqflask/wqflask/correlation/correlation_functions.py
@@ -28,468 +28,12 @@ from __future__ import absolute_import, print_function, division
import math
import rpy2.robjects
-import pp
import string
-from utility import webqtlUtil
from base.mrna_assay_tissue_data import MrnaAssayTissueData
-from base.trait import GeneralTrait
-from db import webqtlDatabaseFunction
from flask import Flask, g
-#XZ: The input 'controls' is String. It contains the full name of control traits.
-#XZ: The input variable 'strainlst' is List. It contains the strain names of primary trait.
-#XZ: The returned tcstrains is the list of list [[],[]...]. So are tcvals and tcvars. The last returned parameter is list of numbers.
-#XZ, 03/29/2010: For each returned control trait, there is no None value in it.
-def controlStrains(controls, strainlst):
-
- controls = controls.split(',')
-
- cvals = {}
- for oneTraitName in controls:
- oneTrait = webqtlTrait(fullname=oneTraitName, cursor=webqtlDatabaseFunction.getCursor() )
- oneTrait.retrieveData()
- cvals[oneTraitName] = oneTrait.data
-
- tcstrains = []
- tcvals = []
- tcvars = []
-
- for oneTraitName in controls:
- strains = []
- vals = []
- vars = []
-
- for _strain in strainlst:
- if cvals[oneTraitName].has_key(_strain):
- _val = cvals[oneTraitName][_strain].val
- if _val != None:
- strains.append(_strain)
- vals.append(_val)
- vars.append(None)
-
- tcstrains.append(strains)
- tcvals.append(vals)
- tcvars.append(vars)
-
- return tcstrains, tcvals, tcvars, [len(x) for x in tcstrains]
-
-
-
-#XZ, 03/29/2010: After execution of functon "controlStrains" and "fixStrains", primary trait and control traits have the same strains and in the same order. There is no 'None' value in them.
-def fixStrains(_strains,_controlstrains,_vals,_controlvals,_vars,_controlvars):
- """Corrects strains, vals, and vars so that all contrain only those strains common
- to the reference trait and all control traits."""
-
- def dictify(strains,vals,vars):
- subdict = {}
- for i in xrange(len(strains)):
- subdict[strains[i]] = (vals[i],vars[i])
- return subdict
-
- #XZ: The 'dicts' is a list of dictionary. The first element is the dictionary of reference trait. The rest elements are for control traits.
- dicts = []
- dicts.append(dictify(_strains,_vals,_vars))
-
- nCstrains = len(_controlstrains)
- for i in xrange(nCstrains):
- dicts.append(dictify(_controlstrains[i],_controlvals[i],_controlvars[i]))
-
- _newstrains = []
- _vals = []
- _vars = []
- _controlvals = [[] for x in xrange(nCstrains)]
- _controlvars = [[] for x in xrange(nCstrains)]
-
- for strain in _strains:
- inall = True
- for d in dicts:
- if strain not in d:
- inall = False
- break
- if inall:
- _newstrains.append(strain)
- _vals.append(dicts[0][strain][0])
- _vars.append(dicts[0][strain][1])
- for i in xrange(nCstrains):
- _controlvals[i].append(dicts[i+1][strain][0])
- _controlvars[i].append(dicts[i+1][strain][1])
-
- return _newstrains, _vals, _controlvals, _vars, _controlvars
-
-
-#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty.
-#else, the returned list has two elements of control trait name.
-def findIdenticalControlTraits ( controlVals, controlNames ):
- nameOfIdenticalTraits = []
-
- controlTraitNumber = len(controlVals)
-
- if controlTraitNumber > 1:
-
- #XZ: reset the precision of values and convert to string type
- for oneTraitVal in controlVals:
- for oneStrainVal in oneTraitVal:
- oneStrainVal = '%.3f' % oneStrainVal
-
- for i, oneTraitVal in enumerate( controlVals ):
- for j in range(i+1, controlTraitNumber):
- if oneTraitVal == controlVals[j]:
- nameOfIdenticalTraits.append(controlNames[i])
- nameOfIdenticalTraits.append(controlNames[j])
-
- return nameOfIdenticalTraits
-
-#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty.
-#else, the returned list has two elements of control trait name.
-#primaryVal is of list type. It contains value of primary trait.
-#primaryName is of string type.
-#controlVals is of list type. Each element is list too. Each element contain value of one control trait.
-#controlNames is of list type.
-def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ):
- nameOfIdenticalTraits = []
-
- #XZ: reset the precision of values and convert to string type
- for oneStrainVal in primaryVal:
- oneStrainVal = '%.3f' % oneStrainVal
-
- for oneTraitVal in controlVals:
- for oneStrainVal in oneTraitVal:
- oneStrainVal = '%.3f' % oneStrainVal
-
- controlTraitNumber = len(controlVals)
-
- if controlTraitNumber > 1:
- for i, oneTraitVal in enumerate( controlVals ):
- for j in range(i+1, controlTraitNumber):
- if oneTraitVal == controlVals[j]:
- nameOfIdenticalTraits.append(controlNames[i])
- nameOfIdenticalTraits.append(controlNames[j])
- break
-
- if len(nameOfIdenticalTraits) == 0:
- for i, oneTraitVal in enumerate( controlVals ):
- if primaryVal == oneTraitVal:
- nameOfIdenticalTraits.append(primaryName)
- nameOfIdenticalTraits.append(controlNames[i])
- break
-
- return nameOfIdenticalTraits
-
-
-
-#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order.
-#XZ: No value in primaryVal and controlVals could be None.
-
-def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'):
-
- def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ):
-
- rpy2.robjects.r("""
-pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){
- # The partial correlation coefficient between x and y given z
- #
- # pcor.test is free and comes with ABSOLUTELY NO WARRANTY.
- #
- # x and y should be vectors
- #
- # z can be either a vector or a matrix
- #
- # use: There are two methods to calculate the partial correlation coefficient.
- # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec").
- # Default is "mat".
- #
- # method: There are three ways to calculate the correlation coefficient,
- # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods.
- # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis.
- # Default is "p".
- #
- # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z).
- # If not, the missing samples will be removed just when the correlation coefficient is calculated.
- # However, the number of samples for the p-value is the number of samples after removing
- # all the missing samples from the whole dataset.
- # Default is "T".
-
- x <- c(x)
- y <- c(y)
- z <- as.data.frame(z)
-
- if(use == "mat"){
- p.use <- "Var-Cov matrix"
- pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm)
- }else if(use == "rec"){
- p.use <- "Recursive formula"
- pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm)
- }else{
- stop("use should be either rec or mat!\n")
- }
-
- # print the method
- if(gregexpr("p",method)[[1]][1] == 1){
- p.method <- "Pearson"
- }else if(gregexpr("s",method)[[1]][1] == 1){
- p.method <- "Spearman"
- }else if(gregexpr("k",method)[[1]][1] == 1){
- p.method <- "Kendall"
- }else{
- stop("method should be pearson or spearman or kendall!\n")
- }
-
- # sample number
- n <- dim(na.omit(data.frame(x,y,z)))[1]
-
- # given variables' number
- gn <- dim(z)[2]
-
- # p-value
- if(p.method == "Kendall"){
- statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn)))
- p.value <- 2*pnorm(-abs(statistic))
-
- }else{
- statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2))
- p.value <- 2*pnorm(-abs(statistic))
- }
-
- data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use)
-}
-
-# By using var-cov matrix
-pcor.mat <- function(x,y,z,method="p",na.rm=T){
-
- x <- c(x)
- y <- c(y)
- z <- as.data.frame(z)
-
- if(dim(z)[2] == 0){
- stop("There should be given data\n")
- }
-
- data <- data.frame(x,y,z)
-
- if(na.rm == T){
- data = na.omit(data)
- }
-
- xdata <- na.omit(data.frame(data[,c(1,2)]))
- Sxx <- cov(xdata,xdata,m=method)
-
- xzdata <- na.omit(data)
- xdata <- data.frame(xzdata[,c(1,2)])
- zdata <- data.frame(xzdata[,-c(1,2)])
- Sxz <- cov(xdata,zdata,m=method)
-
- zdata <- na.omit(data.frame(data[,-c(1,2)]))
- Szz <- cov(zdata,zdata,m=method)
-
- # is Szz positive definite?
- zz.ev <- eigen(Szz)$values
- if(min(zz.ev)[1]<0){
- stop("\'Szz\' is not positive definite!\n")
- }
-
- # partial correlation
- Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz)
-
- rxx.z <- cov2cor(Sxx.z)[1,2]
-
- rxx.z
-}
-
-# By using recursive formula
-pcor.rec <- function(x,y,z,method="p",na.rm=T){
- #
-
- x <- c(x)
- y <- c(y)
- z <- as.data.frame(z)
-
- if(dim(z)[2] == 0){
- stop("There should be given data\n")
- }
-
- data <- data.frame(x,y,z)
-
- if(na.rm == T){
- data = na.omit(data)
- }
-
- # recursive formula
- if(dim(z)[2] == 1){
- tdata <- na.omit(data.frame(data[,1],data[,2]))
- rxy <- cor(tdata[,1],tdata[,2],m=method)
-
- tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)]))
- rxz <- cor(tdata[,1],tdata[,2],m=method)
-
- tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)]))
- ryz <- cor(tdata[,1],tdata[,2],m=method)
-
- rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) )
-
- return(rxy.z)
- }else{
- x <- c(data[,1])
- y <- c(data[,2])
- z0 <- c(data[,3])
- zc <- as.data.frame(data[,-c(1,2,3)])
-
- rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm)
- rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm)
- ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm)
-
- rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) )
- return(rxy.z)
- }
-}
-""")
-
- R_pcorr_function = rpy2.robjects.r['pcor.test']
- R_corr_test = rpy2.robjects.r['cor.test']
-
- primary = rpy2.robjects.FloatVector(range(len(primaryVal)))
- for i in range(len(primaryVal)):
- primary[i] = primaryVal[i]
-
- control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals))
- for i in range(len(controlVals)):
- for j in range(len(controlVals[0])):
- control[i*len(controlVals[0]) + j] = controlVals[i][j]
-
- allcorrelations = []
-
- for targetIndex, oneTargetVals in enumerate(targetVals):
-
- this_primary = None
- this_control = None
- this_target = None
-
- if None in oneTargetVals:
-
- goodIndex = []
- for i in range(len(oneTargetVals)):
- if oneTargetVals[i] != None:
- goodIndex.append(i)
-
- this_primary = rpy2.robjects.FloatVector(range(len(goodIndex)))
- for i in range(len(goodIndex)):
- this_primary[i] = primaryVal[goodIndex[i]]
-
- this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals))
- for i in range(len(controlVals)):
- for j in range(len(goodIndex)):
- this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]]
-
- this_target = rpy2.robjects.FloatVector(range(len(goodIndex)))
- for i in range(len(goodIndex)):
- this_target[i] = oneTargetVals[goodIndex[i]]
-
- else:
- this_primary = primary
- this_control = control
- this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals)))
- for i in range(len(oneTargetVals)):
- this_target[i] = oneTargetVals[i]
-
- one_name = targetNames[targetIndex]
- one_N = len(this_primary)
-
- #calculate partial correlation
- one_pc_coefficient = 'NA'
- one_pc_p = 1
-
- try:
- if method == 's':
- result = R_pcorr_function(this_primary, this_target, this_control, method='s')
- else:
- result = R_pcorr_function(this_primary, this_target, this_control)
-
- #XZ: In very few cases, the returned coefficient is nan.
- #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN
- if result[0][0] == result[0][0]:
- one_pc_coefficient = result[0][0]
- #XZ: when the coefficient value is 1 (primary trait and target trait are the same),
- #XZ: occationally, the returned p value is nan instead of 0.
- if result[1][0] == result[1][0]:
- one_pc_p = result[1][0]
- elif abs(one_pc_coefficient - 1) < 0.0000001:
- one_pc_p = 0
- except:
- pass
-
- #calculate zero order correlation
- one_corr_coefficient = 0
- one_corr_p = 1
-
- try:
- if method == 's':
- R_result = R_corr_test(this_primary, this_target, method='spearman')
- else:
- R_result = R_corr_test(this_primary, this_target)
-
- one_corr_coefficient = R_result[3][0]
- one_corr_p = R_result[2][0]
- except:
- pass
-
- traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ]
-
- allcorrelations.append(traitinfo)
-
- return allcorrelations
- #End of function compute_partial
-
-
- allcorrelations = []
-
- target_trait_number = len(targetVals)
-
- if target_trait_number < 1000:
- allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method )
- else:
- step = 1000
- job_number = math.ceil( float(target_trait_number)/step )
-
- job_targetVals_lists = []
- job_targetNames_lists = []
-
- for job_index in range( int(job_number) ):
- starti = job_index*step
- endi = min((job_index+1)*step, target_trait_number)
-
- one_job_targetVals_list = []
- one_job_targetNames_list = []
-
- for i in range( starti, endi ):
- one_job_targetVals_list.append( targetVals[i] )
- one_job_targetNames_list.append( targetNames[i] )
-
- job_targetVals_lists.append( one_job_targetVals_list )
- job_targetNames_lists.append( one_job_targetNames_list )
-
- ppservers = ()
- # Creates jobserver with automatically detected number of workers
- job_server = pp.Server(ppservers=ppservers)
-
- jobs = []
- results = []
-
- for i, one_job_targetVals_list in enumerate( job_targetVals_lists ):
- one_job_targetNames_list = job_targetNames_lists[i]
- #pay attention to modules from outside
- jobs.append( job_server.submit(func=compute_partial, args=( primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) )
-
- for one_job in jobs:
- one_result = one_job()
- results.append( one_result )
-
- for one_result in results:
- for one_traitinfo in one_result:
- allcorrelations.append( one_traitinfo )
-
- return allcorrelations
-
#####################################################################################
#Input: primaryValue(list): one list of expression values of one probeSet,
@@ -529,170 +73,6 @@ def cal_zero_order_corr_for_tiss (primaryValue=[], targetValue=[], method='pears
return corr_result
-
-
-def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'):
-
- def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ):
-
- oneSymbolCorrDict = {}
- oneSymbolPvalueDict = {}
-
- R_corr_test = rpy2.robjects.r['cor.test']
-
- R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue)))
-
- for i in range(len(primaryTraitValue)):
- R_primary[i] = primaryTraitValue[i]
-
- for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems():
- R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue)))
- for i in range(len(oneTraitValue)):
- R_target[i] = oneTraitValue[i]
-
- if method =='spearman':
- R_result = R_corr_test(R_primary, R_target, method='spearman')
- else:
- R_result = R_corr_test(R_primary, R_target)
-
- oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0]
- oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0]
-
- return(oneSymbolCorrDict, oneSymbolPvalueDict)
-
-
-
- symbolCorrDict = {}
- symbolPvalueDict = {}
-
- items_number = len(SymbolValueDict)
-
- if items_number <= 1000:
- symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method)
- else:
- items_list = SymbolValueDict.items()
-
- step = 1000
- job_number = math.ceil( float(items_number)/step )
-
- job_oneSymbolValueDict_list = []
-
- for job_index in range( int(job_number) ):
- starti = job_index*step
- endi = min((job_index+1)*step, items_number)
-
- oneSymbolValueDict = {}
-
- for i in range( starti, endi ):
- one_item = items_list[i]
- one_symbol = one_item[0]
- one_value = one_item[1]
- oneSymbolValueDict[one_symbol] = one_value
-
- job_oneSymbolValueDict_list.append( oneSymbolValueDict )
-
-
- ppservers = ()
- # Creates jobserver with automatically detected number of workers
- job_server = pp.Server(ppservers=ppservers)
-
- jobs = []
- results = []
-
- for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ):
-
- #pay attention to modules from outside
- jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) )
-
- for one_job in jobs:
- one_result = one_job()
- results.append( one_result )
-
- for one_result in results:
- oneSymbolCorrDict, oneSymbolPvalueDict = one_result
- symbolCorrDict.update( oneSymbolCorrDict )
- symbolPvalueDict.update( oneSymbolPvalueDict )
-
- return (symbolCorrDict, symbolPvalueDict)
-
-###########################################################################
-#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId
-#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict)
-#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and
-# getGeneSymbolTissueValueDict to build dict to get CorrPvArray
-#Note: If there are multiple probesets for one gene, select the one with highest mean.
-###########################################################################
-def getTissueProbeSetXRefInfo(GeneNameLst=[],TissueProbeSetFreezeId=0):
- Symbols =""
- symbolList =[]
- geneIdDict ={}
- dataIdDict = {}
- ChrDict = {}
- MbDict = {}
- descDict = {}
- pTargetDescDict = {}
-
- count = len(GeneNameLst)
-
- # Added by NL 01/06/2011
- # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value
- # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable.
- if count==0:
- query='''
- select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description
- from (
- select Symbol, max(Mean) as maxmean
- from TissueProbeSetXRef
- where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol)
- as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean;
- '''%TissueProbeSetFreezeId
-
- else:
- for i, item in enumerate(GeneNameLst):
-
- if i == count-1:
- Symbols += "'%s'" %item
- else:
- Symbols += "'%s'," %item
-
- Symbols = "("+ Symbols+")"
- query='''
- select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description
- from (
- select Symbol, max(Mean) as maxmean
- from TissueProbeSetXRef
- where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol)
- as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean;
- '''% (TissueProbeSetFreezeId,Symbols)
-
- try:
- cursor.execute(query)
- results =cursor.fetchall()
- resultCount = len(results)
- # Key in all dicts is the lower-cased symbol
- for i, item in enumerate(results):
- symbol = item[0]
- symbolList.append(symbol)
-
- key =symbol.lower()
- geneIdDict[key]=item[1]
- dataIdDict[key]=item[2]
- ChrDict[key]=item[3]
- MbDict[key]=item[4]
- descDict[key]=item[5]
- pTargetDescDict[key]=item[6]
-
- except:
- symbolList = None
- geneIdDict=None
- dataIdDict=None
- ChrDict=None
- MbDict=None
- descDict=None
- pTargetDescDict=None
-
- return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict
-
###########################################################################
#Input: cursor, symbolList (list), dataIdDict(Dict)
#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair,
@@ -701,7 +81,6 @@ def getTissueProbeSetXRefInfo(GeneNameLst=[],TissueProbeSetFreezeId=0):
#Attention! All keys are lower case!
###########################################################################
def get_symbol_value_pairs(tissue_data):
-
id_list = [tissue_data[symbol.lower()].data_id for item in tissue_data]
symbol_value_pairs = {}
@@ -719,23 +98,6 @@ def get_symbol_value_pairs(tissue_data):
except:
symbol_value_pairs[symbol] = None
- #for symbol in symbol_list:
- # if tissue_data.has_key(symbol):
- # data_id = tissue_data[symbol].data_id
- #
- # query = """select value, id
- # from TissueProbeSetData
- # where Id={}""".format(escape(data_id))
- # try :
- # results = g.db.execute(query).fetchall()
- # for item in results:
- # item = item[0]
- # value_list.append(item)
- # symbol_value_pairs[symbol] = value_list
- # value_list=[]
- # except:
- # symbol_value_pairs[symbol] = None
-
return symbol_value_pairs
@@ -752,150 +114,4 @@ def get_trait_symbol_and_tissue_values(symbol_list=None):
tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list)
if len(tissue_data.gene_symbols):
- return tissue_data.get_symbol_values_pairs()
-
- #symbolList,
- #geneIdDict,
- #dataIdDict,
- #ChrDict,
- #MbDict,
- #descDict,
- #pTargetDescDict = getTissueProbeSetXRefInfo(
- # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId)
-
- #limit_num=1000
- #count = len(symbol_list)
- #
- #symbol_value_pairs = {}
- #
- #if count !=0 and count <= limit_num:
- # symbol_value_pairs = getSymbolValuePairDict(cursor=cursor,symbolList=symbol_list,dataIdDict=dataIdDict)
- #
- #elif count > limit_num:
- # n = count/limit_num
- # start = 0
- # stop = 0
- #
- # for i in range(n):
- # stop =limit_num*(i+1)
- # gList1 = symbolList[start:stop]
- # PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict)
- # start =limit_num*(i+1)
- #
- # SymbolValuePairDict.update(PairDict1)
- #
- # if stop < count:
- # stop = count
- # gList2 = symbolList[start:stop]
- # PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict)
- # SymbolValuePairDict.update(PairDict2)
- #
- #return SymbolValuePairDict
-
-########################################################################################################
-#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int)
-#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair.
-# key is symbol, value is one list of expression values of one probeSet.
-#function: wrapper function of getGeneSymbolTissueValueDict function
-# for CorrelationPage.py
-########################################################################################################
-
-#def get_trait_symbol_and_tissue_values(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0):
-# SymbolValuePairDict={}
-#
-# symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo(
-# cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId)
-#
-# if symbolList:
-# SymbolValuePairDict = get_gene_symbol_and_tissue_values(symbolList=symbolList,
-# dataIdDict=dataIdDict)
-#
-# return SymbolValuePairDict
-
-########################################################################################################
-#Input: cursor(cursor): MySQL connnection cursor;
-# priGeneSymbolList(list): one list of gene symbol;
-# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair,
-# key is symbol, value is one list of expression values of one probeSet;
-#Output: corrArray(array): array of Correlation Value,
-# pvArray(array): array of PValue;
-#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss
-########################################################################################################
-
-def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}):
- # setting initial value for corrArray, pvArray equal to 0
- Num = len(priGeneSymbolList)
-
- corrArray = [([0] * (Num))[:] for i in range(Num)]
- pvArray = [([0] * (Num))[:] for i in range(Num)]
- i = 0
- for pkey in priGeneSymbolList:
- j = 0
- pkey = pkey.strip().lower()# key in symbolValuepairDict is low case
- if symbolValuepairDict.has_key(pkey):
- priValue = symbolValuepairDict[pkey]
- for tkey in priGeneSymbolList:
- tkey = tkey.strip().lower()# key in symbolValuepairDict is low case
- if priValue and symbolValuepairDict.has_key(tkey):
- tarValue = symbolValuepairDict[tkey]
-
- if tarValue:
- if i>j:
- # corrArray stores Pearson Correlation values
- # pvArray stores Pearson P-Values
- pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue)
- corrArray[i][j] =pcorr_result[0]
- pvArray[i][j] =pcorr_result[2]
- elif i<j:
- # corrArray stores Spearman Correlation values
- # pvArray stores Spearman P-Values
- scorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue,method='spearman')
- corrArray[i][j] =scorr_result[0]
- pvArray[i][j] =scorr_result[2]
- else:
- # on the diagonal line, correlation value is 1, P-Values is 0
- corrArray[i][j] =1
- pvArray[i][j] =0
- j+=1
- else:
- corrArray[i][j] = None
- pvArray[i][j] = None
- j+=1
- else:
- corrArray[i][j] = None
- pvArray[i][j] = None
- j+=1
- else:
- corrArray[i][j] = None
- pvArray[i][j] = None
-
- i+=1
-
- return corrArray, pvArray
-
-########################################################################################################
-#Input: cursor(cursor): MySQL connnection cursor;
-# primaryTraitSymbol(string): one gene symbol;
-# TissueProbeSetFreezeId (int): Id of related TissueProbeSetFreeze
-# method: '0' default value, Pearson Correlation; '1', Spearman Correlation
-#Output: symbolCorrDict(Dict): Dict of Correlation Value, key is symbol
-# symbolPvalueDict(Dict): Dict of PValue,key is symbol ;
-#Function: build symbolCorrDict, symbolPvalueDict for display by calling calculation function:calZeroOrderCorrForTiss
-########################################################################################################
-def calculateCorrOfAllTissueTrait(cursor=None, primaryTraitSymbol=None, TissueProbeSetFreezeId=None,method='0'):
-
- symbolCorrDict = {}
- symbolPvalueDict = {}
-
- primaryTraitSymbolValueDict = getGeneSymbolTissueValueDictForTrait(cursor=cursor, GeneNameLst=[primaryTraitSymbol], TissueProbeSetFreezeId=TissueProbeSetFreezeId)
- primaryTraitValue = primaryTraitSymbolValueDict.values()[0]
-
- SymbolValueDict = getGeneSymbolTissueValueDictForTrait(cursor=cursor, GeneNameLst=[], TissueProbeSetFreezeId=TissueProbeSetFreezeId)
-
- if method =='1':
- symbolCorrDict, symbolPvalueDict = batchCalTissueCorr(primaryTraitValue,SymbolValueDict,method='spearman')
- else:
- symbolCorrDict, symbolPvalueDict = batchCalTissueCorr(primaryTraitValue,SymbolValueDict)
-
-
- return (symbolCorrDict, symbolPvalueDict)
+ return tissue_data.get_symbol_values_pairs() \ No newline at end of file
diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py
index 85a8c0ef..9f3f7982 100644
--- a/wqflask/wqflask/correlation/show_corr_results.py
+++ b/wqflask/wqflask/correlation/show_corr_results.py
@@ -21,9 +21,7 @@
from __future__ import absolute_import, print_function, division
import sys
-# sys.path.append(".") Never in a running webserver
-import gc
import string
import cPickle
import os
@@ -64,8 +62,6 @@ from flask import Flask, g
import utility.logger
logger = utility.logger.getLogger(__name__ )
-METHOD_SAMPLE_PEARSON = "1"
-METHOD_SAMPLE_RANK = "2"
METHOD_LIT = "3"
METHOD_TISSUE_PEARSON = "4"
METHOD_TISSUE_RANK = "5"
@@ -74,26 +70,7 @@ TISSUE_METHODS = [METHOD_TISSUE_PEARSON, METHOD_TISSUE_RANK]
TISSUE_MOUSE_DB = 1
-def print_mem(stage=""):
- mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
- #print("{}: {}".format(stage, mem/1024))
-
-class AuthException(Exception):
- pass
-
class CorrelationResults(object):
-
- corr_min_informative = 4
-
- #PAGE_HEADING = "Correlation Table"
- #CORRELATION_METHODS = {"1" : "Genetic Correlation (Pearson's r)",
- # "2" : "Genetic Correlation (Spearman's rho)",
- # "3" : "SGO Literature Correlation",
- # "4" : "Tissue Correlation (Pearson's r)",
- # "5" : "Tissue Correlation (Spearman's rho)"}
- #
- #RANK_ORDERS = {"1": 0, "2": 1, "3": 0, "4": 0, "5": 1}
-
def __init__(self, start_vars):
# get trait list from db (database name)
# calculate correlation with Base vector and targets
@@ -167,16 +144,12 @@ class CorrelationResults(object):
self.process_samples(start_vars, self.this_trait.data.keys(), primary_samples)
self.target_dataset = data_set.create_dataset(start_vars['corr_dataset'])
- # print("self.sample_data.keys: %s" % self.sample_data.keys)
self.target_dataset.get_trait_data(self.sample_data.keys())
self.correlation_results = []
self.correlation_data = {}
- db_filename = self.getFileName(target_db_name = self.target_dataset.name)
- cache_available = db_filename in os.listdir(webqtlConfig.GENERATED_TEXT_DIR)
-
if self.corr_type == "tissue":
self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol")
@@ -196,24 +169,6 @@ class CorrelationResults(object):
self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait])
elif self.corr_type == "sample":
- #ZS: Commented out since parallel correlation has issues with gunicorn
- # if self.dataset.type == "ProbeSet" and cache_available:
- # dataset_file = open(webqtlConfig.GENERATED_TEXT_DIR+db_filename,'r')
-
- ##XZ, 01/08/2009: read the first line
- # line = dataset_file.readline()
- # dataset_strains = webqtlUtil.readLineCSV(line)[1:]
-
- # self.this_trait_vals = []
- # for item in dataset_strains:
- # if item in self.sample_data:
- # self.this_trait_vals.append(self.sample_data[item])
- # else:
- # self.this_trait_vals.append("None")
- # num_overlap = len(self.this_trait_vals)
- # logger.debug("DOING PARALLEL")
- # self.do_parallel_correlation(db_filename, num_overlap)
- # else:
for trait, values in self.target_dataset.trait_data.iteritems():
self.get_sample_r_and_p_values(trait, values)
@@ -256,9 +211,6 @@ class CorrelationResults(object):
trait_object.sample_p,
trait_object.num_overlap) = self.correlation_data[trait]
- #Get symbol for trait and call function that gets each tissue value from the database (tables TissueProbeSetXRef,
- #TissueProbeSetData, etc) and calculates the correlation (cal_zero_order_corr_for_tissue in correlation_functions)
-
# Set some sane defaults
trait_object.tissue_corr = 0
trait_object.tissue_pvalue = 0
@@ -274,9 +226,6 @@ class CorrelationResults(object):
trait_object.sample_p,
trait_object.num_overlap) = self.correlation_data[trait]
- #Get symbol for trait and call function that gets each tissue value from the database (tables TissueProbeSetXRef,
- #TissueProbeSetData, etc) and calculates the correlation (cal_zero_order_corr_for_tissue in correlation_functions)
-
# Set some sane defaults
trait_object.tissue_corr = 0
trait_object.tissue_pvalue = 0
@@ -323,13 +272,6 @@ class CorrelationResults(object):
if self.this_trait.symbol.lower() in primary_trait_tissue_vals_dict:
primary_trait_tissue_values = primary_trait_tissue_vals_dict[self.this_trait.symbol.lower()]
-
- #gene_symbol_list = []
- #
- #for trait in self.correlation_results:
- # if hasattr(trait, 'symbol'):
- # gene_symbol_list.append(trait.symbol)
-
gene_symbol_list = [trait.symbol for trait in self.correlation_results if trait.symbol]
corr_result_tissue_vals_dict= correlation_functions.get_trait_symbol_and_tissue_values(
@@ -346,17 +288,6 @@ class CorrelationResults(object):
trait.tissue_corr = result[0]
trait.tissue_pvalue = result[2]
- # else:
- # trait.tissue_corr = None
- # trait.tissue_pvalue = None
- #else:
- # for trait in self.correlation_results:
- # trait.tissue_corr = None
- # trait.tissue_pvalue = None
-
- #return self.correlation_results
-
-
def do_tissue_correlation_for_all_traits(self, tissue_dataset_id=1):
#Gets tissue expression values for the primary trait
primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
@@ -377,7 +308,6 @@ class CorrelationResults(object):
for trait, symbol in self.trait_symbol_dict.iteritems():
if symbol and symbol.lower() in corr_result_tissue_vals_dict:
this_trait_tissue_values = corr_result_tissue_vals_dict[symbol.lower()]
- #print("this_trait_tissue_values: ", pf(this_trait_tissue_values))
result = correlation_functions.cal_zero_order_corr_for_tiss(primary_trait_tissue_values,
this_trait_tissue_values,
@@ -390,7 +320,6 @@ class CorrelationResults(object):
return tissue_corr_data
-
def do_lit_correlation_for_trait_list(self):
input_trait_mouse_gene_id = self.convert_to_mouse_gene_id(self.dataset.group.species.lower(), self.this_trait.geneid)
@@ -529,219 +458,6 @@ class CorrelationResults(object):
if num_overlap > 5:
self.correlation_data[trait] = [sample_r, sample_p, num_overlap]
-
- """
- correlations = []
-
- #XZ: Use the fast method only for probeset dataset, and this dataset must have been created.
- #XZ: Otherwise, use original method
- #print("Entering correlation")
-
- #db_filename = self.getFileName(target_db_name=self.target_db_name)
- #
- #cache_available = db_filename in os.listdir(webqtlConfig.GENERATED_TEXT_DIR)
-
- # If the cache file exists, do a cached correlation for probeset data
- if self.dataset.type == "ProbeSet":
-# if self.method in [METHOD_SAMPLE_PEARSON, METHOD_SAMPLE_RANK] and cache_available:
-# traits = do_parallel_correlation()
-#
-# else:
-
- traits = self.get_traits(self.vals)
-
- for trait in traits:
- trait.calculate_correlation(vals, self.method)
-
- self.record_count = len(traits) #ZS: This isn't a good way to get this value, so I need to change it later
-
- traits = sortTraitCorrelations(traits, self.method)
-
- # Strip to the top N correlations
- traits = traits[:min(self.returnNumber, len(traits))]
-
- addLiteratureCorr = False
- addTissueCorr = False
-
- trait_list = []
- for trait in traits:
- db_trait = webqtlTrait(db=self.db, name=trait.name, cursor=self.cursor)
- db_trait.retrieveInfo( QTL='Yes' )
-
- db_trait.Name = trait.name
- db_trait.corr = trait.correlation
- db_trait.nOverlap = trait.overlap
- db_trait.corrPValue = trait.p_value
-
- # NL, 07/19/2010
- # js function changed, add a new parameter rankOrder for js function 'showTissueCorrPlot'
- db_trait.RANK_ORDER = self.RANK_ORDERS[self.method]
-
- #XZ, 26/09/2008: Method is 4 or 5. Have fetched tissue corr, but no literature correlation yet.
- if self.method in TISSUE_METHODS:
- db_trait.tissueCorr = trait.tissue_corr
- db_trait.tissuePValue = trait.p_tissue
- addTissueCorr = True
-
-
- #XZ, 26/09/2008: Method is 3, Have fetched literature corr, but no tissue corr yet.
- elif self.method == METHOD_LIT:
- db_trait.LCorr = trait.lit_corr
- db_trait.mouse_geneid = self.translateToMouseGeneID(self.species, db_trait.geneid)
- addLiteratureCorr = True
-
- #XZ, 26/09/2008: Method is 1 or 2. Have NOT fetched literature corr and tissue corr yet.
- # Phenotype data will not have geneid, and neither will some probes
- # we need to handle this because we will get an attribute error
- else:
- if self.input_trait_mouse_gene_id and self.db.type=="ProbeSet":
- addLiteratureCorr = True
- if self.trait_symbol and self.db.type=="ProbeSet":
- addTissueCorr = True
-
- trait_list.append(db_trait)
-
- if addLiteratureCorr:
- trait_list = self.getLiteratureCorrelationByList(self.input_trait_mouse_gene_id,
- self.species, trait_list)
- if addTissueCorr:
- trait_list = self.getTissueCorrelationByList(
- primaryTraitSymbol = self.trait_symbol,
- traitList = trait_list,
- TissueProbeSetFreezeId = TISSUE_MOUSE_DB,
- method=self.method)
-
- return trait_list
- """
-
-
- def do_tissue_corr_for_all_traits_2(self):
- """Comments Possibly Out of Date!!!!!
-
- Uses get_temp_tissue_corr_table to generate table of tissue correlations
-
- This function then gathers that data and pairs it with the TraitID string.
- Takes as its arguments a formdata instance, and a dataset instance.
- Returns a dictionary of 'TraitID':(tissueCorr, tissuePValue)
- for the requested correlation
-
- Used when the user selects the tissue correlation method; i.e. not for the
- column that is appended to all probeset trait correlation tables
-
- """
-
- # table name string
- temp_table = self.get_temp_tissue_corr_table(tissue_probesetfreeze_id=TISSUE_MOUSE_DB,
- method=method)
-
- query = """SELECT ProbeSet.Name, {}.Correlation, {}.PValue
- FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
- LEFT JOIN {} ON {}.Symbol=ProbeSet.Symbol
- WHERE ProbeSetFreeze.Name = '{}'
- and ProbeSetFreeze.Id=ProbeSetXRef.ProbeSetFreezeId
- and ProbeSet.Id = ProbeSetXRef.ProbeSetId
- and ProbeSet.Symbol IS NOT NULL
- and {}.Correlation IS NOT NULL""".format(dataset.mescape(
- temp_table, temp_table, temp_table, temp_table,
- self.dataset.name, temp_table))
-
- results = g.db.execute(query).fetchall()
-
- tissue_corr_dict = {}
-
- for entry in results:
- trait_name, tissue_corr, tissue_pvalue = entry
- tissue_corr_dict[trait_name] = (tissue_corr, tissue_pvalue)
- #symbolList,
- #geneIdDict,
- #dataIdDict,
- #ChrDict,
- #MbDict,
- #descDict,
- #pTargetDescDict = getTissueProbeSetXRefInfo(
- # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId)
-
- g.db.execute('DROP TEMPORARY TABLE {}'.format(escape(temp_table)))
-
- return tissue_corr_dict
-
-
- #XZ, 09/23/2008: In tissue correlation tables, there is no record of GeneId1 == GeneId2
- #XZ, 09/24/2008: Note that the correlation value can be negative.
- def get_temp_tissue_corr_table(self,
- tissue_probesetfreeze_id=0,
- method="",
- return_number=0):
-
-
- def cmp_tisscorr_absolute_value(A, B):
- try:
- if abs(A[1]) < abs(B[1]): return 1
- elif abs(A[1]) == abs(B[1]):
- return 0
- else: return -1
- except:
- return 0
-
- symbol_corr_dict, symbol_pvalue_dict = self.calculate_corr_for_all_tissues(
- tissue_dataset_id=TISSUE_MOUSE_DB)
-
- symbol_corr_list = symbol_corr_dict.items()
-
- symbol_corr_list.sort(cmp_tisscorr_absolute_value)
- symbol_corr_list = symbol_corr_list[0 : 2*return_number]
-
- tmp_table_name = webqtlUtil.genRandStr(prefix="TOPTISSUE")
-
- q1 = 'CREATE TEMPORARY TABLE %s (Symbol varchar(100) PRIMARY KEY, Correlation float, PValue float)' % tmp_table_name
- self.cursor.execute(q1)
-
- for one_pair in symbol_corr_list:
- one_symbol = one_pair[0]
- one_corr = one_pair[1]
- one_p_value = symbol_pvalue_dict[one_symbol]
-
- self.cursor.execute( "INSERT INTO %s (Symbol, Correlation, PValue) VALUES ('%s',%f,%f)" % (tmpTableName, one_symbol, float(one_corr), float(one_p_value)) )
-
- return tmp_table_name
-
-
- def calculate_corr_for_all_tissues(self, tissue_dataset_id=None):
-
- symbol_corr_dict = {}
- symbol_pvalue_dict = {}
-
- primary_trait_symbol_value_dict = correlation_functions.make_gene_tissue_value_dict(
- GeneNameLst=[self.this_trait.symbol],
- TissueProbeSetFreezeId=tissue_dataset_id)
- primary_trait_value = primary_trait_symbol_value_dict.values()[0]
-
- symbol_value_dict = correlation_functions.make_gene_tissue_value_dict(
- gene_name_list=[],
- tissue_dataset_id=tissue_dataset_id)
-
- symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr(
- primaryTraitValue,
- SymbolValueDict,
- method=self.corr_method)
- #else:
- # symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr(
- # primaryTraitValue,
- # SymbolValueDict)
-
- return (symbolCorrDict, symbolPvalueDict)
-
- ##XZ, 12/16/2008: the input geneid is of mouse type
- #def checkSymbolForTissueCorr(self, tissueProbeSetFreezeId=0, symbol=""):
- # q = "SELECT 1 FROM TissueProbeSetXRef WHERE TissueProbeSetFreezeId=%s and Symbol='%s' LIMIT 1" % (tissueProbeSetFreezeId,symbol)
- # self.cursor.execute(q)
- # try:
- # x = self.cursor.fetchone()
- # if x: return True
- # else: raise
- # except: return False
-
-
def process_samples(self, start_vars, sample_names, excluded_samples=None):
if not excluded_samples:
excluded_samples = ()
@@ -754,566 +470,6 @@ class CorrelationResults(object):
if not value.strip().lower() == 'x':
self.sample_data[str(sample)] = float(value)
- ##XZ, 12/16/2008: the input geneid is of mouse type
- #def checkForLitInfo(self,geneId):
- # q = 'SELECT 1 FROM LCorrRamin3 WHERE GeneId1=%s LIMIT 1' % geneId
- # self.cursor.execute(q)
- # try:
- # x = self.cursor.fetchone()
- # if x: return True
- # else: raise
- # except: return False
-
-
-
- def fetchAllDatabaseData(self, species, GeneId, GeneSymbol, strains, db, method, returnNumber, tissueProbeSetFreezeId):
-
- StrainIds = []
- for item in strains:
- self.cursor.execute('''SELECT Strain.Id FROM Strain, Species WHERE Strain.Name="%s" and Strain.SpeciesId=Species.Id and Species.name = "%s" ''' % (item, species))
- Id = self.cursor.fetchone()[0]
- StrainIds.append('%d' % Id)
-
- # break it into smaller chunks so we don't overload the MySql server
- nnn = len(StrainIds) / 25
- if len(StrainIds) % 25:
- nnn += 1
- oridata = []
-
- #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
- tempTable = None
- if GeneId and db.type == "ProbeSet":
- if method == "3":
- tempTable = self.getTempLiteratureTable(species=species, input_species_geneid=GeneId, returnNumber=returnNumber)
-
- if method == "4" or method == "5":
- tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, TissueProbeSetFreezeId=TISSUE_MOUSE_DB, method=method, returnNumber=returnNumber)
-
- for step in range(nnn):
- temp = []
- StrainIdstep = StrainIds[step*25:min(len(StrainIds), (step+1)*25)]
- for item in StrainIdstep: temp.append('T%s.value' % item)
-
- if db.type == "Publish":
- query = "SELECT PublishXRef.Id, "
- dataStartPos = 1
- query += string.join(temp,', ')
- query += ' FROM (PublishXRef, PublishFreeze)'
- #XZ, 03/04/2009: Xiaodong changed Data to PublishData
- for item in StrainIdstep:
- query += 'left join PublishData as T%s on T%s.Id = PublishXRef.DataId and T%s.StrainId=%s\n' %(item,item,item,item)
- query += "WHERE PublishXRef.InbredSetId = PublishFreeze.InbredSetId and PublishFreeze.Name = '%s'" % (db.name, )
- #XZ, 09/20/2008: extract literature correlation value together with gene expression values.
- #XZ, 09/20/2008: notice the difference between the code in next block.
- #elif tempTable:
- # # we can get a little performance out of selecting our LitCorr here
- # # but also we need to do this because we are unconcerned with probes that have no geneId associated with them
- # # as we would not have litCorr data.
- #
- # if method == "3":
- # query = "SELECT %s.Name, %s.value," % (db.type,tempTable)
- # dataStartPos = 2
- # if method == "4" or method == "5":
- # query = "SELECT %s.Name, %s.Correlation, %s.PValue," % (db.type,tempTable, tempTable)
- # dataStartPos = 3
- #
- # query += string.join(temp,', ')
- # query += ' FROM (%s, %sXRef, %sFreeze)' % (db.type, db.type, db.type)
- # if method == "3":
- # query += ' LEFT JOIN %s ON %s.GeneId2=ProbeSet.GeneId ' % (tempTable,tempTable)
- # if method == "4" or method == "5":
- # query += ' LEFT JOIN %s ON %s.Symbol=ProbeSet.Symbol ' % (tempTable,tempTable)
- # #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item)
- # for item in StrainIdstep:
- # query += 'left join %sData as T%s on T%s.Id = %sXRef.DataId and T%s.StrainId=%s\n' %(db.type, item,item, db.type,item,item)
- #
- # if method == "3":
- # query += "WHERE ProbeSet.GeneId IS NOT NULL AND %s.value IS NOT NULL AND %sXRef.%sFreezeId = %sFreeze.Id and %sFreeze.Name = '%s' and %s.Id = %sXRef.%sId order by %s.Id" % (tempTable,db.type, db.type, db.type, db.type, db.name, db.type, db.type, db.type, db.type)
- # if method == "4" or method == "5":
- # query += "WHERE ProbeSet.Symbol IS NOT NULL AND %s.Correlation IS NOT NULL AND %sXRef.%sFreezeId = %sFreeze.Id and %sFreeze.Name = '%s' and %s.Id = %sXRef.%sId order by %s.Id" % (tempTable,db.type, db.type, db.type, db.type, db.name, db.type, db.type, db.type, db.type)
- else:
- query = "SELECT %s.Name," % db.type
- dataStartPos = 1
- query += string.join(temp,', ')
- query += ' FROM (%s, %sXRef, %sFreeze)' % (db.type, db.type, db.type)
- #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item)
- for item in StrainIdstep:
- query += 'left join %sData as T%s on T%s.Id = %sXRef.DataId and T%s.StrainId=%s\n' %(db.type, item,item, db.type,item,item)
- query += "WHERE %sXRef.%sFreezeId = %sFreeze.Id and %sFreeze.Name = '%s' and %s.Id = %sXRef.%sId order by %s.Id" % (db.type, db.type, db.type, db.type, db.name, db.type, db.type, db.type, db.type)
-
- self.cursor.execute(query)
- results = self.cursor.fetchall()
- oridata.append(results)
-
- datasize = len(oridata[0])
- traits = []
- # put all of the separate data together into a huge list of lists
- for j in range(datasize):
- traitdata = list(oridata[0][j])
- for i in range(1,nnn):
- traitdata += list(oridata[i][j][dataStartPos:])
-
- trait = Trait(traitdata[0], traitdata[dataStartPos:])
-
- if method == METHOD_LIT:
- trait.lit_corr = traitdata[1]
-
- if method in TISSUE_METHODS:
- trait.tissue_corr = traitdata[1]
- trait.p_tissue = traitdata[2]
-
- traits.append(trait)
-
- if tempTable:
- self.cursor.execute( 'DROP TEMPORARY TABLE %s' % tempTable )
-
- return traits
-
-
-
-
- # XZ, 09/20/2008: This function creates TEMPORARY TABLE tmpTableName_2 and return its name.
- # XZ, 09/20/2008: It stores top literature correlation values associated with the input geneId.
- # XZ, 09/20/2008: Attention: In each row, the input geneId is always in column GeneId1.
- #XZ, 12/16/2008: the input geneid can be of mouse, rat or human type
- def getTempLiteratureTable(self, species, input_species_geneid, returnNumber):
- # according to mysql the TEMPORARY TABLE name should not have to be unique because
- # it is only available to the current connection. This program will be invoked via command line, but if it
- # were to be invoked over mod_python this could cuase problems. mod_python will keep the connection alive
- # in its executing threads ( i think) so there is a potential for the table not being dropped between users.
- #XZ, 01/29/2009: To prevent the potential risk, I generate random table names and drop the tables after use them.
-
-
- # the 'input_species_geneid' could be rat or human geneid, need to translate it to mouse geneid
- translated_mouse_geneid = self.translateToMouseGeneID (species, input_species_geneid)
-
- tmpTableName_1 = webqtlUtil.genRandStr(prefix="LITERATURE")
-
- q1 = 'CREATE TEMPORARY TABLE %s (GeneId1 int(12) unsigned, GeneId2 int(12) unsigned PRIMARY KEY, value double)' % tmpTableName_1
- q2 = 'INSERT INTO %s (GeneId1, GeneId2, value) SELECT GeneId1,GeneId2,value FROM LCorrRamin3 WHERE GeneId1=%s' % (tmpTableName_1, translated_mouse_geneid)
- q3 = 'INSERT INTO %s (GeneId1, GeneId2, value) SELECT GeneId2,GeneId1,value FROM LCorrRamin3 WHERE GeneId2=%s AND GeneId1!=%s' % (tmpTableName_1, translated_mouse_geneid,translated_mouse_geneid)
- for x in [q1,q2,q3]: self.cursor.execute(x)
-
- #XZ, 09/23/2008: Just use the top records insteard of using all records
- tmpTableName_2 = webqtlUtil.genRandStr(prefix="TOPLITERATURE")
-
- q1 = 'CREATE TEMPORARY TABLE %s (GeneId1 int(12) unsigned, GeneId2 int(12) unsigned PRIMARY KEY, value double)' % tmpTableName_2
- self.cursor.execute(q1)
- q2 = 'SELECT GeneId1, GeneId2, value FROM %s ORDER BY value DESC' % tmpTableName_1
- self.cursor.execute(q2)
- result = self.cursor.fetchall()
-
- counter = 0 #this is to count how many records being inserted into table
- for one_row in result:
- mouse_geneid1, mouse_geneid2, lit_corr_alue = one_row
-
- #mouse_geneid1 has been tested before, now should test if mouse_geneid2 has corresponding geneid in other species
- translated_species_geneid = 0
- if species == 'mouse':
- translated_species_geneid = mouse_geneid2
- elif species == 'rat':
- self.cursor.execute( "SELECT rat FROM GeneIDXRef WHERE mouse=%d" % int(mouse_geneid2) )
- record = self.cursor.fetchone()
- if record:
- translated_species_geneid = record[0]
- elif species == 'human':
- self.cursor.execute( "SELECT human FROM GeneIDXRef WHERE mouse=%d" % int(mouse_geneid2) )
- record = self.cursor.fetchone()
- if record:
- translated_species_geneid = record[0]
-
- if translated_species_geneid:
- self.cursor.execute( 'INSERT INTO %s (GeneId1, GeneId2, value) VALUES (%d,%d,%f)' % (tmpTableName_2, int(input_species_geneid),int(translated_species_geneid), float(lit_corr_alue)) )
- counter = counter + 1
-
- #pay attention to the number
- if (counter > 2*returnNumber):
- break
-
- self.cursor.execute('DROP TEMPORARY TABLE %s' % tmpTableName_1)
-
- return tmpTableName_2
-
-
-
- #XZ, 01/09/2009: This function was created by David Crowell. Xiaodong cleaned up and modified it.
- def fetchLitCorrelations(self, species, GeneId, db, returnNumber): ### Used to generate Lit Correlations when calculations are done from text file. dcrowell August 2008
- """Uses getTempLiteratureTable to generate table of literatire correlations. This function then gathers that data and
- pairs it with the TraitID string. Takes as its arguments a formdata instance, and a database instance.
- Returns a dictionary of 'TraitID':'LitCorr' for the requested correlation"""
-
- tempTable = self.getTempLiteratureTable(species=species, input_species_geneid=GeneId, returnNumber=returnNumber)
-
- query = "SELECT %s.Name, %s.value" % (db.type,tempTable)
- query += ' FROM (%s, %sXRef, %sFreeze)' % (db.type, db.type, db.type)
- query += ' LEFT JOIN %s ON %s.GeneId2=ProbeSet.GeneId ' % (tempTable,tempTable)
- query += "WHERE ProbeSet.GeneId IS NOT NULL AND %s.value IS NOT NULL AND %sXRef.%sFreezeId = %sFreeze.Id and %sFreeze.Name = '%s' and %s.Id = %sXRef.%sId order by %s.Id" % (tempTable, db.type, db.type, db.type, db.type, db.name, db.type, db.type, db.type, db.type)
-
- self.cursor.execute(query)
- results = self.cursor.fetchall()
-
- litCorrDict = {}
-
- for entry in results:
- traitName,litcorr = entry
- litCorrDict[traitName] = litcorr
-
- self.cursor.execute('DROP TEMPORARY TABLE %s' % tempTable)
-
- return litCorrDict
-
-
- def get_traits(self, vals):
-
- #Todo: Redo cached stuff using memcached
- if False:
- lit_corrs = {}
- tissue_corrs = {}
- use_lit = False
- if self.method == METHOD_LIT:
- lit_corrs = self.fetchLitCorrelations(species=self.species, GeneId=self.gene_id, db=self.db, returnNumber=self.returnNumber)
- use_lit = True
-
- use_tissue_corr = False
- if self.method in TISSUE_METHODS:
- tissue_corrs = self.fetch_tissue_correlations(method=self.method, return_number = self.return_number)
- use_tissue_corr = True
-
- DatabaseFileName = self.getFileName( target_db_name=self.target_db_name )
- datasetFile = open(webqtlConfig.CACHEDIR+DatabaseFileName,'r')
-
- #XZ, 01/08/2009: read the first line
- line = datasetFile.readline()
- cached_sample_names = webqtlUtil.readLineCSV(line)[1:]
-
- #XZ, 01/08/2009: This step is critical. It is necessary for this new method.
- #XZ: The original function fetchAllDatabaseData uses all strains stored in variable _strains to
- #XZ: retrieve the values of each strain from database in real time.
- #XZ: The new method uses all strains stored in variable dataset_strains to create a new variable
- #XZ: _newvals. _newvals has the same length as dataset_strains. The items in _newvals is in
- #XZ: the same order of items in dataset_strains. The value of each item in _newvals is either
- #XZ: the value of correspinding strain in _vals or 'None'.
- new_vals = []
- for name in cached_sample_names:
- if name in self.sample_names:
- new_vals.append(float(vals[self.sample_names.index(name)]))
- else:
- new_vals.append('None')
-
- nnCorr = len(new_vals)
-
- #XZ, 01/14/2009: If literature corr or tissue corr is selected,
- #XZ: there is no need to use parallel computing.
-
- traits = []
- data_start = 1
- for line in datasetFile:
- raw_trait = webqtlUtil.readLineCSV(line)
- trait = Trait.from_csv(raw_trait, data_start)
- trait.lit_corr = lit_corrs.get(trait.name)
- trait.tissue_corr, trait.p_tissue = tissue_corrs.get(trait.name, (None, None))
- traits.append(trait)
-
- return traits, new_vals
-
- else:
- traits = self.fetchAllDatabaseData(species=self.dataset.species,
- GeneId=self.gene_id,
- GeneSymbol=self.trait.symbol,
- strains=self.sample_names,
- db=self.db,
- method=self.method,
- returnNumber=self.returnNumber,
- tissueProbeSetFreezeId= self.tissue_probeset_freeze_id)
- totalTraits = len(traits) #XZ, 09/18/2008: total trait number
-
- return traits
-
- def calculate_corr_for_all_tissues(self, tissue_dataset_id=None):
-
- symbol_corr_dict = {}
- symbol_pvalue_dict = {}
-
- primary_trait_symbol_value_dict = correlation_functions.make_gene_tissue_value_dict(
- GeneNameLst=[self.this_trait.symbol],
- TissueProbeSetFreezeId=tissue_dataset_id)
- primary_trait_value = primary_trait_symbol_value_dict.values()[0]
-
- symbol_value_dict = correlation_functions.make_gene_tissue_value_dict(
- gene_name_list=[],
- tissue_dataset_id=tissue_dataset_id)
-
- symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr(
- primaryTraitValue,
- SymbolValueDict,
- method=self.corr_method)
- #else:
- # symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr(
- # primaryTraitValue,
- # SymbolValueDict)
-
- return (symbolCorrDict, symbolPvalueDict)
-
- def getFileName(self, target_db_name): ### dcrowell August 2008
- """Returns the name of the reference database file with which correlations are calculated.
- Takes argument cursor which is a cursor object of any instance of a subclass of templatePage
- Used by correlationPage"""
-
- dataset_id = str(self.target_dataset.id)
- dataset_fullname = self.target_dataset.fullname.replace(' ','_')
- dataset_fullname = dataset_fullname.replace('/','_')
-
- FileName = 'ProbeSetFreezeId_' + dataset_id + '_FullName_' + dataset_fullname + '.txt'
-
- return FileName
-
- def do_parallel_correlation(self, db_filename, num_overlap):
-
- #XZ, 01/14/2009: This method is for parallel computing only.
- #XZ: It is supposed to be called when "Genetic Correlation, Pearson's r" (method 1)
- #XZ: or "Genetic Correlation, Spearman's rho" (method 2) is selected
- def compute_corr(input_nnCorr, input_trait, input_list, corr_method):
-
- import math
- import reaper
-
- def cmpOrder2(A,B):
- try:
- if A[-1] < B[-1]:
- return -1
- elif A[-1] == B[-1]:
- return 0
- else:
- return 1
- except:
- return 0
-
- def calCorrelation(dbdata,userdata,N):
- X = []
- Y = []
- for i in range(N):
- if (dbdata[i] != None and userdata[i] != None) and (dbdata[i] != "None" and userdata[i] != "None"):
- X.append(float(dbdata[i]))
- Y.append(float(userdata[i]))
- NN = len(X)
- if NN <6:
- return (0.0,NN)
- sx = reduce(lambda x,y:x+y,X,0.0)
- sy = reduce(lambda x,y:x+y,Y,0.0)
- meanx = sx/NN
- meany = sy/NN
- xyd = 0.0
- sxd = 0.0
- syd = 0.0
- for i in range(NN):
- xyd += (X[i] - meanx)*(Y[i]-meany)
- sxd += (X[i] - meanx)*(X[i] - meanx)
- syd += (Y[i] - meany)*(Y[i] - meany)
- try:
- corr = xyd/(math.sqrt(sxd)*math.sqrt(syd))
- except:
- corr = 0
- return (corr,NN)
-
- def calCorrelationRank(xVals,yVals,N):
- """
- Calculated Spearman Ranked Correlation. The algorithm works
- by setting all tied ranks to the average of those ranks (for
- example, if ranks 5-10 all have the same value, each will be set
- to rank 7.5).
- """
-
- XX = []
- YY = []
- j = 0
-
- for i in range(len(xVals)):
- if (xVals[i]!= None and yVals[i]!= None) and (xVals[i] != "None" and yVals[i] != "None"):
- XX.append((j,float(xVals[i])))
- YY.append((j,float(yVals[i])))
- j = j+1
-
- NN = len(XX)
- if NN <6:
- return (0.0,NN)
- XX.sort(cmpOrder2)
- YY.sort(cmpOrder2)
- X = [0]*NN
- Y = [0]*NN
-
- j = 1
- rank = 0.0
- t = 0.0
- sx = 0.0
-
- while j < NN:
-
- if XX[j][1] != XX[j-1][1]:
- X[XX[j-1][0]] = j
- j = j+1
-
- else:
- jt = j+1
- ji = j
- for jt in range(j+1, NN):
- if (XX[jt][1] != XX[j-1][1]):
- break
- rank = 0.5*(j+jt)
- for ji in range(j-1, jt):
- X[XX[ji][0]] = rank
- t = jt-j
- sx = sx + (t*t*t-t)
- if (jt == NN-1):
- if (XX[jt][1] == XX[j-1][1]):
- X[XX[NN-1][0]] = rank
- j = jt+1
-
- if j == NN:
- if X[XX[NN-1][0]] == 0:
- X[XX[NN-1][0]] = NN
-
- j = 1
- rank = 0.0
- t = 0.0
- sy = 0.0
-
- while j < NN:
-
- if YY[j][1] != YY[j-1][1]:
- Y[YY[j-1][0]] = j
- j = j+1
- else:
- jt = j+1
- ji = j
- for jt in range(j+1, NN):
- if (YY[jt][1] != YY[j-1][1]):
- break
- rank = 0.5*(j+jt)
- for ji in range(j-1, jt):
- Y[YY[ji][0]] = rank
- t = jt - j
- sy = sy + (t*t*t-t)
- if (jt == NN-1):
- if (YY[jt][1] == YY[j-1][1]):
- Y[YY[NN-1][0]] = rank
- j = jt+1
-
- if j == NN:
- if Y[YY[NN-1][0]] == 0:
- Y[YY[NN-1][0]] = NN
-
- D = 0.0
-
- for i in range(NN):
- D += (X[i]-Y[i])*(X[i]-Y[i])
-
- fac = (1.0 -sx/(NN*NN*NN-NN))*(1.0-sy/(NN*NN*NN-NN))
-
- return ((1-(6.0/(NN*NN*NN-NN))*(D+(sx+sy)/12.0))/math.sqrt(fac),NN)
-
- # allcorrelations = []
-
- correlation_data = {}
- for i, line in enumerate(input_list):
- if i == 0:
- continue
- tokens = line.split('","')
- tokens[-1] = tokens[-1][:-2] #remove the last "
- tokens[0] = tokens[0][1:] #remove the first "
-
- traitdataName = tokens[0]
- database_trait = tokens[1:]
-
- #print("database_trait:", database_trait)
-
- #ZS: 2015 could add biweight correlation, see http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3465711/
- # if corr_method == 'pearson':
- # sample_r, sample_p = scipy.stats.pearsonr(input_trait, database_trait)
- # else:
- # sample_r, sample_p = scipy.stats.spearmanr(input_trait, database_trait)
-
- if corr_method == "pearson": #XZ: Pearson's r
- sample_r, nOverlap = calCorrelation(input_trait, database_trait, input_nnCorr)
- else: #XZ: Spearman's rho
- sample_r, nOverlap = calCorrelationRank(input_trait, database_trait, input_nnCorr)
-
- #XZ: calculate corrPValue
- if nOverlap < 3:
- sample_p = 1.0
- else:
- if abs(sample_r) >= 1.0:
- sample_p = 0.0
- else:
- z_value = 0.5*math.log((1.0+sample_r)/(1.0-sample_r))
- z_value = z_value*math.sqrt(nOverlap-3)
- sample_p = 2.0*(1.0 - reaper.normp(abs(z_value)))
-
- correlation_data[traitdataName] = [sample_r, sample_p, nOverlap]
-
- # traitinfo = [traitdataName, sample_r, nOverlap]
- # allcorrelations.append(traitinfo)
-
- return correlation_data
- # return allcorrelations
-
-
- datasetFile = open(webqtlConfig.GENERATED_TEXT_DIR+db_filename,'r')
-
- print("Invoking parallel computing")
- input_line_list = datasetFile.readlines()
- print("Read lines from the file")
- all_line_number = len(input_line_list)
-
- step = 1000
- job_number = math.ceil( float(all_line_number)/step )
-
- print("JOB NUMBER", job_number)
-
- job_input_lists = []
-
- print("Configuring jobs")
-
- for job_index in range( int(job_number) ):
- starti = job_index*step
- endi = min((job_index+1)*step, all_line_number)
-
- one_job_input_list = []
-
- for i in range( starti, endi ):
- one_job_input_list.append( input_line_list[i] )
-
- job_input_lists.append( one_job_input_list )
-
- print("Creating pp servers")
-
- ppservers = ()
- # Creates jobserver with automatically detected number of workers
- job_server = pp.Server(ppservers=ppservers)
-
- print("Done creating servers")
-
- jobs = []
- results = []
-
- print("Starting parallel computation, submitting jobs")
- for one_job_input_list in job_input_lists: #pay attention to modules from outside
- jobs.append( job_server.submit(func=compute_corr, args=(num_overlap, self.this_trait_vals, one_job_input_list, self.corr_method), depfuncs=(), modules=("webqtlUtil",)) )
- print("Done submitting jobs")
-
- for one_job in jobs:
- one_result = one_job()
- self.correlation_data.update(one_result)
- # one_result = one_job()
- # results.append( one_result )
-
- #print("CORRELATION DATA:", self.correlation_data)
-
- # print("Acquiring results")
-
- # for one_result in results:
- # for one_traitinfo in one_result:
- # allcorrelations.append( one_traitinfo )
-
def generate_corr_json(corr_results, this_trait, dataset, target_dataset):
results_list = []
for i, trait in enumerate(corr_results):
diff --git a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
index 077386a3..4bb4d65d 100644
--- a/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
+++ b/wqflask/wqflask/correlation_matrix/show_corr_matrix.py
@@ -23,7 +23,6 @@ from __future__ import absolute_import, print_function, division
import sys
# sys.path.append(".") Never do this in a webserver!
-import gc
import string
import cPickle
import os
diff --git a/wqflask/wqflask/ctl/ctl_analysis.py b/wqflask/wqflask/ctl/ctl_analysis.py
index 9515d23a..6fda02fd 100644
--- a/wqflask/wqflask/ctl/ctl_analysis.py
+++ b/wqflask/wqflask/ctl/ctl_analysis.py
@@ -2,7 +2,6 @@
# Author / Maintainer: Danny Arends <Danny.Arends@gmail.com>
import sys
from numpy import *
-import scipy as sp # SciPy
import rpy2.robjects as ro # R Objects
import rpy2.rinterface as ri
@@ -24,60 +23,38 @@ from utility import helper_functions
from utility.tools import locate
from rpy2.robjects.packages import importr
-utils = importr("utils")
+
+import utility.logger
+logger = utility.logger.getLogger(__name__ )
## Get pointers to some common R functions
r_library = ro.r["library"] # Map the library function
r_options = ro.r["options"] # Map the options function
-r_read_csv = ro.r["read.csv"] # Map the read.csv function
-r_dim = ro.r["dim"] # Map the dim function
-r_c = ro.r["c"] # Map the c function
r_t = ro.r["t"] # Map the t function
-r_cat = ro.r["cat"] # Map the cat function
-r_paste = ro.r["paste"] # Map the paste function
-r_unlist = ro.r["unlist"] # Map the unlist function
-r_head = ro.r["head"] # Map the unlist function
-r_unique = ro.r["unique"] # Map the unique function
-r_length = ro.r["length"] # Map the length function
r_unlist = ro.r["unlist"] # Map the unlist function
r_list = ro.r.list # Map the list function
-r_matrix = ro.r.matrix # Map the matrix function
-r_seq = ro.r["seq"] # Map the seq function
-r_table = ro.r["table"] # Map the table function
-r_names = ro.r["names"] # Map the names function
-r_sink = ro.r["sink"] # Map the sink function
-r_is_NA = ro.r["is.na"] # Map the is.na function
-r_file = ro.r["file"] # Map the file function
r_png = ro.r["png"] # Map the png function for plotting
r_dev_off = ro.r["dev.off"] # Map the dev.off function
-r_save_image = ro.r["save.image"] # Map the save.image function
-r_class = ro.r["class"] # Map the class function
-r_save = ro.r["save"] # Map the save function
r_write_table = ro.r["write.table"] # Map the write.table function
-r_read_table = ro.r["read.table"] # Map the read.table function
-r_as_data_frame = ro.r["as.data.frame"] # Map the write.table function
r_data_frame = ro.r["data.frame"] # Map the write.table function
r_as_numeric = ro.r["as.numeric"] # Map the write.table function
class CTL(object):
def __init__(self):
- print("Initialization of CTL")
+ logger.info("Initialization of CTL")
#log = r_file("/tmp/genenetwork_ctl.log", open = "wt")
- #r_sink(log) # Uncomment the r_sink() commands to log output from stdout/stderr to a file
+ #r_sink(log) # Uncomment the r_sink() commands to log output from stdout/stderr to a file
#r_sink(log, type = "message")
- r_library("ctl") # Load CTL - Should only be done once, since it is quite expensive
+ r_library("ctl") # Load CTL - Should only be done once, since it is quite expensive
r_options(stringsAsFactors = False)
- print("Initialization of CTL done, package loaded in R session")
+ logger.info("Initialization of CTL done, package loaded in R session")
self.r_CTLscan = ro.r["CTLscan"] # Map the CTLscan function
self.r_CTLsignificant = ro.r["CTLsignificant"] # Map the CTLsignificant function
self.r_lineplot = ro.r["ctl.lineplot"] # Map the ctl.lineplot function
- self.r_CTLsignificant = ro.r["CTLsignificant"] # Map the CTLsignificant function
- self.r_CTLnetwork = ro.r["CTLnetwork"] # Map the CTLnetwork function
- self.r_CTLprofiles = ro.r["CTLprofiles"] # Map the CTLprofiles function
self.r_plotCTLobject = ro.r["plot.CTLobject"] # Map the CTLsignificant function
self.nodes_list = []
self.edges_list = []
- print("Obtained pointers to CTL functions")
+ logger.info("Obtained pointers to CTL functions")
def addNode(self, gt):
node_dict = { 'data' : {'id' : str(gt.name) + ":" + str(gt.dataset.name),
@@ -100,20 +77,20 @@ class CTL(object):
self.edges_list.append(edge_dict)
def run_analysis(self, requestform):
- print("Starting CTL analysis on dataset")
+ logger.info("Starting CTL analysis on dataset")
self.trait_db_list = [trait.strip() for trait in requestform['trait_list'].split(',')]
self.trait_db_list = [x for x in self.trait_db_list if x]
- print("strategy:", requestform.get("strategy"))
+ logger.debug("strategy:", requestform.get("strategy"))
strategy = requestform.get("strategy")
- print("nperm:", requestform.get("nperm"))
+ logger.debug("nperm:", requestform.get("nperm"))
nperm = int(requestform.get("nperm"))
- print("parametric:", requestform.get("parametric"))
+ logger.debug("parametric:", requestform.get("parametric"))
parametric = bool(requestform.get("parametric"))
- print("significance:", requestform.get("significance"))
+ logger.debug("significance:", requestform.get("significance"))
significance = float(requestform.get("significance"))
# Get the name of the .geno file belonging to the first phenotype
@@ -123,7 +100,7 @@ class CTL(object):
genofilelocation = locate(dataset.group.name + ".geno", "genotype")
parser = genofile_parser.ConvertGenoFile(genofilelocation)
parser.process_csv()
- print(dataset.group)
+ logger.debug("dataset group: ", dataset.group)
# Create a genotype matrix
individuals = parser.individuals
markers = []
@@ -133,14 +110,14 @@ class CTL(object):
markers.append(marker["genotypes"])
genotypes = list(itertools.chain(*markers))
- print(len(genotypes) / len(individuals), "==", len(parser.markers))
+ logger.debug(len(genotypes) / len(individuals), "==", len(parser.markers))
rGeno = r_t(ro.r.matrix(r_unlist(genotypes), nrow=len(markernames), ncol=len(individuals), dimnames = r_list(markernames, individuals), byrow=True))
# Create a phenotype matrix
traits = []
for trait in self.trait_db_list:
- print("retrieving data for", trait)
+ logger.debug("retrieving data for", trait)
if trait != "":
ts = trait.split(':')
gt = TRAIT.GeneralTrait(name = ts[0], dataset_name = ts[1])
@@ -153,7 +130,7 @@ class CTL(object):
rPheno = r_t(ro.r.matrix(r_as_numeric(r_unlist(traits)), nrow=len(self.trait_db_list), ncol=len(individuals), dimnames = r_list(self.trait_db_list, individuals), byrow=True))
- print(rPheno)
+ logger.debug(rPheno)
# Use a data frame to store the objects
rPheno = r_data_frame(rPheno, check_names = False)
@@ -196,10 +173,9 @@ class CTL(object):
sys.stdout.flush()
# Create the interactive graph for cytoscape visualization (Nodes and Edges)
- print(type(significant))
if not type(significant) == ri.RNULLType:
for x in range(len(significant[0])):
- print(significant[0][x], significant[1][x], significant[2][x]) # Debug to console
+ logger.debug(significant[0][x], significant[1][x], significant[2][x]) # Debug to console
tsS = significant[0][x].split(':') # Source
tsT = significant[2][x].split(':') # Target
gtS = TRAIT.GeneralTrait(name = tsS[0], dataset_name = tsS[1]) # Retrieve Source info from the DB
@@ -214,7 +190,6 @@ class CTL(object):
self.elements = json.dumps(self.nodes_list + self.edges_list)
def loadImage(self, path, name):
- print("pre-loading imgage results:", self.results[path])
imgfile = open(self.results[path], 'rb')
imgdata = imgfile.read()
imgB64 = imgdata.encode("base64")
@@ -229,7 +204,7 @@ class CTL(object):
n = n + 1
def process_results(self, results):
- print("Processing CTL output")
+ logger.info("Processing CTL output")
template_vars = {}
template_vars["results"] = self.results
template_vars["elements"] = self.elements
diff --git a/wqflask/wqflask/templates/correlation_page.html b/wqflask/wqflask/templates/correlation_page.html
index fb4e19a1..05136ad8 100644
--- a/wqflask/wqflask/templates/correlation_page.html
+++ b/wqflask/wqflask/templates/correlation_page.html
@@ -94,12 +94,14 @@
<th>Sample p(r)</th>
<th>Lit r</th>
<th>Tissue r</th>
+ <th>Tissue p(r)</th>
{% else %}
<th>Sample rho</th>
<th>N</th>
<th>Sample p(rho)</th>
<th>Lit r</th>
<th>Tissue rho</th>
+ <th>Tissue p(rho)</th>
{% endif %}
{% elif target_dataset.type == "Publish" %}
{% if corr_method == 'pearson' %}
@@ -156,8 +158,10 @@
{% endif %}
{% if trait.tissue_corr == "" or trait.tissue_corr == 0.000 %}
<td align="right">--</td>
+ <td align="right">--</td>
{% else %}
<td align="right">{{'%0.3f'|format(trait.tissue_corr)}}</td>
+ <td align="right">{{'%0.3e'|format(trait.tissue_pvalue)}}</td>
{% endif %}
{% elif target_dataset.type == "Publish" %}
<td>{{ trait.description_display }}</td>
@@ -319,7 +323,7 @@
title: 'correlation_results',
fieldBoundary: '"',
exportOptions: {
- columns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
+ columns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
}
}
],
@@ -342,7 +346,8 @@
{ "type": "natural" },
{ "type": "scientific" },
{ "type": "natural" },
- { "type": "natural" }
+ { "type": "natural" },
+ { "type": "scientific" }
],
"createdRow": function ( row, data, index ) {
$('td', row).eq(4).attr('title', $('td', row).eq(4).text());