From 99e1151d5603b1bbf52141166d72e6e32203bb62 Mon Sep 17 00:00:00 2001 From: Zachary Sloan Date: Tue, 16 Jul 2013 23:24:57 +0000 Subject: Wrote code that can get a dataset's type for every single GN dataset; previously we could not view traits in datasets that were not in the DBType table in the database --- wqflask/base/data_set.py | 90 +- wqflask/base/trait.py | 12 +- wqflask/wqflask/correlation/correlationFunction.py | 923 --------------------- .../wqflask/correlation/correlation_function.py | 923 +++++++++++++++++++++ wqflask/wqflask/correlation/show_corr_results.py | 130 +-- wqflask/wqflask/search_results.py | 8 +- 6 files changed, 1085 insertions(+), 1001 deletions(-) delete mode 100644 wqflask/wqflask/correlation/correlationFunction.py create mode 100644 wqflask/wqflask/correlation/correlation_function.py (limited to 'wqflask') diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 03b24230..30221503 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -48,32 +48,67 @@ from MySQLdb import escape_string as escape from pprint import pformat as pf # Used by create_database to instantiate objects +# Each subclass will add to this DS_NAME_MAP = {} def create_dataset(dataset_name, dataset_type = None): - #print("dataset_name:", dataset_name) - + + print("dataset_type:", dataset_type) if not dataset_type: - query = """ - SELECT DBType.Name - FROM DBList, DBType - WHERE DBList.Name = '{}' and - DBType.Id = DBList.DBTypeId - """.format(escape(dataset_name)) - #print("query is: ", pf(query)) - dataset_type = g.db.execute(query).fetchone().Name + dataset_type = Dataset_Getter(dataset_name) + #dataset_type = get_dataset_type_from_json(dataset_name) - #dataset_type = cursor.fetchone()[0] - #print("[blubber] dataset_type:", pf(dataset_type)) + print("dataset_type is:", dataset_type) + #query = """ + # SELECT DBType.Name + # FROM DBList, DBType + # WHERE DBList.Name = '{}' and + # DBType.Id = DBList.DBTypeId + # """.format(escape(dataset_name)) + #dataset_type = g.db.execute(query).fetchone().Name - dataset_ob = DS_NAME_MAP[dataset_type] - #dataset_class = getattr(data_set, dataset_ob) - #print("dataset_ob:", dataset_ob) - #print("DS_NAME_MAP:", pf(DS_NAME_MAP)) + dataset_ob = DS_NAME_MAP[dataset_type] dataset_class = globals()[dataset_ob] return dataset_class(dataset_name) + +#def get_dataset_type_from_json(dataset_name): + +class Dataset_Types(object): + + def __init__(self): + self.datasets = {} + file_name = "wqflask/static/new/javascript/dataset_menu_structure.json" + with open(file_name, 'r') as fh: + data = json.load(fh) + + print("*" * 70) + for species in data['datasets']: + for group in data['datasets'][species]: + for dataset_type in data['datasets'][species][group]: + for dataset in data['datasets'][species][group][dataset_type]: + print("dataset is:", dataset) + + short_dataset_name = dataset[0] + if dataset_type == "Phenotypes": + new_type = "Publish" + elif dataset_type == "Genotypes": + new_type = "Geno" + else: + new_type = "ProbeSet" + self.datasets[short_dataset_name] = new_type + + def __call__(self, name): + return self.datasets[name] + +# Do the intensive work at startup one time only +Dataset_Getter = Dataset_Types() + +# +#print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611")) + + def create_datasets_list(): key = "all_datasets" result = Redis.get(key) @@ -212,7 +247,7 @@ class DatasetGroup(object): marker_class = Markers self.markers = marker_class(self.name) - + def get_f1_parent_strains(self): try: @@ -225,7 +260,7 
@@ class DatasetGroup(object): self.f1list = [f1, f12] if maternal and paternal: self.parlist = [maternal, paternal] - + def read_genotype_file(self): '''Read genotype from .geno file instead of database''' #if self.group == 'BXD300': @@ -375,6 +410,9 @@ class PhenotypeDataSet(DataSet): DS_NAME_MAP['Publish'] = 'PhenotypeDataSet' def setup(self): + + print("IS A PHENOTYPEDATASET") + # Fields in the database table self.search_fields = ['Phenotype.Post_publication_description', 'Phenotype.Pre_publication_description', @@ -445,14 +483,24 @@ class PhenotypeDataSet(DataSet): def get_trait_info(self, trait_list, species = ''): for this_trait in trait_list: if not this_trait.haveinfo: - this_trait.retrieveInfo(QTL=1) + this_trait.retrieve_info(get_qtl_info=True) description = this_trait.post_publication_description + + #If the dataset is confidential and the user has access to confidential + #phenotype traits, then display the pre-publication description instead + #of the post-publication description if this_trait.confidential: continue # for now - if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users): + + if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait( + privilege=self.privilege, + userName=self.userName, + authorized_users=this_trait.authorized_users): + description = this_trait.pre_publication_description - this_trait.description_display = unicode(description, "utf8") + + this_trait.description_display = description if not this_trait.year.isdigit(): this_trait.pubmed_text = "N/A" diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index db76ddea..6648047c 100755 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -320,7 +320,11 @@ class GeneralTrait(object): #XZ: assign SQL query result to trait attributes. for i, field in enumerate(self.dataset.display_fields): print(" mike: {} -> {} - {}".format(field, type(trait_info[i]), trait_info[i])) - setattr(self, field, trait_info[i]) + holder = trait_info[i] + if isinstance(trait_info[i], basestring): + print("is basestring") + holder = unicode(trait_info[i], "utf8") + setattr(self, field, holder) if self.dataset.type == 'Publish': self.confidential = 0 @@ -329,9 +333,9 @@ class GeneralTrait(object): self.homologeneid = None - print("self.geneid is:", self.geneid) - print(" type:", type(self.geneid)) - print("self.dataset.group.name is:", self.dataset.group.name) + #print("self.geneid is:", self.geneid) + #print(" type:", type(self.geneid)) + #print("self.dataset.group.name is:", self.dataset.group.name) if self.dataset.type == 'ProbeSet' and self.dataset.group and self.geneid: #XZ, 05/26/2010: From time to time, this query get error message because some geneid values in database are not number. #XZ: So I have to test if geneid is number before execute the query. diff --git a/wqflask/wqflask/correlation/correlationFunction.py b/wqflask/wqflask/correlation/correlationFunction.py deleted file mode 100644 index 7d4b58a9..00000000 --- a/wqflask/wqflask/correlation/correlationFunction.py +++ /dev/null @@ -1,923 +0,0 @@ -# Copyright (C) University of Tennessee Health Science Center, Memphis, TN. -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU Affero General Public License -# as published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# See the GNU Affero General Public License for more details. -# -# This program is available from Source Forge: at GeneNetwork Project -# (sourceforge.net/projects/genenetwork/). -# -# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) -# at rwilliams@uthsc.edu and xzhou15@uthsc.edu -# -# -# -# This module is used by GeneNetwork project (www.genenetwork.org) -# -# Created by GeneNetwork Core Team 2010/08/10 -# -# Last updated by NL 2011/03/23 - - -import math -#import rpy2.robjects -import pp -import string - -from utility import webqtlUtil -from base.trait import GeneralTrait -from dbFunction import webqtlDatabaseFunction - - - -#XZ: The input 'controls' is String. It contains the full name of control traits. -#XZ: The input variable 'strainlst' is List. It contains the strain names of primary trait. -#XZ: The returned tcstrains is the list of list [[],[]...]. So are tcvals and tcvars. The last returned parameter is list of numbers. -#XZ, 03/29/2010: For each returned control trait, there is no None value in it. -def controlStrains(controls, strainlst): - - controls = controls.split(',') - - cvals = {} - for oneTraitName in controls: - oneTrait = webqtlTrait(fullname=oneTraitName, cursor=webqtlDatabaseFunction.getCursor() ) - oneTrait.retrieveData() - cvals[oneTraitName] = oneTrait.data - - tcstrains = [] - tcvals = [] - tcvars = [] - - for oneTraitName in controls: - strains = [] - vals = [] - vars = [] - - for _strain in strainlst: - if cvals[oneTraitName].has_key(_strain): - _val = cvals[oneTraitName][_strain].val - if _val != None: - strains.append(_strain) - vals.append(_val) - vars.append(None) - - tcstrains.append(strains) - tcvals.append(vals) - tcvars.append(vars) - - return tcstrains, tcvals, tcvars, [len(x) for x in tcstrains] - - - -#XZ, 03/29/2010: After execution of functon "controlStrains" and "fixStrains", primary trait and control traits have the same strains and in the same order. There is no 'None' value in them. -def fixStrains(_strains,_controlstrains,_vals,_controlvals,_vars,_controlvars): - """Corrects strains, vals, and vars so that all contrain only those strains common - to the reference trait and all control traits.""" - - def dictify(strains,vals,vars): - subdict = {} - for i in xrange(len(strains)): - subdict[strains[i]] = (vals[i],vars[i]) - return subdict - - #XZ: The 'dicts' is a list of dictionary. The first element is the dictionary of reference trait. The rest elements are for control traits. - dicts = [] - dicts.append(dictify(_strains,_vals,_vars)) - - nCstrains = len(_controlstrains) - for i in xrange(nCstrains): - dicts.append(dictify(_controlstrains[i],_controlvals[i],_controlvars[i])) - - _newstrains = [] - _vals = [] - _vars = [] - _controlvals = [[] for x in xrange(nCstrains)] - _controlvars = [[] for x in xrange(nCstrains)] - - for strain in _strains: - inall = True - for d in dicts: - if strain not in d: - inall = False - break - if inall: - _newstrains.append(strain) - _vals.append(dicts[0][strain][0]) - _vars.append(dicts[0][strain][1]) - for i in xrange(nCstrains): - _controlvals[i].append(dicts[i+1][strain][0]) - _controlvars[i].append(dicts[i+1][strain][1]) - - return _newstrains, _vals, _controlvals, _vars, _controlvars - - -#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. 
-#else, the returned list has two elements of control trait name. -def findIdenticalControlTraits ( controlVals, controlNames ): - nameOfIdenticalTraits = [] - - controlTraitNumber = len(controlVals) - - if controlTraitNumber > 1: - - #XZ: reset the precision of values and convert to string type - for oneTraitVal in controlVals: - for oneStrainVal in oneTraitVal: - oneStrainVal = '%.3f' % oneStrainVal - - for i, oneTraitVal in enumerate( controlVals ): - for j in range(i+1, controlTraitNumber): - if oneTraitVal == controlVals[j]: - nameOfIdenticalTraits.append(controlNames[i]) - nameOfIdenticalTraits.append(controlNames[j]) - - return nameOfIdenticalTraits - -#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. -#else, the returned list has two elements of control trait name. -#primaryVal is of list type. It contains value of primary trait. -#primaryName is of string type. -#controlVals is of list type. Each element is list too. Each element contain value of one control trait. -#controlNames is of list type. -def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ): - nameOfIdenticalTraits = [] - - #XZ: reset the precision of values and convert to string type - for oneStrainVal in primaryVal: - oneStrainVal = '%.3f' % oneStrainVal - - for oneTraitVal in controlVals: - for oneStrainVal in oneTraitVal: - oneStrainVal = '%.3f' % oneStrainVal - - controlTraitNumber = len(controlVals) - - if controlTraitNumber > 1: - for i, oneTraitVal in enumerate( controlVals ): - for j in range(i+1, controlTraitNumber): - if oneTraitVal == controlVals[j]: - nameOfIdenticalTraits.append(controlNames[i]) - nameOfIdenticalTraits.append(controlNames[j]) - break - - if len(nameOfIdenticalTraits) == 0: - for i, oneTraitVal in enumerate( controlVals ): - if primaryVal == oneTraitVal: - nameOfIdenticalTraits.append(primaryName) - nameOfIdenticalTraits.append(controlNames[i]) - break - - return nameOfIdenticalTraits - - - -#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order. -#XZ: No value in primaryVal and controlVals could be None. - -def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'): - - def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ): - - rpy2.robjects.r(""" -pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){ - # The partial correlation coefficient between x and y given z - # - # pcor.test is free and comes with ABSOLUTELY NO WARRANTY. - # - # x and y should be vectors - # - # z can be either a vector or a matrix - # - # use: There are two methods to calculate the partial correlation coefficient. - # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec"). - # Default is "mat". - # - # method: There are three ways to calculate the correlation coefficient, - # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods. - # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis. - # Default is "p". - # - # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z). - # If not, the missing samples will be removed just when the correlation coefficient is calculated. - # However, the number of samples for the p-value is the number of samples after removing - # all the missing samples from the whole dataset. - # Default is "T". 
- - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(use == "mat"){ - p.use <- "Var-Cov matrix" - pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm) - }else if(use == "rec"){ - p.use <- "Recursive formula" - pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm) - }else{ - stop("use should be either rec or mat!\n") - } - - # print the method - if(gregexpr("p",method)[[1]][1] == 1){ - p.method <- "Pearson" - }else if(gregexpr("s",method)[[1]][1] == 1){ - p.method <- "Spearman" - }else if(gregexpr("k",method)[[1]][1] == 1){ - p.method <- "Kendall" - }else{ - stop("method should be pearson or spearman or kendall!\n") - } - - # sample number - n <- dim(na.omit(data.frame(x,y,z)))[1] - - # given variables' number - gn <- dim(z)[2] - - # p-value - if(p.method == "Kendall"){ - statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn))) - p.value <- 2*pnorm(-abs(statistic)) - - }else{ - statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2)) - p.value <- 2*pnorm(-abs(statistic)) - } - - data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use) -} - -# By using var-cov matrix -pcor.mat <- function(x,y,z,method="p",na.rm=T){ - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(dim(z)[2] == 0){ - stop("There should be given data\n") - } - - data <- data.frame(x,y,z) - - if(na.rm == T){ - data = na.omit(data) - } - - xdata <- na.omit(data.frame(data[,c(1,2)])) - Sxx <- cov(xdata,xdata,m=method) - - xzdata <- na.omit(data) - xdata <- data.frame(xzdata[,c(1,2)]) - zdata <- data.frame(xzdata[,-c(1,2)]) - Sxz <- cov(xdata,zdata,m=method) - - zdata <- na.omit(data.frame(data[,-c(1,2)])) - Szz <- cov(zdata,zdata,m=method) - - # is Szz positive definite? - zz.ev <- eigen(Szz)$values - if(min(zz.ev)[1]<0){ - stop("\'Szz\' is not positive definite!\n") - } - - # partial correlation - Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz) - - rxx.z <- cov2cor(Sxx.z)[1,2] - - rxx.z -} - -# By using recursive formula -pcor.rec <- function(x,y,z,method="p",na.rm=T){ - # - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(dim(z)[2] == 0){ - stop("There should be given data\n") - } - - data <- data.frame(x,y,z) - - if(na.rm == T){ - data = na.omit(data) - } - - # recursive formula - if(dim(z)[2] == 1){ - tdata <- na.omit(data.frame(data[,1],data[,2])) - rxy <- cor(tdata[,1],tdata[,2],m=method) - - tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)])) - rxz <- cor(tdata[,1],tdata[,2],m=method) - - tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)])) - ryz <- cor(tdata[,1],tdata[,2],m=method) - - rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) ) - - return(rxy.z) - }else{ - x <- c(data[,1]) - y <- c(data[,2]) - z0 <- c(data[,3]) - zc <- as.data.frame(data[,-c(1,2,3)]) - - rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm) - rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm) - ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm) - - rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) ) - return(rxy.z) - } -} -""") - - R_pcorr_function = rpy2.robjects.r['pcor.test'] - R_corr_test = rpy2.robjects.r['cor.test'] - - primary = rpy2.robjects.FloatVector(range(len(primaryVal))) - for i in range(len(primaryVal)): - primary[i] = primaryVal[i] - - control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals)) - for i in range(len(controlVals)): - for j in range(len(controlVals[0])): - control[i*len(controlVals[0]) + j] = controlVals[i][j] - - allcorrelations = [] - - for targetIndex, oneTargetVals in 
enumerate(targetVals): - - this_primary = None - this_control = None - this_target = None - - if None in oneTargetVals: - - goodIndex = [] - for i in range(len(oneTargetVals)): - if oneTargetVals[i] != None: - goodIndex.append(i) - - this_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - this_primary[i] = primaryVal[goodIndex[i]] - - this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals)) - for i in range(len(controlVals)): - for j in range(len(goodIndex)): - this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]] - - this_target = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - this_target[i] = oneTargetVals[goodIndex[i]] - - else: - this_primary = primary - this_control = control - this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals))) - for i in range(len(oneTargetVals)): - this_target[i] = oneTargetVals[i] - - one_name = targetNames[targetIndex] - one_N = len(this_primary) - - #calculate partial correlation - one_pc_coefficient = 'NA' - one_pc_p = 1 - - try: - if method == 's': - result = R_pcorr_function(this_primary, this_target, this_control, method='s') - else: - result = R_pcorr_function(this_primary, this_target, this_control) - - #XZ: In very few cases, the returned coefficient is nan. - #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN - if result[0][0] == result[0][0]: - one_pc_coefficient = result[0][0] - #XZ: when the coefficient value is 1 (primary trait and target trait are the same), - #XZ: occationally, the returned p value is nan instead of 0. - if result[1][0] == result[1][0]: - one_pc_p = result[1][0] - elif abs(one_pc_coefficient - 1) < 0.0000001: - one_pc_p = 0 - except: - pass - - #calculate zero order correlation - one_corr_coefficient = 0 - one_corr_p = 1 - - try: - if method == 's': - R_result = R_corr_test(this_primary, this_target, method='spearman') - else: - R_result = R_corr_test(this_primary, this_target) - - one_corr_coefficient = R_result[3][0] - one_corr_p = R_result[2][0] - except: - pass - - traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ] - - allcorrelations.append(traitinfo) - - return allcorrelations - #End of function compute_partial - - - allcorrelations = [] - - target_trait_number = len(targetVals) - - if target_trait_number < 1000: - allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ) - else: - step = 1000 - job_number = math.ceil( float(target_trait_number)/step ) - - job_targetVals_lists = [] - job_targetNames_lists = [] - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, target_trait_number) - - one_job_targetVals_list = [] - one_job_targetNames_list = [] - - for i in range( starti, endi ): - one_job_targetVals_list.append( targetVals[i] ) - one_job_targetNames_list.append( targetNames[i] ) - - job_targetVals_lists.append( one_job_targetVals_list ) - job_targetNames_lists.append( one_job_targetNames_list ) - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - jobs = [] - results = [] - - for i, one_job_targetVals_list in enumerate( job_targetVals_lists ): - one_job_targetNames_list = job_targetNames_lists[i] - #pay attention to modules from outside - jobs.append( job_server.submit(func=compute_partial, args=( 
primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) ) - - for one_job in jobs: - one_result = one_job() - results.append( one_result ) - - for one_result in results: - for one_traitinfo in one_result: - allcorrelations.append( one_traitinfo ) - - return allcorrelations - - - -#XZ, April 30, 2010: The input primaryTrait and targetTrait are instance of webqtlTrait -#XZ: The primaryTrait and targetTrait should have executed retrieveData function -def calZeroOrderCorr (primaryTrait, targetTrait, method='pearson'): - - #primaryTrait.retrieveData() - - #there is no None value in primary_val - primary_strain, primary_val, primary_var = primaryTrait.exportInformative() - - #targetTrait.retrieveData() - - #there might be None value in target_val - target_val = targetTrait.exportData(primary_strain, type="val") - - R_primary = rpy2.robjects.FloatVector(range(len(primary_val))) - for i in range(len(primary_val)): - R_primary[i] = primary_val[i] - - N = len(target_val) - - if None in target_val: - goodIndex = [] - for i in range(len(target_val)): - if target_val[i] != None: - goodIndex.append(i) - - N = len(goodIndex) - - R_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - R_primary[i] = primary_val[goodIndex[i]] - - R_target = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - R_target[i] = target_val[goodIndex[i]] - - else: - R_target = rpy2.robjects.FloatVector(range(len(target_val))) - for i in range(len(target_val)): - R_target[i] = target_val[i] - - R_corr_test = rpy2.robjects.r['cor.test'] - - if method == 'spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - corr_result = [] - corr_result.append( R_result[3][0] ) - corr_result.append( N ) - corr_result.append( R_result[2][0] ) - - return corr_result - -##################################################################################### -#Input: primaryValue(list): one list of expression values of one probeSet, -# targetValue(list): one list of expression values of one probeSet, -# method(string): indicate correlation method ('pearson' or 'spearman') -#Output: corr_result(list): first item is Correlation Value, second item is tissue number, -# third item is PValue -#Function: get correlation value,Tissue quantity ,p value result by using R; -#Note : This function is special case since both primaryValue and targetValue are from -#the same dataset. So the length of these two parameters is the same. They are pairs. 
-#Also, in the datatable TissueProbeSetData, all Tissue values are loaded based on -#the same tissue order -##################################################################################### - -def calZeroOrderCorrForTiss (primaryValue=[], targetValue=[], method='pearson'): - - R_primary = rpy2.robjects.FloatVector(range(len(primaryValue))) - N = len(primaryValue) - for i in range(len(primaryValue)): - R_primary[i] = primaryValue[i] - - R_target = rpy2.robjects.FloatVector(range(len(targetValue))) - for i in range(len(targetValue)): - R_target[i]=targetValue[i] - - R_corr_test = rpy2.robjects.r['cor.test'] - if method =='spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - corr_result =[] - corr_result.append( R_result[3][0]) - corr_result.append( N ) - corr_result.append( R_result[2][0]) - - return corr_result - - - - -def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'): - - def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ): - - oneSymbolCorrDict = {} - oneSymbolPvalueDict = {} - - R_corr_test = rpy2.robjects.r['cor.test'] - - R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue))) - - for i in range(len(primaryTraitValue)): - R_primary[i] = primaryTraitValue[i] - - for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems(): - R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue))) - for i in range(len(oneTraitValue)): - R_target[i] = oneTraitValue[i] - - if method =='spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0] - oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0] - - return(oneSymbolCorrDict, oneSymbolPvalueDict) - - - - symbolCorrDict = {} - symbolPvalueDict = {} - - items_number = len(SymbolValueDict) - - if items_number <= 1000: - symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method) - else: - items_list = SymbolValueDict.items() - - step = 1000 - job_number = math.ceil( float(items_number)/step ) - - job_oneSymbolValueDict_list = [] - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, items_number) - - oneSymbolValueDict = {} - - for i in range( starti, endi ): - one_item = items_list[i] - one_symbol = one_item[0] - one_value = one_item[1] - oneSymbolValueDict[one_symbol] = one_value - - job_oneSymbolValueDict_list.append( oneSymbolValueDict ) - - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - jobs = [] - results = [] - - for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ): - - #pay attention to modules from outside - jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) ) - - for one_job in jobs: - one_result = one_job() - results.append( one_result ) - - for one_result in results: - oneSymbolCorrDict, oneSymbolPvalueDict = one_result - symbolCorrDict.update( oneSymbolCorrDict ) - symbolPvalueDict.update( oneSymbolPvalueDict ) - - return (symbolCorrDict, symbolPvalueDict) - -########################################################################### -#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId -#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict) 
-#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and -# getGeneSymbolTissueValueDict to build dict to get CorrPvArray -#Note: If there are multiple probesets for one gene, select the one with highest mean. -########################################################################### -def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): - Symbols ="" - symbolList =[] - geneIdDict ={} - dataIdDict = {} - ChrDict = {} - MbDict = {} - descDict = {} - pTargetDescDict = {} - - count = len(GeneNameLst) - - # Added by NL 01/06/2011 - # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value - # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable. - if count==0: - query=''' - select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description - from ( - select Symbol, max(Mean) as maxmean - from TissueProbeSetXRef - where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol) - as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; - '''%TissueProbeSetFreezeId - - else: - for i, item in enumerate(GeneNameLst): - - if i == count-1: - Symbols += "'%s'" %item - else: - Symbols += "'%s'," %item - - Symbols = "("+ Symbols+")" - query=''' - select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description - from ( - select Symbol, max(Mean) as maxmean - from TissueProbeSetXRef - where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol) - as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; - '''% (TissueProbeSetFreezeId,Symbols) - - try: - - cursor.execute(query) - results =cursor.fetchall() - resultCount = len(results) - # Key in all dicts is the lower-cased symbol - for i, item in enumerate(results): - symbol = item[0] - symbolList.append(symbol) - - key =symbol.lower() - geneIdDict[key]=item[1] - dataIdDict[key]=item[2] - ChrDict[key]=item[3] - MbDict[key]=item[4] - descDict[key]=item[5] - pTargetDescDict[key]=item[6] - - except: - symbolList = None - geneIdDict=None - dataIdDict=None - ChrDict=None - MbDict=None - descDict=None - pTargetDescDict=None - - return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict - -########################################################################### -#Input: cursor, symbolList (list), dataIdDict(Dict) -#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, -# key is symbol, value is one list of expression values of one probeSet; -#function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). -#Attention! All keys are lower case! 
-########################################################################### -def getSymbolValuePairDict(cursor=None,symbolList=None,dataIdDict={}): - symbolList = map(string.lower, symbolList) - symbolValuepairDict={} - valueList=[] - - for key in symbolList: - if dataIdDict.has_key(key): - DataId = dataIdDict[key] - - valueQuery = "select value from TissueProbeSetData where Id=%s" % DataId - try : - cursor.execute(valueQuery) - valueResults = cursor.fetchall() - for item in valueResults: - item =item[0] - valueList.append(item) - symbolValuepairDict[key] = valueList - valueList=[] - except: - symbolValuepairDict[key] = None - - return symbolValuepairDict - - -######################################################################################################## -#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol -#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. -# key is symbol, value is one list of expression values of one probeSet. -#function: wrapper function for getSymbolValuePairDict function -# build gene symbol list if necessary, cut it into small lists if necessary, -# then call getSymbolValuePairDict function and merge the results. -######################################################################################################## - -def getGeneSymbolTissueValueDict(cursor=None,symbolList=None,dataIdDict={}): - limitNum=1000 - count = len(symbolList) - - SymbolValuePairDict = {} - - if count !=0 and count <=limitNum: - SymbolValuePairDict = getSymbolValuePairDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) - - elif count >limitNum: - SymbolValuePairDict={} - n = count/limitNum - start =0 - stop =0 - - for i in range(n): - stop =limitNum*(i+1) - gList1 = symbolList[start:stop] - PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) - start =limitNum*(i+1) - - SymbolValuePairDict.update(PairDict1) - - if stop < count: - stop = count - gList2 = symbolList[start:stop] - PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) - SymbolValuePairDict.update(PairDict2) - - return SymbolValuePairDict - -######################################################################################################## -#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) -#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. -# key is symbol, value is one list of expression values of one probeSet. 
-#function: wrapper function of getGeneSymbolTissueValueDict function -# for CorrelationPage.py -######################################################################################################## - -def getGeneSymbolTissueValueDictForTrait(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): - SymbolValuePairDict={} - symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo(cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) - if symbolList: - SymbolValuePairDict = getGeneSymbolTissueValueDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) - return SymbolValuePairDict - -######################################################################################################## -#Input: cursor(cursor): MySQL connnection cursor; -# priGeneSymbolList(list): one list of gene symbol; -# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair, -# key is symbol, value is one list of expression values of one probeSet; -#Output: corrArray(array): array of Correlation Value, -# pvArray(array): array of PValue; -#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss -######################################################################################################## - -def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}): - # setting initial value for corrArray, pvArray equal to 0 - Num = len(priGeneSymbolList) - - corrArray = [([0] * (Num))[:] for i in range(Num)] - pvArray = [([0] * (Num))[:] for i in range(Num)] - i = 0 - for pkey in priGeneSymbolList: - j = 0 - pkey = pkey.strip().lower()# key in symbolValuepairDict is low case - if symbolValuepairDict.has_key(pkey): - priValue = symbolValuepairDict[pkey] - for tkey in priGeneSymbolList: - tkey = tkey.strip().lower()# key in symbolValuepairDict is low case - if priValue and symbolValuepairDict.has_key(tkey): - tarValue = symbolValuepairDict[tkey] - - if tarValue: - if i>j: - # corrArray stores Pearson Correlation values - # pvArray stores Pearson P-Values - pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue) - corrArray[i][j] =pcorr_result[0] - pvArray[i][j] =pcorr_result[2] - elif i 1: + + #XZ: reset the precision of values and convert to string type + for oneTraitVal in controlVals: + for oneStrainVal in oneTraitVal: + oneStrainVal = '%.3f' % oneStrainVal + + for i, oneTraitVal in enumerate( controlVals ): + for j in range(i+1, controlTraitNumber): + if oneTraitVal == controlVals[j]: + nameOfIdenticalTraits.append(controlNames[i]) + nameOfIdenticalTraits.append(controlNames[j]) + + return nameOfIdenticalTraits + +#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. +#else, the returned list has two elements of control trait name. +#primaryVal is of list type. It contains value of primary trait. +#primaryName is of string type. +#controlVals is of list type. Each element is list too. Each element contain value of one control trait. +#controlNames is of list type. 
+def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ): + nameOfIdenticalTraits = [] + + #XZ: reset the precision of values and convert to string type + for oneStrainVal in primaryVal: + oneStrainVal = '%.3f' % oneStrainVal + + for oneTraitVal in controlVals: + for oneStrainVal in oneTraitVal: + oneStrainVal = '%.3f' % oneStrainVal + + controlTraitNumber = len(controlVals) + + if controlTraitNumber > 1: + for i, oneTraitVal in enumerate( controlVals ): + for j in range(i+1, controlTraitNumber): + if oneTraitVal == controlVals[j]: + nameOfIdenticalTraits.append(controlNames[i]) + nameOfIdenticalTraits.append(controlNames[j]) + break + + if len(nameOfIdenticalTraits) == 0: + for i, oneTraitVal in enumerate( controlVals ): + if primaryVal == oneTraitVal: + nameOfIdenticalTraits.append(primaryName) + nameOfIdenticalTraits.append(controlNames[i]) + break + + return nameOfIdenticalTraits + + + +#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order. +#XZ: No value in primaryVal and controlVals could be None. + +def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'): + + def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ): + + rpy2.robjects.r(""" +pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){ + # The partial correlation coefficient between x and y given z + # + # pcor.test is free and comes with ABSOLUTELY NO WARRANTY. + # + # x and y should be vectors + # + # z can be either a vector or a matrix + # + # use: There are two methods to calculate the partial correlation coefficient. + # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec"). + # Default is "mat". + # + # method: There are three ways to calculate the correlation coefficient, + # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods. + # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis. + # Default is "p". + # + # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z). + # If not, the missing samples will be removed just when the correlation coefficient is calculated. + # However, the number of samples for the p-value is the number of samples after removing + # all the missing samples from the whole dataset. + # Default is "T". 
+ + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(use == "mat"){ + p.use <- "Var-Cov matrix" + pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm) + }else if(use == "rec"){ + p.use <- "Recursive formula" + pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm) + }else{ + stop("use should be either rec or mat!\n") + } + + # print the method + if(gregexpr("p",method)[[1]][1] == 1){ + p.method <- "Pearson" + }else if(gregexpr("s",method)[[1]][1] == 1){ + p.method <- "Spearman" + }else if(gregexpr("k",method)[[1]][1] == 1){ + p.method <- "Kendall" + }else{ + stop("method should be pearson or spearman or kendall!\n") + } + + # sample number + n <- dim(na.omit(data.frame(x,y,z)))[1] + + # given variables' number + gn <- dim(z)[2] + + # p-value + if(p.method == "Kendall"){ + statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn))) + p.value <- 2*pnorm(-abs(statistic)) + + }else{ + statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2)) + p.value <- 2*pnorm(-abs(statistic)) + } + + data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use) +} + +# By using var-cov matrix +pcor.mat <- function(x,y,z,method="p",na.rm=T){ + + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(dim(z)[2] == 0){ + stop("There should be given data\n") + } + + data <- data.frame(x,y,z) + + if(na.rm == T){ + data = na.omit(data) + } + + xdata <- na.omit(data.frame(data[,c(1,2)])) + Sxx <- cov(xdata,xdata,m=method) + + xzdata <- na.omit(data) + xdata <- data.frame(xzdata[,c(1,2)]) + zdata <- data.frame(xzdata[,-c(1,2)]) + Sxz <- cov(xdata,zdata,m=method) + + zdata <- na.omit(data.frame(data[,-c(1,2)])) + Szz <- cov(zdata,zdata,m=method) + + # is Szz positive definite? + zz.ev <- eigen(Szz)$values + if(min(zz.ev)[1]<0){ + stop("\'Szz\' is not positive definite!\n") + } + + # partial correlation + Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz) + + rxx.z <- cov2cor(Sxx.z)[1,2] + + rxx.z +} + +# By using recursive formula +pcor.rec <- function(x,y,z,method="p",na.rm=T){ + # + + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(dim(z)[2] == 0){ + stop("There should be given data\n") + } + + data <- data.frame(x,y,z) + + if(na.rm == T){ + data = na.omit(data) + } + + # recursive formula + if(dim(z)[2] == 1){ + tdata <- na.omit(data.frame(data[,1],data[,2])) + rxy <- cor(tdata[,1],tdata[,2],m=method) + + tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)])) + rxz <- cor(tdata[,1],tdata[,2],m=method) + + tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)])) + ryz <- cor(tdata[,1],tdata[,2],m=method) + + rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) ) + + return(rxy.z) + }else{ + x <- c(data[,1]) + y <- c(data[,2]) + z0 <- c(data[,3]) + zc <- as.data.frame(data[,-c(1,2,3)]) + + rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm) + rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm) + ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm) + + rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) ) + return(rxy.z) + } +} +""") + + R_pcorr_function = rpy2.robjects.r['pcor.test'] + R_corr_test = rpy2.robjects.r['cor.test'] + + primary = rpy2.robjects.FloatVector(range(len(primaryVal))) + for i in range(len(primaryVal)): + primary[i] = primaryVal[i] + + control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals)) + for i in range(len(controlVals)): + for j in range(len(controlVals[0])): + control[i*len(controlVals[0]) + j] = controlVals[i][j] + + allcorrelations = [] + + for targetIndex, oneTargetVals in 
enumerate(targetVals): + + this_primary = None + this_control = None + this_target = None + + if None in oneTargetVals: + + goodIndex = [] + for i in range(len(oneTargetVals)): + if oneTargetVals[i] != None: + goodIndex.append(i) + + this_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + this_primary[i] = primaryVal[goodIndex[i]] + + this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals)) + for i in range(len(controlVals)): + for j in range(len(goodIndex)): + this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]] + + this_target = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + this_target[i] = oneTargetVals[goodIndex[i]] + + else: + this_primary = primary + this_control = control + this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals))) + for i in range(len(oneTargetVals)): + this_target[i] = oneTargetVals[i] + + one_name = targetNames[targetIndex] + one_N = len(this_primary) + + #calculate partial correlation + one_pc_coefficient = 'NA' + one_pc_p = 1 + + try: + if method == 's': + result = R_pcorr_function(this_primary, this_target, this_control, method='s') + else: + result = R_pcorr_function(this_primary, this_target, this_control) + + #XZ: In very few cases, the returned coefficient is nan. + #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN + if result[0][0] == result[0][0]: + one_pc_coefficient = result[0][0] + #XZ: when the coefficient value is 1 (primary trait and target trait are the same), + #XZ: occationally, the returned p value is nan instead of 0. + if result[1][0] == result[1][0]: + one_pc_p = result[1][0] + elif abs(one_pc_coefficient - 1) < 0.0000001: + one_pc_p = 0 + except: + pass + + #calculate zero order correlation + one_corr_coefficient = 0 + one_corr_p = 1 + + try: + if method == 's': + R_result = R_corr_test(this_primary, this_target, method='spearman') + else: + R_result = R_corr_test(this_primary, this_target) + + one_corr_coefficient = R_result[3][0] + one_corr_p = R_result[2][0] + except: + pass + + traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ] + + allcorrelations.append(traitinfo) + + return allcorrelations + #End of function compute_partial + + + allcorrelations = [] + + target_trait_number = len(targetVals) + + if target_trait_number < 1000: + allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ) + else: + step = 1000 + job_number = math.ceil( float(target_trait_number)/step ) + + job_targetVals_lists = [] + job_targetNames_lists = [] + + for job_index in range( int(job_number) ): + starti = job_index*step + endi = min((job_index+1)*step, target_trait_number) + + one_job_targetVals_list = [] + one_job_targetNames_list = [] + + for i in range( starti, endi ): + one_job_targetVals_list.append( targetVals[i] ) + one_job_targetNames_list.append( targetNames[i] ) + + job_targetVals_lists.append( one_job_targetVals_list ) + job_targetNames_lists.append( one_job_targetNames_list ) + + ppservers = () + # Creates jobserver with automatically detected number of workers + job_server = pp.Server(ppservers=ppservers) + + jobs = [] + results = [] + + for i, one_job_targetVals_list in enumerate( job_targetVals_lists ): + one_job_targetNames_list = job_targetNames_lists[i] + #pay attention to modules from outside + jobs.append( job_server.submit(func=compute_partial, args=( 
primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) ) + + for one_job in jobs: + one_result = one_job() + results.append( one_result ) + + for one_result in results: + for one_traitinfo in one_result: + allcorrelations.append( one_traitinfo ) + + return allcorrelations + + + +#XZ, April 30, 2010: The input primaryTrait and targetTrait are instance of webqtlTrait +#XZ: The primaryTrait and targetTrait should have executed retrieveData function +def calZeroOrderCorr (primaryTrait, targetTrait, method='pearson'): + + #primaryTrait.retrieveData() + + #there is no None value in primary_val + primary_strain, primary_val, primary_var = primaryTrait.exportInformative() + + #targetTrait.retrieveData() + + #there might be None value in target_val + target_val = targetTrait.exportData(primary_strain, type="val") + + R_primary = rpy2.robjects.FloatVector(range(len(primary_val))) + for i in range(len(primary_val)): + R_primary[i] = primary_val[i] + + N = len(target_val) + + if None in target_val: + goodIndex = [] + for i in range(len(target_val)): + if target_val[i] != None: + goodIndex.append(i) + + N = len(goodIndex) + + R_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + R_primary[i] = primary_val[goodIndex[i]] + + R_target = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + R_target[i] = target_val[goodIndex[i]] + + else: + R_target = rpy2.robjects.FloatVector(range(len(target_val))) + for i in range(len(target_val)): + R_target[i] = target_val[i] + + R_corr_test = rpy2.robjects.r['cor.test'] + + if method == 'spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + corr_result = [] + corr_result.append( R_result[3][0] ) + corr_result.append( N ) + corr_result.append( R_result[2][0] ) + + return corr_result + +##################################################################################### +#Input: primaryValue(list): one list of expression values of one probeSet, +# targetValue(list): one list of expression values of one probeSet, +# method(string): indicate correlation method ('pearson' or 'spearman') +#Output: corr_result(list): first item is Correlation Value, second item is tissue number, +# third item is PValue +#Function: get correlation value,Tissue quantity ,p value result by using R; +#Note : This function is special case since both primaryValue and targetValue are from +#the same dataset. So the length of these two parameters is the same. They are pairs. 
+#Also, in the datatable TissueProbeSetData, all Tissue values are loaded based on +#the same tissue order +##################################################################################### + +def calZeroOrderCorrForTiss (primaryValue=[], targetValue=[], method='pearson'): + + R_primary = rpy2.robjects.FloatVector(range(len(primaryValue))) + N = len(primaryValue) + for i in range(len(primaryValue)): + R_primary[i] = primaryValue[i] + + R_target = rpy2.robjects.FloatVector(range(len(targetValue))) + for i in range(len(targetValue)): + R_target[i]=targetValue[i] + + R_corr_test = rpy2.robjects.r['cor.test'] + if method =='spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + corr_result =[] + corr_result.append( R_result[3][0]) + corr_result.append( N ) + corr_result.append( R_result[2][0]) + + return corr_result + + + + +def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'): + + def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ): + + oneSymbolCorrDict = {} + oneSymbolPvalueDict = {} + + R_corr_test = rpy2.robjects.r['cor.test'] + + R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue))) + + for i in range(len(primaryTraitValue)): + R_primary[i] = primaryTraitValue[i] + + for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems(): + R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue))) + for i in range(len(oneTraitValue)): + R_target[i] = oneTraitValue[i] + + if method =='spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0] + oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0] + + return(oneSymbolCorrDict, oneSymbolPvalueDict) + + + + symbolCorrDict = {} + symbolPvalueDict = {} + + items_number = len(SymbolValueDict) + + if items_number <= 1000: + symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method) + else: + items_list = SymbolValueDict.items() + + step = 1000 + job_number = math.ceil( float(items_number)/step ) + + job_oneSymbolValueDict_list = [] + + for job_index in range( int(job_number) ): + starti = job_index*step + endi = min((job_index+1)*step, items_number) + + oneSymbolValueDict = {} + + for i in range( starti, endi ): + one_item = items_list[i] + one_symbol = one_item[0] + one_value = one_item[1] + oneSymbolValueDict[one_symbol] = one_value + + job_oneSymbolValueDict_list.append( oneSymbolValueDict ) + + + ppservers = () + # Creates jobserver with automatically detected number of workers + job_server = pp.Server(ppservers=ppservers) + + jobs = [] + results = [] + + for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ): + + #pay attention to modules from outside + jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) ) + + for one_job in jobs: + one_result = one_job() + results.append( one_result ) + + for one_result in results: + oneSymbolCorrDict, oneSymbolPvalueDict = one_result + symbolCorrDict.update( oneSymbolCorrDict ) + symbolPvalueDict.update( oneSymbolPvalueDict ) + + return (symbolCorrDict, symbolPvalueDict) + +########################################################################### +#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId +#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict) 
+#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and +# getGeneSymbolTissueValueDict to build dict to get CorrPvArray +#Note: If there are multiple probesets for one gene, select the one with highest mean. +########################################################################### +def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): + Symbols ="" + symbolList =[] + geneIdDict ={} + dataIdDict = {} + ChrDict = {} + MbDict = {} + descDict = {} + pTargetDescDict = {} + + count = len(GeneNameLst) + + # Added by NL 01/06/2011 + # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value + # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable. + if count==0: + query=''' + select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; + '''%TissueProbeSetFreezeId + + else: + for i, item in enumerate(GeneNameLst): + + if i == count-1: + Symbols += "'%s'" %item + else: + Symbols += "'%s'," %item + + Symbols = "("+ Symbols+")" + query=''' + select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; + '''% (TissueProbeSetFreezeId,Symbols) + + try: + + cursor.execute(query) + results =cursor.fetchall() + resultCount = len(results) + # Key in all dicts is the lower-cased symbol + for i, item in enumerate(results): + symbol = item[0] + symbolList.append(symbol) + + key =symbol.lower() + geneIdDict[key]=item[1] + dataIdDict[key]=item[2] + ChrDict[key]=item[3] + MbDict[key]=item[4] + descDict[key]=item[5] + pTargetDescDict[key]=item[6] + + except: + symbolList = None + geneIdDict=None + dataIdDict=None + ChrDict=None + MbDict=None + descDict=None + pTargetDescDict=None + + return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict + +########################################################################### +#Input: cursor, symbolList (list), dataIdDict(Dict) +#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, +# key is symbol, value is one list of expression values of one probeSet; +#function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). +#Attention! All keys are lower case! 
+########################################################################### +def getSymbolValuePairDict(cursor=None,symbolList=None,dataIdDict={}): + symbolList = map(string.lower, symbolList) + symbolValuepairDict={} + valueList=[] + + for key in symbolList: + if dataIdDict.has_key(key): + DataId = dataIdDict[key] + + valueQuery = "select value from TissueProbeSetData where Id=%s" % DataId + try : + cursor.execute(valueQuery) + valueResults = cursor.fetchall() + for item in valueResults: + item =item[0] + valueList.append(item) + symbolValuepairDict[key] = valueList + valueList=[] + except: + symbolValuepairDict[key] = None + + return symbolValuepairDict + + +######################################################################################################## +#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol +#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. +# key is symbol, value is one list of expression values of one probeSet. +#function: wrapper function for getSymbolValuePairDict function +# build gene symbol list if necessary, cut it into small lists if necessary, +# then call getSymbolValuePairDict function and merge the results. +######################################################################################################## + +def getGeneSymbolTissueValueDict(cursor=None,symbolList=None,dataIdDict={}): + limitNum=1000 + count = len(symbolList) + + SymbolValuePairDict = {} + + if count !=0 and count <=limitNum: + SymbolValuePairDict = getSymbolValuePairDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) + + elif count >limitNum: + SymbolValuePairDict={} + n = count/limitNum + start =0 + stop =0 + + for i in range(n): + stop =limitNum*(i+1) + gList1 = symbolList[start:stop] + PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) + start =limitNum*(i+1) + + SymbolValuePairDict.update(PairDict1) + + if stop < count: + stop = count + gList2 = symbolList[start:stop] + PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) + SymbolValuePairDict.update(PairDict2) + + return SymbolValuePairDict + +######################################################################################################## +#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) +#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. +# key is symbol, value is one list of expression values of one probeSet. 
+#function: wrapper function of getGeneSymbolTissueValueDict function +# for CorrelationPage.py +######################################################################################################## + +def getGeneSymbolTissueValueDictForTrait(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): + SymbolValuePairDict={} + symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo(cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) + if symbolList: + SymbolValuePairDict = getGeneSymbolTissueValueDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) + return SymbolValuePairDict + +######################################################################################################## +#Input: cursor(cursor): MySQL connnection cursor; +# priGeneSymbolList(list): one list of gene symbol; +# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair, +# key is symbol, value is one list of expression values of one probeSet; +#Output: corrArray(array): array of Correlation Value, +# pvArray(array): array of PValue; +#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss +######################################################################################################## + +def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}): + # setting initial value for corrArray, pvArray equal to 0 + Num = len(priGeneSymbolList) + + corrArray = [([0] * (Num))[:] for i in range(Num)] + pvArray = [([0] * (Num))[:] for i in range(Num)] + i = 0 + for pkey in priGeneSymbolList: + j = 0 + pkey = pkey.strip().lower()# key in symbolValuepairDict is low case + if symbolValuepairDict.has_key(pkey): + priValue = symbolValuepairDict[pkey] + for tkey in priGeneSymbolList: + tkey = tkey.strip().lower()# key in symbolValuepairDict is low case + if priValue and symbolValuepairDict.has_key(tkey): + tarValue = symbolValuepairDict[tkey] + + if tarValue: + if i>j: + # corrArray stores Pearson Correlation values + # pvArray stores Pearson P-Values + pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue) + corrArray[i][j] =pcorr_result[0] + pvArray[i][j] =pcorr_result[2] + elif i
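
For reference, a minimal standalone sketch of the JSON-based dataset-type lookup that this commit adds to data_set.py (the Dataset_Types class / Dataset_Getter). It assumes the same dataset_menu_structure.json layout used in the patch above (data['datasets'][species][group][dataset_type] is a list whose entries start with the short dataset name); the dataset name in the usage comment is just the example that appears in the patch's own commented-out print.

# Sketch only: mirrors the mapping done by Dataset_Types in data_set.py.
import json

def build_dataset_type_map(file_name="wqflask/static/new/javascript/dataset_menu_structure.json"):
    """Walk the menu-structure JSON once and map short dataset names to GN types."""
    with open(file_name, 'r') as fh:
        data = json.load(fh)

    datasets = {}
    for species in data['datasets']:
        for group in data['datasets'][species]:
            for dataset_type in data['datasets'][species][group]:
                for dataset in data['datasets'][species][group][dataset_type]:
                    short_dataset_name = dataset[0]
                    # Same type mapping as the patch: menu type -> internal GN type
                    if dataset_type == "Phenotypes":
                        new_type = "Publish"
                    elif dataset_type == "Genotypes":
                        new_type = "Geno"
                    else:
                        new_type = "ProbeSet"
                    datasets[short_dataset_name] = new_type
    return datasets

# Usage (dataset name taken from the commented example in the patch):
# dataset_types = build_dataset_type_map()
# print(dataset_types.get("HBTRC-MLPFC_0611"))

Doing this work once at startup (as the patch does by instantiating Dataset_Types into Dataset_Getter) avoids the per-request DBList/DBType query and, more importantly, covers datasets that have no row in the DBType table, which is what previously made their traits unviewable.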