diff options
Diffstat (limited to 'wqflask')
-rw-r--r-- | wqflask/base/mrna_assay_tissue_data.py | 134 | ||||
-rwxr-xr-x | wqflask/base/trait.py | 5 | ||||
-rw-r--r-- | wqflask/utility/db_tools.py | 15 | ||||
-rw-r--r-- | wqflask/wqflask/correlation/correlation_functions.py (renamed from wqflask/wqflask/correlation/correlation_function.py) | 158 | ||||
-rw-r--r-- | wqflask/wqflask/correlation/show_corr_results.py | 85 | ||||
-rw-r--r-- | wqflask/wqflask/templates/correlation_page.html | 12 |
6 files changed, 300 insertions, 109 deletions
diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py new file mode 100644 index 00000000..8ae71858 --- /dev/null +++ b/wqflask/base/mrna_assay_tissue_data.py @@ -0,0 +1,134 @@ +from __future__ import absolute_import, print_function, division + +import collections + +from flask import g + +from utility import dbtools +from uitility import Bunch + +from MySQLdb import escape_string as escape + +class MrnaAssayTissueData(object): + + def __init__(self, gene_symbols=None): + self.gene_symbols = gene_symbols + self.have_data = False + if self.gene_symbols == None: + self.gene_symbols = [] + + self.data = collections.defaultdict(Bunch) + + #self.gene_id_dict ={} + #self.data_id_dict = {} + #self.chr_dict = {} + #self.mb_dict = {} + #self.desc_dict = {} + #self.probe_target_desc_dict = {} + + query = '''select t.Symbol, t.GeneId, t.DataId,t.Chr, t.Mb, t.description, t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=1 and ''' + + # Note that inner join is necessary in this query to get distinct record in one symbol group + # with highest mean value + # Due to the limit size of TissueProbeSetFreezeId table in DB, + # performance of inner join is acceptable. + if len(gene_symbols) == 0: + query += '''Symbol!='' and Symbol Is Not Null group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol + and t.Mean = x.maxmean; + ''' + else: + in_clause = dbtools.create_in_clause(gene_symbols) + + query += ''' Symbol in {} group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol + and t.Mean = x.maxmean; + '''.format(in_clause) + + results = g.db.execute(query).fetchall() + for result in results: + symbol = item[0] + gene_symbols.append(symbol) + symbol = symbol.lower() + + self.data[symbol].gene_id = result.GeneId + self.data[symbol].data_id = result.DataId + self.data[symbol].chr = result.Chr + self.data[symbol].mb = result.Mb + self.data[symbol].description = result.description + self.data[symbol].probe_target_description = result.Probe_Target_Description + + + ########################################################################### + #Input: cursor, symbolList (list), dataIdDict(Dict) + #output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, + # key is symbol, value is one list of expression values of one probeSet; + #function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). + #Attention! All keys are lower case! + ########################################################################### + def get_symbol_value_pairs(self): + + id_list = [self.tissue_data[symbol.lower()].data_id for item in self.tissue_data] + + symbol_value_pairs = {} + value_list=[] + + query = """SELECT value, id + FROM TissueProbeSetData + WHERE Id IN {}""".format(create_in_clause(id_list)) + + try : + results = g.db.execute(query).fetchall() + for result in results: + value_list.append(result.value) + symbol_value_pairs[symbol] = value_list + except: + symbol_value_pairs[symbol] = None + + #for symbol in symbol_list: + # if tissue_data.has_key(symbol): + # data_id = tissue_data[symbol].data_id + # + # query = """select value, id + # from TissueProbeSetData + # where Id={}""".format(escape(data_id)) + # try : + # results = g.db.execute(query).fetchall() + # for item in results: + # item = item[0] + # value_list.append(item) + # symbol_value_pairs[symbol] = value_list + # value_list=[] + # except: + # symbol_value_pairs[symbol] = None + + return symbol_value_pairs + + ######################################################################################################## + #input: cursor, symbolList (list), dataIdDict(Dict): key is symbol + #output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. + # key is symbol, value is one list of expression values of one probeSet. + #function: wrapper function for getSymbolValuePairDict function + # build gene symbol list if necessary, cut it into small lists if necessary, + # then call getSymbolValuePairDict function and merge the results. + ######################################################################################################## + + def get_trait_symbol_and_tissue_values(symbol_list=None): + tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list) + + #symbolList, + #geneIdDict, + #dataIdDict, + #ChrDict, + #MbDict, + #descDict, + #pTargetDescDict = getTissueProbeSetXRefInfo( + # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) + + if len(tissue_data.gene_symbols): + return get_symbol_value_pairs(tissue_data) + diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index c893c887..6a64eeaf 100755 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -286,7 +286,6 @@ class GeneralTrait(object): escape(self.dataset.name), escape(self.name)) trait_info = g.db.execute(query).fetchone() - #print("trait_info is: ", pf(trait_info)) #XZ, 05/08/2009: We also should use Geno.Id to find marker instead of just using Geno.Name # to avoid the problem of same marker name from different species. elif self.dataset.type == 'Geno': @@ -359,7 +358,6 @@ class GeneralTrait(object): InbredSet.SpeciesId = Species.Id AND Species.TaxonomyId = Homologene.TaxonomyId """ % (escape(str(self.geneid)), escape(self.dataset.group.name)) - print("-> query is:", query) result = g.db.execute(query).fetchone() #else: # result = None @@ -391,7 +389,6 @@ class GeneralTrait(object): Geno.Name = '{}' and Geno.SpeciesId = Species.Id """.format(self.dataset.group.species, self.locus) - print("query is:", query) result = g.db.execute(query).fetchone() self.locus_chr = result[0] self.locus_mb = result[1] @@ -603,4 +600,4 @@ class GeneralTrait(object): else: ZValue = 0.5*log((1.0+self.correlation)/(1.0-self.correlation)) ZValue = ZValue*sqrt(self.overlap-3) - self.p_value = 2.0*(1.0 - reaper.normp(abs(ZValue)))
\ No newline at end of file + self.p_value = 2.0*(1.0 - reaper.normp(abs(ZValue))) diff --git a/wqflask/utility/db_tools.py b/wqflask/utility/db_tools.py new file mode 100644 index 00000000..4034f39c --- /dev/null +++ b/wqflask/utility/db_tools.py @@ -0,0 +1,15 @@ +from __future__ import absolute_import, print_function, division + +from MySQLdb import escape_string as escape + +def create_in_clause(items): + """Create an in clause for mysql""" + in_clause = ', '.join("'{}'".format(x) for x in mescape(*items)) + in_clause = '( {} )'.format(in_clause) + return in_clause + +def mescape(*items): + """Multiple escape""" + escaped = [escape(str(item)) for item in items] + #print("escaped is:", escaped) + return escaped diff --git a/wqflask/wqflask/correlation/correlation_function.py b/wqflask/wqflask/correlation/correlation_functions.py index 7d4b58a9..56f66810 100644 --- a/wqflask/wqflask/correlation/correlation_function.py +++ b/wqflask/wqflask/correlation/correlation_functions.py @@ -24,6 +24,7 @@ # # Last updated by NL 2011/03/23 +from __future__ import absolute_import, print_function, division import math #import rpy2.robjects @@ -31,10 +32,11 @@ import pp import string from utility import webqtlUtil +from base.mrna_assay_tissue_data import MrnaAssayTissueData from base.trait import GeneralTrait from dbFunction import webqtlDatabaseFunction - +from flask import Flask, g #XZ: The input 'controls' is String. It contains the full name of control traits. #XZ: The input variable 'strainlst' is List. It contains the strain names of primary trait. @@ -676,7 +678,7 @@ def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson # getGeneSymbolTissueValueDict to build dict to get CorrPvArray #Note: If there are multiple probesets for one gene, select the one with highest mean. ########################################################################### -def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): +def getTissueProbeSetXRefInfo(GeneNameLst=[],TissueProbeSetFreezeId=0): Symbols ="" symbolList =[] geneIdDict ={} @@ -720,7 +722,6 @@ def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId= '''% (TissueProbeSetFreezeId,Symbols) try: - cursor.execute(query) results =cursor.fetchall() resultCount = len(results) @@ -755,28 +756,43 @@ def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId= #function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). #Attention! All keys are lower case! ########################################################################### -def getSymbolValuePairDict(cursor=None,symbolList=None,dataIdDict={}): - symbolList = map(string.lower, symbolList) - symbolValuepairDict={} - valueList=[] - - for key in symbolList: - if dataIdDict.has_key(key): - DataId = dataIdDict[key] - - valueQuery = "select value from TissueProbeSetData where Id=%s" % DataId - try : - cursor.execute(valueQuery) - valueResults = cursor.fetchall() - for item in valueResults: - item =item[0] - valueList.append(item) - symbolValuepairDict[key] = valueList - valueList=[] - except: - symbolValuepairDict[key] = None - - return symbolValuepairDict +def get_symbol_value_pairs(tissue_data): + + id_list = [tissue_data[symbol.lower()].data_id for item in tissue_data] + + symbol_value_pairs = {} + value_list=[] + + query = """SELECT value, id + FROM TissueProbeSetData + WHERE Id IN {}""".format(create_in_clause(id_list)) + + try : + results = g.db.execute(query).fetchall() + for result in results: + value_list.append(result.value) + symbol_value_pairs[symbol] = value_list + except: + symbol_value_pairs[symbol] = None + + #for symbol in symbol_list: + # if tissue_data.has_key(symbol): + # data_id = tissue_data[symbol].data_id + # + # query = """select value, id + # from TissueProbeSetData + # where Id={}""".format(escape(data_id)) + # try : + # results = g.db.execute(query).fetchall() + # for item in results: + # item = item[0] + # value_list.append(item) + # symbol_value_pairs[symbol] = value_list + # value_list=[] + # except: + # symbol_value_pairs[symbol] = None + + return symbol_value_pairs ######################################################################################################## @@ -788,36 +804,51 @@ def getSymbolValuePairDict(cursor=None,symbolList=None,dataIdDict={}): # then call getSymbolValuePairDict function and merge the results. ######################################################################################################## -def getGeneSymbolTissueValueDict(cursor=None,symbolList=None,dataIdDict={}): - limitNum=1000 - count = len(symbolList) - - SymbolValuePairDict = {} - - if count !=0 and count <=limitNum: - SymbolValuePairDict = getSymbolValuePairDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) - - elif count >limitNum: - SymbolValuePairDict={} - n = count/limitNum - start =0 - stop =0 - - for i in range(n): - stop =limitNum*(i+1) - gList1 = symbolList[start:stop] - PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) - start =limitNum*(i+1) - - SymbolValuePairDict.update(PairDict1) - - if stop < count: - stop = count - gList2 = symbolList[start:stop] - PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) - SymbolValuePairDict.update(PairDict2) - - return SymbolValuePairDict +def get_trait_symbol_and_tissue_values(symbol_list=None): + SymbolValuePairDict={} + + tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list) + + #symbolList, + #geneIdDict, + #dataIdDict, + #ChrDict, + #MbDict, + #descDict, + #pTargetDescDict = getTissueProbeSetXRefInfo( + # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) + + if len(tissue_data.gene_symbols): + return get_symbol_value_pairs(tissue_data) + + #limit_num=1000 + #count = len(symbol_list) + # + #symbol_value_pairs = {} + # + #if count !=0 and count <= limit_num: + # symbol_value_pairs = getSymbolValuePairDict(cursor=cursor,symbolList=symbol_list,dataIdDict=dataIdDict) + # + #elif count > limit_num: + # n = count/limit_num + # start = 0 + # stop = 0 + # + # for i in range(n): + # stop =limit_num*(i+1) + # gList1 = symbolList[start:stop] + # PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) + # start =limit_num*(i+1) + # + # SymbolValuePairDict.update(PairDict1) + # + # if stop < count: + # stop = count + # gList2 = symbolList[start:stop] + # PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) + # SymbolValuePairDict.update(PairDict2) + # + #return SymbolValuePairDict ######################################################################################################## #input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) @@ -827,12 +858,17 @@ def getGeneSymbolTissueValueDict(cursor=None,symbolList=None,dataIdDict={}): # for CorrelationPage.py ######################################################################################################## -def getGeneSymbolTissueValueDictForTrait(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): - SymbolValuePairDict={} - symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo(cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) - if symbolList: - SymbolValuePairDict = getGeneSymbolTissueValueDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) - return SymbolValuePairDict +#def get_trait_symbol_and_tissue_values(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): +# SymbolValuePairDict={} +# +# symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo( +# cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) +# +# if symbolList: +# SymbolValuePairDict = get_gene_symbol_and_tissue_values(symbolList=symbolList, +# dataIdDict=dataIdDict) +# +# return SymbolValuePairDict ######################################################################################################## #Input: cursor(cursor): MySQL connnection cursor; diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 1615fe21..b17e1db1 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -49,13 +49,15 @@ from base.templatePage import templatePage from utility import webqtlUtil, helper_functions, corr_result_helpers from dbFunction import webqtlDatabaseFunction import utility.webqtlUtil #this is for parallel computing only. -from wqflask.correlation import correlation_function +from wqflask.correlation import correlation_functions from utility.benchmark import Bench from MySQLdb import escape_string as escape from pprint import pformat as pf +from flask import Flask, g + METHOD_SAMPLE_PEARSON = "1" METHOD_SAMPLE_RANK = "2" METHOD_LIT = "3" @@ -159,6 +161,11 @@ class CorrelationResults(object): trait_object.sample_r = self.correlation_data[trait][0] trait_object.sample_p = self.correlation_data[trait][1] trait_object.num_overlap = self.correlation_data[trait][2] + + # Set some sane defaults + trait_object.tissue_corr = None + trait_object.tissue_pvalue = None + self.correlation_results.append(trait_object) @@ -916,61 +923,63 @@ class CorrelationResults(object): symbol_corr_dict = {} symbol_pvalue_dict = {} - primary_trait_symbol_value_dict = correlation_function.make_gene_tissue_value_dict( + primary_trait_symbol_value_dict = correlation_functions.make_gene_tissue_value_dict( GeneNameLst=[self.this_trait.symbol], TissueProbeSetFreezeId=tissue_dataset_id) primary_trait_value = primary_trait_symbol_value_dict.values()[0] - symbol_value_dict = correlation_function.make_gene_tissue_value_dict( + symbol_value_dict = correlation_functions.make_gene_tissue_value_dict( gene_name_list=[], tissue_dataset_id=tissue_dataset_id) - symbol_corr_dict, symbol_pvalue_dict = correlation_function.batch_cal_tissue_corr( + symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr( primaryTraitValue, SymbolValueDict, method=self.corr_method) #else: - # symbol_corr_dict, symbol_pvalue_dict = correlation_function.batch_cal_tissue_corr( + # symbol_corr_dict, symbol_pvalue_dict = correlation_functions.batch_cal_tissue_corr( # primaryTraitValue, # SymbolValueDict) - return (symbolCorrDict, symbolPvalueDict) + def do_tissue_correlation_by_list(self, tissue_dataset_id): - #XZ, 10/13/2010 - def getTissueCorrelationByList(self, primaryTraitSymbol=None, traitList=None, TissueProbeSetFreezeId=None, method=None): - - primaryTraitSymbolValueDict = correlationFunction.getGeneSymbolTissueValueDictForTrait(cursor=self.cursor, GeneNameLst=[primaryTraitSymbol], TissueProbeSetFreezeId=TISSUE_MOUSE_DB) - - if primaryTraitSymbol.lower() in primaryTraitSymbolValueDict: - primaryTraitValue = primaryTraitSymbolValueDict[primaryTraitSymbol.lower()] - - geneSymbolList = [] - - for thisTrait in traitList: - if hasattr(thisTrait, 'symbol'): - geneSymbolList.append(thisTrait.symbol) - - SymbolValueDict = correlationFunction.getGeneSymbolTissueValueDictForTrait(cursor=self.cursor, GeneNameLst=geneSymbolList, TissueProbeSetFreezeId=TISSUE_MOUSE_DB) + trait_symbol_and_values = correlation_functions.get_trait_symbol_and_tissue_values( + gene_name_list = [self.this_trait.symbol]) - for thisTrait in traitList: - if hasattr(thisTrait, 'symbol') and thisTrait.symbol and thisTrait.symbol.lower() in SymbolValueDict: - oneTraitValue = SymbolValueDict[thisTrait.symbol.lower()] - if method in ["2","5"]: - result = correlationFunction.calZeroOrderCorrForTiss( primaryTraitValue, oneTraitValue, method='spearman' ) - else: - result = correlationFunction.calZeroOrderCorrForTiss( primaryTraitValue, oneTraitValue) - thisTrait.tissueCorr = result[0] - thisTrait.tissuePValue = result[2] - else: - thisTrait.tissueCorr = None - thisTrait.tissuePValue = None - else: - for thisTrait in traitList: - thisTrait.tissueCorr = None - thisTrait.tissuePValue = None + if self.this_trait.symbol.lower() in trait_symbol_and_values: + primary_trait_value = trait_symbol_and_values[self.this_trait_symbol.lower()] + + #gene_symbol_list = [] + # + #for trait in self.correlation_results: + # if hasattr(trait, 'symbol'): + # gene_symbol_list.append(trait.symbol) + + gene_symbol_list = [trait.symbol for trait in self.correlation_results if trait.symbol] + + symbol_value_dict = correlation_functions.get_trait_gene_symbol_and_tissue_values( + gene_symbol_list=gene_symbol_list) + + for trait in self.correlation_results: + if trait.symbol and trait.symbol.lower() in symbol_value_dict: + this_trait_value = symbol_value_dict[trait.symbol.lower()] + + result = correlation_functions.calZeroOrderCorrForTiss(primary_trait_value, + this_trait_value, + self.corr_method) + + trait.tissue_corr = result[0] + trait.tissue_pvalue = result[2] + # else: + # trait.tissue_corr = None + # trait.tissue_pvalue = None + #else: + # for trait in self.correlation_results: + # trait.tissue_corr = None + # trait.tissue_pvalue = None - return traitList + #return self.correlation_results diff --git a/wqflask/wqflask/templates/correlation_page.html b/wqflask/wqflask/templates/correlation_page.html index efbf689c..53b12545 100644 --- a/wqflask/wqflask/templates/correlation_page.html +++ b/wqflask/wqflask/templates/correlation_page.html @@ -41,13 +41,13 @@ <td>{{ trait.symbol }}</td> <td>{{ trait.alias }}</td> <td>{{ trait.description }}</td> - <td>Chr{{ trait.chr }}: {{ trait.mb }}</td> - <td>{{ trait.mean }}</td> - <td>{{ trait.lrs }}</td> - <td>Chr{{ trait.locus_chr }}: {{ trait.locus_mb }}</td> - <td>{{ trait.sample_r }}</td> + <td>Chr{{ trait.chr }}:{{'%0.6f'|format(trait.mb)}}</td> + <td>{{'%0.3f'|format(trait.mean)}}</td> + <td>{{'%0.3f'|format(trait.lrs)}}</td> + <td>Chr{{ trait.locus_chr }}:{{'%0.6f'|format(trait.locus_mb)}}</td> + <td>{{'%0.3f'|format(trait.sample_r)}}</td> <td>{{ trait.num_overlap }}</td> - <td>{{ trait.sample_p }}</td> + <td>{{'%0.3e'|format(trait.sample_p)}}</td> </tr> {% endfor %} </tbody> |