From 615b861dfd05c04df2e1a753dd135b07c1d88a94 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Wed, 19 Jun 2013 23:04:21 +0000 Subject: Moved the normalize_values function to separate file corr_result_helpers.py Added docstring test to normalize_values Number of overlapping samples column now displays correctly in the correlation results page --- wqflask/base/trait.py | 7 - wqflask/utility/corr_result_helpers.py | 30 ++++ wqflask/wqflask/correlation/show_corr_results.py | 183 ++--------------------- 3 files changed, 44 insertions(+), 176 deletions(-) create mode 100644 wqflask/utility/corr_result_helpers.py (limited to 'wqflask') diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index 801d32c2..3429d9c1 100755 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -286,7 +286,6 @@ class GeneralTrait(object): escape(self.dataset.name), escape(self.name)) trait_info = g.db.execute(query).fetchone() - #print("trait_info is: ", pf(trait_info)) #XZ, 05/08/2009: We also should use Geno.Id to find marker instead of just using Geno.Name # to avoid the problem of same marker name from different species. elif self.dataset.type == 'Geno': @@ -319,7 +318,6 @@ class GeneralTrait(object): #XZ: assign SQL query result to trait attributes. for i, field in enumerate(self.dataset.display_fields): - print(" mike: {} -> {} - {}".format(field, type(trait_info[i]), trait_info[i])) setattr(self, field, trait_info[i]) if self.dataset.type == 'Publish': @@ -329,9 +327,6 @@ class GeneralTrait(object): self.homologeneid = None - print("self.geneid is:", self.geneid) - print(" type:", type(self.geneid)) - print("self.dataset.group.name is:", self.dataset.group.name) if self.dataset.type == 'ProbeSet' and self.dataset.group and self.geneid: #XZ, 05/26/2010: From time to time, this query get error message because some geneid values in database are not number. #XZ: So I have to test if geneid is number before execute the query. @@ -356,7 +351,6 @@ class GeneralTrait(object): InbredSet.SpeciesId = Species.Id AND Species.TaxonomyId = Homologene.TaxonomyId """ % (escape(str(self.geneid)), escape(self.dataset.group.name)) - print("-> query is:", query) result = g.db.execute(query).fetchone() #else: # result = None @@ -388,7 +382,6 @@ class GeneralTrait(object): Geno.Name = '{}' and Geno.SpeciesId = Species.Id """.format(self.dataset.group.species, self.locus) - print("query is:", query) result = g.db.execute(query).fetchone() self.locus_chr = result[0] self.locus_mb = result[1] diff --git a/wqflask/utility/corr_result_helpers.py b/wqflask/utility/corr_result_helpers.py new file mode 100644 index 00000000..edf32449 --- /dev/null +++ b/wqflask/utility/corr_result_helpers.py @@ -0,0 +1,30 @@ +def normalize_values(a_values, b_values): + """ + Trim two lists of values to contain only the values they both share + + Given two lists of sample values, trim each list so that it contains + only the samples that contain a value in both lists. Also returns + the number of such samples. + + >>> normalize_values([2.3, None, None, 3.2, 4.1, 5], [3.4, 7.2, 1.3, None, 6.2, 4.1]) + ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3) + + """ + + min_length = min(len(a_values), len(b_values)) + a_new = [] + b_new = [] + for counter in range(min_length): + if a_values[counter] and b_values[counter]: + a_new.append(a_values[counter]) + b_new.append(b_values[counter]) + + num_overlap = len(a_new) + assert num_overlap == len(b_new), "Lengths should be the same" + + return a_new, b_new, num_overlap + + +if __name__ == '__main__': + import doctest + doctest.testmod() \ No newline at end of file diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 3b1ac87d..1410ae0c 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -20,6 +20,9 @@ from __future__ import absolute_import, print_function, division +import sys +sys.path.append(".") + import gc import string import cPickle @@ -43,7 +46,7 @@ from utility.TDCell import TDCell from base.trait import GeneralTrait from base import data_set from base.templatePage import templatePage -from utility import webqtlUtil, helper_functions +from utility import webqtlUtil, helper_functions, corr_result_helpers from dbFunction import webqtlDatabaseFunction import utility.webqtlUtil #this is for parallel computing only. from wqflask.correlation import correlationFunction @@ -122,22 +125,24 @@ class CorrelationResults(object): self.correlation_data = {} for trait, values in self.target_dataset.trait_data.iteritems(): - this_trait_values = [] - target_values = [] + this_trait_vals = [] + target_vals = [] for index, sample in enumerate(self.target_dataset.samplelist): if sample in self.sample_data: sample_value = self.sample_data[sample] target_sample_value = values[index] - this_trait_values.append(sample_value) - target_values.append(target_sample_value) + this_trait_vals.append(sample_value) + target_vals.append(target_sample_value) + + this_trait_vals, target_vals, num_overlap = corr_result_helpers.normalize_values( + this_trait_vals, target_vals) - this_trait_values, target_values, num_overlap = normalize_values(this_trait_values, - target_values) + print("num_overlap:", num_overlap) if self.corr_method == 'pearson': - sample_r, sample_p = scipy.stats.pearsonr(this_trait_values, target_values) + sample_r, sample_p = scipy.stats.pearsonr(this_trait_vals, target_vals) else: - sample_r, sample_p = scipy.stats.spearmanr(this_trait_values, target_values) + sample_r, sample_p = scipy.stats.spearmanr(this_trait_vals, target_vals) self.correlation_data[trait] = [sample_r, sample_p, num_overlap] @@ -940,163 +945,3 @@ class CorrelationResults(object): return traitList - - def createExcelFileWithTitleAndFooter(self, workbook=None, identification=None, db=None, returnNumber=None): - - worksheet = workbook.add_worksheet() - - titleStyle = workbook.add_format(align = 'left', bold = 0, size=14, border = 1, border_color="gray") - - ##Write title Info - # Modified by Hongqiang Li - worksheet.write([1, 0], "Citations: Please see %s/reference.html" % webqtlConfig.PORTADDR, titleStyle) - worksheet.write([1, 0], "Citations: Please see %s/reference.html" % webqtlConfig.PORTADDR, titleStyle) - worksheet.write([2, 0], "Trait : %s" % identification, titleStyle) - worksheet.write([3, 0], "Database : %s" % db.fullname, titleStyle) - worksheet.write([4, 0], "Date : %s" % time.strftime("%B %d, %Y", time.gmtime()), titleStyle) - worksheet.write([5, 0], "Time : %s GMT" % time.strftime("%H:%M ", time.gmtime()), titleStyle) - worksheet.write([6, 0], "Status of data ownership: Possibly unpublished data; please see %s/statusandContact.html for details on sources, ownership, and usage of these data." % webqtlConfig.PORTADDR, titleStyle) - #Write footer info - worksheet.write([9 + returnNumber, 0], "Funding for The GeneNetwork: NIAAA (U01AA13499, U24AA13513), NIDA, NIMH, and NIAAA (P20-DA21131), NCI MMHCC (U01CA105417), and NCRR (U01NR 105417)", titleStyle) - worksheet.write([10 + returnNumber, 0], "PLEASE RETAIN DATA SOURCE INFORMATION WHENEVER POSSIBLE", titleStyle) - - return worksheet - - - def getTableHeaderForGeno(self, method=None, worksheet=None, newrow=None, headingStyle=None): - - tblobj_header = [] - - if method in ["1","3","4"]: - tblobj_header = [[THCell(HT.TD(' ', Class="fs13 fwb ffl b1 cw cbrb"), sort=0), - THCell(HT.TD('Record', HT.BR(), 'ID', HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text='Record ID', idx=1), - THCell(HT.TD('Location', HT.BR(), 'Chr and Mb', HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text='Location (Chr and Mb)', idx=2), - THCell(HT.TD(HT.Href( - text = HT.Span('Sample',HT.BR(), 'r', HT.Sup(' ?', style="color:#f00"),HT.BR(), Class="fs13 fwb ffl cw"), - target = '_blank', - url = "/correlationAnnotation.html#genetic_r"), - Class="fs13 fwb ffl b1 cw cbrb", nowrap='ON'), text="Sample r", idx=3), - THCell(HT.TD('N',HT.BR(),'Cases',HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text="N Cases", idx=4), - THCell(HT.TD(HT.Href( - text = HT.Span('Sample',HT.BR(), 'p(r)', HT.Sup(' ?', style="color:#f00"),HT.BR(), Class="fs13 fwb ffl cw"), - target = '_blank', - url = "/correlationAnnotation.html#genetic_p_r"), - Class="fs13 fwb ffl b1 cw cbrb", nowrap='ON'), text="Sample p(r)", idx=5)]] - - for ncol, item in enumerate(['Record ID', 'Location (Chr, Mb)', 'Sample r', 'N Cases', 'Sample p(r)']): - worksheet.write([newrow, ncol], item, headingStyle) - worksheet.set_column([ncol, ncol], 2*len(item)) - else: - tblobj_header = [[THCell(HT.TD(' ', Class="fs13 fwb ffl b1 cw cbrb"), sort=0), - THCell(HT.TD('Record', HT.BR(), 'ID', HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text='Record ID', idx=1), - THCell(HT.TD('Location', HT.BR(), 'Chr and Mb', HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text='Location (Chr and Mb)', idx=2), - THCell(HT.TD(HT.Href( - text = HT.Span('Sample',HT.BR(), 'rho', HT.Sup(' ?', style="color:#f00"),HT.BR(), Class="fs13 fwb ffl cw"), - target = '_blank', - url = "/correlationAnnotation.html#genetic_rho"), - Class="fs13 fwb ffl b1 cw cbrb", nowrap='ON'), text="Sample rho", idx=3), - THCell(HT.TD('N',HT.BR(),'Cases',HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text="N Cases", idx=4), - THCell(HT.TD(HT.Href( - text = HT.Span('Sample',HT.BR(), 'p(rho)', HT.Sup(' ?', style="color:#f00"),HT.BR(), Class="fs13 fwb ffl cw"), - target = '_blank', - url = "/correlationAnnotation.html#genetic_p_rho"), - Class="fs13 fwb ffl b1 cw cbrb", nowrap='ON'), text="Sample p(rho)", idx=5)]] - - for ncol, item in enumerate(['Record ID', 'Location (Chr, Mb)', 'Sample rho', 'N Cases', 'Sample p(rho)']): - worksheet.write([newrow, ncol], item, headingStyle) - worksheet.set_column([ncol, ncol], 2*len(item)) - - - return tblobj_header, worksheet - - - def getTableBodyForGeno(self, traitList, formName=None, worksheet=None, newrow=None, corrScript=None): - - tblobj_body = [] - - for thisTrait in traitList: - tr = [] - - trId = str(thisTrait) - - corrScript.append('corrArray["%s"] = {corr:%1.4f};' % (trId, thisTrait.corr)) - - tr.append(TDCell(HT.TD(HT.Input(type="checkbox", Class="checkbox", name="searchResult",value=trId, onClick="highlight(this)"), nowrap="on", Class="fs12 fwn ffl b1 c222"), text=trId)) - - tr.append(TDCell(HT.TD(HT.Href(text=thisTrait.name,url="javascript:showTrait('%s', '%s')" % (formName, thisTrait.name), Class="fs12 fwn ffl"),align="left", Class="fs12 fwn ffl b1 c222"), text=thisTrait.name, val=thisTrait.name.upper())) - - #XZ: trait_location_value is used for sorting - trait_location_repr = '--' - trait_location_value = 1000000 - - if thisTrait.chr and thisTrait.mb: - try: - trait_location_value = int(thisTrait.chr)*1000 + thisTrait.mb - except: - if thisTrait.chr.upper() == 'X': - trait_location_value = 20*1000 + thisTrait.mb - else: - trait_location_value = ord(str(thisTrait.chr).upper()[0])*1000 + thisTrait.mb - - trait_location_repr = 'Chr%s: %.6f' % (thisTrait.chr, float(thisTrait.mb) ) - - tr.append(TDCell(HT.TD(trait_location_repr, Class="fs12 fwn b1 c222", nowrap="on"), trait_location_repr, trait_location_value)) - - - repr='%3.3f' % thisTrait.corr - tr.append(TDCell(HT.TD(HT.Href(text=repr, url="javascript:showCorrPlot('%s', '%s')" % (formName, thisTrait.name), Class="fs12 fwn ffl"), Class="fs12 fwn ffl b1 c222", nowrap='ON', align='right'),repr,abs(thisTrait.corr))) - - repr = '%d' % thisTrait.nOverlap - tr.append(TDCell(HT.TD(repr, Class="fs12 fwn ffl b1 c222",align='right'),repr,thisTrait.nOverlap)) - - repr = webqtlUtil.SciFloat(thisTrait.corrPValue) - tr.append(TDCell(HT.TD(repr,nowrap='ON', Class="fs12 fwn ffl b1 c222", align='right'),repr,thisTrait.corrPValue)) - - tblobj_body.append(tr) - - for ncol, item in enumerate([thisTrait.name, trait_location_repr, thisTrait.corr, thisTrait.nOverlap, thisTrait.corrPValue]): - worksheet.write([newrow, ncol], item) - newrow += 1 - - return tblobj_body, worksheet, corrScript - -def normalize_values(values_1, values_2): - N = min(len(values_1), len(values_2)) - X = [] - Y = [] - for i in range(N): - if values_1[i]!= None and values_2[i]!= None: - X.append(values_1[i]) - Y.append(values_2[i]) - num_overlap = len(X) - - return (X, Y, num_overlap) - - -def cal_correlation(values_1, values_2): - N = min(len(values_1), len(values_2)) - X = [] - Y = [] - for i in range(N): - if values_1[i]!= None and values_2[i]!= None: - X.append(values_1[i]) - Y.append(values_2[i]) - NN = len(X) - if NN <6: - return (0.0,NN) - sx = reduce(lambda x,y:x+y,X,0.0) - sy = reduce(lambda x,y:x+y,Y,0.0) - x_mean = sx/NN - y_mean = sy/NN - xyd = 0.0 - sxd = 0.0 - syd = 0.0 - for i in range(NN): - xyd += (X[i] - x_mean)*(Y[i] - y_mean) - sxd += (X[i] - x_mean)*(X[i] - x_mean) - syd += (Y[i] - y_mean)*(Y[i] - y_mean) - try: - corr = xyd/(sqrt(sxd)*sqrt(syd)) - except: - corr = 0 - return (corr, NN) -- cgit v1.2.3 From d95eff3f0410ec5cddd82398f6db0fffa5d9be0b Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Thu, 27 Jun 2013 21:49:58 +0000 Subject: Fixed N Cases column on correlation results page --- wqflask/wqflask/correlation/show_corr_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 1410ae0c..03059d68 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -157,7 +157,7 @@ class CorrelationResults(object): trait_object = GeneralTrait(dataset=self.dataset, name=trait, get_qtl_info=True) trait_object.sample_r = self.correlation_data[trait][0] trait_object.sample_p = self.correlation_data[trait][1] - trait_object_num_overlap = self.correlation_data[trait][2] + trait_object.num_overlap = self.correlation_data[trait][2] self.correlation_results.append(trait_object) #self.correlation_data_slice[trait] = self.correlation_data[trait] -- cgit v1.2.3 From ad4a6f4d12df9a00f2a9b925f1147a26c6ee0227 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Fri, 5 Jul 2013 16:25:13 +0000 Subject: Change Correlation table value format --- wqflask/wqflask/templates/correlation_page.html | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/templates/correlation_page.html b/wqflask/wqflask/templates/correlation_page.html index efbf689c..53b12545 100644 --- a/wqflask/wqflask/templates/correlation_page.html +++ b/wqflask/wqflask/templates/correlation_page.html @@ -41,13 +41,13 @@ {{ trait.symbol }} {{ trait.alias }} {{ trait.description }} - Chr{{ trait.chr }}: {{ trait.mb }} - {{ trait.mean }} - {{ trait.lrs }} - Chr{{ trait.locus_chr }}: {{ trait.locus_mb }} - {{ trait.sample_r }} + Chr{{ trait.chr }}:{{'%0.6f'|format(trait.mb)}} + {{'%0.3f'|format(trait.mean)}} + {{'%0.3f'|format(trait.lrs)}} + Chr{{ trait.locus_chr }}:{{'%0.6f'|format(trait.locus_mb)}} + {{'%0.3f'|format(trait.sample_r)}} {{ trait.num_overlap }} - {{ trait.sample_p }} + {{'%0.3e'|format(trait.sample_p)}} {% endfor %} -- cgit v1.2.3 From 0feeb303cb1874cffd6e20f2758dcd578247bd54 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Thu, 11 Jul 2013 22:00:38 +0000 Subject: Began changing the style of the code related to the tissue correlation column of the correlation page --- wqflask/wqflask/correlation/show_corr_results.py | 125 ++++++++++++++--------- 1 file changed, 74 insertions(+), 51 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 11eca936..4a9aea73 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -52,6 +52,8 @@ import utility.webqtlUtil #this is for parallel computing only. from wqflask.correlation import correlationFunction from utility.benchmark import Bench +from MySQLdb import escape_string as escape + from pprint import pformat as pf METHOD_SAMPLE_PEARSON = "1" @@ -101,13 +103,14 @@ class CorrelationResults(object): self.sample_data = {} self.corr_method = start_vars['corr_sample_method'] + self.return_number = 500 #The two if statements below append samples to the sample list based upon whether the user #rselected Primary Samples Only, Other Samples Only, or All Samples primary_samples = (self.dataset.group.parlist + - self.dataset.group.f1list + - self.dataset.group.samplelist) + self.dataset.group.f1list + + self.dataset.group.samplelist) #If either BXD/whatever Only or All Samples, append all of that group's samplelist if corr_samples_group != 'samples_other': @@ -153,7 +156,7 @@ class CorrelationResults(object): #self.correlation_data_slice = collections.OrderedDict() - for trait_counter, trait in enumerate(self.correlation_data.keys()[:300]): + for trait_counter, trait in enumerate(self.correlation_data.keys()[:self.return_number]): trait_object = GeneralTrait(dataset=self.dataset, name=trait, get_qtl_info=True) trait_object.sample_r = self.correlation_data[trait][0] trait_object.sample_p = self.correlation_data[trait][1] @@ -203,13 +206,6 @@ class CorrelationResults(object): # mb = trait_object.mb # ) - - - #trait_list = self.getTissueCorrelationByList( primary_trait_symbol = self.this_trait.symbol, - # corr_results = self.correlation_results, - # TissueProbeSetFreezeId = 1, - # method=1) - #XZ, 09/18/2008: get all information about the user selected database. #target_db_name = fd.corr_dataset #self.target_db_name = start_vars['corr_dataset'] @@ -531,9 +527,13 @@ class CorrelationResults(object): #XZ, 09/23/2008: In tissue correlation tables, there is no record of GeneId1 == GeneId2 #XZ, 09/24/2008: Note that the correlation value can be negative. - def getTempTissueCorrTable(self, primaryTraitSymbol="", TissueProbeSetFreezeId=0, method="", returnNumber=0): + def get_temp_tissue_corr_table(self, + tissue_probesetfreeze_id=0, + method="", + return_number=0): + - def cmpTissCorrAbsoluteValue(A, B): + def cmp_tisscorr_absolute_value(A, B): try: if abs(A[1]) < abs(B[1]): return 1 elif abs(A[1]) == abs(B[1]): @@ -542,26 +542,28 @@ class CorrelationResults(object): except: return 0 - symbolCorrDict, symbolPvalueDict = self.calculateCorrOfAllTissueTrait(primaryTraitSymbol=primaryTraitSymbol, TissueProbeSetFreezeId=TISSUE_MOUSE_DB, method=method) + symbol_corr_dict, symbol_pvalue_dict = self.calculate_corr_all_tissue_trait( + tissue_probesetfreeze_id=TISSUE_MOUSE_DB, + method=method) - symbolCorrList = symbolCorrDict.items() + symbol_corr_list = symbol_corr_dict.items() - symbolCorrList.sort(cmpTissCorrAbsoluteValue) - symbolCorrList = symbolCorrList[0 : 2*returnNumber] + symbol_corr_list.sort(cmp_tisscorr_absolute_value) + symbol_corr_list = symbol_corr_list[0 : 2*return_number] - tmpTableName = webqtlUtil.genRandStr(prefix="TOPTISSUE") + tmp_table_name = webqtlUtil.genRandStr(prefix="TOPTISSUE") - q1 = 'CREATE TEMPORARY TABLE %s (Symbol varchar(100) PRIMARY KEY, Correlation float, PValue float)' % tmpTableName + q1 = 'CREATE TEMPORARY TABLE %s (Symbol varchar(100) PRIMARY KEY, Correlation float, PValue float)' % tmp_table_name self.cursor.execute(q1) - for one_pair in symbolCorrList: + for one_pair in symbol_corr_list: one_symbol = one_pair[0] one_corr = one_pair[1] - one_p_value = symbolPvalueDict[one_symbol] + one_p_value = symbol_pvalue_dict[one_symbol] self.cursor.execute( "INSERT INTO %s (Symbol, Correlation, PValue) VALUES ('%s',%f,%f)" % (tmpTableName, one_symbol, float(one_corr), float(one_p_value)) ) - return tmpTableName + return tmp_table_name #XZ, 01/09/2009: This function was created by David Crowell. Xiaodong cleaned up and modified it. @@ -591,37 +593,47 @@ class CorrelationResults(object): return litCorrDict + def fetch_tissue_correlations(self, method=""): + """Comments Possibly Out of Date!!!!! + + + Uses getTempTissueCorrTable to generate table of tissue correlations. + + This function then gathers that data and pairs it with the TraitID string. + Takes as its arguments a formdata instance, and a database instance. + Returns a dictionary of 'TraitID':(tissueCorr, tissuePValue) + for the requested correlation + + """ - #XZ, 01/09/2009: Xiaodong created this function. - def fetchTissueCorrelations(self, db, primaryTraitSymbol="", TissueProbeSetFreezeId=0, method="", returnNumber = 0): - """Uses getTempTissueCorrTable to generate table of tissue correlations. This function then gathers that data and - pairs it with the TraitID string. Takes as its arguments a formdata instance, and a database instance. - Returns a dictionary of 'TraitID':(tissueCorr, tissuePValue) for the requested correlation""" - - - tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=primaryTraitSymbol, TissueProbeSetFreezeId=TISSUE_MOUSE_DB, method=method, returnNumber=returnNumber) + # table name string + temp_table = self.get_temp_tissue_corr_table(tissue_probesetfreeze_id=TISSUE_MOUSE_DB, + method=method) - query = "SELECT ProbeSet.Name, %s.Correlation, %s.PValue" % (tempTable, tempTable) - query += ' FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)' - query += ' LEFT JOIN %s ON %s.Symbol=ProbeSet.Symbol ' % (tempTable,tempTable) - query += "WHERE ProbeSetFreeze.Name = '%s' and ProbeSetFreeze.Id=ProbeSetXRef.ProbeSetFreezeId and ProbeSet.Id = ProbeSetXRef.ProbeSetId and ProbeSet.Symbol IS NOT NULL AND %s.Correlation IS NOT NULL" % (db.name, tempTable) + query = """SELECT ProbeSet.Name, {}.Correlation, {}.PValue + FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) + LEFT JOIN {} ON {}.Symbol=ProbeSet.Symbol + WHERE ProbeSetFreeze.Name = '{}' + and ProbeSetFreeze.Id=ProbeSetXRef.ProbeSetFreezeId + and ProbeSet.Id = ProbeSetXRef.ProbeSetId + and ProbeSet.Symbol IS NOT NULL + and {}.Correlation IS NOT NULL""".format(dataset.mescape( + temp_table, temp_table, temp_table, temp_table, + self.dataset.name, temp_table)) - self.cursor.execute(query) - results = self.cursor.fetchall() + results = g.db.execute(query).fetchall() - tissueCorrDict = {} + tissue_corr_dict = {} for entry in results: - traitName, tissueCorr, tissuePValue = entry - tissueCorrDict[traitName] = (tissueCorr, tissuePValue) - - self.cursor.execute('DROP TEMPORARY TABLE %s' % tempTable) + trait_name, tissue_corr, tissue_pvalue = entry + tissue_corr_dict[trait_name] = (tissue_corr, tissue_pvalue) - return tissueCorrDict + g.db.execute('DROP TEMPORARY TABLE {}'.format(escape(temp_table)) + return tissue_corr_dict - #XZ, 01/13/2008 def getLiteratureCorrelationByList(self, input_trait_mouse_geneid=None, species=None, traitList=None): tmpTableName = webqtlUtil.genRandStr(prefix="LITERATURE") @@ -672,7 +684,7 @@ class CorrelationResults(object): use_tissue_corr = False if self.method in TISSUE_METHODS: - tissue_corrs = self.fetchTissueCorrelations(db=self.db, primaryTraitSymbol=self.trait_symbol, TissueProbeSetFreezeId=TISSUE_MOUSE_DB, method=self.method, returnNumber = self.returnNumber) + tissue_corrs = self.fetch_tissue_correlations(method=self.method, return_number = self.return_number) use_tissue_corr = True DatabaseFileName = self.getFileName( target_db_name=self.target_db_name ) @@ -897,20 +909,31 @@ class CorrelationResults(object): return trait_list """ - def calculateCorrOfAllTissueTrait(self, primaryTraitSymbol=None, TissueProbeSetFreezeId=None, method=None): + def calculate_corr_all_tissue_trait(self, tissue_probesetfreeze_id=None, method=None): - symbolCorrDict = {} - symbolPvalueDict = {} + symbol_corr_dict = {} + symbol_pvalue_dict = {} - primaryTraitSymbolValueDict = correlationFunction.getGeneSymbolTissueValueDictForTrait(cursor=self.cursor, GeneNameLst=[primaryTraitSymbol], TissueProbeSetFreezeId=TISSUE_MOUSE_DB) - primaryTraitValue = primaryTraitSymbolValueDict.values()[0] + primary_trait_symbol_value_dict = correlation_function. + get_genesymbol_tissue_value_dict_trait(cursor=self.cursor, + GeneNameLst=[primaryTraitSymbol], + TissueProbeSetFreezeId=TISSUE_MOUSE_DB) + primary_trait_value = primary_trait_symbol_value_dict.values()[0] - SymbolValueDict = correlationFunction.getGeneSymbolTissueValueDictForTrait(cursor=self.cursor, GeneNameLst=[], TissueProbeSetFreezeId=TISSUE_MOUSE_DB) + symbol_value_dict = correlation_function.get_genesymbol_tissue_value_dict_trait( + cursor=self.cursor, + gene_name_list=[], + tissue_probeSetfreeze_id=TISSUE_MOUSE_DB) if method in ["2","5"]: - symbolCorrDict, symbolPvalueDict = correlationFunction.batchCalTissueCorr(primaryTraitValue,SymbolValueDict,method='spearman') + symbol_corr_dict, symbol_pvalue_dict = correlation_function.batch_cal_tissue_corr( + primaryTraitValue, + SymbolValueDict, + method='spearman') else: - symbolCorrDict, symbolPvalueDict = correlationFunction.batchCalTissueCorr(primaryTraitValue,SymbolValueDict) + symbol_corr_dict, symbol_pvalue_dict = correlation_function.batch_cal_tissue_corr( + primaryTraitValue, + SymbolValueDict) return (symbolCorrDict, symbolPvalueDict) -- cgit v1.2.3 From 155e2997613c0750de30b734686f8977524956f9 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Fri, 12 Jul 2013 22:58:39 +0000 Subject: Rewrote code related to getting the tissue correlation column to display Created new files for mrna assay tissue data and commonly used db query related functions --- wqflask/base/mrna_assay_tissue_data.py | 134 +++ wqflask/utility/db_tools.py | 15 + wqflask/wqflask/correlation/correlationFunction.py | 923 -------------------- .../wqflask/correlation/correlation_functions.py | 959 +++++++++++++++++++++ wqflask/wqflask/correlation/show_corr_results.py | 76 +- 5 files changed, 1151 insertions(+), 956 deletions(-) create mode 100644 wqflask/base/mrna_assay_tissue_data.py create mode 100644 wqflask/utility/db_tools.py delete mode 100644 wqflask/wqflask/correlation/correlationFunction.py create mode 100644 wqflask/wqflask/correlation/correlation_functions.py (limited to 'wqflask') diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py new file mode 100644 index 00000000..8ae71858 --- /dev/null +++ b/wqflask/base/mrna_assay_tissue_data.py @@ -0,0 +1,134 @@ +from __future__ import absolute_import, print_function, division + +import collections + +from flask import g + +from utility import dbtools +from uitility import Bunch + +from MySQLdb import escape_string as escape + +class MrnaAssayTissueData(object): + + def __init__(self, gene_symbols=None): + self.gene_symbols = gene_symbols + self.have_data = False + if self.gene_symbols == None: + self.gene_symbols = [] + + self.data = collections.defaultdict(Bunch) + + #self.gene_id_dict ={} + #self.data_id_dict = {} + #self.chr_dict = {} + #self.mb_dict = {} + #self.desc_dict = {} + #self.probe_target_desc_dict = {} + + query = '''select t.Symbol, t.GeneId, t.DataId,t.Chr, t.Mb, t.description, t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=1 and ''' + + # Note that inner join is necessary in this query to get distinct record in one symbol group + # with highest mean value + # Due to the limit size of TissueProbeSetFreezeId table in DB, + # performance of inner join is acceptable. + if len(gene_symbols) == 0: + query += '''Symbol!='' and Symbol Is Not Null group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol + and t.Mean = x.maxmean; + ''' + else: + in_clause = dbtools.create_in_clause(gene_symbols) + + query += ''' Symbol in {} group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol + and t.Mean = x.maxmean; + '''.format(in_clause) + + results = g.db.execute(query).fetchall() + for result in results: + symbol = item[0] + gene_symbols.append(symbol) + symbol = symbol.lower() + + self.data[symbol].gene_id = result.GeneId + self.data[symbol].data_id = result.DataId + self.data[symbol].chr = result.Chr + self.data[symbol].mb = result.Mb + self.data[symbol].description = result.description + self.data[symbol].probe_target_description = result.Probe_Target_Description + + + ########################################################################### + #Input: cursor, symbolList (list), dataIdDict(Dict) + #output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, + # key is symbol, value is one list of expression values of one probeSet; + #function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). + #Attention! All keys are lower case! + ########################################################################### + def get_symbol_value_pairs(self): + + id_list = [self.tissue_data[symbol.lower()].data_id for item in self.tissue_data] + + symbol_value_pairs = {} + value_list=[] + + query = """SELECT value, id + FROM TissueProbeSetData + WHERE Id IN {}""".format(create_in_clause(id_list)) + + try : + results = g.db.execute(query).fetchall() + for result in results: + value_list.append(result.value) + symbol_value_pairs[symbol] = value_list + except: + symbol_value_pairs[symbol] = None + + #for symbol in symbol_list: + # if tissue_data.has_key(symbol): + # data_id = tissue_data[symbol].data_id + # + # query = """select value, id + # from TissueProbeSetData + # where Id={}""".format(escape(data_id)) + # try : + # results = g.db.execute(query).fetchall() + # for item in results: + # item = item[0] + # value_list.append(item) + # symbol_value_pairs[symbol] = value_list + # value_list=[] + # except: + # symbol_value_pairs[symbol] = None + + return symbol_value_pairs + + ######################################################################################################## + #input: cursor, symbolList (list), dataIdDict(Dict): key is symbol + #output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. + # key is symbol, value is one list of expression values of one probeSet. + #function: wrapper function for getSymbolValuePairDict function + # build gene symbol list if necessary, cut it into small lists if necessary, + # then call getSymbolValuePairDict function and merge the results. + ######################################################################################################## + + def get_trait_symbol_and_tissue_values(symbol_list=None): + tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list) + + #symbolList, + #geneIdDict, + #dataIdDict, + #ChrDict, + #MbDict, + #descDict, + #pTargetDescDict = getTissueProbeSetXRefInfo( + # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) + + if len(tissue_data.gene_symbols): + return get_symbol_value_pairs(tissue_data) + diff --git a/wqflask/utility/db_tools.py b/wqflask/utility/db_tools.py new file mode 100644 index 00000000..4034f39c --- /dev/null +++ b/wqflask/utility/db_tools.py @@ -0,0 +1,15 @@ +from __future__ import absolute_import, print_function, division + +from MySQLdb import escape_string as escape + +def create_in_clause(items): + """Create an in clause for mysql""" + in_clause = ', '.join("'{}'".format(x) for x in mescape(*items)) + in_clause = '( {} )'.format(in_clause) + return in_clause + +def mescape(*items): + """Multiple escape""" + escaped = [escape(str(item)) for item in items] + #print("escaped is:", escaped) + return escaped diff --git a/wqflask/wqflask/correlation/correlationFunction.py b/wqflask/wqflask/correlation/correlationFunction.py deleted file mode 100644 index 7d4b58a9..00000000 --- a/wqflask/wqflask/correlation/correlationFunction.py +++ /dev/null @@ -1,923 +0,0 @@ -# Copyright (C) University of Tennessee Health Science Center, Memphis, TN. -# -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU Affero General Public License -# as published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# See the GNU Affero General Public License for more details. -# -# This program is available from Source Forge: at GeneNetwork Project -# (sourceforge.net/projects/genenetwork/). -# -# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) -# at rwilliams@uthsc.edu and xzhou15@uthsc.edu -# -# -# -# This module is used by GeneNetwork project (www.genenetwork.org) -# -# Created by GeneNetwork Core Team 2010/08/10 -# -# Last updated by NL 2011/03/23 - - -import math -#import rpy2.robjects -import pp -import string - -from utility import webqtlUtil -from base.trait import GeneralTrait -from dbFunction import webqtlDatabaseFunction - - - -#XZ: The input 'controls' is String. It contains the full name of control traits. -#XZ: The input variable 'strainlst' is List. It contains the strain names of primary trait. -#XZ: The returned tcstrains is the list of list [[],[]...]. So are tcvals and tcvars. The last returned parameter is list of numbers. -#XZ, 03/29/2010: For each returned control trait, there is no None value in it. -def controlStrains(controls, strainlst): - - controls = controls.split(',') - - cvals = {} - for oneTraitName in controls: - oneTrait = webqtlTrait(fullname=oneTraitName, cursor=webqtlDatabaseFunction.getCursor() ) - oneTrait.retrieveData() - cvals[oneTraitName] = oneTrait.data - - tcstrains = [] - tcvals = [] - tcvars = [] - - for oneTraitName in controls: - strains = [] - vals = [] - vars = [] - - for _strain in strainlst: - if cvals[oneTraitName].has_key(_strain): - _val = cvals[oneTraitName][_strain].val - if _val != None: - strains.append(_strain) - vals.append(_val) - vars.append(None) - - tcstrains.append(strains) - tcvals.append(vals) - tcvars.append(vars) - - return tcstrains, tcvals, tcvars, [len(x) for x in tcstrains] - - - -#XZ, 03/29/2010: After execution of functon "controlStrains" and "fixStrains", primary trait and control traits have the same strains and in the same order. There is no 'None' value in them. -def fixStrains(_strains,_controlstrains,_vals,_controlvals,_vars,_controlvars): - """Corrects strains, vals, and vars so that all contrain only those strains common - to the reference trait and all control traits.""" - - def dictify(strains,vals,vars): - subdict = {} - for i in xrange(len(strains)): - subdict[strains[i]] = (vals[i],vars[i]) - return subdict - - #XZ: The 'dicts' is a list of dictionary. The first element is the dictionary of reference trait. The rest elements are for control traits. - dicts = [] - dicts.append(dictify(_strains,_vals,_vars)) - - nCstrains = len(_controlstrains) - for i in xrange(nCstrains): - dicts.append(dictify(_controlstrains[i],_controlvals[i],_controlvars[i])) - - _newstrains = [] - _vals = [] - _vars = [] - _controlvals = [[] for x in xrange(nCstrains)] - _controlvars = [[] for x in xrange(nCstrains)] - - for strain in _strains: - inall = True - for d in dicts: - if strain not in d: - inall = False - break - if inall: - _newstrains.append(strain) - _vals.append(dicts[0][strain][0]) - _vars.append(dicts[0][strain][1]) - for i in xrange(nCstrains): - _controlvals[i].append(dicts[i+1][strain][0]) - _controlvars[i].append(dicts[i+1][strain][1]) - - return _newstrains, _vals, _controlvals, _vars, _controlvars - - -#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. -#else, the returned list has two elements of control trait name. -def findIdenticalControlTraits ( controlVals, controlNames ): - nameOfIdenticalTraits = [] - - controlTraitNumber = len(controlVals) - - if controlTraitNumber > 1: - - #XZ: reset the precision of values and convert to string type - for oneTraitVal in controlVals: - for oneStrainVal in oneTraitVal: - oneStrainVal = '%.3f' % oneStrainVal - - for i, oneTraitVal in enumerate( controlVals ): - for j in range(i+1, controlTraitNumber): - if oneTraitVal == controlVals[j]: - nameOfIdenticalTraits.append(controlNames[i]) - nameOfIdenticalTraits.append(controlNames[j]) - - return nameOfIdenticalTraits - -#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. -#else, the returned list has two elements of control trait name. -#primaryVal is of list type. It contains value of primary trait. -#primaryName is of string type. -#controlVals is of list type. Each element is list too. Each element contain value of one control trait. -#controlNames is of list type. -def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ): - nameOfIdenticalTraits = [] - - #XZ: reset the precision of values and convert to string type - for oneStrainVal in primaryVal: - oneStrainVal = '%.3f' % oneStrainVal - - for oneTraitVal in controlVals: - for oneStrainVal in oneTraitVal: - oneStrainVal = '%.3f' % oneStrainVal - - controlTraitNumber = len(controlVals) - - if controlTraitNumber > 1: - for i, oneTraitVal in enumerate( controlVals ): - for j in range(i+1, controlTraitNumber): - if oneTraitVal == controlVals[j]: - nameOfIdenticalTraits.append(controlNames[i]) - nameOfIdenticalTraits.append(controlNames[j]) - break - - if len(nameOfIdenticalTraits) == 0: - for i, oneTraitVal in enumerate( controlVals ): - if primaryVal == oneTraitVal: - nameOfIdenticalTraits.append(primaryName) - nameOfIdenticalTraits.append(controlNames[i]) - break - - return nameOfIdenticalTraits - - - -#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order. -#XZ: No value in primaryVal and controlVals could be None. - -def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'): - - def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ): - - rpy2.robjects.r(""" -pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){ - # The partial correlation coefficient between x and y given z - # - # pcor.test is free and comes with ABSOLUTELY NO WARRANTY. - # - # x and y should be vectors - # - # z can be either a vector or a matrix - # - # use: There are two methods to calculate the partial correlation coefficient. - # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec"). - # Default is "mat". - # - # method: There are three ways to calculate the correlation coefficient, - # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods. - # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis. - # Default is "p". - # - # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z). - # If not, the missing samples will be removed just when the correlation coefficient is calculated. - # However, the number of samples for the p-value is the number of samples after removing - # all the missing samples from the whole dataset. - # Default is "T". - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(use == "mat"){ - p.use <- "Var-Cov matrix" - pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm) - }else if(use == "rec"){ - p.use <- "Recursive formula" - pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm) - }else{ - stop("use should be either rec or mat!\n") - } - - # print the method - if(gregexpr("p",method)[[1]][1] == 1){ - p.method <- "Pearson" - }else if(gregexpr("s",method)[[1]][1] == 1){ - p.method <- "Spearman" - }else if(gregexpr("k",method)[[1]][1] == 1){ - p.method <- "Kendall" - }else{ - stop("method should be pearson or spearman or kendall!\n") - } - - # sample number - n <- dim(na.omit(data.frame(x,y,z)))[1] - - # given variables' number - gn <- dim(z)[2] - - # p-value - if(p.method == "Kendall"){ - statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn))) - p.value <- 2*pnorm(-abs(statistic)) - - }else{ - statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2)) - p.value <- 2*pnorm(-abs(statistic)) - } - - data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use) -} - -# By using var-cov matrix -pcor.mat <- function(x,y,z,method="p",na.rm=T){ - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(dim(z)[2] == 0){ - stop("There should be given data\n") - } - - data <- data.frame(x,y,z) - - if(na.rm == T){ - data = na.omit(data) - } - - xdata <- na.omit(data.frame(data[,c(1,2)])) - Sxx <- cov(xdata,xdata,m=method) - - xzdata <- na.omit(data) - xdata <- data.frame(xzdata[,c(1,2)]) - zdata <- data.frame(xzdata[,-c(1,2)]) - Sxz <- cov(xdata,zdata,m=method) - - zdata <- na.omit(data.frame(data[,-c(1,2)])) - Szz <- cov(zdata,zdata,m=method) - - # is Szz positive definite? - zz.ev <- eigen(Szz)$values - if(min(zz.ev)[1]<0){ - stop("\'Szz\' is not positive definite!\n") - } - - # partial correlation - Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz) - - rxx.z <- cov2cor(Sxx.z)[1,2] - - rxx.z -} - -# By using recursive formula -pcor.rec <- function(x,y,z,method="p",na.rm=T){ - # - - x <- c(x) - y <- c(y) - z <- as.data.frame(z) - - if(dim(z)[2] == 0){ - stop("There should be given data\n") - } - - data <- data.frame(x,y,z) - - if(na.rm == T){ - data = na.omit(data) - } - - # recursive formula - if(dim(z)[2] == 1){ - tdata <- na.omit(data.frame(data[,1],data[,2])) - rxy <- cor(tdata[,1],tdata[,2],m=method) - - tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)])) - rxz <- cor(tdata[,1],tdata[,2],m=method) - - tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)])) - ryz <- cor(tdata[,1],tdata[,2],m=method) - - rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) ) - - return(rxy.z) - }else{ - x <- c(data[,1]) - y <- c(data[,2]) - z0 <- c(data[,3]) - zc <- as.data.frame(data[,-c(1,2,3)]) - - rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm) - rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm) - ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm) - - rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) ) - return(rxy.z) - } -} -""") - - R_pcorr_function = rpy2.robjects.r['pcor.test'] - R_corr_test = rpy2.robjects.r['cor.test'] - - primary = rpy2.robjects.FloatVector(range(len(primaryVal))) - for i in range(len(primaryVal)): - primary[i] = primaryVal[i] - - control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals)) - for i in range(len(controlVals)): - for j in range(len(controlVals[0])): - control[i*len(controlVals[0]) + j] = controlVals[i][j] - - allcorrelations = [] - - for targetIndex, oneTargetVals in enumerate(targetVals): - - this_primary = None - this_control = None - this_target = None - - if None in oneTargetVals: - - goodIndex = [] - for i in range(len(oneTargetVals)): - if oneTargetVals[i] != None: - goodIndex.append(i) - - this_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - this_primary[i] = primaryVal[goodIndex[i]] - - this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals)) - for i in range(len(controlVals)): - for j in range(len(goodIndex)): - this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]] - - this_target = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - this_target[i] = oneTargetVals[goodIndex[i]] - - else: - this_primary = primary - this_control = control - this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals))) - for i in range(len(oneTargetVals)): - this_target[i] = oneTargetVals[i] - - one_name = targetNames[targetIndex] - one_N = len(this_primary) - - #calculate partial correlation - one_pc_coefficient = 'NA' - one_pc_p = 1 - - try: - if method == 's': - result = R_pcorr_function(this_primary, this_target, this_control, method='s') - else: - result = R_pcorr_function(this_primary, this_target, this_control) - - #XZ: In very few cases, the returned coefficient is nan. - #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN - if result[0][0] == result[0][0]: - one_pc_coefficient = result[0][0] - #XZ: when the coefficient value is 1 (primary trait and target trait are the same), - #XZ: occationally, the returned p value is nan instead of 0. - if result[1][0] == result[1][0]: - one_pc_p = result[1][0] - elif abs(one_pc_coefficient - 1) < 0.0000001: - one_pc_p = 0 - except: - pass - - #calculate zero order correlation - one_corr_coefficient = 0 - one_corr_p = 1 - - try: - if method == 's': - R_result = R_corr_test(this_primary, this_target, method='spearman') - else: - R_result = R_corr_test(this_primary, this_target) - - one_corr_coefficient = R_result[3][0] - one_corr_p = R_result[2][0] - except: - pass - - traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ] - - allcorrelations.append(traitinfo) - - return allcorrelations - #End of function compute_partial - - - allcorrelations = [] - - target_trait_number = len(targetVals) - - if target_trait_number < 1000: - allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ) - else: - step = 1000 - job_number = math.ceil( float(target_trait_number)/step ) - - job_targetVals_lists = [] - job_targetNames_lists = [] - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, target_trait_number) - - one_job_targetVals_list = [] - one_job_targetNames_list = [] - - for i in range( starti, endi ): - one_job_targetVals_list.append( targetVals[i] ) - one_job_targetNames_list.append( targetNames[i] ) - - job_targetVals_lists.append( one_job_targetVals_list ) - job_targetNames_lists.append( one_job_targetNames_list ) - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - jobs = [] - results = [] - - for i, one_job_targetVals_list in enumerate( job_targetVals_lists ): - one_job_targetNames_list = job_targetNames_lists[i] - #pay attention to modules from outside - jobs.append( job_server.submit(func=compute_partial, args=( primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) ) - - for one_job in jobs: - one_result = one_job() - results.append( one_result ) - - for one_result in results: - for one_traitinfo in one_result: - allcorrelations.append( one_traitinfo ) - - return allcorrelations - - - -#XZ, April 30, 2010: The input primaryTrait and targetTrait are instance of webqtlTrait -#XZ: The primaryTrait and targetTrait should have executed retrieveData function -def calZeroOrderCorr (primaryTrait, targetTrait, method='pearson'): - - #primaryTrait.retrieveData() - - #there is no None value in primary_val - primary_strain, primary_val, primary_var = primaryTrait.exportInformative() - - #targetTrait.retrieveData() - - #there might be None value in target_val - target_val = targetTrait.exportData(primary_strain, type="val") - - R_primary = rpy2.robjects.FloatVector(range(len(primary_val))) - for i in range(len(primary_val)): - R_primary[i] = primary_val[i] - - N = len(target_val) - - if None in target_val: - goodIndex = [] - for i in range(len(target_val)): - if target_val[i] != None: - goodIndex.append(i) - - N = len(goodIndex) - - R_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - R_primary[i] = primary_val[goodIndex[i]] - - R_target = rpy2.robjects.FloatVector(range(len(goodIndex))) - for i in range(len(goodIndex)): - R_target[i] = target_val[goodIndex[i]] - - else: - R_target = rpy2.robjects.FloatVector(range(len(target_val))) - for i in range(len(target_val)): - R_target[i] = target_val[i] - - R_corr_test = rpy2.robjects.r['cor.test'] - - if method == 'spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - corr_result = [] - corr_result.append( R_result[3][0] ) - corr_result.append( N ) - corr_result.append( R_result[2][0] ) - - return corr_result - -##################################################################################### -#Input: primaryValue(list): one list of expression values of one probeSet, -# targetValue(list): one list of expression values of one probeSet, -# method(string): indicate correlation method ('pearson' or 'spearman') -#Output: corr_result(list): first item is Correlation Value, second item is tissue number, -# third item is PValue -#Function: get correlation value,Tissue quantity ,p value result by using R; -#Note : This function is special case since both primaryValue and targetValue are from -#the same dataset. So the length of these two parameters is the same. They are pairs. -#Also, in the datatable TissueProbeSetData, all Tissue values are loaded based on -#the same tissue order -##################################################################################### - -def calZeroOrderCorrForTiss (primaryValue=[], targetValue=[], method='pearson'): - - R_primary = rpy2.robjects.FloatVector(range(len(primaryValue))) - N = len(primaryValue) - for i in range(len(primaryValue)): - R_primary[i] = primaryValue[i] - - R_target = rpy2.robjects.FloatVector(range(len(targetValue))) - for i in range(len(targetValue)): - R_target[i]=targetValue[i] - - R_corr_test = rpy2.robjects.r['cor.test'] - if method =='spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - corr_result =[] - corr_result.append( R_result[3][0]) - corr_result.append( N ) - corr_result.append( R_result[2][0]) - - return corr_result - - - - -def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'): - - def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ): - - oneSymbolCorrDict = {} - oneSymbolPvalueDict = {} - - R_corr_test = rpy2.robjects.r['cor.test'] - - R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue))) - - for i in range(len(primaryTraitValue)): - R_primary[i] = primaryTraitValue[i] - - for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems(): - R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue))) - for i in range(len(oneTraitValue)): - R_target[i] = oneTraitValue[i] - - if method =='spearman': - R_result = R_corr_test(R_primary, R_target, method='spearman') - else: - R_result = R_corr_test(R_primary, R_target) - - oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0] - oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0] - - return(oneSymbolCorrDict, oneSymbolPvalueDict) - - - - symbolCorrDict = {} - symbolPvalueDict = {} - - items_number = len(SymbolValueDict) - - if items_number <= 1000: - symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method) - else: - items_list = SymbolValueDict.items() - - step = 1000 - job_number = math.ceil( float(items_number)/step ) - - job_oneSymbolValueDict_list = [] - - for job_index in range( int(job_number) ): - starti = job_index*step - endi = min((job_index+1)*step, items_number) - - oneSymbolValueDict = {} - - for i in range( starti, endi ): - one_item = items_list[i] - one_symbol = one_item[0] - one_value = one_item[1] - oneSymbolValueDict[one_symbol] = one_value - - job_oneSymbolValueDict_list.append( oneSymbolValueDict ) - - - ppservers = () - # Creates jobserver with automatically detected number of workers - job_server = pp.Server(ppservers=ppservers) - - jobs = [] - results = [] - - for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ): - - #pay attention to modules from outside - jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) ) - - for one_job in jobs: - one_result = one_job() - results.append( one_result ) - - for one_result in results: - oneSymbolCorrDict, oneSymbolPvalueDict = one_result - symbolCorrDict.update( oneSymbolCorrDict ) - symbolPvalueDict.update( oneSymbolPvalueDict ) - - return (symbolCorrDict, symbolPvalueDict) - -########################################################################### -#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId -#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict) -#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and -# getGeneSymbolTissueValueDict to build dict to get CorrPvArray -#Note: If there are multiple probesets for one gene, select the one with highest mean. -########################################################################### -def getTissueProbeSetXRefInfo(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): - Symbols ="" - symbolList =[] - geneIdDict ={} - dataIdDict = {} - ChrDict = {} - MbDict = {} - descDict = {} - pTargetDescDict = {} - - count = len(GeneNameLst) - - # Added by NL 01/06/2011 - # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value - # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable. - if count==0: - query=''' - select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description - from ( - select Symbol, max(Mean) as maxmean - from TissueProbeSetXRef - where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol) - as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; - '''%TissueProbeSetFreezeId - - else: - for i, item in enumerate(GeneNameLst): - - if i == count-1: - Symbols += "'%s'" %item - else: - Symbols += "'%s'," %item - - Symbols = "("+ Symbols+")" - query=''' - select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description - from ( - select Symbol, max(Mean) as maxmean - from TissueProbeSetXRef - where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol) - as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; - '''% (TissueProbeSetFreezeId,Symbols) - - try: - - cursor.execute(query) - results =cursor.fetchall() - resultCount = len(results) - # Key in all dicts is the lower-cased symbol - for i, item in enumerate(results): - symbol = item[0] - symbolList.append(symbol) - - key =symbol.lower() - geneIdDict[key]=item[1] - dataIdDict[key]=item[2] - ChrDict[key]=item[3] - MbDict[key]=item[4] - descDict[key]=item[5] - pTargetDescDict[key]=item[6] - - except: - symbolList = None - geneIdDict=None - dataIdDict=None - ChrDict=None - MbDict=None - descDict=None - pTargetDescDict=None - - return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict - -########################################################################### -#Input: cursor, symbolList (list), dataIdDict(Dict) -#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, -# key is symbol, value is one list of expression values of one probeSet; -#function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). -#Attention! All keys are lower case! -########################################################################### -def getSymbolValuePairDict(cursor=None,symbolList=None,dataIdDict={}): - symbolList = map(string.lower, symbolList) - symbolValuepairDict={} - valueList=[] - - for key in symbolList: - if dataIdDict.has_key(key): - DataId = dataIdDict[key] - - valueQuery = "select value from TissueProbeSetData where Id=%s" % DataId - try : - cursor.execute(valueQuery) - valueResults = cursor.fetchall() - for item in valueResults: - item =item[0] - valueList.append(item) - symbolValuepairDict[key] = valueList - valueList=[] - except: - symbolValuepairDict[key] = None - - return symbolValuepairDict - - -######################################################################################################## -#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol -#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. -# key is symbol, value is one list of expression values of one probeSet. -#function: wrapper function for getSymbolValuePairDict function -# build gene symbol list if necessary, cut it into small lists if necessary, -# then call getSymbolValuePairDict function and merge the results. -######################################################################################################## - -def getGeneSymbolTissueValueDict(cursor=None,symbolList=None,dataIdDict={}): - limitNum=1000 - count = len(symbolList) - - SymbolValuePairDict = {} - - if count !=0 and count <=limitNum: - SymbolValuePairDict = getSymbolValuePairDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) - - elif count >limitNum: - SymbolValuePairDict={} - n = count/limitNum - start =0 - stop =0 - - for i in range(n): - stop =limitNum*(i+1) - gList1 = symbolList[start:stop] - PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) - start =limitNum*(i+1) - - SymbolValuePairDict.update(PairDict1) - - if stop < count: - stop = count - gList2 = symbolList[start:stop] - PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) - SymbolValuePairDict.update(PairDict2) - - return SymbolValuePairDict - -######################################################################################################## -#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) -#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. -# key is symbol, value is one list of expression values of one probeSet. -#function: wrapper function of getGeneSymbolTissueValueDict function -# for CorrelationPage.py -######################################################################################################## - -def getGeneSymbolTissueValueDictForTrait(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): - SymbolValuePairDict={} - symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo(cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) - if symbolList: - SymbolValuePairDict = getGeneSymbolTissueValueDict(cursor=cursor,symbolList=symbolList,dataIdDict=dataIdDict) - return SymbolValuePairDict - -######################################################################################################## -#Input: cursor(cursor): MySQL connnection cursor; -# priGeneSymbolList(list): one list of gene symbol; -# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair, -# key is symbol, value is one list of expression values of one probeSet; -#Output: corrArray(array): array of Correlation Value, -# pvArray(array): array of PValue; -#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss -######################################################################################################## - -def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}): - # setting initial value for corrArray, pvArray equal to 0 - Num = len(priGeneSymbolList) - - corrArray = [([0] * (Num))[:] for i in range(Num)] - pvArray = [([0] * (Num))[:] for i in range(Num)] - i = 0 - for pkey in priGeneSymbolList: - j = 0 - pkey = pkey.strip().lower()# key in symbolValuepairDict is low case - if symbolValuepairDict.has_key(pkey): - priValue = symbolValuepairDict[pkey] - for tkey in priGeneSymbolList: - tkey = tkey.strip().lower()# key in symbolValuepairDict is low case - if priValue and symbolValuepairDict.has_key(tkey): - tarValue = symbolValuepairDict[tkey] - - if tarValue: - if i>j: - # corrArray stores Pearson Correlation values - # pvArray stores Pearson P-Values - pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue) - corrArray[i][j] =pcorr_result[0] - pvArray[i][j] =pcorr_result[2] - elif i 1: + + #XZ: reset the precision of values and convert to string type + for oneTraitVal in controlVals: + for oneStrainVal in oneTraitVal: + oneStrainVal = '%.3f' % oneStrainVal + + for i, oneTraitVal in enumerate( controlVals ): + for j in range(i+1, controlTraitNumber): + if oneTraitVal == controlVals[j]: + nameOfIdenticalTraits.append(controlNames[i]) + nameOfIdenticalTraits.append(controlNames[j]) + + return nameOfIdenticalTraits + +#XZ, 6/15/2010: If there is no identical control traits, the returned list is empty. +#else, the returned list has two elements of control trait name. +#primaryVal is of list type. It contains value of primary trait. +#primaryName is of string type. +#controlVals is of list type. Each element is list too. Each element contain value of one control trait. +#controlNames is of list type. +def findIdenticalTraits (primaryVal, primaryName, controlVals, controlNames ): + nameOfIdenticalTraits = [] + + #XZ: reset the precision of values and convert to string type + for oneStrainVal in primaryVal: + oneStrainVal = '%.3f' % oneStrainVal + + for oneTraitVal in controlVals: + for oneStrainVal in oneTraitVal: + oneStrainVal = '%.3f' % oneStrainVal + + controlTraitNumber = len(controlVals) + + if controlTraitNumber > 1: + for i, oneTraitVal in enumerate( controlVals ): + for j in range(i+1, controlTraitNumber): + if oneTraitVal == controlVals[j]: + nameOfIdenticalTraits.append(controlNames[i]) + nameOfIdenticalTraits.append(controlNames[j]) + break + + if len(nameOfIdenticalTraits) == 0: + for i, oneTraitVal in enumerate( controlVals ): + if primaryVal == oneTraitVal: + nameOfIdenticalTraits.append(primaryName) + nameOfIdenticalTraits.append(controlNames[i]) + break + + return nameOfIdenticalTraits + + + +#XZ, 03/29/2010: The strains in primaryVal, controlVals, targetVals must be of the same number and in same order. +#XZ: No value in primaryVal and controlVals could be None. + +def determinePartialsByR (primaryVal, controlVals, targetVals, targetNames, method='p'): + + def compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ): + + rpy2.robjects.r(""" +pcor.test <- function(x,y,z,use="mat",method="p",na.rm=T){ + # The partial correlation coefficient between x and y given z + # + # pcor.test is free and comes with ABSOLUTELY NO WARRANTY. + # + # x and y should be vectors + # + # z can be either a vector or a matrix + # + # use: There are two methods to calculate the partial correlation coefficient. + # One is by using variance-covariance matrix ("mat") and the other is by using recursive formula ("rec"). + # Default is "mat". + # + # method: There are three ways to calculate the correlation coefficient, + # which are Pearson's ("p"), Spearman's ("s"), and Kendall's ("k") methods. + # The last two methods which are Spearman's and Kendall's coefficient are based on the non-parametric analysis. + # Default is "p". + # + # na.rm: If na.rm is T, then all the missing samples are deleted from the whole dataset, which is (x,y,z). + # If not, the missing samples will be removed just when the correlation coefficient is calculated. + # However, the number of samples for the p-value is the number of samples after removing + # all the missing samples from the whole dataset. + # Default is "T". + + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(use == "mat"){ + p.use <- "Var-Cov matrix" + pcor = pcor.mat(x,y,z,method=method,na.rm=na.rm) + }else if(use == "rec"){ + p.use <- "Recursive formula" + pcor = pcor.rec(x,y,z,method=method,na.rm=na.rm) + }else{ + stop("use should be either rec or mat!\n") + } + + # print the method + if(gregexpr("p",method)[[1]][1] == 1){ + p.method <- "Pearson" + }else if(gregexpr("s",method)[[1]][1] == 1){ + p.method <- "Spearman" + }else if(gregexpr("k",method)[[1]][1] == 1){ + p.method <- "Kendall" + }else{ + stop("method should be pearson or spearman or kendall!\n") + } + + # sample number + n <- dim(na.omit(data.frame(x,y,z)))[1] + + # given variables' number + gn <- dim(z)[2] + + # p-value + if(p.method == "Kendall"){ + statistic <- pcor/sqrt(2*(2*(n-gn)+5)/(9*(n-gn)*(n-1-gn))) + p.value <- 2*pnorm(-abs(statistic)) + + }else{ + statistic <- pcor*sqrt((n-2-gn)/(1-pcor^2)) + p.value <- 2*pnorm(-abs(statistic)) + } + + data.frame(estimate=pcor,p.value=p.value,statistic=statistic,n=n,gn=gn,Method=p.method,Use=p.use) +} + +# By using var-cov matrix +pcor.mat <- function(x,y,z,method="p",na.rm=T){ + + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(dim(z)[2] == 0){ + stop("There should be given data\n") + } + + data <- data.frame(x,y,z) + + if(na.rm == T){ + data = na.omit(data) + } + + xdata <- na.omit(data.frame(data[,c(1,2)])) + Sxx <- cov(xdata,xdata,m=method) + + xzdata <- na.omit(data) + xdata <- data.frame(xzdata[,c(1,2)]) + zdata <- data.frame(xzdata[,-c(1,2)]) + Sxz <- cov(xdata,zdata,m=method) + + zdata <- na.omit(data.frame(data[,-c(1,2)])) + Szz <- cov(zdata,zdata,m=method) + + # is Szz positive definite? + zz.ev <- eigen(Szz)$values + if(min(zz.ev)[1]<0){ + stop("\'Szz\' is not positive definite!\n") + } + + # partial correlation + Sxx.z <- Sxx - Sxz %*% solve(Szz) %*% t(Sxz) + + rxx.z <- cov2cor(Sxx.z)[1,2] + + rxx.z +} + +# By using recursive formula +pcor.rec <- function(x,y,z,method="p",na.rm=T){ + # + + x <- c(x) + y <- c(y) + z <- as.data.frame(z) + + if(dim(z)[2] == 0){ + stop("There should be given data\n") + } + + data <- data.frame(x,y,z) + + if(na.rm == T){ + data = na.omit(data) + } + + # recursive formula + if(dim(z)[2] == 1){ + tdata <- na.omit(data.frame(data[,1],data[,2])) + rxy <- cor(tdata[,1],tdata[,2],m=method) + + tdata <- na.omit(data.frame(data[,1],data[,-c(1,2)])) + rxz <- cor(tdata[,1],tdata[,2],m=method) + + tdata <- na.omit(data.frame(data[,2],data[,-c(1,2)])) + ryz <- cor(tdata[,1],tdata[,2],m=method) + + rxy.z <- (rxy - rxz*ryz)/( sqrt(1-rxz^2)*sqrt(1-ryz^2) ) + + return(rxy.z) + }else{ + x <- c(data[,1]) + y <- c(data[,2]) + z0 <- c(data[,3]) + zc <- as.data.frame(data[,-c(1,2,3)]) + + rxy.zc <- pcor.rec(x,y,zc,method=method,na.rm=na.rm) + rxz0.zc <- pcor.rec(x,z0,zc,method=method,na.rm=na.rm) + ryz0.zc <- pcor.rec(y,z0,zc,method=method,na.rm=na.rm) + + rxy.z <- (rxy.zc - rxz0.zc*ryz0.zc)/( sqrt(1-rxz0.zc^2)*sqrt(1-ryz0.zc^2) ) + return(rxy.z) + } +} +""") + + R_pcorr_function = rpy2.robjects.r['pcor.test'] + R_corr_test = rpy2.robjects.r['cor.test'] + + primary = rpy2.robjects.FloatVector(range(len(primaryVal))) + for i in range(len(primaryVal)): + primary[i] = primaryVal[i] + + control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(controlVals[0])) ), ncol=len(controlVals)) + for i in range(len(controlVals)): + for j in range(len(controlVals[0])): + control[i*len(controlVals[0]) + j] = controlVals[i][j] + + allcorrelations = [] + + for targetIndex, oneTargetVals in enumerate(targetVals): + + this_primary = None + this_control = None + this_target = None + + if None in oneTargetVals: + + goodIndex = [] + for i in range(len(oneTargetVals)): + if oneTargetVals[i] != None: + goodIndex.append(i) + + this_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + this_primary[i] = primaryVal[goodIndex[i]] + + this_control = rpy2.robjects.r.matrix(rpy2.robjects.FloatVector( range(len(controlVals)*len(goodIndex)) ), ncol=len(controlVals)) + for i in range(len(controlVals)): + for j in range(len(goodIndex)): + this_control[i*len(goodIndex) + j] = controlVals[i][goodIndex[j]] + + this_target = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + this_target[i] = oneTargetVals[goodIndex[i]] + + else: + this_primary = primary + this_control = control + this_target = rpy2.robjects.FloatVector(range(len(oneTargetVals))) + for i in range(len(oneTargetVals)): + this_target[i] = oneTargetVals[i] + + one_name = targetNames[targetIndex] + one_N = len(this_primary) + + #calculate partial correlation + one_pc_coefficient = 'NA' + one_pc_p = 1 + + try: + if method == 's': + result = R_pcorr_function(this_primary, this_target, this_control, method='s') + else: + result = R_pcorr_function(this_primary, this_target, this_control) + + #XZ: In very few cases, the returned coefficient is nan. + #XZ: One way to detect nan is to compare the number to itself. NaN is always != NaN + if result[0][0] == result[0][0]: + one_pc_coefficient = result[0][0] + #XZ: when the coefficient value is 1 (primary trait and target trait are the same), + #XZ: occationally, the returned p value is nan instead of 0. + if result[1][0] == result[1][0]: + one_pc_p = result[1][0] + elif abs(one_pc_coefficient - 1) < 0.0000001: + one_pc_p = 0 + except: + pass + + #calculate zero order correlation + one_corr_coefficient = 0 + one_corr_p = 1 + + try: + if method == 's': + R_result = R_corr_test(this_primary, this_target, method='spearman') + else: + R_result = R_corr_test(this_primary, this_target) + + one_corr_coefficient = R_result[3][0] + one_corr_p = R_result[2][0] + except: + pass + + traitinfo = [ one_name, one_N, one_pc_coefficient, one_pc_p, one_corr_coefficient, one_corr_p ] + + allcorrelations.append(traitinfo) + + return allcorrelations + #End of function compute_partial + + + allcorrelations = [] + + target_trait_number = len(targetVals) + + if target_trait_number < 1000: + allcorrelations = compute_partial ( primaryVal, controlVals, targetVals, targetNames, method ) + else: + step = 1000 + job_number = math.ceil( float(target_trait_number)/step ) + + job_targetVals_lists = [] + job_targetNames_lists = [] + + for job_index in range( int(job_number) ): + starti = job_index*step + endi = min((job_index+1)*step, target_trait_number) + + one_job_targetVals_list = [] + one_job_targetNames_list = [] + + for i in range( starti, endi ): + one_job_targetVals_list.append( targetVals[i] ) + one_job_targetNames_list.append( targetNames[i] ) + + job_targetVals_lists.append( one_job_targetVals_list ) + job_targetNames_lists.append( one_job_targetNames_list ) + + ppservers = () + # Creates jobserver with automatically detected number of workers + job_server = pp.Server(ppservers=ppservers) + + jobs = [] + results = [] + + for i, one_job_targetVals_list in enumerate( job_targetVals_lists ): + one_job_targetNames_list = job_targetNames_lists[i] + #pay attention to modules from outside + jobs.append( job_server.submit(func=compute_partial, args=( primaryVal, controlVals, one_job_targetVals_list, one_job_targetNames_list, method), depfuncs=(), modules=("rpy2.robjects",)) ) + + for one_job in jobs: + one_result = one_job() + results.append( one_result ) + + for one_result in results: + for one_traitinfo in one_result: + allcorrelations.append( one_traitinfo ) + + return allcorrelations + + + +#XZ, April 30, 2010: The input primaryTrait and targetTrait are instance of webqtlTrait +#XZ: The primaryTrait and targetTrait should have executed retrieveData function +def calZeroOrderCorr (primaryTrait, targetTrait, method='pearson'): + + #primaryTrait.retrieveData() + + #there is no None value in primary_val + primary_strain, primary_val, primary_var = primaryTrait.exportInformative() + + #targetTrait.retrieveData() + + #there might be None value in target_val + target_val = targetTrait.exportData(primary_strain, type="val") + + R_primary = rpy2.robjects.FloatVector(range(len(primary_val))) + for i in range(len(primary_val)): + R_primary[i] = primary_val[i] + + N = len(target_val) + + if None in target_val: + goodIndex = [] + for i in range(len(target_val)): + if target_val[i] != None: + goodIndex.append(i) + + N = len(goodIndex) + + R_primary = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + R_primary[i] = primary_val[goodIndex[i]] + + R_target = rpy2.robjects.FloatVector(range(len(goodIndex))) + for i in range(len(goodIndex)): + R_target[i] = target_val[goodIndex[i]] + + else: + R_target = rpy2.robjects.FloatVector(range(len(target_val))) + for i in range(len(target_val)): + R_target[i] = target_val[i] + + R_corr_test = rpy2.robjects.r['cor.test'] + + if method == 'spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + corr_result = [] + corr_result.append( R_result[3][0] ) + corr_result.append( N ) + corr_result.append( R_result[2][0] ) + + return corr_result + +##################################################################################### +#Input: primaryValue(list): one list of expression values of one probeSet, +# targetValue(list): one list of expression values of one probeSet, +# method(string): indicate correlation method ('pearson' or 'spearman') +#Output: corr_result(list): first item is Correlation Value, second item is tissue number, +# third item is PValue +#Function: get correlation value,Tissue quantity ,p value result by using R; +#Note : This function is special case since both primaryValue and targetValue are from +#the same dataset. So the length of these two parameters is the same. They are pairs. +#Also, in the datatable TissueProbeSetData, all Tissue values are loaded based on +#the same tissue order +##################################################################################### + +def calZeroOrderCorrForTiss (primaryValue=[], targetValue=[], method='pearson'): + + R_primary = rpy2.robjects.FloatVector(range(len(primaryValue))) + N = len(primaryValue) + for i in range(len(primaryValue)): + R_primary[i] = primaryValue[i] + + R_target = rpy2.robjects.FloatVector(range(len(targetValue))) + for i in range(len(targetValue)): + R_target[i]=targetValue[i] + + R_corr_test = rpy2.robjects.r['cor.test'] + if method =='spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + corr_result =[] + corr_result.append( R_result[3][0]) + corr_result.append( N ) + corr_result.append( R_result[2][0]) + + return corr_result + + + + +def batchCalTissueCorr(primaryTraitValue=[], SymbolValueDict={}, method='pearson'): + + def cal_tissue_corr(primaryTraitValue, oneSymbolValueDict, method ): + + oneSymbolCorrDict = {} + oneSymbolPvalueDict = {} + + R_corr_test = rpy2.robjects.r['cor.test'] + + R_primary = rpy2.robjects.FloatVector(range(len(primaryTraitValue))) + + for i in range(len(primaryTraitValue)): + R_primary[i] = primaryTraitValue[i] + + for (oneTraitSymbol, oneTraitValue) in oneSymbolValueDict.iteritems(): + R_target = rpy2.robjects.FloatVector(range(len(oneTraitValue))) + for i in range(len(oneTraitValue)): + R_target[i] = oneTraitValue[i] + + if method =='spearman': + R_result = R_corr_test(R_primary, R_target, method='spearman') + else: + R_result = R_corr_test(R_primary, R_target) + + oneSymbolCorrDict[oneTraitSymbol] = R_result[3][0] + oneSymbolPvalueDict[oneTraitSymbol] = R_result[2][0] + + return(oneSymbolCorrDict, oneSymbolPvalueDict) + + + + symbolCorrDict = {} + symbolPvalueDict = {} + + items_number = len(SymbolValueDict) + + if items_number <= 1000: + symbolCorrDict, symbolPvalueDict = cal_tissue_corr(primaryTraitValue, SymbolValueDict, method) + else: + items_list = SymbolValueDict.items() + + step = 1000 + job_number = math.ceil( float(items_number)/step ) + + job_oneSymbolValueDict_list = [] + + for job_index in range( int(job_number) ): + starti = job_index*step + endi = min((job_index+1)*step, items_number) + + oneSymbolValueDict = {} + + for i in range( starti, endi ): + one_item = items_list[i] + one_symbol = one_item[0] + one_value = one_item[1] + oneSymbolValueDict[one_symbol] = one_value + + job_oneSymbolValueDict_list.append( oneSymbolValueDict ) + + + ppservers = () + # Creates jobserver with automatically detected number of workers + job_server = pp.Server(ppservers=ppservers) + + jobs = [] + results = [] + + for i, oneSymbolValueDict in enumerate( job_oneSymbolValueDict_list ): + + #pay attention to modules from outside + jobs.append( job_server.submit(func=cal_tissue_corr, args=(primaryTraitValue, oneSymbolValueDict, method), depfuncs=(), modules=("rpy2.robjects",)) ) + + for one_job in jobs: + one_result = one_job() + results.append( one_result ) + + for one_result in results: + oneSymbolCorrDict, oneSymbolPvalueDict = one_result + symbolCorrDict.update( oneSymbolCorrDict ) + symbolPvalueDict.update( oneSymbolPvalueDict ) + + return (symbolCorrDict, symbolPvalueDict) + +########################################################################### +#Input: cursor, GeneNameLst (list), TissueProbeSetFreezeId +#output: geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict (Dict) +#function: get multi dicts for short and long label functions, and for getSymbolValuePairDict and +# getGeneSymbolTissueValueDict to build dict to get CorrPvArray +#Note: If there are multiple probesets for one gene, select the one with highest mean. +########################################################################### +def getTissueProbeSetXRefInfo(GeneNameLst=[],TissueProbeSetFreezeId=0): + Symbols ="" + symbolList =[] + geneIdDict ={} + dataIdDict = {} + ChrDict = {} + MbDict = {} + descDict = {} + pTargetDescDict = {} + + count = len(GeneNameLst) + + # Added by NL 01/06/2011 + # Note that:inner join is necessary in this query to get distinct record in one symbol group with highest mean value + # Duo to the limit size of TissueProbeSetFreezeId table in DB, performance of inner join is acceptable. + if count==0: + query=''' + select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=%s and Symbol!='' and Symbol Is Not Null group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; + '''%TissueProbeSetFreezeId + + else: + for i, item in enumerate(GeneNameLst): + + if i == count-1: + Symbols += "'%s'" %item + else: + Symbols += "'%s'," %item + + Symbols = "("+ Symbols+")" + query=''' + select t.Symbol,t.GeneId, t.DataId,t.Chr, t.Mb,t.description,t.Probe_Target_Description + from ( + select Symbol, max(Mean) as maxmean + from TissueProbeSetXRef + where TissueProbeSetFreezeId=%s and Symbol in %s group by Symbol) + as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol and t.Mean = x.maxmean; + '''% (TissueProbeSetFreezeId,Symbols) + + try: + cursor.execute(query) + results =cursor.fetchall() + resultCount = len(results) + # Key in all dicts is the lower-cased symbol + for i, item in enumerate(results): + symbol = item[0] + symbolList.append(symbol) + + key =symbol.lower() + geneIdDict[key]=item[1] + dataIdDict[key]=item[2] + ChrDict[key]=item[3] + MbDict[key]=item[4] + descDict[key]=item[5] + pTargetDescDict[key]=item[6] + + except: + symbolList = None + geneIdDict=None + dataIdDict=None + ChrDict=None + MbDict=None + descDict=None + pTargetDescDict=None + + return symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict + +########################################################################### +#Input: cursor, symbolList (list), dataIdDict(Dict) +#output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair, +# key is symbol, value is one list of expression values of one probeSet; +#function: get one dictionary whose key is gene symbol and value is tissue expression data (list type). +#Attention! All keys are lower case! +########################################################################### +def get_symbol_value_pairs(tissue_data): + + id_list = [tissue_data[symbol.lower()].data_id for item in tissue_data] + + symbol_value_pairs = {} + value_list=[] + + query = """SELECT value, id + FROM TissueProbeSetData + WHERE Id IN {}""".format(create_in_clause(id_list)) + + try : + results = g.db.execute(query).fetchall() + for result in results: + value_list.append(result.value) + symbol_value_pairs[symbol] = value_list + except: + symbol_value_pairs[symbol] = None + + #for symbol in symbol_list: + # if tissue_data.has_key(symbol): + # data_id = tissue_data[symbol].data_id + # + # query = """select value, id + # from TissueProbeSetData + # where Id={}""".format(escape(data_id)) + # try : + # results = g.db.execute(query).fetchall() + # for item in results: + # item = item[0] + # value_list.append(item) + # symbol_value_pairs[symbol] = value_list + # value_list=[] + # except: + # symbol_value_pairs[symbol] = None + + return symbol_value_pairs + + +######################################################################################################## +#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol +#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. +# key is symbol, value is one list of expression values of one probeSet. +#function: wrapper function for getSymbolValuePairDict function +# build gene symbol list if necessary, cut it into small lists if necessary, +# then call getSymbolValuePairDict function and merge the results. +######################################################################################################## + +def get_trait_symbol_and_tissue_values(symbol_list=None): + SymbolValuePairDict={} + + tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list) + + #symbolList, + #geneIdDict, + #dataIdDict, + #ChrDict, + #MbDict, + #descDict, + #pTargetDescDict = getTissueProbeSetXRefInfo( + # GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) + + if len(tissue_data.gene_symbols): + return get_symbol_value_pairs(tissue_data) + + #limit_num=1000 + #count = len(symbol_list) + # + #symbol_value_pairs = {} + # + #if count !=0 and count <= limit_num: + # symbol_value_pairs = getSymbolValuePairDict(cursor=cursor,symbolList=symbol_list,dataIdDict=dataIdDict) + # + #elif count > limit_num: + # n = count/limit_num + # start = 0 + # stop = 0 + # + # for i in range(n): + # stop =limit_num*(i+1) + # gList1 = symbolList[start:stop] + # PairDict1 = getSymbolValuePairDict(cursor=cursor,symbolList=gList1,dataIdDict=dataIdDict) + # start =limit_num*(i+1) + # + # SymbolValuePairDict.update(PairDict1) + # + # if stop < count: + # stop = count + # gList2 = symbolList[start:stop] + # PairDict2 = getSymbolValuePairDict(cursor=cursor,symbolList=gList2,dataIdDict=dataIdDict) + # SymbolValuePairDict.update(PairDict2) + # + #return SymbolValuePairDict + +######################################################################################################## +#input: cursor, GeneNameLst (list), TissueProbeSetFreezeId(int) +#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair. +# key is symbol, value is one list of expression values of one probeSet. +#function: wrapper function of getGeneSymbolTissueValueDict function +# for CorrelationPage.py +######################################################################################################## + +#def get_trait_symbol_and_tissue_values(cursor=None,GeneNameLst=[],TissueProbeSetFreezeId=0): +# SymbolValuePairDict={} +# +# symbolList,geneIdDict,dataIdDict,ChrDict,MbDict,descDict,pTargetDescDict = getTissueProbeSetXRefInfo( +# cursor=cursor,GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId) +# +# if symbolList: +# SymbolValuePairDict = get_gene_symbol_and_tissue_values(symbolList=symbolList, +# dataIdDict=dataIdDict) +# +# return SymbolValuePairDict + +######################################################################################################## +#Input: cursor(cursor): MySQL connnection cursor; +# priGeneSymbolList(list): one list of gene symbol; +# symbolValuepairDict(dictionary): one dictionary of Symbol and Value Pair, +# key is symbol, value is one list of expression values of one probeSet; +#Output: corrArray(array): array of Correlation Value, +# pvArray(array): array of PValue; +#Function: build corrArray, pvArray for display by calling calculation function:calZeroOrderCorrForTiss +######################################################################################################## + +def getCorrPvArray(cursor=None,priGeneSymbolList=[],symbolValuepairDict={}): + # setting initial value for corrArray, pvArray equal to 0 + Num = len(priGeneSymbolList) + + corrArray = [([0] * (Num))[:] for i in range(Num)] + pvArray = [([0] * (Num))[:] for i in range(Num)] + i = 0 + for pkey in priGeneSymbolList: + j = 0 + pkey = pkey.strip().lower()# key in symbolValuepairDict is low case + if symbolValuepairDict.has_key(pkey): + priValue = symbolValuepairDict[pkey] + for tkey in priGeneSymbolList: + tkey = tkey.strip().lower()# key in symbolValuepairDict is low case + if priValue and symbolValuepairDict.has_key(tkey): + tarValue = symbolValuepairDict[tkey] + + if tarValue: + if i>j: + # corrArray stores Pearson Correlation values + # pvArray stores Pearson P-Values + pcorr_result =calZeroOrderCorrForTiss(primaryValue=priValue,targetValue=tarValue) + corrArray[i][j] =pcorr_result[0] + pvArray[i][j] =pcorr_result[2] + elif i