From 5daef1bd5e6e494b477797993bb72488b24dd8b3 Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Wed, 9 Oct 2013 17:50:48 -0500 Subject: Improved some of the code related to the correlation page For example, changed the two functions getting gene symbols and ids for a dataset into one function that can take a column name as a parameter --- wqflask/base/data_set.py | 63 +++++++++++++------ wqflask/wqflask/correlation/show_corr_results.py | 79 ++++++++++-------------- 2 files changed, 74 insertions(+), 68 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 16f9da5d..20c9a24f 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -1075,32 +1075,55 @@ class MrnaAssayDataSet(DataSet): """ % (escape(trait), escape(self.name)) results = g.db.execute(query).fetchall() return results - - def retrieve_gene_symbols(self): - query = """ - select ProbeSet.Name, ProbeSet.Symbol, ProbeSet.GeneId - from ProbeSet,ProbeSetXRef - where ProbeSetXRef.ProbeSetFreezeId = %s and - ProbeSetXRef.ProbeSetId=ProbeSet.Id; - """ % (self.id) - results = g.db.execute(query).fetchall() - symbol_dict = {} - for item in results: - symbol_dict[item[0]] = item[1] - return symbol_dict - def retrieve_gene_ids(self): + + def retrieve_genes(self, column_name): query = """ - select ProbeSet.Name, ProbeSet.GeneId + select ProbeSet.Name, ProbeSet.%s from ProbeSet,ProbeSetXRef where ProbeSetXRef.ProbeSetFreezeId = %s and ProbeSetXRef.ProbeSetId=ProbeSet.Id; - """ % (self.id) + """ % (column_name, escape(str(self.id))) results = g.db.execute(query).fetchall() - symbol_dict = {} - for item in results: - symbol_dict[item[0]] = item[1] - return symbol_dict + print("in retrieve_genes results {}: {}".format(type(results), results)) + + return dict(results) + + #return {item[0]: item[1] for item in results} + + #symbol_dict = {} + #for item in results: + # symbol_dict[item[0]] = item[1] + #return symbol_dict + + #def retrieve_gene_symbols(self): + # query = """ + # select ProbeSet.Name, ProbeSet.Symbol, ProbeSet.GeneId + # from ProbeSet,ProbeSetXRef + # where ProbeSetXRef.ProbeSetFreezeId = %s and + # ProbeSetXRef.ProbeSetId=ProbeSet.Id; + # """ % (self.id) + # results = g.db.execute(query).fetchall() + # symbol_dict = {} + # for item in results: + # symbol_dict[item[0]] = item[1] + # return symbol_dict + # + #def retrieve_gene_ids(self): + # query = """ + # select ProbeSet.Name, ProbeSet.GeneId + # from ProbeSet,ProbeSetXRef + # where ProbeSetXRef.ProbeSetFreezeId = %s and + # ProbeSetXRef.ProbeSetId=ProbeSet.Id; + # """ % (self.id) + # return process_and_run_query(query) + # results = g.db.execute(query).fetchall() + # symbol_dict = {} + # for item in results: + # symbol_dict[item[0]] = item[1] + # return symbol_dict + + class TempDataSet(DataSet): diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 5df2f316..258dcfa4 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -93,9 +93,6 @@ class CorrelationResults(object): # get trait list from db (database name) # calculate correlation with Base vector and targets - #self.this_trait = GeneralTrait(dataset=self.dataset.name, - # name=start_vars['trait_id'], - # cellid=None) with Bench("Doing correlations"): helper_functions.get_species_dataset_trait(self, start_vars) self.dataset.group.read_genotype_file() @@ -114,9 +111,9 @@ class CorrelationResults(object): self.dataset.group.f1list + self.dataset.group.samplelist) - #If either BXD/whatever Only or All Samples, append all of that group's samplelist + #If either BXD/whatever Only or All Samples, append all of that group's samplelist if corr_samples_group != 'samples_other': - self.process_samples(start_vars, primary_samples, ()) + self.process_samples(start_vars, primary_samples) #If either Non-BXD/whatever or All Samples, get all samples from this_trait.data and #exclude the primary samples (because they would have been added in the previous @@ -132,55 +129,36 @@ class CorrelationResults(object): self.correlation_data = {} if self.corr_type == "tissue": - trait_symbol_dict = self.dataset.retrieve_gene_symbols() - tissue_corr_data = self.do_tissue_correlation_for_all_traits(trait_gene_symbols = trait_symbol_dict) - #print("tissue_corr_data: ", pf(tissue_corr_data)) + self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol") + tissue_corr_data = self.do_tissue_correlation_for_all_traits() for trait in tissue_corr_data.keys()[:self.return_number]: - self.get_sample_r_and_p_values(trait = trait, target_samples = self.target_dataset.trait_data[trait]) - #this_trait_vals = [] - #target_vals = [] - #for index, sample in enumerate(self.target_dataset.samplelist): - # if sample in self.sample_data: - # sample_value = self.sample_data[sample] - # target_sample_value = self.target_dataset.trait_data[trait][index] - # this_trait_vals.append(sample_value) - # target_vals.append(target_sample_value) - # - #this_trait_vals, target_vals, num_overlap = corr_result_helpers.normalize_values( - # this_trait_vals, target_vals) - # - #if self.corr_method == 'pearson': - # sample_r, sample_p = scipy.stats.pearsonr(this_trait_vals, target_vals) - #else: - # sample_r, sample_p = scipy.stats.spearmanr(this_trait_vals, target_vals) - # - #self.correlation_data[trait] = [sample_r, sample_p, num_overlap] - + self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait]) + elif self.corr_type == "lit": - trait_geneid_dict = self.dataset.retrieve_gene_ids() - lit_corr_data = self.do_lit_correlation_for_all_traits(trait_gene_ids = trait_geneid_dict) + self.trait_geneid_dict = self.dataset.retrieve_genes("GeneId") + lit_corr_data = self.do_lit_correlation_for_all_traits() for trait in lit_corr_data.keys()[:self.return_number]: - self.get_sample_r_and_p_values(trait = trait, target_samples = self.target_dataset.trait_data[trait]) + self.get_sample_r_and_p_values(trait, self.target_dataset.trait_data[trait]) elif self.corr_type == "sample": for trait, values in self.target_dataset.trait_data.iteritems(): - self.get_sample_r_and_p_values(trait = trait, target_samples = values) + self.get_sample_r_and_p_values(trait, values) self.correlation_data = collections.OrderedDict(sorted(self.correlation_data.items(), key=lambda t: -abs(t[1][0]))) - #print("correlation_data: ", pf(self.correlation_data)) for _trait_counter, trait in enumerate(self.correlation_data.keys()[:self.return_number]): trait_object = GeneralTrait(dataset=self.dataset, name=trait, get_qtl_info=True) - - #print("gene symbol: ", trait_object.symbol) - trait_object.sample_r = self.correlation_data[trait][0] - trait_object.sample_p = self.correlation_data[trait][1] - trait_object.num_overlap = self.correlation_data[trait][2] + (trait_object.sample_r, + trait_object.sample_p, + trait_object.num_overlap) = self.correlation_data[trait] + + #trait_object.sample_p = self.correlation_data[trait][1] + #trait_object.num_overlap = self.correlation_data[trait][2] #Get symbol for trait and call function that gets each tissue value from the database (tables TissueProbeSetXRef, #TissueProbeSetData, etc) and calculates the correlation (cal_zero_order_corr_for_tissue in correlation_functions) @@ -194,7 +172,6 @@ class CorrelationResults(object): trait_object.tissue_pvalue = tissue_corr_data[trait][2] elif self.corr_type == "lit": trait_object.lit_corr = lit_corr_data[trait][1] - self.correlation_results.append(trait_object) if self.corr_type != "lit": @@ -305,7 +282,7 @@ class CorrelationResults(object): #return self.correlation_results - def do_tissue_correlation_for_all_traits(self, trait_gene_symbols, tissue_dataset_id=1): + def do_tissue_correlation_for_all_traits(self, tissue_dataset_id=1): #Gets tissue expression values for the primary trait primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values( symbol_list = [self.this_trait.symbol]) @@ -315,14 +292,14 @@ class CorrelationResults(object): #print("trait_gene_symbols: ", pf(trait_gene_symbols.values())) corr_result_tissue_vals_dict= correlation_functions.get_trait_symbol_and_tissue_values( - symbol_list=trait_gene_symbols.values()) + symbol_list=self.trait_symbol_dict.values()) #print("corr_result_tissue_vals: ", pf(corr_result_tissue_vals_dict)) #print("trait_gene_symbols: ", pf(trait_gene_symbols)) tissue_corr_data = {} - for trait, symbol in trait_gene_symbols.iteritems(): + for trait, symbol in self.trait_symbol_dict.iteritems(): if symbol and symbol.lower() in corr_result_tissue_vals_dict: this_trait_tissue_values = corr_result_tissue_vals_dict[symbol.lower()] #print("this_trait_tissue_values: ", pf(this_trait_tissue_values)) @@ -375,15 +352,15 @@ class CorrelationResults(object): trait.lit_corr = 0 - def do_lit_correlation_for_all_traits(self, trait_gene_ids): + def do_lit_correlation_for_all_traits(self): input_trait_mouse_gene_id = self.convert_to_mouse_gene_id(self.dataset.group.species.lower(), self.this_trait.geneid) lit_corr_data = {} - for trait, gene_id in trait_gene_ids.iteritems(): + for trait, gene_id in self.trait_geneid_dict.iteritems(): mouse_gene_id = self.convert_to_mouse_gene_id(self.dataset.group.species.lower(), gene_id) if mouse_gene_id and str(mouse_gene_id).find(";") == -1: - print("gene_symbols:", input_trait_mouse_gene_id + " / " + mouse_gene_id) + #print("gene_symbols:", input_trait_mouse_gene_id + " / " + mouse_gene_id) result = g.db.execute( """SELECT value FROM LCorrRamin3 @@ -399,7 +376,7 @@ class CorrelationResults(object): """ % (escape(mouse_gene_id), escape(input_trait_mouse_gene_id)) ).fetchone() if result: - print("result:", result) + #print("result:", result) lit_corr = result.value lit_corr_data[trait] = [gene_id, lit_corr] else: @@ -458,7 +435,9 @@ class CorrelationResults(object): for index, sample in enumerate(self.target_dataset.samplelist): if sample in self.sample_data: sample_value = self.sample_data[sample] + print("sample_value:", sample_value) target_sample_value = target_samples[index] + print("target_sample_value:", target_sample_value) this_trait_vals.append(sample_value) target_vals.append(target_sample_value) @@ -470,7 +449,8 @@ class CorrelationResults(object): else: sample_r, sample_p = scipy.stats.spearmanr(this_trait_vals, target_vals) - self.correlation_data[trait] = [sample_r, sample_p, num_overlap] + self.correlation_data[trait] = [sample_r, sample_p, num_overlap] + def do_tissue_corr_for_all_traits_2(self): @@ -632,7 +612,10 @@ class CorrelationResults(object): ProbeSet.Id = ProbeSetXRef.ProbeSetId order by ProbeSet.Id """ - def process_samples(self, start_vars, sample_names, excluded_samples): + def process_samples(self, start_vars, sample_names, excluded_samples=None): + if not excluded_samples: + excluded_samples = () + for sample in sample_names: if sample not in excluded_samples: value = start_vars['value:' + sample] -- cgit v1.2.3