Diffstat (limited to 'gn3/correlation/show_corr_results.py')
-rw-r--r-- | gn3/correlation/show_corr_results.py | 735 |
1 file changed, 735 insertions, 0 deletions
diff --git a/gn3/correlation/show_corr_results.py b/gn3/correlation/show_corr_results.py
new file mode 100644
index 0000000..55d8366
--- /dev/null
+++ b/gn3/correlation/show_corr_results.py
@@ -0,0 +1,735 @@
+"""module contains code for doing correlation"""
+
+import json
+import collections
+import numpy
+import scipy.stats
+import rpy2.robjects as ro
+from flask import g
+from gn3.base.data_set import create_dataset
+from gn3.utility.db_tools import escape
+from gn3.utility.helper_functions import get_species_dataset_trait
+from gn3.utility.corr_result_helpers import normalize_values
+from gn3.base.trait import create_trait
+from gn3.utility import hmac
+from . import correlation_functions
+
+
+class CorrelationResults:
+    """class for computing correlation"""
+    # pylint: disable=too-many-instance-attributes
+    # pylint:disable=attribute-defined-outside-init
+
+    def __init__(self, start_vars):
+        self.assertion_for_start_vars(start_vars)
+
+    @staticmethod
+    def assertion_for_start_vars(start_vars):
+        # pylint: disable = E, W, R, C
+
+        # There should be a better way to validate these inputs; for now,
+        # assert the keys the correlation code relies on.
+        # example value: "sample"
+        assert "corr_type" in start_vars
+        assert isinstance(start_vars['corr_type'], str)
+        # example value: "pearson"
+        assert 'corr_sample_method' in start_vars
+        assert 'corr_dataset' in start_vars
+        # the limit on the number of results returned
+        assert 'corr_return_results' in start_vars
+
+        if "loc_chr" in start_vars:
+            assert 'min_loc_mb' in start_vars
+            assert 'max_loc_mb' in start_vars
+
+    def get_formatted_corr_type(self):
+        """method to format corr_type for display"""
+        self.formatted_corr_type = ""
+        if self.corr_type == "lit":
+            self.formatted_corr_type += "Literature Correlation "
+        elif self.corr_type == "tissue":
+            self.formatted_corr_type += "Tissue Correlation "
+        elif self.corr_type == "sample":
+            self.formatted_corr_type += "Genetic Correlation "
+
+        if self.corr_method == "pearson":
+            self.formatted_corr_type += "(Pearson's r)"
+        elif self.corr_method == "spearman":
+            self.formatted_corr_type += "(Spearman's rho)"
+        elif self.corr_method == "bicor":
+            self.formatted_corr_type += "(Biweight r)"
+
+    def process_samples(self, start_vars, sample_names, excluded_samples=None):
+        """method to process samples"""
+        if not excluded_samples:
+            excluded_samples = ()
+
+        sample_val_dict = json.loads(start_vars["sample_vals"])
+        if sample_names is None:
+            raise NotImplementedError
+
+        for sample in sample_names:
+            if sample not in excluded_samples:
+                value = sample_val_dict[sample]
+
+                if not value.strip().lower() == "x":
+                    self.sample_data[str(sample)] = float(value)
+
+    def do_tissue_correlation_for_trait_list(self, tissue_dataset_id=1):
+        """Given a list of correlation results (self.correlation_results),
+        gets the tissue correlation value for each"""
+        # pylint: disable = E, W, R, C
+
+        # Gets tissue expression values for the primary trait
+        primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
+            symbol_list=[self.this_trait.symbol])
+
+        if self.this_trait.symbol.lower() in primary_trait_tissue_vals_dict:
+            primary_trait_tissue_values = primary_trait_tissue_vals_dict[
+                self.this_trait.symbol.lower()]
+            gene_symbol_list = [
+                trait.symbol for trait in self.correlation_results if trait.symbol]
+
+            corr_result_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
+                symbol_list=gene_symbol_list)
+
+            for trait in self.correlation_results:
+                if trait.symbol and trait.symbol.lower() in corr_result_tissue_vals_dict:
+                    this_trait_tissue_values = corr_result_tissue_vals_dict[
+                        trait.symbol.lower()]
+
+                    result = correlation_functions.cal_zero_order_corr_for_tiss(
+                        primary_trait_tissue_values,
+                        this_trait_tissue_values,
+                        self.corr_method)
+
+                    trait.tissue_corr = result[0]
+                    trait.tissue_pvalue = result[2]
+
+    def do_lit_correlation_for_trait_list(self):
+        """Gets the literature correlation value for each trait in self.correlation_results"""
+        # pylint: disable = E, W, R, C
+
+        input_trait_mouse_gene_id = self.convert_to_mouse_gene_id(
+            self.dataset.group.species.lower(), self.this_trait.geneid)
+
+        for trait in self.correlation_results:
+
+            if trait.geneid:
+                trait.mouse_gene_id = self.convert_to_mouse_gene_id(
+                    self.dataset.group.species.lower(), trait.geneid)
+            else:
+                trait.mouse_gene_id = None
+
+            if trait.mouse_gene_id and str(trait.mouse_gene_id).find(";") == -1:
+                result = g.db.execute(
+                    """SELECT value
+                       FROM LCorrRamin3
+                       WHERE GeneId1='%s' and
+                             GeneId2='%s'
+                    """ % (escape(str(trait.mouse_gene_id)), escape(str(input_trait_mouse_gene_id)))
+                ).fetchone()
+                if not result:
+                    # retry the lookup with the gene ids swapped
+                    result = g.db.execute(
+                        """SELECT value
+                           FROM LCorrRamin3
+                           WHERE GeneId2='%s' and
+                                 GeneId1='%s'
+                        """ % (escape(str(trait.mouse_gene_id)), escape(str(input_trait_mouse_gene_id)))
+                    ).fetchone()
+
+                if result:
+                    lit_corr = result.value
+                    trait.lit_corr = lit_corr
+                else:
+                    trait.lit_corr = 0
+            else:
+                trait.lit_corr = 0
+
+    def do_lit_correlation_for_all_traits(self):
+        """method for lit_correlation for all traits"""
+        # pylint: disable = E, W, R, C
+        input_trait_mouse_gene_id = self.convert_to_mouse_gene_id(
+            self.dataset.group.species.lower(), self.this_trait.geneid)
+
+        lit_corr_data = {}
+        for trait, gene_id in list(self.trait_geneid_dict.items()):
+            mouse_gene_id = self.convert_to_mouse_gene_id(
+                self.dataset.group.species.lower(), gene_id)
+
+            if mouse_gene_id and str(mouse_gene_id).find(";") == -1:
+                result = g.db.execute(
+                    """SELECT value
+                       FROM LCorrRamin3
+                       WHERE GeneId1='%s' and
+                             GeneId2='%s'
+                    """ % (escape(mouse_gene_id), escape(input_trait_mouse_gene_id))
+                ).fetchone()
+                if not result:
+                    # retry the lookup with the gene ids swapped
+                    result = g.db.execute(
+                        """SELECT value
+                           FROM LCorrRamin3
+                           WHERE GeneId2='%s' and
+                                 GeneId1='%s'
+                        """ % (escape(mouse_gene_id), escape(input_trait_mouse_gene_id))
+                    ).fetchone()
+                if result:
+                    lit_corr = result.value
+                    lit_corr_data[trait] = [gene_id, lit_corr]
+                else:
+                    lit_corr_data[trait] = [gene_id, 0]
+            else:
+                lit_corr_data[trait] = [gene_id, 0]
+
+        lit_corr_data = collections.OrderedDict(sorted(list(lit_corr_data.items()),
+                                                       key=lambda t: -abs(t[1][1])))
+
+        return lit_corr_data
+
+    def do_tissue_correlation_for_all_traits(self, tissue_dataset_id=1):
+        """method for tissue correlation for all traits in the target dataset"""
+        # pylint: disable = E, W, R, C
+        # Gets tissue expression values for the primary trait
+        primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
+            symbol_list=[self.this_trait.symbol])
+
+        if self.this_trait.symbol.lower() in primary_trait_tissue_vals_dict:
+            primary_trait_tissue_values = primary_trait_tissue_vals_dict[
+                self.this_trait.symbol.lower()]
+
+            corr_result_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
+                symbol_list=list(self.trait_symbol_dict.values()))
+
+            tissue_corr_data = {}
+            for trait, symbol in list(self.trait_symbol_dict.items()):
+                if symbol and symbol.lower() in corr_result_tissue_vals_dict:
+                    this_trait_tissue_values = corr_result_tissue_vals_dict[
+                        symbol.lower()]
+
+                    result = correlation_functions.cal_zero_order_corr_for_tiss(
+                        primary_trait_tissue_values,
+                        this_trait_tissue_values,
+                        self.corr_method)
+
+                    tissue_corr_data[trait] = [symbol, result[0], result[2]]
+
+            tissue_corr_data = collections.OrderedDict(sorted(list(tissue_corr_data.items()),
+                                                              key=lambda t: -abs(t[1][1])))
+
+            # return the results sorted by the absolute tissue correlation
+            return tissue_corr_data
+
+    def get_sample_r_and_p_values(self, trait, target_samples):
+        """Calculates the sample r (or rho) and p-value
+
+        Given a primary trait and a target trait's sample values,
+        calculates either the pearson r or spearman rho and the p-value
+        using the corresponding scipy functions.
+
+        """
+        # pylint: disable = E, W, R, C
+        self.this_trait_vals = []
+        target_vals = []
+
+        for index, sample in enumerate(self.target_dataset.samplelist):
+            if sample in self.sample_data:
+                sample_value = self.sample_data[sample]
+                target_sample_value = target_samples[index]
+                self.this_trait_vals.append(sample_value)
+                target_vals.append(target_sample_value)
+
+        self.this_trait_vals, target_vals, num_overlap = normalize_values(
+            self.this_trait_vals, target_vals)
+
+        if num_overlap > 5:
+            # ZS: 2015 could add biweight correlation, see
+            # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3465711/
+            if self.corr_method == 'bicor':
+                sample_r, sample_p = do_bicor(
+                    self.this_trait_vals, target_vals)
+
+            elif self.corr_method == 'pearson':
+                sample_r, sample_p = scipy.stats.pearsonr(
+                    self.this_trait_vals, target_vals)
+
+            else:
+                sample_r, sample_p = scipy.stats.spearmanr(
+                    self.this_trait_vals, target_vals)
+
+            # traits whose correlation is undefined (NaN) are skipped
+            if not numpy.isnan(sample_r):
+                self.correlation_data[trait] = [
+                    sample_r, sample_p, num_overlap]
+
+    def convert_to_mouse_gene_id(self, species=None, gene_id=None):
+        """If the species is rat or human, translate the gene_id to the mouse geneid
+
+        If there is no input gene_id or there's no corresponding mouse gene_id, return None
+
+        """
+        if not gene_id:
+            return None
+
+        mouse_gene_id = None
+        if species == "mouse":
+            mouse_gene_id = gene_id
+
+        elif species == 'rat':
+            query = """SELECT mouse
+                       FROM GeneIDXRef
+                       WHERE rat='%s'""" % escape(gene_id)
+
+            result = g.db.execute(query).fetchone()
+            if result is not None:
+                mouse_gene_id = result.mouse
+
+        elif species == "human":
+            query = """SELECT mouse
+                       FROM GeneIDXRef
+                       WHERE human='%s'""" % escape(gene_id)
+
+            result = g.db.execute(query).fetchone()
+            if result is not None:
+                mouse_gene_id = result.mouse
+
+        return mouse_gene_id
+
+    def do_correlation(self, start_vars, create_dataset=create_dataset,
+                       create_trait=create_trait,
+                       get_species_dataset_trait=get_species_dataset_trait):
+        """Runs the correlation and returns the results as JSON.
+
+        This method replaces the original do_correlation but takes the
+        dataset/trait constructors as keyword arguments (dependency
+        injection) to enable testing.  Passing start_vars both here and
+        to __init__ should probably be refactored.
+        """
+        # pylint: disable = E, W, R, C
+
+        if start_vars["dataset"] == "Temp":
+            self.dataset = create_dataset(
+                dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group'])
+
+            self.trait_id = start_vars["trait_id"]
+
+            self.this_trait = create_trait(dataset=self.dataset,
+                                           name=self.trait_id,
+                                           cellid=None)
+
+        else:
+            get_species_dataset_trait(self, start_vars)
+
+        corr_samples_group = start_vars['corr_samples_group']
+        self.sample_data = {}
+        self.corr_type = start_vars['corr_type']
+        self.corr_method = start_vars['corr_sample_method']
+        self.min_expr = float(
+            start_vars["min_expr"]) if start_vars["min_expr"] != "" else None
+        self.p_range_lower = float(
+            start_vars["p_range_lower"]) if start_vars["p_range_lower"] != "" else -1.0
+        self.p_range_upper = float(
+            start_vars["p_range_upper"]) if start_vars["p_range_upper"] != "" else 1.0
+
+        if ("loc_chr" in start_vars and "min_loc_mb" in start_vars and "max_loc_mb" in start_vars):
+            self.location_type = str(start_vars['location_type'])
+            self.location_chr = str(start_vars['loc_chr'])
+
+            try:
+                # temporary fix: fall back to no location range if the
+                # values cannot be parsed as integers
+                self.min_location_mb = int(start_vars['min_loc_mb'])
+                self.max_location_mb = int(start_vars['max_loc_mb'])
+            except Exception:
+                self.min_location_mb = None
+                self.max_location_mb = None
+
+        else:
+            self.location_type = self.location_chr = self.min_location_mb = self.max_location_mb = None
+
+        self.get_formatted_corr_type()
+
+        self.return_number = int(start_vars['corr_return_results'])
+
+        primary_samples = self.dataset.group.samplelist
+
+        # The two if statements below append samples to the sample list based upon whether the user
+        # selected Primary Samples Only, Other Samples Only, or All Samples
+        if self.dataset.group.parlist is not None:
+            primary_samples += self.dataset.group.parlist
+
+        if self.dataset.group.f1list is not None:
+            primary_samples += self.dataset.group.f1list
+
+        # If either BXD/whatever Only or All Samples, append all of that group's samplelist
+        if corr_samples_group != 'samples_other':
+            self.process_samples(start_vars, primary_samples)
+
+        if corr_samples_group != 'samples_primary':
+            if corr_samples_group == 'samples_other':
+                primary_samples = [x for x in primary_samples if x not in (
+                    self.dataset.group.parlist + self.dataset.group.f1list)]
+
+            self.process_samples(start_vars, list(
+                self.this_trait.data.keys()), primary_samples)
+
+        self.target_dataset = create_dataset(start_vars['corr_dataset'])
+        # Profiling note: retrieving the trait data for the target dataset
+        # dominates the runtime (roughly 20.8 s in one profiled run)
+        self.target_dataset.get_trait_data(list(self.sample_data.keys()))
+
+        self.header_fields = get_header_fields(
+            self.target_dataset.type, self.corr_method)
+
+        if self.target_dataset.type == "ProbeSet":
+            self.filter_cols = [7, 6]
+        elif self.target_dataset.type == "Publish":
+            self.filter_cols = [6, 0]
+        else:
+            self.filter_cols = [4, 0]
+
+        self.correlation_results = []
+
+        self.correlation_data = {}
+
+        if self.corr_type == "tissue":
+            self.trait_symbol_dict = self.dataset.retrieve_genes("Symbol")
+
+            tissue_corr_data = self.do_tissue_correlation_for_all_traits()
+            if tissue_corr_data is not None:
+                for trait in list(tissue_corr_data.keys())[:self.return_number]:
+                    self.get_sample_r_and_p_values(
+                        trait, self.target_dataset.trait_data[trait])
+            else:
+                for trait, values in list(self.target_dataset.trait_data.items()):
+                    self.get_sample_r_and_p_values(trait, values)
+
+        elif self.corr_type == "lit":
+            self.trait_geneid_dict = self.dataset.retrieve_genes("GeneId")
+            lit_corr_data = self.do_lit_correlation_for_all_traits()
+
+            for trait in list(lit_corr_data.keys())[:self.return_number]:
+                self.get_sample_r_and_p_values(
+                    trait, self.target_dataset.trait_data[trait])
+
+        elif self.corr_type == "sample":
+            for trait, values in list(self.target_dataset.trait_data.items()):
+                self.get_sample_r_and_p_values(trait, values)
+
+        self.correlation_data = collections.OrderedDict(
+            sorted(list(self.correlation_data.items()),
+                   key=lambda t: -abs(t[1][0])))
+
+        # ZS: Convert min/max chromosome to an int for the location range option
+        range_chr_as_int = None
+        for order_id, chr_info in list(self.dataset.species.chromosomes.chromosomes.items()):
+            if 'loc_chr' in start_vars:
+                if chr_info.name == self.location_chr:
+                    range_chr_as_int = order_id
+
+        for _trait_counter, trait in enumerate(list(self.correlation_data.keys())[:self.return_number]):
+            trait_object = create_trait(
+                dataset=self.target_dataset, name=trait, get_qtl_info=True, get_sample_info=False)
+            if not trait_object:
+                continue
+
+            chr_as_int = 0
+            for order_id, chr_info in list(self.dataset.species.chromosomes.chromosomes.items()):
+                if self.location_type == "highest_lod":
+                    if chr_info.name == trait_object.locus_chr:
+                        chr_as_int = order_id
+                else:
+                    if chr_info.name == trait_object.chr:
+                        chr_as_int = order_id
+
+            if (float(self.correlation_data[trait][0]) >= self.p_range_lower and
+                    float(self.correlation_data[trait][0]) <= self.p_range_upper):
+
+                if (self.target_dataset.type == "ProbeSet" or self.target_dataset.type == "Publish") and bool(trait_object.mean):
+                    if (self.min_expr is not None) and (float(trait_object.mean) < self.min_expr):
+                        continue
+
+                if range_chr_as_int is not None and (chr_as_int != range_chr_as_int):
+                    continue
+                if self.location_type == "highest_lod":
+                    if (self.min_location_mb is not None) and (float(trait_object.locus_mb) < float(self.min_location_mb)):
+                        continue
+                    if (self.max_location_mb is not None) and (float(trait_object.locus_mb) > float(self.max_location_mb)):
+                        continue
+                else:
+                    if (self.min_location_mb is not None) and (float(trait_object.mb) < float(self.min_location_mb)):
+                        continue
+                    if (self.max_location_mb is not None) and (float(trait_object.mb) > float(self.max_location_mb)):
+                        continue
+
+                (trait_object.sample_r,
+                 trait_object.sample_p,
+                 trait_object.num_overlap) = self.correlation_data[trait]
+
+                # Set some sane defaults
+                trait_object.tissue_corr = 0
+                trait_object.tissue_pvalue = 0
+                trait_object.lit_corr = 0
+                if self.corr_type == "tissue" and tissue_corr_data is not None:
+                    trait_object.tissue_corr = tissue_corr_data[trait][1]
+                    trait_object.tissue_pvalue = tissue_corr_data[trait][2]
+                elif self.corr_type == "lit":
+                    trait_object.lit_corr = lit_corr_data[trait][1]
+
+                self.correlation_results.append(trait_object)
+
+        # The filtering loop above scales with the number of traits requested (return_number)
+
+        if self.corr_type != "lit" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
+            self.do_lit_correlation_for_trait_list()
+
+        if self.corr_type != "tissue" and self.dataset.type == "ProbeSet" and self.target_dataset.type == "ProbeSet":
+            self.do_tissue_correlation_for_trait_list()
+
+        self.json_results = generate_corr_json(
+            self.correlation_results, self.this_trait, self.dataset, self.target_dataset)
+
+        # returning self instead would fit GeneNetwork2 without further
+        # changes; for now the JSON results are returned directly
+        return self.json_results
+
+
+def do_bicor(this_trait_vals, target_trait_vals):
+    """Computes the biweight midcorrelation and its p-value via R's WGCNA package"""
+    # pylint: disable = E, W, R, C
+    r_library = ro.r["library"]  # Map the library function
+    r_options = ro.r["options"]  # Map the options function
+
+    r_library("WGCNA")
+    r_bicor = ro.r["bicorAndPvalue"]  # Map the bicorAndPvalue function
+
+    r_options(stringsAsFactors=False)
+
+    this_vals = ro.Vector(this_trait_vals)
+    target_vals = ro.Vector(target_trait_vals)
+
+    the_r, the_p, _fisher_transform, _the_t, _n_obs = [
+        numpy.asarray(x) for x in r_bicor(x=this_vals, y=target_vals)]
+
+    return the_r, the_p
+
+
+def get_header_fields(data_type, corr_method):
+    """function to get header fields when doing correlation"""
+    if data_type == "ProbeSet":
+        if corr_method == "spearman":
+            header_fields = ['Index',
+                             'Record',
+                             'Symbol',
+                             'Description',
+                             'Location',
+                             'Mean',
+                             'Sample rho',
+                             'N',
+                             'Sample p(rho)',
+                             'Lit rho',
+                             'Tissue rho',
+                             'Tissue p(rho)',
+                             'Max LRS',
+                             'Max LRS Location',
+                             'Additive Effect']
+        else:
+            header_fields = ['Index',
+                             'Record',
+                             'Symbol',
+                             'Description',
+                             'Location',
+                             'Mean',
+                             'Sample r',
+                             'N',
+                             'Sample p(r)',
+                             'Lit r',
+                             'Tissue r',
+                             'Tissue p(r)',
+                             'Max LRS',
+                             'Max LRS Location',
+                             'Additive Effect']
+
+    elif data_type == "Publish":
+        if corr_method == "spearman":
+            header_fields = ['Index',
+                             'Record',
+                             'Abbreviation',
+                             'Description',
+                             'Mean',
+                             'Authors',
+                             'Year',
+                             'Sample rho',
+                             'N',
+                             'Sample p(rho)',
+                             'Max LRS',
+                             'Max LRS Location',
+                             'Additive Effect']
+        else:
+            header_fields = ['Index',
+                             'Record',
+                             'Abbreviation',
+                             'Description',
+                             'Mean',
+                             'Authors',
+                             'Year',
+                             'Sample r',
+                             'N',
+                             'Sample p(r)',
+                             'Max LRS',
+                             'Max LRS Location',
+                             'Additive Effect']
+
+    else:
+        if corr_method == "spearman":
+            header_fields = ['Index',
+                             'ID',
+                             'Location',
+                             'Sample rho',
+                             'N',
+                             'Sample p(rho)']
+        else:
+            header_fields = ['Index',
+                             'ID',
+                             'Location',
+                             'Sample r',
+                             'N',
+                             'Sample p(r)']
+
+    return header_fields
+
+
+def generate_corr_json(corr_results, this_trait, dataset, target_dataset, for_api=False):
+    """function to generate corr json data"""
+    # TODO: refactor this function
+    results_list = []
+    for i, trait in enumerate(corr_results):
+        if not trait.view:
+            continue
+        results_dict = {}
+        results_dict['index'] = i + 1
+        results_dict['trait_id'] = trait.name
+        results_dict['dataset'] = trait.dataset.name
+        results_dict['hmac'] = hmac.data_hmac(
+            '{}:{}'.format(trait.name, trait.dataset.name))
+        if target_dataset.type == "ProbeSet":
+            results_dict['symbol'] = trait.symbol
+            results_dict['description'] = "N/A"
+            results_dict['location'] = trait.location_repr
+            results_dict['mean'] = "N/A"
+            results_dict['additive'] = "N/A"
+            if bool(trait.description_display):
+                results_dict['description'] = trait.description_display
+            if bool(trait.mean):
+                results_dict['mean'] = f"{float(trait.mean):.3f}"
+            try:
+                # LOD = LRS / (2 * ln(10)) ~= LRS / 4.61
+                results_dict['lod_score'] = f"{float(trait.LRS_score_repr) / 4.61:.1f}"
+            except Exception:
+                results_dict['lod_score'] = "N/A"
+            results_dict['lrs_location'] = trait.LRS_location_repr
+            if bool(trait.additive):
+                results_dict['additive'] = f"{float(trait.additive):.3f}"
+            results_dict['sample_r'] = f"{float(trait.sample_r):.3f}"
+            results_dict['num_overlap'] = trait.num_overlap
+            results_dict['sample_p'] = f"{float(trait.sample_p):.3e}"
+            results_dict['lit_corr'] = "--"
+            results_dict['tissue_corr'] = "--"
+            results_dict['tissue_pvalue'] = "--"
+            if bool(trait.lit_corr):
+                results_dict['lit_corr'] = f"{float(trait.lit_corr):.3f}"
+            if bool(trait.tissue_corr):
+                results_dict['tissue_corr'] = f"{float(trait.tissue_corr):.3f}"
+                results_dict['tissue_pvalue'] = f"{float(trait.tissue_pvalue):.3e}"
+        elif target_dataset.type == "Publish":
+            results_dict['abbreviation_display'] = "N/A"
+            results_dict['description'] = "N/A"
+            results_dict['mean'] = "N/A"
+            results_dict['authors_display'] = "N/A"
+            results_dict['additive'] = "N/A"
+            if for_api:
+                results_dict['pubmed_id'] = "N/A"
+                results_dict['year'] = "N/A"
+            else:
+                results_dict['pubmed_link'] = "N/A"
+                results_dict['pubmed_text'] = "N/A"
+
+            if bool(trait.abbreviation):
+                results_dict['abbreviation_display'] = trait.abbreviation
+            if bool(trait.description_display):
+                results_dict['description'] = trait.description_display
+            if bool(trait.mean):
+                results_dict['mean'] = f"{float(trait.mean):.3f}"
+            if bool(trait.authors):
+                authors_list = trait.authors.split(',')
+                if len(authors_list) > 6:
+                    results_dict['authors_display'] = ", ".join(
+                        authors_list[:6]) + ", et al."
+                else:
+                    results_dict['authors_display'] = trait.authors
+            if bool(trait.pubmed_id):
+                if for_api:
+                    results_dict['pubmed_id'] = trait.pubmed_id
+                    results_dict['year'] = trait.pubmed_text
+                else:
+                    results_dict['pubmed_link'] = trait.pubmed_link
+                    results_dict['pubmed_text'] = trait.pubmed_text
+            try:
+                results_dict['lod_score'] = f"{float(trait.LRS_score_repr) / 4.61:.1f}"
+            except Exception:
+                results_dict['lod_score'] = "N/A"
+            results_dict['lrs_location'] = trait.LRS_location_repr
+            if bool(trait.additive):
+                results_dict['additive'] = f"{float(trait.additive):.3f}"
+            results_dict['sample_r'] = f"{float(trait.sample_r):.3f}"
+            results_dict['num_overlap'] = trait.num_overlap
+            results_dict['sample_p'] = f"{float(trait.sample_p):.3e}"
+        else:
+            results_dict['location'] = trait.location_repr
+            results_dict['sample_r'] = f"{float(trait.sample_r):.3f}"
+            results_dict['num_overlap'] = trait.num_overlap
+            results_dict['sample_p'] = f"{float(trait.sample_p):.3e}"
+
+        results_list.append(results_dict)
+
+    return json.dumps(results_list)
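
The heart of get_sample_r_and_p_values above is pairing the samples shared by the two traits and handing the paired values to SciPy. The sketch below reproduces just that step as a standalone function with made-up sample names and values; it is illustrative only, is not part of the committed module, and it folds the missing-value filtering that the class delegates to normalize_values into the pairing loop.

# Standalone sketch of the pairing-plus-correlation step; not part of gn3.
import math
import scipy.stats


def sample_corr(primary_values, target_samplelist, target_values, method="pearson"):
    """Pair samples present in both traits, then correlate the paired values."""
    this_vals, target_vals = [], []
    for index, sample in enumerate(target_samplelist):
        if sample in primary_values and target_values[index] is not None:
            this_vals.append(primary_values[sample])
            target_vals.append(target_values[index])

    if len(this_vals) <= 5:  # same minimum-overlap rule as the class method
        return None

    if method == "pearson":
        corr, pval = scipy.stats.pearsonr(this_vals, target_vals)
    else:
        corr, pval = scipy.stats.spearmanr(this_vals, target_vals)

    if math.isnan(corr):  # undefined correlations are skipped, as above
        return None
    return corr, pval, len(this_vals)


# Hypothetical sample names and values, for illustration only
print(sample_corr({"BXD1": 8.1, "BXD2": 7.9, "BXD5": 9.2,
                   "BXD6": 8.6, "BXD8": 7.4, "BXD9": 8.9},
                  ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"],
                  [7.7, 8.0, 9.0, 8.2, 7.1, 8.8]))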
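normalize_values is imported from gn3.utility.corr_result_helpers and its body is not part of this diff; from the call site it takes the two paired value lists and returns both lists plus the overlap count. The helper below is only an assumption about what it might look like, not the actual implementation.

# Assumed behaviour of normalize_values (the real helper lives in
# gn3.utility.corr_result_helpers and may differ): drop pairs where either
# value is missing and report how many pairs remain.
def normalize_values(a_values, b_values):
    paired = [(a, b) for a, b in zip(a_values, b_values)
              if a is not None and b is not None]
    a_new = [a for a, _ in paired]
    b_new = [b for _, b in paired]
    return a_new, b_new, len(a_new)


print(normalize_values([2.3, None, 1.1, 4.0], [1.2, 3.1, None, 2.2]))
# -> ([2.3, 4.0], [1.2, 2.2], 2)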
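The hard-coded divisor 4.61 in generate_corr_json is the usual LRS-to-LOD conversion, LOD = LRS / (2 ln 10) ≈ LRS / 4.61. A quick check of the constant, illustrative and not part of the module:

# LOD = LRS / (2 * ln(10)); the module hard-codes the divisor as 4.61.
import math

print(2 * math.log(10))            # ~4.605, matching the 4.61 used above
print(18.42 / (2 * math.log(10)))  # an LRS of ~18.4 corresponds to a LOD of ~4.0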