import collections import scipy import numpy from gn2.base import data_set from gn2.base.trait import create_trait, retrieve_sample_data from gn2.utility import corr_result_helpers from gn2.utility.tools import get_setting from gn2.wqflask.correlation import correlation_functions from gn2.wqflask.database import database_connection def do_correlation(start_vars): if 'db' not in start_vars: raise ValueError("'db' not found!") if 'target_db' not in start_vars: raise ValueError("'target_db' not found!") if 'trait_id' not in start_vars: raise ValueError("'trait_id' not found!") this_dataset = data_set.create_dataset(dataset_name=start_vars['db']) target_dataset = data_set.create_dataset( dataset_name=start_vars['target_db']) this_trait = create_trait(dataset=this_dataset, name=start_vars['trait_id']) this_trait = retrieve_sample_data(this_trait, this_dataset) corr_params = init_corr_params(start_vars) corr_results = calculate_results( this_trait, this_dataset, target_dataset, corr_params) final_results = [] for _trait_counter, trait in enumerate(list(corr_results.keys())[:corr_params['return_count']]): if corr_params['type'] == "tissue": [sample_r, num_overlap, sample_p, symbol] = corr_results[trait] result_dict = { "trait": trait, "sample_r": sample_r, "#_strains": num_overlap, "p_value": sample_p, "symbol": symbol } elif corr_params['type'] == "literature" or corr_params['type'] == "lit": [gene_id, sample_r] = corr_results[trait] result_dict = { "trait": trait, "sample_r": sample_r, "gene_id": gene_id } else: [sample_r, sample_p, num_overlap] = corr_results[trait] result_dict = { "trait": trait, "sample_r": sample_r, "#_strains": num_overlap, "p_value": sample_p } final_results.append(result_dict) return final_results def calculate_results(this_trait, this_dataset, target_dataset, corr_params): corr_results = {} target_dataset.get_trait_data() if corr_params['type'] == "tissue": trait_symbol_dict = this_dataset.retrieve_genes("Symbol") corr_results = do_tissue_correlation_for_all_traits( this_trait, trait_symbol_dict, corr_params) sorted_results = collections.OrderedDict(sorted(list(corr_results.items()), key=lambda t: -abs(t[1][1]))) # ZS: Just so a user can use either "lit" or "literature" elif corr_params['type'] == "literature" or corr_params['type'] == "lit": trait_geneid_dict = this_dataset.retrieve_genes("GeneId") corr_results = do_literature_correlation_for_all_traits( this_trait, this_dataset, trait_geneid_dict, corr_params) sorted_results = collections.OrderedDict(sorted(list(corr_results.items()), key=lambda t: -abs(t[1][1]))) else: for target_trait, target_vals in list(target_dataset.trait_data.items()): result = get_sample_r_and_p_values( this_trait, this_dataset, target_vals, target_dataset, corr_params['type']) if result is not None: corr_results[target_trait] = result sorted_results = collections.OrderedDict( sorted(list(corr_results.items()), key=lambda t: -abs(t[1][0]))) return sorted_results def do_tissue_correlation_for_all_traits(this_trait, trait_symbol_dict, corr_params, tissue_dataset_id=1): # Gets tissue expression values for the primary trait primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values( symbol_list=[this_trait.symbol]) if this_trait.symbol.lower() in primary_trait_tissue_vals_dict: primary_trait_tissue_values = primary_trait_tissue_vals_dict[this_trait.symbol.lower( )] corr_result_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values( symbol_list=list(trait_symbol_dict.values())) tissue_corr_data = {} for trait, symbol in list(trait_symbol_dict.items()): if symbol and symbol.lower() in corr_result_tissue_vals_dict: this_trait_tissue_values = corr_result_tissue_vals_dict[symbol.lower( )] result = correlation_functions.cal_zero_order_corr_for_tiss(primary_trait_tissue_values, this_trait_tissue_values, corr_params['method']) tissue_corr_data[trait] = [ result[0], result[1], result[2], symbol] return tissue_corr_data def do_literature_correlation_for_all_traits(this_trait, target_dataset, trait_geneid_dict, corr_params): input_trait_mouse_gene_id = convert_to_mouse_gene_id( target_dataset.group.species.lower(), this_trait.geneid) lit_corr_data = {} for trait, gene_id in list(trait_geneid_dict.items()): mouse_gene_id = convert_to_mouse_gene_id( target_dataset.group.species.lower(), gene_id) if mouse_gene_id and str(mouse_gene_id).find(";") == -1: result = "" with database_connection(get_setting("SQL_URI")) as conn: with conn.cursor() as cursor: cursor.execute( ("SELECT value FROM LCorrRamin3 " "WHERE GeneId1=%s AND GeneId2=%s"), (mouse_gene_id, input_trait_mouse_gene_id)) result = cursor.fetchone() if not result: cursor.execute( ("SELECT value FROM LCorrRamin3 " "WHERE GeneId2=%s AND GeneId1=%s"), (mouse_gene_id, input_trait_mouse_gene_id)) result = cursor.fetchone() if result: lit_corr = result[0] lit_corr_data[trait] = [gene_id, lit_corr] else: lit_corr_data[trait] = [gene_id, 0] else: lit_corr_data[trait] = [gene_id, 0] return lit_corr_data def get_sample_r_and_p_values(this_trait, this_dataset, target_vals, target_dataset, type): """ Calculates the sample r (or rho) and p-value Given a primary trait and a target trait's sample values, calculates either the pearson r or spearman rho and the p-value using the corresponding scipy functions. """ this_trait_vals = [] shared_target_vals = [] for i, sample in enumerate(target_dataset.group.samplelist): if sample in this_trait.data: this_sample_value = this_trait.data[sample].value target_sample_value = target_vals[i] this_trait_vals.append(this_sample_value) shared_target_vals.append(target_sample_value) this_trait_vals, shared_target_vals, num_overlap = corr_result_helpers.normalize_values( this_trait_vals, shared_target_vals) if type == 'pearson': sample_r, sample_p = scipy.stats.pearsonr( this_trait_vals, shared_target_vals) else: sample_r, sample_p = scipy.stats.spearmanr( this_trait_vals, shared_target_vals) if num_overlap > 5: if numpy.isnan(sample_r): return None else: return [sample_r, sample_p, num_overlap] def convert_to_mouse_gene_id(species=None, gene_id=None): """If the species is rat or human, translate the gene_id to the mouse geneid If there is no input gene_id or there's no corresponding mouse gene_id, return None """ if not gene_id: return None mouse_gene_id = None with database_connection(get_setting("SQL_URI")) as conn: with conn.cursor() as cursor: if species == 'mouse': mouse_gene_id = gene_id elif species == 'rat': cursor.execute( ("SELECT mouse FROM GeneIDXRef " "WHERE rat=%s"), gene_id) result = cursor.fetchone() if result: mouse_gene_id = result[0] elif species == 'human': cursor.execute( "SELECT mouse FROM GeneIDXRef " "WHERE human=%s", gene_id) result = cursor.fetchone() if result: mouse_gene_id = result[0] return mouse_gene_id def init_corr_params(start_vars): method = "pearson" if 'method' in start_vars: method = start_vars['method'] type = "sample" if 'type' in start_vars: type = start_vars['type'] return_count = 500 if 'return_count' in start_vars: assert(start_vars['return_count'].isdigit()) return_count = int(start_vars['return_count']) corr_params = { 'method': method, 'type': type, 'return_count': return_count } return corr_params