gn2/wqflask/api/correlation.py - genenetwork2 - GeneNetwork (2nd generation)

import collections
import scipy
import numpy

from gn2.base import data_set
from gn2.base.trait import create_trait, retrieve_sample_data
from gn2.utility import corr_result_helpers
from gn2.utility.tools import get_setting
from gn2.wqflask.correlation import correlation_functions
from gn2.wqflask.database import database_connection

def do_correlation(start_vars):
    if 'db' not in start_vars:
        raise ValueError("'db' not found!")
    if 'target_db' not in start_vars:
        raise ValueError("'target_db' not found!")
    if 'trait_id' not in start_vars:
        raise ValueError("'trait_id' not found!")

    this_dataset = data_set.create_dataset(dataset_name=start_vars['db'])
    target_dataset = data_set.create_dataset(
        dataset_name=start_vars['target_db'])
    this_trait = create_trait(dataset=this_dataset,
                              name=start_vars['trait_id'])
    this_trait = retrieve_sample_data(this_trait, this_dataset)

    corr_params = init_corr_params(start_vars)

    corr_results = calculate_results(
        this_trait, this_dataset, target_dataset, corr_params)

    final_results = []
    for _trait_counter, trait in enumerate(list(corr_results.keys())[:corr_params['return_count']]):
        if corr_params['type'] == "tissue":
            [sample_r, num_overlap, sample_p, symbol] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p,
                "symbol": symbol
            }
        elif corr_params['type'] == "literature" or corr_params['type'] == "lit":
            [gene_id, sample_r] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "gene_id": gene_id
            }
        else:
            [sample_r, sample_p, num_overlap] = corr_results[trait]
            result_dict = {
                "trait": trait,
                "sample_r": sample_r,
                "#_strains": num_overlap,
                "p_value": sample_p
            }
        final_results.append(result_dict)
    return final_results


def calculate_results(this_trait, this_dataset, target_dataset, corr_params):
    corr_results = {}

    target_dataset.get_trait_data()

    if corr_params['type'] == "tissue":
        trait_symbol_dict = this_dataset.retrieve_genes("Symbol")
        corr_results = do_tissue_correlation_for_all_traits(
            this_trait, trait_symbol_dict, corr_params)
        sorted_results = collections.OrderedDict(sorted(list(corr_results.items()),
                                                        key=lambda t: -abs(t[1][1])))
    # ZS: Just so a user can use either "lit" or "literature"
    elif corr_params['type'] == "literature" or corr_params['type'] == "lit":
        trait_geneid_dict = this_dataset.retrieve_genes("GeneId")
        corr_results = do_literature_correlation_for_all_traits(
            this_trait, this_dataset, trait_geneid_dict, corr_params)
        sorted_results = collections.OrderedDict(sorted(list(corr_results.items()),
                                                        key=lambda t: -abs(t[1][1])))
    else:
        for target_trait, target_vals in list(target_dataset.trait_data.items()):
            result = get_sample_r_and_p_values(
                this_trait, this_dataset, target_vals, target_dataset, corr_params['type'])
            if result is not None:
                corr_results[target_trait] = result

        sorted_results = collections.OrderedDict(
            sorted(list(corr_results.items()), key=lambda t: -abs(t[1][0])))

    return sorted_results


def do_tissue_correlation_for_all_traits(this_trait, trait_symbol_dict, corr_params, tissue_dataset_id=1):
    # Gets tissue expression values for the primary trait
    primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
        symbol_list=[this_trait.symbol])

    if this_trait.symbol.lower() in primary_trait_tissue_vals_dict:
        primary_trait_tissue_values = primary_trait_tissue_vals_dict[this_trait.symbol.lower(
        )]

        corr_result_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
            symbol_list=list(trait_symbol_dict.values()))

        tissue_corr_data = {}
        for trait, symbol in list(trait_symbol_dict.items()):
            if symbol and symbol.lower() in corr_result_tissue_vals_dict:
                this_trait_tissue_values = corr_result_tissue_vals_dict[symbol.lower(
                )]

                result = correlation_functions.cal_zero_order_corr_for_tiss(primary_trait_tissue_values,
                                                                            this_trait_tissue_values,
                                                                            corr_params['method'])

                tissue_corr_data[trait] = [
                    result[0], result[1], result[2], symbol]

        return tissue_corr_data


def do_literature_correlation_for_all_traits(this_trait, target_dataset, trait_geneid_dict, corr_params):
    input_trait_mouse_gene_id = convert_to_mouse_gene_id(
        target_dataset.group.species.lower(), this_trait.geneid)

    lit_corr_data = {}
    for trait, gene_id in list(trait_geneid_dict.items()):
        mouse_gene_id = convert_to_mouse_gene_id(
            target_dataset.group.species.lower(), gene_id)

        if mouse_gene_id and str(mouse_gene_id).find(";") == -1:
            result = ""
            with database_connection(get_setting("SQL_URI")) as conn:
                with conn.cursor() as cursor:
                    cursor.execute(
                        ("SELECT value FROM LCorrRamin3 "
                         "WHERE GeneId1=%s AND GeneId2=%s"),
                        (mouse_gene_id,
                         input_trait_mouse_gene_id))
                    result = cursor.fetchone()
                    if not result:
                        cursor.execute(
                            ("SELECT value FROM LCorrRamin3 "
                             "WHERE GeneId2=%s AND GeneId1=%s"),
                            (mouse_gene_id,
                             input_trait_mouse_gene_id))
                        result = cursor.fetchone()
            if result:
                lit_corr = result[0]
                lit_corr_data[trait] = [gene_id, lit_corr]
            else:
                lit_corr_data[trait] = [gene_id, 0]
        else:
            lit_corr_data[trait] = [gene_id, 0]

    return lit_corr_data


def get_sample_r_and_p_values(this_trait, this_dataset, target_vals, target_dataset, type):
    """
    Calculates the sample r (or rho) and p-value

    Given a primary trait and a target trait's sample values,
    calculates either the pearson r or spearman rho and the p-value
    using the corresponding scipy functions.
    """

    this_trait_vals = []
    shared_target_vals = []
    for i, sample in enumerate(target_dataset.group.samplelist):
        if sample in this_trait.data:
            this_sample_value = this_trait.data[sample].value
            target_sample_value = target_vals[i]
            this_trait_vals.append(this_sample_value)
            shared_target_vals.append(target_sample_value)

    this_trait_vals, shared_target_vals, num_overlap = corr_result_helpers.normalize_values(
        this_trait_vals, shared_target_vals)

    if type == 'pearson':
        sample_r, sample_p = scipy.stats.pearsonr(
            this_trait_vals, shared_target_vals)
    else:
        sample_r, sample_p = scipy.stats.spearmanr(
            this_trait_vals, shared_target_vals)

    if num_overlap > 5:
        if numpy.isnan(sample_r):
            return None
        else:
            return [sample_r, sample_p, num_overlap]


def convert_to_mouse_gene_id(species=None, gene_id=None):
    """If the species is rat or human, translate the gene_id to the mouse geneid

    If there is no input gene_id or there's no corresponding mouse gene_id, return None

    """
    if not gene_id:
        return None

    mouse_gene_id = None
    with database_connection(get_setting("SQL_URI")) as conn:
        with conn.cursor() as cursor:
            if species == 'mouse':
                mouse_gene_id = gene_id
            elif species == 'rat':
                cursor.execute(
                    ("SELECT mouse FROM GeneIDXRef "
                     "WHERE rat=%s"), gene_id)
                result = cursor.fetchone()
                if result:
                    mouse_gene_id = result[0]
            elif species == 'human':
                cursor.execute(
                    "SELECT mouse FROM GeneIDXRef "
                    "WHERE human=%s", gene_id)
                result = cursor.fetchone()
                if result:
                    mouse_gene_id = result[0]
    return mouse_gene_id


def init_corr_params(start_vars):
    method = "pearson"
    if 'method' in start_vars:
        method = start_vars['method']

    type = "sample"
    if 'type' in start_vars:
        type = start_vars['type']

    return_count = 500
    if 'return_count' in start_vars:
        assert(start_vars['return_count'].isdigit())
        return_count = int(start_vars['return_count'])

    corr_params = {
        'method': method,
        'type': type,
        'return_count': return_count
    }

    return corr_params