aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gn3/computations/correlations.py142
1 files changed, 54 insertions, 88 deletions
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 0d15d9b..cd7d604 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -9,12 +9,17 @@ from typing import Callable
import scipy.stats
-def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict)-> List:
- """Function to construct target dataset data items given commoned shared\
- keys and trait samplelist values for example given keys >>>>>>>>>>\
- ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"] and value object as\
- "HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],TXD_AT": [6.2, 5.7, 3.6, 1.5, 4.2, 2.3]}\
- return results should be a list of dicts mapping the shared keys to the trait values"""
+def map_shared_keys_to_values(target_sample_keys: List,
+ target_sample_vals: dict) -> List:
+ """Function to construct target dataset data items given common shared keys
+ and trait sample-list values for example given keys
+
+ >>>>>>>>>> ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"] and value
+ object as {"HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2], "TXD_AT": [6.2, 5.7,
+ 3.6, 1.5, 4.2, 2.3]} return results should be a list of dicts mapping the
+ shared keys to the trait values
+
+ """
target_dataset_data = []
for trait_id, sample_values in target_sample_vals.items():
@@ -32,9 +37,9 @@ def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict
def normalize_values(a_values: List,
b_values: List) -> Tuple[List[float], List[float], int]:
- """Trim two lists of values to contain only the values they both share
- Given two lists of sample values, trim each list so that it contains only
- the samples that contain a value in both lists. Also returns the number of
+ """Trim two lists of values to contain only the values they both share.
+ Given two lists of sample values, trim each list so that it contains only
+ the samples that contain a value in both lists. Also returns the number of
such samples.
>>> normalize_values([2.3, None, None, 3.2, 4.1, 5],
@@ -62,16 +67,14 @@ pearson,spearman and biweight mid correlation return value is rho and p_value
"pearson": scipy.stats.pearsonr,
"spearman": scipy.stats.spearmanr
}
-
use_corr_method = corr_mapping.get(corr_method, "spearman")
-
corr_coeffient, p_val = use_corr_method(primary_values, target_values)
-
return (corr_coeffient, p_val)
def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
- target_samples_vals) -> Optional[Tuple[str, float, float, int]]:
+ target_samples_vals) -> Optional[
+ Tuple[str, float, float, int]]:
"""Given a primary trait values and target trait values calculate the
correlation coeff and p value
@@ -90,7 +93,6 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
# should use numpy.isNan scipy.isNan is deprecated
if corr_coeffient is not None:
return (trait_name, corr_coeffient, p_value, num_overlap)
-
return None
@@ -99,15 +101,16 @@ def do_bicor(x_val, y_val) -> Tuple[float, float]:
package :not packaged in guix
"""
- _corr_input = (x_val, y_val)
- return (0.0, 0.0)
+ x_val, y_val = 0, 0
+ return (x_val, y_val)
def filter_shared_sample_keys(this_samplelist,
target_samplelist) -> Tuple[List, List]:
- """Given primary and target samplelist\
- for two base and target trait select\
- filter the values using the shared keys"""
+ """Given primary and target sample-list for two base and target trait select
+ filter the values using the shared keys
+
+ """
this_vals = []
target_vals = []
for key, value in target_samplelist.items():
@@ -120,21 +123,18 @@ def filter_shared_sample_keys(this_samplelist,
def compute_all_sample_correlation(this_trait,
target_dataset,
corr_method="pearson") -> List:
- """Given a trait data samplelist and\
- target__datasets compute all sample correlation
+ """Given a trait data sample-list and target__datasets compute all sample
+ correlation
+
"""
# xtodo fix trait_name currently returning single one
# pylint: disable-msg=too-many-locals
-
this_trait_samples = this_trait["trait_sample_data"]
corr_results = []
processed_values = []
for target_trait in target_dataset:
trait_name = target_trait.get("trait_id")
target_trait_data = target_trait["trait_sample_data"]
- # this_vals, target_vals = filter_shared_sample_keys(
- # this_trait_samples, target_trait_data)
-
processed_values.append((trait_name, corr_method, *filter_shared_sample_keys(
this_trait_samples, target_trait_data)))
with multiprocessing.Pool(4) as pool:
@@ -144,7 +144,6 @@ def compute_all_sample_correlation(this_trait,
if sample_correlation is not None:
(trait_name, corr_coeffient, p_value,
num_overlap) = sample_correlation
-
corr_result = {
"corr_coeffient": corr_coeffient,
"p_value": p_value,
@@ -152,7 +151,6 @@ def compute_all_sample_correlation(this_trait,
}
corr_results.append({trait_name: corr_result})
-
return sorted(
corr_results,
key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coeffient"]))
@@ -160,42 +158,34 @@ def compute_all_sample_correlation(this_trait,
def benchmark_compute_all_sample(this_trait,
target_dataset,
- corr_method="pearson") ->List:
- """Temp function to benchmark with compute_all_sample_r\
- alternative to compute_all_sample_r where we use \
- multiprocessing
- """
+ corr_method="pearson") -> List:
+ """Temp function to benchmark with compute_all_sample_r alternative to
+ compute_all_sample_r where we use multiprocessing
+ """
this_trait_samples = this_trait["trait_sample_data"]
-
corr_results = []
-
for target_trait in target_dataset:
trait_name = target_trait.get("trait_id")
target_trait_data = target_trait["trait_sample_data"]
this_vals, target_vals = filter_shared_sample_keys(
this_trait_samples, target_trait_data)
-
sample_correlation = compute_sample_r_correlation(
trait_name=trait_name,
corr_method=corr_method,
trait_vals=this_vals,
target_samples_vals=target_vals)
-
if sample_correlation is not None:
- (trait_name, corr_coeffient, p_value, num_overlap) = sample_correlation
-
+ (trait_name, corr_coeffient,
+ p_value, num_overlap) = sample_correlation
else:
continue
-
corr_result = {
"corr_coeffient": corr_coeffient,
"p_value": p_value,
"num_overlap": num_overlap
}
-
corr_results.append({trait_name: corr_result})
-
return corr_results
@@ -205,11 +195,8 @@ list depending on whether both dataset and target_dataset are both set to
probet
"""
-
corr_results = {"lit": 1}
-
if corr_type not in ("lit", "literature"):
-
corr_results["top_corr_results"] = top_corr_results
# run lit_correlation for the given top_corr_results
if corr_type == "tissue":
@@ -255,8 +242,10 @@ def fetch_lit_correlation_data(
input_mouse_gene_id: Optional[str],
gene_id: str,
mouse_gene_id: Optional[str] = None) -> Tuple[str, float]:
- """Given input trait mouse gene id and mouse gene id fetch the lit\
- corr_data"""
+ """Given input trait mouse gene id and mouse gene id fetch the lit
+ corr_data
+
+ """
if mouse_gene_id is not None and ";" not in mouse_gene_id:
query = """
SELECT VALUE
@@ -283,7 +272,6 @@ def fetch_lit_correlation_data(
lit_results = (gene_id, lit_corr_results.val)\
if lit_corr_results else (gene_id, 0)
return lit_results
-
return (gene_id, 0)
@@ -295,11 +283,9 @@ def lit_correlation_for_trait_list(
"""given species,base trait gene id fetch the lit corr results from the db\
output is float for lit corr results """
fetched_lit_corr_results = []
-
this_trait_mouse_gene_id = map_to_mouse_gene_id(conn=conn,
species=species,
gene_id=trait_gene_id)
-
for (trait_name, target_trait_gene_id) in target_trait_lists:
corr_results = {}
if target_trait_gene_id:
@@ -307,29 +293,26 @@ def lit_correlation_for_trait_list(
conn=conn,
species=species,
gene_id=target_trait_gene_id)
-
fetched_corr_data = fetch_lit_correlation_data(
conn=conn,
input_mouse_gene_id=this_trait_mouse_gene_id,
gene_id=target_trait_gene_id,
mouse_gene_id=target_mouse_gene_id)
-
dict_results = dict(zip(("gene_id", "lit_corr"),
fetched_corr_data))
corr_results[trait_name] = dict_results
fetched_lit_corr_results.append(corr_results)
-
return fetched_lit_corr_results
def query_formatter(query_string: str, *query_values):
- """Formatter query string given the unformatted query string\
- and the respectibe values.Assumes number of placeholders is
- equal to the number of query values """
- # xtodo escape sql queries
- results = query_string % (query_values)
+ """Formatter query string given the unformatted query string and the
+ respective values. Assumes number of placeholders is equal to the number of
+ query values
- return results
+ """
+ # xtodo escape sql queries
+ return query_string % (query_values)
def map_to_mouse_gene_id(conn, species: Optional[str],
@@ -342,26 +325,23 @@ def map_to_mouse_gene_id(conn, species: Optional[str],
return None
if species == "mouse":
return gene_id
-
cursor = conn.cursor()
query = """SELECT mouse
FROM GeneIDXRef
WHERE '%s' = '%s'"""
-
query_values = (species, gene_id)
cursor.execute(query_formatter(query,
*query_values))
results = cursor.fetchone()
-
mouse_gene_id = results.mouse if results is not None else None
-
return mouse_gene_id
def compute_all_lit_correlation(conn, trait_lists: List,
species: str, gene_id):
- """Function that acts as an abstraction for
- lit_correlation_for_trait_list"""
+ """Function that acts as an abstraction for lit_correlation_for_trait_list
+
+ """
lit_results = lit_correlation_for_trait_list(
conn=conn,
@@ -378,47 +358,37 @@ def compute_all_lit_correlation(conn, trait_lists: List,
def compute_all_tissue_correlation(primary_tissue_dict: dict,
target_tissues_data: dict,
corr_method: str):
- """Function acts as an abstraction for tissue_correlation_for_trait_list\
- required input are target tissue object and primary tissue trait\
- target tissues data contains the trait_symbol_dict and symbol_tissue_vals
+ """Function acts as an abstraction for tissue_correlation_for_trait_list
+ required input are target tissue object and primary tissue trait target
+ tissues data contains the trait_symbol_dict and symbol_tissue_vals
"""
-
tissues_results = []
-
primary_tissue_vals = primary_tissue_dict["tissue_values"]
traits_symbol_dict = target_tissues_data["trait_symbol_dict"]
symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"]
-
target_tissues_list = process_trait_symbol_dict(
traits_symbol_dict, symbol_tissue_vals_dict)
-
for target_tissue_obj in target_tissues_list:
trait_id = target_tissue_obj.get("trait_id")
-
target_tissue_vals = target_tissue_obj.get("tissue_values")
-
tissue_result = tissue_correlation_for_trait_list(
primary_tissue_vals=primary_tissue_vals,
target_tissues_values=target_tissue_vals,
trait_id=trait_id,
corr_method=corr_method)
-
tissue_result_dict = {trait_id: tissue_result}
tissues_results.append(tissue_result_dict)
-
- sorted_tissues_results = sorted(
+ return sorted(
tissues_results,
key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"]))
- return sorted_tissues_results
-
def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> List:
- """Method for processing trait symbol\
- dict given the symbol tissue values """
- traits_tissue_vals = []
+ """Method for processing trait symbol dict given the symbol tissue values
+ """
+ traits_tissue_vals = []
for (trait, symbol) in trait_symbol_dict.items():
if symbol is not None:
target_symbol = symbol.lower()
@@ -427,25 +397,21 @@ def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> Lis
target_tissue_dict = {"trait_id": trait,
"symbol": target_symbol,
"tissue_values": trait_tissue_val}
-
traits_tissue_vals.append(target_tissue_dict)
-
return traits_tissue_vals
def compute_tissue_correlation(primary_tissue_dict: dict,
target_tissues_data: dict,
corr_method: str):
- """Experimental function that uses multiprocessing\
- for computing tissue correlation
- """
+ """Experimental function that uses multiprocessing for computing tissue
+ correlation
+ """
tissues_results = []
-
primary_tissue_vals = primary_tissue_dict["tissue_values"]
traits_symbol_dict = target_tissues_data["trait_symbol_dict"]
symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"]
-
target_tissues_list = process_trait_symbol_dict(
traits_symbol_dict, symbol_tissue_vals_dict)
processed_values = []