Diffstat (limited to 'gn3/computations')
-rw-r--r-- | gn3/computations/correlations.py | 131
-rw-r--r-- | gn3/computations/gemma.py        |   2
2 files changed, 50 insertions, 83 deletions
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 4fdf8cf..857ceb0 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -9,12 +9,17 @@ from typing import Callable
 import scipy.stats
 
 
-def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict)-> List:
-    """Function to construct target dataset data items given commoned shared\
-    keys and trait samplelist values for example given keys >>>>>>>>>>\
-    ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"] and value object as\
-    "HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],TXD_AT": [6.2, 5.7, 3.6, 1.5, 4.2, 2.3]}\
-    return results should be a list of dicts mapping the shared keys to the trait values"""
+def map_shared_keys_to_values(target_sample_keys: List,
+                              target_sample_vals: dict) -> List:
+    """Function to construct target dataset data items given common shared keys
+    and trait sample-list values for example given keys
+
+    >>>>>>>>>> ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"] and value
+    object as "HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],TXD_AT": [6.2, 5.7,
+    3.6, 1.5, 4.2, 2.3]} return results should be a list of dicts mapping the
+    shared keys to the trait values
+
+    """
     target_dataset_data = []
 
     for trait_id, sample_values in target_sample_vals.items():
@@ -32,9 +37,9 @@ def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict
 
 def normalize_values(a_values: List,
                      b_values: List) -> Tuple[List[float], List[float], int]:
-    """Trim two lists of values to contain only the values they both share
-    Given two lists of sample values, trim each list so that it contains only
-    the samples that contain a value in both lists. Also returns the number of
+    """Trim two lists of values to contain only the values they both share Given
+    two lists of sample values, trim each list so that it contains only the
+    samples that contain a value in both lists. Also returns the number of
     such samples.
 
     >>> normalize_values([2.3, None, None, 3.2, 4.1, 5],
@@ -62,16 +67,14 @@ pearson,spearman and biweight mid correlation return value is rho and p_value
         "pearson": scipy.stats.pearsonr,
         "spearman": scipy.stats.spearmanr
     }
-
     use_corr_method = corr_mapping.get(corr_method, "spearman")
-
     corr_coeffient, p_val = use_corr_method(primary_values, target_values)
-
     return (corr_coeffient, p_val)
 
 
 def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
-                                 target_samples_vals) -> Optional[Tuple[str, float, float, int]]:
+                                 target_samples_vals) -> Optional[
+                                     Tuple[str, float, float, int]]:
     """Given a primary trait values and target trait values
     calculate the correlation coeff and p value
 
@@ -90,7 +93,6 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
     # should use numpy.isNan scipy.isNan is deprecated
     if corr_coeffient is not None:
         return (trait_name, corr_coeffient, p_value, num_overlap)
-
     return None
 
 
@@ -99,15 +101,16 @@ def do_bicor(x_val, y_val) -> Tuple[float, float]:
     package :not packaged in guix
 
     """
-    _corr_input = (x_val, y_val)
-    return (0.0, 0.0)
+    x_val, y_val = 0, 0
+    return (x_val, y_val)
 
 
 def filter_shared_sample_keys(this_samplelist,
                               target_samplelist) -> Tuple[List, List]:
-    """Given primary and target samplelist\
-    for two base and target trait select\
-    filter the values using the shared keys"""
+    """Given primary and target sample-list for two base and target trait select
+    filter the values using the shared keys
+
+    """
     this_vals = []
     target_vals = []
     for key, value in target_samplelist.items():
@@ -120,21 +123,18 @@ def compute_all_sample_correlation(this_trait,
                                    target_dataset,
                                    corr_method="pearson") -> List:
-    """Given a trait data samplelist and\
-    target__datasets compute all sample correlation
+    """Given a trait data sample-list and target__datasets compute all sample
+    correlation
+
     """
     # xtodo fix trait_name currently returning single one
     # pylint: disable-msg=too-many-locals
-
     this_trait_samples = this_trait["trait_sample_data"]
     corr_results = []
     processed_values = []
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
-        # this_vals, target_vals = filter_shared_sample_keys(
-        #     this_trait_samples, target_trait_data)
-
         processed_values.append((trait_name, corr_method, *filter_shared_sample_keys(
             this_trait_samples, target_trait_data)))
     with multiprocessing.Pool(4) as pool:
@@ -144,7 +144,6 @@ def compute_all_sample_correlation(this_trait,
         if sample_correlation is not None:
             (trait_name, corr_coeffient, p_value,
              num_overlap) = sample_correlation
-
             corr_result = {
                 "corr_coeffient": corr_coeffient,
                 "p_value": p_value,
@@ -152,7 +151,6 @@ def compute_all_sample_correlation(this_trait,
             }
             corr_results.append({trait_name: corr_result})
 
-
     return sorted(
         corr_results,
         key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coeffient"]))
@@ -160,45 +158,36 @@ def compute_all_sample_correlation(this_trait,
 
 def benchmark_compute_all_sample(this_trait,
                                  target_dataset,
-                                 corr_method="pearson") ->List:
-    """Temp function to benchmark with compute_all_sample_r\
-    alternative to compute_all_sample_r where we use \
-    multiprocessing
-    """
+                                 corr_method="pearson") -> List:
+    """Temp function to benchmark with compute_all_sample_r alternative to
+    compute_all_sample_r where we use multiprocessing
+    """
     this_trait_samples = this_trait["trait_sample_data"]
-
     corr_results = []
-
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
         this_vals, target_vals = filter_shared_sample_keys(
             this_trait_samples, target_trait_data)
-
         sample_correlation = compute_sample_r_correlation(
             trait_name=trait_name,
             corr_method=corr_method,
             trait_vals=this_vals,
             target_samples_vals=target_vals)
-
         if sample_correlation is not None:
-            (trait_name, corr_coeffient, p_value, num_overlap) = sample_correlation
-
+            (trait_name, corr_coeffient,
+             p_value, num_overlap) = sample_correlation
         else:
             continue
-
         corr_result = {
             "corr_coeffient": corr_coeffient,
             "p_value": p_value,
             "num_overlap": num_overlap
         }
-
         corr_results.append({trait_name: corr_result})
-
     return corr_results
-
-
+
 
 def tissue_correlation_for_trait(
         primary_tissue_vals: List,
         target_tissues_values: List,
@@ -232,8 +221,10 @@ def fetch_lit_correlation_data(
         input_mouse_gene_id: Optional[str],
         gene_id: str,
         mouse_gene_id: Optional[str] = None) -> Tuple[str, float]:
-    """Given input trait mouse gene id and mouse gene id fetch the lit\
-    corr_data"""
+    """Given input trait mouse gene id and mouse gene id fetch the lit
+    corr_data
+
+    """
     if mouse_gene_id is not None and ";" not in mouse_gene_id:
         query = """
         SELECT VALUE
@@ -260,7 +251,6 @@ def fetch_lit_correlation_data(
         lit_results = (gene_id, lit_corr_results[1])\
             if lit_corr_results else (gene_id, 0)
         return lit_results
-
     return (gene_id, 0)
 
 
@@ -272,11 +262,9 @@ def lit_correlation_for_trait(
     """given species,base trait gene id fetch the lit corr results from the db\
     output is float for lit corr results """
     fetched_lit_corr_results = []
-
     this_trait_mouse_gene_id = map_to_mouse_gene_id(conn=conn,
                                                     species=species,
                                                     gene_id=trait_gene_id)
-
     for (trait_name, target_trait_gene_id) in target_trait_lists:
         corr_results = {}
         if target_trait_gene_id:
@@ -284,29 +272,26 @@ def lit_correlation_for_trait(
                 conn=conn,
                 species=species,
                 gene_id=target_trait_gene_id)
-
             fetched_corr_data = fetch_lit_correlation_data(
                 conn=conn,
                 input_mouse_gene_id=this_trait_mouse_gene_id,
                 gene_id=target_trait_gene_id,
                 mouse_gene_id=target_mouse_gene_id)
-
             dict_results = dict(zip(("gene_id", "lit_corr"),
                                     fetched_corr_data))
             corr_results[trait_name] = dict_results
             fetched_lit_corr_results.append(corr_results)
-
     return fetched_lit_corr_results
 
 
 def query_formatter(query_string: str, *query_values):
-    """Formatter query string given the unformatted query string\
-    and the respectibe values.Assumes number of placeholders is
-    equal to the number of query values """
-    # xtodo escape sql queries
-    results = query_string % (query_values)
+    """Formatter query string given the unformatted query string and the
+    respectibe values.Assumes number of placeholders is equal to the number of
+    query values
 
-    return results
+    """
+    # xtodo escape sql queries
+    return query_string % (query_values)
 
 
 def map_to_mouse_gene_id(conn, species: Optional[str],
@@ -319,19 +304,15 @@ def map_to_mouse_gene_id(conn, species: Optional[str],
         return None
     if species == "mouse":
         return gene_id
-
     cursor = conn.cursor()
     query = """SELECT mouse FROM GeneIDXRef
                WHERE '%s' = '%s'"""
-
     query_values = (species, gene_id)
     cursor.execute(query_formatter(query,
                                    *query_values))
     results = cursor.fetchone()
-
     mouse_gene_id = results.mouse if results is not None else None
-
     return mouse_gene_id
 
 
@@ -358,21 +339,15 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict,
     """Function acts as an abstraction for tissue_correlation_for_trait\
     required input are target tissue object and primary tissue trait\
     target tissues data contains the trait_symbol_dict and symbol_tissue_vals
-
     """
-
     tissues_results = []
-
     primary_tissue_vals = primary_tissue_dict["tissue_values"]
     traits_symbol_dict = target_tissues_data["trait_symbol_dict"]
     symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"]
-
     target_tissues_list = process_trait_symbol_dict(
         traits_symbol_dict, symbol_tissue_vals_dict)
-
     for target_tissue_obj in target_tissues_list:
         trait_id = target_tissue_obj.get("trait_id")
-
         target_tissue_vals = target_tissue_obj.get("tissue_values")
 
         tissue_result = tissue_correlation_for_trait(
@@ -380,22 +355,18 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict,
             target_tissues_values=target_tissue_vals,
             trait_id=trait_id,
             corr_method=corr_method)
-
         tissue_result_dict = {trait_id: tissue_result}
         tissues_results.append(tissue_result_dict)
-
-    sorted_tissues_results = sorted(
+    return sorted(
         tissues_results,
         key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"]))
 
-    return sorted_tissues_results
-
 
 def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> List:
-    """Method for processing trait symbol\
-    dict given the symbol tissue values """
-    traits_tissue_vals = []
+    """Method for processing trait symbol dict given the symbol tissue values
+    """
+    traits_tissue_vals = []
     for (trait, symbol) in trait_symbol_dict.items():
         if symbol is not None:
             target_symbol = symbol.lower()
@@ -404,25 +375,21 @@ def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> Lis
                 target_tissue_dict = {"trait_id": trait,
                                       "symbol": target_symbol,
                                       "tissue_values": trait_tissue_val}
-
                 traits_tissue_vals.append(target_tissue_dict)
-
     return traits_tissue_vals
 
 
 def compute_tissue_correlation(primary_tissue_dict: dict,
                                target_tissues_data: dict,
                                corr_method: str):
-    """Experimental function that uses multiprocessing\
-    for computing tissue correlation
-    """
+    """Experimental function that uses multiprocessing for computing tissue
+    correlation
+    """
     tissues_results = []
-
     primary_tissue_vals = primary_tissue_dict["tissue_values"]
     traits_symbol_dict = target_tissues_data["trait_symbol_dict"]
     symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"]
-
     target_tissues_list = process_trait_symbol_dict(
         traits_symbol_dict, symbol_tissue_vals_dict)
     processed_values = []
diff --git a/gn3/computations/gemma.py b/gn3/computations/gemma.py
index 5f9d5a3..0b22d3c 100644
--- a/gn3/computations/gemma.py
+++ b/gn3/computations/gemma.py
@@ -7,7 +7,7 @@ from typing import Dict
 from typing import List
 from typing import ValuesView
 from gn3.commands import compose_gemma_cmd
-from gn3.file_utils import get_hash_of_files
+from gn3.fs_helpers import get_hash_of_files
 
 
 def generate_hash_of_string(unhashed_str: str) -> str:
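
For orientation, a minimal usage sketch of the sample-correlation helpers touched by this patch. It is illustrative only, not part of the patch: it assumes the gn3 package is importable and that map_shared_keys_to_values() returns one dict per target trait carrying "trait_id" and "trait_sample_data" keys, which is how compute_all_sample_correlation() reads its target_dataset argument in the hunks above. The toy values follow the docstring example in the first hunk.

# Illustrative sketch only -- not part of the patch.
from gn3.computations.correlations import (compute_all_sample_correlation,
                                            map_shared_keys_to_values)

sample_keys = ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"]
target_sample_vals = {"HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],
                      "TXD_AT": [6.2, 5.7, 3.6, 1.5, 4.2, 2.3]}

# Build the target dataset: assumed to be one dict per target trait that maps
# the shared sample keys to that trait's values.
target_dataset = map_shared_keys_to_values(sample_keys, target_sample_vals)

# The primary trait carries the same {sample: value} mapping under
# "trait_sample_data".
this_trait = {"trait_sample_data": dict(zip(sample_keys,
                                            [1.2, 2.3, 3.1, 4.0, 5.1, 6.2]))}

# Returns [{trait_id: {"corr_coeffient", "p_value", "num_overlap"}}, ...]
# sorted by absolute correlation; the per-trait work is dispatched through
# the multiprocessing.Pool(4) shown in the diff.
results = compute_all_sample_correlation(this_trait=this_trait,
                                         target_dataset=target_dataset,
                                         corr_method="pearson")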