author     Alexander Kabui   2021-03-16 10:36:58 +0300
committer  GitHub            2021-03-16 10:36:58 +0300
commit     43d1bb7f6cd2b5890d5b3eb7c357caafda25a35c (patch)
tree       73683272f32cffc860497a93b5c844c272252e67 /gn3
parent     995f1dbd081eb64ad177f929615a4edee01cb68f (diff)
download   genenetwork3-43d1bb7f6cd2b5890d5b3eb7c357caafda25a35c.tar.gz
Refactor/clean up correlations (#4)
* initial commit for Refactor/clean-up-correlation
* add python scipy dependency
* initial commit for sample correlation
* initial commit for sample correlation endpoint
* initial commit for integration and unittest
* initial commit for registering correlation blueprint
* add and modify unittest and integration tests for correlation
* Add compute_all_sample_corr method for correlation
* add scipy to requirements txt file
* add tissue correlation for trait list
* add unittest for tissue correlation
* add lit correlation for trait list
* add unittests for lit correlation for trait list
* modify lit correlation for trait list
* add unittests for lit correlation for trait list
* add correlation method in dynamic url
* add file describing the expected input structure for sample correlation
* modify input data structure -> add trait id
* update tests for sample r correlation
* add compute all lit correlation method
* add endpoint for computing lit_corr
* add unit and integration tests for computing lit corr
* add /api/correlation/tissue_corr/{corr_method} endpoint for tissue correlation
* add unittest and integration tests for tissue correlation
Co-authored-by: BonfaceKilz <bonfacemunyoki@gmail.com>
Diffstat (limited to 'gn3')
-rw-r--r--  gn3/api/correlation.py            |  77
-rw-r--r--  gn3/computations/correlations.py  | 305
2 files changed, 353 insertions, 29 deletions
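Before the diff itself, a minimal sketch of how the new sample_r endpoint could be exercised. The payload keys (this_trait, target_dataset, trait_sample_data, trait_id) come from the code added below, but the host, port and /api/correlation URL prefix are assumptions (the blueprint registration is not part of this change), and the strain names and values are invented. Note that the new code only reports a correlation when more than five samples overlap.

import requests  # assumed HTTP client for this sketch

payload = {
    "this_trait": {
        "trait_id": "1455376_at",  # hypothetical primary trait
        "trait_sample_data": {"BXD1": 7.1, "BXD2": 8.3, "BXD5": 6.9,
                              "BXD6": 7.8, "BXD8": 8.0, "BXD9": 7.4},
    },
    "target_dataset": [
        {"trait_id": "1419792_at",  # hypothetical target trait
         "trait_sample_data": {"BXD1": 9.2, "BXD2": 8.8, "BXD5": 9.0,
                               "BXD6": 8.5, "BXD8": 9.1, "BXD9": 8.7}},
    ],
}

# <corr_method> in the route selects "pearson" or "spearman"
response = requests.post("http://localhost:8080/api/correlation/sample_r/pearson",
                         json=payload)
print(response.json())  # {"corr_results": [{"1419792_at": {"corr_coeffient": ..., ...}}]}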
diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py
index 217b7ce..56b8381 100644
--- a/gn3/api/correlation.py
+++ b/gn3/api/correlation.py
@@ -1,44 +1,63 @@
-"""Endpoints for computing correlation"""
-import time
-from flask import Blueprint
+"""Endpoints for running correlations"""
+from unittest import mock
+
 from flask import jsonify
+from flask import Blueprint
 from flask import request
-from flask import g
-from sqlalchemy import create_engine
-from default_settings import SQL_URI
-from gn3.correlation.correlation_computations import compute_correlation
+from gn3.computations.correlations import compute_all_sample_correlation
+from gn3.computations.correlations import compute_all_lit_correlation
+from gn3.computations.correlations import compute_all_tissue_correlation
+
 correlation = Blueprint("correlation", __name__)
 
 
-# xtodo implement neat db setup
-@correlation.before_request
-def connect_db():
-    """add connection to db method"""
-    print("@app.before_request connect_db")
-    db_connection = getattr(g, '_database', None)
-    if db_connection is None:
-        print("Get new database connector")
-        g.db = g._database = create_engine(SQL_URI, encoding="latin1")
+@correlation.route("/sample_r/<string:corr_method>", methods=["POST"])
+def compute_sample_r(corr_method="pearson"):
+    """correlation endpoint for computing sample r correlations\
+    api expects the trait data with has the trait and also the\
+    target_dataset data"""
+    correlation_input = request.get_json()
+
+    # xtodo move code below to compute_all_sampl correlation
+    this_trait_data = correlation_input.get("this_trait")
+    target_datasets = correlation_input.get("target_dataset")
+
+    correlation_results = compute_all_sample_correlation(corr_method=corr_method,
+                                                         this_trait=this_trait_data,
+                                                         target_dataset=target_datasets)
+
+    return jsonify({
+        "corr_results": correlation_results
+    })
 
-    g.initial_time = time.time()
 
+@correlation.route("/lit_corr/<string:species>/<int:gene_id>", methods=["POST"])
+def compute_lit_corr(species=None, gene_id=None):
+    """api endpoint for doing lit correlation.results for lit correlation\
+    are fetched from the database this is the only case where the db\
+    might be needed for actual computing of the correlation results"""
+    database_instance = mock.Mock()
+    target_traits_gene_ids = request.get_json()
 
-@correlation.route("/corr_compute", methods=["POST"])
-def corr_compute_page():
-    """api for doing correlation"""
+    lit_corr_results = compute_all_lit_correlation(
+        database_instance=database_instance, trait_lists=target_traits_gene_ids,
+        species=species, gene_id=gene_id)
 
-    correlation_input = request.json
+    return jsonify(lit_corr_results)
 
-    if correlation_input is None:
-        return jsonify({"error": str("Bad request")}), 400
 
-    try:
-        corr_results = compute_correlation(
-            correlation_input_data=correlation_input)
+@correlation.route("/tissue_corr/<string:corr_method>", methods=["POST"])
+def compute_tissue_corr(corr_method="pearson"):
+    """api endpoint fr doing tissue correlation"""
+    tissue_input_data = request.get_json()
+    primary_tissue_dict = tissue_input_data["primary_tissue"]
+    target_tissues_dict_list = tissue_input_data["target_tissues"]
 
-    except Exception as error:  # pylint: disable=broad-except
-        return jsonify({"error": str(error)})
+    results = compute_all_tissue_correlation(primary_tissue_dict=primary_tissue_dict,
+                                             target_tissues_dict_list=target_tissues_dict_list,
+                                             corr_method=corr_method)
 
-    return {"correlation_results": corr_results}
+    return jsonify(results)
\ No newline at end of file
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
new file mode 100644
index 0000000..21f5929
--- /dev/null
+++ b/gn3/computations/correlations.py
@@ -0,0 +1,305 @@
+"""module contains code for correlations"""
+from typing import List
+from typing import Tuple
+from typing import Optional
+from typing import Callable
+
+import scipy.stats  # type: ignore
+
+
+def compute_sum(rhs: int, lhs: int)-> int:
+    """initial tests to compute sum of two numbers"""
+    return rhs + lhs
+
+
+def normalize_values(a_values: List, b_values: List)->Tuple[List[float], List[float], int]:
+    """
+    Trim two lists of values to contain only the values they both share
+
+    Given two lists of sample values, trim each list so that it contains
+    only the samples that contain a value in both lists. Also returns
+    the number of such samples.
+
+    >>> normalize_values([2.3, None, None, 3.2, 4.1, 5], [3.4, 7.2, 1.3, None, 6.2, 4.1])
+    ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3)
+
+    """
+    a_new = []
+    b_new = []
+    for a_val, b_val in zip(a_values, b_values):
+        if (a_val and b_val is not None):
+            a_new.append(a_val)
+            b_new.append(b_val)
+    return a_new, b_new, len(a_new)
+
+
+def compute_corr_coeff_p_value(primary_values: List, target_values: List, corr_method: str)->\
+        Tuple[float, float]:
+    """given array like inputs calculate the primary and target_value
+    methods ->pearson,spearman and biweight mid correlation
+    return value is rho and p_value
+    """
+    corr_mapping = {
+        "bicor": do_bicor,
+        "pearson": scipy.stats.pearsonr,
+        "spearman": scipy.stats.spearmanr
+    }
+
+    use_corr_method = corr_mapping.get(corr_method, "spearman")
+
+    corr_coeffient, p_val = use_corr_method(primary_values, target_values)
+
+    return (corr_coeffient, p_val)
+
+
+def compute_sample_r_correlation(corr_method: str, trait_vals, target_samples_vals)->\
+        Optional[Tuple[float, float, int]]:
+    """Given a primary trait values and target trait values
+    calculate the correlation coeff and p value"""
+
+    sanitized_traits_vals, sanitized_target_vals,\
+        num_overlap = normalize_values(trait_vals, target_samples_vals)
+
+    if num_overlap > 5:
+
+        (corr_coeffient, p_value) =\
+            compute_corr_coeff_p_value(primary_values=sanitized_traits_vals,
+                                       target_values=sanitized_target_vals,
+                                       corr_method=corr_method)
+
+        # xtodo check if corr_coefficient is None should use numpy.isNan scipy.isNan is deprecated
+        if corr_coeffient is not None:
+            return (corr_coeffient, p_value, num_overlap)
+
+    return None
+
+
+def do_bicor(x_val, y_val) -> Tuple[float, float]:
+    """not implemented method for doing biweight mid correlation
+    use astropy stats package :not packaged in guix
+    """
+
+    return (x_val, y_val)
+
+
+def filter_shared_sample_keys(this_samplelist, target_samplelist)->Tuple[List, List]:
+    """given primary and target samplelist for two base and target\
+    trait select filter the values using the shared keys"""
+    this_vals = []
+    target_vals = []
+
+    for key, value in target_samplelist.items():
+        if key in this_samplelist:
+            target_vals.append(value)
+            this_vals.append(this_samplelist[key])
+
+    return (this_vals, target_vals)
+
+
+def compute_all_sample_correlation(this_trait, target_dataset, corr_method="pearson")->List:
+    """given a trait data samplelist and target__datasets compute all sample correlation"""
+
+    this_trait_samples = this_trait["trait_sample_data"]
+
+    corr_results = []
+
+    for target_trait in target_dataset:
+        trait_id = target_trait.get("trait_id")
+        target_trait_data = target_trait["trait_sample_data"]
+        this_vals, target_vals = filter_shared_sample_keys(
+            this_trait_samples, target_trait_data)
+
+        sample_correlation = compute_sample_r_correlation(
+            corr_method=corr_method, trait_vals=this_vals, target_samples_vals=target_vals)
+
+        if sample_correlation is not None:
+            (corr_coeffient, p_value, num_overlap) = sample_correlation
+
+        else:
+            continue
+
+        corr_result = {"corr_coeffient": corr_coeffient,
+                       "p_value": p_value,
+                       "num_overlap": num_overlap}
+
+        corr_results.append({trait_id: corr_result})
+
+    return corr_results
+
+
+def tissue_lit_corr_for_probe_type(corr_type: str, top_corr_results):
+    """function that does either lit_corr_for_trait_list or tissue_corr\
+    _for_trait list depending on whether both dataset and target_dataset are\
+    both set to probet"""
+
+    corr_results = {"lit": 1}
+
+    if corr_type not in ("lit", "literature"):
+
+        corr_results["top_corr_results"] = top_corr_results
+        # run lit_correlation for the given top_corr_results
+    if corr_type == "tissue":
+        # run lit correlation the given top corr results
+        pass
+    if corr_type == "sample":
+        pass
+        # run sample r correlation for the given top results
+
+    return corr_results
+
+
+def tissue_correlation_for_trait_list(primary_tissue_vals: List,
+                                      target_tissues_values: List,
+                                      corr_method: str,
+                                      compute_corr_p_value: Callable =
+                                      compute_corr_coeff_p_value)->dict:
+    """given a primary tissue values for a trait and the target tissues values\
+    compute the correlation_cooeff and p value the input required are arrays\
+    output - > List containing Dicts with corr_coefficient value,P_value and\
+    also the tissue numbers is len(primary) == len(target)"""
+
+    # ax :todo assertion that lenggth one one target tissue ==primary_tissue
+
+    (tissue_corr_coeffient, p_value) = compute_corr_p_value(
+        primary_values=primary_tissue_vals,
+        target_values=target_tissues_values,
+        corr_method=corr_method)
+
+    lit_corr_result = {
+        "tissue_corr": tissue_corr_coeffient,
+        "p_value": p_value,
+        "tissue_number": len(primary_tissue_vals)
+    }
+
+    return lit_corr_result
+
+
+def fetch_lit_correlation_data(database,
+                               input_mouse_gene_id: Optional[str],
+                               gene_id: str,
+                               mouse_gene_id: Optional[str] = None)->Tuple[str, float]:
+    """given input trait mouse gene id and mouse gene id fetch the lit\
+    corr_data"""
+    if mouse_gene_id is not None and ";" not in mouse_gene_id:
+        query = """
+        SELECT VALUE
+        FROM LCorrRamin3
+        WHERE GeneId1='%s' and
+        GeneId2='%s'
+        """
+
+        query_values = (str(mouse_gene_id), str(input_mouse_gene_id))
+
+        results = database.execute(
+            query_formatter(query, *query_values)).fetchone()
+
+        lit_corr_results = results if results is not None else database.execute(
+            query_formatter(query, *tuple(reversed(query_values)))).fetchone()
+
+        lit_results = (gene_id, lit_corr_results.val)\
+            if lit_corr_results else (gene_id, 0)
+        return lit_results
+
+    return (gene_id, 0)
+
+
+def lit_correlation_for_trait_list(database,
+                                   target_trait_lists: List,
+                                   species: Optional[str] = None,
+                                   trait_gene_id: Optional[str] = None)->List:
+    """given species,base trait gene id fetch the lit corr results from the db\
+    output is float for lit corr results """
+    fetched_lit_corr_results = []
+
+    this_trait_mouse_gene_id = map_to_mouse_gene_id(
+        database=database, species=species, gene_id=trait_gene_id)
+
+    for trait in target_trait_lists:
+        target_trait_gene_id = trait.get("gene_id")
+        if target_trait_gene_id:
+            target_mouse_gene_id = map_to_mouse_gene_id(
+                database=database, species=species, gene_id=target_trait_gene_id)
+
+            fetched_corr_data = fetch_lit_correlation_data(
+                database=database, input_mouse_gene_id=this_trait_mouse_gene_id,
+                gene_id=target_trait_gene_id, mouse_gene_id=target_mouse_gene_id)
+
+            dict_results = dict(
+                zip(("gene_id", "lit_corr"), fetched_corr_data))
+            fetched_lit_corr_results.append(dict_results)
+
+    return fetched_lit_corr_results
+
+
+def query_formatter(query_string: str, *query_values):
+    """formatter query string given the unformatted query string\
+    and the respectibe values.Assumes number of placeholders is
+    equal to the number of query values """
+    results = query_string % (query_values)
+
+    return results
+
+
+def map_to_mouse_gene_id(database, species: Optional[str], gene_id: Optional[str])->Optional[str]:
+    """given a species which is not mouse map the gene_id\
+    to respective mouse gene id"""
+    # AK:xtodo move the code for checking nullity out of thing functions bug while\
+    # method for string
+    if None in (species, gene_id):
+        return None
+    if species == "mouse":
+        return gene_id
+
+    query = """SELECT mouse
+        FROM GeneIDXRef
+        WHERE '%s' = '%s'"""
+
+    query_values = (species, gene_id)
+
+    results = database.execute(
+        query_formatter(query, *query_values)).fetchone()
+
+    mouse_gene_id = results.mouse if results is not None else None
+
+    return mouse_gene_id
+
+
+def compute_all_lit_correlation(database_instance, trait_lists: List, species: str, gene_id):
+    """function that acts as an abstraction for lit_correlation_for_trait_list"""
+    # xtodo to be refactored
+
+    lit_results = lit_correlation_for_trait_list(database=database_instance,
+                                                 target_trait_lists=trait_lists,
+                                                 species=species,
+                                                 trait_gene_id=gene_id
+                                                 )
+
+    return {
+        "lit_results": lit_results
+    }
+
+
+def compute_all_tissue_correlation(primary_tissue_dict: dict,
+                                   target_tissues_dict_list: List,
+                                   corr_method: str):
+    """function acts as an abstraction for tissue_correlation_for_trait_list\
+    required input are target tissue object and primary tissue trait """
+
+    tissues_results = {}
+
+    primary_tissue_vals = primary_tissue_dict["tissue_values"]
+
+    target_tissues_list = target_tissues_dict_list
+
+    for target_tissue_obj in target_tissues_list:
+        trait_id = target_tissue_obj.get("trait_id")
+
+        target_tissue_vals = target_tissue_obj.get("tissue_values")
+
+        tissue_result = tissue_correlation_for_trait_list(primary_tissue_vals=primary_tissue_vals,
+                                                          target_tissues_values=target_tissue_vals,
+                                                          corr_method=corr_method)
+
+        tissues_results[trait_id] = tissue_result
+
+    return tissues_results
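The tissue correlation helper added in gn3/computations/correlations.py can also be called directly, without going through Flask. A minimal sketch, assuming the module is importable as shown above; the trait id and tissue values below are made up for illustration:

from gn3.computations.correlations import compute_all_tissue_correlation

# one primary tissue-expression vector and a single hypothetical target trait
primary_tissue = {"tissue_values": [7.1, 8.3, 6.9, 7.8, 8.0]}
target_tissues = [{"trait_id": "1419792_at",  # hypothetical trait id
                   "tissue_values": [9.2, 8.8, 9.0, 8.5, 9.1]}]

results = compute_all_tissue_correlation(primary_tissue_dict=primary_tissue,
                                         target_tissues_dict_list=target_tissues,
                                         corr_method="spearman")
# maps each target trait id to its tissue_corr, p_value and tissue_number
print(results)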