author     Alexander Kabui   2021-03-16 10:36:58 +0300
committer  GitHub            2021-03-16 10:36:58 +0300
commit     43d1bb7f6cd2b5890d5b3eb7c357caafda25a35c (patch)
tree       73683272f32cffc860497a93b5c844c272252e67 /gn3
parent     995f1dbd081eb64ad177f929615a4edee01cb68f (diff)
download   genenetwork3-43d1bb7f6cd2b5890d5b3eb7c357caafda25a35c.tar.gz
Refactor/clean up correlations (#4)
* initial commit for Refactor/clean-up-correlation
* add python scipy dependency
* initial commit for sample correlation
* initial commit for sample correlation endpoint
* initial commit for integration and unittest
* initial commit for registering correlation blueprint
* add and modify unittest and integration tests for correlation
* Add compute_all_sample_corr method for correlation
* add scipy to requirements txt file
* add tissue correlation for trait list
* add unittest for tissue correlation
* add lit correlation for trait list
* add unittests for lit correlation for trait list
* modify lit correlation for trait list
* add unittests for lit correlation for trait list
* add correlation method in dynamic url
* add file describing the expected input structure for sample correlation
* modify input data structure -> add trait id
* update tests for sample r correlation
* add compute all lit correlation method
* add endpoint for computing lit_corr
* add unit and integration tests for computing lit corr
* add /api/correlation/tissue_corr/{corr_method} endpoint for tissue correlation
* add unittest and integration tests for tissue correlation
Co-authored-by: BonfaceKilz <bonfacemunyoki@gmail.com>
Diffstat (limited to 'gn3')
-rw-r--r--  gn3/api/correlation.py            |  77
-rw-r--r--  gn3/computations/correlations.py  | 305
2 files changed, 353 insertions, 29 deletions
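Before the diff itself, a minimal sketch of how the new sample_r endpoint could be exercised. The payload keys (this_trait, target_dataset, trait_sample_data, trait_id) come from the code added below, but the host, port and /api/correlation URL prefix are assumptions (the blueprint registration is not part of this change), and the strain names and values are invented. Note that the new code only reports a correlation when more than five samples overlap.

import requests  # assumed HTTP client for this sketch

payload = {
    "this_trait": {
        "trait_id": "1455376_at",  # hypothetical primary trait
        "trait_sample_data": {"BXD1": 7.1, "BXD2": 8.3, "BXD5": 6.9,
                              "BXD6": 7.8, "BXD8": 8.0, "BXD9": 7.4},
    },
    "target_dataset": [
        {"trait_id": "1419792_at",  # hypothetical target trait
         "trait_sample_data": {"BXD1": 9.2, "BXD2": 8.8, "BXD5": 9.0,
                               "BXD6": 8.5, "BXD8": 9.1, "BXD9": 8.7}},
    ],
}

# <corr_method> in the route selects "pearson" or "spearman"
response = requests.post("http://localhost:8080/api/correlation/sample_r/pearson",
                         json=payload)
print(response.json())  # {"corr_results": [{"1419792_at": {"corr_coeffient": ..., ...}}]}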
diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py
index 217b7ce..56b8381 100644
--- a/gn3/api/correlation.py
+++ b/gn3/api/correlation.py
@@ -1,44 +1,63 @@
-"""Endpoints for computing correlation"""
-import time
-from flask import Blueprint
+"""Endpoints for running correlations"""
+from unittest import mock
+
 from flask import jsonify
+from flask import Blueprint
 from flask import request
-from flask import g
-from sqlalchemy import create_engine
-from default_settings import SQL_URI
-from gn3.correlation.correlation_computations import compute_correlation
+from gn3.computations.correlations import compute_all_sample_correlation
+from gn3.computations.correlations import compute_all_lit_correlation
+from gn3.computations.correlations import compute_all_tissue_correlation
+
 correlation = Blueprint("correlation", __name__)
 
 
-# xtodo implement neat db setup
-@correlation.before_request
-def connect_db():
-    """add connection to db method"""
-    print("@app.before_request connect_db")
-    db_connection = getattr(g, '_database', None)
-    if db_connection is None:
-        print("Get new database connector")
-        g.db = g._database = create_engine(SQL_URI, encoding="latin1")
+@correlation.route("/sample_r/<string:corr_method>", methods=["POST"])
+def compute_sample_r(corr_method="pearson"):
+    """correlation endpoint for computing sample r correlations\
+    api expects the trait data with has the trait and also the\
+    target_dataset data"""
+    correlation_input = request.get_json()
+
+    # xtodo move code below to compute_all_sampl correlation
+    this_trait_data = correlation_input.get("this_trait")
+    target_datasets = correlation_input.get("target_dataset")
+
+    correlation_results = compute_all_sample_correlation(corr_method=corr_method,
+                                                         this_trait=this_trait_data,
+                                                         target_dataset=target_datasets)
+
+    return jsonify({
+        "corr_results": correlation_results
+    })
 
-    g.initial_time = time.time()
 
+@correlation.route("/lit_corr/<string:species>/<int:gene_id>", methods=["POST"])
+def compute_lit_corr(species=None, gene_id=None):
+    """api endpoint for doing lit correlation.results for lit correlation\
+    are fetched from the database this is the only case where the db\
+    might be needed for actual computing of the correlation results"""
+    database_instance = mock.Mock()
+    target_traits_gene_ids = request.get_json()
 
-@correlation.route("/corr_compute", methods=["POST"])
-def corr_compute_page():
-    """api for doing correlation"""
+    lit_corr_results = compute_all_lit_correlation(
+        database_instance=database_instance, trait_lists=target_traits_gene_ids,
+        species=species, gene_id=gene_id)
 
-    correlation_input = request.json
+    return jsonify(lit_corr_results)
 
-    if correlation_input is None:
-        return jsonify({"error": str("Bad request")}), 400
 
-    try:
-        corr_results = compute_correlation(
-            correlation_input_data=correlation_input)
+@correlation.route("/tissue_corr/<string:corr_method>", methods=["POST"])
+def compute_tissue_corr(corr_method="pearson"):
+    """api endpoint fr doing tissue correlation"""
+    tissue_input_data = request.get_json()
+    primary_tissue_dict = tissue_input_data["primary_tissue"]
+    target_tissues_dict_list = tissue_input_data["target_tissues"]
 
-    except Exception as error:  # pylint: disable=broad-except
-        return jsonify({"error": str(error)})
+    results = compute_all_tissue_correlation(primary_tissue_dict=primary_tissue_dict,
+                                             target_tissues_dict_list=target_tissues_dict_list,
+                                             corr_method=corr_method)
 
-    return {"correlation_results": corr_results}
+    return jsonify(results)
\ No newline at end of file
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
new file mode 100644
index 0000000..21f5929
--- /dev/null
+++ b/gn3/computations/correlations.py
@@ -0,0 +1,305 @@
+"""module contains code for correlations"""
+from typing import List
+from typing import Tuple
+from typing import Optional
+from typing import Callable
+
+import scipy.stats  # type: ignore
+
+
+def compute_sum(rhs: int, lhs: int)-> int:
+    """initial tests to compute sum of two numbers"""
+    return rhs + lhs
+
+
+def normalize_values(a_values: List, b_values: List)->Tuple[List[float], List[float], int]:
+    """
+    Trim two lists of values to contain only the values they both share
+
+    Given two lists of sample values, trim each list so that it contains
+    only the samples that contain a value in both lists. Also returns
+    the number of such samples.
+
+    >>> normalize_values([2.3, None, None, 3.2, 4.1, 5], [3.4, 7.2, 1.3, None, 6.2, 4.1])
+    ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3)
+
+    """
+    a_new = []
+    b_new = []
+    for a_val, b_val in zip(a_values, b_values):
+        if (a_val and b_val is not None):
+            a_new.append(a_val)
+            b_new.append(b_val)
+    return a_new, b_new, len(a_new)
+
+
+def compute_corr_coeff_p_value(primary_values: List, target_values: List, corr_method: str)->\
+        Tuple[float, float]:
+    """given array like inputs calculate the primary and target_value
+    methods ->pearson,spearman and biweight mid correlation
+    return value is rho and p_value
+    """
+    corr_mapping = {
+        "bicor": do_bicor,
+        "pearson": scipy.stats.pearsonr,
+        "spearman": scipy.stats.spearmanr
+    }
+
+    use_corr_method = corr_mapping.get(corr_method, "spearman")
+
+    corr_coeffient, p_val = use_corr_method(primary_values, target_values)
+
+    return (corr_coeffient, p_val)
+
+
+def compute_sample_r_correlation(corr_method: str, trait_vals, target_samples_vals)->\
+        Optional[Tuple[float, float, int]]:
+    """Given a primary trait values and target trait values
+    calculate the correlation coeff and p value"""
+
+    sanitized_traits_vals, sanitized_target_vals,\
+        num_overlap = normalize_values(trait_vals, target_samples_vals)
+
+    if num_overlap > 5:
+
+        (corr_coeffient, p_value) =\
+            compute_corr_coeff_p_value(primary_values=sanitized_traits_vals,
+                                       target_values=sanitized_target_vals,
+                                       corr_method=corr_method)
+
+        # xtodo check if corr_coefficient is None should use numpy.isNan scipy.isNan is deprecated
+        if corr_coeffient is not None:
+            return (corr_coeffient, p_value, num_overlap)
+
+    return None
+
+
+def do_bicor(x_val, y_val) -> Tuple[float, float]:
+    """not implemented method for doing biweight mid correlation
+    use astropy stats package :not packaged in guix
+    """
+
+    return (x_val, y_val)
+
+
+def filter_shared_sample_keys(this_samplelist, target_samplelist)->Tuple[List, List]:
+    """given primary and target samplelist for two base and target\
+    trait select filter the values using the shared keys"""
+    this_vals = []
+    target_vals = []
+
+    for key, value in target_samplelist.items():
+        if key in this_samplelist:
+            target_vals.append(value)
+            this_vals.append(this_samplelist[key])
+
+    return (this_vals, target_vals)
+
+
+def compute_all_sample_correlation(this_trait, target_dataset, corr_method="pearson")->List:
+    """given a trait data samplelist and target__datasets compute all sample correlation"""
+
+    this_trait_samples = this_trait["trait_sample_data"]
+
+    corr_results = []
+
+    for target_trait in target_dataset:
+        trait_id = target_trait.get("trait_id")
+        target_trait_data = target_trait["trait_sample_data"]
+        this_vals, target_vals = filter_shared_sample_keys(
+            this_trait_samples, target_trait_data)
+
+        sample_correlation = compute_sample_r_correlation(
+            corr_method=corr_method, trait_vals=this_vals, target_samples_vals=target_vals)
+
+        if sample_correlation is not None:
+            (corr_coeffient, p_value, num_overlap) = sample_correlation
+
+        else:
+            continue
+
+        corr_result = {"corr_coeffient": corr_coeffient,
+                       "p_value": p_value,
+                       "num_overlap": num_overlap}
+
+        corr_results.append({trait_id: corr_result})
+
+    return corr_results
+
+
+def tissue_lit_corr_for_probe_type(corr_type: str, top_corr_results):
+    """function that does either lit_corr_for_trait_list or tissue_corr\
+    _for_trait list depending on whether both dataset and target_dataset are\
+    both set to probet"""
+
+    corr_results = {"lit": 1}
+
+    if corr_type not in ("lit", "literature"):
+
+        corr_results["top_corr_results"] = top_corr_results
+        # run lit_correlation for the given top_corr_results
+    if corr_type == "tissue":
+        # run lit correlation the given top corr results
+        pass
+    if corr_type == "sample":
+        pass
+        # run sample r correlation for the given top results
+
+    return corr_results
+
+
+def tissue_correlation_for_trait_list(primary_tissue_vals: List,
+                                      target_tissues_values: List,
+                                      corr_method: str,
+                                      compute_corr_p_value: Callable =
+                                      compute_corr_coeff_p_value)->dict:
+    """given a primary tissue values for a trait and the target tissues values\
+    compute the correlation_cooeff and p value the input required are arrays\
+    output - > List containing Dicts with corr_coefficient value,P_value and\
+    also the tissue numbers is len(primary) == len(target)"""
+
+    # ax :todo assertion that lenggth one one target tissue ==primary_tissue
+
+    (tissue_corr_coeffient, p_value) = compute_corr_p_value(
+        primary_values=primary_tissue_vals,
+        target_values=target_tissues_values,
+        corr_method=corr_method)
+
+    lit_corr_result = {
+        "tissue_corr": tissue_corr_coeffient,
+        "p_value": p_value,
+        "tissue_number": len(primary_tissue_vals)
+    }
+
+    return lit_corr_result
+
+
+def fetch_lit_correlation_data(database,
+                               input_mouse_gene_id: Optional[str],
+                               gene_id: str,
+                               mouse_gene_id: Optional[str] = None)->Tuple[str, float]:
+    """given input trait mouse gene id and mouse gene id fetch the lit\
+    corr_data"""
+    if mouse_gene_id is not None and ";" not in mouse_gene_id:
+        query = """
+        SELECT VALUE
+        FROM LCorrRamin3
+        WHERE GeneId1='%s' and
+        GeneId2='%s'
+        """
+
+        query_values = (str(mouse_gene_id), str(input_mouse_gene_id))
+
+        results = database.execute(
+            query_formatter(query, *query_values)).fetchone()
+
+        lit_corr_results = results if results is not None else database.execute(
+            query_formatter(query, *tuple(reversed(query_values)))).fetchone()
+
+        lit_results = (gene_id, lit_corr_results.val)\
+            if lit_corr_results else (gene_id, 0)
+        return lit_results
+
+    return (gene_id, 0)
+
+
+def lit_correlation_for_trait_list(database,
+                                   target_trait_lists: List,
+                                   species: Optional[str] = None,
+                                   trait_gene_id: Optional[str] = None)->List:
+    """given species,base trait gene id fetch the lit corr results from the db\
+    output is float for lit corr results """
+    fetched_lit_corr_results = []
+
+    this_trait_mouse_gene_id = map_to_mouse_gene_id(
+        database=database, species=species, gene_id=trait_gene_id)
+
+    for trait in target_trait_lists:
+        target_trait_gene_id = trait.get("gene_id")
+        if target_trait_gene_id:
+            target_mouse_gene_id = map_to_mouse_gene_id(
+                database=database, species=species, gene_id=target_trait_gene_id)
+
+            fetched_corr_data = fetch_lit_correlation_data(
+                database=database, input_mouse_gene_id=this_trait_mouse_gene_id,
+                gene_id=target_trait_gene_id, mouse_gene_id=target_mouse_gene_id)
+
+            dict_results = dict(
+                zip(("gene_id", "lit_corr"), fetched_corr_data))
+            fetched_lit_corr_results.append(dict_results)
+
+    return fetched_lit_corr_results
+
+
+def query_formatter(query_string: str, *query_values):
+    """formatter query string given the unformatted query string\
+    and the respectibe values.Assumes number of placeholders is
+    equal to the number of query values """
+    results = query_string % (query_values)
+
+    return results
+
+
+def map_to_mouse_gene_id(database, species: Optional[str], gene_id: Optional[str])->Optional[str]:
+    """given a species which is not mouse map the gene_id\
+    to respective mouse gene id"""
+    # AK:xtodo move the code for checking nullity out of thing functions bug while\
+    # method for string
+    if None in (species, gene_id):
+        return None
+    if species == "mouse":
+        return gene_id
+
+    query = """SELECT mouse
+        FROM GeneIDXRef
+        WHERE '%s' = '%s'"""
+
+    query_values = (species, gene_id)
+
+    results = database.execute(
+        query_formatter(query, *query_values)).fetchone()
+
+    mouse_gene_id = results.mouse if results is not None else None
+
+    return mouse_gene_id
+
+
+def compute_all_lit_correlation(database_instance, trait_lists: List, species: str, gene_id):
+    """function that acts as an abstraction for lit_correlation_for_trait_list"""
+    # xtodo to be refactored
+
+    lit_results = lit_correlation_for_trait_list(database=database_instance,
+                                                 target_trait_lists=trait_lists,
+                                                 species=species,
+                                                 trait_gene_id=gene_id
+                                                 )
+
+    return {
+        "lit_results": lit_results
+    }
+
+
+def compute_all_tissue_correlation(primary_tissue_dict: dict,
+                                   target_tissues_dict_list: List,
+                                   corr_method: str):
+    """function acts as an abstraction for tissue_correlation_for_trait_list\
+    required input are target tissue object and primary tissue trait """
+
+    tissues_results = {}
+
+    primary_tissue_vals = primary_tissue_dict["tissue_values"]
+
+    target_tissues_list = target_tissues_dict_list
+
+    for target_tissue_obj in target_tissues_list:
+        trait_id = target_tissue_obj.get("trait_id")
+
+        target_tissue_vals = target_tissue_obj.get("tissue_values")
+
+        tissue_result = tissue_correlation_for_trait_list(primary_tissue_vals=primary_tissue_vals,
+                                                          target_tissues_values=target_tissue_vals,
+                                                          corr_method=corr_method)
+
+        tissues_results[trait_id] = tissue_result
+
+    return tissues_results
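The tissue correlation helper added in gn3/computations/correlations.py can also be called directly, without going through Flask. A minimal sketch, assuming the module is importable as shown above; the trait id and tissue values below are made up for illustration:

from gn3.computations.correlations import compute_all_tissue_correlation

# one primary tissue-expression vector and a single hypothetical target trait
primary_tissue = {"tissue_values": [7.1, 8.3, 6.9, 7.8, 8.0]}
target_tissues = [{"trait_id": "1419792_at",  # hypothetical trait id
                   "tissue_values": [9.2, 8.8, 9.0, 8.5, 9.1]}]

results = compute_all_tissue_correlation(primary_tissue_dict=primary_tissue,
                                         target_tissues_dict_list=target_tissues,
                                         corr_method="spearman")
# maps each target trait id to its tissue_corr, p_value and tissue_number
print(results)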