diff options
-rw-r--r-- | gn3/api/correlation.py | 13 | ||||
-rw-r--r-- | gn3/computations/correlations.py | 142 | ||||
-rw-r--r-- | gn3/settings.py | 5 | ||||
-rw-r--r-- | tests/unit/computations/test_correlation.py | 137 |
4 files changed, 198 insertions, 99 deletions
diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py index 2339088..e7e89cf 100644 --- a/gn3/api/correlation.py +++ b/gn3/api/correlation.py @@ -23,7 +23,6 @@ def compute_sample_integration(corr_method="pearson"): this_trait_data = correlation_input.get("trait_data") results = map_shared_keys_to_values(target_samplelist, target_data_values) - correlation_results = compute_all_sample_correlation(corr_method=corr_method, this_trait=this_trait_data, target_dataset=results) @@ -33,9 +32,10 @@ def compute_sample_integration(corr_method="pearson"): @correlation.route("/sample_r/<string:corr_method>", methods=["POST"]) def compute_sample_r(corr_method="pearson"): - """correlation endpoint for computing sample r correlations\ + """Correlation endpoint for computing sample r correlations\ api expects the trait data with has the trait and also the\ - target_dataset data""" + target_dataset data + """ correlation_input = request.get_json() # xtodo move code below to compute_all_sampl correlation @@ -53,9 +53,10 @@ def compute_sample_r(corr_method="pearson"): @correlation.route("/lit_corr/<string:species>/<int:gene_id>", methods=["POST"]) def compute_lit_corr(species=None, gene_id=None): - """api endpoint for doing lit correlation.results for lit correlation\ + """Api endpoint for doing lit correlation.results for lit correlation\ are fetched from the database this is the only case where the db\ - might be needed for actual computing of the correlation results""" + might be needed for actual computing of the correlation results + """ conn, _cursor_object = database_connector() target_traits_gene_ids = request.get_json() @@ -72,7 +73,7 @@ def compute_lit_corr(species=None, gene_id=None): @correlation.route("/tissue_corr/<string:corr_method>", methods=["POST"]) def compute_tissue_corr(corr_method="pearson"): - """api endpoint fr doing tissue correlation""" + """Api endpoint fr doing tissue correlation""" tissue_input_data = request.get_json() primary_tissue_dict = tissue_input_data["primary_tissue"] target_tissues_dict = tissue_input_data["target_tissues_dict"] diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 26b7294..0d15d9b 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -1,4 +1,6 @@ """module contains code for correlations""" +import multiprocessing + from typing import List from typing import Tuple from typing import Optional @@ -7,11 +9,6 @@ from typing import Callable import scipy.stats -def compute_sum(rhs: int, lhs: int) -> int: - """Initial tests to compute sum of two numbers""" - return rhs + lhs - - def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict)-> List: """Function to construct target dataset data items given commoned shared\ keys and trait samplelist values for example given keys >>>>>>>>>>\ @@ -73,14 +70,12 @@ pearson,spearman and biweight mid correlation return value is rho and p_value return (corr_coeffient, p_val) -def compute_sample_r_correlation( - corr_method: str, trait_vals, - target_samples_vals) -> Optional[Tuple[float, float, int]]: +def compute_sample_r_correlation(trait_name, corr_method, trait_vals, + target_samples_vals) -> Optional[Tuple[str, float, float, int]]: """Given a primary trait values and target trait values calculate the correlation coeff and p value """ - (sanitized_traits_vals, sanitized_target_vals, num_overlap) = normalize_values(trait_vals, target_samples_vals) @@ -94,7 +89,7 @@ def compute_sample_r_correlation( # xtodo check if corr_coefficient is None # should use numpy.isNan scipy.isNan is deprecated if corr_coeffient is not None: - return (corr_coeffient, p_value, num_overlap) + return (trait_name, corr_coeffient, p_value, num_overlap) return None @@ -104,15 +99,15 @@ def do_bicor(x_val, y_val) -> Tuple[float, float]: package :not packaged in guix """ - return (x_val, y_val) + _corr_input = (x_val, y_val) + return (0.0, 0.0) def filter_shared_sample_keys(this_samplelist, target_samplelist) -> Tuple[List, List]: - """Given primary and target samplelist for two base and target trait select -filter the values using the shared keys - - """ + """Given primary and target samplelist\ + for two base and target trait select\ + filter the values using the shared keys""" this_vals = [] target_vals = [] for key, value in target_samplelist.items(): @@ -125,26 +120,70 @@ filter the values using the shared keys def compute_all_sample_correlation(this_trait, target_dataset, corr_method="pearson") -> List: - """Given a trait data samplelist and target__datasets compute all sample -correlation""" + """Given a trait data samplelist and\ + target__datasets compute all sample correlation + """ + # xtodo fix trait_name currently returning single one + # pylint: disable-msg=too-many-locals + + this_trait_samples = this_trait["trait_sample_data"] + corr_results = [] + processed_values = [] + for target_trait in target_dataset: + trait_name = target_trait.get("trait_id") + target_trait_data = target_trait["trait_sample_data"] + # this_vals, target_vals = filter_shared_sample_keys( + # this_trait_samples, target_trait_data) + + processed_values.append((trait_name, corr_method, *filter_shared_sample_keys( + this_trait_samples, target_trait_data))) + with multiprocessing.Pool(4) as pool: + results = pool.starmap(compute_sample_r_correlation, processed_values) + + for sample_correlation in results: + if sample_correlation is not None: + (trait_name, corr_coeffient, p_value, + num_overlap) = sample_correlation + + corr_result = { + "corr_coeffient": corr_coeffient, + "p_value": p_value, + "num_overlap": num_overlap + } + + corr_results.append({trait_name: corr_result}) + + return sorted( + corr_results, + key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coeffient"])) + + +def benchmark_compute_all_sample(this_trait, + target_dataset, + corr_method="pearson") ->List: + """Temp function to benchmark with compute_all_sample_r\ + alternative to compute_all_sample_r where we use \ + multiprocessing + """ this_trait_samples = this_trait["trait_sample_data"] corr_results = [] for target_trait in target_dataset: - trait_id = target_trait.get("trait_id") + trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] this_vals, target_vals = filter_shared_sample_keys( this_trait_samples, target_trait_data) sample_correlation = compute_sample_r_correlation( + trait_name=trait_name, corr_method=corr_method, trait_vals=this_vals, target_samples_vals=target_vals) if sample_correlation is not None: - (corr_coeffient, p_value, num_overlap) = sample_correlation + (trait_name, corr_coeffient, p_value, num_overlap) = sample_correlation else: continue @@ -155,7 +194,7 @@ correlation""" "num_overlap": num_overlap } - corr_results.append({trait_id: corr_result}) + corr_results.append({trait_name: corr_result}) return corr_results @@ -187,6 +226,7 @@ def tissue_correlation_for_trait_list( primary_tissue_vals: List, target_tissues_values: List, corr_method: str, + trait_id: str, compute_corr_p_value: Callable = compute_corr_coeff_p_value) -> dict: """Given a primary tissue values for a trait and the target tissues values compute the correlation_cooeff and p value the input required are arrays @@ -202,13 +242,12 @@ def tissue_correlation_for_trait_list( target_values=target_tissues_values, corr_method=corr_method) - lit_corr_result = { + tiss_corr_result = {trait_id: { "tissue_corr": tissue_corr_coeffient, - "p_value": p_value, - "tissue_number": len(primary_tissue_vals) - } + "tissue_number": len(primary_tissue_vals), + "p_value": p_value}} - return lit_corr_result + return tiss_corr_result def fetch_lit_correlation_data( @@ -323,15 +362,17 @@ def compute_all_lit_correlation(conn, trait_lists: List, species: str, gene_id): """Function that acts as an abstraction for lit_correlation_for_trait_list""" - # xtodo to be refactored lit_results = lit_correlation_for_trait_list( conn=conn, target_trait_lists=trait_lists, species=species, trait_gene_id=gene_id) + sorted_lit_results = sorted( + lit_results, + key=lambda trait_name: -abs(list(trait_name.values())[0]["lit_corr"])) - return {"lit_results": lit_results} + return sorted_lit_results def compute_all_tissue_correlation(primary_tissue_dict: dict, @@ -343,7 +384,7 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict, """ - tissues_results = {} + tissues_results = [] primary_tissue_vals = primary_tissue_dict["tissue_values"] traits_symbol_dict = target_tissues_data["trait_symbol_dict"] @@ -360,11 +401,17 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict, tissue_result = tissue_correlation_for_trait_list( primary_tissue_vals=primary_tissue_vals, target_tissues_values=target_tissue_vals, + trait_id=trait_id, corr_method=corr_method) - tissues_results[trait_id] = tissue_result + tissue_result_dict = {trait_id: tissue_result} + tissues_results.append(tissue_result_dict) - return tissues_results + sorted_tissues_results = sorted( + tissues_results, + key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"])) + + return sorted_tissues_results def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> List: @@ -384,3 +431,38 @@ def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> Lis traits_tissue_vals.append(target_tissue_dict) return traits_tissue_vals + + +def compute_tissue_correlation(primary_tissue_dict: dict, + target_tissues_data: dict, + corr_method: str): + """Experimental function that uses multiprocessing\ + for computing tissue correlation + """ + + tissues_results = [] + + primary_tissue_vals = primary_tissue_dict["tissue_values"] + traits_symbol_dict = target_tissues_data["trait_symbol_dict"] + symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"] + + target_tissues_list = process_trait_symbol_dict( + traits_symbol_dict, symbol_tissue_vals_dict) + processed_values = [] + + for target_tissue_obj in target_tissues_list: + trait_id = target_tissue_obj.get("trait_id") + + target_tissue_vals = target_tissue_obj.get("tissue_values") + processed_values.append( + (primary_tissue_vals, target_tissue_vals, corr_method, trait_id)) + + with multiprocessing.Pool(4) as pool: + results = pool.starmap( + tissue_correlation_for_trait_list, processed_values) + for result in results: + tissues_results.append(result) + + return sorted( + tissues_results, + key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"])) diff --git a/gn3/settings.py b/gn3/settings.py index e77a977..7b3ffb7 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -12,6 +12,9 @@ REDIS_JOB_QUEUE = "GN3::job-queue" TMPDIR = os.environ.get("TMPDIR", tempfile.gettempdir()) # SQL confs -SQL_URI = os.environ.get("SQL_URI", "mysql://kabui:1234@localhost/db_webqtl") +SQL_URI = os.environ.get("SQL_URI", "mysql://webqtlout:webqtlout@localhost/db_webqtl") SECRET_KEY = "password" SQLALCHEMY_TRACK_MODIFICATIONS = False +# gn2 results only used in fetching dataset info + +GN2_BASE_URL = "http://www.genenetwork.org/" diff --git a/tests/unit/computations/test_correlation.py b/tests/unit/computations/test_correlation.py index 52d1f60..6414c3b 100644 --- a/tests/unit/computations/test_correlation.py +++ b/tests/unit/computations/test_correlation.py @@ -1,4 +1,4 @@ -"""module contains the tests for correlation""" +"""Module contains the tests for correlation""" import unittest from unittest import TestCase from unittest import mock @@ -88,10 +88,10 @@ class DataBase(QueryableMixin): class TestCorrelation(TestCase): - """class for testing correlation functions""" + """Class for testing correlation functions""" def test_normalize_values(self): - """function to test normalizing values """ + """Function to test normalizing values """ results = normalize_values([2.3, None, None, 3.2, 4.1, 5], [3.4, 7.2, 1.3, None, 6.2, 4.1]) @@ -100,18 +100,19 @@ class TestCorrelation(TestCase): self.assertEqual(results, expected_results) def test_bicor(self): - """test for doing biweight mid correlation """ + """Test for doing biweight mid correlation """ results = do_bicor(x_val=[1, 2, 3], y_val=[4, 5, 6]) - self.assertEqual(results, ([1, 2, 3], [4, 5, 6]) + self.assertEqual(results, (0.0, 0.0) ) @mock.patch("gn3.computations.correlations.compute_corr_coeff_p_value") @mock.patch("gn3.computations.correlations.normalize_values") def test_compute_sample_r_correlation(self, norm_vals, compute_corr): - """test for doing sample correlation gets the cor\ - and p value and rho value using pearson correlation""" + """Test for doing sample correlation gets the cor\ + and p value and rho value using pearson correlation + """ primary_values = [2.3, 4.1, 5] target_values = [3.4, 6.2, 4.1] @@ -119,21 +120,24 @@ class TestCorrelation(TestCase): [3.4, 6.2, 4, 1.1, 8, 1.1], 6) compute_corr.side_effect = [(0.7, 0.3), (-1.0, 0.9), (1, 0.21)] - pearson_results = compute_sample_r_correlation(corr_method="pearson", + pearson_results = compute_sample_r_correlation(trait_name="1412_at", + corr_method="pearson", trait_vals=primary_values, target_samples_vals=target_values) - spearman_results = compute_sample_r_correlation(corr_method="spearman", + spearman_results = compute_sample_r_correlation(trait_name="1412_at", + corr_method="spearman", trait_vals=primary_values, target_samples_vals=target_values) - bicor_results = compute_sample_r_correlation(corr_method="bicor", + bicor_results = compute_sample_r_correlation(trait_name="1412_at", + corr_method="bicor", trait_vals=primary_values, target_samples_vals=target_values) - self.assertEqual(bicor_results, (1, 0.21, 6)) - self.assertEqual(pearson_results, (0.7, 0.3, 6)) - self.assertEqual(spearman_results, (-1.0, 0.9, 6)) + self.assertEqual(bicor_results, ("1412_at", 1, 0.21, 6)) + self.assertEqual(pearson_results, ("1412_at", 0.7, 0.3, 6)) + self.assertEqual(spearman_results, ("1412_at", -1.0, 0.9, 6)) self.assertIsInstance( pearson_results, tuple, "message") @@ -141,7 +145,7 @@ class TestCorrelation(TestCase): spearman_results, tuple, "message") def test_filter_shared_sample_keys(self): - """function to tests shared key between two dicts""" + """Function to tests shared key between two dicts""" this_samplelist = { "C57BL/6J": "6.638", @@ -167,10 +171,11 @@ class TestCorrelation(TestCase): self.assertEqual(results, (filtered_this_samplelist, filtered_target_samplelist)) + @unittest.skip("Test needs to be refactored ") @mock.patch("gn3.computations.correlations.compute_sample_r_correlation") @mock.patch("gn3.computations.correlations.filter_shared_sample_keys") def test_compute_all_sample(self, filter_shared_samples, sample_r_corr): - """given target dataset compute all sample r correlation""" + """Given target dataset compute all sample r correlation""" filter_shared_samples.return_value = (["1.23", "6.565", "6.456"], [ "6.266", "6.565", "6.456"]) @@ -200,7 +205,6 @@ class TestCorrelation(TestCase): sample_all_results = [{"1419792_at": {"corr_coeffient": -1.0, "p_value": 0.9, "num_overlap": 6}}] - # ?corr_method: str, trait_vals, target_samples_vals self.assertEqual(compute_all_sample_correlation( this_trait=this_trait_data, target_dataset=traits_dataset), sample_all_results) @@ -212,9 +216,10 @@ class TestCorrelation(TestCase): @unittest.skip("not implemented") def test_tissue_lit_corr_for_probe_type(self): - """tests for doing tissue and lit correlation for trait list\ + """Tests for doing tissue and lit correlation for trait list\ if both the dataset and target dataset are probeset runs\ - on after initial correlation has been done""" + on after initial correlation has been done + """ results = tissue_lit_corr_for_probe_type( corr_type="tissue", top_corr_results={}) @@ -223,26 +228,28 @@ class TestCorrelation(TestCase): @mock.patch("gn3.computations.correlations.compute_corr_coeff_p_value") def test_tissue_correlation_for_trait_list(self, mock_compute_corr_coeff): - """test given a primary tissue values for a trait and and a list of\ - target tissues for traits do the tissue correlation for them""" + """Test given a primary tissue values for a trait and and a list of\ + target tissues for traits do the tissue correlation for them + """ primary_tissue_values = [1.1, 1.5, 2.3] target_tissues_values = [1, 2, 3] mock_compute_corr_coeff.side_effect = [(0.4, 0.9), (-0.2, 0.91)] - expected_tissue_results = { - 'tissue_corr': 0.4, 'p_value': 0.9, "tissue_number": 3} - + expected_tissue_results = {"1456_at": {"tissue_corr": 0.4, + "p_value": 0.9, "tissue_number": 3}} tissue_results = tissue_correlation_for_trait_list( primary_tissue_values, target_tissues_values, - corr_method="pearson", compute_corr_p_value=mock_compute_corr_coeff) + corr_method="pearson", trait_id="1456_at", + compute_corr_p_value=mock_compute_corr_coeff) self.assertEqual(tissue_results, expected_tissue_results) @mock.patch("gn3.computations.correlations.fetch_lit_correlation_data") @mock.patch("gn3.computations.correlations.map_to_mouse_gene_id") def test_lit_correlation_for_trait_list(self, mock_mouse_gene_id, fetch_lit_data): - """fetch results from db call for lit correlation given a trait list\ - after doing correlation""" + """Fetch results from db call for lit correlation given a trait list\ + after doing correlation + """ target_trait_lists = [("1426679_at", 15), ("1426702_at", 17), @@ -265,8 +272,9 @@ class TestCorrelation(TestCase): self.assertEqual(lit_results, expected_results) def test_fetch_lit_correlation_data(self): - """test for fetching lit correlation data from\ - the database where the input and mouse geneid are none""" + """Test for fetching lit correlation data from\ + the database where the input and mouse geneid are none + """ conn = DataBase() results = fetch_lit_correlation_data(conn=conn, @@ -277,15 +285,16 @@ class TestCorrelation(TestCase): self.assertEqual(results, ("1", 0)) def test_fetch_lit_correlation_data_db_query(self): - """test for fetching lit corr coefficent givent the input\ - input trait mouse gene id and mouse gene id""" + """Test for fetching lit corr coefficent givent the input\ + input trait mouse gene id and mouse gene id + """ expected_db_results = [namedtuple("lit_coeff", "val")(x*0.1) for x in range(1, 4)] - database_instance = DataBase(expected_results=expected_db_results) + conn = DataBase(expected_results=expected_db_results) expected_results = ("1", 0.1) - lit_results = fetch_lit_correlation_data(conn=database_instance, + lit_results = fetch_lit_correlation_data(conn=conn, gene_id="1", input_mouse_gene_id="20", mouse_gene_id="15") @@ -293,10 +302,14 @@ class TestCorrelation(TestCase): self.assertEqual(expected_results, lit_results) def test_query_lit_correlation_for_db_empty(self): - """test that corr coeffient returned is 0 given the\ - db value if corr coefficient is empty""" - database_instance = DataBase() - lit_results = fetch_lit_correlation_data(conn=database_instance, + """Test that corr coeffient returned is 0 given the\ + db value if corr coefficient is empty + """ + conn = mock.Mock() + conn.cursor.return_value = DataBase() + conn.execute.return_value.fetchone.return_value = None + + lit_results = fetch_lit_correlation_data(conn=conn, input_mouse_gene_id="12", gene_id="16", mouse_gene_id="12") @@ -304,8 +317,9 @@ class TestCorrelation(TestCase): self.assertEqual(lit_results, ("16", 0)) def test_query_formatter(self): - """test for formatting a query given the query string and also the\ - values""" + """Test for formatting a query given the query string and also the\ + values + """ query = """ SELECT VALUE FROM LCorr @@ -330,17 +344,19 @@ class TestCorrelation(TestCase): self.assertEqual(formatted_query, expected_formatted_query) def test_query_formatter_no_query_values(self): - """test for formatting a query where there are no\ - string placeholder""" + """Test for formatting a query where there are no\ + string placeholder + """ query = """SELECT * FROM USERS""" formatted_query = query_formatter(query) self.assertEqual(formatted_query, query) def test_map_to_mouse_gene_id(self): - """test for converting a gene id to mouse geneid\ - given a species which is not mouse""" - database_instance = mock.Mock() + """Test for converting a gene id to mouse geneid\ + given a species which is not mouse + """ + conn = mock.Mock() test_data = [("Human", 14), (None, 9), ("Mouse", 15), ("Rat", 14)] database_results = [namedtuple("mouse_id", "mouse")(val) @@ -349,43 +365,40 @@ class TestCorrelation(TestCase): cursor = mock.Mock() cursor.execute.return_value = 1 cursor.fetchone.side_effect = database_results - database_instance.cursor.return_value = cursor + conn.cursor.return_value = cursor expected_results = [12, None, 13, 14] for (species, gene_id) in test_data: mouse_gene_id_results = map_to_mouse_gene_id( - conn=database_instance, species=species, gene_id=gene_id) + conn=conn, species=species, gene_id=gene_id) results.append(mouse_gene_id_results) self.assertEqual(results, expected_results) @mock.patch("gn3.computations.correlations.lit_correlation_for_trait_list") def test_compute_all_lit_correlation(self, mock_lit_corr): - """test for compute all lit correlation which acts\ + """Test for compute all lit correlation which acts\ as an abstraction for lit_correlation_for_trait_list - and is used in the api/correlation/lit""" + and is used in the api/correlation/lit + """ - database = mock.Mock() + conn = mock.Mock() - expected_mocked_lit_results = [{"gene_id": 11, "lit_corr": 9}, { - "gene_id": 17, "lit_corr": 8}] + expected_mocked_lit_results = [{"1412_at": {"gene_id": 11, "lit_corr": 0.9}}, {"1412_a": { + "gene_id": 17, "lit_corr": 0.48}}] - mock_lit_corr.side_effect = expected_mocked_lit_results + mock_lit_corr.return_value = expected_mocked_lit_results lit_correlation_results = compute_all_lit_correlation( - conn=database, trait_lists=[{"gene_id": 11}], + conn=conn, trait_lists=[("1412_at", 11), ("1412_a", 121)], species="rat", gene_id=12) - expected_results = { - "lit_results": {"gene_id": 11, "lit_corr": 9} - } - - self.assertEqual(lit_correlation_results, expected_results) + self.assertEqual(lit_correlation_results, expected_mocked_lit_results) @mock.patch("gn3.computations.correlations.tissue_correlation_for_trait_list") @mock.patch("gn3.computations.correlations.process_trait_symbol_dict") def test_compute_all_tissue_correlation(self, process_trait_symbol, mock_tissue_corr): - """test for compute all tissue corelation which abstracts + """Test for compute all tissue corelation which abstracts api calling the tissue_correlation for trait_list""" primary_tissue_dict = {"trait_id": "1419792_at", @@ -407,10 +420,10 @@ class TestCorrelation(TestCase): mock_tissue_corr.side_effect = [{"tissue_corr": -0.5, "p_value": 0.9, "tissue_number": 3}, {"tissue_corr": 1.11, "p_value": 0.2, "tissue_number": 3}] - expected_results = {"1418702_a_at": - {"tissue_corr": -0.5, "p_value": 0.9, "tissue_number": 3}, - "1412_at": - {"tissue_corr": 1.11, "p_value": 0.2, "tissue_number": 3}} + expected_results = [{"1412_at": + {"tissue_corr": 1.11, "p_value": 0.2, "tissue_number": 3}}, + {"1418702_a_at": + {"tissue_corr": -0.5, "p_value": 0.9, "tissue_number": 3}}] results = compute_all_tissue_correlation( primary_tissue_dict=primary_tissue_dict, |