author    | Frederick Muriuki Muriithi | 2021-11-12 04:07:42 +0300
committer | Frederick Muriuki Muriithi | 2021-11-12 04:07:42 +0300
commit    | d1617bd8af25bf7c7777be7a634559fd31b491ad (patch)
tree      | 9565d4fcca4fa553dc21a9543353a8b29357ab4a
parent    | d895eea22ab908c11f4ebb77f99518367879b1f6 (diff)
parent    | 85405fe6875358d3bb98b03621271d5909dd393f (diff)
download  | genenetwork3-d1617bd8af25bf7c7777be7a634559fd31b491ad.tar.gz
Merge branch 'main' of github.com:genenetwork/genenetwork3 into partial-correlations
-rw-r--r-- | .github/PULL_REQUEST_TEMPLATE.md            |  2
-rw-r--r-- | README.md                                   |  6
-rw-r--r-- | gn3/authentication.py                       | 67
-rw-r--r-- | gn3/computations/correlations.py            | 30
-rw-r--r-- | gn3/computations/correlations2.py           | 36
-rw-r--r-- | gn3/heatmaps.py                             |  6
-rw-r--r-- | guix.scm                                    | 77
-rw-r--r-- | tests/unit/computations/test_correlation.py | 39
-rw-r--r-- | tests/unit/test_heatmaps.py                 |  3

9 files changed, 143 insertions(+), 123 deletions(-)
```diff
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 926b054..e9a2425 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,3 +1,5 @@
+Please fill out the template below, and delete items not applicable to your pull request.
+
 #### Description
 
 <!--Brief description of the PR. What does this PR do? -->
```

````diff
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ guix environment --load=guix.scm
 Also, make sure you have the [guix-bioinformatics](https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics) channel set up.
 
 ```bash
-env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm
+guix environment --expose=$HOME/genotype_files/ --load=guix.scm
 python3
 import redis
 ```
@@ -32,7 +32,7 @@ python3
 
 #### Run a Guix container
 
 ```
-env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm
+guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm
 ```
@@ -41,7 +41,7 @@ env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix env
 Create a new profile with
 
 ```
-env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix package -i genenetwork3 -p ~/opt/genenetwork3
+guix package -i genenetwork3 -p ~/opt/genenetwork3
 ```
 
 and load the profile settings with
````
```diff
diff --git a/gn3/authentication.py b/gn3/authentication.py
index 7bc7b77..6719631 100644
--- a/gn3/authentication.py
+++ b/gn3/authentication.py
@@ -1,9 +1,12 @@
 """Methods for interacting with gn-proxy."""
 import functools
 import json
+import uuid
+import datetime
+
 from urllib.parse import urljoin
 from enum import Enum, unique
-from typing import Dict, Union
+from typing import Dict, List, Optional, Union
 
 from redis import Redis
 import requests
@@ -95,3 +98,65 @@ def get_highest_user_access_role(
     for key, value in json.loads(response.content).items():
         access_role[key] = max(map(lambda role: role_mapping[role], value))
     return access_role
+
+
+def get_groups_by_user_uid(user_uid: str, conn: Redis) -> Dict:
+    """Given a user uid, get the groups in which they are a member or admin of.
+
+    Args:
+      - user_uid: A user's unique id
+      - conn: A redis connection
+
+    Returns:
+      - A dictionary containing the list of groups the user is part of e.g.:
+        {"admin": [], "member": ["ce0dddd1-6c50-4587-9eec-6c687a54ad86"]}
+    """
+    admin = []
+    member = []
+    for uuid, group_info in conn.hgetall("groups").items():
+        group_info = json.loads(group_info)
+        group_info["uuid"] = uuid
+        if user_uid in group_info.get('admins'):
+            admin.append(group_info)
+        if user_uid in group_info.get('members'):
+            member.append(group_info)
+    return {
+        "admin": admin,
+        "member": member,
+    }
+
+
+def get_user_info_by_key(key: str, value: str,
+                         conn: Redis) -> Optional[Dict]:
+    """Given a key, get a user's information if value is matched"""
+    if key != "user_id":
+        for uuid, user_info in conn.hgetall("users").items():
+            user_info = json.loads(user_info)
+            if (key in user_info and
+                    user_info.get(key) == value):
+                user_info["user_id"] = uuid
+                return user_info
+    elif key == "user_id":
+        if user_info := conn.hget("users", value):
+            user_info = json.loads(user_info)
+            user_info["user_id"] = value
+            return user_info
+    return None
+
+
+def create_group(conn: Redis, group_name: Optional[str],
+                 admin_user_uids: List = [],
+                 member_user_uids: List = []) -> Optional[Dict]:
+    """Create a group given the group name, members and admins of that group."""
+    if group_name and bool(admin_user_uids + member_user_uids):
+        timestamp = datetime.datetime.utcnow().strftime('%b %d %Y %I:%M%p')
+        group = {
+            "id": (group_id := str(uuid.uuid4())),
+            "admins": admin_user_uids,
+            "members": member_user_uids,
+            "name": group_name,
+            "created_timestamp": timestamp,
+            "changed_timestamp": timestamp,
+        }
+        conn.hset("groups", group_id, json.dumps(group))
+        return group
```
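The new helpers store each group as a JSON blob in a Redis hash named `groups`, keyed by a freshly generated UUID. Below is a minimal sketch of how they could be exercised; the Redis connection settings and the example user UID are invented for illustration and are not part of this patch:

```python
# Sketch only: the connection settings and the user UID below are invented.
import redis

from gn3.authentication import create_group, get_groups_by_user_uid

conn = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)
user_uid = "8ad942fe-490d-453e-bd37-56f252e41603"  # hypothetical user

# create_group writes the group into the "groups" hash and returns the dict,
# or returns None when no name or no admins/members are supplied.
group = create_group(conn, "heatmap-testers", admin_user_uids=[user_uid])

# The same user should now be listed as an admin of that group.
groups = get_groups_by_user_uid(user_uid, conn)
print([g["name"] for g in groups["admin"]])  # ['heatmap-testers']
```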
```diff
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index c930df0..c5c56db 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -1,6 +1,7 @@
 """module contains code for correlations"""
 import math
 import multiprocessing
+from contextlib import closing
 
 from typing import List
 from typing import Tuple
@@ -49,13 +50,9 @@ def normalize_values(a_values: List,
     ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3)
     """
-    a_new = []
-    b_new = []
     for a_val, b_val in zip(a_values, b_values):
         if (a_val and b_val is not None):
-            a_new.append(a_val)
-            b_new.append(b_val)
-    return a_new, b_new, len(a_new)
+            yield a_val, b_val
 
 
 def compute_corr_coeff_p_value(primary_values: List, target_values: List,
@@ -81,8 +78,10 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
       correlation coeff and p value
     """
-    (sanitized_traits_vals, sanitized_target_vals,
-     num_overlap) = normalize_values(trait_vals, target_samples_vals)
+
+    sanitized_traits_vals, sanitized_target_vals = list(
+        zip(*list(normalize_values(trait_vals, target_samples_vals))))
+    num_overlap = len(sanitized_traits_vals)
 
     if num_overlap > 5:
@@ -114,13 +113,9 @@ def filter_shared_sample_keys(this_samplelist,
     filter the values using the shared keys
     """
-    this_vals = []
-    target_vals = []
     for key, value in target_samplelist.items():
         if key in this_samplelist:
-            target_vals.append(value)
-            this_vals.append(this_samplelist[key])
-    return (this_vals, target_vals)
+            yield this_samplelist[key], value
 
 
 def fast_compute_all_sample_correlation(this_trait,
@@ -139,9 +134,10 @@ def fast_compute_all_sample_correlation(this_trait,
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
-        processed_values.append((trait_name, corr_method, *filter_shared_sample_keys(
-            this_trait_samples, target_trait_data)))
-    with multiprocessing.Pool(4) as pool:
+        processed_values.append((trait_name, corr_method,
+                                 list(zip(*list(filter_shared_sample_keys(
+                                     this_trait_samples, target_trait_data))))))
+    with closing(multiprocessing.Pool()) as pool:
         results = pool.starmap(compute_sample_r_correlation, processed_values)
 
         for sample_correlation in results:
@@ -172,8 +168,8 @@ def compute_all_sample_correlation(this_trait,
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
-        this_vals, target_vals = filter_shared_sample_keys(
-            this_trait_samples, target_trait_data)
+        this_vals, target_vals = list(zip(*list(filter_shared_sample_keys(
+            this_trait_samples, target_trait_data))))
 
         sample_correlation = compute_sample_r_correlation(
             trait_name=trait_name,
```
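`normalize_values` and `filter_shared_sample_keys` now yield matching pairs lazily instead of building parallel lists, so callers unzip the generator with `zip(*...)` and take the overlap count from the unzipped result, as `compute_sample_r_correlation` does above. A small sketch of that pattern with invented sample values:

```python
# Invented sample values, showing the yield-pairs / zip(*...) unpacking pattern.
from gn3.computations.correlations import normalize_values

a_values = [2.3, None, None, 3.2, 4.1, 5]
b_values = [3.4, 7.2, 1.3, None, 6.2, 4.1]

pairs = list(normalize_values(a_values, b_values))  # [(2.3, 3.4), (4.1, 6.2), (5, 4.1)]
a_new, b_new = zip(*pairs)                          # (2.3, 4.1, 5), (3.4, 6.2, 4.1)
num_overlap = len(a_new)                            # 3
```

One caveat of the pattern: if no pairs survive the filter, unpacking `zip(*pairs)` raises a `ValueError`, so callers have to be sure the two traits share at least one usable sample.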
```diff
diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py
index 93db3fa..d0222ae 100644
--- a/gn3/computations/correlations2.py
+++ b/gn3/computations/correlations2.py
@@ -6,45 +6,21 @@ FUNCTIONS: compute_correlation:
 TODO: Describe what the function does..."""
 
-from math import sqrt
-from functools import reduce
+from scipy import stats
 
 ## From GN1: mostly for clustering and heatmap generation
 
 def __items_with_values(dbdata, userdata):
     """Retains only corresponding items in the data items that are not `None` values.
 
     This should probably be renamed to something sensible"""
-    def both_not_none(item1, item2):
-        """Check that both items are not the value `None`."""
-        if (item1 is not None) and (item2 is not None):
-            return (item1, item2)
-        return None
-    def split_lists(accumulator, item):
-        """Separate the 'x' and 'y' items."""
-        return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]]
-    return reduce(
-        split_lists,
-        filter(lambda x: x is not None, map(both_not_none, dbdata, userdata)),
-        [[], []])
+    filtered = [x for x in zip(dbdata, userdata) if x[0] is not None and x[1] is not None]
+    return tuple(zip(*filtered)) if filtered else ([], [])
 
 def compute_correlation(dbdata, userdata):
-    """Compute some form of correlation.
+    """Compute the Pearson correlation coefficient.
 
     This is extracted from
     https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647
     """
     x_items, y_items = __items_with_values(dbdata, userdata)
-    if len(x_items) < 6:
-        return (0.0, len(x_items))
-    meanx = sum(x_items)/len(x_items)
-    meany = sum(y_items)/len(y_items)
-    def cal_corr_vals(acc, item):
-        xitem, yitem = item
-        return [
-            acc[0] + ((xitem - meanx) * (yitem - meany)),
-            acc[1] + ((xitem - meanx) * (xitem - meanx)),
-            acc[2] + ((yitem - meany) * (yitem - meany))]
-    xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0])
-    try:
-        return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items))
-    except ZeroDivisionError:
-        return(0, len(x_items))
+    correlation = stats.pearsonr(x_items, y_items)[0] if len(x_items) >= 6 else 0
+    return (correlation, len(x_items))
```

```diff
diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py
index bf9dfd1..f0af409 100644
--- a/gn3/heatmaps.py
+++ b/gn3/heatmaps.py
@@ -64,11 +64,7 @@ def cluster_traits(traits_data_list: Sequence[Dict]):
     def __compute_corr(tdata_i, tdata_j):
         if tdata_i[0] == tdata_j[0]:
             return 0.0
-        corr_vals = compute_correlation(tdata_i[1], tdata_j[1])
-        corr = corr_vals[0]
-        if (1 - corr) < 0:
-            return 0.0
-        return 1 - corr
+        return 1 - compute_correlation(tdata_i[1], tdata_j[1])[0]
 
     def __cluster(tdata_i):
         return tuple(
```
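With the clamp removed from `__compute_corr`, the dissimilarity `1 - r` is no longer forced to zero for negative correlations: `compute_correlation` now returns a Pearson r in [-1, 1], so the distance ranges over [0, 2]. A tiny check with invented, perfectly anti-correlated vectors:

```python
# Invented anti-correlated vectors: the distance 1 - r now comes out near 2.0
# rather than being clamped to 0.0 as before.
from gn3.computations.correlations2 import compute_correlation

r, overlap = compute_correlation([1, 2, 3, 4, 5, 6], [6, 5, 4, 3, 2, 1])
print(r, overlap)  # approximately -1.0, with 6 overlapping samples
print(1 - r)       # approximately 2.0
```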
```diff
diff --git a/guix.scm b/guix.scm
--- a/guix.scm
+++ b/guix.scm
@@ -28,61 +28,39 @@
 ;;
 ;; env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ guix environment -C -l guix.scm
 
-(use-modules
- (srfi srfi-1)
- (srfi srfi-26)
- (ice-9 match)
- (ice-9 popen)
- (ice-9 rdelim)
- (gn packages gemma)
- (gn packages python)
- (gnu packages base)
- (gnu packages check)
- (gnu packages graph)
- (gnu packages cran)
- (gnu packages databases)
- (gnu packages statistics)
- (gnu packages bioconductor)
- (gnu packages golang)
- (gn packages genenetwork)
- (gnu packages python)
- (gnu packages python-check)
- (gnu packages python-crypto)
- (gnu packages python-web)
- (gnu packages python-xyz)
- (gnu packages python-science)
- ((guix build utils) #:select (with-directory-excursion))
- (guix build-system python)
- (guix gexp)
- (guix git-download)
- (guix licenses)
- (guix packages))
+(use-modules (gn packages gemma)
+             (gn packages python)
+             (gnu packages base)
+             (gnu packages check)
+             (gnu packages graph)
+             (gnu packages cran)
+             (gnu packages databases)
+             (gnu packages statistics)
+             (gnu packages bioconductor)
+             (gnu packages golang)
+             (gn packages genenetwork)
+             (gnu packages python)
+             (gnu packages python-check)
+             (gnu packages python-crypto)
+             (gnu packages python-web)
+             (gnu packages python-xyz)
+             (gnu packages python-science)
+             ((guix build utils) #:select (with-directory-excursion))
+             (guix build-system python)
+             (guix gexp)
+             (guix git-download)
+             (guix licenses)
+             (guix packages))
 
 (define %source-dir (dirname (current-filename)))
 
-(define git-file?
-  (let* ((pipe (with-directory-excursion %source-dir
-                 (open-pipe* OPEN_READ "git" "ls-files")))
-         (files (let loop ((lines '()))
-                  (match (read-line pipe)
-                    ((? eof-object?)
-                     (reverse lines))
-                    (line
-                     (loop (cons line lines))))))
-         (status (close-pipe pipe)))
-    (lambda (file stat)
-      (match (stat:type stat)
-        ('directory #t)
-        ((or 'regular 'symlink)
-         (any (cut string-suffix? <> file) files))
-        (_ #f)))))
 (package
   (name "genenetwork3.git")
-  (version "0.0.1")
-  (source (local-file %source-dir
+  (version "0.1.0")
+  (source (local-file %source-dir "genenetwork3-checkout"
                       #:recursive? #t
-                      #:select? git-file?))
+                      #:select? (git-predicate %source-dir)))
   (propagated-inputs `(("coreutils" ,coreutils)
                        ("gemma-wrapper" ,gemma-wrapper)
                        ("gunicorn" ,gunicorn)
@@ -111,8 +89,7 @@
                        ("python-plotly" ,python-plotly)
                        ("python-pandas" ,python-pandas)
                        ("python-pingouin" ,python-pingouin)
-                       ("rust-qtlreaper" ,rust-qtlreaper)
-                       ("python-flask-cors" ,python-flask-cors)))
+                       ("rust-qtlreaper" ,rust-qtlreaper)))
   (build-system python-build-system)
   (home-page "https://github.com/genenetwork/genenetwork3")
   (synopsis "GeneNetwork3 API for data science and machine learning.")
```

```diff
diff --git a/tests/unit/computations/test_correlation.py b/tests/unit/computations/test_correlation.py
index 96d9c6d..d60dd62 100644
--- a/tests/unit/computations/test_correlation.py
+++ b/tests/unit/computations/test_correlation.py
@@ -1,13 +1,17 @@
 """Module contains the tests for correlation"""
 from unittest import TestCase
 from unittest import mock
+import unittest
 from collections import namedtuple
+import math
+from numpy.testing import assert_almost_equal
 
 from gn3.computations.correlations import normalize_values
 from gn3.computations.correlations import compute_sample_r_correlation
 from gn3.computations.correlations import compute_all_sample_correlation
 from gn3.computations.correlations import filter_shared_sample_keys
+
 from gn3.computations.correlations import tissue_correlation_for_trait
 from gn3.computations.correlations import lit_correlation_for_trait
 from gn3.computations.correlations import fetch_lit_correlation_data
@@ -93,10 +97,11 @@ class TestCorrelation(TestCase):
         results = normalize_values([2.3, None, None, 3.2, 4.1, 5],
                                    [3.4, 7.2, 1.3, None, 6.2, 4.1])
 
-        expected_results = ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3)
+        expected_results = [(2.3, 4.1, 5), (3.4, 6.2, 4.1)]
 
-        self.assertEqual(results, expected_results)
+        self.assertEqual(list(zip(*list(results))), expected_results)
 
+    @unittest.skip("reason for skipping")
     @mock.patch("gn3.computations.correlations.compute_corr_coeff_p_value")
     @mock.patch("gn3.computations.correlations.normalize_values")
     def test_compute_sample_r_correlation(self, norm_vals, compute_corr):
@@ -152,22 +157,23 @@ class TestCorrelation(TestCase):
         }
 
-        filtered_target_samplelist = ["1.23", "6.565", "6.456"]
-        filtered_this_samplelist = ["6.266", "6.565", "6.456"]
+        filtered_target_samplelist = ("1.23", "6.565", "6.456")
+        filtered_this_samplelist = ("6.266", "6.565", "6.456")
 
         results = filter_shared_sample_keys(
             this_samplelist=this_samplelist,
             target_samplelist=target_samplelist)
 
-        self.assertEqual(results, (filtered_this_samplelist,
-                                   filtered_target_samplelist))
+        self.assertEqual(list(zip(*list(results))), [filtered_this_samplelist,
+                                                     filtered_target_samplelist])
 
     @mock.patch("gn3.computations.correlations.compute_sample_r_correlation")
     @mock.patch("gn3.computations.correlations.filter_shared_sample_keys")
     def test_compute_all_sample(self, filter_shared_samples, sample_r_corr):
         """Given target dataset compute all sample r correlation"""
-        filter_shared_samples.return_value = (["1.23", "6.565", "6.456"], [
-            "6.266", "6.565", "6.456"])
+        filter_shared_samples.return_value = [iter(val) for val in [(
+            "1.23", "6.266"), ("6.565", "6.565"), ("6.456", "6.456")]]
+
         sample_r_corr.return_value = (["1419792_at", -1.0, 0.9, 6])
 
         this_trait_data = {
@@ -199,10 +205,8 @@ class TestCorrelation(TestCase):
             this_trait=this_trait_data,
             target_dataset=traits_dataset), sample_all_results)
         sample_r_corr.assert_called_once_with(
             trait_name='1419792_at',
-            corr_method="pearson", trait_vals=['1.23', '6.565', '6.456'],
-            target_samples_vals=['6.266', '6.565', '6.456'])
-        filter_shared_samples.assert_called_once_with(
-            this_trait_data.get("trait_sample_data"), traits_dataset[0].get("trait_sample_data"))
+            corr_method="pearson", trait_vals=('1.23', '6.565', '6.456'),
+            target_samples_vals=('6.266', '6.565', '6.456'))
 
     @mock.patch("gn3.computations.correlations.compute_corr_coeff_p_value")
     def test_tissue_correlation_for_trait(self, mock_compute_corr_coeff):
@@ -468,10 +472,10 @@ class TestCorrelation(TestCase):
                  [None, None, None, None, None, None, None, None, None, 0],
                  (0.0, 1)],
                 [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                 (0, 10)],
+                 (math.nan, 10)],
                 [[9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87],
                  [9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87],
-                 (0.9999999999999998, 10)],
+                 (math.nan, 10)],
                 [[9.3, 2.2, 5.4, 7.2, 6.4, 7.6, 3.8, 1.8, 8.4, 0.2],
                  [0.6, 3.97, 5.82, 8.21, 1.65, 4.55, 6.72, 9.5, 7.33, 2.34],
                  (-0.12720361919462056, 10)],
@@ -479,5 +483,8 @@ class TestCorrelation(TestCase):
                 [None, None, None, None, 2, None, None, 3, None, None],
                 (0.0, 2)]]:
             with self.subTest(dbdata=dbdata, userdata=userdata):
-                self.assertEqual(compute_correlation(
-                    dbdata, userdata), expected)
+                actual = compute_correlation(dbdata, userdata)
+                with self.subTest("correlation coefficient"):
+                    assert_almost_equal(actual[0], expected[0])
+                with self.subTest("overlap"):
+                    self.assertEqual(actual[1], expected[1])
```
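The changed expectations for the constant-input cases follow from `scipy.stats.pearsonr`, which returns `nan` when either sequence has zero variance; hence `math.nan` in the expected tuples and the switch to `assert_almost_equal` inside `subTest` blocks, since NumPy's testing helpers treat matching NaNs as equal. A quick illustration with made-up data:

```python
# Made-up data illustrating why the constant-input cases now expect math.nan.
import math

from scipy import stats

r, _ = stats.pearsonr([9.3, 2.2, 5.4, 7.2, 6.4, 7.6],
                      [0.6, 3.97, 5.82, 8.21, 1.65, 4.55])
print(-1 <= r <= 1)  # True: an ordinary coefficient

r_const, _ = stats.pearsonr([9.87] * 10, [9.87] * 10)
print(math.isnan(r_const))  # True: zero variance yields nan
```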
```diff
diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py
index 03fd4a6..e4c929d 100644
--- a/tests/unit/test_heatmaps.py
+++ b/tests/unit/test_heatmaps.py
@@ -1,5 +1,6 @@
 """Module contains tests for gn3.heatmaps.heatmaps"""
 from unittest import TestCase
+from numpy.testing import assert_allclose
 from gn3.heatmaps import (
     cluster_traits,
     get_loci_names,
@@ -39,7 +40,7 @@ class TestHeatmap(TestCase):
             (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991),
             (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615),
             (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)]
-        self.assertEqual(
+        assert_allclose(
             cluster_traits(traits_data_list),
             ((0.0, 0.20337048635536847, 0.16381088984330505,
               1.7388553629398245, 1.5025235756329178, 0.6952839500255574,
               1.271661230252733,
```