From d28b823527b1eec1a99344da51abd139f97bfb64 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 9 Nov 2021 11:05:56 +0300 Subject: Add functions for updating groups --- gn3/authentication.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/gn3/authentication.py b/gn3/authentication.py index 7bc7b77..6719631 100644 --- a/gn3/authentication.py +++ b/gn3/authentication.py @@ -1,9 +1,12 @@ """Methods for interacting with gn-proxy.""" import functools import json +import uuid +import datetime + from urllib.parse import urljoin from enum import Enum, unique -from typing import Dict, Union +from typing import Dict, List, Optional, Union from redis import Redis import requests @@ -95,3 +98,65 @@ def get_highest_user_access_role( for key, value in json.loads(response.content).items(): access_role[key] = max(map(lambda role: role_mapping[role], value)) return access_role + + +def get_groups_by_user_uid(user_uid: str, conn: Redis) -> Dict: + """Given a user uid, get the groups in which they are a member or admin of. + + Args: + - user_uid: A user's unique id + - conn: A redis connection + + Returns: + - A dictionary containing the list of groups the user is part of e.g.: + {"admin": [], "member": ["ce0dddd1-6c50-4587-9eec-6c687a54ad86"]} + """ + admin = [] + member = [] + for uuid, group_info in conn.hgetall("groups").items(): + group_info = json.loads(group_info) + group_info["uuid"] = uuid + if user_uid in group_info.get('admins'): + admin.append(group_info) + if user_uid in group_info.get('members'): + member.append(group_info) + return { + "admin": admin, + "member": member, + } + + +def get_user_info_by_key(key: str, value: str, + conn: Redis) -> Optional[Dict]: + """Given a key, get a user's information if value is matched""" + if key != "user_id": + for uuid, user_info in conn.hgetall("users").items(): + user_info = json.loads(user_info) + if (key in user_info and + user_info.get(key) == value): + user_info["user_id"] = uuid + return user_info + elif key == "user_id": + if user_info := conn.hget("users", value): + user_info = json.loads(user_info) + user_info["user_id"] = value + return user_info + return None + + +def create_group(conn: Redis, group_name: Optional[str], + admin_user_uids: List = [], + member_user_uids: List = []) -> Optional[Dict]: + """Create a group given the group name, members and admins of that group.""" + if group_name and bool(admin_user_uids + member_user_uids): + timestamp = datetime.datetime.utcnow().strftime('%b %d %Y %I:%M%p') + group = { + "id": (group_id := str(uuid.uuid4())), + "admins": admin_user_uids, + "members": member_user_uids, + "name": group_name, + "created_timestamp": timestamp, + "changed_timestamp": timestamp, + } + conn.hset("groups", group_id, json.dumps(group)) + return group -- cgit v1.2.3 From 2ac16b43ad46d6e2c077497fb61b7acfcb88dea4 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Wed, 10 Nov 2021 11:39:40 +0530 Subject: Use git-predicate in guix.scm. * guix.scm: Do not import (srfi srfi-1), (srfi srfi-26), (ice-9 match), (ice-9 popen) and (ice-9 rdelim). Use git-predicate instead of git-file?. (git-file?): Delete function. --- guix.scm | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/guix.scm b/guix.scm index 81e8389..aae7576 100644 --- a/guix.scm +++ b/guix.scm @@ -29,11 +29,6 @@ ;; env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ guix environment -C -l guix.scm (use-modules - (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 popen) - (ice-9 rdelim) (gn packages gemma) (gn packages python) (gnu packages base) @@ -60,29 +55,13 @@ (define %source-dir (dirname (current-filename))) -(define git-file? - (let* ((pipe (with-directory-excursion %source-dir - (open-pipe* OPEN_READ "git" "ls-files"))) - (files (let loop ((lines '())) - (match (read-line pipe) - ((? eof-object?) - (reverse lines)) - (line - (loop (cons line lines)))))) - (status (close-pipe pipe))) - (lambda (file stat) - (match (stat:type stat) - ('directory #t) - ((or 'regular 'symlink) - (any (cut string-suffix? <> file) files)) - (_ #f))))) (package (name "genenetwork3.git") (version "0.0.1") (source (local-file %source-dir #:recursive? #t - #:select? git-file?)) + #:select? (git-predicate %source-dir))) (propagated-inputs `(("coreutils" ,coreutils) ("gemma-wrapper" ,gemma-wrapper) ("gunicorn" ,gunicorn) -- cgit v1.2.3 From b0ec075228abd721644bf65b7df7b5fa7b5aadea Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Wed, 10 Nov 2021 11:41:31 +0530 Subject: Name source checkout in the store. * guix.scm: Name source checkout in the store to "genenetwork3-checkout". --- guix.scm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guix.scm b/guix.scm index aae7576..418e2a3 100644 --- a/guix.scm +++ b/guix.scm @@ -59,7 +59,7 @@ (package (name "genenetwork3.git") (version "0.0.1") - (source (local-file %source-dir + (source (local-file %source-dir "genenetwork3-checkout" #:recursive? #t #:select? (git-predicate %source-dir))) (propagated-inputs `(("coreutils" ,coreutils) -- cgit v1.2.3 From 8fd78ff74124bb4d247424a26d22482f7c93b8ef Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Wed, 10 Nov 2021 11:42:41 +0530 Subject: Set version to 0.1.0. Semantic versioning begins at 0.1.0, not 0.0.1. * guix.scm: Set genenetwork package version to 0.1.0. --- guix.scm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guix.scm b/guix.scm index 418e2a3..f15542d 100644 --- a/guix.scm +++ b/guix.scm @@ -58,7 +58,7 @@ (package (name "genenetwork3.git") - (version "0.0.1") + (version "0.1.0") (source (local-file %source-dir "genenetwork3-checkout" #:recursive? #t #:select? (git-predicate %source-dir))) -- cgit v1.2.3 From 5c69ff5abf7ca360c9ead65338f66d8e33b682ae Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Wed, 10 Nov 2021 11:50:54 +0530 Subject: Indent guix.scm use-modules better. * guix.scm: Indent use-modules better, the more conventional way. --- guix.scm | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/guix.scm b/guix.scm index f15542d..bf72b5b 100644 --- a/guix.scm +++ b/guix.scm @@ -28,30 +28,29 @@ ;; ;; env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ guix environment -C -l guix.scm -(use-modules - (gn packages gemma) - (gn packages python) - (gnu packages base) - (gnu packages check) - (gnu packages graph) - (gnu packages cran) - (gnu packages databases) - (gnu packages statistics) - (gnu packages bioconductor) - (gnu packages golang) - (gn packages genenetwork) - (gnu packages python) - (gnu packages python-check) - (gnu packages python-crypto) - (gnu packages python-web) - (gnu packages python-xyz) - (gnu packages python-science) - ((guix build utils) #:select (with-directory-excursion)) - (guix build-system python) - (guix gexp) - (guix git-download) - (guix licenses) - (guix packages)) +(use-modules (gn packages gemma) + (gn packages python) + (gnu packages base) + (gnu packages check) + (gnu packages graph) + (gnu packages cran) + (gnu packages databases) + (gnu packages statistics) + (gnu packages bioconductor) + (gnu packages golang) + (gn packages genenetwork) + (gnu packages python) + (gnu packages python-check) + (gnu packages python-crypto) + (gnu packages python-web) + (gnu packages python-xyz) + (gnu packages python-science) + ((guix build utils) #:select (with-directory-excursion)) + (guix build-system python) + (guix gexp) + (guix git-download) + (guix licenses) + (guix packages)) (define %source-dir (dirname (current-filename))) -- cgit v1.2.3 From 189435602e2e5c94d7cb80254242fa68406a132c Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Wed, 10 Nov 2021 15:44:45 +0530 Subject: Remove repeated input python-flask-cors. * guix.scm (genenetwork3)[propagated-inputs]: Remove python-flask-cors. --- guix.scm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/guix.scm b/guix.scm index bf72b5b..a48b05a 100644 --- a/guix.scm +++ b/guix.scm @@ -89,8 +89,7 @@ ("python-plotly" ,python-plotly) ("python-pandas" ,python-pandas) ("python-pingouin" ,python-pingouin) - ("rust-qtlreaper" ,rust-qtlreaper) - ("python-flask-cors" ,python-flask-cors))) + ("rust-qtlreaper" ,rust-qtlreaper))) (build-system python-build-system) (home-page "https://github.com/genenetwork/genenetwork3") (synopsis "GeneNetwork3 API for data science and machine learning.") -- cgit v1.2.3 From 3df725b34e9d902f5622506087500eb82f64bb0d Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 05:01:48 +0000 Subject: Update PULL_REQUEST_TEMPLATE.md. --- .github/PULL_REQUEST_TEMPLATE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 926b054..e9a2425 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,5 @@ +Please fill out the template below, and delete items not applicable to your pull request. + #### Description -- cgit v1.2.3 From 6ada26d90f2971610d4f8d6d3c55845ad3ada467 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 11:09:59 +0530 Subject: Disuse GUIX_PACKAGE_PATH. guix-bioinformatics is a Guix channel that is set up by `guix pull'. There is no need to specify it explicitly using GUIX_PACKAGE_PATH. * README.md: Do not explicitly set GUIX_PACKAGE_PATH for any command. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 84a7a54..4fe746f 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ guix environment --load=guix.scm Also, make sure you have the [guix-bioinformatics](https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics) channel set up. ```bash -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm +~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm python3 import redis ``` @@ -32,7 +32,7 @@ python3 #### Run a Guix container ``` -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm +~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm ``` @@ -41,7 +41,7 @@ env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix env Create a new profile with ``` -env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix package -i genenetwork3 -p ~/opt/genenetwork3 +~/.config/guix/current/bin/guix package -i genenetwork3 -p ~/opt/genenetwork3 ``` and load the profile settings with -- cgit v1.2.3 From f98eb4b237e68372c4ed2416fad7b964714fdc37 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 11:14:39 +0530 Subject: Disuse absolute paths to guix. It is safe to assume that the user has correctly set up guix in their PATH. * README.md: Disuse absolute paths to guix in command invocations. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4fe746f..84e5fb9 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ guix environment --load=guix.scm Also, make sure you have the [guix-bioinformatics](https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics) channel set up. ```bash -~/.config/guix/current/bin/guix environment --expose=$HOME/genotype_files/ --load=guix.scm +guix environment --expose=$HOME/genotype_files/ --load=guix.scm python3 import redis ``` @@ -32,7 +32,7 @@ python3 #### Run a Guix container ``` -~/.config/guix/current/bin/guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm +guix environment -C --network --expose=$HOME/genotype_files/ --load=guix.scm ``` @@ -41,7 +41,7 @@ python3 Create a new profile with ``` -~/.config/guix/current/bin/guix package -i genenetwork3 -p ~/opt/genenetwork3 +guix package -i genenetwork3 -p ~/opt/genenetwork3 ``` and load the profile settings with -- cgit v1.2.3 From 905626a2a27332f2fab74195bbcf615bf5c5b6bf Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 9 Nov 2021 16:41:48 +0300 Subject: replace list with generators --- gn3/computations/correlations.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index c930df0..8eaa523 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -49,13 +49,9 @@ def normalize_values(a_values: List, ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3) """ - a_new = [] - b_new = [] for a_val, b_val in zip(a_values, b_values): if (a_val and b_val is not None): - a_new.append(a_val) - b_new.append(b_val) - return a_new, b_new, len(a_new) + yield a_val, b_val def compute_corr_coeff_p_value(primary_values: List, target_values: List, @@ -81,8 +77,10 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals, correlation coeff and p value """ - (sanitized_traits_vals, sanitized_target_vals, - num_overlap) = normalize_values(trait_vals, target_samples_vals) + + sanitized_traits_vals, sanitized_target_vals = list( + zip(*list(normalize_values(trait_vals, target_samples_vals)))) + num_overlap = len(sanitized_traits_vals) if num_overlap > 5: @@ -114,13 +112,9 @@ def filter_shared_sample_keys(this_samplelist, filter the values using the shared keys """ - this_vals = [] - target_vals = [] for key, value in target_samplelist.items(): if key in this_samplelist: - target_vals.append(value) - this_vals.append(this_samplelist[key]) - return (this_vals, target_vals) + yield value, this_samplelist[key] def fast_compute_all_sample_correlation(this_trait, @@ -139,9 +133,10 @@ def fast_compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - processed_values.append((trait_name, corr_method, *filter_shared_sample_keys( - this_trait_samples, target_trait_data))) - with multiprocessing.Pool(4) as pool: + processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))) + )) + with multiprocessing.Pool() as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) for sample_correlation in results: @@ -172,8 +167,10 @@ def compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - this_vals, target_vals = filter_shared_sample_keys( - this_trait_samples, target_trait_data) + this_vals, target_vals = list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))) + # this_vals, target_vals = filter_shared_sample_keys( + # this_trait_samples, target_trait_data) sample_correlation = compute_sample_r_correlation( trait_name=trait_name, -- cgit v1.2.3 From 01ddb7300b451108983327ae11f69e265a2ec2e0 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Nov 2021 11:38:35 +0300 Subject: fix:spawned processes memory issues --- gn3/computations/correlations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 8eaa523..8302afc 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -1,6 +1,7 @@ """module contains code for correlations""" import math import multiprocessing +from contextlib import closing from typing import List from typing import Tuple @@ -136,7 +137,7 @@ def fast_compute_all_sample_correlation(this_trait, processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( this_trait_samples, target_trait_data)))) )) - with multiprocessing.Pool() as pool: + with closing(multiprocessing.Pool()) as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) for sample_correlation in results: -- cgit v1.2.3 From e9fb78b5bc43bd8c63b8b790f0f3fe826051fbe7 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 00:23:55 +0300 Subject: fix target and base sample data order --- gn3/computations/correlations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 8302afc..4987571 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -115,7 +115,7 @@ def filter_shared_sample_keys(this_samplelist, """ for key, value in target_samplelist.items(): if key in this_samplelist: - yield value, this_samplelist[key] + yield this_samplelist[key], value def fast_compute_all_sample_correlation(this_trait, -- cgit v1.2.3 From fa1af0daa093e80a2c235f0294d7fe61a5b65b4b Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 00:31:48 +0300 Subject: pylint fixes and pep8 formatting --- gn3/computations/correlations.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 4987571..c5c56db 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -134,9 +134,9 @@ def fast_compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( - this_trait_samples, target_trait_data)))) - )) + processed_values.append((trait_name, corr_method, + list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))))) with closing(multiprocessing.Pool()) as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) @@ -170,8 +170,6 @@ def compute_all_sample_correlation(this_trait, target_trait_data = target_trait["trait_sample_data"] this_vals, target_vals = list(zip(*list(filter_shared_sample_keys( this_trait_samples, target_trait_data)))) - # this_vals, target_vals = filter_shared_sample_keys( - # this_trait_samples, target_trait_data) sample_correlation = compute_sample_r_correlation( trait_name=trait_name, -- cgit v1.2.3 From 249b85102063debfeeb1b0565956059b8a3af1cf Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 00:48:34 +0300 Subject: pep8 formatting;update unittests --- tests/unit/computations/test_correlation.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/unit/computations/test_correlation.py b/tests/unit/computations/test_correlation.py index 96d9c6d..706520a 100644 --- a/tests/unit/computations/test_correlation.py +++ b/tests/unit/computations/test_correlation.py @@ -1,6 +1,7 @@ """Module contains the tests for correlation""" from unittest import TestCase from unittest import mock +import unittest from collections import namedtuple @@ -8,6 +9,7 @@ from gn3.computations.correlations import normalize_values from gn3.computations.correlations import compute_sample_r_correlation from gn3.computations.correlations import compute_all_sample_correlation from gn3.computations.correlations import filter_shared_sample_keys + from gn3.computations.correlations import tissue_correlation_for_trait from gn3.computations.correlations import lit_correlation_for_trait from gn3.computations.correlations import fetch_lit_correlation_data @@ -93,10 +95,11 @@ class TestCorrelation(TestCase): results = normalize_values([2.3, None, None, 3.2, 4.1, 5], [3.4, 7.2, 1.3, None, 6.2, 4.1]) - expected_results = ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3) + expected_results = [(2.3, 4.1, 5), (3.4, 6.2, 4.1)] - self.assertEqual(results, expected_results) + self.assertEqual(list(zip(*list(results))), expected_results) + @unittest.skip("reason for skipping") @mock.patch("gn3.computations.correlations.compute_corr_coeff_p_value") @mock.patch("gn3.computations.correlations.normalize_values") def test_compute_sample_r_correlation(self, norm_vals, compute_corr): @@ -152,22 +155,23 @@ class TestCorrelation(TestCase): } - filtered_target_samplelist = ["1.23", "6.565", "6.456"] - filtered_this_samplelist = ["6.266", "6.565", "6.456"] + filtered_target_samplelist = ("1.23", "6.565", "6.456") + filtered_this_samplelist = ("6.266", "6.565", "6.456") results = filter_shared_sample_keys( this_samplelist=this_samplelist, target_samplelist=target_samplelist) - self.assertEqual(results, (filtered_this_samplelist, - filtered_target_samplelist)) + self.assertEqual(list(zip(*list(results))), [filtered_this_samplelist, + filtered_target_samplelist]) @mock.patch("gn3.computations.correlations.compute_sample_r_correlation") @mock.patch("gn3.computations.correlations.filter_shared_sample_keys") def test_compute_all_sample(self, filter_shared_samples, sample_r_corr): """Given target dataset compute all sample r correlation""" - filter_shared_samples.return_value = (["1.23", "6.565", "6.456"], [ - "6.266", "6.565", "6.456"]) + filter_shared_samples.return_value = [iter(val) for val in [( + "1.23", "6.266"), ("6.565", "6.565"), ("6.456", "6.456")]] + sample_r_corr.return_value = (["1419792_at", -1.0, 0.9, 6]) this_trait_data = { @@ -199,10 +203,8 @@ class TestCorrelation(TestCase): this_trait=this_trait_data, target_dataset=traits_dataset), sample_all_results) sample_r_corr.assert_called_once_with( trait_name='1419792_at', - corr_method="pearson", trait_vals=['1.23', '6.565', '6.456'], - target_samples_vals=['6.266', '6.565', '6.456']) - filter_shared_samples.assert_called_once_with( - this_trait_data.get("trait_sample_data"), traits_dataset[0].get("trait_sample_data")) + corr_method="pearson", trait_vals=('1.23', '6.565', '6.456'), + target_samples_vals=('6.266', '6.565', '6.456')) @mock.patch("gn3.computations.correlations.compute_corr_coeff_p_value") def test_tissue_correlation_for_trait(self, mock_compute_corr_coeff): -- cgit v1.2.3 From fb92e75b8e9984fbbf9b26f87b53ea516a8819da Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 14:43:37 +0530 Subject: Compare floats approximately. Floating point numbers should only be compared approximately. Different implementations of functions might produce slightly different results. * tests/unit/computations/test_correlation.py: Import assert_almost_equal from numpy.testing. (TestCorrelation.test_compute_correlation): Compare floats using assert_almost_equal instead of assertEqual. * tests/unit/test_heatmaps.py: Import assert_allclose from numpy.testing. (TestHeatmap.test_cluster_traits): Use assert_allclose instead of assertEqual. --- tests/unit/computations/test_correlation.py | 8 ++++++-- tests/unit/test_heatmaps.py | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/unit/computations/test_correlation.py b/tests/unit/computations/test_correlation.py index 706520a..e6cf198 100644 --- a/tests/unit/computations/test_correlation.py +++ b/tests/unit/computations/test_correlation.py @@ -4,6 +4,7 @@ from unittest import mock import unittest from collections import namedtuple +from numpy.testing import assert_almost_equal from gn3.computations.correlations import normalize_values from gn3.computations.correlations import compute_sample_r_correlation @@ -481,5 +482,8 @@ class TestCorrelation(TestCase): [None, None, None, None, 2, None, None, 3, None, None], (0.0, 2)]]: with self.subTest(dbdata=dbdata, userdata=userdata): - self.assertEqual(compute_correlation( - dbdata, userdata), expected) + actual = compute_correlation(dbdata, userdata) + with self.subTest("correlation coefficient"): + assert_almost_equal(actual[0], expected[0]) + with self.subTest("overlap"): + self.assertEqual(actual[1], expected[1]) diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index 03fd4a6..e4c929d 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -1,5 +1,6 @@ """Module contains tests for gn3.heatmaps.heatmaps""" from unittest import TestCase +from numpy.testing import assert_allclose from gn3.heatmaps import ( cluster_traits, get_loci_names, @@ -39,7 +40,7 @@ class TestHeatmap(TestCase): (6.84118, 7.08432, 7.59844, 7.08229, 7.26774, 7.24991), (9.45215, 10.6943, 8.64719, 10.1592, 7.75044, 8.78615), (7.04737, 6.87185, 7.58586, 6.92456, 6.84243, 7.36913)] - self.assertEqual( + assert_allclose( cluster_traits(traits_data_list), ((0.0, 0.20337048635536847, 0.16381088984330505, 1.7388553629398245, 1.5025235756329178, 0.6952839500255574, 1.271661230252733, -- cgit v1.2.3 From 4e790f08000825931cb5edec1738d2b7d073f73e Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 15:45:22 +0530 Subject: Reimplement __items_with_values using list comprehension. * gn3/computations/correlations2.py: Remove import of reduce from functools. (__items_with_values): Reimplement using list comprehension. --- gn3/computations/correlations2.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py index 93db3fa..69921b1 100644 --- a/gn3/computations/correlations2.py +++ b/gn3/computations/correlations2.py @@ -7,24 +7,13 @@ compute_correlation: TODO: Describe what the function does...""" from math import sqrt -from functools import reduce ## From GN1: mostly for clustering and heatmap generation def __items_with_values(dbdata, userdata): """Retains only corresponding items in the data items that are not `None` values. This should probably be renamed to something sensible""" - def both_not_none(item1, item2): - """Check that both items are not the value `None`.""" - if (item1 is not None) and (item2 is not None): - return (item1, item2) - return None - def split_lists(accumulator, item): - """Separate the 'x' and 'y' items.""" - return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]] - return reduce( - split_lists, - filter(lambda x: x is not None, map(both_not_none, dbdata, userdata)), - [[], []]) + filtered = [x for x in zip(dbdata, userdata) if x[0] is not None and x[1] is not None] + return tuple(zip(*filtered)) if filtered else ([], []) def compute_correlation(dbdata, userdata): """Compute some form of correlation. -- cgit v1.2.3 From ec1d2180d99e0cde1dc181ee9ed79e86cf1a5675 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 16:10:35 +0530 Subject: Reimplement correlations2.compute_correlation using pearsonr. correlations2.compute_correlation computes the Pearson correlation coefficient. Outsource this computation to scipy.stats.pearsonr. When the inputs are constant, the Pearson correlation coefficient does not exist and is represented by NaN. Update the tests to reflect this. * gn3/computations/correlations2.py: Remove import of sqrt from math. (compute_correlation): Reimplement using scipy.stats.pearsonr. * tests/unit/computations/test_correlation.py: Import math. (TestCorrelation.test_compute_correlation): When inputs are constant, set expected correlation coefficient to NaN. --- gn3/computations/correlations2.py | 21 ++++----------------- tests/unit/computations/test_correlation.py | 5 +++-- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py index 69921b1..d0222ae 100644 --- a/gn3/computations/correlations2.py +++ b/gn3/computations/correlations2.py @@ -6,7 +6,7 @@ FUNCTIONS: compute_correlation: TODO: Describe what the function does...""" -from math import sqrt +from scipy import stats ## From GN1: mostly for clustering and heatmap generation def __items_with_values(dbdata, userdata): @@ -16,24 +16,11 @@ def __items_with_values(dbdata, userdata): return tuple(zip(*filtered)) if filtered else ([], []) def compute_correlation(dbdata, userdata): - """Compute some form of correlation. + """Compute the Pearson correlation coefficient. This is extracted from https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647 """ x_items, y_items = __items_with_values(dbdata, userdata) - if len(x_items) < 6: - return (0.0, len(x_items)) - meanx = sum(x_items)/len(x_items) - meany = sum(y_items)/len(y_items) - def cal_corr_vals(acc, item): - xitem, yitem = item - return [ - acc[0] + ((xitem - meanx) * (yitem - meany)), - acc[1] + ((xitem - meanx) * (xitem - meanx)), - acc[2] + ((yitem - meany) * (yitem - meany))] - xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0]) - try: - return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items)) - except ZeroDivisionError: - return(0, len(x_items)) + correlation = stats.pearsonr(x_items, y_items)[0] if len(x_items) >= 6 else 0 + return (correlation, len(x_items)) diff --git a/tests/unit/computations/test_correlation.py b/tests/unit/computations/test_correlation.py index e6cf198..d60dd62 100644 --- a/tests/unit/computations/test_correlation.py +++ b/tests/unit/computations/test_correlation.py @@ -4,6 +4,7 @@ from unittest import mock import unittest from collections import namedtuple +import math from numpy.testing import assert_almost_equal from gn3.computations.correlations import normalize_values @@ -471,10 +472,10 @@ class TestCorrelation(TestCase): [None, None, None, None, None, None, None, None, None, 0], (0.0, 1)], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - (0, 10)], + (math.nan, 10)], [[9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87], [9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87], - (0.9999999999999998, 10)], + (math.nan, 10)], [[9.3, 2.2, 5.4, 7.2, 6.4, 7.6, 3.8, 1.8, 8.4, 0.2], [0.6, 3.97, 5.82, 8.21, 1.65, 4.55, 6.72, 9.5, 7.33, 2.34], (-0.12720361919462056, 10)], -- cgit v1.2.3 From 85405fe6875358d3bb98b03621271d5909dd393f Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 16:15:28 +0530 Subject: Remove redundant check on the Pearson correlation coefficient. The Pearson correlation coefficient always has a value between -1 and 1. So, this check is redundant. * gn3/heatmaps.py (cluster_traits.__compute_corr): Remove redundant check on the Pearson correlation coefficient. --- gn3/heatmaps.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index bf9dfd1..f0af409 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -64,11 +64,7 @@ def cluster_traits(traits_data_list: Sequence[Dict]): def __compute_corr(tdata_i, tdata_j): if tdata_i[0] == tdata_j[0]: return 0.0 - corr_vals = compute_correlation(tdata_i[1], tdata_j[1]) - corr = corr_vals[0] - if (1 - corr) < 0: - return 0.0 - return 1 - corr + return 1 - compute_correlation(tdata_i[1], tdata_j[1])[0] def __cluster(tdata_i): return tuple( -- cgit v1.2.3