From deca94c7b473ec79c5e5cee3d6caeb3c3885bd74 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 23 Dec 2021 13:40:31 +0530 Subject: db: Fix wrong continued indentation. * gn3/db/datasets.py (dataset_metadata): Fix wrong continued indentation. --- gn3/db/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gn3') diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index c50e148..788e9cf 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -328,7 +328,7 @@ WHERE { OPTIONAL { ?dataset gn:geoSeries ?geo_series } . } """, - """ + """ PREFIX gn: SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name WHERE { @@ -341,7 +341,7 @@ WHERE { OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } . } """, - """ + """ PREFIX gn: SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform ?about_data_processing ?notes ?experiment_design ?contributors -- cgit v1.2.3 From 36cc8b1f837406d7002246c00d6054573687c472 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 23 Dec 2021 13:57:04 +0530 Subject: db: Fix sparql_query return type. * gn3/db/datasets.py: Import List from typing. (sparql_query): Set return type to List[Dict[str, Any]]. --- gn3/db/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gn3') diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 788e9cf..ca48156 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -3,7 +3,7 @@ This module contains functions relating to specific trait dataset manipulation """ import re from string import Template -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional from SPARQLWrapper import JSON, SPARQLWrapper from gn3.settings import SPARQL_ENDPOINT @@ -297,7 +297,7 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): **group } -def sparql_query(query: str) -> Dict[str, Any]: +def sparql_query(query: str) -> List[Dict[str, Any]]: """Run a SPARQL query and return the bound variables.""" sparql = SPARQLWrapper(SPARQL_ENDPOINT) sparql.setQuery(query) -- cgit v1.2.3 From fa227ea35710658c7d8314315ee072a641c163f9 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 23 Dec 2021 13:58:07 +0530 Subject: db: Assist mypy with explicit type declaration. * gn3/db/datasets.py (dataset_metadata): Assist mypy by explicitly declaring type of return dictionary. --- gn3/db/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gn3') diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index ca48156..a41e228 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -362,8 +362,8 @@ WHERE { OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . } } """] - result = {'accession_id': accession_id, - 'investigator': {}} + result: Dict[str, Any] = {'accession_id': accession_id, + 'investigator': {}} query_result = {} for query in queries: if sparql_result := sparql_query(Template(query).substitute(accession_id=accession_id)): -- cgit v1.2.3 From ed8ee3077211cc227089f87929a70ac8b7c4593f Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Dec 2021 09:29:11 +0300 Subject: Add API endpoint for partial correlations Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Add an API endpoint for the partial correlation. * gn3/api/correlation.py: --- gn3/api/correlation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'gn3') diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py index e936eaf..f84228a 100644 --- a/gn3/api/correlation.py +++ b/gn3/api/correlation.py @@ -1,4 +1,6 @@ """Endpoints for running correlations""" +import json + from flask import jsonify from flask import Blueprint from flask import request @@ -87,9 +89,20 @@ def compute_tissue_corr(corr_method="pearson"): @correlation.route("/partial", methods=["POST"]) def partial_correlation(): + """API endpoint for partial correlations.""" def trait_fullname(trait): return f"{trait['dataset']}::{trait['name']}" + class OutputEncoder(json.JSONEncoder): + """ + Class to encode output into JSON, for objects which the default + json.JSONEncoder class does not have default encoding for. + """ + def default(self, obj): + if isinstance(obj, bytes): + return str(obj, encoding="utf-8") + return json.JSONEncoder.default(self, obj) + args = request.get_json() conn, _cursor_object = database_connector() corr_results = partial_correlations_entry( -- cgit v1.2.3 From 5abe0158daed9428484925b95bb4a8b2251adea2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Sat, 18 Dec 2021 08:09:04 +0300 Subject: Add dataset type to the results Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * The dataset type is relevant for the display of the data, therefore, this commit presents the dataset type as part of the results. --- gn3/computations/partial_correlations.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'gn3') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 13c411a..1c02533 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -717,7 +717,8 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] return { "status": "success", "results": { - "primary_trait": primary_trait, - "control_traits": cntrl_traits, - "correlations": trait_list + "primary_trait": primary_trait, + "control_traits": cntrl_traits, + "correlations": trait_list, + "dataset_type": target_dataset["type"] }} -- cgit v1.2.3 From 4304e7298af6769110a251b21fca2f105bee4c06 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Sat, 18 Dec 2021 11:36:07 +0300 Subject: Reduce the total amount of data to be output Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * There is a lot of data that is not necessary in the final result. This commit removes that data, retaining only data relevant for the display. --- gn3/computations/partial_correlations.py | 56 ++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) (limited to 'gn3') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 1c02533..9fb17f7 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -526,6 +526,54 @@ def tissue_correlation_by_list( } for trait in trait_list) return trait_list +def trait_for_output(trait): + """ + Process a trait for output. + + Removes a lot of extraneous data from the trait, that is not needed for + the display of partial correlation results. + This function also removes all key-value pairs, for which the value is + `None`, because it is a waste of network resources to transmit the key-value + pair just to indicate it does not exist. + """ + trait = { + "trait_type": trait["trait_type"], + "dataset_name": trait["db"]["dataset_name"], + "dataset_type": trait["db"]["dataset_type"], + "group": trait["db"]["group"], + "trait_fullname": trait["trait_fullname"], + "trait_name": trait["trait_name"], + "symbol": trait.get("symbol"), + "description": trait.get("description"), + "pre_publication_description": trait.get( + "pre_publication_description"), + "post_publication_description": trait.get( + "post_publication_description"), + "original_description": trait.get( + "original_description"), + "authors": trait.get("authors"), + "year": trait.get("year"), + "probe_target_description": trait.get( + "probe_target_description"), + "chr": trait.get("chr"), + "mb": trait.get("mb"), + "geneid": trait.get("geneid"), + "homologeneid": trait.get("homologeneid"), + "noverlap": trait.get("noverlap"), + "partial_corr": trait.get("partial_corr"), + "partial_corr_p_value": trait.get("partial_corr_p_value"), + "corr": trait.get("corr"), + "corr_p_value": trait.get("corr_p_value"), + "rank_order": trait.get("rank_order"), + "delta": ( + None if trait.get("partial_corr") is None + else (trait.get("partial_corr") - trait.get("corr"))), + "l_corr": trait.get("l_corr"), + "tissue_corr": trait.get("tissue_corr"), + "tissue_p_value": trait.get("tissue_p_value") + } + return {key: val for key, val in trait.items() if val is not None} + def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] conn: Any, primary_trait_name: str, control_trait_names: Tuple[str, ...], method: str, @@ -717,8 +765,10 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] return { "status": "success", "results": { - "primary_trait": primary_trait, - "control_traits": cntrl_traits, - "correlations": trait_list, + "primary_trait": trait_for_output(primary_trait), + "control_traits": tuple( + trait_for_output(trait) for trait in cntrl_traits), + "correlations": tuple( + trait_for_output(trait) for trait in trait_list), "dataset_type": target_dataset["type"] }} -- cgit v1.2.3 From c813dd68230a027b1b5acdbe9d3dba46f6bd1ad0 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Dec 2021 09:22:49 +0300 Subject: Encode the data to JSON and set the status code Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Encode bytes objects to string * Encode NaN values to "null" * gn3/api/correlation.py: --- gn3/api/correlation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'gn3') diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py index f84228a..c0b5806 100644 --- a/gn3/api/correlation.py +++ b/gn3/api/correlation.py @@ -1,9 +1,9 @@ """Endpoints for running correlations""" import json - from flask import jsonify from flask import Blueprint from flask import request +from flask import make_response from gn3.computations.correlations import compute_all_sample_correlation from gn3.computations.correlations import compute_all_lit_correlation @@ -109,6 +109,8 @@ def partial_correlation(): conn, trait_fullname(args["primary_trait"]), tuple(trait_fullname(trait) for trait in args["control_traits"]), args["method"], int(args["criteria"]), args["target_db"]) - return make_response( - jsonify(corr_results), - 400) + response = make_response( + json.dumps(corr_results, cls=OutputEncoder), + 400 if "error" in corr_results.keys() else 200) + response.headers["Content-Type"] = "application/json" + return response -- cgit v1.2.3 From ac8528c5847f4a517c16b5283c06d3caeae8ef5e Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 20 Dec 2021 07:19:16 +0300 Subject: Replace `NaN` with `null` in JSON string Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * `NaN` is not a valid JSON value, and leads to errors in the code. This commit replaces all `NaN` values with `null`. --- gn3/api/correlation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gn3') diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py index c0b5806..1caf31f 100644 --- a/gn3/api/correlation.py +++ b/gn3/api/correlation.py @@ -110,7 +110,7 @@ def partial_correlation(): tuple(trait_fullname(trait) for trait in args["control_traits"]), args["method"], int(args["criteria"]), args["target_db"]) response = make_response( - json.dumps(corr_results, cls=OutputEncoder), + json.dumps(corr_results, cls=OutputEncoder).replace(": NaN", ": null"), 400 if "error" in corr_results.keys() else 200) response.headers["Content-Type"] = "application/json" return response -- cgit v1.2.3 From 0508fc422c033cfff8bbea118f85282212d236e4 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 20 Dec 2021 09:38:17 +0300 Subject: Return the correlation method used Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Return the correlation method used --- gn3/computations/partial_correlations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gn3') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 9fb17f7..dbcbe29 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -770,5 +770,6 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] trait_for_output(trait) for trait in cntrl_traits), "correlations": tuple( trait_for_output(trait) for trait in trait_list), - "dataset_type": target_dataset["type"] + "dataset_type": target_dataset["type"], + "method": "spearman" if "spearman" in method.lower() else "pearson" }} -- cgit v1.2.3 From 2d3b6eae6953d5e4b00f21b5ffd683271d0f76bc Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 21 Dec 2021 13:04:51 +0300 Subject: Fix sorting Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Update the sorting algorithm, for literature and tissue correlations so that it sorts the results by the correlation value first then by the p-value next. --- gn3/computations/partial_correlations.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'gn3') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index dbcbe29..1e4a646 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -717,19 +717,30 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] def __make_sorter__(method): - def __sort_6__(row): - return row[6] - - def __sort_3__(row): + def __compare_lit_or_tiss_correlation_values_(row): + # Index Content + # 0 trait name + # 1 N + # 2 partial correlation coefficient + # 3 p value of partial correlation + # 6 literature/tissue correlation value + return (row[6], row[3]) + + def __compare_partial_correlation_p_values__(row): + # Index Content + # 0 trait name + # 1 partial correlation coefficient + # 2 N + # 3 p value of partial correlation return row[3] if "literature" in method.lower(): - return __sort_6__ + return __compare_lit_or_tiss_correlation_values_ if "tissue" in method.lower(): - return __sort_6__ + return __compare_lit_or_tiss_correlation_values_ - return __sort_3__ + return __compare_partial_correlation_p_values__ sorted_correlations = sorted( all_correlations, key=__make_sorter__(method)) -- cgit v1.2.3 From 672c1f2d43961feb59d9549557cb10c4f0b19dcf Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Dec 2021 09:13:32 +0300 Subject: Fix linting errors --- gn3/authentication.py | 5 +++++ gn3/computations/correlations.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'gn3') diff --git a/gn3/authentication.py b/gn3/authentication.py index a6372c1..4aedacd 100644 --- a/gn3/authentication.py +++ b/gn3/authentication.py @@ -163,3 +163,8 @@ def create_group(conn: Redis, group_name: Optional[str], } conn.hset("groups", group_id, json.dumps(group)) return group + # This might break stuff, but it fixes the linting error regarding + # inconsistent return types. + # @BonfaceKilz please review this and replace with appropriate return and + # remove these comments. + return None diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index d38946e..345b8d7 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -79,7 +79,7 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals, zip(*list(normalize_values(trait_vals, target_samples_vals)))) num_overlap = len(normalized_traits_vals) except ValueError: - return + return None if num_overlap > 5: -- cgit v1.2.3 From 784447b17d85a618005ac9acfc57f5b7ef8f5169 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 22 Dec 2021 09:14:04 +0300 Subject: Fix typing errors --- gn3/computations/correlations.py | 3 ++- gn3/computations/partial_correlations.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'gn3') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 345b8d7..1b4b3a4 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -7,6 +7,7 @@ from typing import List from typing import Tuple from typing import Optional from typing import Callable +from typing import Generator import scipy.stats import pingouin as pg @@ -106,7 +107,7 @@ package :not packaged in guix def filter_shared_sample_keys(this_samplelist, - target_samplelist) -> Tuple[List, List]: + target_samplelist) -> Generator: """Given primary and target sample-list for two base and target trait select filter the values using the shared keys diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 1e4a646..984c15a 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -217,7 +217,7 @@ def good_dataset_samples_indexes( def partial_correlations_fast(# pylint: disable=[R0913, R0914] samples, primary_vals, control_vals, database_filename, fetched_correlations, method: str, correlation_type: str) -> Tuple[ - float, Tuple[float, ...]]: + int, Tuple[float, ...]]: """ Computes partial correlation coefficients using data from a CSV file. @@ -350,7 +350,9 @@ def compute_partial( def partial_correlations_normal(# pylint: disable=R0913 primary_vals, control_vals, input_trait_gene_id, trait_database, data_start_pos: int, db_type: str, method: str) -> Tuple[ - float, Tuple[float, ...]]: + int, Tuple[Union[ + Tuple[str, int, float, float, float, float], None], + ...]]:#Tuple[float, ...] """ Computes the correlation coefficients. @@ -485,7 +487,7 @@ def literature_correlation_by_list( def tissue_correlation_by_list( conn: Any, primary_trait_symbol: str, tissue_probeset_freeze_id: int, - method: str, trait_list: Tuple[dict]) -> Tuple[dict]: + method: str, trait_list: Tuple[dict]) -> Tuple[dict, ...]: """ This is a migration of the `web.webqtl.correlation.CorrelationPage.getTissueCorrelationByList` @@ -508,7 +510,7 @@ def tissue_correlation_by_list( primary_trait_value = prim_trait_symbol_value_dict[ primary_trait_symbol.lower()] gene_symbol_list = tuple( - trait for trait in trait_list if "symbol" in trait.keys()) + trait["symbol"] for trait in trait_list if "symbol" in trait.keys()) symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( gene_symbol_list, tissue_probeset_freeze_id, conn) return tuple( -- cgit v1.2.3