Merge branch 'main' into fix/check-for-duplicates-before-deletions-or-insertions

author: BonfaceKilz 2022-01-05 17:01:24 +0300
committer: GitHub 2022-01-05 17:01:24 +0300
commit: 0b1643e87cf4303db3673dcf3cf240aeb4d518cb (patch)
tree: 62ed5cae507303a469c040eb47a8b5b70b3a1a93
parent: ac28fb48e4e3197de6bfeef332198b70689837c9 (diff)
parent: af52afa4318feadfa3cd1cc4dcdd3d86907f68a4 (diff)
download: genenetwork3-0b1643e87cf4303db3673dcf3cf240aeb4d518cb.tar.gz
7 files changed, 121 insertions, 27 deletions
diff --git a/.guix_deploy b/.guix_deploy
new file mode 100644
index 0000000..c7bbb5b
--- /dev/null
+++ b/.guix_deploy
@@ -0,0 +1,8 @@
+# Deploy script on tux01
+#
+# echo Run tests:
+# echo python -m unittest discover -v
+# echo Run service (single process):
+# echo flask run --port=8080
+
+/home/wrk/opt/guix-pull/bin/guix shell -L /home/wrk/guix-bioinformatics/ --expose=$HOME/production/genotype_files/ -C -N -Df guix.scm
diff --git a/.pylintrc b/.pylintrc
index 0bdef23..00dd6cd 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,3 +1,7 @@
 [SIMILARITIES]
 
-ignore-imports=yes
\ No newline at end of file
+ignore-imports=yes
+
+[MESSAGES CONTROL]
+
+disable=fixme
\ No newline at end of file
diff --git a/README.md b/README.md
index d3470ee..5669192 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,6 @@ python3
 guix shell -C --network --expose=$HOME/genotype_files/ -Df guix.scm
 ```
 
-
 #### Using a Guix profile (or rolling back)
 
 Create a new profile with
@@ -128,6 +127,8 @@ And for the scalable production version run
 gunicorn --bind 0.0.0.0:8080 --workers 8 --keep-alive 6000 --max-requests 10 --max-requests-jitter 5 --timeout 1200 wsgi:app
 ```
 
+(see also the [.guix_deploy](./.guix_deploy) script)
+
 ## Using python-pip
 
 IMPORTANT NOTE: we do not recommend using pip tools, use Guix instead
diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py
index e936eaf..1caf31f 100644
--- a/gn3/api/correlation.py
+++ b/gn3/api/correlation.py
@@ -1,7 +1,9 @@
 """Endpoints for running correlations"""
+import json
 from flask import jsonify
 from flask import Blueprint
 from flask import request
+from flask import make_response
 
 from gn3.computations.correlations import compute_all_sample_correlation
 from gn3.computations.correlations import compute_all_lit_correlation
@@ -87,15 +89,28 @@ def compute_tissue_corr(corr_method="pearson"):
 
 @correlation.route("/partial", methods=["POST"])
 def partial_correlation():
+    """API endpoint for partial correlations."""
     def trait_fullname(trait):
         return f"{trait['dataset']}::{trait['name']}"
 
+    class OutputEncoder(json.JSONEncoder):
+        """
+        JSON encoder for output objects that the default json.JSONEncoder
+        cannot serialise; `bytes` values are decoded as UTF-8 strings.
+        """
+        def default(self, obj):
+            if isinstance(obj, bytes):
+                return str(obj, encoding="utf-8")
+            return json.JSONEncoder.default(self, obj)
+
     args = request.get_json()
     conn, _cursor_object = database_connector()
     corr_results = partial_correlations_entry(
         conn, trait_fullname(args["primary_trait"]),
         tuple(trait_fullname(trait) for trait in args["control_traits"]),
         args["method"], int(args["criteria"]), args["target_db"])
-    return make_response(
-        jsonify(corr_results),
-        400)
+    response = make_response(
+        json.dumps(corr_results, cls=OutputEncoder).replace(": NaN", ": null"),
+        400 if "error" in corr_results.keys() else 200)
+    response.headers["Content-Type"] = "application/json"
+    return response
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index d38946e..1b4b3a4 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -7,6 +7,7 @@ from typing import List
 from typing import Tuple
 from typing import Optional
 from typing import Callable
+from typing import Generator
 
 import scipy.stats
 import pingouin as pg
@@ -79,7 +80,7 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
             zip(*list(normalize_values(trait_vals, target_samples_vals))))
         num_overlap = len(normalized_traits_vals)
     except ValueError:
-        return
+        return None
 
     if num_overlap > 5:
 
@@ -106,7 +107,7 @@ package :not packaged in guix
 
 
 def filter_shared_sample_keys(this_samplelist,
-                              target_samplelist) -> Tuple[List, List]:
+                              target_samplelist) -> Generator:
     """Given primary and target sample-list for two base and target trait select
     filter the values using the shared keys
 
diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py
index 13c411a..984c15a 100644
--- a/gn3/computations/partial_correlations.py
+++ b/gn3/computations/partial_correlations.py
@@ -217,7 +217,7 @@ def good_dataset_samples_indexes(
 def partial_correlations_fast(# pylint: disable=[R0913, R0914]
         samples, primary_vals, control_vals, database_filename,
         fetched_correlations, method: str, correlation_type: str) -> Tuple[
-            float, Tuple[float, ...]]:
+            int, Tuple[float, ...]]:
     """
     Computes partial correlation coefficients using data from a CSV file.
 
@@ -350,7 +350,9 @@ def compute_partial(
 def partial_correlations_normal(# pylint: disable=R0913
         primary_vals, control_vals, input_trait_gene_id, trait_database,
         data_start_pos: int, db_type: str, method: str) -> Tuple[
-            float, Tuple[float, ...]]:
+            int, Tuple[Union[
+                Tuple[str, int, float, float, float, float], None],
+                       ...]]:#Tuple[float, ...]
     """
     Computes the correlation coefficients.
 
@@ -485,7 +487,7 @@ def literature_correlation_by_list(
 
 def tissue_correlation_by_list(
         conn: Any, primary_trait_symbol: str, tissue_probeset_freeze_id: int,
-        method: str, trait_list: Tuple[dict]) -> Tuple[dict]:
+        method: str, trait_list: Tuple[dict]) -> Tuple[dict, ...]:
     """
     This is a migration of the
     `web.webqtl.correlation.CorrelationPage.getTissueCorrelationByList`
@@ -508,7 +510,7 @@ def tissue_correlation_by_list(
             primary_trait_value = prim_trait_symbol_value_dict[
                 primary_trait_symbol.lower()]
             gene_symbol_list = tuple(
-                trait for trait in trait_list if "symbol" in trait.keys())
+                trait["symbol"] for trait in trait_list if "symbol" in trait.keys())
             symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait(
                 gene_symbol_list, tissue_probeset_freeze_id, conn)
             return tuple(
@@ -526,6 +528,54 @@ def tissue_correlation_by_list(
         } for trait in trait_list)
     return trait_list
 
+def trait_for_output(trait):
+    """
+    Process a trait for output.
+
+    Keeps only the fields needed to display partial correlation results and
+    drops the rest of the trait's data.
+    Key-value pairs whose value is `None` are also removed, because it is a
+    waste of network resources to transmit a key-value pair just to indicate
+    that the value does not exist.
+    """
+    trait = {
+        "trait_type": trait["trait_type"],
+        "dataset_name": trait["db"]["dataset_name"],
+        "dataset_type": trait["db"]["dataset_type"],
+        "group": trait["db"]["group"],
+        "trait_fullname": trait["trait_fullname"],
+        "trait_name": trait["trait_name"],
+        "symbol": trait.get("symbol"),
+        "description": trait.get("description"),
+        "pre_publication_description": trait.get(
+            "pre_publication_description"),
+        "post_publication_description": trait.get(
+            "post_publication_description"),
+        "original_description": trait.get(
+            "original_description"),
+        "authors": trait.get("authors"),
+        "year": trait.get("year"),
+        "probe_target_description": trait.get(
+            "probe_target_description"),
+        "chr": trait.get("chr"),
+        "mb": trait.get("mb"),
+        "geneid": trait.get("geneid"),
+        "homologeneid": trait.get("homologeneid"),
+        "noverlap": trait.get("noverlap"),
+        "partial_corr": trait.get("partial_corr"),
+        "partial_corr_p_value": trait.get("partial_corr_p_value"),
+        "corr": trait.get("corr"),
+        "corr_p_value": trait.get("corr_p_value"),
+        "rank_order": trait.get("rank_order"),
+        "delta": (
+            None if trait.get("partial_corr") is None
+            else (trait.get("partial_corr") - trait.get("corr"))),
+        "l_corr":  trait.get("l_corr"),
+        "tissue_corr": trait.get("tissue_corr"),
+        "tissue_p_value": trait.get("tissue_p_value")
+    }
+    return {key: val for key, val in trait.items() if val is not None}
+
 def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911]
         conn: Any, primary_trait_name: str,
         control_trait_names: Tuple[str, ...], method: str,
@@ -669,19 +719,30 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911]
 
 
     def __make_sorter__(method):
-        def __sort_6__(row):
-            return row[6]
-
-        def __sort_3__(row):
+        def __compare_lit_or_tiss_correlation_values_(row):
+            # Index  Content
+            # 0      trait name
+            # 1      N
+            # 2      partial correlation coefficient
+            # 3      p value of partial correlation
+            # 6      literature/tissue correlation value
+            return (row[6], row[3])
+
+        def __compare_partial_correlation_p_values__(row):
+            # Index  Content
+            # 0      trait name
+            # 1      N
+            # 2      partial correlation coefficient
+            # 3      p value of partial correlation
             return row[3]
 
         if "literature" in method.lower():
-            return __sort_6__
+            return __compare_lit_or_tiss_correlation_values_
 
         if "tissue" in method.lower():
-            return __sort_6__
+            return __compare_lit_or_tiss_correlation_values_
 
-        return __sort_3__
+        return __compare_partial_correlation_p_values__
 
     sorted_correlations = sorted(
         all_correlations, key=__make_sorter__(method))
@@ -717,7 +778,11 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911]
     return {
         "status": "success",
         "results": {
-        "primary_trait": primary_trait,
-        "control_traits": cntrl_traits,
-        "correlations": trait_list
+            "primary_trait": trait_for_output(primary_trait),
+            "control_traits": tuple(
+                trait_for_output(trait) for trait in cntrl_traits),
+            "correlations": tuple(
+                trait_for_output(trait) for trait in trait_list),
+            "dataset_type": target_dataset["type"],
+            "method": "spearman" if "spearman" in method.lower() else "pearson"
         }}
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index c50e148..a41e228 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -3,7 +3,7 @@ This module contains functions relating to specific trait dataset manipulation
 """
 import re
 from string import Template
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
 from SPARQLWrapper import JSON, SPARQLWrapper
 from gn3.settings import SPARQL_ENDPOINT
 
@@ -297,7 +297,7 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
         **group
     }
 
-def sparql_query(query: str) -> Dict[str, Any]:
+def sparql_query(query: str) -> List[Dict[str, Any]]:
     """Run a SPARQL query and return the bound variables."""
     sparql = SPARQLWrapper(SPARQL_ENDPOINT)
     sparql.setQuery(query)
@@ -328,7 +328,7 @@ WHERE {
   OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
 }
 """,
-             """
+               """
 PREFIX gn: <http://genenetwork.org/>
 SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
 WHERE {
@@ -341,7 +341,7 @@ WHERE {
   OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
 }
 """,
-             """
+               """
 PREFIX gn: <http://genenetwork.org/>
 SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
        ?about_data_processing ?notes ?experiment_design ?contributors
@@ -362,8 +362,8 @@ WHERE {
   OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
 }
 """]
-    result = {'accession_id': accession_id,
-              'investigator': {}}
+    result: Dict[str, Any] = {'accession_id': accession_id,
+                              'investigator': {}}
     query_result = {}
     for query in queries:
         if sparql_result := sparql_query(Template(query).substitute(accession_id=accession_id)):
author	BonfaceKilz	2022-01-05 17:01:24 +0300
committer	GitHub	2022-01-05 17:01:24 +0300
commit	0b1643e87cf4303db3673dcf3cf240aeb4d518cb (patch)
tree	62ed5cae507303a469c040eb47a8b5b70b3a1a93
parent	ac28fb48e4e3197de6bfeef332198b70689837c9 (diff)
parent	af52afa4318feadfa3cd1cc4dcdd3d86907f68a4 (diff)
download	genenetwork3-0b1643e87cf4303db3673dcf3cf240aeb4d518cb.tar.gz