author     Alexander Kabui   2021-05-15 01:20:46 +0300
committer  GitHub            2021-05-15 01:20:46 +0300
commit     c69b11cffba7547d65ac9812b0118cddad91be0d (patch)
tree       bfa55c23f702db5072e8839e44961dd72fd4a9e2
parent     bcba700bd2835f0a36042d781860b3407519f6d8 (diff)
parent     46a96ec0b89620eed4874ada565a9643ac19a042 (diff)
download   genenetwork3-c69b11cffba7547d65ac9812b0118cddad91be0d.tar.gz
Merge branch 'main' into feature/minor-fixes
-rw-r--r--  README.md                                            49
-rw-r--r--  gn3/api/gemma.py                                      4
-rw-r--r--  gn3/api/general.py                                   16
-rw-r--r--  gn3/computations/correlations.py                    131
-rw-r--r--  gn3/computations/gemma.py                             2
-rw-r--r--  gn3/fs_helpers.py (renamed from gn3/file_utils.py)    0
-rw-r--r--  gn3/settings.py                                       1
-rw-r--r--  tests/integration/test_general.py                    11
-rw-r--r--  tests/unit/test_file_utils.py                        20
9 files changed, 120 insertions, 114 deletions
diff --git a/README.md b/README.md
index b18fdf1..c1acba1 100644
--- a/README.md
+++ b/README.md
@@ -3,34 +3,27 @@ GeneNetwork3 REST API for data science and machine learning
 
 ## Installation
 
-##### Using python-pip
+#### Using guix
 
-1. Prepare your system. You need to make you have python > 3.8, and
-   the ability to install modules.
-2. Create and enter your virtualenv:
+Simply load up the environment (for development purposes):
 
 ```bash
-virtualenv --python python3 venv
-. venv/bin/activate
+guix environment --load=guix.scm
 ```
-3. Install the required packages
+
+Also, make sure you have the *guix-bioinformatics* channel set up.
 
 ```bash
-# The --ignore-installed flag forces packages to
-# get installed in the venv even if they existed 
-# in the global env
-pip install -r requirements.txt --ignore-installed
+env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --load=guix.scm
+python3
+>>> import redis  # at the Python prompt, verify packages resolve in the environment
 ```
 
-#### Using guix
+Better still, run a proper container:
 
-Simply load up the environment (for development purposes):
-
-```bash
-guix environment --load=guix.scm
 ```
-
-Also, make sure you have the *guix-bioinformatics* channel set up.
+env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --load=guix.scm
+```
 
 #### Running Tests
 
@@ -62,6 +55,26 @@ To spin up the server:
 env FLASK_DEBUG=1 FLASK_APP="main.py" flask run --port=8080
 ```
 
+##### Using python-pip
+
+IMPORTANT NOTE: we do not recommend using pip tools; use Guix instead.
+
+1. Prepare your system. You need to make sure you have Python > 3.8 and
+   the ability to install modules.
+2. Create and enter your virtualenv:
+
+```bash
+virtualenv --python python3 venv
+. venv/bin/activate
+```
+3. Install the required packages
+
+```bash
+# The --ignore-installed flag forces packages to
+# get installed in the venv even if they existed 
+# in the global env
+pip install -r requirements.txt --ignore-installed
+```
 
 #### A note on dependencies
 
diff --git a/gn3/api/gemma.py b/gn3/api/gemma.py
index 81e185d..6b0b20e 100644
--- a/gn3/api/gemma.py
+++ b/gn3/api/gemma.py
@@ -9,8 +9,8 @@ from flask import request
 
 from gn3.commands import queue_cmd
 from gn3.commands import run_cmd
-from gn3.file_utils import cache_ipfs_file
-from gn3.file_utils import jsonfile_to_dict
+from gn3.fs_helpers import cache_ipfs_file
+from gn3.fs_helpers import jsonfile_to_dict
 from gn3.computations.gemma import generate_gemma_cmd
 from gn3.computations.gemma import do_paths_exist
 
diff --git a/gn3/api/general.py b/gn3/api/general.py
index 38e6154..cebb2e3 100644
--- a/gn3/api/general.py
+++ b/gn3/api/general.py
@@ -5,7 +5,8 @@ from flask import current_app
 from flask import jsonify
 from flask import request
 
-from gn3.file_utils import extract_uploaded_file
+from gn3.fs_helpers import extract_uploaded_file
+from gn3.commands import run_cmd
 
 
 general = Blueprint("general", __name__)
@@ -50,3 +51,16 @@ TTL is set in the metadata file. If none is provided, the default is 1 week.
     if results.get("status") > 0:
         status = 500
     return jsonify(results), status
+
+
+@general.route("/qtl/run/<geno_filestr>/<pheno_filestr>",
+               methods=["POST"],
+               strict_slashes=False)
+def run_r_qtl(geno_filestr, pheno_filestr):
+    """Run r_qtl command using the written rqtl_wrapper program
+
+    """
+    rqtl_wrapper = current_app.config["RQTL_WRAPPER"]
+    cmd = (f"Rscript {rqtl_wrapper} "
+           f"{geno_filestr} {pheno_filestr}")
+    return jsonify(run_cmd(cmd)), 201
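
For a quick sanity check, the new route can be exercised with a plain HTTP POST. A minimal sketch using only the standard library, assuming the blueprint is mounted under the `/api` prefix (as the integration test below does) and a local server on port 8080; the file names are placeholders:

```python
# Minimal sketch: POST to the new rqtl endpoint on an assumed local server.
import json
import urllib.request

URL = "http://localhost:8080/api/qtl/run/geno_file_test/pheno_file_test"
with urllib.request.urlopen(urllib.request.Request(URL, method="POST")) as resp:
    print(resp.status)      # expect 201 on success
    print(json.load(resp))  # the captured output of run_cmd
```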
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 4fdf8cf..857ceb0 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -9,12 +9,17 @@ from typing import Callable
 import scipy.stats
 
 
-def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict)-> List:
-    """Function to construct target dataset data items given commoned shared\
-    keys and trait samplelist values for example given keys  >>>>>>>>>>\
-    ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"] and value object as\
-    "HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],TXD_AT": [6.2, 5.7, 3.6, 1.5, 4.2, 2.3]}\
-    return  results should be a list of dicts mapping the shared keys to the trait values"""
+def map_shared_keys_to_values(target_sample_keys: List,
+                              target_sample_vals: dict) -> List:
+    """Function to construct target dataset data items given common shared keys
+    and trait sample-list values for example given keys
+
+    >>>>>>>>>> ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"] and value
+    object as "HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],TXD_AT": [6.2, 5.7,
+    3.6, 1.5, 4.2, 2.3]} return results should be a list of dicts mapping the
+    shared keys to the trait values
+
+    """
     target_dataset_data = []
 
     for trait_id, sample_values in target_sample_vals.items():
@@ -32,9 +37,9 @@ def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict
 
 def normalize_values(a_values: List,
                      b_values: List) -> Tuple[List[float], List[float], int]:
-    """Trim two lists of values to contain only the values they both share
-    Given two lists of sample values, trim each list so that it contains only
-    the samples that contain a value in both lists. Also returns the number of
+    """Trim two lists of values to contain only the values they both share Given
+    two lists of sample values, trim each list so that it contains only the
+    samples that contain a value in both lists. Also returns the number of
     such samples.
 
     >>> normalize_values([2.3, None, None, 3.2, 4.1, 5],
@@ -62,16 +67,14 @@ pearson,spearman and biweight mid correlation return value is rho and p_value
         "pearson": scipy.stats.pearsonr,
         "spearman": scipy.stats.spearmanr
     }
-
     use_corr_method = corr_mapping.get(corr_method, "spearman")
-
     corr_coeffient, p_val = use_corr_method(primary_values, target_values)
-
     return (corr_coeffient, p_val)
 
 
 def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
-                                 target_samples_vals) -> Optional[Tuple[str, float, float, int]]:
+                                 target_samples_vals) -> Optional[
+                                     Tuple[str, float, float, int]]:
     """Given a primary trait values and target trait values calculate the
     correlation coeff and p value
 
@@ -90,7 +93,6 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
         # should use numpy.isNan scipy.isNan is deprecated
         if corr_coeffient is not None:
             return (trait_name, corr_coeffient, p_value, num_overlap)
-
     return None
 
 
@@ -99,15 +101,16 @@ def do_bicor(x_val, y_val) -> Tuple[float, float]:
 package :not packaged in guix
 
     """
-    _corr_input = (x_val, y_val)
-    return (0.0, 0.0)
+    x_val, y_val = 0.0, 0.0  # placeholder until bicor is packaged in guix
+    return (x_val, y_val)
 
 
 def filter_shared_sample_keys(this_samplelist,
                               target_samplelist) -> Tuple[List, List]:
-    """Given primary and target samplelist\
-    for two base and target trait select\
-    filter the values using the shared keys"""
+    """Given primary and target sample-list for two base and target trait select
+    filter the values using the shared keys
+
+    """
     this_vals = []
     target_vals = []
     for key, value in target_samplelist.items():
@@ -120,21 +123,18 @@ def filter_shared_sample_keys(this_samplelist,
 def compute_all_sample_correlation(this_trait,
                                    target_dataset,
                                    corr_method="pearson") -> List:
-    """Given a trait data samplelist and\
-    target__datasets compute all sample correlation
+    """Given a trait data sample-list and target__datasets compute all sample
+    correlation
+
     """
     # xtodo fix trait_name currently returning single one
     # pylint: disable-msg=too-many-locals
-
     this_trait_samples = this_trait["trait_sample_data"]
     corr_results = []
     processed_values = []
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
-        # this_vals, target_vals = filter_shared_sample_keys(
-        #     this_trait_samples, target_trait_data)
-
         processed_values.append((trait_name, corr_method, *filter_shared_sample_keys(
             this_trait_samples, target_trait_data)))
     with multiprocessing.Pool(4) as pool:
@@ -144,7 +144,6 @@ def compute_all_sample_correlation(this_trait,
             if sample_correlation is not None:
                 (trait_name, corr_coeffient, p_value,
                  num_overlap) = sample_correlation
-
                 corr_result = {
                     "corr_coeffient": corr_coeffient,
                     "p_value": p_value,
@@ -152,7 +151,6 @@ def compute_all_sample_correlation(this_trait,
                 }
 
                 corr_results.append({trait_name: corr_result})
-
     return sorted(
         corr_results,
         key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coeffient"]))
@@ -160,45 +158,36 @@ def compute_all_sample_correlation(this_trait,
 
 def benchmark_compute_all_sample(this_trait,
                                  target_dataset,
-                                 corr_method="pearson") ->List:
-    """Temp function to benchmark with compute_all_sample_r\
-    alternative to compute_all_sample_r where we use \
-    multiprocessing
-    """
+                                 corr_method="pearson") -> List:
+    """Temp function to benchmark with compute_all_sample_r alternative to
+    compute_all_sample_r where we use multiprocessing
 
+    """
     this_trait_samples = this_trait["trait_sample_data"]
-
     corr_results = []
-
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
         this_vals, target_vals = filter_shared_sample_keys(
             this_trait_samples, target_trait_data)
-
         sample_correlation = compute_sample_r_correlation(
             trait_name=trait_name,
             corr_method=corr_method,
             trait_vals=this_vals,
             target_samples_vals=target_vals)
-
         if sample_correlation is not None:
-            (trait_name, corr_coeffient, p_value, num_overlap) = sample_correlation
-
+            (trait_name, corr_coeffient,
+             p_value, num_overlap) = sample_correlation
         else:
             continue
-
         corr_result = {
             "corr_coeffient": corr_coeffient,
             "p_value": p_value,
             "num_overlap": num_overlap
         }
-
         corr_results.append({trait_name: corr_result})
-
     return corr_results
-
-
+
+
 def tissue_correlation_for_trait(
         primary_tissue_vals: List,
         target_tissues_values: List,
@@ -232,8 +221,10 @@ def fetch_lit_correlation_data(
         input_mouse_gene_id: Optional[str],
         gene_id: str,
         mouse_gene_id: Optional[str] = None) -> Tuple[str, float]:
-    """Given input trait mouse gene id and mouse gene id fetch the lit\
-    corr_data"""
+    """Given input trait mouse gene id and mouse gene id fetch the lit
+    corr_data
+
+    """
     if mouse_gene_id is not None and ";" not in mouse_gene_id:
         query = """
         SELECT VALUE
@@ -260,7 +251,6 @@ def fetch_lit_correlation_data(
         lit_results = (gene_id, lit_corr_results[1])\
             if lit_corr_results else (gene_id, 0)
         return lit_results
-
     return (gene_id, 0)
 
 
@@ -272,11 +262,9 @@ def lit_correlation_for_trait(
     """given species,base trait gene id fetch the lit corr results from the db\
     output is float for lit corr results """
     fetched_lit_corr_results = []
-
     this_trait_mouse_gene_id = map_to_mouse_gene_id(conn=conn,
                                                     species=species,
                                                     gene_id=trait_gene_id)
-
     for (trait_name, target_trait_gene_id) in target_trait_lists:
         corr_results = {}
         if target_trait_gene_id:
@@ -284,29 +272,26 @@ def lit_correlation_for_trait(
                 conn=conn,
                 species=species,
                 gene_id=target_trait_gene_id)
-
             fetched_corr_data = fetch_lit_correlation_data(
                 conn=conn,
                 input_mouse_gene_id=this_trait_mouse_gene_id,
                 gene_id=target_trait_gene_id,
                 mouse_gene_id=target_mouse_gene_id)
-
             dict_results = dict(zip(("gene_id", "lit_corr"),
                                     fetched_corr_data))
             corr_results[trait_name] = dict_results
             fetched_lit_corr_results.append(corr_results)
-
     return fetched_lit_corr_results
 
 
 def query_formatter(query_string: str, *query_values):
-    """Formatter query string given the unformatted query string\
-    and the respectibe values.Assumes number of placeholders is
-    equal to the number of query values """
-    # xtodo escape sql queries
-    results = query_string % (query_values)
+    """Formatter query string given the unformatted query string and the
+    respectibe values.Assumes number of placeholders is equal to the number of
+    query values
 
-    return results
+    """
+    # xtodo escape sql queries
+    return query_string % (query_values)
 
 
 def map_to_mouse_gene_id(conn, species: Optional[str],
@@ -319,19 +304,15 @@ def map_to_mouse_gene_id(conn, species: Optional[str],
         return None
     if species == "mouse":
         return gene_id
-
     cursor = conn.cursor()
     query = """SELECT mouse
                 FROM GeneIDXRef
                 WHERE '%s' = '%s'"""
-
     query_values = (species, gene_id)
     cursor.execute(query_formatter(query,
                                    *query_values))
     results = cursor.fetchone()
-
     mouse_gene_id = results.mouse if results is not None else None
-
     return mouse_gene_id
 
 
@@ -358,21 +339,15 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict,
     """Function acts as an abstraction for tissue_correlation_for_trait\
     required input are target tissue object and primary tissue trait\
     target tissues data contains the trait_symbol_dict and symbol_tissue_vals
-
     """
-
     tissues_results = []
-
     primary_tissue_vals = primary_tissue_dict["tissue_values"]
     traits_symbol_dict = target_tissues_data["trait_symbol_dict"]
     symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"]
-
     target_tissues_list = process_trait_symbol_dict(
         traits_symbol_dict, symbol_tissue_vals_dict)
-
     for target_tissue_obj in target_tissues_list:
         trait_id = target_tissue_obj.get("trait_id")
-
         target_tissue_vals = target_tissue_obj.get("tissue_values")
 
         tissue_result = tissue_correlation_for_trait(
@@ -380,22 +355,18 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict,
             target_tissues_values=target_tissue_vals,
             trait_id=trait_id,
             corr_method=corr_method)
-
         tissue_result_dict = {trait_id: tissue_result}
         tissues_results.append(tissue_result_dict)
-
-    sorted_tissues_results = sorted(
+    return sorted(
         tissues_results,
         key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"]))
 
-    return sorted_tissues_results
-
 
 def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> List:
-    """Method for processing trait symbol\
-    dict given the symbol tissue values """
-    traits_tissue_vals = []
+    """Method for processing trait symbol dict given the symbol tissue values
 
+    """
+    traits_tissue_vals = []
     for (trait, symbol) in trait_symbol_dict.items():
         if symbol is not None:
             target_symbol = symbol.lower()
@@ -404,25 +375,21 @@ def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> Lis
                 target_tissue_dict = {"trait_id": trait,
                                       "symbol": target_symbol,
                                       "tissue_values": trait_tissue_val}
-
                 traits_tissue_vals.append(target_tissue_dict)
-
     return traits_tissue_vals
 
 
 def compute_tissue_correlation(primary_tissue_dict: dict,
                                target_tissues_data: dict,
                                corr_method: str):
-    """Experimental function that uses multiprocessing\
-    for computing tissue correlation
-    """
+    """Experimental function that uses multiprocessing for computing tissue
+    correlation
 
+    """
     tissues_results = []
-
     primary_tissue_vals = primary_tissue_dict["tissue_values"]
     traits_symbol_dict = target_tissues_data["trait_symbol_dict"]
     symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"]
-
     target_tissues_list = process_trait_symbol_dict(
         traits_symbol_dict, symbol_tissue_vals_dict)
     processed_values = []
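
To make the reworked map_shared_keys_to_values docstring concrete, here is a small usage sketch with the docstring's own sample data; the expected output shape follows the trait_id / trait_sample_data keys the rest of the module consumes (values are purely illustrative):

```python
# Sketch: mapping shared sample keys to per-trait values.
from gn3.computations.correlations import map_shared_keys_to_values

keys = ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"]
vals = {
    "HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],
    "TXD_AT": [6.2, 5.7, 3.6, 1.5, 4.2, 2.3],
}
# Expected: one dict per trait, e.g.
# [{"trait_id": "HCMA:_AT",
#   "trait_sample_data": {"BXD1": 4.1, "BXD2": 5.6, ...}}, ...]
print(map_shared_keys_to_values(keys, vals))
```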
diff --git a/gn3/computations/gemma.py b/gn3/computations/gemma.py
index 5f9d5a3..0b22d3c 100644
--- a/gn3/computations/gemma.py
+++ b/gn3/computations/gemma.py
@@ -7,7 +7,7 @@ from typing import Dict
 from typing import List
 from typing import ValuesView
 from gn3.commands import compose_gemma_cmd
-from gn3.file_utils import get_hash_of_files
+from gn3.fs_helpers import get_hash_of_files
 
 
 def generate_hash_of_string(unhashed_str: str) -> str:
diff --git a/gn3/file_utils.py b/gn3/fs_helpers.py
index 73f6567..73f6567 100644
--- a/gn3/file_utils.py
+++ b/gn3/fs_helpers.py
diff --git a/gn3/settings.py b/gn3/settings.py
index 7b3ffb7..2057ce1 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -10,6 +10,7 @@ CACHEDIR = ""
 REDIS_URI = "redis://localhost:6379/0"
 REDIS_JOB_QUEUE = "GN3::job-queue"
 TMPDIR = os.environ.get("TMPDIR", tempfile.gettempdir())
+RQTL_WRAPPER = "rqtl_wrapper.R"
 
 # SQL confs
 SQL_URI = os.environ.get("SQL_URI", "mysql://webqtlout:webqtlout@localhost/db_webqtl")
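
The new RQTL_WRAPPER setting is read by run_r_qtl through Flask's application config, so a deployment can point it at an absolute script path instead of the bare default. A minimal sketch, assuming the app follows the standard Flask config pattern (the path below is hypothetical):

```python
# Sketch: overriding RQTL_WRAPPER on a Flask app instance.
from flask import Flask

app = Flask(__name__)
app.config["RQTL_WRAPPER"] = "/opt/gn3/scripts/rqtl_wrapper.R"  # hypothetical path
# run_r_qtl then executes: Rscript /opt/gn3/scripts/rqtl_wrapper.R <geno> <pheno>
```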
diff --git a/tests/integration/test_general.py b/tests/integration/test_general.py
index 99c4824..8fc2b43 100644
--- a/tests/integration/test_general.py
+++ b/tests/integration/test_general.py
@@ -46,3 +46,14 @@ class GeneralAPITest(unittest.TestCase):
         self.assertEqual(response.get_json(),
                          {"status": 128,
                           "error": "gzip failed to unpack file"})
+
+    @mock.patch("gn3.api.general.run_cmd")
+    def test_run_r_qtl(self, mock_run_cmd):
+        """Test correct upload of file"""
+        mock_run_cmd.return_value = "Random results from STDOUT"
+        response = self.app.post("/api/qtl/run/"
+                                 "geno_file_test/"
+                                 "pheno_file_test")
+        self.assertEqual(response.status_code, 201)
+        self.assertEqual(response.get_json(),
+                         "Random results from STDOUT")
diff --git a/tests/unit/test_file_utils.py b/tests/unit/test_file_utils.py
index cc842d5..75be4f6 100644
--- a/tests/unit/test_file_utils.py
+++ b/tests/unit/test_file_utils.py
@@ -1,14 +1,14 @@
-"""Test cases for procedures defined in file_utils.py"""
+"""Test cases for procedures defined in fs_helpers.py"""
 import os
 import unittest
 
 from dataclasses import dataclass
 from typing import Callable
 from unittest import mock
-from gn3.file_utils import extract_uploaded_file
-from gn3.file_utils import get_dir_hash
-from gn3.file_utils import jsonfile_to_dict
-from gn3.file_utils import cache_ipfs_file
+from gn3.fs_helpers import extract_uploaded_file
+from gn3.fs_helpers import get_dir_hash
+from gn3.fs_helpers import jsonfile_to_dict
+from gn3.fs_helpers import cache_ipfs_file
 
 
 @dataclass
@@ -19,7 +19,7 @@ class MockFile:
 
 
 class TestFileUtils(unittest.TestCase):
-    """Test cases for procedures defined in file_utils.py"""
+    """Test cases for procedures defined in fs_helpers.py"""
 
     def test_get_dir_hash(self):
         """Test that a directory is hashed correctly"""
@@ -45,8 +45,8 @@ non-existent"""
         self.assertRaises(FileNotFoundError, jsonfile_to_dict,
                           "/non-existent-dir")
 
-    @mock.patch("gn3.file_utils.tarfile")
-    @mock.patch("gn3.file_utils.secure_filename")
+    @mock.patch("gn3.fs_helpers.tarfile")
+    @mock.patch("gn3.fs_helpers.secure_filename")
     def test_extract_uploaded_file(self, mock_file, mock_tarfile):
         """Test that the gzip file is extracted to the right location"""
         mock_file.return_value = "upload-data.tar.gz"
@@ -65,7 +65,7 @@ non-existent"""
         mock_file.assert_called_once_with("upload-data.tar.gz")
         self.assertEqual(result, {"status": 0, "token": "abcdef-abcdef"})
 
-    @mock.patch("gn3.file_utils.secure_filename")
+    @mock.patch("gn3.fs_helpers.secure_filename")
     def test_extract_uploaded_file_non_existent_gzip(self, mock_file):
         """Test that the right error message is returned when there is a problem
 extracting the file"""
@@ -96,7 +96,7 @@ extracting the file"""
         os.rmdir(test_dir)
         self.assertEqual(file_loc, f"{test_dir}/genotype.txt")
 
-    @mock.patch("gn3.file_utils.ipfshttpclient")
+    @mock.patch("gn3.fs_helpers.ipfshttpclient")
     def test_cache_ipfs_file_cache_miss(self,
                                         mock_ipfs):
         """Test that a file is cached if there's a cache miss"""