-rw-r--r--  README.md                                           |  49
-rw-r--r--  gn3/api/gemma.py                                    |   4
-rw-r--r--  gn3/api/general.py                                  |  16
-rw-r--r--  gn3/computations/correlations.py                    | 131
-rw-r--r--  gn3/computations/gemma.py                           |   2
-rw-r--r--  gn3/fs_helpers.py (renamed from gn3/file_utils.py)  |   0
-rw-r--r--  gn3/settings.py                                     |   1
-rw-r--r--  tests/integration/test_general.py                   |  11
-rw-r--r--  tests/unit/test_file_utils.py                       |  20

9 files changed, 120 insertions(+), 114 deletions(-)
diff --git a/README.md b/README.md
@@ -3,34 +3,27 @@ GeneNetwork3 REST API for data science and machine learning
 
 ## Installation
 
-##### Using python-pip
+#### Using guix
 
-1. Prepare your system. You need to make sure you have python > 3.8, and
-   the ability to install modules.
-2. Create and enter your virtualenv:
+Simply load up the environment (for development purposes):
 
 ```bash
-virtualenv --python python3 venv
-. venv/bin/activate
+guix environment --load=guix.scm
 ```
-3. Install the required packages
+
+Also, make sure you have the *guix-bioinformatics* channel set up.
 
 ```bash
-# The --ignore-installed flag forces packages to
-# get installed in the venv even if they existed
-# in the global env
-pip install -r requirements.txt --ignore-installed
+env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment --load=guix.scm
+python3
+  import redis
 ```
 
-#### Using guix
+Better yet, run a proper container:
 
-Simply load up the environment (for development purposes):
-
-```bash
-guix environment --load=guix.scm
 ```
-
-Also, make sure you have the *guix-bioinformatics* channel set up.
+env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ ~/.config/guix/current/bin/guix environment -C --network --load=guix.scm
+```
 
 #### Running Tests
@@ -62,6 +55,26 @@ To spin up the server:
 env FLASK_DEBUG=1 FLASK_APP="main.py" flask run --port=8080
 ```
 
+##### Using python-pip
+
+IMPORTANT NOTE: we do not recommend using pip tools, use Guix instead
+
+1. Prepare your system. You need to make sure you have python > 3.8, and
+   the ability to install modules.
+2. Create and enter your virtualenv:
+
+```bash
+virtualenv --python python3 venv
+. venv/bin/activate
+```
+3. Install the required packages
+
+```bash
+# The --ignore-installed flag forces packages to
+# get installed in the venv even if they existed
+# in the global env
+pip install -r requirements.txt --ignore-installed
+```
 
 #### A note on dependencies
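The README's final check above amounts to importing the service's dependencies inside the environment. A minimal smoke-test sketch along those lines; `redis` comes from the README's own check, while `flask` and `scipy` are assumptions based on the imports visible elsewhere in this commit:

```python
# Quick sanity check inside the guix environment.
# An ImportError here means the environment is incomplete.
import redis
import flask
import scipy

print(redis.__version__, flask.__version__, scipy.__version__)
```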
diff --git a/gn3/api/gemma.py b/gn3/api/gemma.py
index 81e185d..6b0b20e 100644
--- a/gn3/api/gemma.py
+++ b/gn3/api/gemma.py
@@ -9,8 +9,8 @@ from flask import request
 
 from gn3.commands import queue_cmd
 from gn3.commands import run_cmd
-from gn3.file_utils import cache_ipfs_file
-from gn3.file_utils import jsonfile_to_dict
+from gn3.fs_helpers import cache_ipfs_file
+from gn3.fs_helpers import jsonfile_to_dict
 from gn3.computations.gemma import generate_gemma_cmd
 from gn3.computations.gemma import do_paths_exist
diff --git a/gn3/api/general.py b/gn3/api/general.py
index 38e6154..cebb2e3 100644
--- a/gn3/api/general.py
+++ b/gn3/api/general.py
@@ -5,7 +5,8 @@ from flask import current_app
 from flask import jsonify
 from flask import request
 
-from gn3.file_utils import extract_uploaded_file
+from gn3.fs_helpers import extract_uploaded_file
+from gn3.commands import run_cmd
 
 general = Blueprint("general", __name__)
 
@@ -50,3 +51,16 @@ TTL is set in the metadata file. If none is provided, the default is 1 week.
     if results.get("status") > 0:
         status = 500
     return jsonify(results), status
+
+
+@general.route("/qtl/run/<geno_filestr>/<pheno_filestr>",
+               methods=["POST"],
+               strict_slashes=False)
+def run_r_qtl(geno_filestr, pheno_filestr):
+    """Run the r_qtl command using the rqtl_wrapper program"""
+    rqtl_wrapper = current_app.config["RQTL_WRAPPER"]
+    cmd = (f"Rscript {rqtl_wrapper} "
+           f"{geno_filestr} {pheno_filestr}")
+    return jsonify(run_cmd(cmd)), 201
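Once `RQTL_WRAPPER` resolves to a real script, the new route can be exercised over HTTP. A hedged sketch using `requests` against the development server from the README; the host, port, and file identifiers are placeholders:

```python
import requests

# Placeholder genotype/phenotype identifiers; the route interpolates
# them directly into the Rscript command line.
resp = requests.post(
    "http://localhost:8080/api/qtl/run/geno_file_test/pheno_file_test")

print(resp.status_code)  # 201 on success
print(resp.json())       # whatever run_cmd captured from the wrapper
```

Note that `geno_filestr` and `pheno_filestr` are spliced into a shell command unescaped, so callers should be restricted to safe identifiers.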
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 4fdf8cf..857ceb0 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -9,12 +9,17 @@ from typing import Callable
 
 import scipy.stats
 
 
-def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict)-> List:
-    """Function to construct target dataset data items given commoned shared\
-    keys and trait samplelist values for example given keys >>>>>>>>>>\
-    ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"] and value object as\
-    "HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],TXD_AT": [6.2, 5.7, 3.6, 1.5, 4.2, 2.3]}\
-    return results should be a list of dicts mapping the shared keys to the trait values"""
+def map_shared_keys_to_values(target_sample_keys: List,
+                              target_sample_vals: dict) -> List:
+    """Function to construct target dataset data items given common shared keys
+    and trait sample-list values, for example given keys
+
+    >>>>>>>>>> ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"] and value
+    object as "HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],TXD_AT": [6.2, 5.7,
+    3.6, 1.5, 4.2, 2.3]} the result should be a list of dicts mapping the
+    shared keys to the trait values
+
+    """
     target_dataset_data = []
 
     for trait_id, sample_values in target_sample_vals.items():
@@ -32,9 +37,9 @@ def map_shared_keys_to_values(target_sample_keys: List, target_sample_vals: dict
 
 def normalize_values(a_values: List,
                      b_values: List) -> Tuple[List[float], List[float], int]:
-    """Trim two lists of values to contain only the values they both share
-    Given two lists of sample values, trim each list so that it contains only
-    the samples that contain a value in both lists. Also returns the number of
+    """Trim two lists of values to contain only the values they both share. Given
+    two lists of sample values, trim each list so that it contains only the
+    samples that contain a value in both lists. Also returns the number of
     such samples.
 
     >>> normalize_values([2.3, None, None, 3.2, 4.1, 5],
@@ -62,16 +67,14 @@ pearson,spearman and biweight mid correlation return value is rho and p_value
         "pearson": scipy.stats.pearsonr,
         "spearman": scipy.stats.spearmanr
     }
-
     use_corr_method = corr_mapping.get(corr_method, "spearman")
-
     corr_coeffient, p_val = use_corr_method(primary_values, target_values)
-
     return (corr_coeffient, p_val)
 
 
 def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
-                                 target_samples_vals) -> Optional[Tuple[str, float, float, int]]:
+                                 target_samples_vals) -> Optional[
+                                     Tuple[str, float, float, int]]:
     """Given a primary trait values and target trait values calculate the
     correlation coeff and p value
 
@@ -90,7 +93,6 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
     # should use numpy.isNan scipy.isNan is deprecated
     if corr_coeffient is not None:
         return (trait_name, corr_coeffient, p_value, num_overlap)
-
     return None
 
 
@@ -99,15 +101,16 @@ def do_bicor(x_val, y_val) -> Tuple[float, float]:
     package :not packaged in guix
 
     """
-    _corr_input = (x_val, y_val)
-    return (0.0, 0.0)
+    x_val, y_val = 0, 0
+    return (x_val, y_val)
 
 
 def filter_shared_sample_keys(this_samplelist,
                               target_samplelist) -> Tuple[List, List]:
-    """Given primary and target samplelist\
-    for two base and target trait select\
-    filter the values using the shared keys"""
+    """Given primary and target sample-lists for two base and target traits,
+    filter the values using the shared keys
+
+    """
     this_vals = []
     target_vals = []
     for key, value in target_samplelist.items():
@@ -120,21 +123,18 @@ def filter_shared_sample_keys(this_samplelist,
 
 def compute_all_sample_correlation(this_trait,
                                    target_dataset,
                                    corr_method="pearson") -> List:
-    """Given a trait data samplelist and\
-    target__datasets compute all sample correlation
+    """Given a trait data sample-list and target datasets, compute all sample
+    correlations
+
     """
     # xtodo fix trait_name currently returning single one
    # pylint: disable-msg=too-many-locals
-
     this_trait_samples = this_trait["trait_sample_data"]
     corr_results = []
     processed_values = []
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
-        # this_vals, target_vals = filter_shared_sample_keys(
-        #     this_trait_samples, target_trait_data)
-
         processed_values.append((trait_name, corr_method, *filter_shared_sample_keys(
             this_trait_samples, target_trait_data)))
     with multiprocessing.Pool(4) as pool:
@@ -144,7 +144,6 @@ def compute_all_sample_correlation(this_trait,
         if sample_correlation is not None:
             (trait_name, corr_coeffient,
              p_value, num_overlap) = sample_correlation
-
             corr_result = {
                 "corr_coeffient": corr_coeffient,
                 "p_value": p_value,
@@ -152,7 +151,6 @@ def compute_all_sample_correlation(this_trait,
             }
 
             corr_results.append({trait_name: corr_result})
-
     return sorted(
         corr_results,
         key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coeffient"]))
@@ -160,45 +158,36 @@ def compute_all_sample_correlation(this_trait,
 
 def benchmark_compute_all_sample(this_trait,
                                  target_dataset,
-                                 corr_method="pearson") ->List:
-    """Temp function to benchmark with compute_all_sample_r\
-    alternative to compute_all_sample_r where we use \
-    multiprocessing
-    """
+                                 corr_method="pearson") -> List:
+    """Temp function to benchmark against compute_all_sample_r, an
+    alternative to compute_all_sample_r where we use multiprocessing
+
+    """
     this_trait_samples = this_trait["trait_sample_data"]
-
     corr_results = []
-
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
         this_vals, target_vals = filter_shared_sample_keys(
             this_trait_samples, target_trait_data)
-
         sample_correlation = compute_sample_r_correlation(
             trait_name=trait_name,
             corr_method=corr_method,
             trait_vals=this_vals,
             target_samples_vals=target_vals)
-
         if sample_correlation is not None:
-            (trait_name, corr_coeffient, p_value, num_overlap) = sample_correlation
-
+            (trait_name, corr_coeffient,
+             p_value, num_overlap) = sample_correlation
         else:
             continue
-
         corr_result = {
             "corr_coeffient": corr_coeffient,
             "p_value": p_value,
             "num_overlap": num_overlap
         }
-
         corr_results.append({trait_name: corr_result})
-
     return corr_results
-
-
+
 
 def tissue_correlation_for_trait(
         primary_tissue_vals: List,
         target_tissues_values: List,
@@ -232,8 +221,10 @@ def fetch_lit_correlation_data(
         input_mouse_gene_id: Optional[str],
         gene_id: str,
         mouse_gene_id: Optional[str] = None) -> Tuple[str, float]:
-    """Given input trait mouse gene id and mouse gene id fetch the lit\
-    corr_data"""
+    """Given input trait mouse gene id and mouse gene id fetch the lit
+    corr_data
+
+    """
     if mouse_gene_id is not None and ";" not in mouse_gene_id:
         query = """
         SELECT VALUE
@@ -260,7 +251,6 @@ def fetch_lit_correlation_data(
         lit_results = (gene_id, lit_corr_results[1])\
             if lit_corr_results else (gene_id, 0)
         return lit_results
-
     return (gene_id, 0)
 
 
@@ -272,11 +262,9 @@ def lit_correlation_for_trait(
     """given species,base trait gene id fetch the lit corr results from the db\
     output is float for lit corr results """
     fetched_lit_corr_results = []
-
     this_trait_mouse_gene_id = map_to_mouse_gene_id(conn=conn,
                                                     species=species,
                                                     gene_id=trait_gene_id)
-
     for (trait_name, target_trait_gene_id) in target_trait_lists:
         corr_results = {}
         if target_trait_gene_id:
@@ -284,29 +272,26 @@ def lit_correlation_for_trait(
                 conn=conn,
                 species=species,
                 gene_id=target_trait_gene_id)
-
             fetched_corr_data = fetch_lit_correlation_data(
                 conn=conn,
                 input_mouse_gene_id=this_trait_mouse_gene_id,
                 gene_id=target_trait_gene_id,
                 mouse_gene_id=target_mouse_gene_id)
-
             dict_results = dict(zip(("gene_id", "lit_corr"),
                                     fetched_corr_data))
             corr_results[trait_name] = dict_results
             fetched_lit_corr_results.append(corr_results)
-
     return fetched_lit_corr_results
 
 
 def query_formatter(query_string: str, *query_values):
-    """Formatter query string given the unformatted query string\
-    and the respectibe values.Assumes number of placeholders is
-    equal to the number of query values """
-    # xtodo escape sql queries
-    results = query_string % (query_values)
+    """Format a query string given the unformatted query string and the
+    respective values. Assumes the number of placeholders is equal to the
+    number of query values
 
-    return results
+    """
+    # xtodo escape sql queries
+    return query_string % (query_values)
 
 
 def map_to_mouse_gene_id(conn, species: Optional[str],
@@ -319,19 +304,15 @@ def map_to_mouse_gene_id(conn, species: Optional[str],
         return None
     if species == "mouse":
         return gene_id
-
     cursor = conn.cursor()
     query = """SELECT mouse
                FROM GeneIDXRef
                WHERE '%s' = '%s'"""
-
     query_values = (species, gene_id)
     cursor.execute(query_formatter(query,
                                    *query_values))
     results = cursor.fetchone()
-
     mouse_gene_id = results.mouse if results is not None else None
-
     return mouse_gene_id
 
 
@@ -358,21 +339,15 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict,
     """Function acts as an abstraction for tissue_correlation_for_trait\
     required input are target tissue object and primary tissue trait\
     target tissues data contains the trait_symbol_dict and symbol_tissue_vals
-
     """
-
     tissues_results = []
-
     primary_tissue_vals = primary_tissue_dict["tissue_values"]
     traits_symbol_dict = target_tissues_data["trait_symbol_dict"]
     symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"]
-
     target_tissues_list = process_trait_symbol_dict(
         traits_symbol_dict, symbol_tissue_vals_dict)
-
     for target_tissue_obj in target_tissues_list:
         trait_id = target_tissue_obj.get("trait_id")
-
         target_tissue_vals = target_tissue_obj.get("tissue_values")
 
         tissue_result = tissue_correlation_for_trait(
@@ -380,22 +355,18 @@ def compute_all_tissue_correlation(primary_tissue_dict: dict,
             target_tissues_values=target_tissue_vals,
             trait_id=trait_id,
             corr_method=corr_method)
-
         tissue_result_dict = {trait_id: tissue_result}
         tissues_results.append(tissue_result_dict)
-
-    sorted_tissues_results = sorted(
+    return sorted(
         tissues_results,
         key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"]))
-    return sorted_tissues_results
-
 
 def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> List:
-    """Method for processing trait symbol\
-    dict given the symbol tissue values """
-    traits_tissue_vals = []
+    """Method for processing the trait symbol dict given the symbol tissue values
+
+    """
+    traits_tissue_vals = []
     for (trait, symbol) in trait_symbol_dict.items():
         if symbol is not None:
             target_symbol = symbol.lower()
@@ -404,25 +375,21 @@ def process_trait_symbol_dict(trait_symbol_dict, symbol_tissue_vals_dict) -> Lis
                 target_tissue_dict = {"trait_id": trait,
                                       "symbol": target_symbol,
                                       "tissue_values": trait_tissue_val}
-
                 traits_tissue_vals.append(target_tissue_dict)
-
     return traits_tissue_vals
 
 
 def compute_tissue_correlation(primary_tissue_dict: dict,
                                target_tissues_data: dict,
                                corr_method: str):
-    """Experimental function that uses multiprocessing\
-    for computing tissue correlation
-    """
+    """Experimental function that uses multiprocessing for computing tissue
+    correlation
+
+    """
     tissues_results = []
-
     primary_tissue_vals = primary_tissue_dict["tissue_values"]
     traits_symbol_dict = target_tissues_data["trait_symbol_dict"]
     symbol_tissue_vals_dict = target_tissues_data["symbol_tissue_vals_dict"]
-
     target_tissues_list = process_trait_symbol_dict(
         traits_symbol_dict, symbol_tissue_vals_dict)
     processed_values = []
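To make the data shapes in the docstrings above concrete, here is a hedged usage sketch of the two shared-key helpers. The sample data is copied from the docstrings; the exact shape of the returned dicts follows the docstrings' descriptions rather than verified output:

```python
from gn3.computations.correlations import (
    filter_shared_sample_keys, map_shared_keys_to_values)

keys = ["BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"]
vals = {"HCMA:_AT": [4.1, 5.6, 3.2, 1.1, 4.4, 2.2],
        "TXD_AT": [6.2, 5.7, 3.6, 1.5, 4.2, 2.3]}

# One entry per trait, mapping the shared sample keys to that
# trait's values, per the docstring.
target_data = map_shared_keys_to_values(keys, vals)

# Keep only the samples present in both dicts, returning two
# aligned value lists ready for the correlation functions.
this_vals, target_vals = filter_shared_sample_keys(
    {"BXD1": 4.1, "BXD2": 5.6, "BXD5": 3.2},
    {"BXD1": 6.2, "BXD5": 3.6, "BXD6": 1.5})
```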
diff --git a/gn3/computations/gemma.py b/gn3/computations/gemma.py
index 5f9d5a3..0b22d3c 100644
--- a/gn3/computations/gemma.py
+++ b/gn3/computations/gemma.py
@@ -7,7 +7,7 @@ from typing import Dict
 from typing import List
 from typing import ValuesView
 
 from gn3.commands import compose_gemma_cmd
-from gn3.file_utils import get_hash_of_files
+from gn3.fs_helpers import get_hash_of_files
 
 
 def generate_hash_of_string(unhashed_str: str) -> str:
diff --git a/gn3/file_utils.py b/gn3/fs_helpers.py
index 73f6567..73f6567 100644
--- a/gn3/file_utils.py
+++ b/gn3/fs_helpers.py
diff --git a/gn3/settings.py b/gn3/settings.py
index 7b3ffb7..2057ce1 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -10,6 +10,7 @@ CACHEDIR = ""
 REDIS_URI = "redis://localhost:6379/0"
 REDIS_JOB_QUEUE = "GN3::job-queue"
 TMPDIR = os.environ.get("TMPDIR", tempfile.gettempdir())
+RQTL_WRAPPER = "rqtl_wrapper.R"
 
 # SQL confs
 SQL_URI = os.environ.get("SQL_URI",
                          "mysql://webqtlout:webqtlout@localhost/db_webqtl")
diff --git a/tests/integration/test_general.py b/tests/integration/test_general.py
index 99c4824..8fc2b43 100644
--- a/tests/integration/test_general.py
+++ b/tests/integration/test_general.py
@@ -46,3 +46,14 @@ class GeneralAPITest(unittest.TestCase):
         self.assertEqual(response.get_json(),
                          {"status": 128,
                           "error": "gzip failed to unpack file"})
+
+    @mock.patch("gn3.api.general.run_cmd")
+    def test_run_r_qtl(self, mock_run_cmd):
+        """Test that the qtl run endpoint returns the output of run_cmd"""
+        mock_run_cmd.return_value = "Random results from STDOUT"
+        response = self.app.post("/api/qtl/run/"
+                                 "geno_file_test/"
+                                 "pheno_file_test")
+        self.assertEqual(response.status_code, 201)
+        self.assertEqual(response.get_json(),
+                         "Random results from STDOUT")
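Since `RQTL_WRAPPER` defaults to a bare file name, a real deployment (unlike the mocked test above) has to point it at an actual script. A hedged sketch of one way to do that; `create_app` is assumed to be the project's application factory, and the path is a placeholder:

```python
from gn3.app import create_app  # assumed application factory

app = create_app()
# Placeholder absolute path; Rscript must be able to resolve it.
app.config["RQTL_WRAPPER"] = "/opt/genenetwork3/scripts/rqtl_wrapper.R"
```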
diff --git a/tests/unit/test_file_utils.py b/tests/unit/test_file_utils.py
index cc842d5..75be4f6 100644
--- a/tests/unit/test_file_utils.py
+++ b/tests/unit/test_file_utils.py
@@ -1,14 +1,14 @@
-"""Test cases for procedures defined in file_utils.py"""
+"""Test cases for procedures defined in fs_helpers.py"""
 import os
 import unittest
 
 from dataclasses import dataclass
 from typing import Callable
 from unittest import mock
 
-from gn3.file_utils import extract_uploaded_file
-from gn3.file_utils import get_dir_hash
-from gn3.file_utils import jsonfile_to_dict
-from gn3.file_utils import cache_ipfs_file
+from gn3.fs_helpers import extract_uploaded_file
+from gn3.fs_helpers import get_dir_hash
+from gn3.fs_helpers import jsonfile_to_dict
+from gn3.fs_helpers import cache_ipfs_file
 
 
 @dataclass
@@ -19,7 +19,7 @@ class MockFile:
 
 
 class TestFileUtils(unittest.TestCase):
-    """Test cases for procedures defined in file_utils.py"""
+    """Test cases for procedures defined in fs_helpers.py"""
 
     def test_get_dir_hash(self):
         """Test that a directory is hashed correctly"""
@@ -45,8 +45,8 @@ non-existent"""
         self.assertRaises(FileNotFoundError,
                           jsonfile_to_dict,
                           "/non-existent-dir")
 
-    @mock.patch("gn3.file_utils.tarfile")
-    @mock.patch("gn3.file_utils.secure_filename")
+    @mock.patch("gn3.fs_helpers.tarfile")
+    @mock.patch("gn3.fs_helpers.secure_filename")
     def test_extract_uploaded_file(self, mock_file, mock_tarfile):
         """Test that the gzip file is extracted to the right location"""
         mock_file.return_value = "upload-data.tar.gz"
@@ -65,7 +65,7 @@ non-existent"""
         mock_file.assert_called_once_with("upload-data.tar.gz")
         self.assertEqual(result, {"status": 0,
                                   "token": "abcdef-abcdef"})
 
-    @mock.patch("gn3.file_utils.secure_filename")
+    @mock.patch("gn3.fs_helpers.secure_filename")
     def test_extract_uploaded_file_non_existent_gzip(self, mock_file):
         """Test that the right error message is returned when there is a problem
 extracting the file"""
@@ -96,7 +96,7 @@ extracting the file"""
             os.rmdir(test_dir)
         self.assertEqual(file_loc, f"{test_dir}/genotype.txt")
 
-    @mock.patch("gn3.file_utils.ipfshttpclient")
+    @mock.patch("gn3.fs_helpers.ipfshttpclient")
     def test_cache_ipfs_file_cache_miss(self, mock_ipfs):
         """Test that a file is cached if there's a cache miss"""
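One detail worth calling out in these test changes: `mock.patch` targets the module where a name is looked up, so the rename forces every patch path to move from `gn3.file_utils` to `gn3.fs_helpers`. A minimal sketch of the pattern:

```python
from unittest import mock

# Patch at the new lookup site; patching "gn3.file_utils.ipfshttpclient"
# would now fail with ModuleNotFoundError since the module was renamed.
with mock.patch("gn3.fs_helpers.ipfshttpclient") as mock_ipfs:
    mock_ipfs.connect.return_value = mock.MagicMock()
    # ... call cache_ipfs_file(...) under test here ...
```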