From bf1620406c3700d7e211a02fe13c3d8df9a9532d Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 10 Nov 2021 10:39:53 +0300
Subject: rename: loading correlation results to computing correlations
---
wqflask/wqflask/templates/loading.html | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/templates/loading.html b/wqflask/wqflask/templates/loading.html
index ccf810b0..b9e31ad0 100644
--- a/wqflask/wqflask/templates/loading.html
+++ b/wqflask/wqflask/templates/loading.html
@@ -66,11 +66,11 @@
{% endif %}
{% endif %}
{% else %}
-
Loading {{ start_vars.tool_used }} Results...
+ {{ start_vars.tool_used }} Computation in progress ...
{% endif %}
-
+
{% if start_vars.vals_diff|length != 0 and start_vars.transform == "" %}
--
cgit v1.2.3
From b5b44f401e0d05089534d7f8e6631d9a092fd0d7 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 10 Nov 2021 10:40:11 +0300
Subject: add compute gif
---
wqflask/wqflask/static/gif/waitAnima2.gif | Bin 0 -> 54013 bytes
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 wqflask/wqflask/static/gif/waitAnima2.gif
(limited to 'wqflask')
diff --git a/wqflask/wqflask/static/gif/waitAnima2.gif b/wqflask/wqflask/static/gif/waitAnima2.gif
new file mode 100644
index 00000000..50aff7f2
Binary files /dev/null and b/wqflask/wqflask/static/gif/waitAnima2.gif differ
--
cgit v1.2.3
From f3ff381a90733d6c64349ed1dd116df83b5565d6 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 02:51:08 +0300
Subject: add fast compute from gn3
---
wqflask/wqflask/correlation/correlation_gn3_api.py | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py
index 1e3a40f2..7b828016 100644
--- a/wqflask/wqflask/correlation/correlation_gn3_api.py
+++ b/wqflask/wqflask/correlation/correlation_gn3_api.py
@@ -1,5 +1,7 @@
"""module that calls the gn3 api's to do the correlation """
import json
+import time
+from functools import wraps
from wqflask.correlation import correlation_functions
@@ -9,6 +11,7 @@ from base.trait import create_trait
from base.trait import retrieve_sample_data
from gn3.computations.correlations import compute_all_sample_correlation
+from gn3.computations.correlations import fast_compute_all_sample_correlation
from gn3.computations.correlations import map_shared_keys_to_values
from gn3.computations.correlations import compute_all_lit_correlation
from gn3.computations.correlations import compute_tissue_correlation
@@ -19,9 +22,11 @@ def create_target_this_trait(start_vars):
"""this function creates the required trait and target dataset for correlation"""
if start_vars['dataset'] == "Temp":
- this_dataset = data_set.create_dataset(dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group'])
+ this_dataset = data_set.create_dataset(
+ dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group'])
else:
- this_dataset = data_set.create_dataset(dataset_name=start_vars['dataset'])
+ this_dataset = data_set.create_dataset(
+ dataset_name=start_vars['dataset'])
target_dataset = data_set.create_dataset(
dataset_name=start_vars['corr_dataset'])
this_trait = create_trait(dataset=this_dataset,
@@ -187,10 +192,10 @@ def compute_correlation(start_vars, method="pearson", compute_all=False):
if corr_type == "sample":
(this_trait_data, target_dataset_data) = fetch_sample_data(
start_vars, this_trait, this_dataset, target_dataset)
- correlation_results = compute_all_sample_correlation(corr_method=method,
- this_trait=this_trait_data,
- target_dataset=target_dataset_data)
+ correlation_results = fast_compute_all_sample_correlation(corr_method=method,
+ this_trait=this_trait_data,
+ target_dataset=target_dataset_data)
elif corr_type == "tissue":
trait_symbol_dict = this_dataset.retrieve_genes("Symbol")
tissue_input = get_tissue_correlation_input(
--
cgit v1.2.3
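
Note: the patch above swaps the per-trait call for gn3's fast_compute_all_sample_correlation while keeping the same keyword arguments. A minimal dispatch sketch under that assumption; the USE_FAST_COMPUTE flag is hypothetical and not part of the patch:

    from gn3.computations.correlations import (
        compute_all_sample_correlation,
        fast_compute_all_sample_correlation,
    )

    USE_FAST_COMPUTE = True  # hypothetical toggle between the two implementations

    def run_sample_correlation(this_trait_data, target_dataset_data, method="pearson"):
        """Dispatch to the fast or the plain sample-correlation implementation."""
        compute = (fast_compute_all_sample_correlation
                   if USE_FAST_COMPUTE else compute_all_sample_correlation)
        return compute(corr_method=method,
                       this_trait=this_trait_data,
                       target_dataset=target_dataset_data)
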
From bbc75dcef80c3df600ab01c1804a27cdfdce1b80 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 02:51:44 +0300
Subject: init test for precomputing sample correlation
---
wqflask/wqflask/correlation/pre_computes.py | 72 +++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)
create mode 100644 wqflask/wqflask/correlation/pre_computes.py
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
new file mode 100644
index 00000000..1db9f61b
--- /dev/null
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -0,0 +1,72 @@
+"""module contains the code to do the
+precomputations of sample data between
+two entire datasets"""
+
+import json
+from typing import List
+from base import data_set
+
+from gn3.computations.correlations import fast_compute_all_sample_correlation
+from gn3.computations.correlations import map_shared_keys_to_values
+
+def get_dataset_dict_data(dataset_obj):
+ """function to get the dataset data mapped to key"""
+ dataset_obj.get_trait_data()
+ return map_shared_keys_to_values(dataset_obj.samplelist,
+ dataset_obj.trait_data)
+
+
+def fetch_datasets(base_dataset_name: str, target_dataset_name: str) ->List:
+ """query to fetch create datasets and fetch traits
+ all traits of a dataset"""
+
+ # doesnt work for temp
+
+ base_dataset = data_set.create_dataset(dataset_name=base_dataset_name)
+
+ target_dataset = data_set.create_dataset(dataset_name=target_dataset_name)
+ # replace with map
+
+ return (map(get_dataset_dict_data,
+ [base_dataset, target_dataset]))
+
+
+# in the base dataset we just need the traits
+def pre_compute_sample_correlation(base_dataset: List,
+ target_dataset: List) -> List:
+ """function compute the correlation between the
+ a whole dataset against a target
+ input: target&base_dataset(contains traits and sample results)
+ output: list containing the computed results
+
+ precaution:function is expensive;targets only Exon and
+ """
+
+ for trait_info in base_dataset:
+
+ yield fast_compute_all_sample_correlation(corr_method="pearson",
+ this_trait=trait_info,
+ target_dataset=target_dataset)
+
+
+def cache_to_file(base_dataset_name: str, target_dataset_name: str):
+ """function to cache the results to file"""
+
+ # validate the datasets expiry first
+
+ base_dataset_data, target_dataset_data = [list(dataset) for dataset in list(
+ fetch_datasets(base_dataset_name, target_dataset_name))]
+
+
+ try:
+ with open("unique_file_name.json", "w") as file_handler:
+ file_handler.write()
+
+ dataset_correlation_results = list(pre_compute_sample_correlation(
+ base_dataset_data, target_dataset_data))
+
+ print(dataset_correlation_results)
+
+ json.dump(dataset_correlation_results, file_handler)
+ except Exception as error:
+ raise error
--
cgit v1.2.3
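
Note: the first draft of cache_to_file above calls file_handler.write() with no argument before dumping the results. A corrected sketch of the intended flow, assuming fetch_datasets and pre_compute_sample_correlation behave as defined in this patch (the file name is still a placeholder):

    import json

    def cache_to_file(base_dataset_name: str, target_dataset_name: str):
        """Precompute dataset-vs-dataset sample correlations and cache them as JSON."""
        base_data, target_data = [list(dataset) for dataset in
                                  fetch_datasets(base_dataset_name, target_dataset_name)]

        results = list(pre_compute_sample_correlation(base_data, target_data))

        # "unique_file_name.json" is a placeholder; later commits derive the
        # name from the dataset names and their table timestamps.
        with open("unique_file_name.json", "w") as file_handler:
            json.dump(results, file_handler)
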
From 6ced33f201e8a4e389a077a91ba9ed8bf5c19fa0 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 15:56:31 +0300
Subject: fix issue with number of samples
---
wqflask/wqflask/correlation/pre_computes.py | 37 +++++++++++++++++++++--------
1 file changed, 27 insertions(+), 10 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 1db9f61b..f1c9e1bd 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -6,12 +6,14 @@ import json
from typing import List
from base import data_set
+from gn3.computations.correlations import compute_all_sample_correlation
from gn3.computations.correlations import fast_compute_all_sample_correlation
from gn3.computations.correlations import map_shared_keys_to_values
+
def get_dataset_dict_data(dataset_obj):
"""function to get the dataset data mapped to key"""
- dataset_obj.get_trait_data()
+ dataset_obj.get_trait_data(dataset_obj.group.all_samples_ordered())
return map_shared_keys_to_values(dataset_obj.samplelist,
dataset_obj.trait_data)
@@ -42,11 +44,21 @@ def pre_compute_sample_correlation(base_dataset: List,
precaution:function is expensive;targets only Exon and
"""
+ results = []
+
for trait_info in base_dataset:
- yield fast_compute_all_sample_correlation(corr_method="pearson",
- this_trait=trait_info,
- target_dataset=target_dataset)
+ result = fast_compute_all_sample_correlation(corr_method="pearson",
+ this_trait=trait_info,
+ target_dataset=target_dataset)
+
+ # results.append(fast_compute_all_sample_correlation(corr_method="pearson",
+ # this_trait=trait_info,
+ # target_dataset=target_dataset))
+ print("finished")
+ print(result)
+
+ return results
def cache_to_file(base_dataset_name: str, target_dataset_name: str):
@@ -57,16 +69,21 @@ def cache_to_file(base_dataset_name: str, target_dataset_name: str):
base_dataset_data, target_dataset_data = [list(dataset) for dataset in list(
fetch_datasets(base_dataset_name, target_dataset_name))]
+ # print(target_dataset_data)
try:
- with open("unique_file_name.json", "w") as file_handler:
- file_handler.write()
-
- dataset_correlation_results = list(pre_compute_sample_correlation(
- base_dataset_data, target_dataset_data))
+ # with open("unique_file_name.json", "w") as file_handler:
+ # file_handler.write()
+ dataset_correlation_results = pre_compute_sample_correlation(
+ base_dataset_data, target_dataset_data)
print(dataset_correlation_results)
- json.dump(dataset_correlation_results, file_handler)
+ # json.dump(dataset_correlation_results, file_handler)
except Exception as error:
raise error
+
+
+def check_cached_files_validity():
+ """function to check the validity of cached files"""
+ pass
--
cgit v1.2.3
From 5935e2cc3e0ac3a8004ccd5224557d34b62359d8 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 20:28:06 +0300
Subject: code to cache frequently run probeset correlation
---
wqflask/wqflask/correlation/pre_computes.py | 124 ++++++++++++++--------------
1 file changed, 63 insertions(+), 61 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index f1c9e1bd..d0caca60 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -1,89 +1,91 @@
-"""module contains the code to do the
-precomputations of sample data between
-two entire datasets"""
-import json
-from typing import List
-from base import data_set
+import os
+import hashlib
-from gn3.computations.correlations import compute_all_sample_correlation
-from gn3.computations.correlations import fast_compute_all_sample_correlation
-from gn3.computations.correlations import map_shared_keys_to_values
+from base.data_set import query_table_timestamp
+from base.webqtlConfig import TMPDIR
-def get_dataset_dict_data(dataset_obj):
- """function to get the dataset data mapped to key"""
- dataset_obj.get_trait_data(dataset_obj.group.all_samples_ordered())
- return map_shared_keys_to_values(dataset_obj.samplelist,
- dataset_obj.trait_data)
+def generate_filename(**kwargs):
+ """generate unique filename"""
+ base_dataset_name = kwargs["base_dataset"]
+ target_dataset_name = kwargs["target_dataset"]
+ base_timestamp = kwargs["base_timestamp"]
+ target_dataset_timestamp = kwargs["target_timestamp"]
-def fetch_datasets(base_dataset_name: str, target_dataset_name: str) ->List:
- """query to fetch create datasets and fetch traits
- all traits of a dataset"""
+ string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode()
+ hashlib.md5(string_unicode).hexdigest()
- # doesnt work for temp
- base_dataset = data_set.create_dataset(dataset_name=base_dataset_name)
+def cache_compute_results(start_vars,
+ base_dataset_type,
+ correlation_results,
+ trait_name):
+ # pass
- target_dataset = data_set.create_dataset(dataset_name=target_dataset_name)
- # replace with map
+ # init assumption only caching probeset type
+ # fix redis;issue potential redis_cache!=current_timestamp
+ base_timestamp = r.get(f"{base_dataset_type}timestamp")
- return (map(get_dataset_dict_data,
- [base_dataset, target_dataset]))
+ if base_timestamp is None:
+ # fetch the timestamp
+ base_timestamp = target_dataset_timestamp = query_table_timestamp(
+ dataset_type)
+ r.set(f"{dataset_type}timestamp", target_dataset_timestamp)
-# in the base dataset we just need the traits
-def pre_compute_sample_correlation(base_dataset: List,
- target_dataset: List) -> List:
- """function compute the correlation between the
- a whole dataset against a target
- input: target&base_dataset(contains traits and sample results)
- output: list containing the computed results
+ file_name = generate_filename(
+ base_dataset_name, target_dataset_name,
+ base_timestamp, target_dataset_timestamp)
- precaution:function is expensive;targets only Exon and
- """
+ file_path = os.path.join(TMPDIR, f"{file_name}.json")
- results = []
+ try:
- for trait_info in base_dataset:
+ with open(file_path, "r+") as json_handler:
- result = fast_compute_all_sample_correlation(corr_method="pearson",
- this_trait=trait_info,
- target_dataset=target_dataset)
+ results = json.load(json_handler)
- # results.append(fast_compute_all_sample_correlation(corr_method="pearson",
- # this_trait=trait_info,
- # target_dataset=target_dataset))
- print("finished")
- print(result)
+ if results.get(trait_name) is not None:
+ results.update({trait_name: correlation_results})
- return results
+ json.dump(results, json_handler)
+ except FileNotFoundError:
+ with open(file_path, "w") as json_handler:
+ json.dump({trait_name: correlation_results}, json_handler)
-def cache_to_file(base_dataset_name: str, target_dataset_name: str):
- """function to cache the results to file"""
+def fetch_precompute_results(base_dataset_name,target_dataset_name,trait_name):
+ """function to check for precomputed results"""
- # validate the datasets expiry first
+ # check for redis timestamp
+
+ # fix rely on the fact correlation run oftenly probeset is set
+
+ base_timestamp = target_dataset_timestamp = r.get(dataset_type)
+
+
+ if base_timestamp is None:
+ return
+
+ else:
+ file_name = generate_filename(
+ base_dataset_name, target_dataset_name,
+ base_timestamp, target_dataset_timestamp)
+
+ try:
+ with open(file_path,"r") as json_handler:
+ correlation_results = json.load(json_handler)
+
+ return correlation_results.get(trait_name)
+
+ except FileNotFoundError:
+ pass
- base_dataset_data, target_dataset_data = [list(dataset) for dataset in list(
- fetch_datasets(base_dataset_name, target_dataset_name))]
- # print(target_dataset_data)
- try:
- # with open("unique_file_name.json", "w") as file_handler:
- # file_handler.write()
- dataset_correlation_results = pre_compute_sample_correlation(
- base_dataset_data, target_dataset_data)
- print(dataset_correlation_results)
- # json.dump(dataset_correlation_results, file_handler)
- except Exception as error:
- raise error
-def check_cached_files_validity():
- """function to check the validity of cached files"""
- pass
--
cgit v1.2.3
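
Note: the cache key introduced here is an MD5 digest over the two dataset names and their table timestamps, so the cache invalidates itself whenever either table is updated. A stdlib-only sketch of the naming scheme (as committed, the function forgets to return the digest; the next patch adds the return):

    import hashlib

    def generate_filename(base_dataset_name, target_dataset_name,
                          base_timestamp, target_dataset_timestamp):
        """Derive a stable cache-file stem from dataset names and timestamps."""
        key = (f"{base_dataset_name}{target_dataset_name}"
               f"{base_timestamp}{target_dataset_timestamp}"
               "sample_corr_compute").encode()
        return hashlib.md5(key).hexdigest()

    # Placeholder names for illustration only:
    # generate_filename("BaseSet", "TargetSet",
    #                   "2021-11-11 20:00:00", "2021-11-11 20:00:00")
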
From ca4a5fdda8a7225dc5bebc17c61837ba5373ec68 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 20:29:04 +0300
Subject: minor fix for generating file_name
---
wqflask/wqflask/correlation/pre_computes.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index d0caca60..55f25f0b 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -15,7 +15,7 @@ def generate_filename(**kwargs):
target_dataset_timestamp = kwargs["target_timestamp"]
string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode()
- hashlib.md5(string_unicode).hexdigest()
+ return hashlib.md5(string_unicode).hexdigest()
def cache_compute_results(start_vars,
--
cgit v1.2.3
From b4594a6f2dc5c0c0a8e62a327674126668391d6b Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 20:32:13 +0300
Subject: minor fix for updating dict
---
wqflask/wqflask/correlation/pre_computes.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 55f25f0b..4244fcfb 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -47,7 +47,7 @@ def cache_compute_results(start_vars,
results = json.load(json_handler)
- if results.get(trait_name) is not None:
+ if results.get(trait_name) is None:
results.update({trait_name: correlation_results})
json.dump(results, json_handler)
--
cgit v1.2.3
From a20e20c79b054350b84e70af6e7d5ef2a0407786 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 22:18:21 +0300
Subject: pep8 formatting + minor fix for writing to files
---
wqflask/wqflask/correlation/pre_computes.py | 75 +++++++++++++----------------
1 file changed, 34 insertions(+), 41 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 4244fcfb..1d832fde 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -1,28 +1,28 @@
-
+import json
import os
import hashlib
from base.data_set import query_table_timestamp
from base.webqtlConfig import TMPDIR
+from redis import Redis
+r = Redis()
-def generate_filename(**kwargs):
- """generate unique filename"""
- base_dataset_name = kwargs["base_dataset"]
- target_dataset_name = kwargs["target_dataset"]
- base_timestamp = kwargs["base_timestamp"]
- target_dataset_timestamp = kwargs["target_timestamp"]
+def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
+ """generate unique filename"""
string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode()
return hashlib.md5(string_unicode).hexdigest()
-def cache_compute_results(start_vars,
- base_dataset_type,
- correlation_results,
- trait_name):
+def cache_compute_results(base_dataset_type,
+ base_dataset_name,
+ target_dataset_name,
+ correlation_results,
+ trait_name):
# pass
+ """function to cache correlation results for heavy computations"""
# init assumption only caching probeset type
# fix redis;issue potential redis_cache!=current_timestamp
@@ -30,10 +30,11 @@ def cache_compute_results(start_vars,
if base_timestamp is None:
# fetch the timestamp
- base_timestamp = target_dataset_timestamp = query_table_timestamp(
- dataset_type)
+ base_timestamp = query_table_timestamp(
+ base_dataset_type)
+ r.set(f"{base_dataset_type}timestamp", base_timestamp)
- r.set(f"{dataset_type}timestamp", target_dataset_timestamp)
+ target_dataset_timestamp = base_timestamp
file_name = generate_filename(
base_dataset_name, target_dataset_name,
@@ -41,51 +42,43 @@ def cache_compute_results(start_vars,
file_path = os.path.join(TMPDIR, f"{file_name}.json")
- try:
+ try:
+
+ with open(file_path, "r+") as json_handler:
- with open(file_path, "r+") as json_handler:
+ results = json.load(json_handler)
+ results[trait_name] = correlation_results
- results = json.load(json_handler)
+ json.dump(results, json_handler)
- if results.get(trait_name) is None:
- results.update({trait_name: correlation_results})
+ except FileNotFoundError:
- json.dump(results, json_handler)
+ with open(file_path, "w+") as write_json_handler:
+ json.dump({trait_name: correlation_results}, write_json_handler)
- except FileNotFoundError:
- with open(file_path, "w") as json_handler:
- json.dump({trait_name: correlation_results}, json_handler)
-def fetch_precompute_results(base_dataset_name,target_dataset_name,trait_name):
+def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
"""function to check for precomputed results"""
# check for redis timestamp
# fix rely on the fact correlation run oftenly probeset is set
- base_timestamp = target_dataset_timestamp = r.get(dataset_type)
-
+ base_timestamp = target_dataset_timestamp = r.get(f"{dataset_type}timestamp")
if base_timestamp is None:
return
- else:
- file_name = generate_filename(
+ file_name = generate_filename(
base_dataset_name, target_dataset_name,
base_timestamp, target_dataset_timestamp)
- try:
- with open(file_path,"r") as json_handler:
- correlation_results = json.load(json_handler)
-
- return correlation_results.get(trait_name)
-
- except FileNotFoundError:
- pass
-
-
-
-
-
+ file_path = os.path.join(TMPDIR, f"{file_name}.json")
+ try:
+ with open(file_path, "r") as json_handler:
+ correlation_results = json.load(json_handler)
+ return correlation_results.get(trait_name)
+ except FileNotFoundError:
+ pass
--
cgit v1.2.3
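
Note: this revision keeps the table timestamp in Redis and only falls back to the database on a cache miss. A sketch of that memoization step, assuming a reachable Redis instance and a query_table_timestamp callable like the one imported from base.data_set:

    from redis import Redis

    r = Redis()

    def get_table_timestamp(dataset_type, query_table_timestamp):
        """Return the cached table timestamp, querying the DB only on a miss."""
        cached = r.get(f"{dataset_type}timestamp")
        if cached is not None:
            return cached.decode("utf-8")  # redis-py returns bytes
        timestamp = query_table_timestamp(dataset_type)
        r.set(f"{dataset_type}timestamp", timestamp)
        return timestamp
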
From 01d42255f52a61c6d3d007ffd1e5e02765a76730 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Mon, 15 Nov 2021 17:38:09 +0300
Subject: fix for truncating files
---
wqflask/wqflask/correlation/pre_computes.py | 53 ++++++++++++++++++++---------
1 file changed, 37 insertions(+), 16 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 1d832fde..01fa1a3d 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -4,6 +4,7 @@ import hashlib
from base.data_set import query_table_timestamp
from base.webqtlConfig import TMPDIR
+from json.decoder import JSONDecodeError
from redis import Redis
r = Redis()
@@ -26,35 +27,49 @@ def cache_compute_results(base_dataset_type,
# init assumption only caching probeset type
# fix redis;issue potential redis_cache!=current_timestamp
- base_timestamp = r.get(f"{base_dataset_type}timestamp")
- if base_timestamp is None:
- # fetch the timestamp
- base_timestamp = query_table_timestamp(
- base_dataset_type)
- r.set(f"{base_dataset_type}timestamp", base_timestamp)
+ base_timestamp = query_table_timestamp(base_dataset_type)
+
+ r.set(f"{base_dataset_type}timestamp", base_timestamp)
target_dataset_timestamp = base_timestamp
+
+
file_name = generate_filename(
base_dataset_name, target_dataset_name,
base_timestamp, target_dataset_timestamp)
- file_path = os.path.join(TMPDIR, f"{file_name}.json")
+
+ file_path = os.path.join(TMPDIR,f"{file_name}.json")
+
try:
+ with open(file_path,"r+") as json_file_handler:
+ data = json.load(json_file_handler)
- with open(file_path, "r+") as json_handler:
+ data[trait_name] = correlation_results
- results = json.load(json_handler)
- results[trait_name] = correlation_results
+ json_file_handler.seek(0)
- json.dump(results, json_handler)
+ json.dump(data,json_file_handler)
+ json_file_handler.truncate()
+
except FileNotFoundError:
+ with open(file_path,"w+") as file_handler:
+ data = {}
+ data[trait_name] =correlation_results
+
+ json.dump(data,file_handler)
- with open(file_path, "w+") as write_json_handler:
- json.dump({trait_name: correlation_results}, write_json_handler)
+
+
+ # create the file only if it does not exists
+
+ # else open the file to cache the results
+
+
def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
@@ -65,20 +80,26 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
# fix rely on the fact correlation run oftenly probeset is set
base_timestamp = target_dataset_timestamp = r.get(f"{dataset_type}timestamp")
-
if base_timestamp is None:
return
+ else:
+ base_timestamp = target_dataset_timestamp = base_timestamp.decode("utf-8")
+
file_name = generate_filename(
base_dataset_name, target_dataset_name,
base_timestamp, target_dataset_timestamp)
file_path = os.path.join(TMPDIR, f"{file_name}.json")
+ results = None
+
try:
- with open(file_path, "r") as json_handler:
+ with open(file_path, "r+") as json_handler:
correlation_results = json.load(json_handler)
- return correlation_results.get(trait_name)
+ # print(correlation_results)
+
+ return correlation_results.get(trait_name)
except FileNotFoundError:
pass
--
cgit v1.2.3
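
Note: the truncation fix above is the standard read-modify-write pattern for an in-place JSON file: load, update, seek back to the start, dump, then truncate whatever is left over from the previous, possibly longer payload. A stdlib-only sketch:

    import json

    def upsert_json_cache(file_path, trait_name, correlation_results):
        """Insert or update one trait's results in a JSON cache file."""
        try:
            with open(file_path, "r+") as handler:
                data = json.load(handler)
                data[trait_name] = correlation_results
                handler.seek(0)        # rewind before rewriting
                json.dump(data, handler)
                handler.truncate()     # drop stale bytes past the new payload
        except FileNotFoundError:
            with open(file_path, "w") as handler:
                json.dump({trait_name: correlation_results}, handler)
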
From 18b53441a0136071db94c72b112a746e056ef971 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Mon, 15 Nov 2021 18:03:55 +0300
Subject: refactor function to fetch datasets data for precomputes
---
wqflask/wqflask/correlation/pre_computes.py | 72 ++++++++++++++++++++++-------
1 file changed, 56 insertions(+), 16 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 01fa1a3d..e7147ddf 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -20,6 +20,7 @@ def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, ta
def cache_compute_results(base_dataset_type,
base_dataset_name,
target_dataset_name,
+ corr_method,
correlation_results,
trait_name):
# pass
@@ -34,43 +35,35 @@ def cache_compute_results(base_dataset_type,
target_dataset_timestamp = base_timestamp
-
-
file_name = generate_filename(
base_dataset_name, target_dataset_name,
base_timestamp, target_dataset_timestamp)
-
- file_path = os.path.join(TMPDIR,f"{file_name}.json")
-
+ file_path = os.path.join(TMPDIR, f"{file_name}.json")
try:
- with open(file_path,"r+") as json_file_handler:
+ with open(file_path, "r+") as json_file_handler:
data = json.load(json_file_handler)
data[trait_name] = correlation_results
json_file_handler.seek(0)
- json.dump(data,json_file_handler)
+ json.dump(data, json_file_handler)
json_file_handler.truncate()
-
+
except FileNotFoundError:
- with open(file_path,"w+") as file_handler:
+ with open(file_path, "w+") as file_handler:
data = {}
- data[trait_name] =correlation_results
-
- json.dump(data,file_handler)
-
+ data[trait_name] = correlation_results
+ json.dump(data, file_handler)
# create the file only if it does not exists
# else open the file to cache the results
-
-
def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
"""function to check for precomputed results"""
@@ -84,7 +77,8 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
return
else:
- base_timestamp = target_dataset_timestamp = base_timestamp.decode("utf-8")
+ base_timestamp = target_dataset_timestamp = base_timestamp.decode(
+ "utf-8")
file_name = generate_filename(
base_dataset_name, target_dataset_name,
@@ -103,3 +97,49 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
except FileNotFoundError:
pass
+
+
+def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method):
+ """compute sample correlation between dataset vs dataset
+ wn:heavy function should be invoked less frequently
+ input:datasets_data(two dicts),corr_method
+
+ output:correlation results for entire dataset against entire dataset
+ """
+ dataset_correlation_results = {}
+
+ for (trait_name, strain_values) in target_dataset.trait_data:
+
+ this_trait_data = {
+ "trait_sample_data": strain_values,
+ "trait_id": trait_name
+ }
+
+ trait_correlation_result = fast_compute_all_sample_correlation(
+ corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_dataset_data)
+
+ dataset_correlation_results[trait_name] = trait_correlation_result
+
+ return dataset_correlation_results
+
+
+def get_datasets_data(base_dataset, target_dataset_data):
+ """required to pass data in a given format to the pre compute
+ function
+
+ output:two dicts for datasets with key==trait and value==strains
+ """
+ target_traits_data = target_dataset.get_trait_data(
+ base_dataset.group.all_samples_ordered())
+
+ base_traits_data = base_dataset.get_trait_data(
+ base_dataset.group.all_samples_ordered())
+
+ samples_fetched = base_dataset.group.all_samples_ordered()
+
+ target_results = map_shared_keys_to_values(
+ samples_fetched, target_traits_data)
+ base_results = map_shared_keys_to_values(
+ samples_fetched, base_traits_data)
+
+ return (target_results, base_results)
--
cgit v1.2.3
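
Note: as committed, pre_compute_dataset_vs_dataset iterates target_dataset.trait_data and correlates against an undefined target_dataset_data; the next commit reshapes it. A sketch of the intended dataset-vs-dataset loop, assuming trait_data is the {trait_name: [sample values]} mapping built by get_trait_data, building this_trait the way the patch does, and reusing the gn3 helpers named in the patch:

    from gn3.computations.correlations import (
        fast_compute_all_sample_correlation,
        map_shared_keys_to_values,
    )

    def dataset_vs_dataset(base_dataset, target_dataset, corr_method="pearson"):
        """Correlate every base-dataset trait against the whole target dataset."""
        samples = base_dataset.group.all_samples_ordered()
        base_dataset.get_trait_data(samples)
        target_dataset.get_trait_data(samples)

        target_data = map_shared_keys_to_values(samples, target_dataset.trait_data)

        results = {}
        for trait_name, strain_values in base_dataset.trait_data.items():
            this_trait = {"trait_id": trait_name,
                          "trait_sample_data": strain_values}
            results[trait_name] = fast_compute_all_sample_correlation(
                corr_method=corr_method,
                this_trait=this_trait,
                target_dataset=target_data)
        return results
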
From 6f6e28d216e0a2adbf939b9b29f8794ae45d9aa8 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Mon, 15 Nov 2021 18:08:10 +0300
Subject: pep8 formatting & fix variable names
---
wqflask/wqflask/correlation/pre_computes.py | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index e7147ddf..d8629706 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -108,17 +108,20 @@ def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method):
"""
dataset_correlation_results = {}
- for (trait_name, strain_values) in target_dataset.trait_data:
+ target_traits_data, base_traits_data = get_datasets_data(
+ base_dataset, target_dataset_data)
+
+ for (primary_trait_name, strain_values) in base_traits_data:
this_trait_data = {
"trait_sample_data": strain_values,
- "trait_id": trait_name
+ "trait_id": primary_trait_name
}
trait_correlation_result = fast_compute_all_sample_correlation(
- corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_dataset_data)
+ corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_traits_data)
- dataset_correlation_results[trait_name] = trait_correlation_result
+ dataset_correlation_results[primary_trait_name] = trait_correlation_result
return dataset_correlation_results
--
cgit v1.2.3
From 2982ba491e89acee5ead69206691c786be1cf728 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Mon, 15 Nov 2021 18:10:56 +0300
Subject: test precompute caching integration
---
wqflask/wqflask/correlation/correlation_gn3_api.py | 23 +++++++++++++++++-----
1 file changed, 18 insertions(+), 5 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py
index 7b828016..191a748a 100644
--- a/wqflask/wqflask/correlation/correlation_gn3_api.py
+++ b/wqflask/wqflask/correlation/correlation_gn3_api.py
@@ -4,7 +4,8 @@ import time
from functools import wraps
from wqflask.correlation import correlation_functions
-
+from wqflask.correlation.pre_computes import fetch_precompute_results
+from wqflask.correlation.pre_computes import cache_compute_results
from base import data_set
from base.trait import create_trait
@@ -193,9 +194,21 @@ def compute_correlation(start_vars, method="pearson", compute_all=False):
(this_trait_data, target_dataset_data) = fetch_sample_data(
start_vars, this_trait, this_dataset, target_dataset)
- correlation_results = fast_compute_all_sample_correlation(corr_method=method,
- this_trait=this_trait_data,
- target_dataset=target_dataset_data)
+ correlation_results = fetch_precompute_results(
+ this_dataset.name, target_dataset.name, this_dataset.type, this_trait.name)
+
+ if correlation_results is None:
+ correlation_results = fast_compute_all_sample_correlation(corr_method=method,
+ this_trait=this_trait_data,
+ target_dataset=target_dataset_data)
+
+ cache_compute_results(this_dataset.type,
+ this_dataset.name,
+ target_dataset.name,
+ corr_method,
+ correlation_results,
+ this_trait.name)
+
elif corr_type == "tissue":
trait_symbol_dict = this_dataset.retrieve_genes("Symbol")
tissue_input = get_tissue_correlation_input(
@@ -295,7 +308,7 @@ def get_tissue_correlation_input(this_trait, trait_symbol_dict):
"""Gets tissue expression values for the primary trait and target tissues values"""
primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
symbol_list=[this_trait.symbol])
- if this_trait.symbol.lower() in primary_trait_tissue_vals_dict:
+ if this_trait.symbol and this_trait.symbol.lower() in primary_trait_tissue_vals_dict:
primary_trait_tissue_values = primary_trait_tissue_vals_dict[this_trait.symbol.lower(
)]
corr_result_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values(
--
cgit v1.2.3
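
Note: the integration above is a cache-aside pattern: look up a precomputed result first, and only compute, then cache, on a miss. A condensed sketch using the two pre_computes helpers exactly as they are called in the patch:

    from gn3.computations.correlations import fast_compute_all_sample_correlation
    from wqflask.correlation.pre_computes import (
        cache_compute_results,
        fetch_precompute_results,
    )

    def sample_correlation_with_cache(this_dataset, target_dataset, this_trait,
                                      this_trait_data, target_dataset_data,
                                      method="pearson"):
        """Cache-aside wrapper around the sample correlation computation."""
        results = fetch_precompute_results(
            this_dataset.name, target_dataset.name,
            this_dataset.type, this_trait.name)
        if results is None:
            results = fast_compute_all_sample_correlation(
                corr_method=method,
                this_trait=this_trait_data,
                target_dataset=target_dataset_data)
            cache_compute_results(this_dataset.type, this_dataset.name,
                                  target_dataset.name, method,
                                  results, this_trait.name)
        return results
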
From aab6393dd60872a6a3b6e7db2a7c087c4ec41295 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Mon, 15 Nov 2021 18:18:03 +0300
Subject: fetch only strains from the primary datasets
---
wqflask/wqflask/correlation/pre_computes.py | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index d8629706..355701f2 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -86,7 +86,6 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
file_path = os.path.join(TMPDIR, f"{file_name}.json")
- results = None
try:
with open(file_path, "r+") as json_handler:
@@ -130,15 +129,19 @@ def get_datasets_data(base_dataset, target_dataset_data):
"""required to pass data in a given format to the pre compute
function
+ (works for bxd only probeset datasets)
+
+ # fix issue with fetching of the datasets
+
output:two dicts for datasets with key==trait and value==strains
"""
+ samples_fetched = base_dataset.group.all_samples_ordered()
target_traits_data = target_dataset.get_trait_data(
- base_dataset.group.all_samples_ordered())
+ samples_fetched)
base_traits_data = base_dataset.get_trait_data(
- base_dataset.group.all_samples_ordered())
+ samples_fetched)
- samples_fetched = base_dataset.group.all_samples_ordered()
target_results = map_shared_keys_to_values(
samples_fetched, target_traits_data)
--
cgit v1.2.3
From 04452c274d51621a0cab1b8dce5b8101c69496b6 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 14:41:41 +0300
Subject: refactor: fix the query; modify the cache point
---
wqflask/base/data_set.py | 35 ++++++++++++++++++-----------------
1 file changed, 18 insertions(+), 17 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 70c58136..a3a720ad 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -747,7 +747,9 @@ class DataSet:
and Species.name = '{}'
""".format(create_in_clause(self.samplelist), *mescape(self.group.species))
results = dict(g.db.execute(query).fetchall())
- sample_ids = [results[item] for item in self.samplelist]
+ sample_ids = [results.get(item) for item in self.samplelist]
+
+ sample_ids = [ids for ids in sample_ids if ids is not None]
# MySQL limits the number of tables that can be used in a join to 61,
# so we break the sample ids into smaller chunks
@@ -800,25 +802,22 @@ class DataSet:
results = g.db.execute(query).fetchall()
trait_sample_data.append([list(result) for result in results])
- cache_dataset_results(
- self.name, self.type, trait_sample_data)
+ trait_count = len(trait_sample_data[0])
+ self.trait_data = collections.defaultdict(list)
- else:
- trait_sample_data = cached_results
-
- trait_count = len(trait_sample_data[0])
- self.trait_data = collections.defaultdict(list)
-
- # put all of the separate data together into a dictionary where the keys are
- # trait names and values are lists of sample values
- data_start_pos = 1
- for trait_counter in range(trait_count):
- trait_name = trait_sample_data[0][trait_counter][0]
- for chunk_counter in range(int(number_chunks)):
- self.trait_data[trait_name] += (
+ data_start_pos = 1
+ for trait_counter in range(trait_count):
+ trait_name = trait_sample_data[0][trait_counter][0]
+ for chunk_counter in range(int(number_chunks)):
+ self.trait_data[trait_name] += (
trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+ cache_dataset_results(
+ self.name, self.type, self.trait_data)
+
+ else:
+ self.trait_data = cached_results
class PhenotypeDataSet(DataSet):
DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -1282,7 +1281,9 @@ def generate_hash_file(dataset_name: str, dataset_timestamp: str):
def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: List):
- """function to cache dataset query results to file"""
+ """function to cache dataset query results to file
+ input dataset_name and type query_results(already processed in default dict format)
+ """
# data computations actions
# store the file path on redis
--
cgit v1.2.3
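
Note: the cache point now stores the assembled trait_data mapping instead of the raw per-chunk query rows, so the cached and freshly computed paths hand back the same structure. A stdlib-only sketch of that assembly step, assuming each chunk is a list of rows of the form [trait_name, value, value, ...] aligned across chunks:

    import collections

    def assemble_trait_data(trait_sample_data):
        """Merge per-chunk query rows into {trait_name: [sample values, ...]}."""
        trait_data = collections.defaultdict(list)
        trait_count = len(trait_sample_data[0])
        data_start_pos = 1  # column 0 holds the trait name
        for trait_counter in range(trait_count):
            trait_name = trait_sample_data[0][trait_counter][0]
            for chunk in trait_sample_data:
                trait_data[trait_name] += chunk[trait_counter][data_start_pos:]
        return trait_data
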
From a8ccaf03ba151f9ceca2f0224af33db230a8c8b3 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 15:53:50 +0300
Subject: test generate new files
---
wqflask/base/data_set.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index a3a720ad..cae1a2a7 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -810,7 +810,7 @@ class DataSet:
trait_name = trait_sample_data[0][trait_counter][0]
for chunk_counter in range(int(number_chunks)):
self.trait_data[trait_name] += (
- trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+ trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
cache_dataset_results(
self.name, self.type, self.trait_data)
@@ -818,6 +818,8 @@ class DataSet:
else:
self.trait_data = cached_results
+
+
class PhenotypeDataSet(DataSet):
DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -1291,7 +1293,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L
results = r.set(f"{dataset_type}timestamp", table_timestamp)
- file_name = generate_hash_file(dataset_name, table_timestamp)
+ file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
file_path = os.path.join(TMPDIR, f"{file_name}.json")
with open(file_path, "w") as file_handler:
@@ -1308,7 +1310,7 @@ def fetch_cached_results(dataset_name: str, dataset_type: str):
else:
table_timestamp = ""
- file_name = generate_hash_file(dataset_name, table_timestamp)
+ file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
file_path = os.path.join(TMPDIR, f"{file_name}.json")
try:
with open(file_path, "r") as file_handler:
--
cgit v1.2.3
From 4725a8c20d1d4209d59b3b113f00bbc467c5bd31 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 19:43:15 +0300
Subject: init: disable fast compute (memory fork issues)
---
wqflask/wqflask/correlation/correlation_gn3_api.py | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py
index 191a748a..635ef5ed 100644
--- a/wqflask/wqflask/correlation/correlation_gn3_api.py
+++ b/wqflask/wqflask/correlation/correlation_gn3_api.py
@@ -194,20 +194,8 @@ def compute_correlation(start_vars, method="pearson", compute_all=False):
(this_trait_data, target_dataset_data) = fetch_sample_data(
start_vars, this_trait, this_dataset, target_dataset)
- correlation_results = fetch_precompute_results(
- this_dataset.name, target_dataset.name, this_dataset.type, this_trait.name)
-
- if correlation_results is None:
- correlation_results = fast_compute_all_sample_correlation(corr_method=method,
- this_trait=this_trait_data,
- target_dataset=target_dataset_data)
-
- cache_compute_results(this_dataset.type,
- this_dataset.name,
- target_dataset.name,
- corr_method,
- correlation_results,
- this_trait.name)
+ correlation_results = compute_all_sample_correlation(
+ corr_method=method, this_trait=this_trait_data, target_dataset=target_dataset_data)
elif corr_type == "tissue":
trait_symbol_dict = this_dataset.retrieve_genes("Symbol")
--
cgit v1.2.3
From 06fbab6427cadf7706da4e954874a7e5da1bd32d Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 19:48:11 +0300
Subject: pep8 formatting; remove debug statements
---
wqflask/base/data_set.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index cae1a2a7..37f35121 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -1263,7 +1263,7 @@ def query_table_timestamp(dataset_type: str):
# computation data and actions
query_update_time = f"""
- SELECT UPDATE_TIME FROM information_schema.tables
+ SELECT UPDATE_TIME FROM information_schfema.tables
WHERE TABLE_SCHEMA = 'db_webqtl_s'
AND TABLE_NAME = '{dataset_type}Data'
"""
@@ -1275,7 +1275,7 @@ def query_table_timestamp(dataset_type: str):
return date_time_obj.strftime(f)
-def generate_hash_file(dataset_name: str, dataset_timestamp: str):
+def generate_hash_file(dataset_name: str, dataset_type: str, dataset_timestamp: str):
"""given the trait_name generate a unique name for this"""
string_unicode = f"{dataset_name}{dataset_timestamp}".encode()
md5hash = hashlib.md5(string_unicode)
@@ -1317,5 +1317,4 @@ def fetch_cached_results(dataset_name: str, dataset_type: str):
return json.load(file_handler)
except FileNotFoundError:
- # take actions continue to fetch dataset results and fetch results
- pass
+ pass
\ No newline at end of file
--
cgit v1.2.3
From 679051788a475dfcefd4cb93dc82ec3a4b86edc3 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 19:54:55 +0300
Subject: use list comprehension; fix typo
---
wqflask/base/data_set.py | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 37f35121..553530d4 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -747,9 +747,7 @@ class DataSet:
and Species.name = '{}'
""".format(create_in_clause(self.samplelist), *mescape(self.group.species))
results = dict(g.db.execute(query).fetchall())
- sample_ids = [results.get(item) for item in self.samplelist]
-
- sample_ids = [ids for ids in sample_ids if ids is not None]
+ sample_ids = [results.get(item) for item in self.samplelist if item is not None]
# MySQL limits the number of tables that can be used in a join to 61,
# so we break the sample ids into smaller chunks
@@ -1263,7 +1261,7 @@ def query_table_timestamp(dataset_type: str):
# computation data and actions
query_update_time = f"""
- SELECT UPDATE_TIME FROM information_schfema.tables
+ SELECT UPDATE_TIME FROM information_schema.tables
WHERE TABLE_SCHEMA = 'db_webqtl_s'
AND TABLE_NAME = '{dataset_type}Data'
"""
@@ -1317,4 +1315,4 @@ def fetch_cached_results(dataset_name: str, dataset_type: str):
return json.load(file_handler)
except FileNotFoundError:
- pass
\ No newline at end of file
+ pass
--
cgit v1.2.3
From 60fe836dc6c2f00cb99844572eb3fd29aee0163e Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:07:02 +0300
Subject: use a dynamic value for the db_name
---
wqflask/base/data_set.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 553530d4..2e401c8e 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -40,6 +40,7 @@ from base import species
from base import webqtlConfig
from flask import Flask, g
from base.webqtlConfig import TMPDIR
+from gn3.db_utils import parse_db_url
import os
import math
import string
@@ -747,7 +748,8 @@ class DataSet:
and Species.name = '{}'
""".format(create_in_clause(self.samplelist), *mescape(self.group.species))
results = dict(g.db.execute(query).fetchall())
- sample_ids = [results.get(item) for item in self.samplelist if item is not None]
+ sample_ids = [results.get(item)
+ for item in self.samplelist if item is not None]
# MySQL limits the number of tables that can be used in a join to 61,
# so we break the sample ids into smaller chunks
@@ -1260,9 +1262,11 @@ def query_table_timestamp(dataset_type: str):
# computation data and actions
+ fetch_db_name = parse_db_url()
+
query_update_time = f"""
SELECT UPDATE_TIME FROM information_schema.tables
- WHERE TABLE_SCHEMA = 'db_webqtl_s'
+ WHERE TABLE_SCHEMA = {fetch_db_name[-1]}
AND TABLE_NAME = '{dataset_type}Data'
"""
--
cgit v1.2.3
From 71a859c9facc7ae49d43e3e995166ad8dcb586cb Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:11:12 +0300
Subject: isolate SQL_URI parse to a function
---
wqflask/base/data_set.py | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 2e401c8e..f0a930a5 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -40,7 +40,8 @@ from base import species
from base import webqtlConfig
from flask import Flask, g
from base.webqtlConfig import TMPDIR
-from gn3.db_utils import parse_db_url
+from urllib.parse import urlparse
+from utility.tools import SQL_URI
import os
import math
import string
@@ -1257,6 +1258,13 @@ def geno_mrna_confidentiality(ob):
return True
+
+def parse_db_url():
+ parsed_db = urlparse(SQL_URI)
+
+ return (parsed_db.hostname, parsed_db.username,
+ parsed_db.password, parsed_db.path[1:])
+
def query_table_timestamp(dataset_type: str):
"""function to query the update timestamp of a given dataset_type"""
--
cgit v1.2.3
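
Note: parse_db_url just splits the SQL_URI connection string with urllib's urlparse. A stdlib sketch with a hypothetical URI showing what each tuple element carries:

    from urllib.parse import urlparse

    def parse_db_url(sql_uri):
        """Split a MySQL connection URI into (host, user, password, db name)."""
        parsed = urlparse(sql_uri)
        return (parsed.hostname, parsed.username,
                parsed.password, parsed.path[1:])

    # Hypothetical URI for illustration only:
    # parse_db_url("mysql://webqtl:secret@localhost/db_webqtl")
    # -> ("localhost", "webqtl", "secret", "db_webqtl")
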
From 5a407a34442860ebaea2886f2278be9e1eb33a8d Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:13:41 +0300
Subject: replace redis fetch for cached timestamp with a query
---
wqflask/base/data_set.py | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index f0a930a5..52d1d254 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -1301,8 +1301,6 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L
table_timestamp = query_table_timestamp(dataset_type)
- results = r.set(f"{dataset_type}timestamp", table_timestamp)
-
file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
file_path = os.path.join(TMPDIR, f"{file_name}.json")
@@ -1313,12 +1311,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L
def fetch_cached_results(dataset_name: str, dataset_type: str):
"""function to fetch the cached results"""
- table_timestamp = r.get(f"{dataset_type}timestamp")
-
- if table_timestamp is not None:
- table_timestamp = table_timestamp.decode("utf-8")
- else:
- table_timestamp = ""
+ table_timestamp = query_table_timestamp(dataset_type)
file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
file_path = os.path.join(TMPDIR, f"{file_name}.json")
--
cgit v1.2.3
From 1090674ac9497dad22803e7bf8e51d77245f8a0c Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:27:05 +0300
Subject: isolate function to fetch the traits metadata
---
wqflask/wqflask/correlation/pre_computes.py | 32 +++++++++++++++++------------
1 file changed, 19 insertions(+), 13 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 355701f2..638ae860 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -9,6 +9,25 @@ from redis import Redis
r = Redis()
+# code to isolate metadata caching
+
+
+def fetch_all_cached_metadata(dataset_name):
+ """in a gvein dataset fetch all the traits metadata"""
+ file_name = f"{dataset_name}_metadata.json"
+
+ file_path = os.path.join(TMPDIR, file_name)
+
+ with open(file_path, "r+") as file_handler:
+ dataset_metadata = json.load(file_handler)
+
+ except FileNotFoundError:
+ Path(file_path).touch(exist_ok=True)
+ return {}
+
+ return dataset_metadata
+
+
def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
"""generate unique filename"""
@@ -60,18 +79,10 @@ def cache_compute_results(base_dataset_type,
json.dump(data, file_handler)
- # create the file only if it does not exists
-
- # else open the file to cache the results
-
def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
"""function to check for precomputed results"""
- # check for redis timestamp
-
- # fix rely on the fact correlation run oftenly probeset is set
-
base_timestamp = target_dataset_timestamp = r.get(f"{dataset_type}timestamp")
if base_timestamp is None:
return
@@ -86,11 +97,9 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
file_path = os.path.join(TMPDIR, f"{file_name}.json")
-
try:
with open(file_path, "r+") as json_handler:
correlation_results = json.load(json_handler)
- # print(correlation_results)
return correlation_results.get(trait_name)
@@ -131,8 +140,6 @@ def get_datasets_data(base_dataset, target_dataset_data):
(works for bxd only probeset datasets)
- # fix issue with fetching of the datasets
-
output:two dicts for datasets with key==trait and value==strains
"""
samples_fetched = base_dataset.group.all_samples_ordered()
@@ -142,7 +149,6 @@ def get_datasets_data(base_dataset, target_dataset_data):
base_traits_data = base_dataset.get_trait_data(
samples_fetched)
-
target_results = map_shared_keys_to_values(
samples_fetched, target_traits_data)
base_results = map_shared_keys_to_values(
--
cgit v1.2.3
From c872594d21ab743ae55ae4f1d037d13394ef8c67 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:34:24 +0300
Subject: isolate function to cache new traits metadata
---
wqflask/wqflask/correlation/pre_computes.py | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 638ae860..9270bdd4 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -1,6 +1,7 @@
import json
import os
import hashlib
+from pathlib import Path
from base.data_set import query_table_timestamp
from base.webqtlConfig import TMPDIR
@@ -25,8 +26,22 @@ def fetch_all_cached_metadata(dataset_name):
Path(file_path).touch(exist_ok=True)
return {}
- return dataset_metadata
+ return (file_path, dataset_metadata)
+ if bool(new_traits_metadata):
+ # that means new traits exists
+ dataset_metadata.update(new_traits_metadata)
+ with open(file_path, "w+") as file_handler:
+ json.dump(dataset_metadata, file_handler)
+
+
+def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_path: str):
+ """function to cache the new traits metadata"""
+
+ if bool(new_traits_metadata):
+ dataset_metadata.update(new_traits_metadata)
+ with open(file_path,"w+") as file_handler:
+ json.dump(dataset_metadata,file_handler)
def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
--
cgit v1.2.3
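
Note: the two helpers above form a load/merge/flush pair for per-dataset trait metadata. A cleaned-up sketch of the intended flow (the hunk as committed still carries an unreachable block after the return, dropped in the next commit); TMPDIR stands in for base.webqtlConfig.TMPDIR, and an empty or corrupt cache file is treated like a missing one:

    import json
    import os
    from pathlib import Path

    TMPDIR = "/tmp"  # stand-in for base.webqtlConfig.TMPDIR

    def fetch_all_cached_metadata(dataset_name):
        """Load all cached trait metadata for a dataset, creating the file if absent."""
        file_path = os.path.join(TMPDIR, f"{dataset_name}_metadata.json")
        try:
            with open(file_path, "r") as file_handler:
                return file_path, json.load(file_handler)
        except (FileNotFoundError, json.JSONDecodeError):
            Path(file_path).touch(exist_ok=True)
            return file_path, {}

    def cache_new_traits_metadata(dataset_metadata, new_traits_metadata, file_path):
        """Merge newly seen trait metadata into the cache and rewrite the file."""
        if new_traits_metadata:
            dataset_metadata.update(new_traits_metadata)
            with open(file_path, "w") as file_handler:
                json.dump(dataset_metadata, file_handler)
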
From a35ae60965d7cada41acad661afd88a8fc58e78e Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:42:47 +0300
Subject: pep8 formatting; remove redis dependency
---
wqflask/wqflask/correlation/pre_computes.py | 26 ++++----------------------
1 file changed, 4 insertions(+), 22 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 9270bdd4..403d60c9 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -5,12 +5,6 @@ from pathlib import Path
from base.data_set import query_table_timestamp
from base.webqtlConfig import TMPDIR
-from json.decoder import JSONDecodeError
-from redis import Redis
-
-r = Redis()
-
-# code to isolate metadata caching
def fetch_all_cached_metadata(dataset_name):
@@ -28,20 +22,14 @@ def fetch_all_cached_metadata(dataset_name):
return (file_path, dataset_metadata)
- if bool(new_traits_metadata):
- # that means new traits exists
- dataset_metadata.update(new_traits_metadata)
- with open(file_path, "w+") as file_handler:
- json.dump(dataset_metadata, file_handler)
-
def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_path: str):
"""function to cache the new traits metadata"""
if bool(new_traits_metadata):
dataset_metadata.update(new_traits_metadata)
- with open(file_path,"w+") as file_handler:
- json.dump(dataset_metadata,file_handler)
+ with open(file_path, "w+") as file_handler:
+ json.dump(dataset_metadata, file_handler)
def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
@@ -98,14 +86,8 @@ def cache_compute_results(base_dataset_type,
def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
"""function to check for precomputed results"""
- base_timestamp = target_dataset_timestamp = r.get(f"{dataset_type}timestamp")
- if base_timestamp is None:
- return
-
- else:
- base_timestamp = target_dataset_timestamp = base_timestamp.decode(
- "utf-8")
-
+ base_timestamp = target_dataset_timestamp = query_table_timestamp(
+ dataset_type)
file_name = generate_filename(
base_dataset_name, target_dataset_name,
base_timestamp, target_dataset_timestamp)
--
cgit v1.2.3
From 6786712e95cbb885b6b19b3ecd34e6c8ee893172 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 20:20:07 +0300
Subject: refactor sql query & date formatting
---
wqflask/base/data_set.py | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 52d1d254..2687738d 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -1258,31 +1258,27 @@ def geno_mrna_confidentiality(ob):
return True
-
def parse_db_url():
parsed_db = urlparse(SQL_URI)
return (parsed_db.hostname, parsed_db.username,
parsed_db.password, parsed_db.path[1:])
+
def query_table_timestamp(dataset_type: str):
"""function to query the update timestamp of a given dataset_type"""
# computation data and actions
fetch_db_name = parse_db_url()
-
query_update_time = f"""
SELECT UPDATE_TIME FROM information_schema.tables
- WHERE TABLE_SCHEMA = {fetch_db_name[-1]}
+ WHERE TABLE_SCHEMA = '{fetch_db_name[-1]}'
AND TABLE_NAME = '{dataset_type}Data'
"""
- # store the timestamp in redis=
date_time_obj = g.db.execute(query_update_time).fetchone()[0]
-
- f = "%Y-%m-%d %H:%M:%S"
- return date_time_obj.strftime(f)
+ return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")
def generate_hash_file(dataset_name: str, dataset_type: str, dataset_timestamp: str):
@@ -1301,6 +1297,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L
table_timestamp = query_table_timestamp(dataset_type)
+
file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
file_path = os.path.join(TMPDIR, f"{file_name}.json")
--
cgit v1.2.3
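
Note: query_table_timestamp keys the cache on the table's last UPDATE_TIME from information_schema, so any write to the {dataset_type}Data table invalidates every derived cache file. The patch interpolates the schema and table names into an f-string run through flask.g.db; a sketch of the same query with a parameterized DB-API cursor (assumed connection):

    def query_table_timestamp(cursor, db_name, dataset_type):
        """Return the UPDATE_TIME of the {dataset_type}Data table as a string."""
        cursor.execute(
            """SELECT UPDATE_TIME FROM information_schema.tables
               WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s""",
            (db_name, f"{dataset_type}Data"))
        (update_time,) = cursor.fetchone()
        return update_time.strftime("%Y-%m-%d %H:%M:%S")
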
From 1a3b85c4ebc66d54e3bda06c3742e8046e4c8159 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 20:38:20 +0300
Subject: add generic function for generating filename
---
wqflask/wqflask/correlation/pre_computes.py | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 403d60c9..241b0730 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -13,8 +13,9 @@ def fetch_all_cached_metadata(dataset_name):
file_path = os.path.join(TMPDIR, file_name)
- with open(file_path, "r+") as file_handler:
- dataset_metadata = json.load(file_handler)
+ try:
+ with open(file_path, "r+") as file_handler:
+ dataset_metadata = json.load(file_handler)
except FileNotFoundError:
Path(file_path).touch(exist_ok=True)
@@ -32,6 +33,13 @@ def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_
json.dump(dataset_metadata, file_handler)
+def generate_file_name(*args, prefix=""):
+ """given a list of args generate a unique filename"""
+
+ string_unicode = f"{*args,}{prefix}".encode()
+ return hashlib.md5(string_unicode).hexdigest()
+
+
def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
"""generate unique filename"""
--
cgit v1.2.3
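
Note: generate_file_name hashes the repr of its positional-argument tuple, so any mix of names and timestamps produces a stable stem. A quick illustration of what f"{*args,}" evaluates to and how the digest is formed (placeholder dataset names):

    import hashlib

    def generate_file_name(*args, prefix=""):
        """Hash the argument tuple (plus an optional prefix) into a filename stem."""
        string_unicode = f"{*args,}{prefix}".encode()
        return hashlib.md5(string_unicode).hexdigest()

    # With args == ("BaseSet", "TargetSet"), f"{*args,}" is "('BaseSet', 'TargetSet')",
    # so generate_file_name("BaseSet", "TargetSet", prefix="corr")
    # hashes the bytes of "('BaseSet', 'TargetSet')corr".
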
From 56b574b903244a64aecaa54e5305b25bb642b254 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 18 Nov 2021 12:02:10 +0300
Subject: pep8 formatting; minor fixes
---
wqflask/wqflask/correlation/pre_computes.py | 37 ++++++++++++++++++-----------
1 file changed, 23 insertions(+), 14 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 241b0730..77592a3a 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -9,7 +9,7 @@ from base.webqtlConfig import TMPDIR
def fetch_all_cached_metadata(dataset_name):
"""in a gvein dataset fetch all the traits metadata"""
- file_name = f"{dataset_name}_metadata.json"
+ file_name = generate_file_name(dataset_name, suffix="metadata")
file_path = os.path.join(TMPDIR, file_name)
@@ -33,11 +33,11 @@ def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_
json.dump(dataset_metadata, file_handler)
-def generate_file_name(*args, prefix=""):
+def generate_file_name(*args, suffix="", file_ext="json"):
"""given a list of args generate a unique filename"""
- string_unicode = f"{*args,}{prefix}".encode()
- return hashlib.md5(string_unicode).hexdigest()
+ string_unicode = f"{*args,}".encode()
+ return f"{hashlib.md5(string_unicode).hexdigest()}_{suffix}.{file_ext}"
def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
@@ -65,11 +65,12 @@ def cache_compute_results(base_dataset_type,
target_dataset_timestamp = base_timestamp
- file_name = generate_filename(
+ file_name = generate_file_name(
base_dataset_name, target_dataset_name,
- base_timestamp, target_dataset_timestamp)
+ base_timestamp, target_dataset_timestamp,
+ suffix="corr_precomputes")
- file_path = os.path.join(TMPDIR, f"{file_name}.json")
+ file_path = os.path.join(TMPDIR, file_name)
try:
with open(file_path, "r+") as json_file_handler:
@@ -91,16 +92,20 @@ def cache_compute_results(base_dataset_type,
json.dump(data, file_handler)
-def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
+def fetch_precompute_results(base_dataset_name,
+ target_dataset_name,
+ dataset_type,
+ trait_name):
"""function to check for precomputed results"""
base_timestamp = target_dataset_timestamp = query_table_timestamp(
dataset_type)
- file_name = generate_filename(
+ file_name = generate_file_name(
base_dataset_name, target_dataset_name,
- base_timestamp, target_dataset_timestamp)
+ base_timestamp, target_dataset_timestamp,
+ suffix="corr_precomputes")
- file_path = os.path.join(TMPDIR, f"{file_name}.json")
+ file_path = os.path.join(TMPDIR, file_name)
try:
with open(file_path, "r+") as json_handler:
@@ -112,7 +117,9 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
pass
-def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method):
+def pre_compute_dataset_vs_dataset(base_dataset,
+ target_dataset,
+ corr_method):
"""compute sample correlation between dataset vs dataset
wn:heavy function should be invoked less frequently
input:datasets_data(two dicts),corr_method
@@ -131,8 +138,10 @@ def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method):
"trait_id": primary_trait_name
}
- trait_correlation_result = fast_compute_all_sample_correlation(
- corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_traits_data)
+ trait_correlation_result = compute_all_sample_correlation(
+ corr_method=corr_method,
+ this_trait=this_trait_data,
+ target_dataset=target_traits_data)
dataset_correlation_results[primary_trait_name] = trait_correlation_result
--
cgit v1.2.3
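
Two things change above: generate_file_name now returns a complete file name (digest, suffix and extension), so call sites stop appending ".json" themselves, and pre_compute_dataset_vs_dataset switches from fast_compute_all_sample_correlation to the plain compute_all_sample_correlation. Illustrative use of the reworked helper — dataset names and timestamps are made up, and the helper is renamed to generate_filename in the next commit:

    import os
    from base.webqtlConfig import TMPDIR
    from wqflask.correlation.pre_computes import generate_file_name

    file_name = generate_file_name(
        "HC_M2_0606_P", "BXDPublish",
        "2021-11-18 12:00:00", "2021-11-18 12:00:00",
        suffix="corr_precomputes")          # -> "<md5 hex>_corr_precomputes.json"
    file_path = os.path.join(TMPDIR, file_name)
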
From 4de623130dca019d15f956e91ec999fddc2e2a0f Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 18 Nov 2021 12:03:48 +0300
Subject: remove unused functions; rename function names
---
wqflask/wqflask/correlation/pre_computes.py | 15 ++++-----------
1 file changed, 4 insertions(+), 11 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 77592a3a..b95ceba5 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -9,7 +9,7 @@ from base.webqtlConfig import TMPDIR
def fetch_all_cached_metadata(dataset_name):
"""in a gvein dataset fetch all the traits metadata"""
- file_name = generate_file_name(dataset_name, suffix="metadata")
+ file_name = generate_filename(dataset_name, suffix="metadata")
file_path = os.path.join(TMPDIR, file_name)
@@ -33,20 +33,13 @@ def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_
json.dump(dataset_metadata, file_handler)
-def generate_file_name(*args, suffix="", file_ext="json"):
+def generate_filename(*args, suffix="", file_ext="json"):
"""given a list of args generate a unique filename"""
string_unicode = f"{*args,}".encode()
return f"{hashlib.md5(string_unicode).hexdigest()}_{suffix}.{file_ext}"
-def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
- """generate unique filename"""
-
- string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode()
- return hashlib.md5(string_unicode).hexdigest()
-
-
def cache_compute_results(base_dataset_type,
base_dataset_name,
target_dataset_name,
@@ -65,7 +58,7 @@ def cache_compute_results(base_dataset_type,
target_dataset_timestamp = base_timestamp
- file_name = generate_file_name(
+ file_name = generate_filename(
base_dataset_name, target_dataset_name,
base_timestamp, target_dataset_timestamp,
suffix="corr_precomputes")
@@ -100,7 +93,7 @@ def fetch_precompute_results(base_dataset_name,
base_timestamp = target_dataset_timestamp = query_table_timestamp(
dataset_type)
- file_name = generate_file_name(
+ file_name = generate_filename(
base_dataset_name, target_dataset_name,
base_timestamp, target_dataset_timestamp,
suffix="corr_precomputes")
--
cgit v1.2.3
From 24d87cb4e75136f822b316a3c9f936b8e5efb5e9 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 18 Nov 2021 13:40:54 +0300
Subject: refactor code for metadata
---
wqflask/wqflask/correlation/pre_computes.py | 11 ++-------
wqflask/wqflask/correlation/show_corr_results.py | 29 +++++++++---------------
2 files changed, 13 insertions(+), 27 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index b95ceba5..ad0bc6ef 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -16,12 +16,11 @@ def fetch_all_cached_metadata(dataset_name):
try:
with open(file_path, "r+") as file_handler:
dataset_metadata = json.load(file_handler)
+ return (file_path, dataset_metadata)
except FileNotFoundError:
Path(file_path).touch(exist_ok=True)
- return {}
-
- return (file_path, dataset_metadata)
+ return (file_path, {})
def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_path: str):
@@ -46,16 +45,10 @@ def cache_compute_results(base_dataset_type,
corr_method,
correlation_results,
trait_name):
- # pass
"""function to cache correlation results for heavy computations"""
- # init assumption only caching probeset type
- # fix redis;issue potential redis_cache!=current_timestamp
-
base_timestamp = query_table_timestamp(base_dataset_type)
- r.set(f"{base_dataset_type}timestamp", base_timestamp)
-
target_dataset_timestamp = base_timestamp
file_name = generate_filename(
diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py
index 42010a1e..f5600f13 100644
--- a/wqflask/wqflask/correlation/show_corr_results.py
+++ b/wqflask/wqflask/correlation/show_corr_results.py
@@ -26,6 +26,9 @@ from base.trait import create_trait, jsonable
from base.data_set import create_dataset
from base.webqtlConfig import TMPDIR
+from wqflask.correlation.pre_computes import fetch_all_cached_metadata
+from wqflask.correlation.pre_computes import cache_new_traits_metadata
+
from utility import hmac
@@ -34,7 +37,8 @@ def set_template_vars(start_vars, correlation_data):
corr_method = start_vars['corr_sample_method']
if start_vars['dataset'] == "Temp":
- this_dataset_ob = create_dataset(dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group'])
+ this_dataset_ob = create_dataset(
+ dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group'])
else:
this_dataset_ob = create_dataset(dataset_name=start_vars['dataset'])
this_trait = create_trait(dataset=this_dataset_ob,
@@ -86,25 +90,17 @@ def correlation_json_for_table(correlation_data, this_trait, this_dataset, targe
corr_results = correlation_data['correlation_results']
results_list = []
- file_name = f"{target_dataset['name']}_metadata.json"
-
- file_path = os.path.join(TMPDIR, file_name)
new_traits_metadata = {}
- try:
- with open(file_path,"r+") as file_handler:
- dataset_metadata = json.load(file_handler)
-
- except FileNotFoundError:
- Path(file_path).touch(exist_ok=True)
- dataset_metadata = {}
+ (file_path, dataset_metadata) = fetch_all_cached_metadata(
+ target_dataset['name'])
for i, trait_dict in enumerate(corr_results):
trait_name = list(trait_dict.keys())[0]
trait = trait_dict[trait_name]
target_trait = dataset_metadata.get(trait_name)
- if target_trait is None:
+ if target_trait is None:
target_trait_ob = create_trait(dataset=target_dataset_ob,
name=trait_name,
get_qtl_info=True)
@@ -184,12 +180,9 @@ def correlation_json_for_table(correlation_data, this_trait, this_dataset, targe
results_list.append(results_dict)
-
- if bool(new_traits_metadata):
- # that means new traits exists
- dataset_metadata.update(new_traits_metadata)
- with open(file_path,"w+") as file_handler:
- json.dump(dataset_metadata, file_handler)
+ cache_new_traits_metadata(dataset_metadata,
+ new_traits_metadata,
+ file_path)
return json.dumps(results_list)
--
cgit v1.2.3
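
The refactor above puts both sides of the per-trait metadata cache behind the pre_computes helpers: fetch_all_cached_metadata now always returns a (file_path, dict) pair, and cache_new_traits_metadata persists the merged dict once the correlation table has been built. A sketch of the calling pattern in correlation_json_for_table — trait_names and build_trait_metadata are placeholders standing in for the unchanged create_trait()/jsonable() serialisation code:

    from wqflask.correlation.pre_computes import fetch_all_cached_metadata
    from wqflask.correlation.pre_computes import cache_new_traits_metadata

    (file_path, dataset_metadata) = fetch_all_cached_metadata(target_dataset['name'])
    new_traits_metadata = {}

    for trait_name in trait_names:                       # one entry per correlation result
        target_trait = dataset_metadata.get(trait_name)  # cache hit?
        if target_trait is None:
            target_trait = build_trait_metadata(trait_name)  # placeholder helper
            new_traits_metadata[trait_name] = target_trait
        # ... assemble the results row from target_trait ...

    # One write after the loop; only the traits that were missing are new.
    cache_new_traits_metadata(dataset_metadata, new_traits_metadata, file_path)
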
From b7a4fa3007e2a3364e7a827b0bf4b3a54fcc272d Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 23 Nov 2021 13:38:36 +0300
Subject: merge commit: added some logic that takes into account
corr_sample_group when determining which samples to use when getting
sample_data
---
wqflask/wqflask/correlation/correlation_gn3_api.py | 31 +++++++++++++++++-----
1 file changed, 25 insertions(+), 6 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py
index 635ef5ed..32a55b44 100644
--- a/wqflask/wqflask/correlation/correlation_gn3_api.py
+++ b/wqflask/wqflask/correlation/correlation_gn3_api.py
@@ -64,20 +64,27 @@ def test_process_data(this_trait, dataset, start_vars):
return sample_data
-def process_samples(start_vars, sample_names, excluded_samples=None):
- """process samples"""
+def process_samples(start_vars,sample_names = [],excluded_samples = []):
+ """code to fetch correct samples"""
sample_data = {}
- if not excluded_samples:
- excluded_samples = ()
- sample_vals_dict = json.loads(start_vars["sample_vals"])
+ sample_vals_dict = json.loads(start_vars["sample_vals"])
+ if sample_names:
for sample in sample_names:
- if sample not in excluded_samples and sample in sample_vals_dict:
+ if sample in sample_vals_dict and sample not in excluded_samples:
+ val = sample_vals_dict[sample]
+ if not val.strip().lower() == "x":
+ sample_data[str(sample)] = float(val)
+
+ else:
+ for sample in sample_vals_dict.keys():
+ if sample not in excluded_samples:
val = sample_vals_dict[sample]
if not val.strip().lower() == "x":
sample_data[str(sample)] = float(val)
return sample_data
+
def merge_correlation_results(correlation_results, target_correlation_results):
corr_dict = {}
@@ -153,6 +160,18 @@ def lit_for_trait_list(corr_results, this_dataset, this_trait):
def fetch_sample_data(start_vars, this_trait, this_dataset, target_dataset):
+ corr_samples_group = start_vars["corr_samples_group"]
+ if corr_samples_group == "samples_primary":
+ sample_data = process_samples(
+ start_vars, this_dataset.group.all_samples_ordered())
+
+ elif corr_samples_group == "samples_other":
+ sample_data = process_samples(
+ start_vars, excluded_samples = this_dataset.group.samplelist)
+
+ else:
+ sample_data = process_samples(start_vars)
+
sample_data = process_samples(
start_vars, this_dataset.group.all_samples_ordered())
--
cgit v1.2.3
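
The new branching in fetch_sample_data picks the primary samples, the complement of the primary group, or every sample in sample_vals. Two details are worth flagging: the committed process_samples signature uses mutable default arguments (stylistic rather than dangerous here, since neither list is mutated), and the pre-existing unconditional process_samples(...) call left beneath the new if/elif/else appears to overwrite whatever the branch just selected. A sketch of an equivalent process_samples with immutable defaults and a single filtering loop:

    import json

    def process_samples(start_vars, sample_names=None, excluded_samples=()):
        """Pick usable (non-"x") sample values out of start_vars["sample_vals"]."""
        sample_vals_dict = json.loads(start_vars["sample_vals"])
        if sample_names is None:
            sample_names = sample_vals_dict.keys()   # fall back to every sample
        sample_data = {}
        for sample in sample_names:
            if sample in sample_vals_dict and sample not in excluded_samples:
                val = sample_vals_dict[sample]
                if val.strip().lower() != "x":
                    sample_data[str(sample)] = float(val)
        return sample_data
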
From aa9a06d927bdc2b5221e58559f24921a0ff72cd8 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 23 Nov 2021 13:50:21 +0300
Subject: pep8 formatting; remove dead variables
---
wqflask/base/data_set.py | 1 -
1 file changed, 1 deletion(-)
(limited to 'wqflask')
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 2687738d..4d75e7ee 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -758,7 +758,6 @@ class DataSet:
chunk_size = 50
number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
cached_results = fetch_cached_results(self.name, self.type)
- # cached_results = None
if cached_results is None:
trait_sample_data = []
for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
--
cgit v1.2.3
From fffeb91789943a3c7db5a72d66405e2a0459ed44 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 23 Nov 2021 14:49:07 +0300
Subject: fix for overwriting file
---
wqflask/wqflask/correlation/pre_computes.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
(limited to 'wqflask')
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index ad0bc6ef..975a53b8 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -28,8 +28,9 @@ def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_
if bool(new_traits_metadata):
dataset_metadata.update(new_traits_metadata)
- with open(file_path, "w+") as file_handler:
- json.dump(dataset_metadata, file_handler)
+
+ with open(file_path, "w+") as file_handler:
+ json.dump(dataset_metadata, file_handler)
def generate_filename(*args, suffix="", file_ext="json"):
--
cgit v1.2.3
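
After this fix cache_new_traits_metadata writes the file unconditionally: even when no new traits were fetched, the (possibly just-touch()ed, empty) cache file is replaced with valid JSON, which is plausibly the "overwriting file" problem the subject line refers to. A reconstruction of the helper as it reads after the patch (the docstring here is descriptive, not the committed one):

    import json

    def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_path: str):
        """Merge any newly fetched trait metadata and persist the whole dict."""
        if bool(new_traits_metadata):
            dataset_metadata.update(new_traits_metadata)

        with open(file_path, "w+") as file_handler:
            json.dump(dataset_metadata, file_handler)
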