From bf1620406c3700d7e211a02fe13c3d8df9a9532d Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 10 Nov 2021 10:39:53 +0300
Subject: rename:loading correlation results to computing correlations

---
 wqflask/wqflask/templates/loading.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wqflask/wqflask/templates/loading.html b/wqflask/wqflask/templates/loading.html
index ccf810b0..b9e31ad0 100644
--- a/wqflask/wqflask/templates/loading.html
+++ b/wqflask/wqflask/templates/loading.html
@@ -66,11 +66,11 @@
 {% endif %}
 {% endif %}
 {% else %}
-Loading {{ start_vars.tool_used }} Results...
+ {{ start_vars.tool_used }} Computation in progress ...
 {% endif %}

-
+
 {% if start_vars.vals_diff|length != 0 and start_vars.transform == "" %}

-- cgit v1.2.3 From b5b44f401e0d05089534d7f8e6631d9a092fd0d7 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Nov 2021 10:40:11 +0300 Subject: add compute gif --- wqflask/wqflask/static/gif/waitAnima2.gif | Bin 0 -> 54013 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 wqflask/wqflask/static/gif/waitAnima2.gif diff --git a/wqflask/wqflask/static/gif/waitAnima2.gif b/wqflask/wqflask/static/gif/waitAnima2.gif new file mode 100644 index 00000000..50aff7f2 Binary files /dev/null and b/wqflask/wqflask/static/gif/waitAnima2.gif differ -- cgit v1.2.3 From f3ff381a90733d6c64349ed1dd116df83b5565d6 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 02:51:08 +0300 Subject: add fast compute from gn3 --- wqflask/wqflask/correlation/correlation_gn3_api.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py index 1e3a40f2..7b828016 100644 --- a/wqflask/wqflask/correlation/correlation_gn3_api.py +++ b/wqflask/wqflask/correlation/correlation_gn3_api.py @@ -1,5 +1,7 @@ """module that calls the gn3 api's to do the correlation """ import json +import time +from functools import wraps from wqflask.correlation import correlation_functions @@ -9,6 +11,7 @@ from base.trait import create_trait from base.trait import retrieve_sample_data from gn3.computations.correlations import compute_all_sample_correlation +from gn3.computations.correlations import fast_compute_all_sample_correlation from gn3.computations.correlations import map_shared_keys_to_values from gn3.computations.correlations import compute_all_lit_correlation from gn3.computations.correlations import compute_tissue_correlation @@ -19,9 +22,11 @@ def create_target_this_trait(start_vars): """this function creates the required trait and target dataset for correlation""" if start_vars['dataset'] == "Temp": - this_dataset = data_set.create_dataset(dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group']) + this_dataset = data_set.create_dataset( + dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group']) else: - this_dataset = data_set.create_dataset(dataset_name=start_vars['dataset']) + this_dataset = data_set.create_dataset( + dataset_name=start_vars['dataset']) target_dataset = data_set.create_dataset( dataset_name=start_vars['corr_dataset']) this_trait = create_trait(dataset=this_dataset, @@ -187,10 +192,10 @@ def compute_correlation(start_vars, method="pearson", compute_all=False): if corr_type == "sample": (this_trait_data, target_dataset_data) = fetch_sample_data( start_vars, this_trait, this_dataset, target_dataset) - correlation_results = compute_all_sample_correlation(corr_method=method, - this_trait=this_trait_data, - target_dataset=target_dataset_data) + correlation_results = fast_compute_all_sample_correlation(corr_method=method, + this_trait=this_trait_data, + target_dataset=target_dataset_data) elif corr_type == "tissue": trait_symbol_dict = this_dataset.retrieve_genes("Symbol") tissue_input = get_tissue_correlation_input( -- cgit v1.2.3 From bbc75dcef80c3df600ab01c1804a27cdfdce1b80 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 02:51:44 +0300 Subject: init test for precomputing sample correlation --- wqflask/wqflask/correlation/pre_computes.py | 72 +++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 wqflask/wqflask/correlation/pre_computes.py diff --git 
a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py new file mode 100644 index 00000000..1db9f61b --- /dev/null +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -0,0 +1,72 @@ +"""module contains the code to do the +precomputations of sample data between +two entire datasets""" + +import json +from typing import List +from base import data_set + +from gn3.computations.correlations import fast_compute_all_sample_correlation +from gn3.computations.correlations import map_shared_keys_to_values + +def get_dataset_dict_data(dataset_obj): + """function to get the dataset data mapped to key""" + dataset_obj.get_trait_data() + return map_shared_keys_to_values(dataset_obj.samplelist, + dataset_obj.trait_data) + + +def fetch_datasets(base_dataset_name: str, target_dataset_name: str) ->List: + """query to fetch create datasets and fetch traits + all traits of a dataset""" + + # doesnt work for temp + + base_dataset = data_set.create_dataset(dataset_name=base_dataset_name) + + target_dataset = data_set.create_dataset(dataset_name=target_dataset_name) + # replace with map + + return (map(get_dataset_dict_data, + [base_dataset, target_dataset])) + + +# in the base dataset we just need the traits +def pre_compute_sample_correlation(base_dataset: List, + target_dataset: List) -> List: + """function compute the correlation between the + a whole dataset against a target + input: target&base_dataset(contains traits and sample results) + output: list containing the computed results + + precaution:function is expensive;targets only Exon and + """ + + for trait_info in base_dataset: + + yield fast_compute_all_sample_correlation(corr_method="pearson", + this_trait=trait_info, + target_dataset=target_dataset) + + +def cache_to_file(base_dataset_name: str, target_dataset_name: str): + """function to cache the results to file""" + + # validate the datasets expiry first + + base_dataset_data, target_dataset_data = [list(dataset) for dataset in list( + fetch_datasets(base_dataset_name, target_dataset_name))] + + + try: + with open("unique_file_name.json", "w") as file_handler: + file_handler.write() + + dataset_correlation_results = list(pre_compute_sample_correlation( + base_dataset_data, target_dataset_data)) + + print(dataset_correlation_results) + + json.dump(dataset_correlation_results, file_handler) + except Exception as error: + raise error -- cgit v1.2.3 From 6ced33f201e8a4e389a077a91ba9ed8bf5c19fa0 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 15:56:31 +0300 Subject: fix issue with number for samples --- wqflask/wqflask/correlation/pre_computes.py | 37 +++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 1db9f61b..f1c9e1bd 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -6,12 +6,14 @@ import json from typing import List from base import data_set +from gn3.computations.correlations import compute_all_sample_correlation from gn3.computations.correlations import fast_compute_all_sample_correlation from gn3.computations.correlations import map_shared_keys_to_values + def get_dataset_dict_data(dataset_obj): """function to get the dataset data mapped to key""" - dataset_obj.get_trait_data() + dataset_obj.get_trait_data(dataset_obj.group.all_samples_ordered()) return map_shared_keys_to_values(dataset_obj.samplelist, dataset_obj.trait_data) @@ -42,11 
+44,21 @@ def pre_compute_sample_correlation(base_dataset: List, precaution:function is expensive;targets only Exon and """ + results = [] + for trait_info in base_dataset: - yield fast_compute_all_sample_correlation(corr_method="pearson", - this_trait=trait_info, - target_dataset=target_dataset) + result = fast_compute_all_sample_correlation(corr_method="pearson", + this_trait=trait_info, + target_dataset=target_dataset) + + # results.append(fast_compute_all_sample_correlation(corr_method="pearson", + # this_trait=trait_info, + # target_dataset=target_dataset)) + print("finished") + print(result) + + return results def cache_to_file(base_dataset_name: str, target_dataset_name: str): @@ -57,16 +69,21 @@ def cache_to_file(base_dataset_name: str, target_dataset_name: str): base_dataset_data, target_dataset_data = [list(dataset) for dataset in list( fetch_datasets(base_dataset_name, target_dataset_name))] + # print(target_dataset_data) try: - with open("unique_file_name.json", "w") as file_handler: - file_handler.write() - - dataset_correlation_results = list(pre_compute_sample_correlation( - base_dataset_data, target_dataset_data)) + # with open("unique_file_name.json", "w") as file_handler: + # file_handler.write() + dataset_correlation_results = pre_compute_sample_correlation( + base_dataset_data, target_dataset_data) print(dataset_correlation_results) - json.dump(dataset_correlation_results, file_handler) + # json.dump(dataset_correlation_results, file_handler) except Exception as error: raise error + + +def check_cached_files_validity(): + """function to check the validity of cached files""" + pass -- cgit v1.2.3 From 5935e2cc3e0ac3a8004ccd5224557d34b62359d8 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 20:28:06 +0300 Subject: code to cache frequently run probeset correlation --- wqflask/wqflask/correlation/pre_computes.py | 124 ++++++++++++++-------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index f1c9e1bd..d0caca60 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -1,89 +1,91 @@ -"""module contains the code to do the -precomputations of sample data between -two entire datasets""" -import json -from typing import List -from base import data_set +import os +import hashlib -from gn3.computations.correlations import compute_all_sample_correlation -from gn3.computations.correlations import fast_compute_all_sample_correlation -from gn3.computations.correlations import map_shared_keys_to_values +from base.data_set import query_table_timestamp +from base.webqtlConfig import TMPDIR -def get_dataset_dict_data(dataset_obj): - """function to get the dataset data mapped to key""" - dataset_obj.get_trait_data(dataset_obj.group.all_samples_ordered()) - return map_shared_keys_to_values(dataset_obj.samplelist, - dataset_obj.trait_data) +def generate_filename(**kwargs): + """generate unique filename""" + base_dataset_name = kwargs["base_dataset"] + target_dataset_name = kwargs["target_dataset"] + base_timestamp = kwargs["base_timestamp"] + target_dataset_timestamp = kwargs["target_timestamp"] -def fetch_datasets(base_dataset_name: str, target_dataset_name: str) ->List: - """query to fetch create datasets and fetch traits - all traits of a dataset""" + string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode() + 
hashlib.md5(string_unicode).hexdigest() - # doesnt work for temp - base_dataset = data_set.create_dataset(dataset_name=base_dataset_name) +def cache_compute_results(start_vars, + base_dataset_type, + correlation_results, + trait_name): + # pass - target_dataset = data_set.create_dataset(dataset_name=target_dataset_name) - # replace with map + # init assumption only caching probeset type + # fix redis;issue potential redis_cache!=current_timestamp + base_timestamp = r.get(f"{base_dataset_type}timestamp") - return (map(get_dataset_dict_data, - [base_dataset, target_dataset])) + if base_timestamp is None: + # fetch the timestamp + base_timestamp = target_dataset_timestamp = query_table_timestamp( + dataset_type) + r.set(f"{dataset_type}timestamp", target_dataset_timestamp) -# in the base dataset we just need the traits -def pre_compute_sample_correlation(base_dataset: List, - target_dataset: List) -> List: - """function compute the correlation between the - a whole dataset against a target - input: target&base_dataset(contains traits and sample results) - output: list containing the computed results + file_name = generate_filename( + base_dataset_name, target_dataset_name, + base_timestamp, target_dataset_timestamp) - precaution:function is expensive;targets only Exon and - """ + file_path = os.path.join(TMPDIR, f"{file_name}.json") - results = [] + try: - for trait_info in base_dataset: + with open(file_path, "r+") as json_handler: - result = fast_compute_all_sample_correlation(corr_method="pearson", - this_trait=trait_info, - target_dataset=target_dataset) + results = json.load(json_handler) - # results.append(fast_compute_all_sample_correlation(corr_method="pearson", - # this_trait=trait_info, - # target_dataset=target_dataset)) - print("finished") - print(result) + if results.get(trait_name) is not None: + results.update({trait_name: correlation_results}) - return results + json.dump(results, json_handler) + except FileNotFoundError: + with open(file_path, "w") as json_handler: + json.dump({trait_name: correlation_results}, json_handler) -def cache_to_file(base_dataset_name: str, target_dataset_name: str): - """function to cache the results to file""" +def fetch_precompute_results(base_dataset_name,target_dataset_name,trait_name): + """function to check for precomputed results""" - # validate the datasets expiry first + # check for redis timestamp + + # fix rely on the fact correlation run oftenly probeset is set + + base_timestamp = target_dataset_timestamp = r.get(dataset_type) + + + if base_timestamp is None: + return + + else: + file_name = generate_filename( + base_dataset_name, target_dataset_name, + base_timestamp, target_dataset_timestamp) + + try: + with open(file_path,"r") as json_handler: + correlation_results = json.load(json_handler) + + return correlation_results.get(trait_name) + + except FileNotFoundError: + pass - base_dataset_data, target_dataset_data = [list(dataset) for dataset in list( - fetch_datasets(base_dataset_name, target_dataset_name))] - # print(target_dataset_data) - try: - # with open("unique_file_name.json", "w") as file_handler: - # file_handler.write() - dataset_correlation_results = pre_compute_sample_correlation( - base_dataset_data, target_dataset_data) - print(dataset_correlation_results) - # json.dump(dataset_correlation_results, file_handler) - except Exception as error: - raise error -def check_cached_files_validity(): - """function to check the validity of cached files""" - pass -- cgit v1.2.3 From ca4a5fdda8a7225dc5bebc17c61837ba5373ec68 Mon Sep 
17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 20:29:04 +0300 Subject: minor fix for generating file_name --- wqflask/wqflask/correlation/pre_computes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index d0caca60..55f25f0b 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -15,7 +15,7 @@ def generate_filename(**kwargs): target_dataset_timestamp = kwargs["target_timestamp"] string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode() - hashlib.md5(string_unicode).hexdigest() + return hashlib.md5(string_unicode).hexdigest() def cache_compute_results(start_vars, -- cgit v1.2.3 From b4594a6f2dc5c0c0a8e62a327674126668391d6b Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 20:32:13 +0300 Subject: minor fix for updating dict --- wqflask/wqflask/correlation/pre_computes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 55f25f0b..4244fcfb 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -47,7 +47,7 @@ def cache_compute_results(start_vars, results = json.load(json_handler) - if results.get(trait_name) is not None: + if results.get(trait_name) is None: results.update({trait_name: correlation_results}) json.dump(results, json_handler) -- cgit v1.2.3 From a20e20c79b054350b84e70af6e7d5ef2a0407786 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 22:18:21 +0300 Subject: pep8 formatting + minor fixing for writing to files --- wqflask/wqflask/correlation/pre_computes.py | 75 +++++++++++++---------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 4244fcfb..1d832fde 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -1,28 +1,28 @@ - +import json import os import hashlib from base.data_set import query_table_timestamp from base.webqtlConfig import TMPDIR +from redis import Redis +r = Redis() -def generate_filename(**kwargs): - """generate unique filename""" - base_dataset_name = kwargs["base_dataset"] - target_dataset_name = kwargs["target_dataset"] - base_timestamp = kwargs["base_timestamp"] - target_dataset_timestamp = kwargs["target_timestamp"] +def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp): + """generate unique filename""" string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode() return hashlib.md5(string_unicode).hexdigest() -def cache_compute_results(start_vars, - base_dataset_type, - correlation_results, - trait_name): +def cache_compute_results(base_dataset_type, + base_dataset_name, + target_dataset_name, + correlation_results, + trait_name): # pass + """function to cache correlation results for heavy computations""" # init assumption only caching probeset type # fix redis;issue potential redis_cache!=current_timestamp @@ -30,10 +30,11 @@ def cache_compute_results(start_vars, if base_timestamp is None: # fetch the timestamp - base_timestamp = target_dataset_timestamp = query_table_timestamp( - dataset_type) + base_timestamp = 
query_table_timestamp( + base_dataset_type) + r.set(f"{base_dataset_type}timestamp", base_timestamp) - r.set(f"{dataset_type}timestamp", target_dataset_timestamp) + target_dataset_timestamp = base_timestamp file_name = generate_filename( base_dataset_name, target_dataset_name, @@ -41,51 +42,43 @@ def cache_compute_results(start_vars, file_path = os.path.join(TMPDIR, f"{file_name}.json") - try: + try: + + with open(file_path, "r+") as json_handler: - with open(file_path, "r+") as json_handler: + results = json.load(json_handler) + results[trait_name] = correlation_results - results = json.load(json_handler) + json.dump(results, json_handler) - if results.get(trait_name) is None: - results.update({trait_name: correlation_results}) + except FileNotFoundError: - json.dump(results, json_handler) + with open(file_path, "w+") as write_json_handler: + json.dump({trait_name: correlation_results}, write_json_handler) - except FileNotFoundError: - with open(file_path, "w") as json_handler: - json.dump({trait_name: correlation_results}, json_handler) -def fetch_precompute_results(base_dataset_name,target_dataset_name,trait_name): +def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name): """function to check for precomputed results""" # check for redis timestamp # fix rely on the fact correlation run oftenly probeset is set - base_timestamp = target_dataset_timestamp = r.get(dataset_type) - + base_timestamp = target_dataset_timestamp = r.get(f"{dataset_type}timestamp") if base_timestamp is None: return - else: - file_name = generate_filename( + file_name = generate_filename( base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp) - try: - with open(file_path,"r") as json_handler: - correlation_results = json.load(json_handler) - - return correlation_results.get(trait_name) - - except FileNotFoundError: - pass - - - - - + file_path = os.path.join(TMPDIR, f"{file_name}.json") + try: + with open(file_path, "r") as json_handler: + correlation_results = json.load(json_handler) + return correlation_results.get(trait_name) + except FileNotFoundError: + pass -- cgit v1.2.3 From 01d42255f52a61c6d3d007ffd1e5e02765a76730 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 15 Nov 2021 17:38:09 +0300 Subject: fix for truncating files --- wqflask/wqflask/correlation/pre_computes.py | 53 ++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 1d832fde..01fa1a3d 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -4,6 +4,7 @@ import hashlib from base.data_set import query_table_timestamp from base.webqtlConfig import TMPDIR +from json.decoder import JSONDecodeError from redis import Redis r = Redis() @@ -26,35 +27,49 @@ def cache_compute_results(base_dataset_type, # init assumption only caching probeset type # fix redis;issue potential redis_cache!=current_timestamp - base_timestamp = r.get(f"{base_dataset_type}timestamp") - if base_timestamp is None: - # fetch the timestamp - base_timestamp = query_table_timestamp( - base_dataset_type) - r.set(f"{base_dataset_type}timestamp", base_timestamp) + base_timestamp = query_table_timestamp(base_dataset_type) + + r.set(f"{base_dataset_type}timestamp", base_timestamp) target_dataset_timestamp = base_timestamp + + file_name = generate_filename( base_dataset_name, target_dataset_name, base_timestamp, 
target_dataset_timestamp) - file_path = os.path.join(TMPDIR, f"{file_name}.json") + + file_path = os.path.join(TMPDIR,f"{file_name}.json") + try: + with open(file_path,"r+") as json_file_handler: + data = json.load(json_file_handler) - with open(file_path, "r+") as json_handler: + data[trait_name] = correlation_results - results = json.load(json_handler) - results[trait_name] = correlation_results + json_file_handler.seek(0) - json.dump(results, json_handler) + json.dump(data,json_file_handler) + json_file_handler.truncate() + except FileNotFoundError: + with open(file_path,"w+") as file_handler: + data = {} + data[trait_name] =correlation_results + + json.dump(data,file_handler) - with open(file_path, "w+") as write_json_handler: - json.dump({trait_name: correlation_results}, write_json_handler) + + + # create the file only if it does not exists + + # else open the file to cache the results + + def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name): @@ -65,20 +80,26 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ # fix rely on the fact correlation run oftenly probeset is set base_timestamp = target_dataset_timestamp = r.get(f"{dataset_type}timestamp") - if base_timestamp is None: return + else: + base_timestamp = target_dataset_timestamp = base_timestamp.decode("utf-8") + file_name = generate_filename( base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp) file_path = os.path.join(TMPDIR, f"{file_name}.json") + results = None + try: - with open(file_path, "r") as json_handler: + with open(file_path, "r+") as json_handler: correlation_results = json.load(json_handler) - return correlation_results.get(trait_name) + # print(correlation_results) + + return correlation_results.get(trait_name) except FileNotFoundError: pass -- cgit v1.2.3 From 18b53441a0136071db94c72b112a746e056ef971 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 15 Nov 2021 18:03:55 +0300 Subject: refactor function to fetch datasets data for precomputes --- wqflask/wqflask/correlation/pre_computes.py | 72 ++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 01fa1a3d..e7147ddf 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -20,6 +20,7 @@ def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, ta def cache_compute_results(base_dataset_type, base_dataset_name, target_dataset_name, + corr_method, correlation_results, trait_name): # pass @@ -34,43 +35,35 @@ def cache_compute_results(base_dataset_type, target_dataset_timestamp = base_timestamp - - file_name = generate_filename( base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp) - - file_path = os.path.join(TMPDIR,f"{file_name}.json") - + file_path = os.path.join(TMPDIR, f"{file_name}.json") try: - with open(file_path,"r+") as json_file_handler: + with open(file_path, "r+") as json_file_handler: data = json.load(json_file_handler) data[trait_name] = correlation_results json_file_handler.seek(0) - json.dump(data,json_file_handler) + json.dump(data, json_file_handler) json_file_handler.truncate() - + except FileNotFoundError: - with open(file_path,"w+") as file_handler: + with open(file_path, "w+") as file_handler: data = {} - data[trait_name] =correlation_results - - json.dump(data,file_handler) - + data[trait_name] = 
correlation_results + json.dump(data, file_handler) # create the file only if it does not exists # else open the file to cache the results - - def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name): """function to check for precomputed results""" @@ -84,7 +77,8 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ return else: - base_timestamp = target_dataset_timestamp = base_timestamp.decode("utf-8") + base_timestamp = target_dataset_timestamp = base_timestamp.decode( + "utf-8") file_name = generate_filename( base_dataset_name, target_dataset_name, @@ -103,3 +97,49 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ except FileNotFoundError: pass + + +def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method): + """compute sample correlation between dataset vs dataset + wn:heavy function should be invoked less frequently + input:datasets_data(two dicts),corr_method + + output:correlation results for entire dataset against entire dataset + """ + dataset_correlation_results = {} + + for (trait_name, strain_values) in target_dataset.trait_data: + + this_trait_data = { + "trait_sample_data": strain_values, + "trait_id": trait_name + } + + trait_correlation_result = fast_compute_all_sample_correlation( + corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_dataset_data) + + dataset_correlation_results[trait_name] = trait_correlation_result + + return dataset_correlation_results + + +def get_datasets_data(base_dataset, target_dataset_data): + """required to pass data in a given format to the pre compute + function + + output:two dicts for datasets with key==trait and value==strains + """ + target_traits_data = target_dataset.get_trait_data( + base_dataset.group.all_samples_ordered()) + + base_traits_data = base_dataset.get_trait_data( + base_dataset.group.all_samples_ordered()) + + samples_fetched = base_dataset.group.all_samples_ordered() + + target_results = map_shared_keys_to_values( + samples_fetched, target_traits_data) + base_results = map_shared_keys_to_values( + samples_fetched, base_traits_data) + + return (target_results, base_results) -- cgit v1.2.3 From 6f6e28d216e0a2adbf939b9b29f8794ae45d9aa8 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 15 Nov 2021 18:08:10 +0300 Subject: pep8 formatting & fix variable names --- wqflask/wqflask/correlation/pre_computes.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index e7147ddf..d8629706 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -108,17 +108,20 @@ def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method): """ dataset_correlation_results = {} - for (trait_name, strain_values) in target_dataset.trait_data: + target_traits_data, base_traits_data = get_datasets_data( + base_dataset, target_dataset_data) + + for (primary_trait_name, strain_values) in base_traits_data: this_trait_data = { "trait_sample_data": strain_values, - "trait_id": trait_name + "trait_id": primary_trait_name } trait_correlation_result = fast_compute_all_sample_correlation( - corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_dataset_data) + corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_traits_data) - dataset_correlation_results[trait_name] = trait_correlation_result 
+ dataset_correlation_results[primary_trait_name] = trait_correlation_result return dataset_correlation_results -- cgit v1.2.3 From 2982ba491e89acee5ead69206691c786be1cf728 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 15 Nov 2021 18:10:56 +0300 Subject: test precomppute caching integration --- wqflask/wqflask/correlation/correlation_gn3_api.py | 23 +++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py index 7b828016..191a748a 100644 --- a/wqflask/wqflask/correlation/correlation_gn3_api.py +++ b/wqflask/wqflask/correlation/correlation_gn3_api.py @@ -4,7 +4,8 @@ import time from functools import wraps from wqflask.correlation import correlation_functions - +from wqflask.correlation.pre_computes import fetch_precompute_results +from wqflask.correlation.pre_computes import cache_compute_results from base import data_set from base.trait import create_trait @@ -193,9 +194,21 @@ def compute_correlation(start_vars, method="pearson", compute_all=False): (this_trait_data, target_dataset_data) = fetch_sample_data( start_vars, this_trait, this_dataset, target_dataset) - correlation_results = fast_compute_all_sample_correlation(corr_method=method, - this_trait=this_trait_data, - target_dataset=target_dataset_data) + correlation_results = fetch_precompute_results( + this_dataset.name, target_dataset.name, this_dataset.type, this_trait.name) + + if correlation_results is None: + correlation_results = fast_compute_all_sample_correlation(corr_method=method, + this_trait=this_trait_data, + target_dataset=target_dataset_data) + + cache_compute_results(this_dataset.type, + this_dataset.name, + target_dataset.name, + corr_method, + correlation_results, + this_trait.name) + elif corr_type == "tissue": trait_symbol_dict = this_dataset.retrieve_genes("Symbol") tissue_input = get_tissue_correlation_input( @@ -295,7 +308,7 @@ def get_tissue_correlation_input(this_trait, trait_symbol_dict): """Gets tissue expression values for the primary trait and target tissues values""" primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values( symbol_list=[this_trait.symbol]) - if this_trait.symbol.lower() in primary_trait_tissue_vals_dict: + if this_trait.symbol and this_trait.symbol.lower() in primary_trait_tissue_vals_dict: primary_trait_tissue_values = primary_trait_tissue_vals_dict[this_trait.symbol.lower( )] corr_result_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values( -- cgit v1.2.3 From aab6393dd60872a6a3b6e7db2a7c087c4ec41295 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 15 Nov 2021 18:18:03 +0300 Subject: fetch only strains from the primary datasets --- wqflask/wqflask/correlation/pre_computes.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index d8629706..355701f2 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -86,7 +86,6 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ file_path = os.path.join(TMPDIR, f"{file_name}.json") - results = None try: with open(file_path, "r+") as json_handler: @@ -130,15 +129,19 @@ def get_datasets_data(base_dataset, target_dataset_data): """required to pass data in a given format to the pre compute function + (works for bxd only probeset datasets) + + # fix 
issue with fetching of the datasets + output:two dicts for datasets with key==trait and value==strains """ + samples_fetched = base_dataset.group.all_samples_ordered() target_traits_data = target_dataset.get_trait_data( - base_dataset.group.all_samples_ordered()) + samples_fetched) base_traits_data = base_dataset.get_trait_data( - base_dataset.group.all_samples_ordered()) + samples_fetched) - samples_fetched = base_dataset.group.all_samples_ordered() target_results = map_shared_keys_to_values( samples_fetched, target_traits_data) -- cgit v1.2.3 From 04452c274d51621a0cab1b8dce5b8101c69496b6 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 16 Nov 2021 14:41:41 +0300 Subject: refactor:fix on the query :modify cache point --- wqflask/base/data_set.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 70c58136..a3a720ad 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -747,7 +747,9 @@ class DataSet: and Species.name = '{}' """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) results = dict(g.db.execute(query).fetchall()) - sample_ids = [results[item] for item in self.samplelist] + sample_ids = [results.get(item) for item in self.samplelist] + + sample_ids = [ids for ids in sample_ids if ids is not None] # MySQL limits the number of tables that can be used in a join to 61, # so we break the sample ids into smaller chunks @@ -800,25 +802,22 @@ class DataSet: results = g.db.execute(query).fetchall() trait_sample_data.append([list(result) for result in results]) - cache_dataset_results( - self.name, self.type, trait_sample_data) + trait_count = len(trait_sample_data[0]) + self.trait_data = collections.defaultdict(list) - else: - trait_sample_data = cached_results - - trait_count = len(trait_sample_data[0]) - self.trait_data = collections.defaultdict(list) - - # put all of the separate data together into a dictionary where the keys are - # trait names and values are lists of sample values - data_start_pos = 1 - for trait_counter in range(trait_count): - trait_name = trait_sample_data[0][trait_counter][0] - for chunk_counter in range(int(number_chunks)): - self.trait_data[trait_name] += ( + data_start_pos = 1 + for trait_counter in range(trait_count): + trait_name = trait_sample_data[0][trait_counter][0] + for chunk_counter in range(int(number_chunks)): + self.trait_data[trait_name] += ( trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) + cache_dataset_results( + self.name, self.type, self.trait_data) + + else: + self.trait_data = cached_results class PhenotypeDataSet(DataSet): DS_NAME_MAP['Publish'] = 'PhenotypeDataSet' @@ -1282,7 +1281,9 @@ def generate_hash_file(dataset_name: str, dataset_timestamp: str): def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: List): - """function to cache dataset query results to file""" + """function to cache dataset query results to file + input dataset_name and type query_results(already processed in default dict format) + """ # data computations actions # store the file path on redis -- cgit v1.2.3 From a8ccaf03ba151f9ceca2f0224af33db230a8c8b3 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 16 Nov 2021 15:53:50 +0300 Subject: test generate new files --- wqflask/base/data_set.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index a3a720ad..cae1a2a7 100644 --- 
a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -810,7 +810,7 @@ class DataSet: trait_name = trait_sample_data[0][trait_counter][0] for chunk_counter in range(int(number_chunks)): self.trait_data[trait_name] += ( - trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) + trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) cache_dataset_results( self.name, self.type, self.trait_data) @@ -818,6 +818,8 @@ class DataSet: else: self.trait_data = cached_results + + class PhenotypeDataSet(DataSet): DS_NAME_MAP['Publish'] = 'PhenotypeDataSet' @@ -1291,7 +1293,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L results = r.set(f"{dataset_type}timestamp", table_timestamp) - file_name = generate_hash_file(dataset_name, table_timestamp) + file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp) file_path = os.path.join(TMPDIR, f"{file_name}.json") with open(file_path, "w") as file_handler: @@ -1308,7 +1310,7 @@ def fetch_cached_results(dataset_name: str, dataset_type: str): else: table_timestamp = "" - file_name = generate_hash_file(dataset_name, table_timestamp) + file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp) file_path = os.path.join(TMPDIR, f"{file_name}.json") try: with open(file_path, "r") as file_handler: -- cgit v1.2.3 From 4725a8c20d1d4209d59b3b113f00bbc467c5bd31 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 16 Nov 2021 19:43:15 +0300 Subject: init disable fast compute:memory fork issues --- wqflask/wqflask/correlation/correlation_gn3_api.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py index 191a748a..635ef5ed 100644 --- a/wqflask/wqflask/correlation/correlation_gn3_api.py +++ b/wqflask/wqflask/correlation/correlation_gn3_api.py @@ -194,20 +194,8 @@ def compute_correlation(start_vars, method="pearson", compute_all=False): (this_trait_data, target_dataset_data) = fetch_sample_data( start_vars, this_trait, this_dataset, target_dataset) - correlation_results = fetch_precompute_results( - this_dataset.name, target_dataset.name, this_dataset.type, this_trait.name) - - if correlation_results is None: - correlation_results = fast_compute_all_sample_correlation(corr_method=method, - this_trait=this_trait_data, - target_dataset=target_dataset_data) - - cache_compute_results(this_dataset.type, - this_dataset.name, - target_dataset.name, - corr_method, - correlation_results, - this_trait.name) + correlation_results = compute_all_sample_correlation( + corr_method=method, this_trait=this_trait_data, target_dataset=target_dataset_data) elif corr_type == "tissue": trait_symbol_dict = this_dataset.retrieve_genes("Symbol") -- cgit v1.2.3 From 06fbab6427cadf7706da4e954874a7e5da1bd32d Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 16 Nov 2021 19:48:11 +0300 Subject: pep8 formatting remove debug statements --- wqflask/base/data_set.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index cae1a2a7..37f35121 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -1263,7 +1263,7 @@ def query_table_timestamp(dataset_type: str): # computation data and actions query_update_time = f""" - SELECT UPDATE_TIME FROM information_schema.tables + SELECT UPDATE_TIME FROM information_schfema.tables WHERE TABLE_SCHEMA = 'db_webqtl_s' AND TABLE_NAME = 
'{dataset_type}Data' """ @@ -1275,7 +1275,7 @@ def query_table_timestamp(dataset_type: str): return date_time_obj.strftime(f) -def generate_hash_file(dataset_name: str, dataset_timestamp: str): +def generate_hash_file(dataset_name: str, dataset_type: str, dataset_timestamp: str): """given the trait_name generate a unique name for this""" string_unicode = f"{dataset_name}{dataset_timestamp}".encode() md5hash = hashlib.md5(string_unicode) @@ -1317,5 +1317,4 @@ def fetch_cached_results(dataset_name: str, dataset_type: str): return json.load(file_handler) except FileNotFoundError: - # take actions continue to fetch dataset results and fetch results - pass + pass \ No newline at end of file -- cgit v1.2.3 From 679051788a475dfcefd4cb93dc82ec3a4b86edc3 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 16 Nov 2021 19:54:55 +0300 Subject: use comprehension list;fix typo --- wqflask/base/data_set.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 37f35121..553530d4 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -747,9 +747,7 @@ class DataSet: and Species.name = '{}' """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) results = dict(g.db.execute(query).fetchall()) - sample_ids = [results.get(item) for item in self.samplelist] - - sample_ids = [ids for ids in sample_ids if ids is not None] + sample_ids = [results.get(item) for item in self.samplelist if item is not None] # MySQL limits the number of tables that can be used in a join to 61, # so we break the sample ids into smaller chunks @@ -1263,7 +1261,7 @@ def query_table_timestamp(dataset_type: str): # computation data and actions query_update_time = f""" - SELECT UPDATE_TIME FROM information_schfema.tables + SELECT UPDATE_TIME FROM information_schema.tables WHERE TABLE_SCHEMA = 'db_webqtl_s' AND TABLE_NAME = '{dataset_type}Data' """ @@ -1317,4 +1315,4 @@ def fetch_cached_results(dataset_name: str, dataset_type: str): return json.load(file_handler) except FileNotFoundError: - pass \ No newline at end of file + pass -- cgit v1.2.3 From 60fe836dc6c2f00cb99844572eb3fd29aee0163e Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 17 Nov 2021 08:07:02 +0300 Subject: use a dynamic value for the db_name --- wqflask/base/data_set.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 553530d4..2e401c8e 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -40,6 +40,7 @@ from base import species from base import webqtlConfig from flask import Flask, g from base.webqtlConfig import TMPDIR +from gn3.db_utils import parse_db_url import os import math import string @@ -747,7 +748,8 @@ class DataSet: and Species.name = '{}' """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) results = dict(g.db.execute(query).fetchall()) - sample_ids = [results.get(item) for item in self.samplelist if item is not None] + sample_ids = [results.get(item) + for item in self.samplelist if item is not None] # MySQL limits the number of tables that can be used in a join to 61, # so we break the sample ids into smaller chunks @@ -1260,9 +1262,11 @@ def query_table_timestamp(dataset_type: str): # computation data and actions + fetch_db_name = parse_db_url() + query_update_time = f""" SELECT UPDATE_TIME FROM information_schema.tables - WHERE TABLE_SCHEMA = 'db_webqtl_s' + WHERE TABLE_SCHEMA = {fetch_db_name[-1]} AND 
TABLE_NAME = '{dataset_type}Data' """ -- cgit v1.2.3 From 71a859c9facc7ae49d43e3e995166ad8dcb586cb Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 17 Nov 2021 08:11:12 +0300 Subject: isolate SQL_URI parse to a function --- wqflask/base/data_set.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 2e401c8e..f0a930a5 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -40,7 +40,8 @@ from base import species from base import webqtlConfig from flask import Flask, g from base.webqtlConfig import TMPDIR -from gn3.db_utils import parse_db_url +from urllib.parse import urlparse +from utility.tools import SQL_URI import os import math import string @@ -1257,6 +1258,13 @@ def geno_mrna_confidentiality(ob): return True + +def parse_db_url(): + parsed_db = urlparse(SQL_URI) + + return (parsed_db.hostname, parsed_db.username, + parsed_db.password, parsed_db.path[1:]) + def query_table_timestamp(dataset_type: str): """function to query the update timestamp of a given dataset_type""" -- cgit v1.2.3 From 5a407a34442860ebaea2886f2278be9e1eb33a8d Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 17 Nov 2021 08:13:41 +0300 Subject: replace redis fetch for cached timestamp with a query --- wqflask/base/data_set.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index f0a930a5..52d1d254 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -1301,8 +1301,6 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L table_timestamp = query_table_timestamp(dataset_type) - results = r.set(f"{dataset_type}timestamp", table_timestamp) - file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp) file_path = os.path.join(TMPDIR, f"{file_name}.json") @@ -1313,12 +1311,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L def fetch_cached_results(dataset_name: str, dataset_type: str): """function to fetch the cached results""" - table_timestamp = r.get(f"{dataset_type}timestamp") - - if table_timestamp is not None: - table_timestamp = table_timestamp.decode("utf-8") - else: - table_timestamp = "" + table_timestamp = query_table_timestamp(dataset_type) file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp) file_path = os.path.join(TMPDIR, f"{file_name}.json") -- cgit v1.2.3 From 1090674ac9497dad22803e7bf8e51d77245f8a0c Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 17 Nov 2021 08:27:05 +0300 Subject: isolate function to fetch the traits metadata --- wqflask/wqflask/correlation/pre_computes.py | 32 +++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 355701f2..638ae860 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -9,6 +9,25 @@ from redis import Redis r = Redis() +# code to isolate metadata caching + + +def fetch_all_cached_metadata(dataset_name): + """in a gvein dataset fetch all the traits metadata""" + file_name = f"{dataset_name}_metadata.json" + + file_path = os.path.join(TMPDIR, file_name) + + with open(file_path, "r+") as file_handler: + dataset_metadata = json.load(file_handler) + + except FileNotFoundError: + Path(file_path).touch(exist_ok=True) + return {} + + return dataset_metadata + + def 
generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp): """generate unique filename""" @@ -60,18 +79,10 @@ def cache_compute_results(base_dataset_type, json.dump(data, file_handler) - # create the file only if it does not exists - - # else open the file to cache the results - def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name): """function to check for precomputed results""" - # check for redis timestamp - - # fix rely on the fact correlation run oftenly probeset is set - base_timestamp = target_dataset_timestamp = r.get(f"{dataset_type}timestamp") if base_timestamp is None: return @@ -86,11 +97,9 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ file_path = os.path.join(TMPDIR, f"{file_name}.json") - try: with open(file_path, "r+") as json_handler: correlation_results = json.load(json_handler) - # print(correlation_results) return correlation_results.get(trait_name) @@ -131,8 +140,6 @@ def get_datasets_data(base_dataset, target_dataset_data): (works for bxd only probeset datasets) - # fix issue with fetching of the datasets - output:two dicts for datasets with key==trait and value==strains """ samples_fetched = base_dataset.group.all_samples_ordered() @@ -142,7 +149,6 @@ def get_datasets_data(base_dataset, target_dataset_data): base_traits_data = base_dataset.get_trait_data( samples_fetched) - target_results = map_shared_keys_to_values( samples_fetched, target_traits_data) base_results = map_shared_keys_to_values( -- cgit v1.2.3 From c872594d21ab743ae55ae4f1d037d13394ef8c67 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 17 Nov 2021 08:34:24 +0300 Subject: isolate function to cache new traits metadata --- wqflask/wqflask/correlation/pre_computes.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 638ae860..9270bdd4 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -1,6 +1,7 @@ import json import os import hashlib +from pathlib import Path from base.data_set import query_table_timestamp from base.webqtlConfig import TMPDIR @@ -25,8 +26,22 @@ def fetch_all_cached_metadata(dataset_name): Path(file_path).touch(exist_ok=True) return {} - return dataset_metadata + return (file_path, dataset_metadata) + if bool(new_traits_metadata): + # that means new traits exists + dataset_metadata.update(new_traits_metadata) + with open(file_path, "w+") as file_handler: + json.dump(dataset_metadata, file_handler) + + +def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_path: str): + """function to cache the new traits metadata""" + + if bool(new_traits_metadata): + dataset_metadata.update(new_traits_metadata) + with open(file_path,"w+") as file_handler: + json.dump(dataset_metadata,file_handler) def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp): -- cgit v1.2.3 From a35ae60965d7cada41acad661afd88a8fc58e78e Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 17 Nov 2021 08:42:47 +0300 Subject: pep8 formatting;delete remove redis dependency --- wqflask/wqflask/correlation/pre_computes.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 9270bdd4..403d60c9 100644 --- 
a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -5,12 +5,6 @@ from pathlib import Path
 from base.data_set import query_table_timestamp
 from base.webqtlConfig import TMPDIR

-from json.decoder import JSONDecodeError
-from redis import Redis
-
-r = Redis()
-
-# code to isolate metadata caching


 def fetch_all_cached_metadata(dataset_name):
@@ -28,20 +22,14 @@ def fetch_all_cached_metadata(dataset_name):

     return (file_path, dataset_metadata)

-    if bool(new_traits_metadata):
-        # that means new traits exists
-        dataset_metadata.update(new_traits_metadata)
-        with open(file_path, "w+") as file_handler:
-            json.dump(dataset_metadata, file_handler)
-

 def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_path: str):
     """function to cache the new traits metadata"""
     if bool(new_traits_metadata):
         dataset_metadata.update(new_traits_metadata)
-        with open(file_path,"w+") as file_handler:
-            json.dump(dataset_metadata,file_handler)
+        with open(file_path, "w+") as file_handler:
+            json.dump(dataset_metadata, file_handler)


 def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
@@ -98,14 +86,8 @@ def cache_compute_results(base_dataset_type,

 def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
     """function to check for precomputed results"""
-    base_timestamp = target_dataset_timestamp = r.get(f"{dataset_type}timestamp")
-    if base_timestamp is None:
-        return
-
-    else:
-        base_timestamp = target_dataset_timestamp = base_timestamp.decode(
-            "utf-8")
-
+    base_timestamp = target_dataset_timestamp = query_table_timestamp(
+        dataset_type)
     file_name = generate_filename(
         base_dataset_name, target_dataset_name,
         base_timestamp, target_dataset_timestamp)
--
cgit v1.2.3

From 6786712e95cbb885b6b19b3ecd34e6c8ee893172 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 20:20:07 +0300
Subject: refactor sql query & date formatting

---
 wqflask/base/data_set.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 52d1d254..2687738d 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -1258,31 +1258,27 @@ def geno_mrna_confidentiality(ob):

         return True

-
 def parse_db_url():
     parsed_db = urlparse(SQL_URI)

     return (parsed_db.hostname, parsed_db.username,
             parsed_db.password, parsed_db.path[1:])

+
 def query_table_timestamp(dataset_type: str):
     """function to query the update timestamp of a given dataset_type"""

     # computation data and actions

     fetch_db_name = parse_db_url()
-
     query_update_time = f""" SELECT UPDATE_TIME FROM information_schema.tables
-        WHERE TABLE_SCHEMA = {fetch_db_name[-1]}
+        WHERE TABLE_SCHEMA = '{fetch_db_name[-1]}'
         AND TABLE_NAME = '{dataset_type}Data' """
-    # store the timestamp in redis=
     date_time_obj = g.db.execute(query_update_time).fetchone()[0]
-
-    f = "%Y-%m-%d %H:%M:%S"
-    return date_time_obj.strftime(f)
+    return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")


 def generate_hash_file(dataset_name: str, dataset_type: str, dataset_timestamp: str):
@@ -1301,6 +1297,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L

     table_timestamp = query_table_timestamp(dataset_type)

+
     file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
     file_path = os.path.join(TMPDIR, f"{file_name}.json")
--
cgit v1.2.3

From 1a3b85c4ebc66d54e3bda06c3742e8046e4c8159 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 20:38:20 +0300
Subject: add generic function for generating filename

---
 wqflask/wqflask/correlation/pre_computes.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 403d60c9..241b0730 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -13,8 +13,9 @@ def fetch_all_cached_metadata(dataset_name):

     file_path = os.path.join(TMPDIR, file_name)

-    with open(file_path, "r+") as file_handler:
-        dataset_metadata = json.load(file_handler)
+    try:
+        with open(file_path, "r+") as file_handler:
+            dataset_metadata = json.load(file_handler)

     except FileNotFoundError:
         Path(file_path).touch(exist_ok=True)
@@ -32,6 +33,13 @@ def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_
             json.dump(dataset_metadata, file_handler)


+def generate_file_name(*args, prefix=""):
+    """given a list of args generate a unique filename"""
+
+    string_unicode = f"{*args,}{prefix}".encode()
+    return hashlib.md5(string_unicode).hexdigest()
+
+
 def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
     """generate unique filename"""

--
cgit v1.2.3

From 56b574b903244a64aecaa54e5305b25bb642b254 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 18 Nov 2021 12:02:10 +0300
Subject: pep8 formatting; minor fixes

---
 wqflask/wqflask/correlation/pre_computes.py | 37 ++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 241b0730..77592a3a 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -9,7 +9,7 @@ from base.webqtlConfig import TMPDIR

 def fetch_all_cached_metadata(dataset_name):
     """in a gvein dataset fetch all the traits metadata"""
-    file_name = f"{dataset_name}_metadata.json"
+    file_name = generate_file_name(dataset_name, suffix="metadata")

     file_path = os.path.join(TMPDIR, file_name)

@@ -33,11 +33,11 @@ def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_
             json.dump(dataset_metadata, file_handler)


-def generate_file_name(*args, prefix=""):
+def generate_file_name(*args, suffix="", file_ext="json"):
     """given a list of args generate a unique filename"""

-    string_unicode = f"{*args,}{prefix}".encode()
-    return hashlib.md5(string_unicode).hexdigest()
+    string_unicode = f"{*args,}".encode()
+    return f"{hashlib.md5(string_unicode).hexdigest()}_{suffix}.{file_ext}"


 def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
@@ -65,11 +65,12 @@ def cache_compute_results(base_dataset_type,

     target_dataset_timestamp = base_timestamp

-    file_name = generate_filename(
+    file_name = generate_file_name(
         base_dataset_name, target_dataset_name,
-        base_timestamp, target_dataset_timestamp)
+        base_timestamp, target_dataset_timestamp,
+        suffix="corr_precomputes")

-    file_path = os.path.join(TMPDIR, f"{file_name}.json")
+    file_path = os.path.join(TMPDIR, file_name)

     try:
         with open(file_path, "r+") as json_file_handler:
@@ -91,16 +92,20 @@ def cache_compute_results(base_dataset_type,
             json.dump(data, file_handler)


-def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
+def fetch_precompute_results(base_dataset_name,
+                             target_dataset_name,
+                             dataset_type,
+                             trait_name):
     """function to check for precomputed results"""

     base_timestamp = target_dataset_timestamp = query_table_timestamp(
         dataset_type)
-    file_name = generate_filename(
+    file_name = generate_file_name(
         base_dataset_name, target_dataset_name,
-        base_timestamp, target_dataset_timestamp)
+        base_timestamp, target_dataset_timestamp,
+        suffix="corr_precomputes")

-    file_path = os.path.join(TMPDIR, f"{file_name}.json")
+    file_path = os.path.join(TMPDIR, file_name)

     try:
         with open(file_path, "r+") as json_handler:
@@ -112,7 +117,9 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
             pass


-def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method):
+def pre_compute_dataset_vs_dataset(base_dataset,
+                                   target_dataset,
+                                   corr_method):
     """compute sample correlation between dataset vs dataset
     wn:heavy function should be invoked less frequently
     input:datasets_data(two dicts),corr_method
@@ -131,8 +138,10 @@ def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method):
             "trait_id": primary_trait_name
         }

-        trait_correlation_result = fast_compute_all_sample_correlation(
-            corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_traits_data)
+        trait_correlation_result = compute_all_sample_correlation(
+            corr_method=corr_method,
+            this_trait=this_trait_data,
+            target_dataset=target_traits_data)

         dataset_correlation_results[primary_trait_name] = trait_correlation_result

--
cgit v1.2.3

From 4de623130dca019d15f956e91ec999fddc2e2a0f Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 18 Nov 2021 12:03:48 +0300
Subject: remove unused functions; rename function names

---
 wqflask/wqflask/correlation/pre_computes.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 77592a3a..b95ceba5 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -9,7 +9,7 @@ def fetch_all_cached_metadata(dataset_name):
     """in a gvein dataset fetch all the traits metadata"""
-    file_name = generate_file_name(dataset_name, suffix="metadata")
+    file_name = generate_filename(dataset_name, suffix="metadata")

     file_path = os.path.join(TMPDIR, file_name)

@@ -33,20 +33,13 @@ def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_
             json.dump(dataset_metadata, file_handler)


-def generate_file_name(*args, suffix="", file_ext="json"):
+def generate_filename(*args, suffix="", file_ext="json"):
     """given a list of args generate a unique filename"""

     string_unicode = f"{*args,}".encode()
     return f"{hashlib.md5(string_unicode).hexdigest()}_{suffix}.{file_ext}"


-def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp):
-    """generate unique filename"""
-
-    string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode()
-    return hashlib.md5(string_unicode).hexdigest()
-
-
 def cache_compute_results(base_dataset_type,
                           base_dataset_name,
                           target_dataset_name,
@@ -65,7 +58,7 @@ def cache_compute_results(base_dataset_type,

     target_dataset_timestamp = base_timestamp

-    file_name = generate_file_name(
+    file_name = generate_filename(
         base_dataset_name, target_dataset_name,
         base_timestamp, target_dataset_timestamp,
         suffix="corr_precomputes")
@@ -100,7 +93,7 @@ def fetch_precompute_results(base_dataset_name,

     base_timestamp = target_dataset_timestamp = query_table_timestamp(
         dataset_type)
-    file_name = generate_file_name(
+    file_name = generate_filename(
         base_dataset_name, target_dataset_name,
         base_timestamp, target_dataset_timestamp,
         suffix="corr_precomputes")
--
cgit v1.2.3

From 24d87cb4e75136f822b316a3c9f936b8e5efb5e9 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 18 Nov 2021 13:40:54 +0300
Subject: refactor code for metadata

---
 wqflask/wqflask/correlation/pre_computes.py      | 11 ++-------
 wqflask/wqflask/correlation/show_corr_results.py | 29 +++++++++---------------
 2 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index b95ceba5..ad0bc6ef 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -16,12 +16,11 @@ def fetch_all_cached_metadata(dataset_name):
     try:
         with open(file_path, "r+") as file_handler:
             dataset_metadata = json.load(file_handler)
+            return (file_path, dataset_metadata)

     except FileNotFoundError:
         Path(file_path).touch(exist_ok=True)
-        return {}
-
-    return (file_path, dataset_metadata)
+        return (file_path, {})


 def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_path: str):
@@ -46,16 +45,10 @@ def cache_compute_results(base_dataset_type,
                           corr_method,
                           correlation_results,
                           trait_name):
-    # pass
     """function to cache correlation results for heavy computations"""
-    # init assumption only caching probeset type
-    # fix redis;issue potential redis_cache!=current_timestamp
-
     base_timestamp = query_table_timestamp(base_dataset_type)
-    r.set(f"{base_dataset_type}timestamp", base_timestamp)
-
     target_dataset_timestamp = base_timestamp

     file_name = generate_filename(
diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py
index 42010a1e..f5600f13 100644
--- a/wqflask/wqflask/correlation/show_corr_results.py
+++ b/wqflask/wqflask/correlation/show_corr_results.py
@@ -26,6 +26,9 @@ from base.trait import create_trait, jsonable
 from base.data_set import create_dataset
 from base.webqtlConfig import TMPDIR

+from wqflask.correlation.pre_computes import fetch_all_cached_metadata
+from wqflask.correlation.pre_computes import cache_new_traits_metadata
+
 from utility import hmac

@@ -34,7 +37,8 @@ def set_template_vars(start_vars, correlation_data):
     corr_method = start_vars['corr_sample_method']

     if start_vars['dataset'] == "Temp":
-        this_dataset_ob = create_dataset(dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group'])
+        this_dataset_ob = create_dataset(
+            dataset_name="Temp", dataset_type="Temp", group_name=start_vars['group'])
     else:
         this_dataset_ob = create_dataset(dataset_name=start_vars['dataset'])
     this_trait = create_trait(dataset=this_dataset_ob,
@@ -86,25 +90,17 @@ def correlation_json_for_table(correlation_data, this_trait, this_dataset, targe
     corr_results = correlation_data['correlation_results']

     results_list = []
-    file_name = f"{target_dataset['name']}_metadata.json"
-
-    file_path = os.path.join(TMPDIR, file_name)

     new_traits_metadata = {}

-    try:
-        with open(file_path,"r+") as file_handler:
-            dataset_metadata = json.load(file_handler)
-
-    except FileNotFoundError:
-        Path(file_path).touch(exist_ok=True)
-        dataset_metadata = {}
+    (file_path, dataset_metadata) = fetch_all_cached_metadata(
+        target_dataset['name'])

     for i, trait_dict in enumerate(corr_results):
         trait_name = list(trait_dict.keys())[0]
         trait = trait_dict[trait_name]
         target_trait = dataset_metadata.get(trait_name)
-        if target_trait is None: 
+        if target_trait is None:
             target_trait_ob = create_trait(dataset=target_dataset_ob,
                                            name=trait_name,
                                            get_qtl_info=True)
@@ -184,12 +180,9 @@ def correlation_json_for_table(correlation_data, this_trait, this_dataset, targe

         results_list.append(results_dict)

-
-    if bool(new_traits_metadata):
-        # that means new traits exists
-        dataset_metadata.update(new_traits_metadata)
-        with open(file_path,"w+") as file_handler:
-            json.dump(dataset_metadata, file_handler)
+    cache_new_traits_metadata(dataset_metadata,
+                              new_traits_metadata,
+                              file_path)

     return json.dumps(results_list)

--
cgit v1.2.3

From b7a4fa3007e2a3364e7a827b0bf4b3a54fcc272d Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 23 Nov 2021 13:38:36 +0300
Subject: merge commit: added some logic that takes into account corr_sample_group when determining which samples to use when getting sample_data

---
 wqflask/wqflask/correlation/correlation_gn3_api.py | 31 +++++++++++++++++-----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py
index 635ef5ed..32a55b44 100644
--- a/wqflask/wqflask/correlation/correlation_gn3_api.py
+++ b/wqflask/wqflask/correlation/correlation_gn3_api.py
@@ -64,20 +64,27 @@ def test_process_data(this_trait, dataset, start_vars):

     return sample_data

-def process_samples(start_vars, sample_names, excluded_samples=None):
-    """process samples"""
+def process_samples(start_vars,sample_names = [],excluded_samples = []):
+    """code to fetch correct samples"""
     sample_data = {}
-    if not excluded_samples:
-        excluded_samples = ()
-    sample_vals_dict = json.loads(start_vars["sample_vals"])
+    sample_vals_dict = json.loads(start_vars["sample_vals"])
+    if sample_names:
         for sample in sample_names:
-            if sample not in excluded_samples and sample in sample_vals_dict:
+            if sample in sample_vals_dict and sample not in excluded_samples:
+                val = sample_vals_dict[sample]
+                if not val.strip().lower() == "x":
+                    sample_data[str(sample)] = float(val)
+
+    else:
+        for sample in sample_vals_dict.keys():
+            if sample not in excluded_samples:
                 val = sample_vals_dict[sample]
                 if not val.strip().lower() == "x":
                     sample_data[str(sample)] = float(val)

     return sample_data

+
 def merge_correlation_results(correlation_results, target_correlation_results):

     corr_dict = {}
@@ -153,6 +160,18 @@ def lit_for_trait_list(corr_results, this_dataset, this_trait):


 def fetch_sample_data(start_vars, this_trait, this_dataset, target_dataset):

+    corr_samples_group = start_vars["corr_samples_group"]
+    if corr_samples_group == "samples_primary":
+        sample_data = process_samples(
+            start_vars, this_dataset.group.all_samples_ordered())
+
+    elif corr_samples_group == "samples_other":
+        sample_data = process_samples(
+            start_vars, excluded_samples = this_dataset.group.samplelist)
+
+    else:
+        sample_data = process_samples(start_vars)
+
     sample_data = process_samples(
         start_vars, this_dataset.group.all_samples_ordered())

--
cgit v1.2.3

From aa9a06d927bdc2b5221e58559f24921a0ff72cd8 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 23 Nov 2021 13:50:21 +0300
Subject: pep8 formatting remove dead variables

---
 wqflask/base/data_set.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 2687738d..4d75e7ee 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -758,7 +758,6 @@ class DataSet:
         chunk_size = 50
         number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
         cached_results = fetch_cached_results(self.name, self.type)
-        # cached_results = None
         if cached_results is None:
             trait_sample_data = []
             for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
--
cgit v1.2.3

From fffeb91789943a3c7db5a72d66405e2a0459ed44 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 23 Nov 2021 14:49:07 +0300
Subject: fix for overwriting file

---
 wqflask/wqflask/correlation/pre_computes.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index ad0bc6ef..975a53b8 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -28,8 +28,9 @@ def cache_new_traits_metadata(dataset_metadata: dict, new_traits_metadata, file_

     if bool(new_traits_metadata):
         dataset_metadata.update(new_traits_metadata)
-        with open(file_path, "w+") as file_handler:
-            json.dump(dataset_metadata, file_handler)
+
+    with open(file_path, "w+") as file_handler:
+        json.dump(dataset_metadata, file_handler)
--
cgit v1.2.3
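
Taken together, these patches key every cache file on the dataset names plus the UPDATE_TIME of the matching {dataset_type}Data table, so precomputed correlations and trait metadata are reused only while the underlying table is unchanged. The following is a minimal sketch of that read-through pattern: generate_filename is copied from the patched pre_computes.py, while cached_correlation, the TMPDIR stand-in, and the compute_fn callback are illustrative assumptions, not GeneNetwork code.

import hashlib
import json
import os
import tempfile

TMPDIR = tempfile.gettempdir()  # stand-in for base.webqtlConfig.TMPDIR


def generate_filename(*args, suffix="", file_ext="json"):
    """given a list of args generate a unique filename"""
    string_unicode = f"{*args,}".encode()
    return f"{hashlib.md5(string_unicode).hexdigest()}_{suffix}.{file_ext}"


def cached_correlation(base_name, target_name, timestamp, trait_name, compute_fn):
    """Hypothetical read-through cache: derive the cache file from the dataset
    names and table timestamp, and call compute_fn only on a cache miss.
    compute_fn must return a JSON-serialisable result."""
    file_name = generate_filename(base_name, target_name, timestamp, timestamp,
                                  suffix="corr_precomputes")
    file_path = os.path.join(TMPDIR, file_name)
    try:
        with open(file_path) as file_handler:
            cache = json.load(file_handler)
    except FileNotFoundError:
        cache = {}
    if trait_name not in cache:
        # miss: compute once, then persist for the next request
        cache[trait_name] = compute_fn()
        with open(file_path, "w") as file_handler:
            json.dump(cache, file_handler)
    return cache[trait_name]

Because the hashed input includes the timestamp returned by query_table_timestamp(), any write to the data table changes the derived file name, and stale cache files are simply never read again.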