From 5935e2cc3e0ac3a8004ccd5224557d34b62359d8 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 20:28:06 +0300 Subject: code to cache frequently run probeset correlation --- wqflask/wqflask/correlation/pre_computes.py | 124 ++++++++++++++-------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index f1c9e1bd..d0caca60 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -1,89 +1,91 @@ -"""module contains the code to do the -precomputations of sample data between -two entire datasets""" -import json -from typing import List -from base import data_set +import os +import hashlib -from gn3.computations.correlations import compute_all_sample_correlation -from gn3.computations.correlations import fast_compute_all_sample_correlation -from gn3.computations.correlations import map_shared_keys_to_values +from base.data_set import query_table_timestamp +from base.webqtlConfig import TMPDIR -def get_dataset_dict_data(dataset_obj): - """function to get the dataset data mapped to key""" - dataset_obj.get_trait_data(dataset_obj.group.all_samples_ordered()) - return map_shared_keys_to_values(dataset_obj.samplelist, - dataset_obj.trait_data) +def generate_filename(**kwargs): + """generate unique filename""" + base_dataset_name = kwargs["base_dataset"] + target_dataset_name = kwargs["target_dataset"] + base_timestamp = kwargs["base_timestamp"] + target_dataset_timestamp = kwargs["target_timestamp"] -def fetch_datasets(base_dataset_name: str, target_dataset_name: str) ->List: - """query to fetch create datasets and fetch traits - all traits of a dataset""" + string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode() + hashlib.md5(string_unicode).hexdigest() - # doesnt work for temp - base_dataset = data_set.create_dataset(dataset_name=base_dataset_name) +def cache_compute_results(start_vars, + base_dataset_type, + correlation_results, + trait_name): + # pass - target_dataset = data_set.create_dataset(dataset_name=target_dataset_name) - # replace with map + # init assumption only caching probeset type + # fix redis;issue potential redis_cache!=current_timestamp + base_timestamp = r.get(f"{base_dataset_type}timestamp") - return (map(get_dataset_dict_data, - [base_dataset, target_dataset])) + if base_timestamp is None: + # fetch the timestamp + base_timestamp = target_dataset_timestamp = query_table_timestamp( + dataset_type) + r.set(f"{dataset_type}timestamp", target_dataset_timestamp) -# in the base dataset we just need the traits -def pre_compute_sample_correlation(base_dataset: List, - target_dataset: List) -> List: - """function compute the correlation between the - a whole dataset against a target - input: target&base_dataset(contains traits and sample results) - output: list containing the computed results + file_name = generate_filename( + base_dataset_name, target_dataset_name, + base_timestamp, target_dataset_timestamp) - precaution:function is expensive;targets only Exon and - """ + file_path = os.path.join(TMPDIR, f"{file_name}.json") - results = [] + try: - for trait_info in base_dataset: + with open(file_path, "r+") as json_handler: - result = fast_compute_all_sample_correlation(corr_method="pearson", - this_trait=trait_info, - target_dataset=target_dataset) + results = json.load(json_handler) - # results.append(fast_compute_all_sample_correlation(corr_method="pearson", - # this_trait=trait_info, - # target_dataset=target_dataset)) - print("finished") - print(result) + if results.get(trait_name) is not None: + results.update({trait_name: correlation_results}) - return results + json.dump(results, json_handler) + except FileNotFoundError: + with open(file_path, "w") as json_handler: + json.dump({trait_name: correlation_results}, json_handler) -def cache_to_file(base_dataset_name: str, target_dataset_name: str): - """function to cache the results to file""" +def fetch_precompute_results(base_dataset_name,target_dataset_name,trait_name): + """function to check for precomputed results""" - # validate the datasets expiry first + # check for redis timestamp + + # fix rely on the fact correlation run oftenly probeset is set + + base_timestamp = target_dataset_timestamp = r.get(dataset_type) + + + if base_timestamp is None: + return + + else: + file_name = generate_filename( + base_dataset_name, target_dataset_name, + base_timestamp, target_dataset_timestamp) + + try: + with open(file_path,"r") as json_handler: + correlation_results = json.load(json_handler) + + return correlation_results.get(trait_name) + + except FileNotFoundError: + pass - base_dataset_data, target_dataset_data = [list(dataset) for dataset in list( - fetch_datasets(base_dataset_name, target_dataset_name))] - # print(target_dataset_data) - try: - # with open("unique_file_name.json", "w") as file_handler: - # file_handler.write() - dataset_correlation_results = pre_compute_sample_correlation( - base_dataset_data, target_dataset_data) - print(dataset_correlation_results) - # json.dump(dataset_correlation_results, file_handler) - except Exception as error: - raise error -def check_cached_files_validity(): - """function to check the validity of cached files""" - pass -- cgit v1.2.3