From 5935e2cc3e0ac3a8004ccd5224557d34b62359d8 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 20:28:06 +0300
Subject: code to cache frequently run probeset correlation

---
 wqflask/wqflask/correlation/pre_computes.py | 124 ++++++++++++++--------------
 1 file changed, 63 insertions(+), 61 deletions(-)

diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index f1c9e1bd..d0caca60 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -1,89 +1,91 @@
-"""module contains the code to do the 
-precomputations of sample data between
-two entire datasets"""
 
-import json
-from typing import List
-from base import data_set
+import os
+import hashlib
 
-from gn3.computations.correlations import compute_all_sample_correlation
-from gn3.computations.correlations import fast_compute_all_sample_correlation
-from gn3.computations.correlations import map_shared_keys_to_values
+from base.data_set import query_table_timestamp
+from base.webqtlConfig import TMPDIR
 
 
-def get_dataset_dict_data(dataset_obj):
-    """function to get the dataset data mapped to key"""
-    dataset_obj.get_trait_data(dataset_obj.group.all_samples_ordered())
-    return map_shared_keys_to_values(dataset_obj.samplelist,
-                                     dataset_obj.trait_data)
+def generate_filename(**kwargs):
+    """generate unique filename"""
 
+    base_dataset_name = kwargs["base_dataset"]
+    target_dataset_name = kwargs["target_dataset"]
+    base_timestamp = kwargs["base_timestamp"]
+    target_dataset_timestamp = kwargs["target_timestamp"]
 
-def fetch_datasets(base_dataset_name: str, target_dataset_name: str) ->List:
-    """query to fetch create datasets and fetch traits
-    all traits of a dataset"""
+    string_unicode = f"{base_dataset_name}{target_dataset_name}{base_timestamp}{target_dataset_timestamp}sample_corr_compute".encode()
+    hashlib.md5(string_unicode).hexdigest()
 
-    # doesnt work for temp
 
-    base_dataset = data_set.create_dataset(dataset_name=base_dataset_name)
+def cache_compute_results(start_vars,
+    base_dataset_type,
+    correlation_results,
+    trait_name):
+    # pass
 
-    target_dataset = data_set.create_dataset(dataset_name=target_dataset_name)
-    # replace with map
+    # init assumption only caching probeset type
+    # fix redis;issue potential redis_cache!=current_timestamp
+    base_timestamp = r.get(f"{base_dataset_type}timestamp")
 
-    return (map(get_dataset_dict_data,
-                [base_dataset, target_dataset]))
+    if base_timestamp is None:
+        # fetch the timestamp
+        base_timestamp = target_dataset_timestamp = query_table_timestamp(
+            dataset_type)
 
+        r.set(f"{dataset_type}timestamp", target_dataset_timestamp)
 
-# in the base dataset we just need the traits
-def pre_compute_sample_correlation(base_dataset: List,
-                                   target_dataset: List) -> List:
-    """function compute the correlation between the
-    a whole dataset against a target
-    input: target&base_dataset(contains traits and sample results)
-    output: list containing the computed results
+    file_name = generate_filename(
+        base_dataset_name, target_dataset_name,
+        base_timestamp, target_dataset_timestamp)
 
-    precaution:function is expensive;targets only Exon and
-    """
+    file_path = os.path.join(TMPDIR, f"{file_name}.json")
 
-    results = []
+       try:
 
-    for trait_info in base_dataset:
+            with open(file_path, "r+") as json_handler:
 
-        result = fast_compute_all_sample_correlation(corr_method="pearson",
-                                                     this_trait=trait_info,
-                                                     target_dataset=target_dataset)
+                results = json.load(json_handler)
 
-        # results.append(fast_compute_all_sample_correlation(corr_method="pearson",
-        #                                                    this_trait=trait_info,
-        #                                                    target_dataset=target_dataset))
-        print("finished")
-        print(result)
+                if results.get(trait_name) is not None:
+                    results.update({trait_name: correlation_results})
 
-    return results
+                json.dump(results, json_handler)
 
+        except FileNotFoundError:
+            with open(file_path, "w") as json_handler:
+                json.dump({trait_name: correlation_results}, json_handler)
 
-def cache_to_file(base_dataset_name: str, target_dataset_name: str):
-    """function to cache the results to file"""
+def fetch_precompute_results(base_dataset_name,target_dataset_name,trait_name):
+    """function to check for precomputed  results"""
 
-    # validate the datasets expiry first
+    # check for redis timestamp
+
+    # fix rely on the fact correlation run oftenly probeset is set
+
+    base_timestamp = target_dataset_timestamp =  r.get(dataset_type)
+
+
+    if base_timestamp is None:
+        return
+
+    else:
+        file_name = generate_filename(
+        base_dataset_name, target_dataset_name,
+        base_timestamp, target_dataset_timestamp)
+
+        try:
+            with open(file_path,"r") as json_handler:
+                correlation_results = json.load(json_handler)
+
+                return correlation_results.get(trait_name)
+
+        except FileNotFoundError:
+            pass
 
-    base_dataset_data, target_dataset_data = [list(dataset) for dataset in list(
-        fetch_datasets(base_dataset_name, target_dataset_name))]
 
-    # print(target_dataset_data)
 
-    try:
-        # with open("unique_file_name.json", "w") as file_handler:
-        # file_handler.write()
 
-        dataset_correlation_results = pre_compute_sample_correlation(
-            base_dataset_data, target_dataset_data)
-        print(dataset_correlation_results)
 
-        # json.dump(dataset_correlation_results, file_handler)
-    except Exception as error:
-        raise error
 
 
-def check_cached_files_validity():
-    """function to check the validity of cached files"""
-    pass
-- 
cgit v1.2.3