From 18b53441a0136071db94c72b112a746e056ef971 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Mon, 15 Nov 2021 18:03:55 +0300
Subject: refactor function to fetch datasets data for precomputes

---
 wqflask/wqflask/correlation/pre_computes.py | 72 ++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 16 deletions(-)

diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index 01fa1a3d..e7147ddf 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -20,6 +20,7 @@ def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, ta
 def cache_compute_results(base_dataset_type,
                           base_dataset_name,
                           target_dataset_name,
+                          corr_method,
                           correlation_results,
                           trait_name):
     # pass
@@ -34,43 +35,35 @@ def cache_compute_results(base_dataset_type,
     target_dataset_timestamp = base_timestamp
 
-
-
     file_name = generate_filename(
         base_dataset_name, target_dataset_name,
         base_timestamp, target_dataset_timestamp)
 
-
-    file_path = os.path.join(TMPDIR,f"{file_name}.json")
-
+    file_path = os.path.join(TMPDIR, f"{file_name}.json")
 
     try:
-        with open(file_path,"r+") as json_file_handler:
+        with open(file_path, "r+") as json_file_handler:
             data = json.load(json_file_handler)
             data[trait_name] = correlation_results
 
             json_file_handler.seek(0)
 
-            json.dump(data,json_file_handler)
+            json.dump(data, json_file_handler)
             json_file_handler.truncate()
-
+
     except FileNotFoundError:
-        with open(file_path,"w+") as file_handler:
+        with open(file_path, "w+") as file_handler:
             data = {}
-            data[trait_name] =correlation_results
-
-            json.dump(data,file_handler)
-
+            data[trait_name] = correlation_results
+            json.dump(data, file_handler)
 
     # create the file only if it does not exist
     # else open the file to cache the results
-
-
 def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name):
     """function to check for precomputed results"""
@@ -84,7 +77,8 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
         return
 
     else:
-        base_timestamp = target_dataset_timestamp = base_timestamp.decode("utf-8")
+        base_timestamp = target_dataset_timestamp = base_timestamp.decode(
+            "utf-8")
 
         file_name = generate_filename(
             base_dataset_name, target_dataset_name,
@@ -103,3 +97,49 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ
 
     except FileNotFoundError:
         pass
+
+
+def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method):
+    """compute sample correlation between dataset vs dataset
+    warning: heavy function, should be invoked less frequently
+    input: datasets data (two dicts), corr_method
+
+    output: correlation results for the entire dataset against the entire dataset
+    """
+    dataset_correlation_results = {}
+
+    for (trait_name, strain_values) in target_dataset.trait_data.items():
+
+        this_trait_data = {
+            "trait_sample_data": strain_values,
+            "trait_id": trait_name
+        }
+
+        trait_correlation_result = fast_compute_all_sample_correlation(
+            corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_dataset)
+
+        dataset_correlation_results[trait_name] = trait_correlation_result
+
+    return dataset_correlation_results
+
+
+def get_datasets_data(base_dataset, target_dataset):
+    """required to pass data in a given format to the pre compute
+    function
+
+    output: two dicts for the datasets with key == trait and value == strains
+    """
+    target_traits_data = target_dataset.get_trait_data(
+        base_dataset.group.all_samples_ordered())
+
+    base_traits_data = base_dataset.get_trait_data(
+        base_dataset.group.all_samples_ordered())
+
+    samples_fetched = base_dataset.group.all_samples_ordered()
+
+    target_results = map_shared_keys_to_values(
+        samples_fetched, target_traits_data)
+    base_results = map_shared_keys_to_values(
+        samples_fetched, base_traits_data)
+
+    return (target_results, base_results)
--
cgit v1.2.3
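
For context, the helpers this patch adds are meant to be chained: pre_compute_dataset_vs_dataset runs the heavy per-trait correlation pass, cache_compute_results appends each trait's results to the timestamped JSON file in TMPDIR, and fetch_precompute_results lets later requests hit that cache instead of recomputing. The sketch below is one plausible wiring, not part of the commit; the base_dataset/target_dataset objects and their .type and .name attributes are assumptions based on GeneNetwork's dataset classes.

    from wqflask.correlation.pre_computes import (
        cache_compute_results,
        fetch_precompute_results,
        pre_compute_dataset_vs_dataset,
    )

    def precompute_and_cache(base_dataset, target_dataset, corr_method="pearson"):
        # Heavy pass: correlate every trait in the target dataset against
        # the whole dataset; returns {trait_name: correlation_results}.
        results = pre_compute_dataset_vs_dataset(
            base_dataset, target_dataset, corr_method)

        # Persist each trait's results; cache_compute_results creates the
        # JSON cache file on first write and updates it afterwards.
        for trait_name, trait_results in results.items():
            cache_compute_results(
                base_dataset.type,    # assumed attribute, e.g. "ProbeSet"
                base_dataset.name,    # assumed attribute
                target_dataset.name,
                corr_method,
                trait_results,
                trait_name)
        return results

A later request would then try the cache first and fall back to computing only on a miss, e.g. cached = fetch_precompute_results(base_dataset.name, target_dataset.name, base_dataset.type, trait_name), recomputing when cached is None.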