author    zsloan 2022-02-21 21:18:46 +0000
committer zsloan 2022-02-21 15:27:29 -0600
commit    7c9e73f196575cd6d1de7df4430bc2b4ecb28466 (patch)
tree      10c5f75b683f8438745b0a1d489069fc3225c6a9 /wqflask/base/data_set.py
parent    17652b17455bd58bf82d130b60b3e80c57b7f80c (diff)
download  genenetwork2-7c9e73f196575cd6d1de7df4430bc2b4ecb28466.tar.gz
Fix incorrect dataset trait data caching
Trait data caching wasn't working correctly because it didn't account for the samplelist, so caching misbehaved whenever the target dataset's samplelist differed from that of the trait being correlated against. Trait data is stored as a dictionary where the keys are trait IDs and the values are *lists* of sample values, so the cache must correspond to the exact same set of samples; otherwise samples end up mismatched (since "the third sample with a value" for one dataset's trait might not be the same sample as "the third sample with a value" for another dataset's trait). To fix this, I added the samplelist to the functions that generate and fetch the hash file. This will require more cache files, though, so this should probably be reexamined later to make the code work with only a single cache file for each dataset.
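
For context, the cache file name is simply an MD5 hash of the dataset name, the table timestamp, and, with this change, the comma-joined samplelist. Below is a minimal illustrative sketch of that keying, not code from the patch: cache_key is a simplified stand-in for generate_hash_file, and the dataset name, timestamp, and sample IDs are made-up example values.

import hashlib

def cache_key(dataset_name, dataset_timestamp, samplelist_as_str=""):
    # Post-patch key: name + table timestamp + comma-joined samplelist.
    # Pre-patch the samplelist part was missing, so runs over different
    # sample subsets collided on the same cache file.
    key_material = f"{dataset_name}{dataset_timestamp}{samplelist_as_str}".encode()
    return hashlib.md5(key_material).hexdigest()

timestamp = "2022-02-21 21:18:46"          # illustrative table timestamp
full = ",".join(["BXD1", "BXD2", "BXD3"])  # illustrative samplelists
subset = ",".join(["BXD1", "BXD3"])

# Different samplelists now map to different cache files, so a cached list of
# sample values can no longer be read back against the wrong set of samples.
assert cache_key("HC_M2_0606_P", timestamp, full) != cache_key("HC_M2_0606_P", timestamp, subset)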
Diffstat (limited to 'wqflask/base/data_set.py')
-rw-r--r--  wqflask/base/data_set.py  20
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index af248659..d7e4e62f 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -756,7 +756,7 @@ class DataSet:
         chunk_size = 50
         number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
-        cached_results = fetch_cached_results(self.name, self.type)
+        cached_results = fetch_cached_results(self.name, self.type, self.samplelist)
         if cached_results is None:
             trait_sample_data = []
             for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
@@ -812,9 +812,8 @@
                            trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
             cache_dataset_results(
-                self.name, self.type, self.trait_data)
+                self.name, self.type, self.samplelist, self.trait_data)
         else:
-
             self.trait_data = cached_results
@@ -1278,14 +1277,14 @@ def query_table_timestamp(dataset_type: str):
     return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")
-def generate_hash_file(dataset_name: str, dataset_type: str, dataset_timestamp: str):
+def generate_hash_file(dataset_name: str, dataset_type: str, dataset_timestamp: str, samplelist: str):
     """given the trait_name generate a unique name for this"""
-    string_unicode = f"{dataset_name}{dataset_timestamp}".encode()
+    string_unicode = f"{dataset_name}{dataset_timestamp}{samplelist}".encode()
     md5hash = hashlib.md5(string_unicode)
     return md5hash.hexdigest()
-def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: List):
+def cache_dataset_results(dataset_name: str, dataset_type: str, samplelist: List, query_results: List):
     """function to cache dataset query results to file
     input dataset_name and type query_results(already processed in default dict format)
     """
@@ -1293,21 +1292,22 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: List):
     # store the file path on redis
     table_timestamp = query_table_timestamp(dataset_type)
+    samplelist_as_str = ",".join(samplelist)
-
-    file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
+    file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp, samplelist_as_str)
     file_path = os.path.join(TMPDIR, f"{file_name}.json")
     with open(file_path, "w") as file_handler:
         json.dump(query_results, file_handler)
-def fetch_cached_results(dataset_name: str, dataset_type: str):
+def fetch_cached_results(dataset_name: str, dataset_type: str, samplelist: List):
     """function to fetch the cached results"""
     table_timestamp = query_table_timestamp(dataset_type)
+    samplelist_as_str = ",".join(samplelist)
-    file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
+    file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp, samplelist_as_str)
     file_path = os.path.join(TMPDIR, f"{file_name}.json")
     try:
         with open(file_path, "r") as file_handler:
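
Taken together, the patched helpers mean a cache entry is only reused when the dataset name, the table timestamp, and the samplelist all match. The sketch below illustrates that round trip with simplified stand-ins for cache_dataset_results and fetch_cached_results; it does not import wqflask.base.data_set, and the temporary directory, timestamp, dataset name, and sample names are made-up example values.

import hashlib
import json
import os
import tempfile

TMPDIR = tempfile.mkdtemp()  # stand-in for the configured TMPDIR

def cache_key(dataset_name, table_timestamp, samplelist_as_str):
    # Same keying idea as the patched generate_hash_file above.
    return hashlib.md5(
        f"{dataset_name}{table_timestamp}{samplelist_as_str}".encode()).hexdigest()

def cache_results(dataset_name, table_timestamp, samplelist, trait_data):
    # Write the trait data to a samplelist-specific cache file.
    name = cache_key(dataset_name, table_timestamp, ",".join(samplelist))
    with open(os.path.join(TMPDIR, f"{name}.json"), "w") as file_handler:
        json.dump(trait_data, file_handler)

def fetch_results(dataset_name, table_timestamp, samplelist):
    # Return the cached trait data for this exact samplelist, or None on a miss.
    name = cache_key(dataset_name, table_timestamp, ",".join(samplelist))
    try:
        with open(os.path.join(TMPDIR, f"{name}.json"), "r") as file_handler:
            return json.load(file_handler)
    except FileNotFoundError:
        return None

timestamp = "2022-02-21 21:18:46"
cache_results("HC_M2_0606_P", timestamp, ["BXD1", "BXD2", "BXD3"], {"trait1": [8.1, 7.9, 8.4]})

# Same samplelist: cache hit. Different samplelist: miss, so the three cached
# values can never be paired with a two-sample samplelist.
assert fetch_results("HC_M2_0606_P", timestamp, ["BXD1", "BXD2", "BXD3"]) == {"trait1": [8.1, 7.9, 8.4]}
assert fetch_results("HC_M2_0606_P", timestamp, ["BXD1", "BXD3"]) is None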