Diffstat (limited to 'wqflask/base/data_set.py')
-rw-r--r--  wqflask/base/data_set.py  115
1 file changed, 61 insertions(+), 54 deletions(-)
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 11ed2495..1a208050 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -39,6 +39,7 @@ from db import webqtlDatabaseFunction
from base import species
from base import webqtlConfig
from flask import Flask, g
+from base.webqtlConfig import TMPDIR
import os
import math
import string
@@ -50,6 +51,7 @@ import requests
import gzip
import pickle as pickle
import itertools
+import hashlib
from redis import Redis
@@ -751,54 +753,64 @@ class DataSet:
# Postgres doesn't have that limit, so we can get rid of this after we transition
chunk_size = 50
number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
- trait_sample_data = []
- for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
- if self.type == "Publish":
- dataset_type = "Phenotype"
- else:
- dataset_type = self.type
- temp = ['T%s.value' % item for item in sample_ids_step]
- if self.type == "Publish":
- query = "SELECT {}XRef.Id,".format(escape(self.type))
- else:
- query = "SELECT {}.Name,".format(escape(dataset_type))
- data_start_pos = 1
- query += ', '.join(temp)
- query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type,
- self.type,
- self.type))
-
- for item in sample_ids_step:
- query += """
- left join {}Data as T{} on T{}.Id = {}XRef.DataId
- and T{}.StrainId={}\n
- """.format(*mescape(self.type, item, item, self.type, item, item))
-
- if self.type == "Publish":
- query += """
- WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
- and {}Freeze.Name = '{}'
- and {}.Id = {}XRef.{}Id
- order by {}.Id
- """.format(*mescape(self.type, self.type, self.type, self.name,
- dataset_type, self.type, dataset_type, dataset_type))
- else:
- query += """
- WHERE {}XRef.{}FreezeId = {}Freeze.Id
- and {}Freeze.Name = '{}'
- and {}.Id = {}XRef.{}Id
- order by {}.Id
- """.format(*mescape(self.type, self.type, self.type, self.type,
- self.name, dataset_type, self.type, self.type, dataset_type))
+ cached_results = fetch_cached_results(self.name)
+ # cached_results = None
+ if cached_results is None:
+ trait_sample_data = []
+ for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+ if self.type == "Publish":
+ dataset_type = "Phenotype"
+ else:
+ dataset_type = self.type
+ temp = ['T%s.value' % item for item in sample_ids_step]
+ if self.type == "Publish":
+ query = "SELECT {}XRef.Id,".format(escape(self.type))
+ else:
+ query = "SELECT {}.Name,".format(escape(dataset_type))
+ data_start_pos = 1
+ query += ', '.join(temp)
+ query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type,
+ self.type,
+ self.type))
+
+ for item in sample_ids_step:
+ query += """
+ left join {}Data as T{} on T{}.Id = {}XRef.DataId
+ and T{}.StrainId={}\n
+ """.format(*mescape(self.type, item, item, self.type, item, item))
+
+ if self.type == "Publish":
+ query += """
+ WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(self.type, self.type, self.type, self.name,
+ dataset_type, self.type, dataset_type, dataset_type))
+ else:
+ query += """
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(self.type, self.type, self.type, self.type,
+ self.name, dataset_type, self.type, self.type, dataset_type))
- results = g.db.execute(query).fetchall()
- trait_sample_data.append(results)
+ results = g.db.execute(query).fetchall()
+ trait_sample_data.append([list(result) for result in results])
+
+ cache_dataset_results(
+ self.name, "cached_time_stamp", trait_sample_data)
+
+ else:
+ trait_sample_data = cached_results
trait_count = len(trait_sample_data[0])
self.trait_data = collections.defaultdict(list)
# put all of the separate data together into a dictionary where the keys are
# trait names and values are lists of sample values
+ data_start_pos = 1
for trait_counter in range(trait_count):
trait_name = trait_sample_data[0][trait_counter][0]
for chunk_counter in range(int(number_chunks)):
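
The hunk above changes get_trait_data so the chunked SQL only runs on a cache miss: fetch_cached_results(self.name) is consulted first, and freshly queried rows are written back with cache_dataset_results before aggregation. A minimal sketch of that control flow, using the helpers introduced by this patch; run_chunked_queries is an illustrative stand-in for the per-chunk query loop, not a function in this file:

    def load_trait_sample_data(dataset_name, run_chunked_queries):
        # Cache hit: reuse the JSON-decoded rows (lists, not DB tuples).
        cached = fetch_cached_results(dataset_name)
        if cached is not None:
            return cached
        # Cache miss: run the chunked queries, normalise each row to a list so it
        # survives the JSON round trip, then persist under the fixed timestamp key.
        trait_sample_data = [[list(row) for row in chunk]
                             for chunk in run_chunked_queries()]
        cache_dataset_results(dataset_name, "cached_time_stamp", trait_sample_data)
        return trait_sample_data
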
@@ -1247,39 +1259,34 @@ def generate_hash_file(dataset_name: str, dataset_timestamp: str):
"""given the trait_name generate a unique name for this"""
string_unicode = f"{dataset_name}{dataset_timestamp}".encode()
- md5hash = hashlib.md5(str2hash.encode(string_unicode))
+ md5hash = hashlib.md5(string_unicode)
return md5hash.hexdigest()
-def cache_dataset_results(dataset_name: str, query_results: List):
+def cache_dataset_results(dataset_name: str, dataset_timestamp: str, query_results: List):
"""function to cache dataset query results to file"""
- # check if file exists clear if it does
- # aslo check for the timestamp
- # hash for unique name ??? are dataset name unique
# data computations actions
# store the file path on redis
- # hash functiob
- file_name = generate_hash_file(dataset_name, "dataset_timestamp")
+ file_name = generate_hash_file(dataset_name, dataset_timestamp)
file_path = os.path.join(TMPDIR, f"{file_name}.json")
- query_results = [list(results) for result in query_results]
-
with open(file_path, "w") as file_handler:
json.dump(query_results, file_handler)
def fetch_cached_results(dataset_name: str):
"""function to fetch the cached results"""
+ # store the timestamp in redis
- file_name = generate_hash_file(dataset_name,)
- file_path = os.path.join(TMPDIR, f"{file_path}.json")
-
+ file_name = generate_hash_file(dataset_name, "cached_time_stamp")
+ file_path = os.path.join(TMPDIR, f"{file_name}.json")
try:
- with open(file_path) as file_handler:
+ with open(file_path, "r") as file_handler:
data = json.load(file_handler)
+ # print(file_handler)
# check if table has been modified
return data
except FileNotFoundError:
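
Taken together, generate_hash_file, cache_dataset_results and fetch_cached_results form a small file cache under TMPDIR, keyed by md5(dataset name + timestamp string). A standalone sketch of the round trip as used above; TMPDIR = "/tmp" and the dataset name are illustrative assumptions, and since the FileNotFoundError branch is truncated in this hunk, the sketch assumes it yields None, which is what the cache check in get_trait_data expects:

    import hashlib
    import json
    import os

    TMPDIR = "/tmp"  # stand-in for base.webqtlConfig.TMPDIR

    def generate_hash_file(dataset_name: str, dataset_timestamp: str) -> str:
        """md5 of the dataset name plus a timestamp string names the cache file"""
        return hashlib.md5(f"{dataset_name}{dataset_timestamp}".encode()).hexdigest()

    def cache_dataset_results(dataset_name, dataset_timestamp, query_results):
        """dump the per-chunk rows to <TMPDIR>/<md5>.json"""
        file_path = os.path.join(
            TMPDIR, f"{generate_hash_file(dataset_name, dataset_timestamp)}.json")
        with open(file_path, "w") as file_handler:
            json.dump(query_results, file_handler)

    def fetch_cached_results(dataset_name):
        """read the cached rows back, or None when no cache file exists yet"""
        file_path = os.path.join(
            TMPDIR, f"{generate_hash_file(dataset_name, 'cached_time_stamp')}.json")
        try:
            with open(file_path, "r") as file_handler:
                return json.load(file_handler)
        except FileNotFoundError:
            return None  # caller falls back to running the SQL

    # Round trip: get_trait_data caches a list of per-chunk row lists.
    rows = [[["Trait1", 1.2, None], ["Trait2", 3.4, 5.6]]]
    cache_dataset_results("ExampleDataset", "cached_time_stamp", rows)
    assert fetch_cached_results("ExampleDataset") == rows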