From 04452c274d51621a0cab1b8dce5b8101c69496b6 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 14:41:41 +0300
Subject: refactor: fix the query; modify the cache point

---
 wqflask/base/data_set.py | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 70c58136..a3a720ad 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -747,7 +747,9 @@ class DataSet:
                 and Species.name = '{}'
                 """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
         results = dict(g.db.execute(query).fetchall())
-        sample_ids = [results[item] for item in self.samplelist]
+        sample_ids = [results.get(item) for item in self.samplelist]
+
+        sample_ids = [ids for ids in sample_ids if ids is not None]
 
         # MySQL limits the number of tables that can be used in a join to 61,
         # so we break the sample ids into smaller chunks
@@ -800,25 +802,22 @@ class DataSet:
             results = g.db.execute(query).fetchall()
             trait_sample_data.append([list(result) for result in results])
 
-            cache_dataset_results(
-                self.name, self.type, trait_sample_data)
+            trait_count = len(trait_sample_data[0])
+            self.trait_data = collections.defaultdict(list)
 
-        else:
-            trait_sample_data = cached_results
-
-        trait_count = len(trait_sample_data[0])
-        self.trait_data = collections.defaultdict(list)
-
-        # put all of the separate data together into a dictionary where the keys are
-        # trait names and values are lists of sample values
-        data_start_pos = 1
-        for trait_counter in range(trait_count):
-            trait_name = trait_sample_data[0][trait_counter][0]
-            for chunk_counter in range(int(number_chunks)):
-                self.trait_data[trait_name] += (
+            data_start_pos = 1
+            for trait_counter in range(trait_count):
+                trait_name = trait_sample_data[0][trait_counter][0]
+                for chunk_counter in range(int(number_chunks)):
+                    self.trait_data[trait_name] += (
                     trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
 
+            cache_dataset_results(
+                self.name, self.type, self.trait_data)
+
+        else:
+            self.trait_data = cached_results
 
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -1282,7 +1281,9 @@ def generate_hash_file(dataset_name: str, dataset_timestamp: str):
 
 
 def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: List):
-    """function to cache dataset query results to file"""
+    """function to cache dataset query results to file
+    input dataset_name and type query_results(already processed in default dict format)
+    """
     # data computations actions
     # store the file path on redis
-- cgit v1.2.3

From a8ccaf03ba151f9ceca2f0224af33db230a8c8b3 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 15:53:50 +0300
Subject: test: generate new files

---
 wqflask/base/data_set.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index a3a720ad..cae1a2a7 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -810,7 +810,7 @@ class DataSet:
                 trait_name = trait_sample_data[0][trait_counter][0]
                 for chunk_counter in range(int(number_chunks)):
                     self.trait_data[trait_name] += (
-                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+                        trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
 
             cache_dataset_results(
                 self.name, self.type, self.trait_data)
@@ -818,6 +818,8 @@ class DataSet:
         else:
             self.trait_data = cached_results
 
+
+
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -1291,7 +1293,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L
 
     results = r.set(f"{dataset_type}timestamp", table_timestamp)
 
-    file_name = generate_hash_file(dataset_name, table_timestamp)
+    file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
     file_path = os.path.join(TMPDIR, f"{file_name}.json")
 
     with open(file_path, "w") as file_handler:
@@ -1308,7 +1310,7 @@ def fetch_cached_results(dataset_name: str, dataset_type: str):
     else:
         table_timestamp = ""
 
-    file_name = generate_hash_file(dataset_name, table_timestamp)
+    file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
    file_path = os.path.join(TMPDIR, f"{file_name}.json")
    try:
        with open(file_path, "r") as file_handler:
-- cgit v1.2.3
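The two patches above move the cache point: instead of caching the raw per-chunk query rows, the fully assembled trait-name-to-sample-values dictionary is cached, so a cache hit can be assigned to self.trait_data directly, and sample names missing from the id lookup are filtered out before chunking. Below is a minimal, self-contained sketch of that pattern; get_trait_data and fetch_rows are hypothetical names, and the two cache helpers are trivial stand-ins for the module's real ones:

    import collections

    # stand-ins for the patched module's helpers (signatures taken from the diff)
    def fetch_cached_results(name, dtype):         # returns a dict, or None on a miss
        return None                                # simulate a cache miss here

    def cache_dataset_results(name, dtype, data):  # the real one writes JSON to disk
        print(f"caching {len(data)} traits for {name}/{dtype}")

    def get_trait_data(name, dtype, number_chunks, fetch_rows):
        """Assemble {trait_name: [sample values]} and cache the processed dict."""
        cached = fetch_cached_results(name, dtype)
        if cached is not None:
            return cached                          # already in processed form

        trait_sample_data = [fetch_rows(chunk) for chunk in range(number_chunks)]
        trait_data = collections.defaultdict(list)
        data_start_pos = 1                         # column 0 holds the trait name
        for trait_counter in range(len(trait_sample_data[0])):
            trait_name = trait_sample_data[0][trait_counter][0]
            for chunk_counter in range(number_chunks):
                trait_data[trait_name] += (
                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
        cache_dataset_results(name, dtype, trait_data)  # cache point: the final dict
        return trait_data

    rows = lambda chunk: [["trait1", 1.0, 2.0], ["trait2", 3.0, 4.0]]
    print(get_trait_data("HC_M2", "ProbeSet", 2, rows))  # illustrative dataset name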
From 06fbab6427cadf7706da4e954874a7e5da1bd32d Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 19:48:11 +0300
Subject: pep8 formatting; remove debug statements

---
 wqflask/base/data_set.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index cae1a2a7..37f35121 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -1263,7 +1263,7 @@ def query_table_timestamp(dataset_type: str):
 
     # computation data and actions
     query_update_time = f"""
-    SELECT UPDATE_TIME FROM information_schema.tables
+    SELECT UPDATE_TIME FROM information_schfema.tables
    WHERE TABLE_SCHEMA = 'db_webqtl_s'
    AND TABLE_NAME = '{dataset_type}Data'
    """
@@ -1275,7 +1275,7 @@ def query_table_timestamp(dataset_type: str):
     return date_time_obj.strftime(f)
 
 
-def generate_hash_file(dataset_name: str, dataset_timestamp: str):
+def generate_hash_file(dataset_name: str, dataset_type: str, dataset_timestamp: str):
     """given the trait_name generate a unique name for this"""
     string_unicode = f"{dataset_name}{dataset_timestamp}".encode()
     md5hash = hashlib.md5(string_unicode)
@@ -1317,5 +1317,4 @@ def fetch_cached_results(dataset_name: str, dataset_type: str):
             return json.load(file_handler)
 
     except FileNotFoundError:
-        # take actions continue to fetch dataset results and fetch results
-        pass
+        pass
\ No newline at end of file
-- cgit v1.2.3

From 679051788a475dfcefd4cb93dc82ec3a4b86edc3 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 16 Nov 2021 19:54:55 +0300
Subject: use list comprehension; fix typo

---
 wqflask/base/data_set.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 37f35121..553530d4 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -747,9 +747,7 @@ class DataSet:
                 and Species.name = '{}'
                 """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
         results = dict(g.db.execute(query).fetchall())
-        sample_ids = [results.get(item) for item in self.samplelist]
-
-        sample_ids = [ids for ids in sample_ids if ids is not None]
+        sample_ids = [results.get(item) for item in self.samplelist if item is not None]
 
         # MySQL limits the number of tables that can be used in a join to 61,
         # so we break the sample ids into smaller chunks
@@ -1263,7 +1261,7 @@ def query_table_timestamp(dataset_type: str):
 
     # computation data and actions
     query_update_time = f"""
-    SELECT UPDATE_TIME FROM information_schfema.tables
+    SELECT UPDATE_TIME FROM information_schema.tables
    WHERE TABLE_SCHEMA = 'db_webqtl_s'
    AND TABLE_NAME = '{dataset_type}Data'
    """
@@ -1317,4 +1315,4 @@ def fetch_cached_results(dataset_name: str, dataset_type: str):
             return json.load(file_handler)
 
     except FileNotFoundError:
-        pass
\ No newline at end of file
+        pass
-- cgit v1.2.3
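The "fix typo" patch above repairs the information_schfema misspelling introduced by the pep8 commit, and collapses the two sample-id comprehensions into one. Note that the collapsed form filters on the sample name (item is not None) rather than on the looked-up id, so names absent from results now pass through as None instead of being dropped. The standalone snippet below contrasts the two filters with illustrative values:

    results = {"BXD1": 1, "BXD2": 2}          # sample name -> sample id
    samplelist = ["BXD1", "BXD2", "BXD99"]    # BXD99 has no id in results

    # patch 1's two-step form: filter on the looked-up id -> missing samples dropped
    ids_a = [i for i in (results.get(s) for s in samplelist) if i is not None]

    # this patch's one-step form: filter on the name -> missing samples yield None
    ids_b = [results.get(s) for s in samplelist if s is not None]

    print(ids_a)  # [1, 2]
    print(ids_b)  # [1, 2, None]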
From 60fe836dc6c2f00cb99844572eb3fd29aee0163e Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:07:02 +0300
Subject: use a dynamic value for the db_name

---
 wqflask/base/data_set.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 553530d4..2e401c8e 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -40,6 +40,7 @@ from base import species
 from base import webqtlConfig
 from flask import Flask, g
 from base.webqtlConfig import TMPDIR
+from gn3.db_utils import parse_db_url
 import os
 import math
 import string
@@ -747,7 +748,8 @@ class DataSet:
                 and Species.name = '{}'
                 """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
         results = dict(g.db.execute(query).fetchall())
-        sample_ids = [results.get(item) for item in self.samplelist if item is not None]
+        sample_ids = [results.get(item)
+                      for item in self.samplelist if item is not None]
 
         # MySQL limits the number of tables that can be used in a join to 61,
         # so we break the sample ids into smaller chunks
@@ -1260,9 +1262,11 @@ def query_table_timestamp(dataset_type: str):
 
     # computation data and actions
 
+    fetch_db_name = parse_db_url()
+
     query_update_time = f"""
     SELECT UPDATE_TIME FROM information_schema.tables
-    WHERE TABLE_SCHEMA = 'db_webqtl_s'
+    WHERE TABLE_SCHEMA = {fetch_db_name[-1]}
    AND TABLE_NAME = '{dataset_type}Data'
    """
-- cgit v1.2.3

From 71a859c9facc7ae49d43e3e995166ad8dcb586cb Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:11:12 +0300
Subject: isolate SQL_URI parse to a function

---
 wqflask/base/data_set.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 2e401c8e..f0a930a5 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -40,7 +40,8 @@ from base import species
 from base import webqtlConfig
 from flask import Flask, g
 from base.webqtlConfig import TMPDIR
-from gn3.db_utils import parse_db_url
+from urllib.parse import urlparse
+from utility.tools import SQL_URI
 import os
 import math
 import string
@@ -1257,6 +1258,13 @@ def geno_mrna_confidentiality(ob):
         return True
 
 
+
+def parse_db_url():
+    parsed_db = urlparse(SQL_URI)
+
+    return (parsed_db.hostname, parsed_db.username,
+            parsed_db.password, parsed_db.path[1:])
+
 def query_table_timestamp(dataset_type: str):
     """function to query the update timestamp of a given dataset_type"""
-- cgit v1.2.3

From 5a407a34442860ebaea2886f2278be9e1eb33a8d Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 08:13:41 +0300
Subject: replace redis fetch for cached timestamp with a query

---
 wqflask/base/data_set.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index f0a930a5..52d1d254 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -1301,8 +1301,6 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L
 
     table_timestamp = query_table_timestamp(dataset_type)
 
-    results = r.set(f"{dataset_type}timestamp", table_timestamp)
-
     file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
     file_path = os.path.join(TMPDIR, f"{file_name}.json")
 
     with open(file_path, "w") as file_handler:
@@ -1313,12 +1311,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L
 def fetch_cached_results(dataset_name: str, dataset_type: str):
     """function to fetch the cached results"""
 
-    table_timestamp = r.get(f"{dataset_type}timestamp")
-
-    if table_timestamp is not None:
-        table_timestamp = table_timestamp.decode("utf-8")
-    else:
-        table_timestamp = ""
+    table_timestamp = query_table_timestamp(dataset_type)
 
     file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
     file_path = os.path.join(TMPDIR, f"{file_name}.json")
-- cgit v1.2.3
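The three patches above replace the hard-coded 'db_webqtl_s' schema and the Redis-stored timestamp: the database name is parsed out of SQL_URI, and cache validity is keyed to the table's UPDATE_TIME in information_schema, so any write to the table changes the timestamp, hence the hash, hence the cache file (note this patch interpolates the schema name unquoted; the next patch adds the missing quotes). A sketch of the resulting invalidation scheme, assuming a SQLAlchemy-style URI; the URI value and dataset names are illustrative, and cache_file_name is a hypothetical helper mirroring generate_hash_file:

    import hashlib
    from urllib.parse import urlparse

    SQL_URI = "mysql://user:pass@host/db_webqtl"  # illustrative value

    def parse_db_url():
        """Split SQL_URI into (host, user, password, db_name)."""
        parsed_db = urlparse(SQL_URI)
        return (parsed_db.hostname, parsed_db.username,
                parsed_db.password, parsed_db.path[1:])

    def table_timestamp_query(dataset_type):
        """Build the UPDATE_TIME lookup; db_name comes from the URI, not a literal."""
        db_name = parse_db_url()[-1]
        return f"""
        SELECT UPDATE_TIME FROM information_schema.tables
        WHERE TABLE_SCHEMA = '{db_name}'
        AND TABLE_NAME = '{dataset_type}Data'
        """

    def cache_file_name(dataset_name, dataset_timestamp):
        """Hash of name + timestamp, as in generate_hash_file."""
        key = f"{dataset_name}{dataset_timestamp}".encode()
        return f"{hashlib.md5(key).hexdigest()}.json"

    print(table_timestamp_query("ProbeSet"))
    print(cache_file_name("HC_M2_0606_P", "2021-11-17 08:13:41"))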
From 6786712e95cbb885b6b19b3ecd34e6c8ee893172 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 17 Nov 2021 20:20:07 +0300
Subject: refactor sql query & date formatting

---
 wqflask/base/data_set.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 52d1d254..2687738d 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -1258,31 +1258,27 @@ def geno_mrna_confidentiality(ob):
         return True
 
 
-
 def parse_db_url():
     parsed_db = urlparse(SQL_URI)
 
     return (parsed_db.hostname, parsed_db.username,
             parsed_db.password, parsed_db.path[1:])
 
+
 def query_table_timestamp(dataset_type: str):
     """function to query the update timestamp of a given dataset_type"""
 
     # computation data and actions
 
     fetch_db_name = parse_db_url()
-
     query_update_time = f"""
     SELECT UPDATE_TIME FROM information_schema.tables
-    WHERE TABLE_SCHEMA = {fetch_db_name[-1]}
+    WHERE TABLE_SCHEMA = '{fetch_db_name[-1]}'
    AND TABLE_NAME = '{dataset_type}Data'
    """
 
-    # store the timestamp in redis=
     date_time_obj = g.db.execute(query_update_time).fetchone()[0]
-
-    f = "%Y-%m-%d %H:%M:%S"
-    return date_time_obj.strftime(f)
+    return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")
 
 
 def generate_hash_file(dataset_name: str, dataset_type: str, dataset_timestamp: str):
@@ -1301,6 +1297,7 @@ def cache_dataset_results(dataset_name: str, dataset_type: str, query_results: L
 
     table_timestamp = query_table_timestamp(dataset_type)
 
+
     file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
     file_path = os.path.join(TMPDIR, f"{file_name}.json")
-- cgit v1.2.3

From aa9a06d927bdc2b5221e58559f24921a0ff72cd8 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 23 Nov 2021 13:50:21 +0300
Subject: pep8 formatting; remove dead variables

---
 wqflask/base/data_set.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 2687738d..4d75e7ee 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -758,7 +758,6 @@ class DataSet:
         chunk_size = 50
         number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
         cached_results = fetch_cached_results(self.name, self.type)
-        # cached_results = None
         if cached_results is None:
             trait_sample_data = []
             for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
-- cgit v1.2.3
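With the last two patches applied, the read path is: query the table's update timestamp, hash it with the dataset name into a file name under TMPDIR, and treat a missing file as a cache miss. A condensed, runnable sketch of that final flow; the first two helpers are stand-ins for the patched module's functions (timestamp and paths are illustrative), and the explicit return None mirrors the bare pass in the patch, which returns None implicitly:

    import hashlib
    import json
    import os

    TMPDIR = "/tmp"  # stand-in for base.webqtlConfig.TMPDIR

    def query_table_timestamp(dataset_type):
        # stand-in: the real function reads UPDATE_TIME from information_schema
        return "2021-11-17 20:20:07"

    def generate_hash_file(dataset_name, dataset_type, dataset_timestamp):
        # same hashing as the patched helper: dataset name + timestamp
        return hashlib.md5(f"{dataset_name}{dataset_timestamp}".encode()).hexdigest()

    def fetch_cached_results(dataset_name, dataset_type):
        """Return the cached dict, or None so the caller re-runs its queries."""
        table_timestamp = query_table_timestamp(dataset_type)
        file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp)
        file_path = os.path.join(TMPDIR, f"{file_name}.json")
        try:
            with open(file_path, "r") as file_handler:
                return json.load(file_handler)
        except FileNotFoundError:
            return None  # cache miss: caller falls back to the database

    print(fetch_cached_results("HC_M2", "ProbeSet"))  # None until the cache is written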