diff options
author | Alexander Kabui | 2021-04-04 13:33:34 +0300 |
---|---|---|
committer | Alexander Kabui | 2021-04-04 13:33:34 +0300 |
commit | 2301b11e8a975f2e6dc7e5144e4b26c34b186501 (patch) | |
tree | 3e6815b89bd92fba3be0d339d0205a62f0772616 /gn3/computations/datasets.py | |
parent | 3dd25f77952f4829af5c19a0b40d221df308a185 (diff) | |
download | genenetwork3-2301b11e8a975f2e6dc7e5144e4b26c34b186501.tar.gz |
refactor code for trait data
modify unittest and integration tests for datasets
Diffstat (limited to 'gn3/computations/datasets.py')
-rw-r--r-- | gn3/computations/datasets.py | 78 |
1 files changed, 41 insertions, 37 deletions
diff --git a/gn3/computations/datasets.py b/gn3/computations/datasets.py index 533ebdd..b405e55 100644 --- a/gn3/computations/datasets.py +++ b/gn3/computations/datasets.py @@ -11,8 +11,6 @@ from dataclasses import dataclass from MySQLdb import escape_string # type: ignore import requests - -from gn3.experimental_db import database_connector from gn3.settings import GN2_BASE_URL @@ -45,13 +43,13 @@ def retrieve_trait_sample_data(dataset, def fetch_from_db_sample_data(formatted_query: str, database_instance) -> List: """this is the function that does the actual fetching of\ results from the database""" - cursor = database_instance.cursor() - _conn = database_connector - # conn, cursor = database_connector() - # cursor = conn.cursor() + try: + cursor = database_instance.cursor() + cursor.execute(formatted_query) + results = cursor.fetchall() - cursor.execute(formatted_query) - results = cursor.fetchall() + except Exception as error: + raise error cursor.close() @@ -249,6 +247,11 @@ def divide_into_chunks(the_list, number_chunks): return chunks +def escape(string_): + """function escape sql value""" + return escape_string(string_).decode('utf8') + + def mescape(*items) -> List: """multiple escape for query values""" @@ -261,8 +264,8 @@ def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type): # so we break the sample ids into smaller chunks # Postgres doesn't have that limit, so we can get rid of this after we transition - trait_data = defaultdict(list) - chunk_size = 50 + _trait_data = defaultdict(list) + chunk_size = 61 number_chunks = int(ceil(len(sample_ids) / chunk_size)) for sample_ids_step in divide_into_chunks(sample_ids, number_chunks): if dataset_type == "Publish": @@ -271,46 +274,47 @@ def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type): full_dataset_type = dataset_type temp = ['T%s.value' % item for item in sample_ids_step] - if dataset_type: - query = "SELECT {}XRef.Id,".format(escape_string(dataset_type)) + if dataset_type == "Publish": + query = "SELECT {}XRef.Id,".format(escape(dataset_type)) else: - query = "SELECT {}.Name,".format(escape_string(full_dataset_type)) + query = "SELECT {}.Name,".format(escape(full_dataset_type)) query += ', '.join(temp) query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(full_dataset_type, dataset_type, dataset_type)) for item in sample_ids_step: + query += """ left join {}Data as T{} on T{}.Id = {}XRef.DataId and T{}.StrainId={}\n - """.format(*mescape(dataset_type, item, item, dataset_type, item, item)) + """.format(*mescape(dataset_type, item, + item, dataset_type, item, item)) if dataset_type == "Publish": query += """ - WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId - and {}Freeze.Name = '{}' - and {}.Id = {}XRef.{}Id - order by {}.Id - """.format(*mescape(dataset_type, dataset_type, dataset_type, dataset_name, - full_dataset_type, dataset_type, dataset_type, dataset_type)) - else: + WHERE {}XRef.{}FreezeId = {}Freeze.Id + and {}Freeze.Name = '{}' + and {}.Id = {}XRef.{}Id + order by {}.Id + """.format(*mescape(dataset_type, dataset_type, + dataset_type, dataset_type, + dataset_name, full_dataset_type, + dataset_type, dataset_type, + full_dataset_type)) + else: query += """ - WHERE {}XRef.{}FreezeId = {}Freeze.Id - and {}Freeze.Name = '{}' - and {}.Id = {}XRef.{}Id - order by {}.Id - """.format(*mescape(dataset_type, dataset_type, dataset_type, dataset_type, - dataset_name, full_dataset_type, dataset_type, - dataset_type, full_dataset_type)) - - results = fetch_from_db_sample_data(query, database_instance) - - trait_name = results[0] - - sample_value_results = results[1:] - - trait_data[trait_name] += (sample_value_results) - return trait_data + WHERE {}XRef.{}FreezeId = {}Freeze.Id + and {}Freeze.Name = '{}' + and {}.Id = {}XRef.{}Id + order by {}.Id + """.format(*mescape(dataset_type, dataset_type, + dataset_type, dataset_type, + dataset_name, dataset_type, + dataset_type, dataset_type, + full_dataset_type)) + + _results = fetch_from_db_sample_data(query, database_instance) + return {} |