author      Alexander Kabui    2021-04-04 13:33:34 +0300
committer   Alexander Kabui    2021-04-04 13:33:34 +0300
commit      2301b11e8a975f2e6dc7e5144e4b26c34b186501 (patch)
tree        3e6815b89bd92fba3be0d339d0205a62f0772616 /gn3
parent      3dd25f77952f4829af5c19a0b40d221df308a185 (diff)
download    genenetwork3-2301b11e8a975f2e6dc7e5144e4b26c34b186501.tar.gz
refactor code for trait data
modify unit tests and integration tests for datasets
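
As a companion to the test changes this commit mentions, here is a hedged sketch of exercising the reworked endpoint through Flask's test client. The route path and the dataset name are assumptions — the @dataset.route decorator sits outside the diff context below — and gn3.app.create_app is assumed to be the usual app factory:

    # Sketch only: the route string and "BXDPublish" are hypothetical; the
    # decorator for fetch_traits_data is not visible in this diff, and
    # create_app is assumed to be gn3's Flask application factory.
    from gn3.app import create_app

    app = create_app()
    with app.test_client() as client:
        response = client.get(
            "/api/dataset/fetch_traits_data/BXDPublish/Publish")  # hypothetical
        print(response.get_json()["results"])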
Diffstat (limited to 'gn3')
-rw-r--r--   gn3/api/datasets.py            20
-rw-r--r--   gn3/computations/datasets.py   78
2 files changed, 57 insertions, 41 deletions
diff --git a/gn3/api/datasets.py b/gn3/api/datasets.py
index d15aad7..70ee3a1 100644
--- a/gn3/api/datasets.py
+++ b/gn3/api/datasets.py
@@ -3,6 +3,8 @@ from flask import Blueprint
 from flask import jsonify
 
 from gn3.computations.datasets import create_dataset
+from gn3.computations.datasets import get_traits_data
+from gn3.experimental_db import database_connector
 
 dataset = Blueprint("dataset", __name__)
 
@@ -33,7 +35,17 @@ def fetch_traits_data(dataset_name, dataset_type):
     """endpoints fetches sample for each trait in\
     a dataset"""
     # what actually brings speed issues in correlation
-    _query_values = dataset_name, dataset_type
-
-    return jsonify({})
-
\ No newline at end of file
+    sample_ids = [4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15,
+                  17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 30, 31,
+                  35, 36, 37, 39, 98, 99, 100, 103, 487, 105, 106, 110, 115,
+                  116, 117, 118, 119, 120, 919, 147,
+                  121, 40, 41, 124, 125, 128, 135, 129, 130, 131,
+                  132, 134, 138, 139, 140, 141, 142, 144,
+                  145, 148, 149, 920, 922, 2, 3, 1, 1100]
+
+    conn, _cursor = database_connector()
+    results = get_traits_data(sample_ids=sample_ids, database_instance=conn,
+                              dataset_name=dataset_name, dataset_type=dataset_type)
+    conn.close()
+
+    return jsonify({"results": results})
diff --git a/gn3/computations/datasets.py b/gn3/computations/datasets.py
index 533ebdd..b405e55 100644
--- a/gn3/computations/datasets.py
+++ b/gn3/computations/datasets.py
@@ -11,8 +11,6 @@ from dataclasses import dataclass
 
 from MySQLdb import escape_string  # type: ignore
 import requests
-
-from gn3.experimental_db import database_connector
 from gn3.settings import GN2_BASE_URL
 
 
@@ -45,13 +43,13 @@ def retrieve_trait_sample_data(dataset,
 def fetch_from_db_sample_data(formatted_query: str, database_instance) -> List:
     """this is the function that does the actual fetching of\
     results from the database"""
-    cursor = database_instance.cursor()
-    _conn = database_connector
-    # conn, cursor = database_connector()
-    # cursor = conn.cursor()
+    try:
+        cursor = database_instance.cursor()
+        cursor.execute(formatted_query)
+        results = cursor.fetchall()
 
-    cursor.execute(formatted_query)
-    results = cursor.fetchall()
+    except Exception as error:
+        raise error
 
     cursor.close()
 
@@ -249,6 +247,11 @@ def divide_into_chunks(the_list, number_chunks):
     return chunks
 
 
+def escape(string_):
+    """function escape sql value"""
+    return escape_string(string_).decode('utf8')
+
+
 def mescape(*items) -> List:
     """multiple escape for query values"""
 
@@ -261,8 +264,8 @@ def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
     # so we break the sample ids into smaller chunks
     # Postgres doesn't have that limit, so we can get rid of this after we transition
 
-    trait_data = defaultdict(list)
-    chunk_size = 50
+    _trait_data = defaultdict(list)
+    chunk_size = 61
     number_chunks = int(ceil(len(sample_ids) / chunk_size))
     for sample_ids_step in divide_into_chunks(sample_ids, number_chunks):
         if dataset_type == "Publish":
@@ -271,46 +274,47 @@ def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
             full_dataset_type = dataset_type
         temp = ['T%s.value' % item for item in sample_ids_step]
 
-        if dataset_type:
-            query = "SELECT {}XRef.Id,".format(escape_string(dataset_type))
+        if dataset_type == "Publish":
+            query = "SELECT {}XRef.Id,".format(escape(dataset_type))
 
         else:
-            query = "SELECT {}.Name,".format(escape_string(full_dataset_type))
+            query = "SELECT {}.Name,".format(escape(full_dataset_type))
 
         query += ', '.join(temp)
         query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(full_dataset_type,
                                                                  dataset_type,
                                                                  dataset_type))
 
         for item in sample_ids_step:
+
             query += """
                     left join {}Data as T{} on T{}.Id = {}XRef.DataId
                     and T{}.StrainId={}\n
-                    """.format(*mescape(dataset_type, item, item, dataset_type, item, item))
+                    """.format(*mescape(dataset_type, item,
+                                        item, dataset_type, item, item))
 
         if dataset_type == "Publish":
             query += """
-                    WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
-                    and {}Freeze.Name = '{}'
-                    and {}.Id = {}XRef.{}Id
-                    order by {}.Id
-                    """.format(*mescape(dataset_type, dataset_type, dataset_type, dataset_name,
-                                        full_dataset_type, dataset_type, dataset_type, dataset_type))
-        else:
+                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                    and {}Freeze.Name = '{}'
+                    and {}.Id = {}XRef.{}Id
+                    order by {}.Id
+                    """.format(*mescape(dataset_type, dataset_type,
+                                        dataset_type, dataset_type,
+                                        dataset_name, full_dataset_type,
+                                        dataset_type, dataset_type,
+                                        full_dataset_type))
+        else:
             query += """
-                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
-                    and {}Freeze.Name = '{}'
-                    and {}.Id = {}XRef.{}Id
-                    order by {}.Id
-                    """.format(*mescape(dataset_type, dataset_type, dataset_type, dataset_type,
-                                        dataset_name, full_dataset_type, dataset_type,
-                                        dataset_type, full_dataset_type))
-
-        results = fetch_from_db_sample_data(query, database_instance)
-
-        trait_name = results[0]
-
-        sample_value_results = results[1:]
-
-        trait_data[trait_name] += (sample_value_results)
-    return trait_data
+                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                    and {}Freeze.Name = '{}'
+                    and {}.Id = {}XRef.{}Id
+                    order by {}.Id
+                    """.format(*mescape(dataset_type, dataset_type,
+                                        dataset_type, dataset_type,
+                                        dataset_name, dataset_type,
+                                        dataset_type, dataset_type,
+                                        full_dataset_type))
+
+        _results = fetch_from_db_sample_data(query, database_instance)
+    return {}
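
A design note on the reworked fetch_from_db_sample_data: `except Exception as error: raise error` re-raises the exception unchanged, and because cursor.close() sits after the try/except, the cursor is never closed when execute() fails. A minimal sketch of an equivalent that always closes the cursor, using contextlib.closing — an alternative pattern, not the committed code:

    from contextlib import closing

    def fetch_from_db_sample_data(formatted_query, database_instance):
        """fetch query results, closing the cursor even when execute() raises"""
        with closing(database_instance.cursor()) as cursor:
            cursor.execute(formatted_query)
            return cursor.fetchall()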
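The new escape() helper exists because MySQLdb's escape_string returns bytes, which cannot be interpolated into the str query templates; decoding back to utf-8 fixes the SELECT heads built above. mescape's body is outside this diff; a plausible reading, consistent with its docstring and call sites, is that it maps escape over its arguments:

    from MySQLdb import escape_string  # type: ignore

    def escape(string_):
        """function escape sql value"""
        return escape_string(string_).decode('utf8')

    # Assumed body for mescape -- not shown in this diff:
    def mescape(*items):
        """multiple escape for query values"""
        return [escape(str(item)) for item in items]

    # the SELECT head exactly as get_traits_data builds it for Publish:
    print("SELECT {}XRef.Id,".format(escape("Publish")))  # SELECT PublishXRef.Id,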
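Finally, the chunking math: MySQL caps a single query at 61 joined tables, and each sample id contributes one left join, hence the new chunk_size = 61. divide_into_chunks is defined elsewhere in gn3/computations/datasets.py; a minimal sketch consistent with its call site, fed the same number of ids (71) as the endpoint's hard-coded list:

    from math import ceil

    # Assumed body -- divide_into_chunks is not shown in this diff:
    def divide_into_chunks(the_list, number_chunks):
        """split the_list into number_chunks roughly equal slices"""
        chunk_size = int(ceil(len(the_list) / number_chunks))
        return [the_list[i:i + chunk_size]
                for i in range(0, len(the_list), chunk_size)]

    sample_ids = list(range(71))                     # endpoint list has 71 ids
    number_chunks = int(ceil(len(sample_ids) / 61))  # = 2, as in get_traits_data
    print([len(c) for c in divide_into_chunks(sample_ids, number_chunks)])  # [36, 35]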