author     Alexander Kabui    2021-04-04 13:33:34 +0300
committer  Alexander Kabui    2021-04-04 13:33:34 +0300
commit     2301b11e8a975f2e6dc7e5144e4b26c34b186501 (patch)
tree       3e6815b89bd92fba3be0d339d0205a62f0772616 /gn3
parent     3dd25f77952f4829af5c19a0b40d221df308a185 (diff)
download   genenetwork3-2301b11e8a975f2e6dc7e5144e4b26c34b186501.tar.gz
refactor code for trait data
modify unit tests and integration tests for datasets
Diffstat (limited to 'gn3')
-rw-r--r--  gn3/api/datasets.py           20
-rw-r--r--  gn3/computations/datasets.py  78
2 files changed, 57 insertions(+), 41 deletions(-)
diff --git a/gn3/api/datasets.py b/gn3/api/datasets.py
index d15aad7..70ee3a1 100644
--- a/gn3/api/datasets.py
+++ b/gn3/api/datasets.py
@@ -3,6 +3,8 @@ from flask import Blueprint
 from flask import jsonify
 from gn3.computations.datasets import create_dataset
+from gn3.computations.datasets import get_traits_data
+from gn3.experimental_db import database_connector
 
 dataset = Blueprint("dataset", __name__)
@@ -33,7 +35,17 @@ def fetch_traits_data(dataset_name, dataset_type):
"""endpoints fetches sample for each trait in\
a dataset"""
# what actually brings speed issues in correlation
- _query_values = dataset_name, dataset_type
-
- return jsonify({})
- \ No newline at end of file
+    sample_ids = [4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15,
+                  17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 30, 31,
+                  35, 36, 37, 39, 98, 99, 100, 103, 487, 105, 106, 110, 115,
+                  116, 117, 118, 119, 120, 919, 147,
+                  121, 40, 41, 124, 125, 128, 135, 129, 130, 131,
+                  132, 134, 138, 139, 140, 141, 142, 144,
+                  145, 148, 149, 920, 922, 2, 3, 1, 1100]
+
+    conn, _cursor = database_connector()
+    results = get_traits_data(sample_ids=sample_ids, database_instance=conn,
+                              dataset_name=dataset_name, dataset_type=dataset_type)
+    conn.close()
+
+    return jsonify({"results": results})
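
For orientation, a minimal sketch of exercising the new endpoint with Flask's test client. Neither the URL rule nor the application factory appears in this diff, so the route and the create_app import below are assumptions:

    # Hypothetical smoke test; assumes the blueprint is registered under
    # /api/dataset and the view is routed as /fetch_traits_data/<name>/<type>.
    from gn3.app import create_app  # assumed application factory

    app = create_app()
    with app.test_client() as client:
        response = client.get("/api/dataset/fetch_traits_data/BXDPublish/Publish")
        # As committed, get_traits_data returns {}, so "results" is empty for now.
        print(response.get_json()["results"])
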
diff --git a/gn3/computations/datasets.py b/gn3/computations/datasets.py
index 533ebdd..b405e55 100644
--- a/gn3/computations/datasets.py
+++ b/gn3/computations/datasets.py
@@ -11,8 +11,6 @@ from dataclasses import dataclass
 from MySQLdb import escape_string  # type: ignore
 import requests
-
-from gn3.experimental_db import database_connector
 from gn3.settings import GN2_BASE_URL
@@ -45,13 +43,13 @@ def retrieve_trait_sample_data(dataset,
 def fetch_from_db_sample_data(formatted_query: str, database_instance) -> List:
     """this is the function that does the actual fetching of\
     results from the database"""
-    cursor = database_instance.cursor()
-    _conn = database_connector
-    # conn, cursor = database_connector()
-    # cursor = conn.cursor()
+    try:
+        cursor = database_instance.cursor()
+        cursor.execute(formatted_query)
+        results = cursor.fetchall()
-    cursor.execute(formatted_query)
-    results = cursor.fetchall()
+    except Exception as error:
+        raise error
 
     cursor.close()
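
A side note on the new error handling: "except Exception as error: raise error" re-raises the exception unchanged, and the cursor is never closed when execute() fails. A sketch of a tighter variant (not what this commit does), using try/finally so cleanup always runs:

    from typing import List

    def fetch_from_db_sample_data(formatted_query: str, database_instance) -> List:
        """Run a pre-formatted query and return all rows; the caller owns the connection."""
        cursor = database_instance.cursor()
        try:
            cursor.execute(formatted_query)
            return cursor.fetchall()
        finally:
            cursor.close()  # runs even when execute() raises
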
@@ -249,6 +247,11 @@ def divide_into_chunks(the_list, number_chunks):
     return chunks
 
 
+def escape(string_):
+    """function escape sql value"""
+    return escape_string(string_).decode('utf8')
+
+
 def mescape(*items) -> List:
     """multiple escape for query values"""
@@ -261,8 +264,8 @@ def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
     # so we break the sample ids into smaller chunks
     # Postgres doesn't have that limit, so we can get rid of this after we transition
-    trait_data = defaultdict(list)
-    chunk_size = 50
+    _trait_data = defaultdict(list)
+    chunk_size = 61
     number_chunks = int(ceil(len(sample_ids) / chunk_size))
     for sample_ids_step in divide_into_chunks(sample_ids, number_chunks):
         if dataset_type == "Publish":
@@ -271,46 +274,47 @@ def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
             full_dataset_type = dataset_type
         temp = ['T%s.value' % item for item in sample_ids_step]
-        if dataset_type:
-            query = "SELECT {}XRef.Id,".format(escape_string(dataset_type))
+        if dataset_type == "Publish":
+            query = "SELECT {}XRef.Id,".format(escape(dataset_type))
         else:
-            query = "SELECT {}.Name,".format(escape_string(full_dataset_type))
+            query = "SELECT {}.Name,".format(escape(full_dataset_type))
         query += ', '.join(temp)
         query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(full_dataset_type,
                                                                  dataset_type,
                                                                  dataset_type))
         for item in sample_ids_step:
+
             query += """
                     left join {}Data as T{} on T{}.Id = {}XRef.DataId
                     and T{}.StrainId={}\n
-                    """.format(*mescape(dataset_type, item, item, dataset_type, item, item))
+                    """.format(*mescape(dataset_type, item,
+                                        item, dataset_type, item, item))
if dataset_type == "Publish":
query += """
- WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
- and {}Freeze.Name = '{}'
- and {}.Id = {}XRef.{}Id
- order by {}.Id
- """.format(*mescape(dataset_type, dataset_type, dataset_type, dataset_name,
- full_dataset_type, dataset_type, dataset_type, dataset_type))
- else:
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(dataset_type, dataset_type,
+ dataset_type, dataset_type,
+ dataset_name, full_dataset_type,
+ dataset_type, dataset_type,
+ full_dataset_type))
+ else:
query += """
- WHERE {}XRef.{}FreezeId = {}Freeze.Id
- and {}Freeze.Name = '{}'
- and {}.Id = {}XRef.{}Id
- order by {}.Id
- """.format(*mescape(dataset_type, dataset_type, dataset_type, dataset_type,
- dataset_name, full_dataset_type, dataset_type,
- dataset_type, full_dataset_type))
-
- results = fetch_from_db_sample_data(query, database_instance)
-
- trait_name = results[0]
-
- sample_value_results = results[1:]
-
- trait_data[trait_name] += (sample_value_results)
- return trait_data
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(dataset_type, dataset_type,
+ dataset_type, dataset_type,
+ dataset_name, dataset_type,
+ dataset_type, dataset_type,
+ full_dataset_type))
+
+ _results = fetch_from_db_sample_data(query, database_instance)
+ return {}
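
MySQL caps the number of tables a single query can join, and the query above left-joins one Data table per sample id; that is why get_traits_data splits the sample ids into chunks of at most 61 before building each query. A minimal sketch of the chunking arithmetic; divide_into_chunks is not shown in this diff, so its body here is an assumption that it yields roughly equal slices:

    from math import ceil

    def divide_into_chunks(the_list, number_chunks):
        # Assumed behaviour: split the_list into number_chunks roughly equal slices.
        step = int(ceil(len(the_list) / number_chunks))
        return [the_list[i:i + step] for i in range(0, len(the_list), step)]

    sample_ids = list(range(1, 72))                  # 71 ids, like the endpoint's list
    number_chunks = int(ceil(len(sample_ids) / 61))  # chunk_size = 61 -> 2 chunks
    print([len(c) for c in divide_into_chunks(sample_ids, number_chunks)])  # [36, 35]

Note that, as committed, get_traits_data assigns the fetched rows to _results and returns an empty dict, so the endpoint's "results" payload stays empty until the per-trait aggregation (the removed trait_data logic) is reinstated.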