about summary refs log tree commit diff
path: root/gn3
diff options
context:
space:
mode:
Diffstat (limited to 'gn3')
-rw-r--r--gn3/computations/datasets.py90
1 files changed, 90 insertions, 0 deletions
diff --git a/gn3/computations/datasets.py b/gn3/computations/datasets.py
index 28d40a1..533ebdd 100644
--- a/gn3/computations/datasets.py
+++ b/gn3/computations/datasets.py
@@ -1,11 +1,15 @@
 """module contains the code all related to datasets"""
 import json
 from unittest import mock
+from math import ceil
+from collections import defaultdict
 
 from typing import Optional
 from typing import List
 
 from dataclasses import dataclass
+from MySQLdb import escape_string  # type: ignore
+
 import requests
 
 from gn3.experimental_db import database_connector
@@ -224,3 +228,89 @@ def fetch_dataset_sample_id(samplelist: List, database, species: str) -> dict:
     results = database_cursor.fetchall()
 
     return dict(results)
+
+
+def divide_into_chunks(the_list, number_chunks):
+    """Divides a list into approximately number_chunks
+    >>> divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3)
+    [[1, 2, 7], [3, 22, 8], [5, 22, 333]]"""
+
+    length = len(the_list)
+    if length == 0:
+        return [[]]
+
+    if length <= number_chunks:
+        number_chunks = length
+    chunk_size = int(ceil(length/number_chunks))
+    chunks = []
+
+    for counter in range(0, length, chunk_size):
+        chunks.append(the_list[counter:counter+chunk_size])
+    return chunks
+
+
+def mescape(*items) -> List:
+    """multiple escape for query values"""
+
+    return [escape_string(str(item)).decode('utf8') for item in items]
+
+
+def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
+    """function to fetch trait data"""
+    # MySQL limits the number of tables that can be used in a join to 61,
+    # so we break the sample ids into smaller chunks
+    # Postgres doesn't have that limit, so we can get rid of this after we transition
+
+    trait_data = defaultdict(list)
+    chunk_size = 50
+    number_chunks = int(ceil(len(sample_ids) / chunk_size))
+    for sample_ids_step in divide_into_chunks(sample_ids, number_chunks):
+        if dataset_type == "Publish":
+            full_dataset_type = "Phenotype"
+        else:
+            full_dataset_type = dataset_type
+        temp = ['T%s.value' % item for item in sample_ids_step]
+
+        if dataset_type:
+            query = "SELECT {}XRef.Id,".format(escape_string(dataset_type))
+
+        else:
+            query = "SELECT {}.Name,".format(escape_string(full_dataset_type))
+
+        query += ', '.join(temp)
+        query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(full_dataset_type,
+                                                                 dataset_type,
+                                                                 dataset_type))
+        for item in sample_ids_step:
+            query += """
+                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                    and T{}.StrainId={}\n
+                    """.format(*mescape(dataset_type, item, item, dataset_type, item, item))
+
+        if dataset_type == "Publish":
+            query += """
+            WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
+            and {}Freeze.Name = '{}'
+            and {}.Id = {}XRef.{}Id
+            order by {}.Id
+            """.format(*mescape(dataset_type, dataset_type, dataset_type, dataset_name,
+                                full_dataset_type, dataset_type, dataset_type, dataset_type))
+        else:
+
+            query += """
+            WHERE {}XRef.{}FreezeId = {}Freeze.Id
+            and {}Freeze.Name = '{}'
+            and {}.Id = {}XRef.{}Id
+            order by {}.Id
+            """.format(*mescape(dataset_type, dataset_type, dataset_type, dataset_type,
+                                dataset_name, full_dataset_type, dataset_type,
+                                dataset_type, full_dataset_type))
+
+        results = fetch_from_db_sample_data(query, database_instance)
+
+        trait_name = results[0]
+
+        sample_value_results = results[1:]
+
+        trait_data[trait_name] += (sample_value_results)
+    return trait_data