3 files changed, 385 insertions, 7 deletions
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 26b7294..7fb67be 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -109,10 +109,9 @@ package :not packaged in guix
 
 def filter_shared_sample_keys(this_samplelist,
                               target_samplelist) -> Tuple[List, List]:
-    """Given primary and target samplelist for two base and target trait select
-filter the values using the shared keys
-
-    """
+    """Given primary and target samplelist\
+    for two base and target trait select\
+    filter the values using the shared keys"""
     this_vals = []
     target_vals = []
     for key, value in target_samplelist.items():
@@ -125,8 +124,9 @@ filter the values using the shared keys
 def compute_all_sample_correlation(this_trait,
                                    target_dataset,
                                    corr_method="pearson") -> List:
-    """Given a trait data samplelist and target__datasets compute all sample
-correlation"""
+    """Given a trait data samplelist and\
+    target__datasets compute all sample correlation
+    """
 
     this_trait_samples = this_trait["trait_sample_data"]
 
@@ -323,7 +323,6 @@ def compute_all_lit_correlation(conn, trait_lists: List,
                                 species: str, gene_id):
     """Function that acts as an abstraction for
     lit_correlation_for_trait_list"""
-    # xtodo to be refactored
 
     lit_results = lit_correlation_for_trait_list(
         conn=conn,
diff --git a/gn3/computations/datasets.py b/gn3/computations/datasets.py
new file mode 100644
index 0000000..57e1fe1
--- /dev/null
+++ b/gn3/computations/datasets.py
@@ -0,0 +1,323 @@
+"""module contains the code all related to datasets"""
+import json
+from math import ceil
+from collections import defaultdict
+
+from typing import Optional
+from typing import List
+
+from dataclasses import dataclass
+from MySQLdb import escape_string  # type: ignore
+
+import requests
+from gn3.settings import GN2_BASE_URL
+
+
+def retrieve_trait_sample_data(dataset,
+                               trait_name: str,
+                               database,
+                               group_species_id=None) -> List:
+    """given the dataset id and trait_name fetch the\
+    sample_name,value from the dataset"""
+
+    # should pass the db as arg all  do a setup
+
+    (dataset_name, dataset_id, dataset_type) = (dataset.get("name"), dataset.get(
+        "id"), dataset.get("type"))
+
+    dataset_query = get_query_for_dataset_sample(dataset_type)
+    results = []
+    sample_query_values = {
+        "Publish": (trait_name, dataset_id),
+        "Geno": (group_species_id, trait_name, dataset_name),
+        "ProbeSet": (trait_name, dataset_name)
+    }
+
+    if dataset_query:
+        formatted_query = dataset_query % sample_query_values[dataset_type]
+
+        results = fetch_from_db_sample_data(formatted_query, database)
+
+    return results
+
+
+def fetch_from_db_sample_data(formatted_query: str, database_instance) -> List:
+    """this is the function that does the actual fetching of\
+    results from the database"""
+    try:
+        cursor = database_instance.cursor()
+        cursor.execute(formatted_query)
+        results = cursor.fetchall()
+
+    except Exception as error:
+        raise error
+
+    cursor.close()
+
+    return results
+
+
+def get_query_for_dataset_sample(dataset_type) -> Optional[str]:
+    """this functions contains querys for\
+    getting sample data from the db depending in
+    dataset"""
+    dataset_query = {}
+
+    pheno_query = """
+                SELECT
+                        Strain.Name, PublishData.value, PublishSE.error,NStrain.count, Strain.Name2
+                FROM
+                        (PublishData, Strain, PublishXRef, PublishFreeze)
+                left join PublishSE on
+                        (PublishSE.DataId = PublishData.Id AND PublishSE.StrainId = PublishData.StrainId)
+                left join NStrain on
+                        (NStrain.DataId = PublishData.Id AND
+                        NStrain.StrainId = PublishData.StrainId)
+                WHERE
+                        PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND
+                        PublishData.Id = PublishXRef.DataId AND PublishXRef.Id = %s AND
+                        PublishFreeze.Id = %s AND PublishData.StrainId = Strain.Id
+                Order BY
+                        Strain.Name
+                """
+    geno_query = """
+                SELECT
+                        Strain.Name, GenoData.value, GenoSE.error, "N/A", Strain.Name2
+                FROM
+                        (GenoData, GenoFreeze, Strain, Geno, GenoXRef)
+                left join GenoSE on
+                        (GenoSE.DataId = GenoData.Id AND GenoSE.StrainId = GenoData.StrainId)
+                WHERE
+                        Geno.SpeciesId = %s AND Geno.Name = %s AND GenoXRef.GenoId = Geno.Id AND
+                        GenoXRef.GenoFreezeId = GenoFreeze.Id AND
+                        GenoFreeze.Name = %s AND
+                        GenoXRef.DataId = GenoData.Id AND
+                        GenoData.StrainId = Strain.Id
+                Order BY
+                        Strain.Name
+                """
+
+    probeset_query = """
+                SELECT
+                        Strain.Name, ProbeSetData.value, ProbeSetSE.error, NStrain.count, Strain.Name2
+                FROM
+                        (ProbeSetData, ProbeSetFreeze,
+                         Strain, ProbeSet, ProbeSetXRef)
+                left join ProbeSetSE on
+                        (ProbeSetSE.DataId = ProbeSetData.Id AND ProbeSetSE.StrainId = ProbeSetData.StrainId)
+                left join NStrain on
+                        (NStrain.DataId = ProbeSetData.Id AND
+                        NStrain.StrainId = ProbeSetData.StrainId)
+                WHERE
+                        ProbeSet.Name = '%s' AND ProbeSetXRef.ProbeSetId = ProbeSet.Id AND
+                        ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND
+                        ProbeSetFreeze.Name = '%s' AND
+                        ProbeSetXRef.DataId = ProbeSetData.Id AND
+                        ProbeSetData.StrainId = Strain.Id
+                Order BY
+                        Strain.Name
+                """
+
+    dataset_query["Publish"] = pheno_query
+    dataset_query["Geno"] = geno_query
+    dataset_query["ProbeSet"] = probeset_query
+
+    return dataset_query.get(dataset_type)
+
+
+@dataclass
+class Dataset:
+    """class for creating datasets"""
+    name: Optional[str] = None
+    dataset_type: Optional[str] = None
+    dataset_id: int = -1
+
+
+def create_mrna_tissue_dataset(dataset_name, dataset_type):
+    """an mrna assay is a quantitative assessment(assay) associated\
+    with an mrna trait.This used to be called probeset,but that term\
+    only referes specifically to the afffymetrix platform and is\
+    far too speficified"""
+
+    return Dataset(name=dataset_name, dataset_type=dataset_type)
+
+
+def dataset_type_getter(dataset_name, redis_instance=None) -> Optional[str]:
+    """given the dataset name fetch the type\
+    of the dataset this in turn  enables fetching\
+    the creation of the correct object could utilize\
+    redis for the case"""
+
+    results = redis_instance.get(dataset_name, None)
+
+    if results:
+        return results
+
+    return fetch_dataset_type_from_gn2_api(dataset_name)
+
+
+def fetch_dataset_type_from_gn2_api(dataset_name):
+    """this function is only called when the\
+    the redis is empty and does have the specificied\
+    dataset_type"""
+    # should only run once
+
+    dataset_structure = {}
+
+    map_dataset_to_new_type = {
+        "Phenotypes": "Publish",
+        "Genotypes": "Geno",
+        "MrnaTypes": "ProbeSet"
+    }
+
+    data = json.loads(requests.get(
+        GN2_BASE_URL + "/api/v_pre1/gen_dropdown", timeout=5).content)
+    _name = dataset_name
+    for species in data['datasets']:
+        for group in data['datasets'][species]:
+            for dataset_type in data['datasets'][species][group]:
+                for dataset in data['datasets'][species][group][dataset_type]:
+                    # assumes  the first is dataset_short_name
+                    short_dataset_name = next(
+                        item for item in dataset if item != "None" and item is not None)
+
+                    dataset_structure[short_dataset_name] = map_dataset_to_new_type.get(
+                        dataset_type, "MrnaTypes")
+    return dataset_structure
+
+
+def dataset_creator_store(dataset_type):
+    """function contains key value pairs for\
+    the function need to be called to create\
+    each dataset_type"""
+
+    dataset_obj = {
+        "ProbeSet": create_mrna_tissue_dataset
+    }
+
+    return dataset_obj[dataset_type]
+
+
+def create_dataset(dataset_type=None, dataset_name: str = None):
+    """function for creating new dataset  temp not implemented"""
+    if dataset_type is None:
+        dataset_type = dataset_type_getter(dataset_name)
+
+    dataset_creator = dataset_creator_store(dataset_type)
+    results = dataset_creator(
+        dataset_name=dataset_name, dataset_type=dataset_type)
+    return results
+
+
+def fetch_dataset_sample_id(samplelist: List, database, species: str) -> dict:
+    """fetch the strain ids from the db only if\
+    it is in the samplelist"""
+    # xtodo create an in clause for samplelist
+
+    strain_query = """
+        SELECT Strain.Name, Strain.Id FROM Strain, Species
+        WHERE Strain.Name IN {}
+        and Strain.SpeciesId=Species.Id
+        and Species.name = '{}'
+        """
+
+    database_cursor = database.cursor()
+    database_cursor.execute(strain_query.format(samplelist, species))
+
+    results = database_cursor.fetchall()
+
+    return dict(results)
+
+
+def divide_into_chunks(the_list, number_chunks):
+    """Divides a list into approximately number_chunks
+    >>> divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3)
+    [[1, 2, 7], [3, 22, 8], [5, 22, 333]]"""
+
+    length = len(the_list)
+    if length == 0:
+        return [[]]
+
+    if length <= number_chunks:
+        number_chunks = length
+    chunk_size = int(ceil(length/number_chunks))
+    chunks = []
+
+    for counter in range(0, length, chunk_size):
+        chunks.append(the_list[counter:counter+chunk_size])
+    return chunks
+
+
+def escape(string_):
+    """function escape sql value"""
+    return escape_string(string_).decode('utf8')
+
+
+def mescape(*items) -> List:
+    """multiple escape for query values"""
+
+    return [escape_string(str(item)).decode('utf8') for item in items]
+
+
+def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
+    """function to fetch trait data"""
+    # MySQL limits the number of tables that can be used in a join to 61,
+    # so we break the sample ids into smaller chunks
+    # Postgres doesn't have that limit, so we can get rid of this after we transition
+
+    _trait_data = defaultdict(list)
+    chunk_size = 61
+    number_chunks = int(ceil(len(sample_ids) / chunk_size))
+    for sample_ids_step in divide_into_chunks(sample_ids, number_chunks):
+        if dataset_type == "Publish":
+            full_dataset_type = "Phenotype"
+        else:
+            full_dataset_type = dataset_type
+        temp = ['T%s.value' % item for item in sample_ids_step]
+
+        if dataset_type == "Publish":
+            query = "SELECT {}XRef.Id,".format(escape(dataset_type))
+
+        else:
+            query = "SELECT {}.Name,".format(escape(full_dataset_type))
+
+        query += ', '.join(temp)
+        query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(full_dataset_type,
+                                                                 dataset_type,
+                                                                 dataset_type))
+        for item in sample_ids_step:
+
+            query += """
+                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                    and T{}.StrainId={}\n
+                    """.format(*mescape(dataset_type, item,
+                                        item, dataset_type, item, item))
+
+        if dataset_type == "Publish":
+            query += """
+                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                        and {}Freeze.Name = '{}'
+                        and {}.Id = {}XRef.{}Id
+                        order by {}.Id
+                        """.format(*mescape(dataset_type, dataset_type,
+                                            dataset_type, dataset_type,
+                                            dataset_name, full_dataset_type,
+                                            dataset_type, dataset_type,
+                                            full_dataset_type))
+
+        else:
+            query += """
+                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                        and {}Freeze.Name = '{}'
+                        and {}.Id = {}XRef.{}Id
+                        order by {}.Id
+                        """.format(*mescape(dataset_type, dataset_type,
+                                            dataset_type, dataset_type,
+                                            dataset_name, dataset_type,
+                                            dataset_type, dataset_type,
+                                            full_dataset_type))
+
+        # print(query)
+
+        _results = fetch_from_db_sample_data(query, database_instance)
+    return {}
diff --git a/gn3/computations/traits.py b/gn3/computations/traits.py
new file mode 100644
index 0000000..1aa2970
--- /dev/null
+++ b/gn3/computations/traits.py
@@ -0,0 +1,56 @@
+"""module contains all operating related to traits"""
+from gn3.computations.datasets import retrieve_trait_sample_data
+
+
+def fetch_trait(dataset, trait_name: str, database) -> dict:
+    """this method creates a trait by\
+    fetching required data given the\
+    dataset and trait_name"""
+
+    created_trait = {
+        "dataset": dataset,
+        "trait_name": trait_name
+    }
+
+    trait_data = get_trait_sample_data(dataset, trait_name, database)
+
+    created_trait["trait_data"] = trait_data
+
+    return created_trait
+
+
+def get_trait_sample_data(trait_dataset, trait_name, database) -> dict:
+    """first try to fetch the traits sample data from redis if that\
+    try to fetch from the traits dataset redis is only  used for\
+    temp dataset type which is not used in this case """
+
+    sample_results = retrieve_trait_sample_data(
+        trait_dataset, trait_name, database)
+
+    trait_data = {}
+
+    for (name, sample_value, _variance, _numcase, _name2) in sample_results:
+
+        trait_data[name] = sample_value
+    return trait_data
+
+
+def get_trait_info_data(trait_dataset,
+                        trait_name: str,
+                        database_instance,
+                        get_qtl_info: bool = False) -> dict:
+    """given a dataset and trait_name return a dict containing all info\
+    regarding the get trait"""
+
+    _temp_var_holder = (trait_dataset, trait_name,
+                        database_instance, get_qtl_info)
+    trait_info_data = {
+        "description": "",
+        "chr": "",
+        "locus": "",
+        "mb": "",
+        "abbreviation": "",
+        "trait_display_name": ""
+
+    }
+    return trait_info_data