about summary refs log tree commit diff
path: root/gn3/computations/datasets.py
diff options
context:
space:
mode:
Diffstat (limited to 'gn3/computations/datasets.py')
-rw-r--r--gn3/computations/datasets.py371
1 files changed, 0 insertions, 371 deletions
diff --git a/gn3/computations/datasets.py b/gn3/computations/datasets.py
deleted file mode 100644
index b69583e..0000000
--- a/gn3/computations/datasets.py
+++ /dev/null
@@ -1,371 +0,0 @@
-"""module contains the code all related to datasets"""
-import json
-from math import ceil
-from collections import defaultdict
-
-from typing import Optional
-from typing import List
-
-from dataclasses import dataclass
-from MySQLdb import escape_string  # type: ignore
-
-import requests
-from gn3.settings import GN2_BASE_URL
-
-
-def retrieve_trait_sample_data(dataset,
-                               trait_name: str,
-                               database,
-                               group_species_id=None) -> List:
-    """given the dataset id and trait_name fetch the\
-    sample_name,value from the dataset"""
-
-    # should pass the db as arg all  do a setup
-
-    (dataset_name, dataset_id, dataset_type) = (dataset.get("name"), dataset.get(
-        "id"), dataset.get("type"))
-
-    dataset_query = get_query_for_dataset_sample(dataset_type)
-    results = []
-    sample_query_values = {
-        "Publish": (trait_name, dataset_id),
-        "Geno": (group_species_id, trait_name, dataset_name),
-        "ProbeSet": (trait_name, dataset_name)
-    }
-
-    if dataset_query:
-        formatted_query = dataset_query % sample_query_values[dataset_type]
-
-        results = fetch_from_db_sample_data(formatted_query, database)
-
-    return results
-
-
-def fetch_from_db_sample_data(formatted_query: str, database_instance) -> List:
-    """this is the function that does the actual fetching of\
-    results from the database"""
-    try:
-        cursor = database_instance.cursor()
-        cursor.execute(formatted_query)
-        results = cursor.fetchall()
-
-    except Exception as error:
-        raise error
-
-    cursor.close()
-
-    return results
-
-
-def get_query_for_dataset_sample(dataset_type) -> Optional[str]:
-    """this functions contains querys for\
-    getting sample data from the db depending in
-    dataset"""
-    dataset_query = {}
-
-    pheno_query = """
-                SELECT
-                        Strain.Name, PublishData.value, PublishSE.error,NStrain.count, Strain.Name2
-                FROM
-                        (PublishData, Strain, PublishXRef, PublishFreeze)
-                left join PublishSE on
-                        (PublishSE.DataId = PublishData.Id AND PublishSE.StrainId = PublishData.StrainId)
-                left join NStrain on
-                        (NStrain.DataId = PublishData.Id AND
-                        NStrain.StrainId = PublishData.StrainId)
-                WHERE
-                        PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND
-                        PublishData.Id = PublishXRef.DataId AND PublishXRef.Id = %s AND
-                        PublishFreeze.Id = %s AND PublishData.StrainId = Strain.Id
-                Order BY
-                        Strain.Name
-                """
-    geno_query = """
-                SELECT
-                        Strain.Name, GenoData.value, GenoSE.error, "N/A", Strain.Name2
-                FROM
-                        (GenoData, GenoFreeze, Strain, Geno, GenoXRef)
-                left join GenoSE on
-                        (GenoSE.DataId = GenoData.Id AND GenoSE.StrainId = GenoData.StrainId)
-                WHERE
-                        Geno.SpeciesId = %s AND Geno.Name = %s AND GenoXRef.GenoId = Geno.Id AND
-                        GenoXRef.GenoFreezeId = GenoFreeze.Id AND
-                        GenoFreeze.Name = %s AND
-                        GenoXRef.DataId = GenoData.Id AND
-                        GenoData.StrainId = Strain.Id
-                Order BY
-                        Strain.Name
-                """
-
-    probeset_query = """
-                SELECT
-                        Strain.Name, ProbeSetData.value, ProbeSetSE.error, NStrain.count, Strain.Name2
-                FROM
-                        (ProbeSetData, ProbeSetFreeze,
-                         Strain, ProbeSet, ProbeSetXRef)
-                left join ProbeSetSE on
-                        (ProbeSetSE.DataId = ProbeSetData.Id AND ProbeSetSE.StrainId = ProbeSetData.StrainId)
-                left join NStrain on
-                        (NStrain.DataId = ProbeSetData.Id AND
-                        NStrain.StrainId = ProbeSetData.StrainId)
-                WHERE
-                        ProbeSet.Name = '%s' AND ProbeSetXRef.ProbeSetId = ProbeSet.Id AND
-                        ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND
-                        ProbeSetFreeze.Name = '%s' AND
-                        ProbeSetXRef.DataId = ProbeSetData.Id AND
-                        ProbeSetData.StrainId = Strain.Id
-                Order BY
-                        Strain.Name
-                """
-
-    dataset_query["Publish"] = pheno_query
-    dataset_query["Geno"] = geno_query
-    dataset_query["ProbeSet"] = probeset_query
-
-    return dataset_query.get(dataset_type)
-
-
-@dataclass
-class Dataset:
-    """class for creating datasets"""
-    name: Optional[str] = None
-    dataset_type: Optional[str] = None
-    dataset_id: int = -1
-
-
-def create_mrna_tissue_dataset(dataset_name, dataset_type):
-    """an mrna assay is a quantitative assessment(assay) associated\
-    with an mrna trait.This used to be called probeset,but that term\
-    only referes specifically to the afffymetrix platform and is\
-    far too speficified"""
-
-    return Dataset(name=dataset_name, dataset_type=dataset_type)
-
-
-def dataset_type_getter(dataset_name, redis_instance=None) -> Optional[str]:
-    """given the dataset name fetch the type\
-    of the dataset this in turn  enables fetching\
-    the creation of the correct object could utilize\
-    redis for the case"""
-
-    results = redis_instance.get(dataset_name, None)
-
-    if results:
-        return results
-
-    return fetch_dataset_type_from_gn2_api(dataset_name)
-
-
-def fetch_dataset_type_from_gn2_api(dataset_name):
-    """this function is only called when the\
-    the redis is empty and does have the specificied\
-    dataset_type"""
-    # should only run once
-
-    dataset_structure = {}
-
-    map_dataset_to_new_type = {
-        "Phenotypes": "Publish",
-        "Genotypes": "Geno",
-        "MrnaTypes": "ProbeSet"
-    }
-
-    data = json.loads(requests.get(
-        GN2_BASE_URL + "/api/v_pre1/gen_dropdown", timeout=5).content)
-    _name = dataset_name
-    for species in data['datasets']:
-        for group in data['datasets'][species]:
-            for dataset_type in data['datasets'][species][group]:
-                for dataset in data['datasets'][species][group][dataset_type]:
-                    # assumes  the first is dataset_short_name
-                    short_dataset_name = next(
-                        item for item in dataset if item != "None" and item is not None)
-
-                    dataset_structure[short_dataset_name] = map_dataset_to_new_type.get(
-                        dataset_type, "MrnaTypes")
-    return dataset_structure
-
-
-def dataset_creator_store(dataset_type):
-    """function contains key value pairs for\
-    the function need to be called to create\
-    each dataset_type"""
-
-    dataset_obj = {
-        "ProbeSet": create_mrna_tissue_dataset
-    }
-
-    return dataset_obj[dataset_type]
-
-
-def create_dataset(dataset_type=None, dataset_name: str = None):
-    """function for creating new dataset  temp not implemented"""
-    if dataset_type is None:
-        dataset_type = dataset_type_getter(dataset_name)
-
-    dataset_creator = dataset_creator_store(dataset_type)
-    results = dataset_creator(
-        dataset_name=dataset_name, dataset_type=dataset_type)
-    return results
-
-
-def fetch_dataset_sample_id(samplelist: List, database, species: str) -> dict:
-    """fetch the strain ids from the db only if\
-    it is in the samplelist"""
-    # xtodo create an in clause for samplelist
-
-    strain_query = """
-        SELECT Strain.Name, Strain.Id FROM Strain, Species
-        WHERE Strain.Name IN {}
-        and Strain.SpeciesId=Species.Id
-        and Species.name = '{}'
-        """
-
-    database_cursor = database.cursor()
-    database_cursor.execute(strain_query.format(samplelist, species))
-
-    results = database_cursor.fetchall()
-
-    return dict(results)
-
-
-def divide_into_chunks(the_list, number_chunks):
-    """Divides a list into approximately number_chunks
-    >>> divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3)
-    [[1, 2, 7], [3, 22, 8], [5, 22, 333]]"""
-
-    length = len(the_list)
-    if length == 0:
-        return [[]]
-
-    if length <= number_chunks:
-        number_chunks = length
-    chunk_size = int(ceil(length/number_chunks))
-    chunks = []
-
-    for counter in range(0, length, chunk_size):
-        chunks.append(the_list[counter:counter+chunk_size])
-    return chunks
-
-
-def escape(string_):
-    """function escape sql value"""
-    return escape_string(string_).decode('utf8')
-
-
-def mescape(*items) -> List:
-    """multiple escape for query values"""
-
-    return [escape_string(str(item)).decode('utf8') for item in items]
-
-
-def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
-    """function to fetch trait data"""
-    # MySQL limits the number of tables that can be used in a join to 61,
-    # so we break the sample ids into smaller chunks
-    # Postgres doesn't have that limit, so we can get rid of this after we transition
-
-    _trait_data = defaultdict(list)
-    chunk_size = 61
-    number_chunks = int(ceil(len(sample_ids) / chunk_size))
-    for sample_ids_step in divide_into_chunks(sample_ids, number_chunks):
-        if dataset_type == "Publish":
-            full_dataset_type = "Phenotype"
-        else:
-            full_dataset_type = dataset_type
-        temp = ['T%s.value' % item for item in sample_ids_step]
-
-        if dataset_type == "Publish":
-            query = "SELECT {}XRef.Id,".format(escape(dataset_type))
-
-        else:
-            query = "SELECT {}.Name,".format(escape(full_dataset_type))
-
-        query += ', '.join(temp)
-        query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(full_dataset_type,
-                                                                 dataset_type,
-                                                                 dataset_type))
-        for item in sample_ids_step:
-
-            query += """
-                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
-                    and T{}.StrainId={}\n
-                    """.format(*mescape(dataset_type, item,
-                                        item, dataset_type, item, item))
-
-        if dataset_type == "Publish":
-            query += """
-                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
-                        and {}Freeze.Name = '{}'
-                        and {}.Id = {}XRef.{}Id
-                        order by {}.Id
-                        """.format(*mescape(dataset_type, dataset_type,
-                                            dataset_type, dataset_type,
-                                            dataset_name, full_dataset_type,
-                                            dataset_type, dataset_type,
-                                            full_dataset_type))
-
-        else:
-            query += """
-                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
-                        and {}Freeze.Name = '{}'
-                        and {}.Id = {}XRef.{}Id
-                        order by {}.Id
-                        """.format(*mescape(dataset_type, dataset_type,
-                                            dataset_type, dataset_type,
-                                            dataset_name, dataset_type,
-                                            dataset_type, dataset_type,
-                                            full_dataset_type))
-
-        # print(query)
-
-        _results = fetch_from_db_sample_data(query, database_instance)
-
-        return []
-
-
-def get_probeset_trait_data(strain_ids: List, conn, dataset_name) -> dict:
-    """function for getting trait data\
-    for probeset data type similar to\
-    get trait data only difference is that\
-    it uses sub queries"""
-
-    trait_data: dict = {}
-
-    trait_id_name = {}
-
-    traits_query = """
-    SELECT ProbeSetXRef.DataId,ProbeSet.Name FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
-                    WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
-                    and ProbeSetFreeze.Name = '{}'
-                    and ProbeSet.Id = ProbeSetXRef.ProbeSetId
-                    order by ProbeSet.Id
-    """.format(dataset_name)
-
-    query = """
-    SELECT * from ProbeSetData
-    where StrainID in ({})
-    and id in (SELECT ProbeSetXRef.DataId FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
-                WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
-                and ProbeSetFreeze.Name = '{}'
-                and ProbeSet.Id = ProbeSetXRef.ProbeSetId
-                order by ProbeSet.Id)
-    """.format(",".join(str(strain_id) for strain_id in strain_ids), dataset_name)
-
-    with conn:
-        cursor = conn.cursor()
-        cursor.execute(query)
-        _results = cursor.fetchall()
-        cursor.execute(traits_query)
-        trait_id_name = dict(cursor.fetchall())
-
-    for trait_id, _strain_id, strain_value in _results:
-        trait_name = trait_id_name[trait_id]
-        if trait_data.get(trait_name):
-            trait_data[trait_name].append(strain_value)
-        else:
-            trait_data[trait_name] = []
-
-            trait_data[trait_name].append(strain_value)
-
-    return trait_data