about summary refs log tree commit diff
path: root/wqflask/base/data_set/dataset.py
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/base/data_set/dataset.py')
-rw-r--r--wqflask/base/data_set/dataset.py305
1 files changed, 0 insertions, 305 deletions
diff --git a/wqflask/base/data_set/dataset.py b/wqflask/base/data_set/dataset.py
deleted file mode 100644
index 435d74a9..00000000
--- a/wqflask/base/data_set/dataset.py
+++ /dev/null
@@ -1,305 +0,0 @@
-"Base Dataset class ..."
-
-import math
-import collections
-
-from redis import Redis
-
-from base import species
-from utility import chunks
-from utility.tools import get_setting
-from gn3.monads import MonadicDict, query_sql
-from pymonad.maybe import Maybe, Nothing
-from .datasetgroup import DatasetGroup
-from wqflask.database import database_connection
-from utility.db_tools import escape, mescape, create_in_clause
-from .utils import fetch_cached_results, cache_dataset_results
-
-
-class DataSet:
-    """
-    DataSet class defines a dataset in webqtl, can be either Microarray,
-    Published phenotype, genotype, or user input dataset(temp)
-
-    """
-
-    def __init__(self, name, get_samplelist=True, group_name=None, redis_conn=Redis()):
-
-        assert name, "Need a name"
-        self.name = name
-        self.id = None
-        self.shortname = None
-        self.fullname = None
-        self.type = None
-        self.data_scale = None  # ZS: For example log2
-        self.accession_id = Nothing
-
-        self.setup()
-
-        if self.type == "Temp":  # Need to supply group name as input if temp trait
-            # sets self.group and self.group_id and gets genotype
-            self.group = DatasetGroup(self, name=group_name)
-        else:
-            self.check_confidentiality()
-            self.retrieve_other_names()
-            # sets self.group and self.group_id and gets genotype
-            self.group = DatasetGroup(self)
-            self.accession_id = self.get_accession_id()
-        if get_samplelist == True:
-            self.group.get_samplelist(redis_conn)
-        self.species = species.TheSpecies(dataset=self)
-
-    def as_monadic_dict(self):
-        _result = MonadicDict({
-            'name': self.name,
-            'shortname': self.shortname,
-            'fullname': self.fullname,
-            'type': self.type,
-            'data_scale': self.data_scale,
-            'group': self.group.name
-        })
-        _result["accession_id"] = self.accession_id
-        return _result
-
-    def get_accession_id(self) -> Maybe[str]:
-        """Get the accession_id of this dataset depending on the
-        dataset type."""
-        __query = ""
-        with database_connection(get_setting("SQL_URI")) as conn:
-            if self.type == "Publish":
-                __query = (
-                    "SELECT InfoFiles.GN_AccesionId AS accession_id FROM "
-                    "InfoFiles, PublishFreeze, InbredSet "
-                    "WHERE InbredSet.Name = "
-                    f"'{conn.escape_string(self.group.name).decode()}' "
-                    "AND PublishFreeze.InbredSetId = InbredSet.Id "
-                    "AND InfoFiles.InfoPageName = PublishFreeze.Name "
-                    "AND PublishFreeze.public > 0 AND "
-                    "PublishFreeze.confidentiality < 1 "
-                    "ORDER BY PublishFreeze.CreateTime DESC"
-                )
-            elif self.type == "Geno":
-                __query = (
-                    "SELECT InfoFiles.GN_AccesionId AS accession_id FROM "
-                    "InfoFiles, GenoFreeze, InbredSet WHERE InbredSet.Name = "
-                    f"'{conn.escape_string(self.group.name).decode()}' AND "
-                    "GenoFreeze.InbredSetId = InbredSet.Id "
-                    "AND InfoFiles.InfoPageName = GenoFreeze.ShortName "
-                    "AND GenoFreeze.public > 0 AND "
-                    "GenoFreeze.confidentiality < 1 "
-                    "ORDER BY GenoFreeze.CreateTime DESC"
-                )
-            elif self.type == "ProbeSet":
-                __query = (
-                    "SELECT InfoFiles.GN_AccesionId AS accession_id "
-                    "FROM InfoFiles WHERE InfoFiles.InfoPageName = "
-                    f"'{conn.escape_string(self.name).decode()}'"
-                )
-            else:  # The Value passed is not present
-                raise LookupError
-
-            # Should there be an empty row, query_sql returns a None
-            # value instead of yielding a value; this block
-            # accomodates this non-intuitive edge-case
-            for result in query_sql(conn, __query) or ():
-                return result["accession_id"]
-        return Nothing
-
-    def retrieve_other_names(self):
-        """This method fetches the the dataset names in search_result.
-
-        If the data set name parameter is not found in the 'Name' field of
-        the data set table, check if it is actually the FullName or
-        ShortName instead.
-
-        This is not meant to retrieve the data set info if no name at
-        all is passed.
-
-        """
-        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
-            try:
-                if self.type == "ProbeSet":
-                    cursor.execute(
-                        "SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, "
-                        "ProbeSetFreeze.FullName, ProbeSetFreeze.ShortName, "
-                        "ProbeSetFreeze.DataScale, Tissue.Name "
-                        "FROM ProbeSetFreeze, ProbeFreeze, Tissue "
-                        "WHERE ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id "
-                        "AND ProbeFreeze.TissueId = Tissue.Id "
-                        "AND (ProbeSetFreeze.Name = %s OR "
-                        "ProbeSetFreeze.FullName = %s "
-                        "OR ProbeSetFreeze.ShortName = %s)",
-                        (self.name,)*3)
-                    (self.id, self.name, self.fullname, self.shortname,
-                    self.data_scale, self.tissue) = cursor.fetchone()
-                else:
-                    self.tissue = "N/A"
-                    cursor.execute(
-                        "SELECT Id, Name, FullName, ShortName "
-                        f"FROM {self.type}Freeze "
-                        "WHERE (Name = %s OR FullName = "
-                        "%s OR ShortName = %s)",
-                        (self.name,)*3)
-                    (self.id, self.name, self.fullname,
-                    self.shortname) = cursor.fetchone()
-            except TypeError:
-                pass
-
-    def chunk_dataset(self, dataset, n):
-
-        results = {}
-        traits_name_dict = ()
-        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
-            cursor.execute(
-                "SELECT ProbeSetXRef.DataId,ProbeSet.Name "
-                "FROM ProbeSet, ProbeSetXRef, ProbeSetFreeze "
-                "WHERE ProbeSetFreeze.Name = %s AND "
-                "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id "
-                "AND ProbeSetXRef.ProbeSetId = ProbeSet.Id",
-                (self.name,))
-            # should cache this
-            traits_name_dict = dict(cursor.fetchall())
-
-        for i in range(0, len(dataset), n):
-            matrix = list(dataset[i:i + n])
-            trait_name = traits_name_dict[matrix[0][0]]
-
-            my_values = [value for (trait_name, strain, value) in matrix]
-            results[trait_name] = my_values
-        return results
-
-    def get_probeset_data(self, sample_list=None, trait_ids=None):
-
-        # improvement of get trait data--->>>
-        if sample_list:
-            self.samplelist = sample_list
-
-        else:
-            self.samplelist = self.group.samplelist
-
-        if self.group.parlist != None and self.group.f1list != None:
-            if (self.group.parlist + self.group.f1list) in self.samplelist:
-                self.samplelist += self.group.parlist + self.group.f1list
-        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
-            cursor.execute(
-                "SELECT Strain.Name, Strain.Id FROM "
-                "Strain, Species WHERE Strain.Name IN "
-                f"{create_in_clause(self.samplelist)} "
-                "AND Strain.SpeciesId=Species.Id AND "
-                "Species.name = %s", (self.group.species,)
-            )
-            results = dict(cursor.fetchall())
-            sample_ids = [results[item] for item in self.samplelist]
-
-            sorted_samplelist = [strain_name for strain_name, strain_id in sorted(
-                results.items(), key=lambda item: item[1])]
-
-            cursor.execute(
-                "SELECT * from ProbeSetData WHERE StrainID IN "
-                f"{create_in_clause(sample_ids)} AND id IN "
-                "(SELECT ProbeSetXRef.DataId FROM "
-                "(ProbeSet, ProbeSetXRef, ProbeSetFreeze) "
-                "WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id "
-                "AND ProbeSetFreeze.Name = %s AND "
-                "ProbeSet.Id = ProbeSetXRef.ProbeSetId)",
-                (self.name,)
-            )
-
-            query_results = list(cursor.fetchall())
-            data_results = self.chunk_dataset(query_results, len(sample_ids))
-            self.samplelist = sorted_samplelist
-            self.trait_data = data_results
-
-    def get_trait_data(self, sample_list=None):
-        if sample_list:
-            self.samplelist = sample_list
-        else:
-            self.samplelist = self.group.samplelist
-
-        if self.group.parlist != None and self.group.f1list != None:
-            if (self.group.parlist + self.group.f1list) in self.samplelist:
-                self.samplelist += self.group.parlist + self.group.f1list
-
-        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
-            cursor.execute(
-                "SELECT Strain.Name, Strain.Id FROM Strain, Species "
-                f"WHERE Strain.Name IN {create_in_clause(self.samplelist)} "
-                "AND Strain.SpeciesId=Species.Id "
-                "AND Species.name = %s",
-                (self.group.species,)
-            )
-            results = dict(cursor.fetchall())
-            sample_ids = [
-                sample_id for sample_id in
-                (results.get(item) for item in self.samplelist
-                 if item is not None)
-                if sample_id is not None
-            ]
-
-            # MySQL limits the number of tables that can be used in a join to 61,
-            # so we break the sample ids into smaller chunks
-            # Postgres doesn't have that limit, so we can get rid of this after we transition
-            chunk_size = 50
-            number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
-
-            cached_results = fetch_cached_results(self.name, self.type, self.samplelist)
-
-            if cached_results is None:
-                trait_sample_data = []
-                for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
-                    if self.type == "Publish":
-                        dataset_type = "Phenotype"
-                    else:
-                        dataset_type = self.type
-                    temp = ['T%s.value' % item for item in sample_ids_step]
-                    if self.type == "Publish":
-                        query = "SELECT {}XRef.Id".format(escape(self.type))
-                    else:
-                        query = "SELECT {}.Name".format(escape(dataset_type))
-                    data_start_pos = 1
-                    if len(temp) > 0:
-                        query = query + ", " + ', '.join(temp)
-                    query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type,
-                                                                             self.type,
-                                                                             self.type))
-
-                    for item in sample_ids_step:
-                        query += """
-                                left join {}Data as T{} on T{}.Id = {}XRef.DataId
-                                and T{}.StrainId={}\n
-                                """.format(*mescape(self.type, item, item, self.type, item, item))
-
-                    if self.type == "Publish":
-                        query += """
-                                WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
-                                and {}Freeze.Name = '{}'
-                                and {}.Id = {}XRef.{}Id
-                                order by {}.Id
-                                """.format(*mescape(self.type, self.type, self.type, self.name,
-                                                    dataset_type, self.type, dataset_type, dataset_type))
-                    else:
-                        query += """
-                                WHERE {}XRef.{}FreezeId = {}Freeze.Id
-                                and {}Freeze.Name = '{}'
-                                and {}.Id = {}XRef.{}Id
-                                order by {}.Id
-                                """.format(*mescape(self.type, self.type, self.type, self.type,
-                                                    self.name, dataset_type, self.type, self.type, dataset_type))
-                    cursor.execute(query)
-                    results = cursor.fetchall()
-                    trait_sample_data.append([list(result) for result in results])
-
-                trait_count = len(trait_sample_data[0])
-                self.trait_data = collections.defaultdict(list)
-
-                data_start_pos = 1
-                for trait_counter in range(trait_count):
-                    trait_name = trait_sample_data[0][trait_counter][0]
-                    for chunk_counter in range(int(number_chunks)):
-                        self.trait_data[trait_name] += (
-                            trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
-
-                cache_dataset_results(
-                    self.name, self.type, self.samplelist, self.trait_data)
-            else:
-                self.trait_data = cached_results