about summary refs log tree commit diff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
author    Zachary Sloan  2013-08-14 17:25:52 -0500
committer Zachary Sloan  2013-08-14 17:25:52 -0500
commit    26b1883a8fe4053b59833178f44e047157f2fc9c (patch)
tree      d2359e5952f6062d75ba1f43f1804a7b6df284c9 /wqflask/base/data_set.py
parent    eea1c38a9851f31011787b27c14365211a06ea51 (diff)
parent    6379959af53b2ec595b85ccdc099c6f14adf0381 (diff)
download  genenetwork2-26b1883a8fe4053b59833178f44e047157f2fc9c.tar.gz
Merge branch 'master' of https://github.com/zsloan/genenetwork
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-x  wqflask/base/data_set.py | 447
1 file changed, 334 insertions, 113 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 07fe9cd9..091433a6 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -16,8 +16,6 @@
 # Contact Drs. Robert W. Williams and Xiaodong Zhou (2010)
 # at rwilliams@uthsc.edu and xzhou15@uthsc.edu
 #
-#we
-#
 # This module is used by GeneNetwork project (www.genenetwork.org)
 
 from __future__ import absolute_import, print_function, division
@@ -27,8 +25,13 @@ import string
 import collections
 
 import json
+import gzip
+import cPickle as pickle
 import itertools
 
+from redis import Redis
+Redis = Redis()
+
 from flask import Flask, g
 
 import reaper
@@ -40,35 +43,101 @@ from utility import webqtlUtil
 from utility.benchmark import Bench
 from wqflask.my_pylmm.pyLMM import chunks
 
+from maintenance import get_group_samplelists
+
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
 
 # Used by create_database to instantiate objects
+# Each subclass will add to this
 DS_NAME_MAP = {}
 
-def create_dataset(dataset_name):
-    #print("dataset_name:", dataset_name)
+def create_dataset(dataset_name, dataset_type = None):
+    if not dataset_type:
+        dataset_type = Dataset_Getter(dataset_name)
+        #dataset_type = get_dataset_type_from_json(dataset_name)
 
-    query = """
-        SELECT DBType.Name
-        FROM DBList, DBType
-        WHERE DBList.Name = '{}' and
-              DBType.Id = DBList.DBTypeId
-        """.format(escape(dataset_name))
-    #print("query is: ", pf(query))
-    dataset_type = g.db.execute(query).fetchone().Name
+        print("dataset_type is:", dataset_type)
+        #query = """
+        #    SELECT DBType.Name
+        #    FROM DBList, DBType
+        #    WHERE DBList.Name = '{}' and
+        #          DBType.Id = DBList.DBTypeId
+        #    """.format(escape(dataset_name))
+        #dataset_type = g.db.execute(query).fetchone().Name
 
-    #dataset_type = cursor.fetchone()[0]
-    #print("[blubber] dataset_type:", pf(dataset_type))
 
     dataset_ob = DS_NAME_MAP[dataset_type]
-    #dataset_class = getattr(data_set, dataset_ob)
-    #print("dataset_ob:", dataset_ob)
-    #print("DS_NAME_MAP:", pf(DS_NAME_MAP))
-
     dataset_class = globals()[dataset_ob]
     return dataset_class(dataset_name)
 
+
+#def get_dataset_type_from_json(dataset_name):
+    
+class Dataset_Types(object):
+    
+    def __init__(self):
+        self.datasets = {}
+        file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
+        with open(file_name, 'r') as fh:
+            data = json.load(fh)
+        
+        print("*" * 70)
+        for species in data['datasets']:
+            for group in data['datasets'][species]:
+                for dataset_type in data['datasets'][species][group]:
+                    for dataset in data['datasets'][species][group][dataset_type]:
+                        print("dataset is:", dataset)
+                        
+                        short_dataset_name = dataset[0]
+                        if dataset_type == "Phenotypes":
+                            new_type = "Publish"
+                        elif dataset_type == "Genotypes":
+                            new_type = "Geno"
+                        else:
+                            new_type = "ProbeSet"
+                        self.datasets[short_dataset_name] = new_type
+                            
+    def __call__(self, name):
+        return self.datasets[name]
+    
+# Do the intensive work at startup one time only
+Dataset_Getter = Dataset_Types()
+
+#
+#print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611"))
+                    
+
+def create_datasets_list():
+    key = "all_datasets"
+    result = Redis.get(key)
+    
+    if result:
+        print("Cache hit!!!")
+        datasets = pickle.loads(result)
+        
+    else:
+        datasets = list()
+        with Bench("Creating DataSets object"):
+            type_dict = {'Publish': 'PublishFreeze',
+                         'ProbeSet': 'ProbeSetFreeze',
+                         'Geno': 'GenoFreeze'}
+        
+            for dataset_type in type_dict:
+                query = "SELECT Name FROM {}".format(type_dict[dataset_type])
+                for result in g.db.execute(query).fetchall():
+                    #The query at the beginning of this function isn't necessary here, but still would
+                    #rather just reuse it
+                    #print("type: {}\tname: {}".format(dataset_type, result.Name))
+                    dataset = create_dataset(result.Name, dataset_type)
+                    datasets.append(dataset)
+            
+        Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL))
+        Redis.expire(key, 60*60)
+    
+    return datasets
+
+
 def create_in_clause(items):
     """Create an in clause for mysql"""
     in_clause = ', '.join("'{}'".format(x) for x in mescape(*items))
@@ -99,7 +168,8 @@ class Markers(object):
         
         for marker, p_value in itertools.izip(self.markers, p_values):
             marker['p_value'] = p_value
-            print("p_value is:", marker['p_value'])
+            if math.isnan(marker['p_value']):
+                print("p_value is:", marker['p_value'])
             marker['lod_score'] = -math.log10(marker['p_value'])
             #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
             marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
@@ -167,8 +237,8 @@ class DatasetGroup(object):
         
         self.incparentsf1 = False
         self.allsamples = None
-        
-        
+
+
     def get_markers(self):
         #print("self.species is:", self.species)
         if self.species == "human":
@@ -177,7 +247,7 @@ class DatasetGroup(object):
             marker_class = Markers
 
         self.markers = marker_class(self.name)
-        
+
 
     def get_f1_parent_strains(self):
         try:
@@ -190,7 +260,29 @@ class DatasetGroup(object):
             self.f1list = [f1, f12]
         if maternal and paternal:
             self.parlist = [maternal, paternal]
-            
+
+    def get_samplelist(self):
+        key = "samplelist:v4:" + self.name
+        print("key is:", key)
+        with Bench("Loading cache"):
+            result = Redis.get(key)
+
+        if result:
+            print("Sample List Cache hit!!!")
+            print("Before unjsonifying {}: {}".format(type(result), result))
+            self.samplelist = json.loads(result)
+            print("  type: ", type(self.samplelist))
+            print("  self.samplelist: ", self.samplelist)
+        else:
+            print("Cache not hit")
+            try:
+                self.samplelist = get_group_samplelists.get_samplelist(self.name + ".geno")
+            except IOError:
+                self.samplelist = None
+            print("after get_samplelist")
+            Redis.set(key, json.dumps(self.samplelist))
+            Redis.expire(key, 60*5)
+
     def read_genotype_file(self):
         '''Read genotype from .geno file instead of database'''
         #if self.group == 'BXD300':
@@ -205,7 +297,18 @@ class DatasetGroup(object):
 
         # reaper barfs on unicode filenames, so here we ensure it's a string
         full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno'))
-        genotype_1.read(full_filename)
+        if os.path.isfile(full_filename):
+            print("Reading file: ", full_filename)
+            genotype_1.read(full_filename)
+            print("File read")
+        else:
+            try:
+                full_filename = str(os.path.join(webqtlConfig.TMPDIR, self.name + '.geno'))
+                #print("Reading file")
+                genotype_1.read(full_filename)
+                #print("File read")
+            except IOError:
+                print("File doesn't exist!")
 
         if genotype_1.type == "group" and self.parlist:
             genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1])       #, F1=_f1)
@@ -222,6 +325,27 @@ class DatasetGroup(object):
         self.samplelist = list(genotype.prgy)
 
 
+#class DataSets(object):
+#    """Builds a list of DataSets"""
+#    
+#    def __init__(self):
+#        self.datasets = list()
+#        
+
+        
+        #query = """SELECT Name FROM ProbeSetFreeze
+        #           UNION
+        #           SELECT Name From PublishFreeze
+        #           UNION
+        #           SELECT Name From GenoFreeze"""
+        #
+        #for result in g.db.execute(query).fetchall():
+        #    dataset = DataSet(result.Name)
+        #    self.datasets.append(dataset)
+
+#ds = DataSets()
+#print("[orange] ds:", ds.datasets)
+
 class DataSet(object):
     """
     DataSet class defines a dataset in webqtl, can be either Microarray,
@@ -234,6 +358,8 @@ class DataSet(object):
         assert name, "Need a name"
         self.name = name
         self.id = None
+        self.shortname = None
+        self.fullname = None
         self.type = None
 
         self.setup()
@@ -243,7 +369,7 @@ class DataSet(object):
         self.retrieve_other_names()
         
         self.group = DatasetGroup(self)   # sets self.group and self.group_id and gets genotype
-        self.group.read_genotype_file()
+        self.group.get_samplelist()
         self.species = species.TheSpecies(self)
 
 
@@ -277,7 +403,6 @@ class DataSet(object):
     #    return self._group
 
 
-
     def retrieve_other_names(self):
         """
         If the data set name parameter is not found in the 'Name' field of the data set table,
@@ -293,7 +418,7 @@ class DataSet(object):
             self.name,
             self.name,
             self.name))
-        #print("query_args are:", query_args)
+        print("query_args are:", query_args)
 
         #print("""
         #        SELECT Id, Name, FullName, ShortName
@@ -301,22 +426,109 @@ class DataSet(object):
         #        WHERE public > %s AND
         #             (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
         #  """ % (query_args))
+        
+        try:
+            self.id, self.name, self.fullname, self.shortname = g.db.execute("""
+                    SELECT Id, Name, FullName, ShortName
+                    FROM %s
+                    WHERE public > %s AND
+                         (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
+              """ % (query_args)).fetchone()
+        except TypeError:
+            print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
+            pass
+        
+    def get_trait_data(self):
+        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
+
+        # MySQL limits the number of tables that can be used in a join to 61,
+        # so we break the sample ids into smaller chunks
+        # Postgres doesn't have that limit, so we can get rid of this after we transition
+        chunk_size = 50
+        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+        trait_sample_data = []
+        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
 
-        self.id, self.name, self.fullname, self.shortname = g.db.execute("""
-                SELECT Id, Name, FullName, ShortName
-                FROM %s
-                WHERE public > %s AND
-                     (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
-          """ % (query_args)).fetchone()
+        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+        #tempTable = None
+        #if GeneId and db.type == "ProbeSet": 
+        #    if method == "3":
+        #        tempTable = self.getTempLiteratureTable(species=species,
+        #                                                input_species_geneid=GeneId,
+        #                                                returnNumber=returnNumber)
+        #
+        #    if method == "4" or method == "5":
+        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+        #                                        method=method,
+        #                                        returnNumber=returnNumber)
+        
+            if self.type == "Publish":
+                dataset_type = "Phenotype"
+            else:
+                dataset_type = self.type
+            temp = ['T%s.value' % item for item in sample_ids_step]
+            if self.type == "Publish":
+                query = "SELECT {}XRef.Id,".format(escape(self.type))
+            else:
+                query = "SELECT {}.Name,".format(escape(dataset_type))
+            data_start_pos = 1
+            query += string.join(temp, ', ')
+            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type,
+                                                                     self.type,
+                                                                     self.type))
+
+            for item in sample_ids_step:
+                query += """
+                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                        and T{}.StrainId={}\n
+                        """.format(*mescape(self.type, item, item, self.type, item, item))
+                        
+            if self.type == "Publish":
+                query += """
+                        WHERE {}XRef.PublicationId = {}Freeze.Id
+                        and {}Freeze.Name = '{}'
+                        and {}.Id = {}XRef.{}Id
+                        order by {}.Id
+                        """.format(*mescape(self.type, self.type, self.type, self.type,
+                                   self.name, dataset_type, self.type, self.type, dataset_type))
+            else:
+                query += """
+                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                        and {}Freeze.Name = '{}'
+                        and {}.Id = {}XRef.{}Id
+                        order by {}.Id
+                        """.format(*mescape(self.type, self.type, self.type, self.type,
+                                   self.name, dataset_type, self.type, self.type, dataset_type))
+            results = g.db.execute(query).fetchall()
+            trait_sample_data.append(results)
 
-        #self.cursor.execute(query)
-        #self.id, self.name, self.fullname, self.shortname = self.cursor.fetchone()
+        trait_count = len(trait_sample_data[0])
+        self.trait_data = collections.defaultdict(list)
         
+        # put all of the separate data together into a dictionary where the keys are
+        # trait names and values are lists of sample values
+        for trait_counter in range(trait_count):
+            trait_name = trait_sample_data[0][trait_counter][0]
+            for chunk_counter in range(int(number_chunks)):
+                self.trait_data[trait_name] += (
+                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
 
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
 
     def setup(self):
+        
+        print("IS A PHENOTYPEDATASET")
+        
         # Fields in the database table
         self.search_fields = ['Phenotype.Post_publication_description',
                             'Phenotype.Pre_publication_description',
@@ -387,14 +599,24 @@ class PhenotypeDataSet(DataSet):
     def get_trait_info(self, trait_list, species = ''):
         for this_trait in trait_list:
             if not this_trait.haveinfo:
-                this_trait.retrieveInfo(QTL=1)
+                this_trait.retrieve_info(get_qtl_info=True)
 
             description = this_trait.post_publication_description
+            
+            #If the dataset is confidential and the user has access to confidential
+            #phenotype traits, then display the pre-publication description instead
+            #of the post-publication description
             if this_trait.confidential:
                 continue   # for now
-                if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
+            
+                if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(
+                        privilege=self.privilege,
+                        userName=self.userName,
+                        authorized_users=this_trait.authorized_users):
+                        
                     description = this_trait.pre_publication_description
-            this_trait.description_display = unicode(description, "utf8")
+            
+            this_trait.description_display = description
 
             if not this_trait.year.isdigit():
                 this_trait.pubmed_text = "N/A"
@@ -632,73 +854,73 @@ class MrnaAssayDataSet(DataSet):
         #print("After retrieve_sample_data")
         return trait_data
     
-    def get_trait_data(self):
-        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
-        query = """
-            SELECT Strain.Name, Strain.Id FROM Strain, Species
-            WHERE Strain.Name IN {}
-            and Strain.SpeciesId=Species.Id
-            and Species.name = '{}'
-            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
-        results = dict(g.db.execute(query).fetchall())
-        sample_ids = [results[item] for item in self.samplelist]
-
-        # MySQL limits the number of tables that can be used in a join to 61,
-        # so we break the sample ids into smaller chunks
-        # Postgres doesn't have that limit, so we can get rid of this after we transition
-        chunk_size = 50
-        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
-        trait_sample_data = []
-        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
-
-        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
-        #tempTable = None
-        #if GeneId and db.type == "ProbeSet": 
-        #    if method == "3":
-        #        tempTable = self.getTempLiteratureTable(species=species,
-        #                                                input_species_geneid=GeneId,
-        #                                                returnNumber=returnNumber)
-        #
-        #    if method == "4" or method == "5":
-        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
-        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
-        #                                        method=method,
-        #                                        returnNumber=returnNumber)
-        
-            temp = ['T%s.value' % item for item in sample_ids_step]
-            query = "SELECT {}.Name,".format(escape(self.type))
-            data_start_pos = 1
-            query += string.join(temp, ', ')
-            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
-                                                                     self.type,
-                                                                     self.type))
-
-            for item in sample_ids_step:
-                query += """
-                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
-                        and T{}.StrainId={}\n
-                        """.format(*mescape(self.type, item, item, self.type, item, item))
-                        
-            query += """
-                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
-                    and {}Freeze.Name = '{}'
-                    and {}.Id = {}XRef.{}Id
-                    order by {}.Id
-                    """.format(*mescape(self.type, self.type, self.type, self.type,
-                               self.name, self.type, self.type, self.type, self.type))
-            results = g.db.execute(query).fetchall()
-            trait_sample_data.append(results)
-
-        trait_count = len(trait_sample_data[0])
-        self.trait_data = collections.defaultdict(list)
-        
-        # put all of the separate data together into a dictionary where the keys are
-        # trait names and values are lists of sample values
-        for trait_counter in range(trait_count):
-            trait_name = trait_sample_data[0][trait_counter][0]
-            for chunk_counter in range(int(number_chunks)):
-                self.trait_data[trait_name] += (
-                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+    #def get_trait_data(self):
+    #    self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+    #    query = """
+    #        SELECT Strain.Name, Strain.Id FROM Strain, Species
+    #        WHERE Strain.Name IN {}
+    #        and Strain.SpeciesId=Species.Id
+    #        and Species.name = '{}'
+    #        """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+    #    results = dict(g.db.execute(query).fetchall())
+    #    sample_ids = [results[item] for item in self.samplelist]
+    #
+    #    # MySQL limits the number of tables that can be used in a join to 61,
+    #    # so we break the sample ids into smaller chunks
+    #    # Postgres doesn't have that limit, so we can get rid of this after we transition
+    #    chunk_size = 50
+    #    number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+    #    trait_sample_data = []
+    #    for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+    #
+    #    #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+    #    #tempTable = None
+    #    #if GeneId and db.type == "ProbeSet": 
+    #    #    if method == "3":
+    #    #        tempTable = self.getTempLiteratureTable(species=species,
+    #    #                                                input_species_geneid=GeneId,
+    #    #                                                returnNumber=returnNumber)
+    #    #
+    #    #    if method == "4" or method == "5":
+    #    #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+    #    #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+    #    #                                        method=method,
+    #    #                                        returnNumber=returnNumber)
+    #    
+    #        temp = ['T%s.value' % item for item in sample_ids_step]
+    #        query = "SELECT {}.Name,".format(escape(self.type))
+    #        data_start_pos = 1
+    #        query += string.join(temp, ', ')
+    #        query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+    #                                                                 self.type,
+    #                                                                 self.type))
+    #
+    #        for item in sample_ids_step:
+    #            query += """
+    #                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
+    #                    and T{}.StrainId={}\n
+    #                    """.format(*mescape(self.type, item, item, self.type, item, item))
+    #                    
+    #        query += """
+    #                WHERE {}XRef.{}FreezeId = {}Freeze.Id
+    #                and {}Freeze.Name = '{}'
+    #                and {}.Id = {}XRef.{}Id
+    #                order by {}.Id
+    #                """.format(*mescape(self.type, self.type, self.type, self.type,
+    #                           self.name, self.type, self.type, self.type, self.type))
+    #        results = g.db.execute(query).fetchall()
+    #        trait_sample_data.append(results)
+    #
+    #    trait_count = len(trait_sample_data[0])
+    #    self.trait_data = collections.defaultdict(list)
+    #    
+    #    # put all of the separate data together into a dictionary where the keys are
+    #    # trait names and values are lists of sample values
+    #    for trait_counter in range(trait_count):
+    #        trait_name = trait_sample_data[0][trait_counter][0]
+    #        for chunk_counter in range(int(number_chunks)):
+    #            self.trait_data[trait_name] += (
+    #                trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
     
 
     def get_trait_info(self, trait_list=None, species=''):
@@ -779,14 +1001,14 @@ class MrnaAssayDataSet(DataSet):
 
             #Max LRS and its Locus location
             if this_trait.lrs and this_trait.locus:
-                self.cursor.execute("""
+                query = """
                     select Geno.Chr, Geno.Mb from Geno, Species
-                    where Species.Name = '%s' and
-                        Geno.Name = '%s' and
+                    where Species.Name = '{}' and
+                        Geno.Name = '{}' and
                         Geno.SpeciesId = Species.Id
-                """ % (species, this_trait.locus))
-                result = self.cursor.fetchone()
-
+                """.format(species, this_trait.locus)
+                result = g.db.execute(query).fetchone()
+                
                 if result:
                     #if result[0] and result[1]:
                     #    lrs_chr = result[0]
@@ -940,6 +1162,5 @@ def geno_mrna_confidentiality(ob):
      authorized_users) = result.fetchall()[0]
 
     if confidential:
-        # Allow confidential data later
-        NoConfindetialDataForYouTodaySorry
+        return True