about summary refs log tree commit diff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
author	Lei Yan	2013-09-13 14:07:27 -0500
committer	Lei Yan	2013-09-13 14:07:27 -0500
commit	af24c0d610d9a2189f86677e4f23deb372ee2bf7 (patch)
tree	53480351b97727670637a37dbd4c78e52446ae88 /wqflask/base/data_set.py
parent	155e2997613c0750de30b734686f8977524956f9 (diff)
parent	c5fc931621707865357ace4b637db7481e0be552 (diff)
download	genenetwork2-af24c0d610d9a2189f86677e4f23deb372ee2bf7.tar.gz
Merge https://github.com/zsloan/genenetwork
Resolved conflicts:
	wqflask/base/trait.py
	wqflask/wqflask/correlation/correlationFunction.py
	wqflask/wqflask/correlation/correlation_function.py
	wqflask/wqflask/correlation/correlation_functions.py
	wqflask/wqflask/correlation/show_corr_results.py
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-x	wqflask/base/data_set.py	366
1 file changed, 264 insertions, 102 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 03b24230..96e04df0 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -16,8 +16,6 @@
 # Contact Drs. Robert W. Williams and Xiaodong Zhou (2010)
 # at rwilliams@uthsc.edu and xzhou15@uthsc.edu
 #
-#we
-#
 # This module is used by GeneNetwork project (www.genenetwork.org)
 
 from __future__ import absolute_import, print_function, division
@@ -27,6 +25,7 @@ import string
 import collections
 
 import json
+import gzip
 import cPickle as pickle
 import itertools
 
@@ -44,36 +43,71 @@ from utility import webqtlUtil
 from utility.benchmark import Bench
 from wqflask.my_pylmm.pyLMM import chunks
 
+from maintenance import get_group_samplelists
+
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
 
 # Used by create_database to instantiate objects
+# Each subclass will add to this
 DS_NAME_MAP = {}
 
 def create_dataset(dataset_name, dataset_type = None):
-    #print("dataset_name:", dataset_name)
-
     if not dataset_type:
-        query = """
-            SELECT DBType.Name
-            FROM DBList, DBType
-            WHERE DBList.Name = '{}' and
-                  DBType.Id = DBList.DBTypeId
-            """.format(escape(dataset_name))
-        #print("query is: ", pf(query))
-        dataset_type = g.db.execute(query).fetchone().Name
+        dataset_type = Dataset_Getter(dataset_name)
+        #dataset_type = get_dataset_type_from_json(dataset_name)
 
-    #dataset_type = cursor.fetchone()[0]
-    #print("[blubber] dataset_type:", pf(dataset_type))
+        print("dataset_type is:", dataset_type)
+        #query = """
+        #    SELECT DBType.Name
+        #    FROM DBList, DBType
+        #    WHERE DBList.Name = '{}' and
+        #          DBType.Id = DBList.DBTypeId
+        #    """.format(escape(dataset_name))
+        #dataset_type = g.db.execute(query).fetchone().Name
 
-    dataset_ob = DS_NAME_MAP[dataset_type]
-    #dataset_class = getattr(data_set, dataset_ob)
-    #print("dataset_ob:", dataset_ob)
-    #print("DS_NAME_MAP:", pf(DS_NAME_MAP))
 
+    dataset_ob = DS_NAME_MAP[dataset_type]
     dataset_class = globals()[dataset_ob]
     return dataset_class(dataset_name)
 
+
+#def get_dataset_type_from_json(dataset_name):
+    
+class Dataset_Types(object):
+    
+    def __init__(self):
+        self.datasets = {}
+        file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
+        with open(file_name, 'r') as fh:
+            data = json.load(fh)
+        
+        print("*" * 70)
+        for species in data['datasets']:
+            for group in data['datasets'][species]:
+                for dataset_type in data['datasets'][species][group]:
+                    for dataset in data['datasets'][species][group][dataset_type]:
+                        print("dataset is:", dataset)
+                        
+                        short_dataset_name = dataset[0]
+                        if dataset_type == "Phenotypes":
+                            new_type = "Publish"
+                        elif dataset_type == "Genotypes":
+                            new_type = "Geno"
+                        else:
+                            new_type = "ProbeSet"
+                        self.datasets[short_dataset_name] = new_type
+                            
+    def __call__(self, name):
+        return self.datasets[name]
+    
+# Do the intensive work at startup one time only
+Dataset_Getter = Dataset_Types()
+
+#
+#print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611"))
+                    
+
 def create_datasets_list():
     key = "all_datasets"
     result = Redis.get(key)
@@ -94,7 +128,7 @@ def create_datasets_list():
                 for result in g.db.execute(query).fetchall():
                     #The query at the beginning of this function isn't necessary here, but still would
                     #rather just reuse it
-                    print("type: {}\tname: {}".format(dataset_type, result.Name))
+                    #print("type: {}\tname: {}".format(dataset_type, result.Name))
                     dataset = create_dataset(result.Name, dataset_type)
                     datasets.append(dataset)
             
@@ -134,12 +168,13 @@ class Markers(object):
         
         for marker, p_value in itertools.izip(self.markers, p_values):
             marker['p_value'] = p_value
-            print("p_value is:", marker['p_value'])
-            marker['lod_score'] = -math.log10(marker['p_value'])
-            #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
-            marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
-        
-        
+            if marker['p_value'] == 0:
+                marker['lod_score'] = 0
+                marker['lrs_value'] = 0
+            else:
+                marker['lod_score'] = -math.log10(marker['p_value'])
+                #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+                marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
 
 
 class HumanMarkers(Markers):
@@ -154,8 +189,6 @@ class HumanMarkers(Markers):
             marker['name'] = splat[1]
             marker['Mb'] = float(splat[3]) / 1000000
             self.markers.append(marker)
-            
-        #print("markers is: ", pf(self.markers))
 
 
     def add_pvalues(self, p_values):
@@ -212,7 +245,7 @@ class DatasetGroup(object):
             marker_class = Markers
 
         self.markers = marker_class(self.name)
-        
+
 
     def get_f1_parent_strains(self):
         try:
@@ -225,7 +258,29 @@ class DatasetGroup(object):
             self.f1list = [f1, f12]
         if maternal and paternal:
             self.parlist = [maternal, paternal]
-            
+
+    def get_samplelist(self):
+        key = "samplelist:v4:" + self.name
+        print("key is:", key)
+        with Bench("Loading cache"):
+            result = Redis.get(key)
+
+        if result:
+            print("Sample List Cache hit!!!")
+            print("Before unjsonifying {}: {}".format(type(result), result))
+            self.samplelist = json.loads(result)
+            print("  type: ", type(self.samplelist))
+            print("  self.samplelist: ", self.samplelist)
+        else:
+            print("Cache not hit")
+            try:
+                self.samplelist = get_group_samplelists.get_samplelist(self.name + ".geno")
+            except IOError:
+                self.samplelist = None
+            print("after get_samplelist")
+            Redis.set(key, json.dumps(self.samplelist))
+            Redis.expire(key, 60*5)
+
     def read_genotype_file(self):
         '''Read genotype from .geno file instead of database'''
         #if self.group == 'BXD300':
@@ -240,7 +295,18 @@ class DatasetGroup(object):
 
         # reaper barfs on unicode filenames, so here we ensure it's a string
         full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno'))
-        genotype_1.read(full_filename)
+        if os.path.isfile(full_filename):
+            print("Reading file: ", full_filename)
+            genotype_1.read(full_filename)
+            print("File read")
+        else:
+            try:
+                full_filename = str(os.path.join(webqtlConfig.TMPDIR, self.name + '.geno'))
+                #print("Reading file")
+                genotype_1.read(full_filename)
+                #print("File read")
+            except IOError:
+                print("File doesn't exist!")
 
         if genotype_1.type == "group" and self.parlist:
             genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1])       #, F1=_f1)
@@ -301,7 +367,7 @@ class DataSet(object):
         self.retrieve_other_names()
         
         self.group = DatasetGroup(self)   # sets self.group and self.group_id and gets genotype
-        self.group.read_genotype_file()
+        self.group.get_samplelist()
         self.species = species.TheSpecies(self)
 
 
@@ -335,7 +401,6 @@ class DataSet(object):
     #    return self._group
 
 
-
     def retrieve_other_names(self):
         """
         If the data set name parameter is not found in the 'Name' field of the data set table,
@@ -370,11 +435,98 @@ class DataSet(object):
         except TypeError:
             print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
             pass
+        
+    def get_trait_data(self):
+        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
+
+        # MySQL limits the number of tables that can be used in a join to 61,
+        # so we break the sample ids into smaller chunks
+        # Postgres doesn't have that limit, so we can get rid of this after we transition
+        chunk_size = 50
+        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+        trait_sample_data = []
+        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
+        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+        #tempTable = None
+        #if GeneId and db.type == "ProbeSet": 
+        #    if method == "3":
+        #        tempTable = self.getTempLiteratureTable(species=species,
+        #                                                input_species_geneid=GeneId,
+        #                                                returnNumber=returnNumber)
+        #
+        #    if method == "4" or method == "5":
+        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+        #                                        method=method,
+        #                                        returnNumber=returnNumber)
+        
+            if self.type == "Publish":
+                dataset_type = "Phenotype"
+            else:
+                dataset_type = self.type
+            temp = ['T%s.value' % item for item in sample_ids_step]
+            if self.type == "Publish":
+                query = "SELECT {}XRef.Id,".format(escape(self.type))
+            else:
+                query = "SELECT {}.Name,".format(escape(dataset_type))
+            data_start_pos = 1
+            query += string.join(temp, ', ')
+            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type,
+                                                                     self.type,
+                                                                     self.type))
+
+            for item in sample_ids_step:
+                query += """
+                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                        and T{}.StrainId={}\n
+                        """.format(*mescape(self.type, item, item, self.type, item, item))
+                        
+            if self.type == "Publish":
+                query += """
+                        WHERE {}XRef.PublicationId = {}Freeze.Id
+                        and {}Freeze.Name = '{}'
+                        and {}.Id = {}XRef.{}Id
+                        order by {}.Id
+                        """.format(*mescape(self.type, self.type, self.type, self.type,
+                                   self.name, dataset_type, self.type, self.type, dataset_type))
+            else:
+                query += """
+                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                        and {}Freeze.Name = '{}'
+                        and {}.Id = {}XRef.{}Id
+                        order by {}.Id
+                        """.format(*mescape(self.type, self.type, self.type, self.type,
+                                   self.name, dataset_type, self.type, self.type, dataset_type))
+            results = g.db.execute(query).fetchall()
+            trait_sample_data.append(results)
+
+        trait_count = len(trait_sample_data[0])
+        self.trait_data = collections.defaultdict(list)
+        
+        # put all of the separate data together into a dictionary where the keys are
+        # trait names and values are lists of sample values
+        for trait_counter in range(trait_count):
+            trait_name = trait_sample_data[0][trait_counter][0]
+            for chunk_counter in range(int(number_chunks)):
+                self.trait_data[trait_name] += (
+                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
 
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
 
     def setup(self):
+        
+        print("IS A PHENOTYPEDATASET")
+        
         # Fields in the database table
         self.search_fields = ['Phenotype.Post_publication_description',
                             'Phenotype.Pre_publication_description',
@@ -445,14 +597,24 @@ class PhenotypeDataSet(DataSet):
     def get_trait_info(self, trait_list, species = ''):
         for this_trait in trait_list:
             if not this_trait.haveinfo:
-                this_trait.retrieveInfo(QTL=1)
+                this_trait.retrieve_info(get_qtl_info=True)
 
             description = this_trait.post_publication_description
+            
+            #If the dataset is confidential and the user has access to confidential
+            #phenotype traits, then display the pre-publication description instead
+            #of the post-publication description
             if this_trait.confidential:
                 continue   # for now
-                if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
+            
+                if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(
+                        privilege=self.privilege,
+                        userName=self.userName,
+                        authorized_users=this_trait.authorized_users):
+                        
                     description = this_trait.pre_publication_description
-            this_trait.description_display = unicode(description, "utf8")
+            
+            this_trait.description_display = description
 
             if not this_trait.year.isdigit():
                 this_trait.pubmed_text = "N/A"
@@ -690,73 +852,73 @@ class MrnaAssayDataSet(DataSet):
         #print("After retrieve_sample_data")
         return trait_data
     
-    def get_trait_data(self):
-        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
-        query = """
-            SELECT Strain.Name, Strain.Id FROM Strain, Species
-            WHERE Strain.Name IN {}
-            and Strain.SpeciesId=Species.Id
-            and Species.name = '{}'
-            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
-        results = dict(g.db.execute(query).fetchall())
-        sample_ids = [results[item] for item in self.samplelist]
-
-        # MySQL limits the number of tables that can be used in a join to 61,
-        # so we break the sample ids into smaller chunks
-        # Postgres doesn't have that limit, so we can get rid of this after we transition
-        chunk_size = 50
-        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
-        trait_sample_data = []
-        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
-
-        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
-        #tempTable = None
-        #if GeneId and db.type == "ProbeSet": 
-        #    if method == "3":
-        #        tempTable = self.getTempLiteratureTable(species=species,
-        #                                                input_species_geneid=GeneId,
-        #                                                returnNumber=returnNumber)
-        #
-        #    if method == "4" or method == "5":
-        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
-        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
-        #                                        method=method,
-        #                                        returnNumber=returnNumber)
-        
-            temp = ['T%s.value' % item for item in sample_ids_step]
-            query = "SELECT {}.Name,".format(escape(self.type))
-            data_start_pos = 1
-            query += string.join(temp, ', ')
-            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
-                                                                     self.type,
-                                                                     self.type))
-
-            for item in sample_ids_step:
-                query += """
-                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
-                        and T{}.StrainId={}\n
-                        """.format(*mescape(self.type, item, item, self.type, item, item))
-                        
-            query += """
-                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
-                    and {}Freeze.Name = '{}'
-                    and {}.Id = {}XRef.{}Id
-                    order by {}.Id
-                    """.format(*mescape(self.type, self.type, self.type, self.type,
-                               self.name, self.type, self.type, self.type, self.type))
-            results = g.db.execute(query).fetchall()
-            trait_sample_data.append(results)
-
-        trait_count = len(trait_sample_data[0])
-        self.trait_data = collections.defaultdict(list)
-        
-        # put all of the separate data together into a dictionary where the keys are
-        # trait names and values are lists of sample values
-        for trait_counter in range(trait_count):
-            trait_name = trait_sample_data[0][trait_counter][0]
-            for chunk_counter in range(int(number_chunks)):
-                self.trait_data[trait_name] += (
-                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+    #def get_trait_data(self):
+    #    self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+    #    query = """
+    #        SELECT Strain.Name, Strain.Id FROM Strain, Species
+    #        WHERE Strain.Name IN {}
+    #        and Strain.SpeciesId=Species.Id
+    #        and Species.name = '{}'
+    #        """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+    #    results = dict(g.db.execute(query).fetchall())
+    #    sample_ids = [results[item] for item in self.samplelist]
+    #
+    #    # MySQL limits the number of tables that can be used in a join to 61,
+    #    # so we break the sample ids into smaller chunks
+    #    # Postgres doesn't have that limit, so we can get rid of this after we transition
+    #    chunk_size = 50
+    #    number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+    #    trait_sample_data = []
+    #    for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+    #
+    #    #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+    #    #tempTable = None
+    #    #if GeneId and db.type == "ProbeSet": 
+    #    #    if method == "3":
+    #    #        tempTable = self.getTempLiteratureTable(species=species,
+    #    #                                                input_species_geneid=GeneId,
+    #    #                                                returnNumber=returnNumber)
+    #    #
+    #    #    if method == "4" or method == "5":
+    #    #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+    #    #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+    #    #                                        method=method,
+    #    #                                        returnNumber=returnNumber)
+    #    
+    #        temp = ['T%s.value' % item for item in sample_ids_step]
+    #        query = "SELECT {}.Name,".format(escape(self.type))
+    #        data_start_pos = 1
+    #        query += string.join(temp, ', ')
+    #        query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+    #                                                                 self.type,
+    #                                                                 self.type))
+    #
+    #        for item in sample_ids_step:
+    #            query += """
+    #                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
+    #                    and T{}.StrainId={}\n
+    #                    """.format(*mescape(self.type, item, item, self.type, item, item))
+    #                    
+    #        query += """
+    #                WHERE {}XRef.{}FreezeId = {}Freeze.Id
+    #                and {}Freeze.Name = '{}'
+    #                and {}.Id = {}XRef.{}Id
+    #                order by {}.Id
+    #                """.format(*mescape(self.type, self.type, self.type, self.type,
+    #                           self.name, self.type, self.type, self.type, self.type))
+    #        results = g.db.execute(query).fetchall()
+    #        trait_sample_data.append(results)
+    #
+    #    trait_count = len(trait_sample_data[0])
+    #    self.trait_data = collections.defaultdict(list)
+    #    
+    #    # put all of the separate data together into a dictionary where the keys are
+    #    # trait names and values are lists of sample values
+    #    for trait_counter in range(trait_count):
+    #        trait_name = trait_sample_data[0][trait_counter][0]
+    #        for chunk_counter in range(int(number_chunks)):
+    #            self.trait_data[trait_name] += (
+    #                trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
     
 
     def get_trait_info(self, trait_list=None, species=''):