From 930d8b0cf0c7bf88fee1de95852dfe883418a494 Mon Sep 17 00:00:00 2001
From: Zachary Sloan
Date: Tue, 23 Jul 2013 17:12:29 -0500
Subject: Fixed a couple issues with he quick_search_table.py script

Added notes on using percona to optimise mysql to gn_installation_notes.txt

Started moving "get_trait_data" to DataSet since it is the same regardless
of the particular DataSet subclass (PhenotypeDataSet, MrnaAssayDataSet, etc)
---
 wqflask/base/data_set.py | 202 +++++++++++++++++++++++++++++++----------------
 1 file changed, 135 insertions(+), 67 deletions(-)

(limited to 'wqflask/base/data_set.py')

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index d5aae31d..188a94a2 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -435,6 +435,74 @@ class DataSet(object):
         except TypeError:
             print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
             pass
+        
+    def get_trait_data(self):
+        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
+
+        # MySQL limits the number of tables that can be used in a join to 61,
+        # so we break the sample ids into smaller chunks
+        # Postgres doesn't have that limit, so we can get rid of this after we transition
+        chunk_size = 50
+        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+        trait_sample_data = []
+        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
+        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+        #tempTable = None
+        #if GeneId and db.type == "ProbeSet": 
+        #    if method == "3":
+        #        tempTable = self.getTempLiteratureTable(species=species,
+        #                                                input_species_geneid=GeneId,
+        #                                                returnNumber=returnNumber)
+        #
+        #    if method == "4" or method == "5":
+        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+        #                                        method=method,
+        #                                        returnNumber=returnNumber)
+        
+            temp = ['T%s.value' % item for item in sample_ids_step]
+            query = "SELECT {}.Name,".format(escape(self.type))
+            data_start_pos = 1
+            query += string.join(temp, ', ')
+            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+                                                                     self.type,
+                                                                     self.type))
+
+            for item in sample_ids_step:
+                query += """
+                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                        and T{}.StrainId={}\n
+                        """.format(*mescape(self.type, item, item, self.type, item, item))
+                        
+            query += """
+                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                    and {}Freeze.Name = '{}'
+                    and {}.Id = {}XRef.{}Id
+                    order by {}.Id
+                    """.format(*mescape(self.type, self.type, self.type, self.type,
+                               self.name, self.type, self.type, self.type, self.type))
+            results = g.db.execute(query).fetchall()
+            trait_sample_data.append(results)
+
+        trait_count = len(trait_sample_data[0])
+        self.trait_data = collections.defaultdict(list)
+        
+        # put all of the separate data together into a dictionary where the keys are
+        # trait names and values are lists of sample values
+        for trait_counter in range(trait_count):
+            trait_name = trait_sample_data[0][trait_counter][0]
+            for chunk_counter in range(int(number_chunks)):
+                self.trait_data[trait_name] += (
+                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
 
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -768,73 +836,73 @@ class MrnaAssayDataSet(DataSet):
         #print("After retrieve_sample_data")
         return trait_data
     
-    def get_trait_data(self):
-        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
-        query = """
-            SELECT Strain.Name, Strain.Id FROM Strain, Species
-            WHERE Strain.Name IN {}
-            and Strain.SpeciesId=Species.Id
-            and Species.name = '{}'
-            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
-        results = dict(g.db.execute(query).fetchall())
-        sample_ids = [results[item] for item in self.samplelist]
-
-        # MySQL limits the number of tables that can be used in a join to 61,
-        # so we break the sample ids into smaller chunks
-        # Postgres doesn't have that limit, so we can get rid of this after we transition
-        chunk_size = 50
-        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
-        trait_sample_data = []
-        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
-
-        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
-        #tempTable = None
-        #if GeneId and db.type == "ProbeSet": 
-        #    if method == "3":
-        #        tempTable = self.getTempLiteratureTable(species=species,
-        #                                                input_species_geneid=GeneId,
-        #                                                returnNumber=returnNumber)
-        #
-        #    if method == "4" or method == "5":
-        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
-        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
-        #                                        method=method,
-        #                                        returnNumber=returnNumber)
-        
-            temp = ['T%s.value' % item for item in sample_ids_step]
-            query = "SELECT {}.Name,".format(escape(self.type))
-            data_start_pos = 1
-            query += string.join(temp, ', ')
-            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
-                                                                     self.type,
-                                                                     self.type))
-
-            for item in sample_ids_step:
-                query += """
-                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
-                        and T{}.StrainId={}\n
-                        """.format(*mescape(self.type, item, item, self.type, item, item))
-                        
-            query += """
-                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
-                    and {}Freeze.Name = '{}'
-                    and {}.Id = {}XRef.{}Id
-                    order by {}.Id
-                    """.format(*mescape(self.type, self.type, self.type, self.type,
-                               self.name, self.type, self.type, self.type, self.type))
-            results = g.db.execute(query).fetchall()
-            trait_sample_data.append(results)
-
-        trait_count = len(trait_sample_data[0])
-        self.trait_data = collections.defaultdict(list)
-        
-        # put all of the separate data together into a dictionary where the keys are
-        # trait names and values are lists of sample values
-        for trait_counter in range(trait_count):
-            trait_name = trait_sample_data[0][trait_counter][0]
-            for chunk_counter in range(int(number_chunks)):
-                self.trait_data[trait_name] += (
-                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+    #def get_trait_data(self):
+    #    self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+    #    query = """
+    #        SELECT Strain.Name, Strain.Id FROM Strain, Species
+    #        WHERE Strain.Name IN {}
+    #        and Strain.SpeciesId=Species.Id
+    #        and Species.name = '{}'
+    #        """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+    #    results = dict(g.db.execute(query).fetchall())
+    #    sample_ids = [results[item] for item in self.samplelist]
+    #
+    #    # MySQL limits the number of tables that can be used in a join to 61,
+    #    # so we break the sample ids into smaller chunks
+    #    # Postgres doesn't have that limit, so we can get rid of this after we transition
+    #    chunk_size = 50
+    #    number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+    #    trait_sample_data = []
+    #    for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+    #
+    #    #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+    #    #tempTable = None
+    #    #if GeneId and db.type == "ProbeSet": 
+    #    #    if method == "3":
+    #    #        tempTable = self.getTempLiteratureTable(species=species,
+    #    #                                                input_species_geneid=GeneId,
+    #    #                                                returnNumber=returnNumber)
+    #    #
+    #    #    if method == "4" or method == "5":
+    #    #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+    #    #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+    #    #                                        method=method,
+    #    #                                        returnNumber=returnNumber)
+    #    
+    #        temp = ['T%s.value' % item for item in sample_ids_step]
+    #        query = "SELECT {}.Name,".format(escape(self.type))
+    #        data_start_pos = 1
+    #        query += string.join(temp, ', ')
+    #        query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+    #                                                                 self.type,
+    #                                                                 self.type))
+    #
+    #        for item in sample_ids_step:
+    #            query += """
+    #                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
+    #                    and T{}.StrainId={}\n
+    #                    """.format(*mescape(self.type, item, item, self.type, item, item))
+    #                    
+    #        query += """
+    #                WHERE {}XRef.{}FreezeId = {}Freeze.Id
+    #                and {}Freeze.Name = '{}'
+    #                and {}.Id = {}XRef.{}Id
+    #                order by {}.Id
+    #                """.format(*mescape(self.type, self.type, self.type, self.type,
+    #                           self.name, self.type, self.type, self.type, self.type))
+    #        results = g.db.execute(query).fetchall()
+    #        trait_sample_data.append(results)
+    #
+    #    trait_count = len(trait_sample_data[0])
+    #    self.trait_data = collections.defaultdict(list)
+    #    
+    #    # put all of the separate data together into a dictionary where the keys are
+    #    # trait names and values are lists of sample values
+    #    for trait_counter in range(trait_count):
+    #        trait_name = trait_sample_data[0][trait_counter][0]
+    #        for chunk_counter in range(int(number_chunks)):
+    #            self.trait_data[trait_name] += (
+    #                trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
     
 
     def get_trait_info(self, trait_list=None, species=''):
-- 
cgit v1.2.3