| author | Zachary Sloan | 2013-07-23 17:12:29 -0500 |
|---|---|---|
| committer | Zachary Sloan | 2013-07-23 17:12:29 -0500 |
| commit | 930d8b0cf0c7bf88fee1de95852dfe883418a494 (patch) | |
| tree | 04d542a01dba0c4529f55912ab242a3bc61459aa /wqflask | |
| parent | 82f493650909e2351035e26e9dc82b16498beb48 (diff) | |
| download | genenetwork2-930d8b0cf0c7bf88fee1de95852dfe883418a494.tar.gz | |

Fixed a couple of issues with the quick_search_table.py script.
Added notes on using Percona to optimise MySQL to gn_installation_notes.txt.
Started moving "get_trait_data" to DataSet, since it is the same regardless
of the particular DataSet subclass (PhenotypeDataSet, MrnaAssayDataSet, etc.).
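
The last item is a pull-up refactoring: `get_trait_data` had essentially the same body in each dataset subclass, so one copy is being hoisted into the `DataSet` base class and the duplicates retired. A minimal sketch of that shape, with placeholder classes and a stub method body instead of the real GeneNetwork query logic:

```python
# Sketch only: simplified stand-ins for the real DataSet hierarchy, showing the
# "pull the shared method up into the base class" move described in the commit message.
class DataSet(object):
    def __init__(self, name, samplelist):
        self.name = name
        self.samplelist = samplelist

    def get_trait_data(self):
        # One shared implementation; subclasses no longer carry their own copy.
        return {sample: None for sample in self.samplelist}


class PhenotypeDataSet(DataSet):
    pass  # inherits get_trait_data


class MrnaAssayDataSet(DataSet):
    pass  # inherits get_trait_data


if __name__ == "__main__":
    dataset = MrnaAssayDataSet("example_dataset", ["BXD1", "BXD2"])
    print(dataset.get_trait_data())
```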
Diffstat (limited to 'wqflask')

| -rwxr-xr-x | wqflask/base/data_set.py | 202 |
|---|---|---|
| -rw-r--r-- | wqflask/maintenance/quick_search_table.py | 62 |
| -rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 3 |

3 files changed, 171 insertions(+), 96 deletions(-)

```diff
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index d5aae31d..188a94a2 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -435,6 +435,74 @@ class DataSet(object):
         except TypeError:
             print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
             pass
+
+    def get_trait_data(self):
+        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
+
+        # MySQL limits the number of tables that can be used in a join to 61,
+        # so we break the sample ids into smaller chunks
+        # Postgres doesn't have that limit, so we can get rid of this after we transition
+        chunk_size = 50
+        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+        trait_sample_data = []
+        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
+            #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+            #tempTable = None
+            #if GeneId and db.type == "ProbeSet":
+            #    if method == "3":
+            #        tempTable = self.getTempLiteratureTable(species=species,
+            #                                                input_species_geneid=GeneId,
+            #                                                returnNumber=returnNumber)
+            #
+            #    if method == "4" or method == "5":
+            #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+            #                                                TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+            #                                                method=method,
+            #                                                returnNumber=returnNumber)
+
+            temp = ['T%s.value' % item for item in sample_ids_step]
+            query = "SELECT {}.Name,".format(escape(self.type))
+            data_start_pos = 1
+            query += string.join(temp, ', ')
+            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+                                                                     self.type,
+                                                                     self.type))
+
+            for item in sample_ids_step:
+                query += """
+                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                        and T{}.StrainId={}\n
+                        """.format(*mescape(self.type, item, item, self.type, item, item))
+
+            query += """
+                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                    and {}Freeze.Name = '{}'
+                    and {}.Id = {}XRef.{}Id
+                    order by {}.Id
+                    """.format(*mescape(self.type, self.type, self.type, self.type,
+                                        self.name, self.type, self.type, self.type, self.type))
+            results = g.db.execute(query).fetchall()
+            trait_sample_data.append(results)
+
+        trait_count = len(trait_sample_data[0])
+        self.trait_data = collections.defaultdict(list)
+
+        # put all of the separate data together into a dictionary where the keys are
+        # trait names and values are lists of sample values
+        for trait_counter in range(trait_count):
+            trait_name = trait_sample_data[0][trait_counter][0]
+            for chunk_counter in range(int(number_chunks)):
+                self.trait_data[trait_name] += (
+                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
 
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -768,73 +836,73 @@ class MrnaAssayDataSet(DataSet):
         #print("After retrieve_sample_data")
         return trait_data
 
-    def get_trait_data(self):
-        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
-        query = """
-            SELECT Strain.Name, Strain.Id FROM Strain, Species
-            WHERE Strain.Name IN {}
-            and Strain.SpeciesId=Species.Id
-            and Species.name = '{}'
-            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
-        results = dict(g.db.execute(query).fetchall())
-        sample_ids = [results[item] for item in self.samplelist]
-
-        # MySQL limits the number of tables that can be used in a join to 61,
-        # so we break the sample ids into smaller chunks
-        # Postgres doesn't have that limit, so we can get rid of this after we transition
-        chunk_size = 50
-        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
-        trait_sample_data = []
-        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
-
-            #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
-            #tempTable = None
-            #if GeneId and db.type == "ProbeSet":
-            #    if method == "3":
-            #        tempTable = self.getTempLiteratureTable(species=species,
-            #                                                input_species_geneid=GeneId,
-            #                                                returnNumber=returnNumber)
-            #
-            #    if method == "4" or method == "5":
-            #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
-            #                                                TissueProbeSetFreezeId=tissueProbeSetFreezeId,
-            #                                                method=method,
-            #                                                returnNumber=returnNumber)
-
-            temp = ['T%s.value' % item for item in sample_ids_step]
-            query = "SELECT {}.Name,".format(escape(self.type))
-            data_start_pos = 1
-            query += string.join(temp, ', ')
-            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
-                                                                     self.type,
-                                                                     self.type))
-
-            for item in sample_ids_step:
-                query += """
-                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
-                        and T{}.StrainId={}\n
-                        """.format(*mescape(self.type, item, item, self.type, item, item))
-
-            query += """
-                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
-                    and {}Freeze.Name = '{}'
-                    and {}.Id = {}XRef.{}Id
-                    order by {}.Id
-                    """.format(*mescape(self.type, self.type, self.type, self.type,
-                                        self.name, self.type, self.type, self.type, self.type))
-            results = g.db.execute(query).fetchall()
-            trait_sample_data.append(results)
-
-        trait_count = len(trait_sample_data[0])
-        self.trait_data = collections.defaultdict(list)
-
-        # put all of the separate data together into a dictionary where the keys are
-        # trait names and values are lists of sample values
-        for trait_counter in range(trait_count):
-            trait_name = trait_sample_data[0][trait_counter][0]
-            for chunk_counter in range(int(number_chunks)):
-                self.trait_data[trait_name] += (
-                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+    #def get_trait_data(self):
+    #    self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+    #    query = """
+    #        SELECT Strain.Name, Strain.Id FROM Strain, Species
+    #        WHERE Strain.Name IN {}
+    #        and Strain.SpeciesId=Species.Id
+    #        and Species.name = '{}'
+    #        """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+    #    results = dict(g.db.execute(query).fetchall())
+    #    sample_ids = [results[item] for item in self.samplelist]
+    #
+    #    # MySQL limits the number of tables that can be used in a join to 61,
+    #    # so we break the sample ids into smaller chunks
+    #    # Postgres doesn't have that limit, so we can get rid of this after we transition
+    #    chunk_size = 50
+    #    number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+    #    trait_sample_data = []
+    #    for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+    #
+    #        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+    #        #tempTable = None
+    #        #if GeneId and db.type == "ProbeSet":
+    #        #    if method == "3":
+    #        #        tempTable = self.getTempLiteratureTable(species=species,
+    #        #                                                input_species_geneid=GeneId,
+    #        #                                                returnNumber=returnNumber)
+    #        #
+    #        #    if method == "4" or method == "5":
+    #        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+    #        #                                                TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+    #        #                                                method=method,
+    #        #                                                returnNumber=returnNumber)
+    #
+    #        temp = ['T%s.value' % item for item in sample_ids_step]
+    #        query = "SELECT {}.Name,".format(escape(self.type))
+    #        data_start_pos = 1
+    #        query += string.join(temp, ', ')
+    #        query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+    #                                                                 self.type,
+    #                                                                 self.type))
+    #
+    #        for item in sample_ids_step:
+    #            query += """
+    #                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
+    #                    and T{}.StrainId={}\n
+    #                    """.format(*mescape(self.type, item, item, self.type, item, item))
+    #
+    #        query += """
+    #                WHERE {}XRef.{}FreezeId = {}Freeze.Id
+    #                and {}Freeze.Name = '{}'
+    #                and {}.Id = {}XRef.{}Id
+    #                order by {}.Id
+    #                """.format(*mescape(self.type, self.type, self.type, self.type,
+    #                                    self.name, self.type, self.type, self.type, self.type))
+    #        results = g.db.execute(query).fetchall()
+    #        trait_sample_data.append(results)
+    #
+    #        trait_count = len(trait_sample_data[0])
+    #        self.trait_data = collections.defaultdict(list)
+    #
+    #        # put all of the separate data together into a dictionary where the keys are
+    #        # trait names and values are lists of sample values
+    #        for trait_counter in range(trait_count):
+    #            trait_name = trait_sample_data[0][trait_counter][0]
+    #            for chunk_counter in range(int(number_chunks)):
+    #                self.trait_data[trait_name] += (
+    #                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
 
 
     def get_trait_info(self, trait_list=None, species=''):
```
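The new `DataSet.get_trait_data` above relies on `chunks.divide_into_chunks` so that no single generated query joins more than MySQL's limit of 61 tables. The utility's real implementation is not part of this diff; the sketch below only illustrates the behaviour the caller appears to assume (split the sample ids into `number_chunks` roughly equal slices):

```python
import math


def divide_into_chunks(items, number_chunks):
    """Illustrative stand-in for wqflask's chunks.divide_into_chunks; not the
    repository's actual code. Splits items into number_chunks roughly equal slices."""
    if not items:
        return []
    number_chunks = max(1, number_chunks)
    chunk_size = int(math.ceil(len(items) / float(number_chunks)))
    return [items[pos:pos + chunk_size] for pos in range(0, len(items), chunk_size)]


# 120 sample ids with a target chunk size of 50 -> 3 slices of 40 each,
# so every per-slice query stays well under the 61-table join limit.
sample_ids = list(range(120))
number_chunks = int(math.ceil(len(sample_ids) / float(50)))
print([len(step) for step in divide_into_chunks(sample_ids, number_chunks)])
```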
```diff
diff --git a/wqflask/maintenance/quick_search_table.py b/wqflask/maintenance/quick_search_table.py
index b07e7656..9cd792ef 100644
--- a/wqflask/maintenance/quick_search_table.py
+++ b/wqflask/maintenance/quick_search_table.py
@@ -71,9 +71,10 @@ class PublishXRef(Base):
             values['the_key'] = json.dumps([pub_row.Id, pub_row.InbredSetId])
             values['terms'] = cls.get_unique_terms(pub_row.Id, pub_row.InbredSetId)
             print("terms is:", values['terms'])
-            values['result_fields'] = cls.get_result_fields(pub_row.Id, pub_row.InbredSetId)
-            ins = QuickSearch.insert().values(**values)
-            conn.execute(ins)
+            if values['terms']:
+                values['result_fields'] = cls.get_result_fields(pub_row.Id, pub_row.InbredSetId)
+                ins = QuickSearch.insert().values(**values)
+                conn.execute(ins)
             counter += 1
             print("Done:", counter)
 
@@ -100,28 +101,30 @@ class PublishXRef(Base):
                                          inbredset_id=inbredset_id).all()
 
         unique = set()
-        for item in results[0]:
-            #print("locals:", locals())
-            if not item:
-                continue
-            for token in item.split():
-                if token.startswith(('(','[')):
-                    token = token[1:]
-                if token.endswith((')', ']')):
-                    token = token[:-1]
-                if token.endswith(';'):
-                    token = token[:-1]
-                if len(token) > 2:
-                    try:
-                        # This hopefully ensures that the token is utf-8
-                        token = token.encode('utf-8')
-                        print(" ->", token)
-                    except UnicodeDecodeError:
-                        print("\n-- UDE \n")
-                        # Can't get it into utf-8, we won't use it
-                        continue
-
-                    unique.add(token)
+        print("results: ", results)
+        if len(results):
+            for item in results[0]:
+                #print("locals:", locals())
+                if not item:
+                    continue
+                for token in item.split():
+                    if token.startswith(('(','[')):
+                        token = token[1:]
+                    if token.endswith((')', ']')):
+                        token = token[:-1]
+                    if token.endswith(';'):
+                        token = token[:-1]
+                    if len(token) > 2:
+                        try:
+                            # This hopefully ensures that the token is utf-8
+                            token = token.encode('utf-8')
+                            print(" ->", token)
+                        except UnicodeDecodeError:
+                            print("\n-- UDE \n")
+                            # Can't get it into utf-8, we won't use it
+                            continue
+
+                        unique.add(token)
 
         #print("\nUnique terms are: {}\n".format(unique))
         return " ".join(unique)
@@ -467,8 +470,8 @@ QuickSearch = sa.Table("QuickSearch", Metadata,
                        mysql_engine = 'MyISAM',
                        )
 
-#QuickSearch.drop(Engine, checkfirst=True)
-#Metadata.create_all(Engine)
+QuickSearch.drop(Engine, checkfirst=True)
+Metadata.create_all(Engine)
 
 
 def row2dict(row):
@@ -495,9 +498,10 @@ def main():
     Add all items from the ProbeSetXRef, GenoXRef, and PublishXRef tables
     to the QuickSearch tables.
     """
+
+    GenoXRef.run()
+    PublishXRef.run()
     ProbeSetXRef.run()
-    #GenoXRef.run()
-    #PublishXRef.run()
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
```
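In quick_search_table.py the rebuild now drops and recreates the QuickSearch table, runs GenoXRef and PublishXRef in addition to ProbeSetXRef, skips rows that produced no search terms, and guards the `results[0]` access behind `if len(results):`. For readability, here is the per-token clean-up that `get_unique_terms` applies, rewritten as a standalone helper purely for illustration (the real code additionally encodes each surviving token to UTF-8):

```python
def clean_token(token):
    """Paraphrase (illustration only) of the token clean-up in get_unique_terms:
    strip a leading bracket, then a trailing bracket, then a trailing semicolon,
    and keep the token only if more than two characters remain."""
    if token.startswith(('(', '[')):
        token = token[1:]
    if token.endswith((')', ']')):
        token = token[:-1]
    if token.endswith(';'):
        token = token[:-1]
    return token if len(token) > 2 else None


# "(hippocampus)" -> "hippocampus", "insulin;" -> "insulin", "Id" -> None (too short)
for raw in ["(hippocampus)", "insulin;", "Id"]:
    print(raw, "->", clean_token(raw))
```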
```diff
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 38a49f12..446683a4 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -237,6 +237,9 @@ def run(pheno_vector,
     with Bench("Calculate Kinship"):
         kinship_matrix = calculate_kinship(genotype_matrix, temp_data)
 
+    print("kinship_matrix: ", pf(kinship_matrix))
+    print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
+
     with Bench("Create LMM object"):
         lmm_ob = LMM(pheno_vector, kinship_matrix)
 
```
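The lmm.py change only adds two debug prints of the kinship matrix and its shape after `calculate_kinship` returns; `pf` is a pretty-printing helper (in this codebase it is normally `pprint.pformat` bound to a short alias, which is assumed here). As context for what those prints report, the snippet below computes a standard genetic relationship matrix as a stand-in for pyLMM's own `calculate_kinship`: for an n-samples by m-markers genotype matrix W, K = W·Wᵀ/m is n by n.

```python
import numpy as np
from pprint import pformat as pf  # assumed alias; GeneNetwork modules usually shorten pformat to pf


def kinship_stand_in(genotype_matrix):
    """Standard GRM K = W @ W.T / m, used here only for illustration;
    this is not pyLMM's actual calculate_kinship."""
    n_samples, n_markers = genotype_matrix.shape
    return np.dot(genotype_matrix, genotype_matrix.T) / float(n_markers)


genotype_matrix = np.random.rand(5, 100)  # toy data: 5 samples, 100 markers
kinship_matrix = kinship_stand_in(genotype_matrix)

# The two prints added in this commit report exactly this kind of object:
print("kinship_matrix: ", pf(kinship_matrix))
print("kinship_matrix.shape: ", pf(kinship_matrix.shape))  # (5, 5): samples x samples
```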