diff options
| author | Zachary Sloan | 2013-07-23 17:12:29 -0500 | 
|---|---|---|
| committer | Zachary Sloan | 2013-07-23 17:12:29 -0500 | 
| commit | 930d8b0cf0c7bf88fee1de95852dfe883418a494 (patch) | |
| tree | 04d542a01dba0c4529f55912ab242a3bc61459aa /wqflask | |
| parent | 82f493650909e2351035e26e9dc82b16498beb48 (diff) | |
| download | genenetwork2-930d8b0cf0c7bf88fee1de95852dfe883418a494.tar.gz | |
Fixed a couple issues with he quick_search_table.py script
Added notes on using percona to optimise mysql to gn_installation_notes.txt Started moving "get_trait_data" to DataSet since it is the same regardless of the particular DataSet subclass (PhenotypeDataSet, MrnaAssayDataSet, etc)
Diffstat (limited to 'wqflask')
| -rwxr-xr-x | wqflask/base/data_set.py | 202 | ||||
| -rw-r--r-- | wqflask/maintenance/quick_search_table.py | 62 | ||||
| -rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 3 | 
3 files changed, 171 insertions, 96 deletions
| diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index d5aae31d..188a94a2 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -435,6 +435,74 @@ class DataSet(object): except TypeError: print("Dataset {} is not yet available in GeneNetwork.".format(self.name)) pass + + def get_trait_data(self): + self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list + query = """ + SELECT Strain.Name, Strain.Id FROM Strain, Species + WHERE Strain.Name IN {} + and Strain.SpeciesId=Species.Id + and Species.name = '{}' + """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) + results = dict(g.db.execute(query).fetchall()) + sample_ids = [results[item] for item in self.samplelist] + + # MySQL limits the number of tables that can be used in a join to 61, + # so we break the sample ids into smaller chunks + # Postgres doesn't have that limit, so we can get rid of this after we transition + chunk_size = 50 + number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) + trait_sample_data = [] + for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): + + #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId + #tempTable = None + #if GeneId and db.type == "ProbeSet": + # if method == "3": + # tempTable = self.getTempLiteratureTable(species=species, + # input_species_geneid=GeneId, + # returnNumber=returnNumber) + # + # if method == "4" or method == "5": + # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, + # TissueProbeSetFreezeId=tissueProbeSetFreezeId, + # method=method, + # returnNumber=returnNumber) + + temp = ['T%s.value' % item for item in sample_ids_step] + query = "SELECT {}.Name,".format(escape(self.type)) + data_start_pos = 1 + query += string.join(temp, ', ') + query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type, + self.type, + self.type)) + + for item in sample_ids_step: + query += """ + left join {}Data as T{} on T{}.Id = {}XRef.DataId + and T{}.StrainId={}\n + """.format(*mescape(self.type, item, item, self.type, item, item)) + + query += """ + WHERE {}XRef.{}FreezeId = {}Freeze.Id + and {}Freeze.Name = '{}' + and {}.Id = {}XRef.{}Id + order by {}.Id + """.format(*mescape(self.type, self.type, self.type, self.type, + self.name, self.type, self.type, self.type, self.type)) + results = g.db.execute(query).fetchall() + trait_sample_data.append(results) + + trait_count = len(trait_sample_data[0]) + self.trait_data = collections.defaultdict(list) + + # put all of the separate data together into a dictionary where the keys are + # trait names and values are lists of sample values + for trait_counter in range(trait_count): + trait_name = trait_sample_data[0][trait_counter][0] + for chunk_counter in range(int(number_chunks)): + self.trait_data[trait_name] += ( + trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) class PhenotypeDataSet(DataSet): DS_NAME_MAP['Publish'] = 'PhenotypeDataSet' @@ -768,73 +836,73 @@ class MrnaAssayDataSet(DataSet): #print("After retrieve_sample_data") return trait_data - def get_trait_data(self): - self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list - query = """ - SELECT Strain.Name, Strain.Id FROM Strain, Species - WHERE Strain.Name IN {} - and Strain.SpeciesId=Species.Id - and Species.name = '{}' - """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) - results = dict(g.db.execute(query).fetchall()) - sample_ids = [results[item] for item in self.samplelist] - - # MySQL limits the number of tables that can be used in a join to 61, - # so we break the sample ids into smaller chunks - # Postgres doesn't have that limit, so we can get rid of this after we transition - chunk_size = 50 - number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) - trait_sample_data = [] - for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): - - #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId - #tempTable = None - #if GeneId and db.type == "ProbeSet": - # if method == "3": - # tempTable = self.getTempLiteratureTable(species=species, - # input_species_geneid=GeneId, - # returnNumber=returnNumber) - # - # if method == "4" or method == "5": - # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, - # TissueProbeSetFreezeId=tissueProbeSetFreezeId, - # method=method, - # returnNumber=returnNumber) - - temp = ['T%s.value' % item for item in sample_ids_step] - query = "SELECT {}.Name,".format(escape(self.type)) - data_start_pos = 1 - query += string.join(temp, ', ') - query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type, - self.type, - self.type)) - - for item in sample_ids_step: - query += """ - left join {}Data as T{} on T{}.Id = {}XRef.DataId - and T{}.StrainId={}\n - """.format(*mescape(self.type, item, item, self.type, item, item)) - - query += """ - WHERE {}XRef.{}FreezeId = {}Freeze.Id - and {}Freeze.Name = '{}' - and {}.Id = {}XRef.{}Id - order by {}.Id - """.format(*mescape(self.type, self.type, self.type, self.type, - self.name, self.type, self.type, self.type, self.type)) - results = g.db.execute(query).fetchall() - trait_sample_data.append(results) - - trait_count = len(trait_sample_data[0]) - self.trait_data = collections.defaultdict(list) - - # put all of the separate data together into a dictionary where the keys are - # trait names and values are lists of sample values - for trait_counter in range(trait_count): - trait_name = trait_sample_data[0][trait_counter][0] - for chunk_counter in range(int(number_chunks)): - self.trait_data[trait_name] += ( - trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) + #def get_trait_data(self): + # self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list + # query = """ + # SELECT Strain.Name, Strain.Id FROM Strain, Species + # WHERE Strain.Name IN {} + # and Strain.SpeciesId=Species.Id + # and Species.name = '{}' + # """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) + # results = dict(g.db.execute(query).fetchall()) + # sample_ids = [results[item] for item in self.samplelist] + # + # # MySQL limits the number of tables that can be used in a join to 61, + # # so we break the sample ids into smaller chunks + # # Postgres doesn't have that limit, so we can get rid of this after we transition + # chunk_size = 50 + # number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) + # trait_sample_data = [] + # for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): + # + # #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId + # #tempTable = None + # #if GeneId and db.type == "ProbeSet": + # # if method == "3": + # # tempTable = self.getTempLiteratureTable(species=species, + # # input_species_geneid=GeneId, + # # returnNumber=returnNumber) + # # + # # if method == "4" or method == "5": + # # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, + # # TissueProbeSetFreezeId=tissueProbeSetFreezeId, + # # method=method, + # # returnNumber=returnNumber) + # + # temp = ['T%s.value' % item for item in sample_ids_step] + # query = "SELECT {}.Name,".format(escape(self.type)) + # data_start_pos = 1 + # query += string.join(temp, ', ') + # query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type, + # self.type, + # self.type)) + # + # for item in sample_ids_step: + # query += """ + # left join {}Data as T{} on T{}.Id = {}XRef.DataId + # and T{}.StrainId={}\n + # """.format(*mescape(self.type, item, item, self.type, item, item)) + # + # query += """ + # WHERE {}XRef.{}FreezeId = {}Freeze.Id + # and {}Freeze.Name = '{}' + # and {}.Id = {}XRef.{}Id + # order by {}.Id + # """.format(*mescape(self.type, self.type, self.type, self.type, + # self.name, self.type, self.type, self.type, self.type)) + # results = g.db.execute(query).fetchall() + # trait_sample_data.append(results) + # + # trait_count = len(trait_sample_data[0]) + # self.trait_data = collections.defaultdict(list) + # + # # put all of the separate data together into a dictionary where the keys are + # # trait names and values are lists of sample values + # for trait_counter in range(trait_count): + # trait_name = trait_sample_data[0][trait_counter][0] + # for chunk_counter in range(int(number_chunks)): + # self.trait_data[trait_name] += ( + # trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) def get_trait_info(self, trait_list=None, species=''): diff --git a/wqflask/maintenance/quick_search_table.py b/wqflask/maintenance/quick_search_table.py index b07e7656..9cd792ef 100644 --- a/wqflask/maintenance/quick_search_table.py +++ b/wqflask/maintenance/quick_search_table.py @@ -71,9 +71,10 @@ class PublishXRef(Base): values['the_key'] = json.dumps([pub_row.Id, pub_row.InbredSetId]) values['terms'] = cls.get_unique_terms(pub_row.Id, pub_row.InbredSetId) print("terms is:", values['terms']) - values['result_fields'] = cls.get_result_fields(pub_row.Id, pub_row.InbredSetId) - ins = QuickSearch.insert().values(**values) - conn.execute(ins) + if values['terms']: + values['result_fields'] = cls.get_result_fields(pub_row.Id, pub_row.InbredSetId) + ins = QuickSearch.insert().values(**values) + conn.execute(ins) counter += 1 print("Done:", counter) @@ -100,28 +101,30 @@ class PublishXRef(Base): inbredset_id=inbredset_id).all() unique = set() - for item in results[0]: - #print("locals:", locals()) - if not item: - continue - for token in item.split(): - if token.startswith(('(','[')): - token = token[1:] - if token.endswith((')', ']')): - token = token[:-1] - if token.endswith(';'): - token = token[:-1] - if len(token) > 2: - try: - # This hopefully ensures that the token is utf-8 - token = token.encode('utf-8') - print(" ->", token) - except UnicodeDecodeError: - print("\n-- UDE \n") - # Can't get it into utf-8, we won't use it - continue - - unique.add(token) + print("results: ", results) + if len(results): + for item in results[0]: + #print("locals:", locals()) + if not item: + continue + for token in item.split(): + if token.startswith(('(','[')): + token = token[1:] + if token.endswith((')', ']')): + token = token[:-1] + if token.endswith(';'): + token = token[:-1] + if len(token) > 2: + try: + # This hopefully ensures that the token is utf-8 + token = token.encode('utf-8') + print(" ->", token) + except UnicodeDecodeError: + print("\n-- UDE \n") + # Can't get it into utf-8, we won't use it + continue + + unique.add(token) #print("\nUnique terms are: {}\n".format(unique)) return " ".join(unique) @@ -467,8 +470,8 @@ QuickSearch = sa.Table("QuickSearch", Metadata, mysql_engine = 'MyISAM', ) -#QuickSearch.drop(Engine, checkfirst=True) -#Metadata.create_all(Engine) +QuickSearch.drop(Engine, checkfirst=True) +Metadata.create_all(Engine) def row2dict(row): @@ -495,9 +498,10 @@ def main(): Add all items from the ProbeSetXRef, GenoXRef, and PublishXRef tables to the QuickSearch tables. """ + + GenoXRef.run() + PublishXRef.run() ProbeSetXRef.run() - #GenoXRef.run() - #PublishXRef.run() if __name__ == "__main__": main() \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 38a49f12..446683a4 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -237,6 +237,9 @@ def run(pheno_vector, with Bench("Calculate Kinship"): kinship_matrix = calculate_kinship(genotype_matrix, temp_data) + print("kinship_matrix: ", pf(kinship_matrix)) + print("kinship_matrix.shape: ", pf(kinship_matrix.shape)) + with Bench("Create LMM object"): lmm_ob = LMM(pheno_vector, kinship_matrix) | 
