From e5896aebed0c7a433ecd79553b85920752f518d8 Mon Sep 17 00:00:00 2001 From: Artem Tarasov Date: Fri, 15 May 2015 16:31:25 +0300 Subject: reduce number of queries --- wqflask/wqflask/show_trait/SampleList.py | 114 +++++++++++++------------------ 1 file changed, 47 insertions(+), 67 deletions(-) diff --git a/wqflask/wqflask/show_trait/SampleList.py b/wqflask/wqflask/show_trait/SampleList.py index 060cb519..46f2ae43 100755 --- a/wqflask/wqflask/show_trait/SampleList.py +++ b/wqflask/wqflask/show_trait/SampleList.py @@ -10,6 +10,8 @@ import numpy as np from scipy import stats from pprint import pformat as pf +import itertools + class SampleList(object): def __init__(self, dataset, @@ -25,14 +27,12 @@ class SampleList(object): self.sample_list = [] # The actual list - try: - self.get_attributes() - except Exception: - print("failed to get attributes") - self.attributes = {} - + self.get_attributes() print("camera: attributes are:", pf(self.attributes)) + if self.this_trait and self.dataset and self.dataset.type == 'ProbeSet': + self.get_extra_attribute_values() + for counter, sample_name in enumerate(sample_names, 1): sample_name = sample_name.replace("_2nd_", "") @@ -59,8 +59,8 @@ class SampleList(object): sample.this_id = "Other_" + str(counter) #### For extra attribute columns; currently only used by several datasets - Zach - if self.this_trait and self.dataset and self.dataset.type == 'ProbeSet': - sample.extra_attributes = self.get_extra_attribute_values(sample_name) + if self.sample_attribute_values: + sample.extra_attributes = self.sample_attribute_values.get(sample_name, {}) print("sample.extra_attributes is", pf(sample.extra_attributes)) self.sample_list.append(sample) @@ -135,75 +135,55 @@ class SampleList(object): else: sample.outlier = False - def get_attributes(self): """Finds which extra attributes apply to this dataset""" - - #ZS: Id and name values for this trait's extra attributes - case_attributes = g.db.execute('''SELECT CaseAttribute.Id, CaseAttribute.Name - FROM CaseAttribute, CaseAttributeXRef - WHERE CaseAttributeXRef.ProbeSetFreezeId = %s AND - CaseAttribute.Id = CaseAttributeXRef.CaseAttributeId - group by CaseAttributeXRef.CaseAttributeId''', - (str(self.dataset.id),)) + # Get attribute names and distinct values for each attribute + results = g.db.execute(''' + SELECT DISTINCT CaseAttribute.Id, CaseAttribute.Name, CaseAttributeXRef.Value + FROM CaseAttribute, CaseAttributeXRef + WHERE CaseAttributeXRef.CaseAttributeId = CaseAttribute.Id + AND CaseAttributeXRef.ProbeSetFreezeId = %s + ORDER BY CaseAttribute.Name''', (str(self.dataset.id),)) self.attributes = {} - for key, value in case_attributes.fetchall(): - print("radish: %s - %s" % (key, value)) + for attr, values in itertools.groupby(results.fetchall(), lambda row: (row.Id, row.Name)): + key, name = attr + print("radish: %s - %s" % (key, name)) self.attributes[key] = Bunch() - self.attributes[key].name = value - - attribute_values = g.db.execute('''SELECT DISTINCT CaseAttributeXRef.Value - FROM CaseAttribute, CaseAttributeXRef - WHERE CaseAttribute.Name = %s AND - CaseAttributeXRef.CaseAttributeId = CaseAttribute.Id''', (value,)) - - self.attributes[key].distinct_values = [item[0] for item in attribute_values.fetchall()] + self.attributes[key].name = name + self.attributes[key].distinct_values = [item.Value for item in values] self.attributes[key].distinct_values.sort(key=natural_sort_key) - - def get_extra_attribute_values(self, sample_name): - - attribute_values = {} - + def get_extra_attribute_values(self): + self.sample_attribute_values = {} if self.attributes: + results = g.db.execute(''' + SELECT Strain.Name AS SampleName, CaseAttributeId AS Id, CaseAttributeXRef.Value + FROM Strain, StrainXRef, InbredSet, CaseAttributeXRef + WHERE StrainXRef.StrainId = Strain.Id + AND InbredSet.Id = StrainXRef.InbredSetId + AND CaseAttributeXRef.StrainId = Strain.Id + AND InbredSet.Name = %s + AND CaseAttributeXRef.ProbeSetFreezeId = %s + ORDER BY SampleName''', + (self.dataset.group.name, self.this_trait.dataset.id)) + + for sample_name, items in itertools.groupby(results.fetchall(), lambda row: row.SampleName): + attribute_values = {} + for item in items: + attribute_value = item.Value + + #ZS: If it's an int, turn it into one for sorting + #(for example, 101 would be lower than 80 if they're strings instead of ints) + try: + attribute_value = int(attribute_value) + except ValueError: + pass + + attribute_values[self.attributes[item.Id].name] = attribute_value + self.sample_attribute_values[sample_name] = attribute_values - #ZS: Get StrainId value for the next query - result = g.db.execute("""SELECT Strain.Id - FROM Strain, StrainXRef, InbredSet - WHERE Strain.Name = %s and - StrainXRef.StrainId = Strain.Id and - InbredSet.Id = StrainXRef.InbredSetId and - InbredSet.Name = %s""", (sample_name, - self.dataset.group.name)) - - sample_id = result.fetchone().Id - - for attribute in self.attributes: - - #ZS: Add extra case attribute values (if any) - result = g.db.execute("""SELECT Value - FROM CaseAttributeXRef - WHERE ProbeSetFreezeId = %s AND - StrainId = %s AND - CaseAttributeId = %s - group by CaseAttributeXRef.CaseAttributeId""", ( - self.this_trait.dataset.id, sample_id, str(attribute))) - - attribute_value = result.fetchone().Value #Trait-specific attributes, if any - - #ZS: If it's an int, turn it into one for sorting - #(for example, 101 would be lower than 80 if they're strings instead of ints) - try: - attribute_value = int(attribute_value) - except ValueError: - pass - - attribute_values[self.attributes[attribute].name] = attribute_value - - return attribute_values - def se_exists(self): """Returns true if SE values exist for any samples, otherwise false""" -- cgit v1.2.3 From 83e45ddc49965b20ecd91b33d27f0d87df01b5bf Mon Sep 17 00:00:00 2001 From: zsloan Date: Fri, 15 May 2015 21:54:21 +0000 Subject: Position search now works for mRNA assay and genotype data sets --- wqflask/wqflask/do_search.py | 188 ++++++++++++++++++++------------------ wqflask/wqflask/search_results.py | 69 ++++++-------- 2 files changed, 129 insertions(+), 128 deletions(-) diff --git a/wqflask/wqflask/do_search.py b/wqflask/wqflask/do_search.py index 921a4a47..a3260f34 100755 --- a/wqflask/wqflask/do_search.py +++ b/wqflask/wqflask/do_search.py @@ -58,7 +58,14 @@ class DoSearch(object): @classmethod def get_search(cls, search_type): print("search_types are:", pf(cls.search_types)) - return cls.search_types[search_type] + + search_type_string = search_type['dataset_type'] + if 'key' in search_type: + search_type_string += '_' + search_type['key'] + + print("search_type_string is:", search_type_string) + + return cls.search_types[search_type_string] class QuickMrnaAssaySearch(DoSearch): """A general search for mRNA assays""" @@ -120,6 +127,22 @@ class MrnaAssaySearch(DoSearch): 'Max LRS Location', 'Additive Effect'] + def get_where_clause(self): + where_clause = """(MATCH (ProbeSet.Name, + ProbeSet.description, + ProbeSet.symbol, + alias, + GenbankId, + UniGeneId, + Probe_Target_Description) + AGAINST ('%s' IN BOOLEAN MODE)) + and ProbeSet.Id = ProbeSetXRef.ProbeSetId + and ProbeSetXRef.ProbeSetFreezeId = %s + """ % (escape(self.search_term[0]), + escape(str(self.dataset.id))) + + return where_clause + def compile_final_query(self, from_clause = '', where_clause = ''): """Generates the final query string""" @@ -138,23 +161,7 @@ class MrnaAssaySearch(DoSearch): return query - def get_where_clause(self): - where_clause = """(MATCH (ProbeSet.Name, - ProbeSet.description, - ProbeSet.symbol, - alias, - GenbankId, - UniGeneId, - Probe_Target_Description) - AGAINST ('%s' IN BOOLEAN MODE)) - and ProbeSet.Id = ProbeSetXRef.ProbeSetId - and ProbeSetXRef.ProbeSetFreezeId = %s - """ % (escape(self.search_term[0]), - escape(str(self.dataset.id))) - - return where_clause - - def run_combined(self, from_clause, where_clause): + def run_combined(self, from_clause = '', where_clause = ''): """Generates and runs a combined search of an mRNA expression dataset""" print("Running ProbeSetSearch") @@ -173,7 +180,6 @@ class MrnaAssaySearch(DoSearch): print("final query is:", pf(query)) - return self.execute(query) def run(self): @@ -219,7 +225,7 @@ class PhenotypeSearch(DoSearch): 'Max LRS Location', 'Additive Effect'] - def get_fields_clause(self): + def get_where_clause(self): """Generate clause for WHERE portion of query""" #Todo: Zach will figure out exactly what both these lines mean @@ -229,10 +235,10 @@ class PhenotypeSearch(DoSearch): # This adds a clause to the query that matches the search term # against each field in the search_fields tuple - fields_clause = [] + fields_clause_list = [] for field in self.search_fields: - fields_clause.append('''%s REGEXP "%s"''' % (field, search_term)) - fields_clause = "(%s) and " % ' OR '.join(fields_clause) + fields_clause_list.append('''%s REGEXP "%s"''' % (field, search_term)) + fields_clause = "(%s) " % ' OR '.join(fields_clause_list) return fields_clause @@ -241,6 +247,29 @@ class PhenotypeSearch(DoSearch): from_clause = self.normalize_spaces(from_clause) + query = (self.base_query + + """%s + WHERE %s + and PublishXRef.InbredSetId = %s + and PublishXRef.PhenotypeId = Phenotype.Id + and PublishXRef.PublicationId = Publication.Id + and PublishFreeze.Id = %s""" % ( + from_clause, + where_clause, + escape(str(self.dataset.group.id)), + escape(str(self.dataset.id)))) + + print("query is:", pf(query)) + + return query + + def run_combined(self, from_clause, where_clause): + """Generates and runs a combined search of an phenotype dataset""" + + print("Running PhenotypeSearch") + + from_clause = self.normalize_spaces(from_clause) + query = (self.base_query + """%s WHERE %s @@ -253,14 +282,15 @@ class PhenotypeSearch(DoSearch): escape(str(self.dataset.group.id)), escape(str(self.dataset.id)))) - print("query is:", pf(query)) + print("final query is:", pf(query)) - return query + + return self.execute(query) def run(self): """Generates and runs a simple search of a phenotype dataset""" - query = self.compile_final_query(where_clause = self.get_fields_clause()) + query = self.compile_final_query(where_clause = self.get_where_clause()) return self.execute(query) @@ -310,7 +340,7 @@ class QuickPhenotypeSearch(PhenotypeSearch): def run(self): """Generates and runs a search across all phenotype datasets""" - query = self.compile_final_query(where_clause = self.get_fields_clause()) + query = self.compile_final_query(where_clause = self.get_where_clause()) return self.execute(query) @@ -333,7 +363,7 @@ class GenotypeSearch(DoSearch): 'Record', 'Location'] - def get_fields_clause(self): + def get_where_clause(self): """Generate clause for part of the WHERE portion of query""" # This adds a clause to the query that matches the search term @@ -348,7 +378,7 @@ class GenotypeSearch(DoSearch): field), self.search_term)) print("hello ;where_clause is:", pf(fields_clause)) - fields_clause = "(%s)" % ' OR '.join(fields_clause) + fields_clause = "(%s) " % ' OR '.join(fields_clause) return fields_clause @@ -358,10 +388,10 @@ class GenotypeSearch(DoSearch): from_clause = self.normalize_spaces(from_clause) query = (self.base_query + - """WHERE %s and - Geno.Id = GenoXRef.GenoId and - GenoXRef.GenoFreezeId = GenoFreeze.Id and - GenoFreeze.Id = %s"""% (where_clause, + """WHERE %s + and Geno.Id = GenoXRef.GenoId + and GenoXRef.GenoFreezeId = GenoFreeze.Id + and GenoFreeze.Id = %s"""% (where_clause, escape(str(self.dataset.id)))) print("query is:", pf(query)) @@ -373,14 +403,14 @@ class GenotypeSearch(DoSearch): #Todo: Zach will figure out exactly what both these lines mean #and comment here - self.query = self.compile_final_query(where_clause = self.get_fields_clause()) + self.query = self.compile_final_query(where_clause = self.get_where_clause()) return self.execute(self.query) class RifSearch(MrnaAssaySearch): """Searches for traits with a Gene RIF entry including the search term.""" - DoSearch.search_types['RIF'] = "RifSearch" + DoSearch.search_types['ProbeSet_RIF'] = "RifSearch" def get_where_clause(self): where_clause = """( %s.symbol = GeneRIF_BASIC.symbol and @@ -390,10 +420,6 @@ class RifSearch(MrnaAssaySearch): return where_clause def run(self): - #where_clause = """( %s.symbol = GeneRIF_BASIC.symbol and - # MATCH (GeneRIF_BASIC.comment) - # AGAINST ('+%s' IN BOOLEAN MODE)) """ % (self.dataset.type, self.search_term[0]) - where_clause = self.get_where_clause() from_clause = ", GeneRIF_BASIC " @@ -404,7 +430,7 @@ class RifSearch(MrnaAssaySearch): class WikiSearch(MrnaAssaySearch): """Searches GeneWiki for traits other people have annotated""" - DoSearch.search_types['WIKI'] = "WikiSearch" + DoSearch.search_types['ProbeSet_WIKI'] = "WikiSearch" def get_where_clause(self): where_clause = """%s.symbol = GeneRIF.symbol @@ -433,7 +459,7 @@ class WikiSearch(MrnaAssaySearch): class GoSearch(MrnaAssaySearch): """Searches for synapse-associated genes listed in the Gene Ontology.""" - DoSearch.search_types['GO'] = "GoSearch" + DoSearch.search_types['ProbeSet_GO'] = "GoSearch" def run(self): field = 'GOterm.acc' @@ -525,43 +551,6 @@ class LrsSearch(MrnaAssaySearch): def run(self): self.from_clause = self.get_from_clause() - - #self.search_term = [float(value) for value in self.search_term] - # - #if self.search_operator == "=": - # assert isinstance(self.search_term, (list, tuple)) - # self.lrs_min, self.lrs_max = self.search_term[:2] - # - # self.sub_clause = """ %sXRef.LRS > %s and - # %sXRef.LRS < %s and """ % self.mescape(self.dataset.type, - # min(self.lrs_min, self.lrs_max), - # self.dataset.type, - # max(self.lrs_min, self.lrs_max)) - # - # if len(self.search_term) > 2: - # self.chr_num = self.search_term[2] - # self.sub_clause += """ Geno.Chr = %s and """ % (escape(self.chr_num)) - # if len(self.search_term) == 5: - # self.mb_low, self.mb_high = self.search_term[3:] - # self.sub_clause += """ Geno.Mb > %s and - # Geno.Mb < %s and - # """ % self.mescape(min(self.mb_low, self.mb_high), - # max(self.mb_low, self.mb_high)) - # print("self.sub_clause is:", pf(self.sub_clause)) - #else: - # # Deal with >, <, >=, and <= - # print("self.search_term is:", self.search_term) - # self.sub_clause = """ %sXRef.LRS %s %s and """ % self.mescape(self.dataset.type, - # self.search_operator, - # self.search_term[0]) - # - #self.where_clause = self.sub_clause + """ %sXRef.Locus = Geno.name and - # Geno.SpeciesId = %s and - # %s.Chr = Geno.Chr - # """ % self.mescape(self.dataset.type, - # self.species_id, - # self.dataset.type) - self.where_clause = self.get_where_clause() self.query = self.compile_final_query(self.from_clause, self.where_clause) @@ -673,7 +662,7 @@ class TransLrsSearch(CisTransLrsSearch): class MeanSearch(MrnaAssaySearch): """Searches for genes expressed within an interval (log2 units) determined by the user""" - DoSearch.search_types['MEAN'] = "MeanSearch" + DoSearch.search_types['ProbeSet_MEAN'] = "MeanSearch" def get_where_clause(self): self.search_term = [float(value) for value in self.search_term] @@ -732,7 +721,7 @@ class MeanSearch(MrnaAssaySearch): class RangeSearch(MrnaAssaySearch): """Searches for genes with a range of expression varying between two values""" - DoSearch.search_types['RANGE'] = "RangeSearch" + DoSearch.search_types['ProbeSet_RANGE'] = "RangeSearch" def get_where_clause(self): if self.search_operator == "=": @@ -791,10 +780,8 @@ class PositionSearch(DoSearch): for search_key in ('POSITION', 'POS', 'MB'): DoSearch.search_types[search_key] = "PositionSearch" - def setup(self): - self.search_term = [float(value) for value in self.search_term] - self.chr, self.mb_min, self.mb_max = self.search_term[:3] - self.where_clause = """ %s.Chr = '%s' and + def get_where_clause(self): + where_clause = """ %s.Chr = %s and %s.Mb > %s and %s.Mb < %s """ % self.mescape(self.dataset.type, self.chr, @@ -803,6 +790,21 @@ class PositionSearch(DoSearch): self.dataset.type, max(self.mb_min, self.mb_max)) + + return where_clause + + def setup(self): + self.search_term = [float(value) if is_number(value) else value for value in self.search_term] + self.chr, self.mb_min, self.mb_max = self.search_term[:3] + self.get_chr() + self.where_clause = self.get_where_clause() + + def get_chr(self): + try: + self.chr = int(self.chr) + except: + self.chr = int(self.chr.replace('chr', '')) + def run(self): self.setup() @@ -810,9 +812,12 @@ class PositionSearch(DoSearch): return self.execute(self.query) -class MrnaPositionSearch(MrnaAssaySearch, PositionSearch): +class MrnaPositionSearch(PositionSearch, MrnaAssaySearch): """Searches for genes located within a specified range on a specified chromosome""" + for search_key in ('POSITION', 'POS', 'MB'): + DoSearch.search_types['ProbeSet_'+search_key] = "MrnaPositionSearch" + def run(self): self.setup() @@ -820,9 +825,12 @@ class MrnaPositionSearch(MrnaAssaySearch, PositionSearch): return self.execute(self.query) -class GenotypePositionSearch(GenotypeSearch, PositionSearch): +class GenotypePositionSearch(PositionSearch, GenotypeSearch): """Searches for genes located within a specified range on a specified chromosome""" + for search_key in ('POSITION', 'POS', 'MB'): + DoSearch.search_types['Geno_'+search_key] = "GenotypePositionSearch" + def run(self): self.setup() @@ -863,7 +871,7 @@ class PvalueSearch(MrnaAssaySearch): class AuthorSearch(PhenotypeSearch): """Searches for phenotype traits with specified author(s)""" - DoSearch.search_types["NAME"] = "AuthorSearch" + DoSearch.search_types["Publish_NAME"] = "AuthorSearch" def run(self): @@ -875,6 +883,12 @@ class AuthorSearch(PhenotypeSearch): return self.execute(self.query) +def is_number(s): + try: + float(s) + return True + except ValueError: + return False if __name__ == "__main__": ### Usually this will be used as a library, but call it from the command line for testing diff --git a/wqflask/wqflask/search_results.py b/wqflask/wqflask/search_results.py index df1edb13..2a1e8bb3 100755 --- a/wqflask/wqflask/search_results.py +++ b/wqflask/wqflask/search_results.py @@ -223,26 +223,7 @@ class SearchResultPage(object): combined_from_clause = "" combined_where_clause = "" for i, a_search in enumerate(self.search_terms): - print("[kodak] item is:", pf(a_search)) - search_term = a_search['search_term'] - search_operator = a_search['separator'] - if a_search['key']: - search_type = a_search['key'].upper() - else: - # We fall back to the dataset type as the key to get the right object - search_type = self.dataset.type - - print("search_type is:", pf(search_type)) - - search_ob = do_search.DoSearch.get_search(search_type) - search_class = getattr(do_search, search_ob) - the_search = search_class(search_term, - search_operator, - self.dataset, - ) - - #search_query = the_search.get_final_query() - + the_search = self.get_search_ob(a_search) get_from_clause = getattr(the_search, "get_from_clause", None) if callable(get_from_clause): from_clause = the_search.get_from_clause() @@ -251,31 +232,37 @@ class SearchResultPage(object): combined_where_clause += "(" + where_clause + ")" if (i+1) < len(self.search_terms): combined_where_clause += "AND" - - results = the_search.run_combined(combined_from_clause, combined_where_clause) + final_query = the_search.compile_final_query(combined_from_clause, combined_where_clause) + results = the_search.execute(final_query) self.results.extend(results) - else: for a_search in self.search_terms: - print("[kodak] item is:", pf(a_search)) - search_term = a_search['search_term'] - search_operator = a_search['separator'] - if a_search['key']: - search_type = a_search['key'].upper() - else: - # We fall back to the dataset type as the key to get the right object - search_type = self.dataset.type - - print("search_type is:", pf(search_type)) - - search_ob = do_search.DoSearch.get_search(search_type) - search_class = getattr(do_search, search_ob) - print("search_class is: ", pf(search_class)) - the_search = search_class(search_term, - search_operator, - self.dataset, - ) + the_search = self.get_search_ob(a_search) self.results.extend(the_search.run()) #print("in the search results are:", self.results) self.header_fields = the_search.header_fields + + def get_search_ob(self, a_search): + print("[kodak] item is:", pf(a_search)) + search_term = a_search['search_term'] + search_operator = a_search['separator'] + search_type = {} + search_type['dataset_type'] = self.dataset.type + if a_search['key']: + search_type['key'] = a_search['key'].upper() + #search_type = a_search['key'].upper() + #else: + # # We fall back to the dataset type as the key to get the right object + # search_type = self.dataset.type + + print("search_type is:", pf(search_type)) + + search_ob = do_search.DoSearch.get_search(search_type) + search_class = getattr(do_search, search_ob) + print("search_class is: ", pf(search_class)) + the_search = search_class(search_term, + search_operator, + self.dataset, + ) + return the_search -- cgit v1.2.3 From d06d81615021609dac37f8958a536de412fe1f05 Mon Sep 17 00:00:00 2001 From: Artem Tarasov Date: Mon, 18 May 2015 08:25:43 +0300 Subject: fix broken trait page --- wqflask/wqflask/show_trait/SampleList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/show_trait/SampleList.py b/wqflask/wqflask/show_trait/SampleList.py index 46f2ae43..e98870e3 100755 --- a/wqflask/wqflask/show_trait/SampleList.py +++ b/wqflask/wqflask/show_trait/SampleList.py @@ -26,6 +26,7 @@ class SampleList(object): self.header = header self.sample_list = [] # The actual list + self.sample_attribute_values = {} self.get_attributes() print("camera: attributes are:", pf(self.attributes)) @@ -156,7 +157,6 @@ class SampleList(object): self.attributes[key].distinct_values.sort(key=natural_sort_key) def get_extra_attribute_values(self): - self.sample_attribute_values = {} if self.attributes: results = g.db.execute(''' SELECT Strain.Name AS SampleName, CaseAttributeId AS Id, CaseAttributeXRef.Value -- cgit v1.2.3