From 34e4933de5a1cd444abe618fcfd93b424bf3442e Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 20 Apr 2021 01:38:26 +0300 Subject: refactor code for iterating mrna tissue data --- wqflask/base/mrna_assay_tissue_data.py | 39 +++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'wqflask/base') diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py index f1929518..0220d73b 100644 --- a/wqflask/base/mrna_assay_tissue_data.py +++ b/wqflask/base/mrna_assay_tissue_data.py @@ -6,6 +6,7 @@ from utility import db_tools from utility import Bunch from utility.db_tools import escape +from gn3.db_utils import database_connector from utility.logger import getLogger @@ -44,16 +45,42 @@ class MrnaAssayTissueData(object): and t.Mean = x.maxmean; '''.format(in_clause) - results = g.db.execute(query).fetchall() - lower_symbols = [] + # lower_symbols = [] + lower_symbols = {} for gene_symbol in gene_symbols: + # lower_symbols[gene_symbol.lower()] = True if gene_symbol != None: - lower_symbols.append(gene_symbol.lower()) - + lower_symbols[gene_symbol.lower()] = True + + import time + # initial_time = time.time() + # conn,cursor = database_connector() + # cursor.execute(query) + # for result in cursor.fetchall(): + # symbol = result[0] + # self.data[symbol].gene_id = result[1] + # self.data[symbol].data_id = result[2] + # self.data[symbol].chr = result[3] + # self.data[symbol].mb = result[4] + # self.data[symbol].description = result[5] + # self.data[symbol].probe_target_description = result[6] + + + # print("my loop takes>>>>",time.time()-initial_time) + # conn.close() + # r + + # takes 5 seconds + initial_time = time.time() + results = list(g.db.execute(query).fetchall()) for result in results: symbol = result[0] - if symbol.lower() in lower_symbols: + # if symbol is not None + # exists = lower_symbols.get(symbol.lower()) + # if symbol.lower() in lower_symbols: + if symbol is not None and lower_symbols.get(symbol.lower()): + symbol = symbol.lower() self.data[symbol].gene_id = result.GeneId @@ -62,6 +89,7 @@ class MrnaAssayTissueData(object): self.data[symbol].mb = result.Mb self.data[symbol].description = result.description self.data[symbol].probe_target_description = result.Probe_Target_Description + print("time taken in the loop is",time.time()-initial_time) ########################################################################### #Input: cursor, symbolList (list), dataIdDict(Dict) @@ -82,6 +110,7 @@ class MrnaAssayTissueData(object): WHERE TissueProbeSetData.Id IN {} and TissueProbeSetXRef.DataId = TissueProbeSetData.Id""".format(db_tools.create_in_clause(id_list)) + results = g.db.execute(query).fetchall() for result in results: if result.Symbol.lower() not in symbol_values_dict: -- cgit v1.2.3 From 1b0566d7c9779b979d20c350f66d5628fb55eba6 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Fri, 23 Apr 2021 23:22:46 +0300 Subject: debugging for fetching probe data --- wqflask/base/data_set.py | 51 ++++++++++++++++++++-- wqflask/wqflask/correlation/correlation_gn3_api.py | 2 +- wqflask/wqflask/views.py | 3 ++ 3 files changed, 51 insertions(+), 5 deletions(-) (limited to 'wqflask/base') diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 178234fe..468c4da0 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -115,7 +115,8 @@ Publish or ProbeSet. E.g. except: pass - self.redis_instance.set("dataset_structure", json.dumps(self.datasets)) + self.redis_instance.set( + "dataset_structure", json.dumps(self.datasets)) def set_dataset_key(self, t, name): """If name is not in the object's dataset dictionary, set it, and update @@ -154,10 +155,12 @@ Publish or ProbeSet. E.g. if t in ['pheno', 'other_pheno']: group_name = name.replace("Publish", "") - results = g.db.execute(sql_query_mapping[t].format(group_name)).fetchone() + results = g.db.execute( + sql_query_mapping[t].format(group_name)).fetchone() if results: self.datasets[name] = dataset_name_mapping[t] - self.redis_instance.set("dataset_structure", json.dumps(self.datasets)) + self.redis_instance.set( + "dataset_structure", json.dumps(self.datasets)) return True return None @@ -169,7 +172,8 @@ Publish or ProbeSet. E.g. # This has side-effects, with the end result being a truth-y value if(self.set_dataset_key(t, name)): break - return self.datasets.get(name, None) # Return None if name has not been set + # Return None if name has not been set + return self.datasets.get(name, None) # Do the intensive work at startup one time only @@ -651,6 +655,43 @@ class DataSet(object): "Dataset {} is not yet available in GeneNetwork.".format(self.name)) pass + def fetch_probe_trait_data(self, sample_list=None): + if sample_list: + self.samplelist = sample_list + else: + self.samplelist = self.group.samplelist + + if self.group.parlist != None and self.group.f1list != None: + if (self.group.parlist + self.group.f1list) in self.samplelist: + self.samplelist += self.group.parlist + self.group.f1list + + query = """ + SELECT Strain.Name, Strain.Id FROM Strain, Species + WHERE Strain.Name IN {} + and Strain.SpeciesId=Species.Id + and Species.name = '{}' + """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) + logger.sql(query) + results = dict(g.db.execute(query).fetchall()) + sample_ids = [results[item] for item in self.samplelist] + + query = """SELECT * from ProbeSetData WHERE Id in ( SELECT ProbeSetXRef.DataId FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and ProbeSetFreeze.Name = 'HC_M2_0606_P' and ProbeSet.Id = ProbeSetXRef.ProbeSetId order by ProbeSet.Id ) and StrainId in ({})""".format( + ",".join(str(sample_id) for sample_id in sample_ids)) + + results = g.db.execute(query).fetchall() + + # with conn: + # cursor = conn.cursor() + # cursor.execute(query) + # results = cursor.fetchall() + trait_data = {} + for trait_id, StrainId, value in results: + if trait_id in trait_data: + trait_data[trait_id].append(value) + else: + trait_data[trait_id] = [value] + self.trait_data = trait_data + def get_trait_data(self, sample_list=None): if sample_list: self.samplelist = sample_list @@ -670,6 +711,7 @@ class DataSet(object): logger.sql(query) results = dict(g.db.execute(query).fetchall()) sample_ids = [results[item] for item in self.samplelist] + print("the number of sample ids are", len(sample_ids)) # MySQL limits the number of tables that can be used in a join to 61, # so we break the sample ids into smaller chunks @@ -720,6 +762,7 @@ class DataSet(object): trait_sample_data.append(results) trait_count = len(trait_sample_data[0]) + print("the trait count is >>>", trait_count) self.trait_data = collections.defaultdict(list) # put all of the separate data together into a dictionary where the keys are diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py index e7394647..51bf5fb5 100644 --- a/wqflask/wqflask/correlation/correlation_gn3_api.py +++ b/wqflask/wqflask/correlation/correlation_gn3_api.py @@ -78,7 +78,7 @@ def compute_correlation(start_vars, method="pearson"): # } sample_data = process_samples( start_vars, this_dataset.group.samplelist) - target_dataset.get_trait_data(list(sample_data.keys())) + target_dataset.fetch_probe_trait_data(list(sample_data.keys())) this_trait = retrieve_sample_data(this_trait, this_dataset) print("Creating dataset and trait took", time.time()-initial_time) diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py index 072db466..2c239425 100644 --- a/wqflask/wqflask/views.py +++ b/wqflask/wqflask/views.py @@ -881,7 +881,10 @@ def network_graph_page(): def corr_compute_page(): logger.info("In corr_compute, request.form is:", pf(request.form)) logger.info(request.url) + import time + initial_time = time.time() correlation_results = compute_correlation(request.form) + print(">>>>Time taken by this endpoint",time.time()-initial_time) return render_template("demo_correlation_page.html",correlation_results=correlation_results[1:20]) @app.route("/corr_matrix", methods=('POST',)) -- cgit v1.2.3 From 067d27460965aaf1ceaa863a315a0c7dbc47ae02 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 26 Apr 2021 17:05:06 +0300 Subject: fix:remove debug statements and commented code --- wqflask/base/mrna_assay_tissue_data.py | 25 --------- wqflask/wqflask/correlation/correlation_gn3_api.py | 60 +++------------------- 2 files changed, 8 insertions(+), 77 deletions(-) (limited to 'wqflask/base') diff --git a/wqflask/base/mrna_assay_tissue_data.py b/wqflask/base/mrna_assay_tissue_data.py index 0220d73b..5a64afb2 100644 --- a/wqflask/base/mrna_assay_tissue_data.py +++ b/wqflask/base/mrna_assay_tissue_data.py @@ -52,33 +52,9 @@ class MrnaAssayTissueData(object): # lower_symbols[gene_symbol.lower()] = True if gene_symbol != None: lower_symbols[gene_symbol.lower()] = True - - import time - # initial_time = time.time() - # conn,cursor = database_connector() - # cursor.execute(query) - # for result in cursor.fetchall(): - # symbol = result[0] - # self.data[symbol].gene_id = result[1] - # self.data[symbol].data_id = result[2] - # self.data[symbol].chr = result[3] - # self.data[symbol].mb = result[4] - # self.data[symbol].description = result[5] - # self.data[symbol].probe_target_description = result[6] - - - # print("my loop takes>>>>",time.time()-initial_time) - # conn.close() - # r - - # takes 5 seconds - initial_time = time.time() results = list(g.db.execute(query).fetchall()) for result in results: symbol = result[0] - # if symbol is not None - # exists = lower_symbols.get(symbol.lower()) - # if symbol.lower() in lower_symbols: if symbol is not None and lower_symbols.get(symbol.lower()): symbol = symbol.lower() @@ -89,7 +65,6 @@ class MrnaAssayTissueData(object): self.data[symbol].mb = result.Mb self.data[symbol].description = result.description self.data[symbol].probe_target_description = result.Probe_Target_Description - print("time taken in the loop is",time.time()-initial_time) ########################################################################### #Input: cursor, symbolList (list), dataIdDict(Dict) diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py index c945f699..3c21a850 100644 --- a/wqflask/wqflask/correlation/correlation_gn3_api.py +++ b/wqflask/wqflask/correlation/correlation_gn3_api.py @@ -63,9 +63,6 @@ def sample_for_trait_lists(corr_results, target_dataset, this_trait, this_datase "trait_sample_data": sample_data, "trait_id": start_vars["trait_id"] } - # trait_lists = dict([(list(corr_result)[0],True) for corr_result in corr_results]) - # target_dataset.trait_data =list(filter(lambda dict_obj: dict_obj.keys()[ - # 0] in corr_results_traits, target_dataset_data)) results = map_shared_keys_to_values( target_dataset.samplelist, target_dataset.trait_data) correlation_results = compute_all_sample_correlation(corr_method="pearson", @@ -77,33 +74,15 @@ def sample_for_trait_lists(corr_results, target_dataset, this_trait, this_datase def tissue_for_trait_lists(corr_results, this_dataset, target_dataset, this_trait): - # # print(corr_results[0])-- - # [{"awsdsd_at": {'corr_coeffient': 0.49714692782257336, 'p_value': 1.872077762359228e-05, 'num_overlap': 67}}] - - print("creating trait_lists") - # corr_results = corr_results[0::] trait_lists = dict([(list(corr_result)[0], True) for corr_result in corr_results]) - print("finished creating trait_list") - traits_symbol_dict = this_dataset.retrieve_genes("Symbol") - print("Retrieved symbol dict") - print("creating dict here>>>>>>>>>") - import time - init_time = time.time() traits_symbol_dict = dict({trait_name: symbol for ( trait_name, symbol) in traits_symbol_dict.items() if trait_lists.get(trait_name)}) - print("time taken to create this max dict is>>>>", time.time()-init_time) - print("finished creatinf the dict") - print("Fetching tissue datas") primary_tissue_data, target_tissue_data = get_tissue_correlation_input( this_trait, traits_symbol_dict) - print("finihsed>>>>>>>>>>>>>>>>>>") - print("Calling experimental_compute_all_tissue_correlation") corr_results = experimental_compute_all_tissue_correlation( primary_tissue_dict=primary_tissue_data, target_tissues_data=target_tissue_data, corr_method="pearson") - # print('finished calling this tissue reuslts',corr_results) - return corr_results @@ -123,22 +102,14 @@ def compute_correlation(start_vars, method="pearson"): corr_input_data = {} if corr_type == "sample": - import time - initial_time = time.time() - # corr_input_data = { - # "target_dataset": target_dataset.trait_data, - # "target_samplelist": target_dataset.samplelist, - # "trait_data": { - # "trait_sample_data": sample_data, - # "trait_id": start_vars["trait_id"] - # } - # } + sample_data = process_samples( start_vars, this_dataset.group.samplelist) + initial_time = time.time() target_dataset.get_trait_data(list(sample_data.keys())) this_trait = retrieve_sample_data(this_trait, this_dataset) + print("Creating target dataset and trait took", time.time()-initial_time) - print("Creating dataset and trait took", time.time()-initial_time) this_trait_data = { "trait_sample_data": sample_data, @@ -151,15 +122,9 @@ def compute_correlation(start_vars, method="pearson"): this_trait=this_trait_data, target_dataset=results) - print("computedd>>>>>>>>>>>>>") - print("doing sample correlation took", time.time()-initial_time) - - other_results_time = time.time() - other_results = tissue_for_trait_lists( - correlation_results, this_dataset, target_dataset, this_trait) - print(">>>time taken for this is", time.time()-other_results_time) - + # other_results = tissue_for_trait_lists( + # correlation_results, this_dataset, target_dataset, this_trait) # requests_url = f"{GN3_CORRELATION_API}/sample_x/{method}" return correlation_results @@ -177,17 +142,9 @@ def compute_correlation(start_vars, method="pearson"): target_tissues_data=corr_input_data[ "target_tissues_dict"], corr_method=method) - print("correlation y took", time.time()-initial_time) - # initial_time = time.time() - # correlation_results = compute_all_tissue_correlation(primary_tissue_dict=corr_input_data["primary_tissue"], - # target_tissues_data=corr_input_data["target_tissues_dict"], - # corr_method=method) - # print("time taken for compute tissue is", time.time()-initial_time) - - # requests_url = f"{GN3_CORRELATION_API}/tissue_corr/{method}" - - sample_results = sample_for_trait_lists( - correlation_results, target_dataset, this_trait, this_dataset, start_vars) + print("computing tissue took >>>>", time.time()-initial_time) + # sample_results = sample_for_trait_lists( + # correlation_results, target_dataset, this_trait, this_dataset, start_vars) return correlation_results elif corr_type == "lit": @@ -203,7 +160,6 @@ def compute_correlation(start_vars, method="pearson"): species=species, gene_id=this_trait_geneid) return lit_corr_results - print("the time taken is", time.time()-initial_time) # requests_url = f"{GN3_CORRELATION_API}/lit_corr/{species}/{this_trait_geneid}" # corr_input_data = geneid_dict # corr_results = requests.post(requests_url, json=corr_input_data) -- cgit v1.2.3 From 27538980f93c1d72b0b2d76151312f3fbce4c9a5 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Mon, 10 May 2021 08:24:42 +0300 Subject: add previous endpoint for correlation --- wqflask/base/data_set.py | 37 ------------------------------------- wqflask/wqflask/views.py | 13 ++++++++----- 2 files changed, 8 insertions(+), 42 deletions(-) (limited to 'wqflask/base') diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 468c4da0..d0f5e6f2 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -655,42 +655,7 @@ class DataSet(object): "Dataset {} is not yet available in GeneNetwork.".format(self.name)) pass - def fetch_probe_trait_data(self, sample_list=None): - if sample_list: - self.samplelist = sample_list - else: - self.samplelist = self.group.samplelist - - if self.group.parlist != None and self.group.f1list != None: - if (self.group.parlist + self.group.f1list) in self.samplelist: - self.samplelist += self.group.parlist + self.group.f1list - - query = """ - SELECT Strain.Name, Strain.Id FROM Strain, Species - WHERE Strain.Name IN {} - and Strain.SpeciesId=Species.Id - and Species.name = '{}' - """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) - logger.sql(query) - results = dict(g.db.execute(query).fetchall()) - sample_ids = [results[item] for item in self.samplelist] - - query = """SELECT * from ProbeSetData WHERE Id in ( SELECT ProbeSetXRef.DataId FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and ProbeSetFreeze.Name = 'HC_M2_0606_P' and ProbeSet.Id = ProbeSetXRef.ProbeSetId order by ProbeSet.Id ) and StrainId in ({})""".format( - ",".join(str(sample_id) for sample_id in sample_ids)) - results = g.db.execute(query).fetchall() - - # with conn: - # cursor = conn.cursor() - # cursor.execute(query) - # results = cursor.fetchall() - trait_data = {} - for trait_id, StrainId, value in results: - if trait_id in trait_data: - trait_data[trait_id].append(value) - else: - trait_data[trait_id] = [value] - self.trait_data = trait_data def get_trait_data(self, sample_list=None): if sample_list: @@ -711,7 +676,6 @@ class DataSet(object): logger.sql(query) results = dict(g.db.execute(query).fetchall()) sample_ids = [results[item] for item in self.samplelist] - print("the number of sample ids are", len(sample_ids)) # MySQL limits the number of tables that can be used in a join to 61, # so we break the sample ids into smaller chunks @@ -762,7 +726,6 @@ class DataSet(object): trait_sample_data.append(results) trait_count = len(trait_sample_data[0]) - print("the trait count is >>>", trait_count) self.trait_data = collections.defaultdict(list) # put all of the separate data together into a dictionary where the keys are diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py index b042a211..19779651 100644 --- a/wqflask/wqflask/views.py +++ b/wqflask/wqflask/views.py @@ -881,11 +881,14 @@ def network_graph_page(): def corr_compute_page(): logger.info("In corr_compute, request.form is:", pf(request.form)) logger.info(request.url) - import time - initial_time = time.time() - correlation_results = compute_correlation(request.form) - print(">>>>Time taken by this endpoint",time.time()-initial_time) - return render_template("test_correlation_page.html",correlation_results=correlation_results) + template_vars = show_corr_results.CorrelationResults(request.form) + return render_template("correlation_page.html", **template_vars.__dict__) + + # to test the new correlation api uncomment these lines + + # correlation_results = compute_correlation(request.form) + # print(">>>>Time taken by this endpoint",time.time()-initial_time) + # return render_template("test_correlation_page.html",correlation_results=correlation_results) @app.route("/corr_matrix", methods=('POST',)) def corr_matrix_page(): -- cgit v1.2.3