From 25bd2fa7ac229eb7862fe778fe03eb75ff34368c Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Thu, 13 Jun 2013 21:13:51 +0000 Subject: Fixed issue where too much memory was used as a result of creating a dataset object for each trait in the correlation results Added new fields/columns for each trait in the correlation result table (max LRS, max LRS location, mean expression) Fixed error if trait doesn't have these fields --- wqflask/base/data_set.py | 30 +++---- wqflask/base/trait.py | 27 +++++-- wqflask/utility/helper_functions.py | 2 +- wqflask/wqflask/correlation/show_corr_results.py | 99 +++++++++++------------- wqflask/wqflask/search_results.py | 2 +- 5 files changed, 83 insertions(+), 77 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 0c7676c4..0903bf16 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -90,8 +90,8 @@ class Markers(object): self.markers = json.load(json_data_fh) def add_pvalues(self, p_values): - print("length of self.markers:", len(self.markers)) - print("length of p_values:", len(p_values)) + #print("length of self.markers:", len(self.markers)) + #print("length of p_values:", len(p_values)) # THIS IS only needed for the case when we are limiting the number of p-values calculated if len(self.markers) < len(p_values): @@ -161,7 +161,7 @@ class DatasetGroup(object): self.f1list = None self.parlist = None self.get_f1_parent_strains() - print("parents/f1s: {}:{}".format(self.parlist, self.f1list)) + #print("parents/f1s: {}:{}".format(self.parlist, self.f1list)) self.species = webqtlDatabaseFunction.retrieve_species(self.name) @@ -170,7 +170,7 @@ class DatasetGroup(object): def get_markers(self): - print("self.species is:", self.species) + #print("self.species is:", self.species) if self.species == "human": marker_class = HumanMarkers else: @@ -293,14 +293,14 @@ class DataSet(object): self.name, self.name, self.name)) - print("query_args are:", query_args) + #print("query_args are:", query_args) - print(""" - SELECT Id, Name, FullName, ShortName - FROM %s - WHERE public > %s AND - (Name = '%s' OR FullName = '%s' OR ShortName = '%s') - """ % (query_args)) + #print(""" + # SELECT Id, Name, FullName, ShortName + # FROM %s + # WHERE public > %s AND + # (Name = '%s' OR FullName = '%s' OR ShortName = '%s') + # """ % (query_args)) self.id, self.name, self.fullname, self.shortname = g.db.execute(""" SELECT Id, Name, FullName, ShortName @@ -624,12 +624,12 @@ class MrnaAssayDataSet(DataSet): and ProbeSetFreezeId = {} """.format(escape(str(self.id))) results = g.db.execute(query).fetchall() - print("After get_trait_list query") + #print("After get_trait_list query") trait_data = {} for trait in results: print("Retrieving sample_data for ", trait[0]) trait_data[trait[0]] = self.retrieve_sample_data(trait[0]) - print("After retrieve_sample_data") + #print("After retrieve_sample_data") return trait_data def get_trait_data(self): @@ -763,7 +763,7 @@ class MrnaAssayDataSet(DataSet): """ % (escape(str(this_trait.dataset.id)), escape(this_trait.name))) - print("query is:", pf(query)) + #print("query is:", pf(query)) result = g.db.execute(query).fetchone() @@ -926,7 +926,7 @@ class TempDataSet(DataSet): def geno_mrna_confidentiality(ob): dataset_table = ob.type + "Freeze" - print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table)) + #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table)) query = '''SELECT Id, Name, FullName, confidentiality, AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table) diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index 53f41779..f333d5a7 100755 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -1,6 +1,8 @@ from __future__ import absolute_import, division, print_function import string +import resource + from htmlgen import HTMLgen2 as HT @@ -15,6 +17,10 @@ from pprint import pformat as pf from flask import Flask, g +def print_mem(stage=""): + mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + print("{}: {}".format(stage, mem/1024)) + class GeneralTrait(object): """ Trait class defines a trait in webqtl, can be either Microarray, @@ -23,8 +29,12 @@ class GeneralTrait(object): """ def __init__(self, **kw): - #print("in GeneralTrait") - self.dataset = kw.get('dataset') # database name + # xor assertion + assert bool(kw.get('dataset')) != bool(kw.get('dataset_name')), "Needs dataset ob. xor name"; + if kw.get('dataset_name'): + self.dataset = create_dataset(kw.get('dataset_name')) + else: + self.dataset = kw.get('dataset') self.name = kw.get('name') # Trait ID, ProbeSet ID, Published ID, etc. self.cellid = kw.get('cellid') self.identification = kw.get('identification', 'un-named trait') @@ -39,8 +49,6 @@ class GeneralTrait(object): # self.cellid is set to None above elif len(name2) == 3: self.dataset, self.name, self.cellid = name2 - - self.dataset = create_dataset(self.dataset) # Todo: These two lines are necessary most of the time, but perhaps not all of the time # So we could add a simple if statement to short-circuit this if necessary @@ -355,8 +363,17 @@ class GeneralTrait(object): #traitQTL = self.cursor.fetchone() if traitQTL: self.locus, self.lrs, self.pvalue, self.mean = traitQTL + if self.locus: + result = g.db.execute(""" + select Geno.Chr, Geno.Mb from Geno, Species + where Species.Name = '%s' and + Geno.Name = '%s' and + Geno.SpeciesId = Species.Id + """, (species, self.locus)).fetchone() + self.locus_chr = result[0] + self.locus_mb = result[1] else: - self.locus = self.lrs = self.pvalue = self.mean = "" + self.locus = self.locus_chr = self.locus_mb = self.lrs = self.pvalue = self.mean = "" if self.dataset.type == 'Publish': traitQTL = g.db.execute(""" SELECT diff --git a/wqflask/utility/helper_functions.py b/wqflask/utility/helper_functions.py index 28242c27..d76a32ce 100644 --- a/wqflask/utility/helper_functions.py +++ b/wqflask/utility/helper_functions.py @@ -9,7 +9,7 @@ def get_species_dataset_trait(self, start_vars): #assert type(read_genotype) == type(bool()), "Expecting boolean value for read_genotype" self.dataset = data_set.create_dataset(start_vars['dataset']) self.species = TheSpecies(dataset=self.dataset) - self.this_trait = GeneralTrait(dataset=self.dataset.name, + self.this_trait = GeneralTrait(dataset=self.dataset, name=start_vars['trait_id'], cellid=None) diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 96c0155b..3b8b7ba2 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -92,11 +92,6 @@ class CorrelationResults(object): # #RANK_ORDERS = {"1": 0, "2": 1, "3": 0, "4": 0, "5": 1} - - #def error(self, message, *args, **kw): - # heading = heading or self.PAGE_HEADING - # return templatePage.error(heading = heading, detail = [message], error=error) - def __init__(self, start_vars): # get trait list from db (database name) # calculate correlation with Base vector and targets @@ -104,10 +99,8 @@ class CorrelationResults(object): #self.this_trait = GeneralTrait(dataset=self.dataset.name, # name=start_vars['trait_id'], # cellid=None) - #print("start_vars: ", pf(start_vars)) with Bench("Doing correlations"): - print_mem("At beginning") helper_functions.get_species_dataset_trait(self, start_vars) self.dataset.group.read_genotype_file() @@ -138,7 +131,6 @@ class CorrelationResults(object): self.correlation_data = {} - print_mem("Before calculating correlations") for trait, values in self.target_dataset.trait_data.iteritems(): this_trait_values = [] target_values = [] @@ -150,63 +142,60 @@ class CorrelationResults(object): target_values.append(target_sample_value) this_trait_values, target_values = normalize_values(this_trait_values, target_values) - + if self.corr_method == 'pearson': sample_r, sample_p = scipy.stats.pearsonr(this_trait_values, target_values) else: sample_r, sample_p = scipy.stats.spearmanr(this_trait_values, target_values) - + self.correlation_data[trait] = [sample_r, sample_p] - - print_mem("After calculating correlations") - + self.correlation_data = collections.OrderedDict(sorted(self.correlation_data.items(), key=lambda t: -abs(t[1][0]))) - + self.correlation_data_slice = collections.OrderedDict() - - old_memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss - + for trait_counter, trait in enumerate(self.correlation_data.keys()[:300]): - print_mem("In trait info loop") - print("\nTrait #:", trait_counter) - print_mem("Before trait_object") - trait_object = GeneralTrait(dataset=self.dataset.name, name=trait) - print_mem("After trait object") - trait_info = dict( - correlation = float(self.correlation_data[trait][0]), - p_value = float(self.correlation_data[trait][1]), - symbol = trait_object.symbol, - alias = trait_object.alias, - description = trait_object.description, - chromosome = trait_object.chr, - mb = trait_object.mb - ) - print_mem("Before deleting trait object") - del trait_object - print_mem("After deleting trait object") - gc.collect() - print_mem("After colleting garabage") - print("** trait_info:", pf(trait_info)) - print("\n** Start trait_info") - counter = 1 - for key, value in trait_info.iteritems(): - print(" <{}> [{}] {}: [{}] {}\n".format( - counter, type(key), key, type(value), value)) - counter += 1 - print("** Done trait_info") + trait_object = GeneralTrait(dataset=self.dataset, name=trait) + if self.dataset.type == 'ProbeSet': + trait_info = collections.OrderedDict( + correlation = float(self.correlation_data[trait][0]), + p_value = float(self.correlation_data[trait][1]), + symbol = trait_object.symbol, + alias = trait_object.alias, + description = trait_object.description, + chromosome = trait_object.chr, + mb = trait_object.mb + ) + if hasattr(trait_object, 'mean'): + trait_info[mean] = trait_object.mean + if hasattr(trait_object, 'lrs'): + trait_info[lrs] = trait_object.lrs + if hasattr(trait_object, 'locus_chr'): + trait_info[locus_chr] = trait_object.locus_chr + if hasattr(trait_object, 'locus_mb'): + trait_info[locus_mb] = trait_object.locus_mb + elif self.dataset.type == 'Geno': + trait_info = collections.OrderedDict( + correlation = float(self.correlation_data[trait][0]), + p_value = float(self.correlation_data[trait][1]), + symbol = trait_object.symbol, + alias = trait_object.alias, + description = trait_object.description, + chromosome = trait_object.chr, + mb = trait_object.mb + ) + else: # 'Publish' + trait_info = collections.OrderedDict( + correlation = float(self.correlation_data[trait][0]), + p_value = float(self.correlation_data[trait][1]), + symbol = trait_object.symbol, + alias = trait_object.alias, + description = trait_object.description, + chromosome = trait_object.chr, + mb = trait_object.mb + ) self.correlation_data_slice[trait] = trait_info - #self.correlation_data_slice[trait].append(trait_object) - - new_memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss - print("Memory difference:", new_memory_usage-old_memory_usage) - old_memory_usage = new_memory_usage - print_mem("End of purple loop") - print("*************************** End purple ******** ") - - print_mem("After getting trait info") - print("Garbage colleting...") - gc.collect() #XZ, 09/18/2008: get all information about the user selected database. #target_db_name = fd.corr_dataset diff --git a/wqflask/wqflask/search_results.py b/wqflask/wqflask/search_results.py index dc872a8b..e171f1ab 100644 --- a/wqflask/wqflask/search_results.py +++ b/wqflask/wqflask/search_results.py @@ -106,7 +106,7 @@ class SearchResultPage(object): print("foo locals are:", locals()) trait_id = result[0] - this_trait = GeneralTrait(dataset=self.dataset.name, name=trait_id) + this_trait = GeneralTrait(dataset=self.dataset, name=trait_id) this_trait.retrieve_info(QTL=True) self.trait_list.append(this_trait) -- cgit v1.2.3