From 25bd2fa7ac229eb7862fe778fe03eb75ff34368c Mon Sep 17 00:00:00 2001
From: Lei Yan
Date: Thu, 13 Jun 2013 21:13:51 +0000
Subject: Fixed issue where too much memory was used as a result of creating a
 dataset object for each trait in the correlation results

Added new fields/columns for each trait in the correlation result table
(max LRS, max LRS location, mean expression)

Fixed an error that occurred when a trait doesn't have these fields
---
 wqflask/base/data_set.py                         | 30 +++----
 wqflask/base/trait.py                            | 27 +++++--
 wqflask/utility/helper_functions.py              |  2 +-
 wqflask/wqflask/correlation/show_corr_results.py | 99 +++++++++++-------------
 wqflask/wqflask/search_results.py                |  2 +-
 5 files changed, 83 insertions(+), 77 deletions(-)

diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 0c7676c4..0903bf16 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -90,8 +90,8 @@ class Markers(object):
         self.markers = json.load(json_data_fh)
     
     def add_pvalues(self, p_values):
-        print("length of self.markers:", len(self.markers))
-        print("length of p_values:", len(p_values))
+        #print("length of self.markers:", len(self.markers))
+        #print("length of p_values:", len(p_values))
         
         # THIS IS only needed for the case when we are limiting the number of p-values calculated
         if len(self.markers) < len(p_values):
@@ -161,7 +161,7 @@ class DatasetGroup(object):
         self.f1list = None
         self.parlist = None        
         self.get_f1_parent_strains()
-        print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+        #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
         
         self.species = webqtlDatabaseFunction.retrieve_species(self.name)
         
@@ -170,7 +170,7 @@ class DatasetGroup(object):
         
         
     def get_markers(self):
-        print("self.species is:", self.species)
+        #print("self.species is:", self.species)
         if self.species == "human":
             marker_class = HumanMarkers 
         else:
@@ -293,14 +293,14 @@ class DataSet(object):
             self.name,
             self.name,
             self.name))
-        print("query_args are:", query_args)
+        #print("query_args are:", query_args)
 
-        print("""
-                SELECT Id, Name, FullName, ShortName
-                FROM %s
-                WHERE public > %s AND
-                     (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
-          """ % (query_args))
+        #print("""
+        #        SELECT Id, Name, FullName, ShortName
+        #        FROM %s
+        #        WHERE public > %s AND
+        #             (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
+        #  """ % (query_args))
 
         self.id, self.name, self.fullname, self.shortname = g.db.execute("""
                 SELECT Id, Name, FullName, ShortName
@@ -624,12 +624,12 @@ class MrnaAssayDataSet(DataSet):
             and ProbeSetFreezeId = {}
             """.format(escape(str(self.id)))
         results = g.db.execute(query).fetchall()
-        print("After get_trait_list query")
+        #print("After get_trait_list query")
         trait_data = {}
         for trait in results:
             print("Retrieving sample_data for ", trait[0])
             trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
-        print("After retrieve_sample_data")
+        #print("After retrieve_sample_data")
         return trait_data
     
     def get_trait_data(self):
@@ -763,7 +763,7 @@ class MrnaAssayDataSet(DataSet):
             """ % (escape(str(this_trait.dataset.id)),
                    escape(this_trait.name)))
 
-            print("query is:", pf(query))
+            #print("query is:", pf(query))
 
             result = g.db.execute(query).fetchone()
             
@@ -926,7 +926,7 @@ class TempDataSet(DataSet):
 
 def geno_mrna_confidentiality(ob):
     dataset_table = ob.type + "Freeze"
-    print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+    #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
 
     query = '''SELECT Id, Name, FullName, confidentiality,
                         AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)
diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py
index 53f41779..f333d5a7 100755
--- a/wqflask/base/trait.py
+++ b/wqflask/base/trait.py
@@ -1,6 +1,8 @@
 from __future__ import absolute_import, division, print_function
 
 import string
+import resource
+
 
 from htmlgen import HTMLgen2 as HT
 
@@ -15,6 +17,10 @@ from pprint import pformat as pf
 
 from flask import Flask, g
 
+def print_mem(stage=""):
+    usage = resource.getrusage(resource.RUSAGE_SELF)  # rusage snapshot for the current process
+    print("{}: {}".format(stage, usage.ru_maxrss/1024))  # label + peak RSS / 1024 (true division; ru_maxrss units vary by platform)
+
 class GeneralTrait(object):
     """
     Trait class defines a trait in webqtl, can be either Microarray,
@@ -23,8 +29,12 @@ class GeneralTrait(object):
     """
 
     def __init__(self, **kw):
-        #print("in GeneralTrait")
-        self.dataset = kw.get('dataset')           # database name
+        # xor assertion
+        assert bool(kw.get('dataset')) != bool(kw.get('dataset_name')), "Needs dataset ob. xor name";
+        if kw.get('dataset_name'):
+            self.dataset = create_dataset(kw.get('dataset_name'))
+        else:
+            self.dataset = kw.get('dataset')
         self.name = kw.get('name')                 # Trait ID, ProbeSet ID, Published ID, etc.
         self.cellid = kw.get('cellid')
         self.identification = kw.get('identification', 'un-named trait')
@@ -39,8 +49,6 @@ class GeneralTrait(object):
                 # self.cellid is set to None above
             elif len(name2) == 3:
                 self.dataset, self.name, self.cellid = name2
-
-        self.dataset = create_dataset(self.dataset)
         
         # Todo: These two lines are necessary most of the time, but perhaps not all of the time
         # So we could add a simple if statement to short-circuit this if necessary
@@ -355,8 +363,17 @@ class GeneralTrait(object):
                     #traitQTL = self.cursor.fetchone()
                     if traitQTL:
                         self.locus, self.lrs, self.pvalue, self.mean = traitQTL
+                        if self.locus:
+                            result = g.db.execute("""
+                                select Geno.Chr, Geno.Mb from Geno, Species
+                                where Species.Name = %s and
+                                Geno.Name = %s and
+                                Geno.SpeciesId = Species.Id
+                                """, (species, self.locus)).fetchone()
+                            # Placeholders stay unquoted (bound values are quoted by the driver); a missing Geno row yields blanks
+                            self.locus_chr, self.locus_mb = (result[0], result[1]) if result else ("", "")
                     else:
-                        self.locus = self.lrs = self.pvalue = self.mean = ""
+                        self.locus = self.locus_chr = self.locus_mb = self.lrs = self.pvalue = self.mean = ""
                 if self.dataset.type == 'Publish':
                     traitQTL = g.db.execute("""
                             SELECT
diff --git a/wqflask/utility/helper_functions.py b/wqflask/utility/helper_functions.py
index 28242c27..d76a32ce 100644
--- a/wqflask/utility/helper_functions.py
+++ b/wqflask/utility/helper_functions.py
@@ -9,7 +9,7 @@ def get_species_dataset_trait(self, start_vars):
     #assert type(read_genotype) == type(bool()), "Expecting boolean value for read_genotype"
     self.dataset = data_set.create_dataset(start_vars['dataset'])
     self.species = TheSpecies(dataset=self.dataset)
-    self.this_trait = GeneralTrait(dataset=self.dataset.name,
+    self.this_trait = GeneralTrait(dataset=self.dataset,
                                    name=start_vars['trait_id'],
                                    cellid=None)
 
diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py
index 96c0155b..3b8b7ba2 100644
--- a/wqflask/wqflask/correlation/show_corr_results.py
+++ b/wqflask/wqflask/correlation/show_corr_results.py
@@ -92,11 +92,6 @@ class CorrelationResults(object):
     #
     #RANK_ORDERS = {"1": 0, "2": 1, "3": 0, "4": 0, "5": 1}
 
-
-    #def error(self, message, *args, **kw):
-    #    heading = heading or self.PAGE_HEADING
-    #    return templatePage.error(heading = heading, detail = [message], error=error)
-
     def __init__(self, start_vars):
         # get trait list from db (database name)
         # calculate correlation with Base vector and targets
@@ -104,10 +99,8 @@ class CorrelationResults(object):
         #self.this_trait = GeneralTrait(dataset=self.dataset.name,
         #                               name=start_vars['trait_id'],
         #                               cellid=None)                
-        
         #print("start_vars: ", pf(start_vars))
         with Bench("Doing correlations"):
-            print_mem("At beginning")
             helper_functions.get_species_dataset_trait(self, start_vars)
             self.dataset.group.read_genotype_file()
     
@@ -138,7 +131,6 @@ class CorrelationResults(object):
 
 
             self.correlation_data = {}
-            print_mem("Before calculating correlations")
             for trait, values in self.target_dataset.trait_data.iteritems():
                 this_trait_values = []
                 target_values = []
@@ -150,63 +142,60 @@ class CorrelationResults(object):
                         target_values.append(target_sample_value)
 
                 this_trait_values, target_values = normalize_values(this_trait_values, target_values)
-                
+
                 if self.corr_method == 'pearson':
                     sample_r, sample_p = scipy.stats.pearsonr(this_trait_values, target_values)
                 else:
                     sample_r, sample_p = scipy.stats.spearmanr(this_trait_values, target_values)
-                    
+
                 self.correlation_data[trait] = [sample_r, sample_p]
-                
-            print_mem("After calculating correlations")
-            
+
             self.correlation_data = collections.OrderedDict(sorted(self.correlation_data.items(),
                                                                    key=lambda t: -abs(t[1][0])))
-            
+
             self.correlation_data_slice = collections.OrderedDict()
-            
-            old_memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
-            
+
             for trait_counter, trait in enumerate(self.correlation_data.keys()[:300]):
-                print_mem("In trait info loop")
-                print("\nTrait #:", trait_counter)
-                print_mem("Before trait_object")
-                trait_object = GeneralTrait(dataset=self.dataset.name, name=trait)
-                print_mem("After trait object")
-                trait_info = dict(
-                    correlation = float(self.correlation_data[trait][0]),
-                    p_value = float(self.correlation_data[trait][1]),
-                    symbol = trait_object.symbol,
-                    alias = trait_object.alias,
-                    description = trait_object.description,
-                    chromosome = trait_object.chr,
-                    mb = trait_object.mb
-                )
-                print_mem("Before deleting trait object")
-                del trait_object
-                print_mem("After deleting trait object")
-                gc.collect()
-                print_mem("After colleting garabage")
-                print("** trait_info:", pf(trait_info))
-                print("\n** Start trait_info")
-                counter = 1
-                for key, value in trait_info.iteritems():
-                    print("   <{}> [{}] {}: [{}] {}\n".format(
-                        counter, type(key), key, type(value), value))
-                    counter += 1
-                print("** Done trait_info")
+                trait_object = GeneralTrait(dataset=self.dataset, name=trait)
+                if self.dataset.type == 'ProbeSet':
+                    trait_info = collections.OrderedDict([  # list of pairs: keyword arguments would lose their order on Python 2
+                        ('correlation', float(self.correlation_data[trait][0])),
+                        ('p_value', float(self.correlation_data[trait][1])),
+                        ('symbol', trait_object.symbol),
+                        ('alias', trait_object.alias),
+                        ('description', trait_object.description),
+                        ('chromosome', trait_object.chr),
+                        ('mb', trait_object.mb)
+                    ])
+                    if hasattr(trait_object, 'mean'):  # optional fields: copied only when the trait object has them
+                        trait_info['mean'] = trait_object.mean
+                    if hasattr(trait_object, 'lrs'):
+                        trait_info['lrs'] = trait_object.lrs
+                    if hasattr(trait_object, 'locus_chr'):
+                        trait_info['locus_chr'] = trait_object.locus_chr
+                    if hasattr(trait_object, 'locus_mb'):
+                        trait_info['locus_mb'] = trait_object.locus_mb
+                elif self.dataset.type == 'Geno':
+                    trait_info = collections.OrderedDict([  # same base columns as the ProbeSet branch
+                        ('correlation', float(self.correlation_data[trait][0])),
+                        ('p_value', float(self.correlation_data[trait][1])),
+                        ('symbol', trait_object.symbol),
+                        ('alias', trait_object.alias),
+                        ('description', trait_object.description),
+                        ('chromosome', trait_object.chr),
+                        ('mb', trait_object.mb)
+                    ])
+                else: # 'Publish'
+                    trait_info = collections.OrderedDict([  # same base columns as the ProbeSet branch
+                        ('correlation', float(self.correlation_data[trait][0])),
+                        ('p_value', float(self.correlation_data[trait][1])),
+                        ('symbol', trait_object.symbol),
+                        ('alias', trait_object.alias),
+                        ('description', trait_object.description),
+                        ('chromosome', trait_object.chr),
+                        ('mb', trait_object.mb)
+                    ])
                 self.correlation_data_slice[trait] = trait_info
-                #self.correlation_data_slice[trait].append(trait_object)
-                
-                new_memory_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
-                print("Memory difference:", new_memory_usage-old_memory_usage)
-                old_memory_usage = new_memory_usage
-                print_mem("End of purple loop")
-                print("*************************** End purple ******** ")
-               
-            print_mem("After getting trait info")
-            print("Garbage colleting...")
-            gc.collect()
 
         #XZ, 09/18/2008: get all information about the user selected database.
         #target_db_name = fd.corr_dataset
diff --git a/wqflask/wqflask/search_results.py b/wqflask/wqflask/search_results.py
index dc872a8b..e171f1ab 100644
--- a/wqflask/wqflask/search_results.py
+++ b/wqflask/wqflask/search_results.py
@@ -106,7 +106,7 @@ class SearchResultPage(object):
 
             print("foo locals are:", locals())
             trait_id = result[0]
-            this_trait = GeneralTrait(dataset=self.dataset.name, name=trait_id)
+            this_trait = GeneralTrait(dataset=self.dataset, name=trait_id)
             this_trait.retrieve_info(QTL=True)
             self.trait_list.append(this_trait)
 
-- 
cgit v1.2.3