2 files changed, 152 insertions, 112 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 9b0a3dcc..07fe9cd9 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -38,6 +38,7 @@ from base import species
 from dbFunction import webqtlDatabaseFunction
 from utility import webqtlUtil
 from utility.benchmark import Bench
+from wqflask.my_pylmm.pyLMM import chunks
 
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
@@ -46,7 +47,7 @@ from pprint import pformat as pf
 DS_NAME_MAP = {}
 
 def create_dataset(dataset_name):
-    print("dataset_name:", dataset_name)
+    #print("dataset_name:", dataset_name)
 
     query = """
         SELECT DBType.Name
@@ -68,10 +69,17 @@ def create_dataset(dataset_name):
     dataset_class = globals()[dataset_ob]
     return dataset_class(dataset_name)
 
+def create_in_clause(items):
+    """Create an in clause for mysql"""
+    in_clause = ', '.join("'{}'".format(x) for x in mescape(*items))
+    in_clause = '( {} )'.format(in_clause)
+    return in_clause
+
+
 def mescape(*items):
     """Multiple escape"""
-    escaped = [escape(item) for item in items]
-    print("escaped is:", escaped)
+    escaped = [escape(str(item)) for item in items]
+    #print("escaped is:", escaped)
     return escaped
 
 
@@ -82,8 +90,8 @@ class Markers(object):
         self.markers = json.load(json_data_fh)
     
     def add_pvalues(self, p_values):
-        print("length of self.markers:", len(self.markers))
-        print("length of p_values:", len(p_values))
+        #print("length of self.markers:", len(self.markers))
+        #print("length of p_values:", len(p_values))
         
         # THIS IS only needed for the case when we are limiting the number of p-values calculated
         if len(self.markers) < len(p_values):
@@ -153,7 +161,7 @@ class DatasetGroup(object):
         self.f1list = None
         self.parlist = None        
         self.get_f1_parent_strains()
-        print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+        #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
         
         self.species = webqtlDatabaseFunction.retrieve_species(self.name)
         
@@ -162,7 +170,7 @@ class DatasetGroup(object):
         
         
     def get_markers(self):
-        print("self.species is:", self.species)
+        #print("self.species is:", self.species)
         if self.species == "human":
             marker_class = HumanMarkers 
         else:
@@ -235,6 +243,7 @@ class DataSet(object):
         self.retrieve_other_names()
         
         self.group = DatasetGroup(self)   # sets self.group and self.group_id and gets genotype
+        self.group.read_genotype_file()
         self.species = species.TheSpecies(self)
 
 
@@ -284,14 +293,14 @@ class DataSet(object):
             self.name,
             self.name,
             self.name))
-        print("query_args are:", query_args)
+        #print("query_args are:", query_args)
 
-        print("""
-                SELECT Id, Name, FullName, ShortName
-                FROM %s
-                WHERE public > %s AND
-                     (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
-          """ % (query_args))
+        #print("""
+        #        SELECT Id, Name, FullName, ShortName
+        #        FROM %s
+        #        WHERE public > %s AND
+        #             (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
+        #  """ % (query_args))
 
         self.id, self.name, self.fullname, self.shortname = g.db.execute("""
                 SELECT Id, Name, FullName, ShortName
@@ -615,34 +624,33 @@ class MrnaAssayDataSet(DataSet):
             and ProbeSetFreezeId = {}
             """.format(escape(str(self.id)))
         results = g.db.execute(query).fetchall()
-        print("After get_trait_list query")
+        #print("After get_trait_list query")
         trait_data = {}
         for trait in results:
             print("Retrieving sample_data for ", trait[0])
             trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
-        print("After retrieve_sample_data")
+        #print("After retrieve_sample_data")
         return trait_data
     
     def get_trait_data(self):
-        sample_ids = []
-        for sample in self.group.samplelist:
-            query = """
-                SELECT Strain.Id FROM Strain, Species
-                WHERE Strain.Name = '{}'
-                and Strain.SpeciesId=Species.Id
-                and Species.name = '{}'
-                """.format(*mescape(sample, self.group.species))
-            this_id = g.db.execute(query).fetchone()[0]
-            sample_ids.append('%d' % this_id)
-        print("sample_ids size: ", len(sample_ids))
+        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
 
         # MySQL limits the number of tables that can be used in a join to 61,
         # so we break the sample ids into smaller chunks
-        chunk_count = 50
-        n = len(sample_ids) / chunk_count
-        if len(sample_ids) % chunk_count:
-            n += 1
-        print("n: ", n)
+        # Postgres doesn't have that limit, so we can get rid of this after we transition
+        chunk_size = 50
+        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+        trait_sample_data = []
+        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
         #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
         #tempTable = None
         #if GeneId and db.type == "ProbeSet": 
@@ -656,24 +664,21 @@ class MrnaAssayDataSet(DataSet):
         #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
         #                                        method=method,
         #                                        returnNumber=returnNumber)
-        trait_sample_data = []
-        for step in range(int(n)):
-            temp = []
-            sample_ids_step = sample_ids[step*chunk_count:min(len(sample_ids), (step+1)*chunk_count)]
-            for item in sample_ids_step:
-                temp.append('T%s.value' % item)
+        
+            temp = ['T%s.value' % item for item in sample_ids_step]
             query = "SELECT {}.Name,".format(escape(self.type))
             data_start_pos = 1
             query += string.join(temp, ', ')
             query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
                                                                      self.type,
                                                                      self.type))
-            #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item)
+
             for item in sample_ids_step:
                 query += """
                         left join {}Data as T{} on T{}.Id = {}XRef.DataId
                         and T{}.StrainId={}\n
                         """.format(*mescape(self.type, item, item, self.type, item, item))
+                        
             query += """
                     WHERE {}XRef.{}FreezeId = {}Freeze.Id
                     and {}Freeze.Name = '{}'
@@ -681,23 +686,24 @@ class MrnaAssayDataSet(DataSet):
                     order by {}.Id
                     """.format(*mescape(self.type, self.type, self.type, self.type,
                                self.name, self.type, self.type, self.type, self.type))
-            print("query: ", query)
             results = g.db.execute(query).fetchall()
             trait_sample_data.append(results)
-            
+
         trait_count = len(trait_sample_data[0])
         self.trait_data = collections.defaultdict(list)
+        
         # put all of the separate data together into a dictionary where the keys are
         # trait names and values are lists of sample values
-        for j in range(trait_count):
-            trait_name = trait_sample_data[0][j][0]
-            for i in range(int(n)):
-                self.trait_data[trait_name] += trait_sample_data[i][j][data_start_pos:]
-
+        for trait_counter in range(trait_count):
+            trait_name = trait_sample_data[0][trait_counter][0]
+            for chunk_counter in range(int(number_chunks)):
+                self.trait_data[trait_name] += (
+                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
     
+
     def get_trait_info(self, trait_list=None, species=''):
 
-        #  Note: setting trait_list to [] is probably not a great idea.
+        #  Note: setting trait_list to [] is probably not a great idea. 
         if not trait_list:
             trait_list = []
 
@@ -706,9 +712,7 @@ class MrnaAssayDataSet(DataSet):
             if not this_trait.haveinfo:
                 this_trait.retrieveInfo(QTL=1)
 
-            if this_trait.symbol:
-                pass
-            else:
+            if not this_trait.symbol:
                 this_trait.symbol = "N/A"
 
             #XZ, 12/08/2008: description
@@ -716,62 +720,56 @@ class MrnaAssayDataSet(DataSet):
             description_string = str(this_trait.description).strip()
             target_string = str(this_trait.probe_target_description).strip()
 
-            description_display = ''
-
             if len(description_string) > 1 and description_string != 'None':
                 description_display = description_string
             else:
                 description_display = this_trait.symbol
 
-            if len(description_display) > 1 and description_display != 'N/A' and len(target_string) > 1 and target_string != 'None':
+            if (len(description_display) > 1 and description_display != 'N/A' and
+                    len(target_string) > 1 and target_string != 'None'):
                 description_display = description_display + '; ' + target_string.strip()
 
             # Save it for the jinja2 template
             this_trait.description_display = description_display
-            #print("  xxxxdd [%s]: %s" % (type(this_trait.description_display), description_display))
 
             #XZ: trait_location_value is used for sorting
             trait_location_repr = 'N/A'
             trait_location_value = 1000000
 
             if this_trait.chr and this_trait.mb:
-                print("this_trait.chr is:", this_trait.chr)
-                print("this_trait.mb is:", this_trait.mb)
-                try:
-                    trait_location_value = float(this_trait.chr)*1000 + float(this_trait.mb)
-                except:
-                    if this_trait.chr.upper() == 'X':
-                        trait_location_value = 20*1000 + this_trait.mb
-                    else:
-                        trait_location_value = ord(str(this_trait.chr).upper()[0])*1000 + this_trait.mb
-
-                this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr, float(this_trait.mb) )
+                #Checks if the chromosome number can be cast to an int (i.e. isn't "X" or "Y")
+                #This is so we can convert the location to a number used for sorting
+                trait_location_value = self.convert_location_to_value(this_trait.chr, this_trait.mb)
+                #try:
+                #    trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
+                #except ValueError:
+                #    if this_trait.chr.upper() == 'X':
+                #        trait_location_value = 20*1000 + this_trait.mb
+                #    else:
+                #        trait_location_value = (ord(str(this_trait.chr).upper()[0])*1000 +
+                #                               this_trait.mb)
+
+                #ZS: Put this in function currently called "convert_location_to_value"
+                this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr,
+                                                                float(this_trait.mb))
                 this_trait.location_value = trait_location_value
-                #this_trait.trait_location_value = trait_location_value
 
-            #XZ, 01/12/08: This SQL query is much faster.
+            #Get mean expression value
             query = (
-"""select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
-    where ProbeSetXRef.ProbeSetFreezeId = %s and
-    ProbeSet.Id = ProbeSetXRef.ProbeSetId and
-    ProbeSet.Name = '%s'
+            """select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
+                where ProbeSetXRef.ProbeSetFreezeId = %s and
+                ProbeSet.Id = ProbeSetXRef.ProbeSetId and
+                ProbeSet.Name = '%s'
             """ % (escape(str(this_trait.dataset.id)),
                    escape(this_trait.name)))
 
-            print("query is:", pf(query))
+            #print("query is:", pf(query))
 
             result = g.db.execute(query).fetchone()
+            
+            mean = result[0] if result else 0
 
-            if result:
-                if result[0]:
-                    mean = result[0]
-                else:
-                    mean=0
-            else:
-                mean = 0
-
-            #XZ, 06/05/2009: It is neccessary to turn on nowrap
-            this_trait.mean = repr = "%2.3f" % mean
+            this_trait.mean = "%2.3f" % mean
 
             #LRS and its location
             this_trait.LRS_score_repr = 'N/A'
@@ -790,23 +788,39 @@ class MrnaAssayDataSet(DataSet):
                 result = self.cursor.fetchone()
 
                 if result:
-                    if result[0] and result[1]:
-                        LRS_Chr = result[0]
-                        LRS_Mb = result[1]
-
-                        #XZ: LRS_location_value is used for sorting
-                        try:
-                            LRS_location_value = int(LRS_Chr)*1000 + float(LRS_Mb)
-                        except:
-                            if LRS_Chr.upper() == 'X':
-                                LRS_location_value = 20*1000 + float(LRS_Mb)
-                            else:
-                                LRS_location_value = ord(str(LRS_chr).upper()[0])*1000 + float(LRS_Mb)
+                    #if result[0] and result[1]:
+                    #    lrs_chr = result[0]
+                    #    lrs_mb = result[1]
+                    lrs_chr, lrs_mb = result
+                    #XZ: LRS_location_value is used for sorting
+                    lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb)
+                    
+                    #try:
+                    #    lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb)
+                    #except:
+                    #    if lrs_chr.upper() == 'X':
+                    #        lrs_location_value = 20*1000 + float(lrs_mb)
+                    #    else:
+                    #        lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 +
+                    #                              float(lrs_mb))
+
+                    this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs
+                    this_trait.LRS_score_value = this_trait.lrs
+                    this_trait.LRS_location_repr = 'Chr %s: %.4f Mb' % (lrs_chr, float(lrs_mb))
+      
+
+    def convert_location_to_value(self, chromosome, mb):
+        try:
+            location_value = int(chromosome)*1000 + float(mb)
+        except ValueError:
+            if chromosome.upper() == 'X':
+                location_value = 20*1000 + float(mb)
+            else:
+                location_value = (ord(str(chromosome).upper()[0])*1000 +
+                                  float(mb))
+        
+        return location_value
 
-                        this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs
-                        this_trait.LRS_score_value = LRS_score_value = this_trait.lrs
-                        this_trait.LRS_location_repr = LRS_location_repr = 'Chr %s: %.4f Mb' % (LRS_Chr, float(LRS_Mb) )
-                        
     def get_sequence(self):
         query = """
                     SELECT
@@ -912,7 +926,7 @@ class TempDataSet(DataSet):
 
 def geno_mrna_confidentiality(ob):
     dataset_table = ob.type + "Freeze"
-    print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+    #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
 
     query = '''SELECT Id, Name, FullName, confidentiality,
                         AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)
diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py
index dde8b8d8..38b3a625 100755
--- a/wqflask/base/trait.py
+++ b/wqflask/base/trait.py
@@ -1,6 +1,8 @@
 from __future__ import absolute_import, division, print_function
 
 import string
+import resource
+
 
 from htmlgen import HTMLgen2 as HT
 
@@ -15,22 +17,38 @@ from pprint import pformat as pf
 
 from flask import Flask, g
 
-class GeneralTrait:
+def print_mem(stage=""):
+    mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    print("{}: {}".format(stage, mem/1024))
+
+class GeneralTrait(object):
     """
     Trait class defines a trait in webqtl, can be either Microarray,
     Published phenotype, genotype, or user input trait
 
     """
 
-    def __init__(self, **kw):
-        print("in GeneralTrait")
-        self.dataset = kw.get('dataset')           # database name
+    def __init__(self, get_qtl_info=False, **kw):
+        # xor assertion
+        assert bool(kw.get('dataset')) != bool(kw.get('dataset_name')), "Needs dataset ob. xor name";
+        if kw.get('dataset_name'):
+            self.dataset = create_dataset(kw.get('dataset_name'))
+        else:
+            self.dataset = kw.get('dataset')
         self.name = kw.get('name')                 # Trait ID, ProbeSet ID, Published ID, etc.
         self.cellid = kw.get('cellid')
         self.identification = kw.get('identification', 'un-named trait')
         self.haveinfo = kw.get('haveinfo', False)
         self.sequence = kw.get('sequence')         # Blat sequence, available for ProbeSet
         self.data = kw.get('data', {})
+        
+        # Sets defaultst
+        self.locus = None
+        self.lrs = None
+        self.pvalue = None
+        self.mean = None
+        self.num_overlap = None
+        
 
         if kw.get('fullname'):
             name2 = value.split("::")
@@ -39,13 +57,12 @@ class GeneralTrait:
                 # self.cellid is set to None above
             elif len(name2) == 3:
                 self.dataset, self.name, self.cellid = name2
-
-        self.dataset = create_dataset(self.dataset)
         
         # Todo: These two lines are necessary most of the time, but perhaps not all of the time
         # So we could add a simple if statement to short-circuit this if necessary
-        self.retrieve_info()
+        self.retrieve_info(get_qtl_info=get_qtl_info)
         self.retrieve_sample_data()
+        
 
 
     def get_name(self):
@@ -78,7 +95,7 @@ class GeneralTrait:
                 #desc = self.handle_pca(desc)
                 stringy = desc
         return stringy
-    
+
 
 
     def display_name(self):
@@ -229,7 +246,7 @@ class GeneralTrait:
     #def items(self):
     #    return self.__dict__.items()
 
-    def retrieve_info(self, QTL=False):
+    def retrieve_info(self, get_qtl_info=False):
         assert self.dataset, "Dataset doesn't exist"
         if self.dataset.type == 'Publish':
             query = """
@@ -269,7 +286,7 @@ class GeneralTrait:
                            escape(self.dataset.name),
                            escape(self.name))
             traitInfo = g.db.execute(query).fetchone()
-            print("traitInfo is: ", pf(traitInfo))
+            #print("traitInfo is: ", pf(traitInfo))
         #XZ, 05/08/2009: We also should use Geno.Id to find marker instead of just using Geno.Name
         # to avoid the problem of same marker name from different species.
         elif self.dataset.type == 'Geno':
@@ -287,7 +304,7 @@ class GeneralTrait:
                            escape(self.dataset.name),
                            escape(self.name))
             traitInfo = g.db.execute(query).fetchone()
-            print("traitInfo is: ", pf(traitInfo))
+            #print("traitInfo is: ", pf(traitInfo))
         else: #Temp type
             query = """SELECT %s FROM %s WHERE Name = %s
                                      """ % (string.join(self.dataset.display_fields,','),
@@ -339,7 +356,7 @@ class GeneralTrait:
                 if result:
                     self.homologeneid = result[0]
 
-            if QTL:
+            if get_qtl_info:
                 if self.dataset.type == 'ProbeSet' and not self.cellid:
                     traitQTL = g.db.execute("""
                             SELECT
@@ -355,8 +372,17 @@ class GeneralTrait:
                     #traitQTL = self.cursor.fetchone()
                     if traitQTL:
                         self.locus, self.lrs, self.pvalue, self.mean = traitQTL
+                        if self.locus:
+                            result = g.db.execute("""
+                                select Geno.Chr, Geno.Mb from Geno, Species
+                                where Species.Name = '%s' and
+                                Geno.Name = '%s' and
+                                Geno.SpeciesId = Species.Id
+                                """, (species, self.locus)).fetchone()
+                            self.locus_chr = result[0]
+                            self.locus_mb = result[1]
                     else:
-                        self.locus = self.lrs = self.pvalue = self.mean = ""
+                        self.locus = self.locus_chr = self.locus_mb = self.lrs = self.pvalue = self.mean = ""
                 if self.dataset.type == 'Publish':
                     traitQTL = g.db.execute("""
                             SELECT