6 files changed, 454 insertions, 179 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 50ef8f57..07fe9cd9 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -22,10 +22,14 @@
 
 from __future__ import absolute_import, print_function, division
 import os
+import math
+import string
+import collections
 
-from flask import Flask, g
+import json
+import itertools
 
-from htmlgen import HTMLgen2 as HT
+from flask import Flask, g
 
 import reaper
 
@@ -33,6 +37,8 @@ from base import webqtlConfig
 from base import species
 from dbFunction import webqtlDatabaseFunction
 from utility import webqtlUtil
+from utility.benchmark import Bench
+from wqflask.my_pylmm.pyLMM import chunks
 
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
@@ -41,29 +47,102 @@ from pprint import pformat as pf
 DS_NAME_MAP = {}
 
 def create_dataset(dataset_name):
-    #cursor = db_conn.cursor()
-    print("dataset_name:", dataset_name)
+    #print("dataset_name:", dataset_name)
 
     query = """
         SELECT DBType.Name
         FROM DBList, DBType
-        WHERE DBList.Name = '%s' and
+        WHERE DBList.Name = '{}' and
               DBType.Id = DBList.DBTypeId
-        """ % (escape(dataset_name))
-    print("query is: ", pf(query))
+        """.format(escape(dataset_name))
+    #print("query is: ", pf(query))
     dataset_type = g.db.execute(query).fetchone().Name
 
     #dataset_type = cursor.fetchone()[0]
-    print("[blubber] dataset_type:", pf(dataset_type))
+    #print("[blubber] dataset_type:", pf(dataset_type))
 
     dataset_ob = DS_NAME_MAP[dataset_type]
     #dataset_class = getattr(data_set, dataset_ob)
-    print("dataset_ob:", dataset_ob)
-    print("DS_NAME_MAP:", pf(DS_NAME_MAP))
+    #print("dataset_ob:", dataset_ob)
+    #print("DS_NAME_MAP:", pf(DS_NAME_MAP))
 
     dataset_class = globals()[dataset_ob]
     return dataset_class(dataset_name)
 
+def create_in_clause(items):
+    """Create an in clause for mysql"""
+    in_clause = ', '.join("'{}'".format(x) for x in mescape(*items))
+    in_clause = '( {} )'.format(in_clause)
+    return in_clause
+
+
+def mescape(*items):
+    """Escape each item (converted to str) and return the escaped values as a list"""
+    escaped = [escape(str(item)) for item in items]
+    #print("escaped is:", escaped)
+    return escaped
+
+
+class Markers(object):
+    """Todo: Build in caching so it saves us reading the same file more than once"""
+    def __init__(self, name):
+        json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json'))
+        self.markers = json.load(json_data_fh)
+    
+    def add_pvalues(self, p_values):
+        #print("length of self.markers:", len(self.markers))
+        #print("length of p_values:", len(p_values))
+        
+        # This is only needed for the case when we are limiting the number of p-values calculated
+        if len(self.markers) < len(p_values):
+            self.markers = self.markers[:len(p_values)]
+        
+        for marker, p_value in itertools.izip(self.markers, p_values):
+            marker['p_value'] = p_value
+            print("p_value is:", marker['p_value'])
+            marker['lod_score'] = -math.log10(marker['p_value'])
+            #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+            marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+        
+        
+
+
+class HumanMarkers(Markers):
+    
+    def __init__(self, name):
+        marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim'))
+        self.markers = []
+        for line in marker_data_fh:
+            splat = line.strip().split()
+            marker = {}
+            marker['chr'] = int(splat[0])
+            marker['name'] = splat[1]
+            marker['Mb'] = float(splat[3]) / 1000000
+            self.markers.append(marker)
+            
+        #print("markers is: ", pf(self.markers))
+
+
+    def add_pvalues(self, p_values):
+        #for marker, p_value in itertools.izip(self.markers, p_values):
+        #    if marker['Mb'] <= 0 and marker['chr'] == 0:
+        #        continue
+        #    marker['p_value'] = p_value
+        #    print("p_value is:", marker['p_value'])
+        #    marker['lod_score'] = -math.log10(marker['p_value'])
+        #    #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+        #    marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+        
+        super(HumanMarkers, self).add_pvalues(p_values)
+        
+        with Bench("deleting markers"):
+            markers = []
+            for marker in self.markers:
+                if not marker['Mb'] <= 0 and not marker['chr'] == 0:
+                    markers.append(marker)
+            self.markers = markers
+        
+    
 
 class DatasetGroup(object):
     """
@@ -79,22 +158,41 @@ class DatasetGroup(object):
         if self.name == 'BXD300':
             self.name = "BXD"
         
+        self.f1list = None
+        self.parlist = None        
+        self.get_f1_parent_strains()
+        #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+        
         self.species = webqtlDatabaseFunction.retrieve_species(self.name)
         
         self.incparentsf1 = False
-        self.f1list = None
-        self.parlist = None
         self.allsamples = None
+        
+        
+    def get_markers(self):
+        #print("self.species is:", self.species)
+        if self.species == "human":
+            marker_class = HumanMarkers 
+        else:
+            marker_class = Markers
 
+        self.markers = marker_class(self.name)
+        
 
-    #def read_genotype(self):
-    #    self.read_genotype_file()
-    #
-    #    if not self.genotype:   # Didn'd succeed, so we try method 2
-    #        self.read_genotype_data()
+    def get_f1_parent_strains(self):
+        try:
+            # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
+            f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
+        except KeyError:
+            f1 = f12 = maternal = paternal = None
+            
+        if f1 and f12:
+            self.f1list = [f1, f12]
+        if maternal and paternal:
+            self.parlist = [maternal, paternal]
             
     def read_genotype_file(self):
-        '''read genotype from .geno file instead of database'''
+        '''Read genotype from .geno file instead of database'''
         #if self.group == 'BXD300':
         #    self.group = 'BXD'
         #
@@ -104,38 +202,24 @@ class DatasetGroup(object):
         #genotype_2 is Dataset Object with parents and f1 (not for intercross)
 
         genotype_1 = reaper.Dataset()
-        
+
         # reaper barfs on unicode filenames, so here we ensure it's a string
         full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno'))
         genotype_1.read(full_filename)
 
-        print("Got to after read")
-
-        try:
-            # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
-            f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
-        except KeyError:
-            f1 = f12 = maternal = paternal = None
-
-
-        if genotype_1.type == "group" and maternal and paternal:
-            genotype_2 = genotype_1.add(Mat=maternal, Pat=paternal)       #, F1=_f1)
+        if genotype_1.type == "group" and self.parlist:
+            genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1])       #, F1=_f1)
         else:
             genotype_2 = genotype_1
 
         #determine default genotype object
         if self.incparentsf1 and genotype_1.type != "intercross":
-            self.genotype = genotype_2
+            genotype = genotype_2
         else:
             self.incparentsf1 = 0
-            self.genotype = genotype_1
-
-        self.samplelist = list(self.genotype.prgy)
+            genotype = genotype_1
 
-        if f1 and f12:
-            self.f1list = [f1, f12]
-        if maternal and paternal:
-            self.parlist = [maternal, paternal]
+        self.samplelist = list(genotype.prgy)
 
 
 class DataSet(object):
@@ -159,10 +243,10 @@ class DataSet(object):
         self.retrieve_other_names()
         
         self.group = DatasetGroup(self)   # sets self.group and self.group_id and gets genotype
+        self.group.read_genotype_file()
         self.species = species.TheSpecies(self)
-    
-       
-        
+
+
     def get_desc(self):
         """Gets overridden later, at least for Temp...used by trait's get_given_name"""
         return None
@@ -209,14 +293,14 @@ class DataSet(object):
             self.name,
             self.name,
             self.name))
-        print("query_args are:", query_args)
+        #print("query_args are:", query_args)
 
-        print("""
-                SELECT Id, Name, FullName, ShortName
-                FROM %s
-                WHERE public > %s AND
-                     (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
-          """ % (query_args))
+        #print("""
+        #        SELECT Id, Name, FullName, ShortName
+        #        FROM %s
+        #        WHERE public > %s AND
+        #             (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
+        #  """ % (query_args))
 
         self.id, self.name, self.fullname, self.shortname = g.db.execute("""
                 SELECT Id, Name, FullName, ShortName
@@ -227,11 +311,7 @@ class DataSet(object):
 
         #self.cursor.execute(query)
         #self.id, self.name, self.fullname, self.shortname = self.cursor.fetchone()
-
-
-    #def genHTML(self, Class='c0dd'):
-    #    return  HT.Href(text = HT.Span('%s Database' % self.fullname, Class= "fwb " + Class),
-    #            url= webqtlConfig.INFOPAGEHREF % self.name,target="_blank")
+        
 
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -291,6 +371,19 @@ class PhenotypeDataSet(DataSet):
         # (Urgently?) Need to write this
         pass
 
+    def get_trait_list(self):
+        query = """
+            select PublishXRef.Id
+            from PublishXRef, PublishFreeze
+            where PublishFreeze.InbredSetId=PublishXRef.InbredSetId
+            and PublishFreeze.Id = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        trait_data = {}
+        for trait in results:
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        return trait_data
+
     def get_trait_info(self, trait_list, species = ''):
         for this_trait in trait_list:
             if not this_trait.haveinfo:
@@ -301,7 +394,7 @@ class PhenotypeDataSet(DataSet):
                 continue   # for now
                 if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
                     description = this_trait.pre_publication_description
-            this_trait.description_display = description
+            this_trait.description_display = unicode(description, "utf8")
 
             if not this_trait.year.isdigit():
                 this_trait.pubmed_text = "N/A"
@@ -359,7 +452,7 @@ class PhenotypeDataSet(DataSet):
                             PublishFreeze.Id = %d AND PublishData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (trait.name, self.id)
+                    """ % (trait, self.id)
         results = g.db.execute(query).fetchall()
         return results
 
@@ -399,6 +492,19 @@ class GenotypeDataSet(DataSet):
 
     def check_confidentiality(self):
         return geno_mrna_confidentiality(self)
+    
+    def get_trait_list(self):
+        query = """
+            select Geno.Name
+            from Geno, GenoXRef
+            where GenoXRef.GenoId = Geno.Id
+            and GenoFreezeId = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        trait_data = {}
+        for trait in results:
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        return trait_data
 
     def get_trait_info(self, trait_list, species=None):
         for this_trait in trait_list:
@@ -437,7 +543,7 @@ class GenotypeDataSet(DataSet):
                             GenoData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait.name, self.name)
+                    """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait, self.name)
         results = g.db.execute(query).fetchall()
         return results
 
@@ -509,10 +615,95 @@ class MrnaAssayDataSet(DataSet):
 
     def check_confidentiality(self):
         return geno_mrna_confidentiality(self)
+        
+    def get_trait_list_1(self):
+        query = """
+            select ProbeSet.Name
+            from ProbeSet, ProbeSetXRef
+            where ProbeSetXRef.ProbeSetId = ProbeSet.Id
+            and ProbeSetFreezeId = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        #print("After get_trait_list query")
+        trait_data = {}
+        for trait in results:
+            print("Retrieving sample_data for ", trait[0])
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        #print("After retrieve_sample_data")
+        return trait_data
+    
+    def get_trait_data(self):
+        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
+
+        # MySQL limits the number of tables that can be used in a join to 61,
+        # so we break the sample ids into smaller chunks
+        # Postgres doesn't have that limit, so we can get rid of this after we transition
+        chunk_size = 50
+        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+        trait_sample_data = []
+        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
+        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+        #tempTable = None
+        #if GeneId and db.type == "ProbeSet": 
+        #    if method == "3":
+        #        tempTable = self.getTempLiteratureTable(species=species,
+        #                                                input_species_geneid=GeneId,
+        #                                                returnNumber=returnNumber)
+        #
+        #    if method == "4" or method == "5":
+        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+        #                                        method=method,
+        #                                        returnNumber=returnNumber)
+        
+            temp = ['T%s.value' % item for item in sample_ids_step]
+            query = "SELECT {}.Name,".format(escape(self.type))
+            data_start_pos = 1
+            query += string.join(temp, ', ')
+            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+                                                                     self.type,
+                                                                     self.type))
+
+            for item in sample_ids_step:
+                query += """
+                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                        and T{}.StrainId={}\n
+                        """.format(*mescape(self.type, item, item, self.type, item, item))
+                        
+            query += """
+                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                    and {}Freeze.Name = '{}'
+                    and {}.Id = {}XRef.{}Id
+                    order by {}.Id
+                    """.format(*mescape(self.type, self.type, self.type, self.type,
+                               self.name, self.type, self.type, self.type, self.type))
+            results = g.db.execute(query).fetchall()
+            trait_sample_data.append(results)
+
+        trait_count = len(trait_sample_data[0])
+        self.trait_data = collections.defaultdict(list)
+        
+        # put all of the separate data together into a dictionary where the keys are
+        # trait names and values are lists of sample values
+        for trait_counter in range(trait_count):
+            trait_name = trait_sample_data[0][trait_counter][0]
+            for chunk_counter in range(int(number_chunks)):
+                self.trait_data[trait_name] += (
+                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+    
 
     def get_trait_info(self, trait_list=None, species=''):
 
-        #  Note: setting trait_list to [] is probably not a great idea.
+        #  Note: setting trait_list to [] is probably not a great idea. 
         if not trait_list:
             trait_list = []
 
@@ -521,9 +712,7 @@ class MrnaAssayDataSet(DataSet):
             if not this_trait.haveinfo:
                 this_trait.retrieveInfo(QTL=1)
 
-            if this_trait.symbol:
-                pass
-            else:
+            if not this_trait.symbol:
                 this_trait.symbol = "N/A"
 
             #XZ, 12/08/2008: description
@@ -531,60 +720,56 @@ class MrnaAssayDataSet(DataSet):
             description_string = str(this_trait.description).strip()
             target_string = str(this_trait.probe_target_description).strip()
 
-            description_display = ''
-
             if len(description_string) > 1 and description_string != 'None':
                 description_display = description_string
             else:
                 description_display = this_trait.symbol
 
-            if len(description_display) > 1 and description_display != 'N/A' and len(target_string) > 1 and target_string != 'None':
+            if (len(description_display) > 1 and description_display != 'N/A' and
+                    len(target_string) > 1 and target_string != 'None'):
                 description_display = description_display + '; ' + target_string.strip()
 
             # Save it for the jinja2 template
             this_trait.description_display = description_display
-            #print("  xxxxdd [%s]: %s" % (type(this_trait.description_display), description_display))
 
             #XZ: trait_location_value is used for sorting
             trait_location_repr = 'N/A'
             trait_location_value = 1000000
 
             if this_trait.chr and this_trait.mb:
-                try:
-                    trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
-                except:
-                    if this_trait.chr.upper() == 'X':
-                        trait_location_value = 20*1000 + this_trait.mb
-                    else:
-                        trait_location_value = ord(str(this_trait.chr).upper()[0])*1000 + this_trait.mb
-
-                this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr, float(this_trait.mb) )
+                #Checks if the chromosome number can be cast to an int (i.e. isn't "X" or "Y")
+                #This is so we can convert the location to a number used for sorting
+                trait_location_value = self.convert_location_to_value(this_trait.chr, this_trait.mb)
+                #try:
+                #    trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
+                #except ValueError:
+                #    if this_trait.chr.upper() == 'X':
+                #        trait_location_value = 20*1000 + this_trait.mb
+                #    else:
+                #        trait_location_value = (ord(str(this_trait.chr).upper()[0])*1000 +
+                #                               this_trait.mb)
+
+                #ZS: Put this in function currently called "convert_location_to_value"
+                this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr,
+                                                                float(this_trait.mb))
                 this_trait.location_value = trait_location_value
-                #this_trait.trait_location_value = trait_location_value
 
-            #XZ, 01/12/08: This SQL query is much faster.
+            #Get mean expression value
             query = (
-"""select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
-    where ProbeSetXRef.ProbeSetFreezeId = %s and
-    ProbeSet.Id = ProbeSetXRef.ProbeSetId and
-    ProbeSet.Name = '%s'
+            """select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
+                where ProbeSetXRef.ProbeSetFreezeId = %s and
+                ProbeSet.Id = ProbeSetXRef.ProbeSetId and
+                ProbeSet.Name = '%s'
             """ % (escape(str(this_trait.dataset.id)),
                    escape(this_trait.name)))
 
-            print("query is:", pf(query))
+            #print("query is:", pf(query))
 
             result = g.db.execute(query).fetchone()
+            
+            mean = result[0] if result else 0
 
-            if result:
-                if result[0]:
-                    mean = result[0]
-                else:
-                    mean=0
-            else:
-                mean = 0
-
-            #XZ, 06/05/2009: It is neccessary to turn on nowrap
-            this_trait.mean = repr = "%2.3f" % mean
+            this_trait.mean = "%2.3f" % mean
 
             #LRS and its location
             this_trait.LRS_score_repr = 'N/A'
@@ -603,23 +788,39 @@ class MrnaAssayDataSet(DataSet):
                 result = self.cursor.fetchone()
 
                 if result:
-                    if result[0] and result[1]:
-                        LRS_Chr = result[0]
-                        LRS_Mb = result[1]
-
-                        #XZ: LRS_location_value is used for sorting
-                        try:
-                            LRS_location_value = int(LRS_Chr)*1000 + float(LRS_Mb)
-                        except:
-                            if LRS_Chr.upper() == 'X':
-                                LRS_location_value = 20*1000 + float(LRS_Mb)
-                            else:
-                                LRS_location_value = ord(str(LRS_chr).upper()[0])*1000 + float(LRS_Mb)
+                    #if result[0] and result[1]:
+                    #    lrs_chr = result[0]
+                    #    lrs_mb = result[1]
+                    lrs_chr, lrs_mb = result
+                    #XZ: LRS_location_value is used for sorting
+                    lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb)
+                    
+                    #try:
+                    #    lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb)
+                    #except:
+                    #    if lrs_chr.upper() == 'X':
+                    #        lrs_location_value = 20*1000 + float(lrs_mb)
+                    #    else:
+                    #        lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 +
+                    #                              float(lrs_mb))
+
+                    this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs
+                    this_trait.LRS_score_value = this_trait.lrs
+                    this_trait.LRS_location_repr = 'Chr %s: %.4f Mb' % (lrs_chr, float(lrs_mb))
+      
+
+    def convert_location_to_value(self, chromosome, mb):
+        try:
+            location_value = int(chromosome)*1000 + float(mb)
+        except ValueError:
+            if chromosome.upper() == 'X':
+                location_value = 20*1000 + float(mb)
+            else:
+                location_value = (ord(str(chromosome).upper()[0])*1000 +
+                                  float(mb))
+        
+        return location_value
 
-                        this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs
-                        this_trait.LRS_score_value = LRS_score_value = this_trait.lrs
-                        this_trait.LRS_location_repr = LRS_location_repr = 'Chr %s: %.4f Mb' % (LRS_Chr, float(LRS_Mb) )
-                        
     def get_sequence(self):
         query = """
                     SELECT
@@ -633,9 +834,9 @@ class MrnaAssayDataSet(DataSet):
                             ProbeSetFreeze.Name = %s
                 """ % (escape(self.name), escape(self.dataset.name))
         results = g.db.execute(query).fetchone()
-
         return results[0]
     
+   
     def retrieve_sample_data(self, trait):
         query = """
                     SELECT
@@ -652,7 +853,7 @@ class MrnaAssayDataSet(DataSet):
                             ProbeSetData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (escape(trait.name), escape(self.name))
+                    """ % (escape(trait), escape(self.name))
         results = g.db.execute(query).fetchall()
         return results
 
@@ -725,7 +926,7 @@ class TempDataSet(DataSet):
 
 def geno_mrna_confidentiality(ob):
     dataset_table = ob.type + "Freeze"
-    print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+    #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
 
     query = '''SELECT Id, Name, FullName, confidentiality,
                         AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)
@@ -741,3 +942,4 @@ def geno_mrna_confidentiality(ob):
     if confidential:
         # Allow confidential data later
         NoConfindetialDataForYouTodaySorry
+
diff --git a/wqflask/base/generate_probesetfreeze_file.py b/wqflask/base/generate_probesetfreeze_file.py
new file mode 100644
index 00000000..a0ff804b
--- /dev/null
+++ b/wqflask/base/generate_probesetfreeze_file.py
@@ -0,0 +1,31 @@
+from __future__ import absolute_import, print_function, division
+import os
+import math
+
+import json
+import itertools
+
+from flask import Flask, g
+
+from base import webqtlConfig
+from dbFunction import webqtlDatabaseFunction
+from utility import webqtlUtil
+
+from MySQLdb import escape_string as escape
+from pprint import pformat as pf
+
+
+query = """ select ProbeSet.Name
+            from ProbeSetXRef,
+                 ProbeSetFreeze,
+                 ProbeSet
+            where ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and
+                  ProbeSetFreeze.Name = "EPFLMouseMuscleCDRMA1211" and
+                  ProbeSetXRef.ProbeSetId = ProbeSet.Id;
+        """
+
+markers = g.db.execute(query).fetchall()
+print("markers: ", pf(markers))
+
+if __name__ == '__main__':
+    main()
+\ No newline at end of file
diff --git a/wqflask/base/species.py b/wqflask/base/species.py
index 9d4cac4c..191f4535 100644
--- a/wqflask/base/species.py
+++ b/wqflask/base/species.py
@@ -16,8 +16,7 @@ class TheSpecies(object):
         print("self.dataset is:", pf(self.dataset.__dict__))
         self.chromosomes = Chromosomes(self.dataset)
         self.genome_mb_length = self.chromosomes.get_genome_mb_length()
-        
-        
+
     #@property
     #def chromosomes(self):
     #    chromosomes = [("All", -1)]
@@ -31,7 +30,8 @@ class TheSpecies(object):
     #    return chromosomes
 
 class IndChromosome(object):
-    def __init__(self, length):
+    def __init__(self, name, length):
+        self.name = name
         self.length = length
         
     @property
@@ -50,7 +50,7 @@ class Chromosomes(object):
 
         results = g.db.execute("""
                 Select
-                        Chr_Length.Name, Length from Chr_Length, InbredSet
+                        Chr_Length.Name, Chr_Length.OrderId, Length from Chr_Length, InbredSet
                 where
                         Chr_Length.SpeciesId = InbredSet.SpeciesId AND
                         InbredSet.Name = %s
@@ -59,10 +59,10 @@ class Chromosomes(object):
         print("bike:", results)
 
         for item in results:
-            self.chromosomes[item.Name] = IndChromosome(item.Length)
+            self.chromosomes[item.OrderId] = IndChromosome(item.Name, item.Length)
         
         self.set_mb_graph_interval()
-        self.get_cm_length_list()
+        #self.get_cm_length_list()
 
 
     def set_mb_graph_interval(self):
diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py
index 241bf2ab..db76ddea 100755
--- a/wqflask/base/trait.py
+++ b/wqflask/base/trait.py
@@ -1,6 +1,8 @@
 from __future__ import absolute_import, division, print_function
 
 import string
+import resource
+
 
 from htmlgen import HTMLgen2 as HT
 
@@ -15,22 +17,38 @@ from pprint import pformat as pf
 
 from flask import Flask, g
 
-class GeneralTrait:
+def print_mem(stage=""):
+    mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    print("{}: {}".format(stage, mem/1024))
+
+class GeneralTrait(object):
     """
     Trait class defines a trait in webqtl, can be either Microarray,
     Published phenotype, genotype, or user input trait
 
     """
 
-    def __init__(self, **kw):
-        print("in GeneralTrait")
-        self.dataset = kw.get('dataset')           # database name
+    def __init__(self, get_qtl_info=False, **kw):
+        # xor assertion
+        assert bool(kw.get('dataset')) != bool(kw.get('dataset_name')), "Needs dataset ob. or name";
+        if kw.get('dataset_name'):
+            self.dataset = create_dataset(kw.get('dataset_name'))
+        else:
+            self.dataset = kw.get('dataset')
         self.name = kw.get('name')                 # Trait ID, ProbeSet ID, Published ID, etc.
         self.cellid = kw.get('cellid')
         self.identification = kw.get('identification', 'un-named trait')
         self.haveinfo = kw.get('haveinfo', False)
         self.sequence = kw.get('sequence')         # Blat sequence, available for ProbeSet
         self.data = kw.get('data', {})
+        
+        # Sets defaults
+        self.locus = None
+        self.lrs = None
+        self.pvalue = None
+        self.mean = None
+        self.num_overlap = None
+        
 
         if kw.get('fullname'):
             name2 = value.split("::")
@@ -39,13 +57,12 @@ class GeneralTrait:
                 # self.cellid is set to None above
             elif len(name2) == 3:
                 self.dataset, self.name, self.cellid = name2
-
-        self.dataset = create_dataset(self.dataset)
         
         # Todo: These two lines are necessary most of the time, but perhaps not all of the time
         # So we could add a simple if statement to short-circuit this if necessary
-        self.retrieve_info()
+        self.retrieve_info(get_qtl_info=get_qtl_info)
         self.retrieve_sample_data()
+        
 
 
     def get_name(self):
@@ -78,7 +95,7 @@ class GeneralTrait:
                 #desc = self.handle_pca(desc)
                 stringy = desc
         return stringy
-    
+
 
 
     def display_name(self):
@@ -208,7 +225,7 @@ class GeneralTrait:
         #            ''' % (self.cellid, self.name, self.dataset.name)
         #            
         #else:
-        results = self.dataset.retrieve_sample_data(self)
+        results = self.dataset.retrieve_sample_data(self.name)
 
         # Todo: is this necessary? If not remove
         self.data.clear()
@@ -229,7 +246,7 @@ class GeneralTrait:
     #def items(self):
     #    return self.__dict__.items()
 
-    def retrieve_info(self, QTL=False):
+    def retrieve_info(self, get_qtl_info=False):
         assert self.dataset, "Dataset doesn't exist"
         if self.dataset.type == 'Publish':
             query = """
@@ -251,7 +268,7 @@ class GeneralTrait:
                             PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND
                             PublishFreeze.Id = %s
                     """ % (self.name, self.dataset.id)
-            traitInfo = g.db.execute(query).fetchone()
+            trait_info = g.db.execute(query).fetchone()
         #XZ, 05/08/2009: Xiaodong add this block to use ProbeSet.Id to find the probeset instead of just using ProbeSet.Name
         #XZ, 05/08/2009: to avoid the problem of same probeset name from different platforms.
         elif self.dataset.type == 'ProbeSet':
@@ -268,8 +285,8 @@ class GeneralTrait:
                     """ % (escape(display_fields_string),
                            escape(self.dataset.name),
                            escape(self.name))
-            traitInfo = g.db.execute(query).fetchone()
-            print("traitInfo is: ", pf(traitInfo))
+            trait_info = g.db.execute(query).fetchone()
+            #print("trait_info is: ", pf(trait_info))
         #XZ, 05/08/2009: We also should use Geno.Id to find marker instead of just using Geno.Name
         # to avoid the problem of same marker name from different species.
         elif self.dataset.type == 'Geno':
@@ -286,23 +303,24 @@ class GeneralTrait:
                     """ % (escape(display_fields_string),
                            escape(self.dataset.name),
                            escape(self.name))
-            traitInfo = g.db.execute(query).fetchone()
-            print("traitInfo is: ", pf(traitInfo))
+            trait_info = g.db.execute(query).fetchone()
+            #print("trait_info is: ", pf(trait_info))
         else: #Temp type
             query = """SELECT %s FROM %s WHERE Name = %s
                                      """ % (string.join(self.dataset.display_fields,','),
                                             self.dataset.type, self.name)
-            traitInfo = g.db.execute(query).fetchone()
+            trait_info = g.db.execute(query).fetchone()
 
 
         #self.cursor.execute(query)
-        #traitInfo = self.cursor.fetchone()
-        if traitInfo:
+        #trait_info = self.cursor.fetchone()
+        if trait_info:
             self.haveinfo = True
 
             #XZ: assign SQL query result to trait attributes.
             for i, field in enumerate(self.dataset.display_fields):
-                setattr(self, field, traitInfo[i])
+                print("  mike: {} -> {} - {}".format(field, type(trait_info[i]), trait_info[i]))
+                setattr(self, field, trait_info[i])
 
             if self.dataset.type == 'Publish':
                 self.confidential = 0
@@ -310,55 +328,76 @@ class GeneralTrait:
                     self.confidential = 1
 
             self.homologeneid = None
+            
+            print("self.geneid is:", self.geneid)
+            print("  type:", type(self.geneid))
+            print("self.dataset.group.name is:", self.dataset.group.name)
             if self.dataset.type == 'ProbeSet' and self.dataset.group and self.geneid:
                 #XZ, 05/26/2010: From time to time, this query get error message because some geneid values in database are not number.
                 #XZ: So I have to test if geneid is number before execute the query.
                 #XZ: The geneid values in database should be cleaned up.
-                try:
-                    junk = float(self.geneid)
-                    geneidIsNumber = 1
-                except:
-                    geneidIsNumber = 0
-
-                if geneidIsNumber:
-                    query = """
-                            SELECT
-                                    HomologeneId
-                            FROM
-                                    Homologene, Species, InbredSet
-                            WHERE
-                                    Homologene.GeneId =%s AND
-                                    InbredSet.Name = '%s' AND
-                                    InbredSet.SpeciesId = Species.Id AND
-                                    Species.TaxonomyId = Homologene.TaxonomyId
-                            """ % (escape(str(self.geneid)), escape(self.dataset.group.name))
-                    result = g.db.execute(query).fetchone()
-                else:
-                    result = None
+                #try:
+                #    float(self.geneid)
+                #    geneidIsNumber = True
+                #except ValueError:
+                #    geneidIsNumber = False
+
+                #if geneidIsNumber:
+
+
+                query = """
+                        SELECT
+                                HomologeneId
+                        FROM
+                                Homologene, Species, InbredSet
+                        WHERE
+                                Homologene.GeneId =%s AND
+                                InbredSet.Name = '%s' AND
+                                InbredSet.SpeciesId = Species.Id AND
+                                Species.TaxonomyId = Homologene.TaxonomyId
+                        """ % (escape(str(self.geneid)), escape(self.dataset.group.name))
+                print("-> query is:", query)
+                result = g.db.execute(query).fetchone()
+                #else:
+                #    result = None
 
                 if result:
                     self.homologeneid = result[0]
 
-            if QTL:
+            if get_qtl_info:
                 if self.dataset.type == 'ProbeSet' and not self.cellid:
-                    traitQTL = g.db.execute("""
+                    query = """
                             SELECT
                                     ProbeSetXRef.Locus, ProbeSetXRef.LRS, ProbeSetXRef.pValue, ProbeSetXRef.mean
                             FROM
                                     ProbeSetXRef, ProbeSet
                             WHERE
                                     ProbeSetXRef.ProbeSetId = ProbeSet.Id AND
-                                    ProbeSet.Name = "%s" AND
-                                    ProbeSetXRef.ProbeSetFreezeId =%s
-                            """, (self.name, self.dataset.id)).fetchone()
+                                    ProbeSet.Name = "{}" AND
+                                    ProbeSetXRef.ProbeSetFreezeId ={}
+                            """.format(self.name, self.dataset.id)
+                    trait_qtl = g.db.execute(query).fetchone()
                     #self.cursor.execute(query)
-                    #traitQTL = self.cursor.fetchone()
-                    if traitQTL:
-                        self.locus, self.lrs, self.pvalue, self.mean = traitQTL
+                    #trait_qtl = self.cursor.fetchone()
+                    if trait_qtl:
+                        self.locus, self.lrs, self.pvalue, self.mean = trait_qtl
+                        if self.locus:
+                            query = """
+                                select Geno.Chr, Geno.Mb from Geno, Species
+                                where Species.Name = '{}' and
+                                Geno.Name = '{}' and
+                                Geno.SpeciesId = Species.Id
+                                """.format(self.dataset.group.species, self.locus)
+                            print("query is:", query)
+                            result = g.db.execute(query).fetchone()
+                            self.locus_chr = result[0]
+                            self.locus_mb = result[1]
                     else:
-                        self.locus = self.lrs = self.pvalue = self.mean = ""
+                        self.locus = self.locus_chr = self.locus_mb = self.lrs = self.pvalue = self.mean = ""
+                
+                
                 if self.dataset.type == 'Publish':
-                    traitQTL = g.db.execute("""
+                    trait_qtl = g.db.execute("""
                             SELECT
                                     PublishXRef.Locus, PublishXRef.LRS
                             FROM
@@ -369,9 +408,9 @@ class GeneralTrait:
                                     PublishFreeze.Id =%s
                             """, (self.name, self.dataset.id)).fetchone()
                     #self.cursor.execute(query)
-                    #traitQTL = self.cursor.fetchone()
-                    if traitQTL:
-                        self.locus, self.lrs = traitQTL
+                    #trait_qtl = self.cursor.fetchone()
+                    if trait_qtl:
+                        self.locus, self.lrs = trait_qtl
                     else:
                         self.locus = self.lrs = ""
         else:
diff --git a/wqflask/base/webqtlConfig.py b/wqflask/base/webqtlConfig.py
index 755595e0..a811c3cd 100755
--- a/wqflask/base/webqtlConfig.py
+++ b/wqflask/base/webqtlConfig.py
@@ -52,19 +52,22 @@ ENSEMBLETRANSCRIPT_URL="http://useast.ensembl.org/Mus_musculus/Lucene/Details?sp
 SECUREDIR = GNROOT + 'secure/'
 COMMON_LIB = GNROOT + 'support/admin'
 HTMLPATH = GNROOT + 'web/'
+PYLMM_PATH = '/home/zas1024/'
+SNP_PATH = '/mnt/xvdf1/snps/' 
 IMGDIR = HTMLPATH +'image/'
 IMAGESPATH = HTMLPATH + 'images/'
 UPLOADPATH = IMAGESPATH + 'upload/'
-TMPDIR = '/tmp/'
+TMPDIR = HTMLPATH + 'tmp/'
 GENODIR = HTMLPATH + 'genotypes/'
+NEWGENODIR = HTMLPATH + 'new_genotypes/'
 GENO_ARCHIVE_DIR = GENODIR + 'archive/'
 TEXTDIR = HTMLPATH + 'ProbeSetFreeze_DataMatrix/'
 CMDLINEDIR = HTMLPATH + 'webqtl/cmdLine/'
 ChangableHtmlPath = GNROOT + 'web/'
 
 SITENAME = 'GN'
-PORTADDR = "http://132.192.47.32"
-BASEHREF = '<base href="http://132.192.47.32/">'
+PORTADDR = "http://50.16.251.170"
+BASEHREF = '<base href="http://50.16.251.170/">'
 INFOPAGEHREF = '/dbdoc/%s.html'
 GLOSSARYFILE = "/glossary.html"
 CGIDIR = '/webqtl/' #XZ: The variable name 'CGIDIR' should be changed to 'PYTHONDIR'
diff --git a/wqflask/base/webqtlConfigLocal.py b/wqflask/base/webqtlConfigLocal.py
index 84686234..abaeff93 100755
--- a/wqflask/base/webqtlConfigLocal.py
+++ b/wqflask/base/webqtlConfigLocal.py
@@ -2,18 +2,18 @@
 #      Environment Variables - private
 #########################################
 
-MYSQL_SERVER = 'localhost'
-DB_NAME = 'db_webqtl_zas1024'
+MYSQL_SERVER = 'gn.cazhbciu2y1i.us-east-1.rds.amazonaws.com'
+DB_NAME = 'db_webqtl'
 DB_USER = 'webqtl'
-DB_PASSWD = 'webqtl'
+DB_PASSWD = 'f2ZypIflRM'
 
-MYSQL_UPDSERVER = 'localhost'
-DB_UPDNAME = 'db_webqtl_zas1024'
+MYSQL_UPDSERVER = 'gn.cazhbciu2y1i.us-east-1.rds.amazonaws.com'
+DB_UPDNAME = 'db_webqtl'
 DB_UPDUSER = 'webqtl'
-DB_UPDPASSWD = 'webqtl'
+DB_UPDPASSWD = 'f2ZypIflRM'
 
-GNROOT = '/home/zas1024/gn/'
-ROOT_URL = 'http://alexandria.uthsc.edu:91/'
+GNROOT = '/home/zas1024/gene/'
+ROOT_URL = 'http://50.16.251.170'
 PythonPath = '/usr/bin/python'
 PIDDLE_FONT_PATH = '/usr/lib/python2.4/site-packages/piddle/truetypefonts/'