1 files changed, 311 insertions, 109 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 50ef8f57..07fe9cd9 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -22,10 +22,14 @@
 
 from __future__ import absolute_import, print_function, division
 import os
+import math
+import string
+import collections
 
-from flask import Flask, g
+import json
+import itertools
 
-from htmlgen import HTMLgen2 as HT
+from flask import Flask, g
 
 import reaper
 
@@ -33,6 +37,8 @@ from base import webqtlConfig
 from base import species
 from dbFunction import webqtlDatabaseFunction
 from utility import webqtlUtil
+from utility.benchmark import Bench
+from wqflask.my_pylmm.pyLMM import chunks
 
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
@@ -41,29 +47,102 @@ from pprint import pformat as pf
 DS_NAME_MAP = {}
 
 def create_dataset(dataset_name):
-    #cursor = db_conn.cursor()
-    print("dataset_name:", dataset_name)
+    #print("dataset_name:", dataset_name)
 
     query = """
         SELECT DBType.Name
         FROM DBList, DBType
-        WHERE DBList.Name = '%s' and
+        WHERE DBList.Name = '{}' and
               DBType.Id = DBList.DBTypeId
-        """ % (escape(dataset_name))
-    print("query is: ", pf(query))
+        """.format(escape(dataset_name))
+    #print("query is: ", pf(query))
     dataset_type = g.db.execute(query).fetchone().Name
 
     #dataset_type = cursor.fetchone()[0]
-    print("[blubber] dataset_type:", pf(dataset_type))
+    #print("[blubber] dataset_type:", pf(dataset_type))
 
     dataset_ob = DS_NAME_MAP[dataset_type]
     #dataset_class = getattr(data_set, dataset_ob)
-    print("dataset_ob:", dataset_ob)
-    print("DS_NAME_MAP:", pf(DS_NAME_MAP))
+    #print("dataset_ob:", dataset_ob)
+    #print("DS_NAME_MAP:", pf(DS_NAME_MAP))
 
     dataset_class = globals()[dataset_ob]
     return dataset_class(dataset_name)
 
+def create_in_clause(items):
+    """Create an in clause for mysql"""
+    in_clause = ', '.join("'{}'".format(x) for x in mescape(*items))
+    in_clause = '( {} )'.format(in_clause)
+    return in_clause
+
+
+def mescape(*items):
+    """Multiple escape"""
+    escaped = [escape(str(item)) for item in items]
+    #print("escaped is:", escaped)
+    return escaped
+
+
+class Markers(object):
+    """Todo: Build in cacheing so it saves us reading the same file more than once"""
+    def __init__(self, name):
+        json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json'))
+        self.markers = json.load(json_data_fh)
+    
+    def add_pvalues(self, p_values):
+        #print("length of self.markers:", len(self.markers))
+        #print("length of p_values:", len(p_values))
+        
+        # THIS IS only needed for the case when we are limiting the number of p-values calculated
+        if len(self.markers) < len(p_values):
+            self.markers = self.markers[:len(p_values)]
+        
+        for marker, p_value in itertools.izip(self.markers, p_values):
+            marker['p_value'] = p_value
+            print("p_value is:", marker['p_value'])
+            marker['lod_score'] = -math.log10(marker['p_value'])
+            #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+            marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+        
+        
+
+
+class HumanMarkers(Markers):
+    
+    def __init__(self, name):
+        marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim'))
+        self.markers = []
+        for line in marker_data_fh:
+            splat = line.strip().split()
+            marker = {}
+            marker['chr'] = int(splat[0])
+            marker['name'] = splat[1]
+            marker['Mb'] = float(splat[3]) / 1000000
+            self.markers.append(marker)
+            
+        #print("markers is: ", pf(self.markers))
+
+
+    def add_pvalues(self, p_values):
+        #for marker, p_value in itertools.izip(self.markers, p_values):
+        #    if marker['Mb'] <= 0 and marker['chr'] == 0:
+        #        continue
+        #    marker['p_value'] = p_value
+        #    print("p_value is:", marker['p_value'])
+        #    marker['lod_score'] = -math.log10(marker['p_value'])
+        #    #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+        #    marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+        
+        super(HumanMarkers, self).add_pvalues(p_values)
+        
+        with Bench("deleting markers"):
+            markers = []
+            for marker in self.markers:
+                if not marker['Mb'] <= 0 and not marker['chr'] == 0:
+                    markers.append(marker)
+            self.markers = markers
+        
+    
 
 class DatasetGroup(object):
     """
@@ -79,22 +158,41 @@ class DatasetGroup(object):
         if self.name == 'BXD300':
             self.name = "BXD"
         
+        self.f1list = None
+        self.parlist = None        
+        self.get_f1_parent_strains()
+        #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+        
         self.species = webqtlDatabaseFunction.retrieve_species(self.name)
         
         self.incparentsf1 = False
-        self.f1list = None
-        self.parlist = None
         self.allsamples = None
+        
+        
+    def get_markers(self):
+        #print("self.species is:", self.species)
+        if self.species == "human":
+            marker_class = HumanMarkers 
+        else:
+            marker_class = Markers
 
+        self.markers = marker_class(self.name)
+        
 
-    #def read_genotype(self):
-    #    self.read_genotype_file()
-    #
-    #    if not self.genotype:   # Didn'd succeed, so we try method 2
-    #        self.read_genotype_data()
+    def get_f1_parent_strains(self):
+        try:
+            # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
+            f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
+        except KeyError:
+            f1 = f12 = maternal = paternal = None
+            
+        if f1 and f12:
+            self.f1list = [f1, f12]
+        if maternal and paternal:
+            self.parlist = [maternal, paternal]
             
     def read_genotype_file(self):
-        '''read genotype from .geno file instead of database'''
+        '''Read genotype from .geno file instead of database'''
         #if self.group == 'BXD300':
         #    self.group = 'BXD'
         #
@@ -104,38 +202,24 @@ class DatasetGroup(object):
         #genotype_2 is Dataset Object with parents and f1 (not for intercross)
 
         genotype_1 = reaper.Dataset()
-        
+
         # reaper barfs on unicode filenames, so here we ensure it's a string
         full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno'))
         genotype_1.read(full_filename)
 
-        print("Got to after read")
-
-        try:
-            # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
-            f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
-        except KeyError:
-            f1 = f12 = maternal = paternal = None
-
-
-        if genotype_1.type == "group" and maternal and paternal:
-            genotype_2 = genotype_1.add(Mat=maternal, Pat=paternal)       #, F1=_f1)
+        if genotype_1.type == "group" and self.parlist:
+            genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1])       #, F1=_f1)
         else:
             genotype_2 = genotype_1
 
         #determine default genotype object
         if self.incparentsf1 and genotype_1.type != "intercross":
-            self.genotype = genotype_2
+            genotype = genotype_2
         else:
             self.incparentsf1 = 0
-            self.genotype = genotype_1
-
-        self.samplelist = list(self.genotype.prgy)
+            genotype = genotype_1
 
-        if f1 and f12:
-            self.f1list = [f1, f12]
-        if maternal and paternal:
-            self.parlist = [maternal, paternal]
+        self.samplelist = list(genotype.prgy)
 
 
 class DataSet(object):
@@ -159,10 +243,10 @@ class DataSet(object):
         self.retrieve_other_names()
         
         self.group = DatasetGroup(self)   # sets self.group and self.group_id and gets genotype
+        self.group.read_genotype_file()
         self.species = species.TheSpecies(self)
-    
-       
-        
+
+
     def get_desc(self):
         """Gets overridden later, at least for Temp...used by trait's get_given_name"""
         return None
@@ -209,14 +293,14 @@ class DataSet(object):
             self.name,
             self.name,
             self.name))
-        print("query_args are:", query_args)
+        #print("query_args are:", query_args)
 
-        print("""
-                SELECT Id, Name, FullName, ShortName
-                FROM %s
-                WHERE public > %s AND
-                     (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
-          """ % (query_args))
+        #print("""
+        #        SELECT Id, Name, FullName, ShortName
+        #        FROM %s
+        #        WHERE public > %s AND
+        #             (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
+        #  """ % (query_args))
 
         self.id, self.name, self.fullname, self.shortname = g.db.execute("""
                 SELECT Id, Name, FullName, ShortName
@@ -227,11 +311,7 @@ class DataSet(object):
 
         #self.cursor.execute(query)
         #self.id, self.name, self.fullname, self.shortname = self.cursor.fetchone()
-
-
-    #def genHTML(self, Class='c0dd'):
-    #    return  HT.Href(text = HT.Span('%s Database' % self.fullname, Class= "fwb " + Class),
-    #            url= webqtlConfig.INFOPAGEHREF % self.name,target="_blank")
+        
 
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -291,6 +371,19 @@ class PhenotypeDataSet(DataSet):
         # (Urgently?) Need to write this
         pass
 
+    def get_trait_list(self):
+        query = """
+            select PublishXRef.Id
+            from PublishXRef, PublishFreeze
+            where PublishFreeze.InbredSetId=PublishXRef.InbredSetId
+            and PublishFreeze.Id = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        trait_data = {}
+        for trait in results:
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        return trait_data
+
     def get_trait_info(self, trait_list, species = ''):
         for this_trait in trait_list:
             if not this_trait.haveinfo:
@@ -301,7 +394,7 @@ class PhenotypeDataSet(DataSet):
                 continue   # for now
                 if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
                     description = this_trait.pre_publication_description
-            this_trait.description_display = description
+            this_trait.description_display = unicode(description, "utf8")
 
             if not this_trait.year.isdigit():
                 this_trait.pubmed_text = "N/A"
@@ -359,7 +452,7 @@ class PhenotypeDataSet(DataSet):
                             PublishFreeze.Id = %d AND PublishData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (trait.name, self.id)
+                    """ % (trait, self.id)
         results = g.db.execute(query).fetchall()
         return results
 
@@ -399,6 +492,19 @@ class GenotypeDataSet(DataSet):
 
     def check_confidentiality(self):
         return geno_mrna_confidentiality(self)
+    
+    def get_trait_list(self):
+        query = """
+            select Geno.Name
+            from Geno, GenoXRef
+            where GenoXRef.GenoId = Geno.Id
+            and GenoFreezeId = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        trait_data = {}
+        for trait in results:
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        return trait_data
 
     def get_trait_info(self, trait_list, species=None):
         for this_trait in trait_list:
@@ -437,7 +543,7 @@ class GenotypeDataSet(DataSet):
                             GenoData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait.name, self.name)
+                    """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait, self.name)
         results = g.db.execute(query).fetchall()
         return results
 
@@ -509,10 +615,95 @@ class MrnaAssayDataSet(DataSet):
 
     def check_confidentiality(self):
         return geno_mrna_confidentiality(self)
+        
+    def get_trait_list_1(self):
+        query = """
+            select ProbeSet.Name
+            from ProbeSet, ProbeSetXRef
+            where ProbeSetXRef.ProbeSetId = ProbeSet.Id
+            and ProbeSetFreezeId = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        #print("After get_trait_list query")
+        trait_data = {}
+        for trait in results:
+            print("Retrieving sample_data for ", trait[0])
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        #print("After retrieve_sample_data")
+        return trait_data
+    
+    def get_trait_data(self):
+        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
+
+        # MySQL limits the number of tables that can be used in a join to 61,
+        # so we break the sample ids into smaller chunks
+        # Postgres doesn't have that limit, so we can get rid of this after we transition
+        chunk_size = 50
+        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+        trait_sample_data = []
+        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
+        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+        #tempTable = None
+        #if GeneId and db.type == "ProbeSet": 
+        #    if method == "3":
+        #        tempTable = self.getTempLiteratureTable(species=species,
+        #                                                input_species_geneid=GeneId,
+        #                                                returnNumber=returnNumber)
+        #
+        #    if method == "4" or method == "5":
+        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+        #                                        method=method,
+        #                                        returnNumber=returnNumber)
+        
+            temp = ['T%s.value' % item for item in sample_ids_step]
+            query = "SELECT {}.Name,".format(escape(self.type))
+            data_start_pos = 1
+            query += string.join(temp, ', ')
+            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+                                                                     self.type,
+                                                                     self.type))
+
+            for item in sample_ids_step:
+                query += """
+                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                        and T{}.StrainId={}\n
+                        """.format(*mescape(self.type, item, item, self.type, item, item))
+                        
+            query += """
+                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                    and {}Freeze.Name = '{}'
+                    and {}.Id = {}XRef.{}Id
+                    order by {}.Id
+                    """.format(*mescape(self.type, self.type, self.type, self.type,
+                               self.name, self.type, self.type, self.type, self.type))
+            results = g.db.execute(query).fetchall()
+            trait_sample_data.append(results)
+
+        trait_count = len(trait_sample_data[0])
+        self.trait_data = collections.defaultdict(list)
+        
+        # put all of the separate data together into a dictionary where the keys are
+        # trait names and values are lists of sample values
+        for trait_counter in range(trait_count):
+            trait_name = trait_sample_data[0][trait_counter][0]
+            for chunk_counter in range(int(number_chunks)):
+                self.trait_data[trait_name] += (
+                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+    
 
     def get_trait_info(self, trait_list=None, species=''):
 
-        #  Note: setting trait_list to [] is probably not a great idea.
+        #  Note: setting trait_list to [] is probably not a great idea. 
         if not trait_list:
             trait_list = []
 
@@ -521,9 +712,7 @@ class MrnaAssayDataSet(DataSet):
             if not this_trait.haveinfo:
                 this_trait.retrieveInfo(QTL=1)
 
-            if this_trait.symbol:
-                pass
-            else:
+            if not this_trait.symbol:
                 this_trait.symbol = "N/A"
 
             #XZ, 12/08/2008: description
@@ -531,60 +720,56 @@ class MrnaAssayDataSet(DataSet):
             description_string = str(this_trait.description).strip()
             target_string = str(this_trait.probe_target_description).strip()
 
-            description_display = ''
-
             if len(description_string) > 1 and description_string != 'None':
                 description_display = description_string
             else:
                 description_display = this_trait.symbol
 
-            if len(description_display) > 1 and description_display != 'N/A' and len(target_string) > 1 and target_string != 'None':
+            if (len(description_display) > 1 and description_display != 'N/A' and
+                    len(target_string) > 1 and target_string != 'None'):
                 description_display = description_display + '; ' + target_string.strip()
 
             # Save it for the jinja2 template
             this_trait.description_display = description_display
-            #print("  xxxxdd [%s]: %s" % (type(this_trait.description_display), description_display))
 
             #XZ: trait_location_value is used for sorting
             trait_location_repr = 'N/A'
             trait_location_value = 1000000
 
             if this_trait.chr and this_trait.mb:
-                try:
-                    trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
-                except:
-                    if this_trait.chr.upper() == 'X':
-                        trait_location_value = 20*1000 + this_trait.mb
-                    else:
-                        trait_location_value = ord(str(this_trait.chr).upper()[0])*1000 + this_trait.mb
-
-                this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr, float(this_trait.mb) )
+                #Checks if the chromosome number can be cast to an int (i.e. isn't "X" or "Y")
+                #This is so we can convert the location to a number used for sorting
+                trait_location_value = self.convert_location_to_value(this_trait.chr, this_trait.mb)
+                #try:
+                #    trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
+                #except ValueError:
+                #    if this_trait.chr.upper() == 'X':
+                #        trait_location_value = 20*1000 + this_trait.mb
+                #    else:
+                #        trait_location_value = (ord(str(this_trait.chr).upper()[0])*1000 +
+                #                               this_trait.mb)
+
+                #ZS: Put this in function currently called "convert_location_to_value"
+                this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr,
+                                                                float(this_trait.mb))
                 this_trait.location_value = trait_location_value
-                #this_trait.trait_location_value = trait_location_value
 
-            #XZ, 01/12/08: This SQL query is much faster.
+            #Get mean expression value
             query = (
-"""select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
-    where ProbeSetXRef.ProbeSetFreezeId = %s and
-    ProbeSet.Id = ProbeSetXRef.ProbeSetId and
-    ProbeSet.Name = '%s'
+            """select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
+                where ProbeSetXRef.ProbeSetFreezeId = %s and
+                ProbeSet.Id = ProbeSetXRef.ProbeSetId and
+                ProbeSet.Name = '%s'
             """ % (escape(str(this_trait.dataset.id)),
                    escape(this_trait.name)))
 
-            print("query is:", pf(query))
+            #print("query is:", pf(query))
 
             result = g.db.execute(query).fetchone()
+            
+            mean = result[0] if result else 0
 
-            if result:
-                if result[0]:
-                    mean = result[0]
-                else:
-                    mean=0
-            else:
-                mean = 0
-
-            #XZ, 06/05/2009: It is neccessary to turn on nowrap
-            this_trait.mean = repr = "%2.3f" % mean
+            this_trait.mean = "%2.3f" % mean
 
             #LRS and its location
             this_trait.LRS_score_repr = 'N/A'
@@ -603,23 +788,39 @@ class MrnaAssayDataSet(DataSet):
                 result = self.cursor.fetchone()
 
                 if result:
-                    if result[0] and result[1]:
-                        LRS_Chr = result[0]
-                        LRS_Mb = result[1]
-
-                        #XZ: LRS_location_value is used for sorting
-                        try:
-                            LRS_location_value = int(LRS_Chr)*1000 + float(LRS_Mb)
-                        except:
-                            if LRS_Chr.upper() == 'X':
-                                LRS_location_value = 20*1000 + float(LRS_Mb)
-                            else:
-                                LRS_location_value = ord(str(LRS_chr).upper()[0])*1000 + float(LRS_Mb)
+                    #if result[0] and result[1]:
+                    #    lrs_chr = result[0]
+                    #    lrs_mb = result[1]
+                    lrs_chr, lrs_mb = result
+                    #XZ: LRS_location_value is used for sorting
+                    lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb)
+                    
+                    #try:
+                    #    lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb)
+                    #except:
+                    #    if lrs_chr.upper() == 'X':
+                    #        lrs_location_value = 20*1000 + float(lrs_mb)
+                    #    else:
+                    #        lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 +
+                    #                              float(lrs_mb))
+
+                    this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs
+                    this_trait.LRS_score_value = this_trait.lrs
+                    this_trait.LRS_location_repr = 'Chr %s: %.4f Mb' % (lrs_chr, float(lrs_mb))
+      
+
+    def convert_location_to_value(self, chromosome, mb):
+        try:
+            location_value = int(chromosome)*1000 + float(mb)
+        except ValueError:
+            if chromosome.upper() == 'X':
+                location_value = 20*1000 + float(mb)
+            else:
+                location_value = (ord(str(chromosome).upper()[0])*1000 +
+                                  float(mb))
+        
+        return location_value
 
-                        this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs
-                        this_trait.LRS_score_value = LRS_score_value = this_trait.lrs
-                        this_trait.LRS_location_repr = LRS_location_repr = 'Chr %s: %.4f Mb' % (LRS_Chr, float(LRS_Mb) )
-                        
     def get_sequence(self):
         query = """
                     SELECT
@@ -633,9 +834,9 @@ class MrnaAssayDataSet(DataSet):
                             ProbeSetFreeze.Name = %s
                 """ % (escape(self.name), escape(self.dataset.name))
         results = g.db.execute(query).fetchone()
-
         return results[0]
     
+   
     def retrieve_sample_data(self, trait):
         query = """
                     SELECT
@@ -652,7 +853,7 @@ class MrnaAssayDataSet(DataSet):
                             ProbeSetData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (escape(trait.name), escape(self.name))
+                    """ % (escape(trait), escape(self.name))
         results = g.db.execute(query).fetchall()
         return results
 
@@ -725,7 +926,7 @@ class TempDataSet(DataSet):
 
 def geno_mrna_confidentiality(ob):
     dataset_table = ob.type + "Freeze"
-    print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+    #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
 
     query = '''SELECT Id, Name, FullName, confidentiality,
                         AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)
@@ -741,3 +942,4 @@ def geno_mrna_confidentiality(ob):
     if confidential:
         # Allow confidential data later
         NoConfindetialDataForYouTodaySorry
+