about summary refs log tree commit diff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-xwqflask/base/data_set.py290
1 files changed, 239 insertions, 51 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 50ef8f57..9b0a3dcc 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -22,10 +22,14 @@
 
 from __future__ import absolute_import, print_function, division
 import os
+import math
+import string
+import collections
 
-from flask import Flask, g
+import json
+import itertools
 
-from htmlgen import HTMLgen2 as HT
+from flask import Flask, g
 
 import reaper
 
@@ -33,6 +37,7 @@ from base import webqtlConfig
 from base import species
 from dbFunction import webqtlDatabaseFunction
 from utility import webqtlUtil
+from utility.benchmark import Bench
 
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
@@ -41,29 +46,95 @@ from pprint import pformat as pf
 DS_NAME_MAP = {}
 
 def create_dataset(dataset_name):
-    #cursor = db_conn.cursor()
     print("dataset_name:", dataset_name)
 
     query = """
         SELECT DBType.Name
         FROM DBList, DBType
-        WHERE DBList.Name = '%s' and
+        WHERE DBList.Name = '{}' and
               DBType.Id = DBList.DBTypeId
-        """ % (escape(dataset_name))
-    print("query is: ", pf(query))
+        """.format(escape(dataset_name))
+    #print("query is: ", pf(query))
     dataset_type = g.db.execute(query).fetchone().Name
 
     #dataset_type = cursor.fetchone()[0]
-    print("[blubber] dataset_type:", pf(dataset_type))
+    #print("[blubber] dataset_type:", pf(dataset_type))
 
     dataset_ob = DS_NAME_MAP[dataset_type]
     #dataset_class = getattr(data_set, dataset_ob)
-    print("dataset_ob:", dataset_ob)
-    print("DS_NAME_MAP:", pf(DS_NAME_MAP))
+    #print("dataset_ob:", dataset_ob)
+    #print("DS_NAME_MAP:", pf(DS_NAME_MAP))
 
     dataset_class = globals()[dataset_ob]
     return dataset_class(dataset_name)
 
+def mescape(*items):
+    """Multiple escape"""
+    escaped = [escape(item) for item in items]
+    print("escaped is:", escaped)
+    return escaped
+
+
+class Markers(object):
+    """Todo: Build in cacheing so it saves us reading the same file more than once"""
+    def __init__(self, name):
+        json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json'))
+        self.markers = json.load(json_data_fh)
+    
+    def add_pvalues(self, p_values):
+        print("length of self.markers:", len(self.markers))
+        print("length of p_values:", len(p_values))
+        
+        # THIS IS only needed for the case when we are limiting the number of p-values calculated
+        if len(self.markers) < len(p_values):
+            self.markers = self.markers[:len(p_values)]
+        
+        for marker, p_value in itertools.izip(self.markers, p_values):
+            marker['p_value'] = p_value
+            print("p_value is:", marker['p_value'])
+            marker['lod_score'] = -math.log10(marker['p_value'])
+            #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+            marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+        
+        
+
+
+class HumanMarkers(Markers):
+    
+    def __init__(self, name):
+        marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim'))
+        self.markers = []
+        for line in marker_data_fh:
+            splat = line.strip().split()
+            marker = {}
+            marker['chr'] = int(splat[0])
+            marker['name'] = splat[1]
+            marker['Mb'] = float(splat[3]) / 1000000
+            self.markers.append(marker)
+            
+        #print("markers is: ", pf(self.markers))
+
+
+    def add_pvalues(self, p_values):
+        #for marker, p_value in itertools.izip(self.markers, p_values):
+        #    if marker['Mb'] <= 0 and marker['chr'] == 0:
+        #        continue
+        #    marker['p_value'] = p_value
+        #    print("p_value is:", marker['p_value'])
+        #    marker['lod_score'] = -math.log10(marker['p_value'])
+        #    #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+        #    marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+        
+        super(HumanMarkers, self).add_pvalues(p_values)
+        
+        with Bench("deleting markers"):
+            markers = []
+            for marker in self.markers:
+                if not marker['Mb'] <= 0 and not marker['chr'] == 0:
+                    markers.append(marker)
+            self.markers = markers
+        
+    
 
 class DatasetGroup(object):
     """
@@ -79,22 +150,41 @@ class DatasetGroup(object):
         if self.name == 'BXD300':
             self.name = "BXD"
         
+        self.f1list = None
+        self.parlist = None        
+        self.get_f1_parent_strains()
+        print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+        
         self.species = webqtlDatabaseFunction.retrieve_species(self.name)
         
         self.incparentsf1 = False
-        self.f1list = None
-        self.parlist = None
         self.allsamples = None
+        
+        
+    def get_markers(self):
+        print("self.species is:", self.species)
+        if self.species == "human":
+            marker_class = HumanMarkers 
+        else:
+            marker_class = Markers
 
+        self.markers = marker_class(self.name)
+        
 
-    #def read_genotype(self):
-    #    self.read_genotype_file()
-    #
-    #    if not self.genotype:   # Didn'd succeed, so we try method 2
-    #        self.read_genotype_data()
+    def get_f1_parent_strains(self):
+        try:
+            # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
+            f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
+        except KeyError:
+            f1 = f12 = maternal = paternal = None
+            
+        if f1 and f12:
+            self.f1list = [f1, f12]
+        if maternal and paternal:
+            self.parlist = [maternal, paternal]
             
     def read_genotype_file(self):
-        '''read genotype from .geno file instead of database'''
+        '''Read genotype from .geno file instead of database'''
         #if self.group == 'BXD300':
         #    self.group = 'BXD'
         #
@@ -104,38 +194,24 @@ class DatasetGroup(object):
         #genotype_2 is Dataset Object with parents and f1 (not for intercross)
 
         genotype_1 = reaper.Dataset()
-        
+
         # reaper barfs on unicode filenames, so here we ensure it's a string
         full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno'))
         genotype_1.read(full_filename)
 
-        print("Got to after read")
-
-        try:
-            # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
-            f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
-        except KeyError:
-            f1 = f12 = maternal = paternal = None
-
-
-        if genotype_1.type == "group" and maternal and paternal:
-            genotype_2 = genotype_1.add(Mat=maternal, Pat=paternal)       #, F1=_f1)
+        if genotype_1.type == "group" and self.parlist:
+            genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1])       #, F1=_f1)
         else:
             genotype_2 = genotype_1
 
         #determine default genotype object
         if self.incparentsf1 and genotype_1.type != "intercross":
-            self.genotype = genotype_2
+            genotype = genotype_2
         else:
             self.incparentsf1 = 0
-            self.genotype = genotype_1
+            genotype = genotype_1
 
-        self.samplelist = list(self.genotype.prgy)
-
-        if f1 and f12:
-            self.f1list = [f1, f12]
-        if maternal and paternal:
-            self.parlist = [maternal, paternal]
+        self.samplelist = list(genotype.prgy)
 
 
 class DataSet(object):
@@ -160,9 +236,8 @@ class DataSet(object):
         
         self.group = DatasetGroup(self)   # sets self.group and self.group_id and gets genotype
         self.species = species.TheSpecies(self)
-    
-       
-        
+
+
     def get_desc(self):
         """Gets overridden later, at least for Temp...used by trait's get_given_name"""
         return None
@@ -227,11 +302,7 @@ class DataSet(object):
 
         #self.cursor.execute(query)
         #self.id, self.name, self.fullname, self.shortname = self.cursor.fetchone()
-
-
-    #def genHTML(self, Class='c0dd'):
-    #    return  HT.Href(text = HT.Span('%s Database' % self.fullname, Class= "fwb " + Class),
-    #            url= webqtlConfig.INFOPAGEHREF % self.name,target="_blank")
+        
 
 class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -291,6 +362,19 @@ class PhenotypeDataSet(DataSet):
         # (Urgently?) Need to write this
         pass
 
+    def get_trait_list(self):
+        query = """
+            select PublishXRef.Id
+            from PublishXRef, PublishFreeze
+            where PublishFreeze.InbredSetId=PublishXRef.InbredSetId
+            and PublishFreeze.Id = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        trait_data = {}
+        for trait in results:
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        return trait_data
+
     def get_trait_info(self, trait_list, species = ''):
         for this_trait in trait_list:
             if not this_trait.haveinfo:
@@ -301,7 +385,7 @@ class PhenotypeDataSet(DataSet):
                 continue   # for now
                 if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
                     description = this_trait.pre_publication_description
-            this_trait.description_display = description
+            this_trait.description_display = unicode(description, "utf8")
 
             if not this_trait.year.isdigit():
                 this_trait.pubmed_text = "N/A"
@@ -359,7 +443,7 @@ class PhenotypeDataSet(DataSet):
                             PublishFreeze.Id = %d AND PublishData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (trait.name, self.id)
+                    """ % (trait, self.id)
         results = g.db.execute(query).fetchall()
         return results
 
@@ -399,6 +483,19 @@ class GenotypeDataSet(DataSet):
 
     def check_confidentiality(self):
         return geno_mrna_confidentiality(self)
+    
+    def get_trait_list(self):
+        query = """
+            select Geno.Name
+            from Geno, GenoXRef
+            where GenoXRef.GenoId = Geno.Id
+            and GenoFreezeId = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        trait_data = {}
+        for trait in results:
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        return trait_data
 
     def get_trait_info(self, trait_list, species=None):
         for this_trait in trait_list:
@@ -437,7 +534,7 @@ class GenotypeDataSet(DataSet):
                             GenoData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait.name, self.name)
+                    """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait, self.name)
         results = g.db.execute(query).fetchall()
         return results
 
@@ -509,7 +606,95 @@ class MrnaAssayDataSet(DataSet):
 
     def check_confidentiality(self):
         return geno_mrna_confidentiality(self)
+        
+    def get_trait_list_1(self):
+        query = """
+            select ProbeSet.Name
+            from ProbeSet, ProbeSetXRef
+            where ProbeSetXRef.ProbeSetId = ProbeSet.Id
+            and ProbeSetFreezeId = {}
+            """.format(escape(str(self.id)))
+        results = g.db.execute(query).fetchall()
+        print("After get_trait_list query")
+        trait_data = {}
+        for trait in results:
+            print("Retrieving sample_data for ", trait[0])
+            trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+        print("After retrieve_sample_data")
+        return trait_data
+    
+    def get_trait_data(self):
+        sample_ids = []
+        for sample in self.group.samplelist:
+            query = """
+                SELECT Strain.Id FROM Strain, Species
+                WHERE Strain.Name = '{}'
+                and Strain.SpeciesId=Species.Id
+                and Species.name = '{}'
+                """.format(*mescape(sample, self.group.species))
+            this_id = g.db.execute(query).fetchone()[0]
+            sample_ids.append('%d' % this_id)
+        print("sample_ids size: ", len(sample_ids))
+
+        # MySQL limits the number of tables that can be used in a join to 61,
+        # so we break the sample ids into smaller chunks
+        chunk_count = 50
+        n = len(sample_ids) / chunk_count
+        if len(sample_ids) % chunk_count:
+            n += 1
+        print("n: ", n)
+        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+        #tempTable = None
+        #if GeneId and db.type == "ProbeSet": 
+        #    if method == "3":
+        #        tempTable = self.getTempLiteratureTable(species=species,
+        #                                                input_species_geneid=GeneId,
+        #                                                returnNumber=returnNumber)
+        #
+        #    if method == "4" or method == "5":
+        #        tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+        #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+        #                                        method=method,
+        #                                        returnNumber=returnNumber)
+        trait_sample_data = []
+        for step in range(int(n)):
+            temp = []
+            sample_ids_step = sample_ids[step*chunk_count:min(len(sample_ids), (step+1)*chunk_count)]
+            for item in sample_ids_step:
+                temp.append('T%s.value' % item)
+            query = "SELECT {}.Name,".format(escape(self.type))
+            data_start_pos = 1
+            query += string.join(temp, ', ')
+            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+                                                                     self.type,
+                                                                     self.type))
+            #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item)
+            for item in sample_ids_step:
+                query += """
+                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
+                        and T{}.StrainId={}\n
+                        """.format(*mescape(self.type, item, item, self.type, item, item))
+            query += """
+                    WHERE {}XRef.{}FreezeId = {}Freeze.Id
+                    and {}Freeze.Name = '{}'
+                    and {}.Id = {}XRef.{}Id
+                    order by {}.Id
+                    """.format(*mescape(self.type, self.type, self.type, self.type,
+                               self.name, self.type, self.type, self.type, self.type))
+            print("query: ", query)
+            results = g.db.execute(query).fetchall()
+            trait_sample_data.append(results)
+            
+        trait_count = len(trait_sample_data[0])
+        self.trait_data = collections.defaultdict(list)
+        # put all of the separate data together into a dictionary where the keys are
+        # trait names and values are lists of sample values
+        for j in range(trait_count):
+            trait_name = trait_sample_data[0][j][0]
+            for i in range(int(n)):
+                self.trait_data[trait_name] += trait_sample_data[i][j][data_start_pos:]
 
+    
     def get_trait_info(self, trait_list=None, species=''):
 
         #  Note: setting trait_list to [] is probably not a great idea.
@@ -550,8 +735,10 @@ class MrnaAssayDataSet(DataSet):
             trait_location_value = 1000000
 
             if this_trait.chr and this_trait.mb:
+                print("this_trait.chr is:", this_trait.chr)
+                print("this_trait.mb is:", this_trait.mb)
                 try:
-                    trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
+                    trait_location_value = float(this_trait.chr)*1000 + float(this_trait.mb)
                 except:
                     if this_trait.chr.upper() == 'X':
                         trait_location_value = 20*1000 + this_trait.mb
@@ -633,9 +820,9 @@ class MrnaAssayDataSet(DataSet):
                             ProbeSetFreeze.Name = %s
                 """ % (escape(self.name), escape(self.dataset.name))
         results = g.db.execute(query).fetchone()
-
         return results[0]
     
+   
     def retrieve_sample_data(self, trait):
         query = """
                     SELECT
@@ -652,7 +839,7 @@ class MrnaAssayDataSet(DataSet):
                             ProbeSetData.StrainId = Strain.Id
                     Order BY
                             Strain.Name
-                    """ % (escape(trait.name), escape(self.name))
+                    """ % (escape(trait), escape(self.name))
         results = g.db.execute(query).fetchall()
         return results
 
@@ -741,3 +928,4 @@ def geno_mrna_confidentiality(ob):
     if confidential:
         # Allow confidential data later
         NoConfindetialDataForYouTodaySorry
+