Merge branch 'flask' of git://github.com/leiyan/GeneNetwork2-Python into flask

Conflicts: wqflask/base/data_set.py wqflask/wqflask/views.py Fixed a couple conflicts to merge Lei's code related to the correlation page
author: Zachary Sloan 2013-06-19 18:30:59 +0000
committer: Zachary Sloan 2013-06-19 18:30:59 +0000
commit: 2ebacd207ee9c65d9239626fc18d2c1a50e08fbc (patch)
tree: 41627942dfce65608d376134ffb00a6765492676 /wqflask/base/data_set.py
parent: 0ce9a060640710b5a6e18f71b08d79e51ef71d8a (diff)
parent: 25bd2fa7ac229eb7862fe778fe03eb75ff34368c (diff)
download: genenetwork2-2ebacd207ee9c65d9239626fc18d2c1a50e08fbc.tar.gz
1 files changed, 113 insertions, 99 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 9b0a3dcc..07fe9cd9 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -38,6 +38,7 @@ from base import species
 from dbFunction import webqtlDatabaseFunction
 from utility import webqtlUtil
 from utility.benchmark import Bench
+from wqflask.my_pylmm.pyLMM import chunks
 
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
@@ -46,7 +47,7 @@ from pprint import pformat as pf
 DS_NAME_MAP = {}
 
 def create_dataset(dataset_name):
-    print("dataset_name:", dataset_name)
+    #print("dataset_name:", dataset_name)
 
     query = """
         SELECT DBType.Name
@@ -68,10 +69,17 @@ def create_dataset(dataset_name):
     dataset_class = globals()[dataset_ob]
     return dataset_class(dataset_name)
 
+def create_in_clause(items):
+    """Create an in clause for mysql"""
+    in_clause = ', '.join("'{}'".format(x) for x in mescape(*items))
+    in_clause = '( {} )'.format(in_clause)
+    return in_clause
+
+
 def mescape(*items):
     """Multiple escape"""
-    escaped = [escape(item) for item in items]
-    print("escaped is:", escaped)
+    escaped = [escape(str(item)) for item in items]
+    #print("escaped is:", escaped)
     return escaped
 
 
@@ -82,8 +90,8 @@ class Markers(object):
         self.markers = json.load(json_data_fh)
     
     def add_pvalues(self, p_values):
-        print("length of self.markers:", len(self.markers))
-        print("length of p_values:", len(p_values))
+        #print("length of self.markers:", len(self.markers))
+        #print("length of p_values:", len(p_values))
         
         # THIS IS only needed for the case when we are limiting the number of p-values calculated
         if len(self.markers) < len(p_values):
@@ -153,7 +161,7 @@ class DatasetGroup(object):
         self.f1list = None
         self.parlist = None        
         self.get_f1_parent_strains()
-        print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+        #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
         
         self.species = webqtlDatabaseFunction.retrieve_species(self.name)
         
@@ -162,7 +170,7 @@ class DatasetGroup(object):
         
         
     def get_markers(self):
-        print("self.species is:", self.species)
+        #print("self.species is:", self.species)
         if self.species == "human":
             marker_class = HumanMarkers 
         else:
@@ -235,6 +243,7 @@ class DataSet(object):
         self.retrieve_other_names()
         
         self.group = DatasetGroup(self)   # sets self.group and self.group_id and gets genotype
+        self.group.read_genotype_file()
         self.species = species.TheSpecies(self)
 
 
@@ -284,14 +293,14 @@ class DataSet(object):
             self.name,
             self.name,
             self.name))
-        print("query_args are:", query_args)
+        #print("query_args are:", query_args)
 
-        print("""
-                SELECT Id, Name, FullName, ShortName
-                FROM %s
-                WHERE public > %s AND
-                     (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
-          """ % (query_args))
+        #print("""
+        #        SELECT Id, Name, FullName, ShortName
+        #        FROM %s
+        #        WHERE public > %s AND
+        #             (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
+        #  """ % (query_args))
 
         self.id, self.name, self.fullname, self.shortname = g.db.execute("""
                 SELECT Id, Name, FullName, ShortName
@@ -615,34 +624,33 @@ class MrnaAssayDataSet(DataSet):
             and ProbeSetFreezeId = {}
             """.format(escape(str(self.id)))
         results = g.db.execute(query).fetchall()
-        print("After get_trait_list query")
+        #print("After get_trait_list query")
         trait_data = {}
         for trait in results:
             print("Retrieving sample_data for ", trait[0])
             trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
-        print("After retrieve_sample_data")
+        #print("After retrieve_sample_data")
         return trait_data
     
     def get_trait_data(self):
-        sample_ids = []
-        for sample in self.group.samplelist:
-            query = """
-                SELECT Strain.Id FROM Strain, Species
-                WHERE Strain.Name = '{}'
-                and Strain.SpeciesId=Species.Id
-                and Species.name = '{}'
-                """.format(*mescape(sample, self.group.species))
-            this_id = g.db.execute(query).fetchone()[0]
-            sample_ids.append('%d' % this_id)
-        print("sample_ids size: ", len(sample_ids))
+        self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+        query = """
+            SELECT Strain.Name, Strain.Id FROM Strain, Species
+            WHERE Strain.Name IN {}
+            and Strain.SpeciesId=Species.Id
+            and Species.name = '{}'
+            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        results = dict(g.db.execute(query).fetchall())
+        sample_ids = [results[item] for item in self.samplelist]
 
         # MySQL limits the number of tables that can be used in a join to 61,
         # so we break the sample ids into smaller chunks
-        chunk_count = 50
-        n = len(sample_ids) / chunk_count
-        if len(sample_ids) % chunk_count:
-            n += 1
-        print("n: ", n)
+        # Postgres doesn't have that limit, so we can get rid of this after we transition
+        chunk_size = 50
+        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+        trait_sample_data = []
+        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
         #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
         #tempTable = None
         #if GeneId and db.type == "ProbeSet": 
@@ -656,24 +664,21 @@ class MrnaAssayDataSet(DataSet):
         #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
         #                                        method=method,
         #                                        returnNumber=returnNumber)
-        trait_sample_data = []
-        for step in range(int(n)):
-            temp = []
-            sample_ids_step = sample_ids[step*chunk_count:min(len(sample_ids), (step+1)*chunk_count)]
-            for item in sample_ids_step:
-                temp.append('T%s.value' % item)
+        
+            temp = ['T%s.value' % item for item in sample_ids_step]
             query = "SELECT {}.Name,".format(escape(self.type))
             data_start_pos = 1
             query += string.join(temp, ', ')
             query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
                                                                      self.type,
                                                                      self.type))
-            #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item)
+
             for item in sample_ids_step:
                 query += """
                         left join {}Data as T{} on T{}.Id = {}XRef.DataId
                         and T{}.StrainId={}\n
                         """.format(*mescape(self.type, item, item, self.type, item, item))
+                        
             query += """
                     WHERE {}XRef.{}FreezeId = {}Freeze.Id
                     and {}Freeze.Name = '{}'
@@ -681,23 +686,24 @@ class MrnaAssayDataSet(DataSet):
                     order by {}.Id
                     """.format(*mescape(self.type, self.type, self.type, self.type,
                                self.name, self.type, self.type, self.type, self.type))
-            print("query: ", query)
             results = g.db.execute(query).fetchall()
             trait_sample_data.append(results)
-            
+
         trait_count = len(trait_sample_data[0])
         self.trait_data = collections.defaultdict(list)
+        
         # put all of the separate data together into a dictionary where the keys are
         # trait names and values are lists of sample values
-        for j in range(trait_count):
-            trait_name = trait_sample_data[0][j][0]
-            for i in range(int(n)):
-                self.trait_data[trait_name] += trait_sample_data[i][j][data_start_pos:]
-
+        for trait_counter in range(trait_count):
+            trait_name = trait_sample_data[0][trait_counter][0]
+            for chunk_counter in range(int(number_chunks)):
+                self.trait_data[trait_name] += (
+                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
     
+
     def get_trait_info(self, trait_list=None, species=''):
 
-        #  Note: setting trait_list to [] is probably not a great idea.
+        #  Note: setting trait_list to [] is probably not a great idea. 
         if not trait_list:
             trait_list = []
 
@@ -706,9 +712,7 @@ class MrnaAssayDataSet(DataSet):
             if not this_trait.haveinfo:
                 this_trait.retrieveInfo(QTL=1)
 
-            if this_trait.symbol:
-                pass
-            else:
+            if not this_trait.symbol:
                 this_trait.symbol = "N/A"
 
             #XZ, 12/08/2008: description
@@ -716,62 +720,56 @@ class MrnaAssayDataSet(DataSet):
             description_string = str(this_trait.description).strip()
             target_string = str(this_trait.probe_target_description).strip()
 
-            description_display = ''
-
             if len(description_string) > 1 and description_string != 'None':
                 description_display = description_string
             else:
                 description_display = this_trait.symbol
 
-            if len(description_display) > 1 and description_display != 'N/A' and len(target_string) > 1 and target_string != 'None':
+            if (len(description_display) > 1 and description_display != 'N/A' and
+                    len(target_string) > 1 and target_string != 'None'):
                 description_display = description_display + '; ' + target_string.strip()
 
             # Save it for the jinja2 template
             this_trait.description_display = description_display
-            #print("  xxxxdd [%s]: %s" % (type(this_trait.description_display), description_display))
 
             #XZ: trait_location_value is used for sorting
             trait_location_repr = 'N/A'
             trait_location_value = 1000000
 
             if this_trait.chr and this_trait.mb:
-                print("this_trait.chr is:", this_trait.chr)
-                print("this_trait.mb is:", this_trait.mb)
-                try:
-                    trait_location_value = float(this_trait.chr)*1000 + float(this_trait.mb)
-                except:
-                    if this_trait.chr.upper() == 'X':
-                        trait_location_value = 20*1000 + this_trait.mb
-                    else:
-                        trait_location_value = ord(str(this_trait.chr).upper()[0])*1000 + this_trait.mb
-
-                this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr, float(this_trait.mb) )
+                #Checks if the chromosome number can be cast to an int (i.e. isn't "X" or "Y")
+                #This is so we can convert the location to a number used for sorting
+                trait_location_value = self.convert_location_to_value(this_trait.chr, this_trait.mb)
+                #try:
+                #    trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
+                #except ValueError:
+                #    if this_trait.chr.upper() == 'X':
+                #        trait_location_value = 20*1000 + this_trait.mb
+                #    else:
+                #        trait_location_value = (ord(str(this_trait.chr).upper()[0])*1000 +
+                #                               this_trait.mb)
+
+                #ZS: Put this in function currently called "convert_location_to_value"
+                this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr,
+                                                                float(this_trait.mb))
                 this_trait.location_value = trait_location_value
-                #this_trait.trait_location_value = trait_location_value
 
-            #XZ, 01/12/08: This SQL query is much faster.
+            #Get mean expression value
             query = (
-"""select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
-    where ProbeSetXRef.ProbeSetFreezeId = %s and
-    ProbeSet.Id = ProbeSetXRef.ProbeSetId and
-    ProbeSet.Name = '%s'
+            """select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
+                where ProbeSetXRef.ProbeSetFreezeId = %s and
+                ProbeSet.Id = ProbeSetXRef.ProbeSetId and
+                ProbeSet.Name = '%s'
             """ % (escape(str(this_trait.dataset.id)),
                    escape(this_trait.name)))
 
-            print("query is:", pf(query))
+            #print("query is:", pf(query))
 
             result = g.db.execute(query).fetchone()
+            
+            mean = result[0] if result else 0
 
-            if result:
-                if result[0]:
-                    mean = result[0]
-                else:
-                    mean=0
-            else:
-                mean = 0
-
-            #XZ, 06/05/2009: It is neccessary to turn on nowrap
-            this_trait.mean = repr = "%2.3f" % mean
+            this_trait.mean = "%2.3f" % mean
 
             #LRS and its location
             this_trait.LRS_score_repr = 'N/A'
@@ -790,23 +788,39 @@ class MrnaAssayDataSet(DataSet):
                 result = self.cursor.fetchone()
 
                 if result:
-                    if result[0] and result[1]:
-                        LRS_Chr = result[0]
-                        LRS_Mb = result[1]
-
-                        #XZ: LRS_location_value is used for sorting
-                        try:
-                            LRS_location_value = int(LRS_Chr)*1000 + float(LRS_Mb)
-                        except:
-                            if LRS_Chr.upper() == 'X':
-                                LRS_location_value = 20*1000 + float(LRS_Mb)
-                            else:
-                                LRS_location_value = ord(str(LRS_chr).upper()[0])*1000 + float(LRS_Mb)
+                    #if result[0] and result[1]:
+                    #    lrs_chr = result[0]
+                    #    lrs_mb = result[1]
+                    lrs_chr, lrs_mb = result
+                    #XZ: LRS_location_value is used for sorting
+                    lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb)
+                    
+                    #try:
+                    #    lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb)
+                    #except:
+                    #    if lrs_chr.upper() == 'X':
+                    #        lrs_location_value = 20*1000 + float(lrs_mb)
+                    #    else:
+                    #        lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 +
+                    #                              float(lrs_mb))
+
+                    this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs
+                    this_trait.LRS_score_value = this_trait.lrs
+                    this_trait.LRS_location_repr = 'Chr %s: %.4f Mb' % (lrs_chr, float(lrs_mb))
+      
+
+    def convert_location_to_value(self, chromosome, mb):
+        try:
+            location_value = int(chromosome)*1000 + float(mb)
+        except ValueError:
+            if chromosome.upper() == 'X':
+                location_value = 20*1000 + float(mb)
+            else:
+                location_value = (ord(str(chromosome).upper()[0])*1000 +
+                                  float(mb))
+        
+        return location_value
 
-                        this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs
-                        this_trait.LRS_score_value = LRS_score_value = this_trait.lrs
-                        this_trait.LRS_location_repr = LRS_location_repr = 'Chr %s: %.4f Mb' % (LRS_Chr, float(LRS_Mb) )
-                        
     def get_sequence(self):
         query = """
                     SELECT
@@ -912,7 +926,7 @@ class TempDataSet(DataSet):
 
 def geno_mrna_confidentiality(ob):
     dataset_table = ob.type + "Freeze"
-    print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+    #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
 
     query = '''SELECT Id, Name, FullName, confidentiality,
                         AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)
author	Zachary Sloan	2013-06-19 18:30:59 +0000
committer	Zachary Sloan	2013-06-19 18:30:59 +0000
commit	2ebacd207ee9c65d9239626fc18d2c1a50e08fbc (patch)
tree	41627942dfce65608d376134ffb00a6765492676 /wqflask/base/data_set.py
parent	0ce9a060640710b5a6e18f71b08d79e51ef71d8a (diff)
parent	25bd2fa7ac229eb7862fe778fe03eb75ff34368c (diff)
download	genenetwork2-2ebacd207ee9c65d9239626fc18d2c1a50e08fbc.tar.gz