about summary refs log tree commit diff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/base/data_set.py')
-rw-r--r--wqflask/base/data_set.py204
1 files changed, 122 insertions, 82 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 4953e728..c70738f7 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -40,7 +40,7 @@ import reaper
 
 from base import webqtlConfig
 from base import species
-from dbFunction import webqtlDatabaseFunction
+from db import webqtlDatabaseFunction
 from utility import webqtlUtil
 from utility.benchmark import Bench
 from utility import chunks
@@ -50,6 +50,12 @@ from maintenance import get_group_samplelists
 
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
+from db.gn_server import menu_main
+from db.call import fetchall,fetchone,fetch1
+
+from utility.tools import USE_GN_SERVER, USE_REDIS
+from utility.logger import getLogger
+logger = getLogger(__name__ )
 
 # Used by create_database to instantiate objects
 # Each subclass will add to this
@@ -58,8 +64,7 @@ DS_NAME_MAP = {}
 def create_dataset(dataset_name, dataset_type = None, get_samplelist = True):
     if not dataset_type:
         dataset_type = Dataset_Getter(dataset_name)
-
-        print("dataset_type is:", dataset_type)
+        logger.debug("dataset_type", dataset_type)
 
     dataset_ob = DS_NAME_MAP[dataset_type]
     dataset_class = globals()[dataset_ob]
@@ -68,12 +73,28 @@ def create_dataset(dataset_name, dataset_type = None, get_samplelist = True):
 class Dataset_Types(object):
 
     def __init__(self):
+        """Create a dictionary of samples where the value is set to Geno,
+Publish or ProbeSet. E.g.
+
+        {'AD-cases-controls-MyersGeno': 'Geno',
+         'AD-cases-controls-MyersPublish': 'Publish',
+         'AKXDGeno': 'Geno',
+         'AXBXAGeno': 'Geno',
+         'AXBXAPublish': 'Publish',
+         'Aging-Brain-UCIPublish': 'Publish',
+         'All Phenotypes': 'Publish',
+         'B139_K_1206_M': 'ProbeSet',
+         'B139_K_1206_R': 'ProbeSet' ...
+
+        """
         self.datasets = {}
-        file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
-        with open(file_name, 'r') as fh:
-            data = json.load(fh)
+        if USE_GN_SERVER:
+            data = menu_main()
+        else:
+            file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
+            with open(file_name, 'r') as fh:
+                data = json.load(fh)
 
-        print("*" * 70)
         for species in data['datasets']:
             for group in data['datasets'][species]:
                 for dataset_type in data['datasets'][species][group]:
@@ -85,7 +106,8 @@ class Dataset_Types(object):
                             new_type = "Geno"
                         else:
                             new_type = "ProbeSet"
                         self.datasets[short_dataset_name] = new_type
+        logger.info("datasets",self.datasets)
 
     def __call__(self, name):
         return self.datasets[name]
@@ -94,14 +116,15 @@ class Dataset_Types(object):
 Dataset_Getter = Dataset_Types()
 
 def create_datasets_list():
     key = "all_datasets"
-    result = Redis.get(key)
+    # With Redis disabled there is no cache to consult: always rebuild.
+    result = Redis.get(key) if USE_REDIS else None
 
     if result:
-        print("Cache hit!!!")
+        logger.debug("Redis cache hit")
         datasets = pickle.loads(result)
 
     else:
         datasets = list()
         with Bench("Creating DataSets object"):
             type_dict = {'Publish': 'PublishFreeze',
@@ -110,15 +133,17 @@ def create_datasets_list():
 
             for dataset_type in type_dict:
                 query = "SELECT Name FROM {}".format(type_dict[dataset_type])
-                for result in g.db.execute(query).fetchall():
-                    #The query at the beginning of this function isn't necessary here, but still would
-                    #rather just reuse it
-                    #print("type: {}\tname: {}".format(dataset_type, result.Name))
+                for result in fetchall(query):
+                    #The query at the beginning of this function isn't
+                    #necessary here, but still would rather just reuse it
+                    #logger.debug("type: {}\tname: {}".format(
+                    #    dataset_type, result.Name))
                     dataset = create_dataset(result.Name, dataset_type)
                     datasets.append(dataset)
 
-        Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL))
-        Redis.expire(key, 60*60)
+        if USE_REDIS:
+            Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL))
+            Redis.expire(key, 60*60)
 
     return datasets
 
@@ -133,7 +158,7 @@ def create_in_clause(items):
 def mescape(*items):
     """Multiple escape"""
     escaped = [escape(str(item)) for item in items]
-    #print("escaped is:", escaped)
+    #logger.debug("escaped is:", escaped)
     return escaped
 
 
@@ -152,12 +177,12 @@ class Markers(object):
             marker['Mb'] = float(marker['Mb'])
 
         self.markers = markers
-        #print("self.markers:", self.markers)
+        #logger.debug("self.markers:", self.markers)
 
 
     def add_pvalues(self, p_values):
-        print("length of self.markers:", len(self.markers))
-        print("length of p_values:", len(p_values))
+        logger.debug("length of self.markers:", len(self.markers))
+        logger.debug("length of p_values:", len(p_values))
 
         if type(p_values) is list:
             # THIS IS only needed for the case when we are limiting the number of p-values calculated
@@ -178,10 +203,10 @@ class Markers(object):
         elif type(p_values) is dict:
             filtered_markers = []
             for marker in self.markers:
-                #print("marker[name]", marker['name'])
-                #print("p_values:", p_values)
+                #logger.debug("marker[name]", marker['name'])
+                #logger.debug("p_values:", p_values)
                 if marker['name'] in p_values:
-                    #print("marker {} IS in p_values".format(i))
+                    #logger.debug("marker {} IS in p_values".format(i))
                     marker['p_value'] = p_values[marker['name']]
                     if math.isnan(marker['p_value']) or (marker['p_value'] <= 0):
                         marker['lod_score'] = 0
@@ -192,7 +217,7 @@ class Markers(object):
                         marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
                     filtered_markers.append(marker)
                 #else:
-                    #print("marker {} NOT in p_values".format(i))
+                    #logger.debug("marker {} NOT in p_values".format(i))
                     #self.markers.remove(marker)
                     #del self.markers[i]
             self.markers = filtered_markers
@@ -204,7 +229,7 @@ class HumanMarkers(Markers):
         self.markers = []
         for line in marker_data_fh:
             splat = line.strip().split()
-            #print("splat:", splat)
+            #logger.debug("splat:", splat)
             if len(specified_markers) > 0:
                 if splat[1] in specified_markers:
                     marker = {}
@@ -220,7 +245,7 @@ class HumanMarkers(Markers):
                 marker['Mb'] = float(splat[3]) / 1000000
             self.markers.append(marker)
 
-        #print("markers is: ", pf(self.markers))
+        #logger.debug("markers is: ", pf(self.markers))
 
 
     def add_pvalues(self, p_values):
@@ -237,15 +262,15 @@ class DatasetGroup(object):
     """
     def __init__(self, dataset):
         """This sets self.group and self.group_id"""
-        #print("DATASET NAME2:", dataset.name)
-        self.name, self.id = g.db.execute(dataset.query_for_group).fetchone()
+        #logger.debug("DATASET NAME2:", dataset.name)
+        self.name, self.id = fetchone(dataset.query_for_group)
         if self.name == 'BXD300':
             self.name = "BXD"
 
         self.f1list = None
         self.parlist = None
         self.get_f1_parent_strains()
-        #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+        #logger.debug("parents/f1s: {}:{}".format(self.parlist, self.f1list))
 
         self.species = webqtlDatabaseFunction.retrieve_species(self.name)
 
@@ -257,7 +282,7 @@ class DatasetGroup(object):
         self.markers = HumanMarkers(self.name, markers)
 
     def get_markers(self):
-        #print("self.species is:", self.species)
+        #logger.debug("self.species is:", self.species)
         if self.species == "human":
             marker_class = HumanMarkers
         else:
@@ -267,21 +292,21 @@ class DatasetGroup(object):
 
     def datasets(self):
         key = "group_dataset_menu:v2:" + self.name
-        print("key is2:", key)
+        logger.debug("key is2:", key)
         dataset_menu = []
-        print("[tape4] webqtlConfig.PUBLICTHRESH:", webqtlConfig.PUBLICTHRESH)
-        print("[tape4] type webqtlConfig.PUBLICTHRESH:", type(webqtlConfig.PUBLICTHRESH))
-        results = g.db.execute('''
+        logger.debug("[tape4] webqtlConfig.PUBLICTHRESH:", webqtlConfig.PUBLICTHRESH)
+        logger.debug("[tape4] type webqtlConfig.PUBLICTHRESH:", type(webqtlConfig.PUBLICTHRESH))
+        the_results = fetchall('''
              (SELECT '#PublishFreeze',PublishFreeze.FullName,PublishFreeze.Name
               FROM PublishFreeze,InbredSet
               WHERE PublishFreeze.InbredSetId = InbredSet.Id
-                and InbredSet.Name = %s
+                and InbredSet.Name = '%s'
                 and PublishFreeze.public > %s)
              UNION
              (SELECT '#GenoFreeze',GenoFreeze.FullName,GenoFreeze.Name
               FROM GenoFreeze, InbredSet
               WHERE GenoFreeze.InbredSetId = InbredSet.Id
-                and InbredSet.Name = %s
+                and InbredSet.Name = '%s'
                 and GenoFreeze.public > %s)
              UNION
              (SELECT Tissue.Name, ProbeSetFreeze.FullName,ProbeSetFreeze.Name
@@ -292,11 +317,9 @@ class DatasetGroup(object):
                 and InbredSet.Name like %s
                 and ProbeSetFreeze.public > %s
               ORDER BY Tissue.Name, ProbeSetFreeze.CreateTime desc, ProbeSetFreeze.AvgId)
-            ''', (self.name, webqtlConfig.PUBLICTHRESH,
-                  self.name, webqtlConfig.PUBLICTHRESH,
-                  "%" + self.name + "%", webqtlConfig.PUBLICTHRESH))
-
-        the_results = results.fetchall()
+            ''' % (escape(self.name), webqtlConfig.PUBLICTHRESH,
+                  escape(self.name), webqtlConfig.PUBLICTHRESH,
+                  "'" + escape(self.name) + "'", webqtlConfig.PUBLICTHRESH))
 
         #for tissue_name, dataset in itertools.groupby(the_results, itemgetter(0)):
         for dataset_item in the_results:
@@ -317,14 +340,15 @@ class DatasetGroup(object):
                         break
 
                 if tissue_already_exists:
-                    #print("dataset_menu:", dataset_menu[i]['datasets'])
+                    #logger.debug("dataset_menu:", dataset_menu[i]['datasets'])
                     dataset_menu[i]['datasets'].append((dataset, dataset_short))
                 else:
                     dataset_menu.append(dict(tissue=tissue_name,
                                         datasets=[(dataset, dataset_short)]))
 
-        Redis.set(key, pickle.dumps(dataset_menu, pickle.HIGHEST_PROTOCOL))
-        Redis.expire(key, 60*5)
+        if USE_REDIS:
+            Redis.set(key, pickle.dumps(dataset_menu, pickle.HIGHEST_PROTOCOL))
+            Redis.expire(key, 60*5)
         self._datasets = dataset_menu
 
         return self._datasets
@@ -342,19 +366,19 @@ class DatasetGroup(object):
             self.parlist = [maternal, paternal]
 
     def get_samplelist(self):
+        result = None
         key = "samplelist:v2:" + self.name
-        #print("key is:", key)
-        #with Bench("Loading cache"):
-        result = Redis.get(key)
+        if USE_REDIS:
+            result = Redis.get(key)
 
-        if result:
-            #print("Sample List Cache hit!!!")
-            #print("Before unjsonifying {}: {}".format(type(result), result))
+        if result is not None:
+            #logger.debug("Sample List Cache hit!!!")
+            #logger.debug("Before unjsonifying {}: {}".format(type(result), result))
             self.samplelist = json.loads(result)
-            #print("  type: ", type(self.samplelist))
-            #print("  self.samplelist: ", self.samplelist)
+            #logger.debug("  type: ", type(self.samplelist))
+            #logger.debug("  self.samplelist: ", self.samplelist)
         else:
-            print("Cache not hit")
+            logger.debug("Cache not hit")
 
             genotype_fn = locate_ignore_error(self.name+".geno",'genotype')
             mapping_fn = locate_ignore_error(self.name+".fam",'mapping')
@@ -364,9 +388,10 @@ class DatasetGroup(object):
                 self.samplelist = get_group_samplelists.get_samplelist("geno", genotype_fn)
             else:
                 self.samplelist = None
-            print("Sample list: ",self.samplelist)
-            Redis.set(key, json.dumps(self.samplelist))
-            Redis.expire(key, 60*5)
+            logger.debug("Sample list: ",self.samplelist)
+            if USE_REDIS:
+                Redis.set(key, json.dumps(self.samplelist))
+                Redis.expire(key, 60*5)
 
     def all_samples_ordered(self):
         result = []
@@ -457,14 +482,14 @@ class DataSet(object):
                     self.name,
                     self.name))
 
-                self.id, self.name, self.fullname, self.shortname, self.data_scale, self.tissue = g.db.execute("""
+                self.id, self.name, self.fullname, self.shortname, self.data_scale, self.tissue = fetchone("""
                         SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName, ProbeSetFreeze.ShortName, ProbeSetFreeze.DataScale, Tissue.Name
                         FROM ProbeSetFreeze, ProbeFreeze, Tissue
                         WHERE ProbeSetFreeze.public > %s AND
                               ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id AND
                               ProbeFreeze.TissueId = Tissue.Id AND
                              (ProbeSetFreeze.Name = '%s' OR ProbeSetFreeze.FullName = '%s' OR ProbeSetFreeze.ShortName = '%s')
-                  """ % (query_args)).fetchone()
+                  """ % (query_args))
             else:
                 query_args = tuple(escape(x) for x in (
                     (self.type + "Freeze"),
@@ -474,15 +499,15 @@ class DataSet(object):
                     self.name))
 
                 self.tissue = "N/A"
-                self.id, self.name, self.fullname, self.shortname = g.db.execute("""
+                self.id, self.name, self.fullname, self.shortname = fetchone("""
                         SELECT Id, Name, FullName, ShortName
                         FROM %s
                         WHERE public > %s AND
                              (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
-                  """ % (query_args)).fetchone()
+                  """ % (query_args))
 
         except TypeError:
-            print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
+            logger.debug("Dataset {} is not yet available in GeneNetwork.".format(self.name))
             pass
 
     def get_trait_data(self, sample_list=None):
@@ -501,6 +526,7 @@ class DataSet(object):
             and Strain.SpeciesId=Species.Id
             and Species.name = '{}'
             """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        logger.sql(query)
         results = dict(g.db.execute(query).fetchall())
         sample_ids = [results[item] for item in self.samplelist]
 
@@ -549,10 +575,11 @@ class DataSet(object):
                         """.format(*mescape(self.type, self.type, self.type, self.type,
                                    self.name, dataset_type, self.type, self.type, dataset_type))
 
-            #print("trait data query: ", query)
+            #logger.debug("trait data query: ", query)
 
+            logger.sql(query)
             results = g.db.execute(query).fetchall()
-            #print("query results:", results)
+            #logger.debug("query results:", results)
             trait_sample_data.append(results)
 
         trait_count = len(trait_sample_data[0])
@@ -571,7 +598,7 @@ class PhenotypeDataSet(DataSet):
 
     def setup(self):
 
-        #print("IS A PHENOTYPEDATASET")
+        #logger.debug("IS A PHENOTYPEDATASET")
 
         # Fields in the database table
         self.search_fields = ['Phenotype.Post_publication_description',
@@ -635,6 +662,7 @@ class PhenotypeDataSet(DataSet):
             where PublishFreeze.InbredSetId=PublishXRef.InbredSetId
             and PublishFreeze.Id = {}
             """.format(escape(str(self.id)))
+        logger.sql(query)
         results = g.db.execute(query).fetchall()
         trait_data = {}
         for trait in results:
@@ -683,12 +711,14 @@ class PhenotypeDataSet(DataSet):
             this_trait.LRS_location_value = 1000000
 
             if this_trait.lrs:
-                result = g.db.execute("""
+                query = """
                     select Geno.Chr, Geno.Mb from Geno, Species
                     where Species.Name = %s and
                         Geno.Name = %s and
                         Geno.SpeciesId = Species.Id
-                """, (species, this_trait.locus)).fetchone()
+                """
+                logger.sql(query)
+                result = g.db.execute(query, (species, this_trait.locus)).fetchone()
 
                 if result:
                     if result[0] and result[1]:
@@ -726,6 +756,7 @@ class PhenotypeDataSet(DataSet):
                     Order BY
                             Strain.Name
                     """
+        logger.sql(query)
         results = g.db.execute(query, (trait, self.id)).fetchall()
         return results
 
@@ -773,6 +804,7 @@ class GenotypeDataSet(DataSet):
             where GenoXRef.GenoId = Geno.Id
             and GenoFreezeId = {}
             """.format(escape(str(self.id)))
+        logger.sql(query)
         results = g.db.execute(query).fetchall()
         trait_data = {}
         for trait in results:
@@ -817,6 +849,7 @@ class GenotypeDataSet(DataSet):
                     Order BY
                             Strain.Name
                     """
+        logger.sql(query)
         results = g.db.execute(query,
                                (webqtlDatabaseFunction.retrieve_species_id(self.group.name),
                                 trait, self.name)).fetchall()
@@ -899,6 +932,7 @@ class MrnaAssayDataSet(DataSet):
             where ProbeSetXRef.ProbeSetId = ProbeSet.Id
             and ProbeSetFreezeId = {}
             """.format(escape(str(self.id)))
+        logger.sql(query)
         results = g.db.execute(query).fetchall()
         trait_data = {}
         for trait in results:
@@ -967,8 +1001,8 @@ class MrnaAssayDataSet(DataSet):
             """ % (escape(str(this_trait.dataset.id)),
                    escape(this_trait.name)))
 
-            #print("query is:", pf(query))
-
+            #logger.debug("query is:", pf(query))
+            logger.sql(query)
             result = g.db.execute(query).fetchone()
 
             mean = result[0] if result else 0
@@ -990,6 +1024,7 @@ class MrnaAssayDataSet(DataSet):
                         Geno.Name = '{}' and
                         Geno.SpeciesId = Species.Id
                 """.format(species, this_trait.locus)
+                logger.sql(query)
                 result = g.db.execute(query).fetchone()
 
                 if result:
@@ -1025,6 +1060,7 @@ class MrnaAssayDataSet(DataSet):
                             ProbeSet.Name = %s
                             ProbeSetFreeze.Name = %s
                 """ % (escape(self.name), escape(self.dataset.name))
+        logger.sql(query)
         results = g.db.execute(query).fetchone()
         return results[0]
 
@@ -1045,8 +1081,9 @@ class MrnaAssayDataSet(DataSet):
                     Order BY
                             Strain.Name
                     """ % (escape(trait), escape(self.name))
+        logger.sql(query)
         results = g.db.execute(query).fetchall()
-        #print("RETRIEVED RESULTS HERE:", results)
+        #logger.debug("RETRIEVED RESULTS HERE:", results)
         return results
 
 
@@ -1057,6 +1094,7 @@ class MrnaAssayDataSet(DataSet):
                     where ProbeSetXRef.ProbeSetFreezeId = %s and
                     ProbeSetXRef.ProbeSetId=ProbeSet.Id;
                 """ % (column_name, escape(str(self.id)))
+        logger.sql(query)
         results = g.db.execute(query).fetchall()
 
         return dict(results)
@@ -1093,13 +1131,15 @@ class TempDataSet(DataSet):
         return desc
 
     def get_desc(self):
-        g.db.execute('SELECT description FROM Temp WHERE Name=%s', self.name)
-        desc = g.db.fetchone()[0]
+        # Bind the name as a parameter so the driver quotes and escapes it.
+        query = 'SELECT description FROM Temp WHERE Name=%s'
+        logger.sql(query)
+        desc = g.db.execute(query, (self.name,)).fetchone()[0]
         desc = self.handle_pca(desc)
         return desc
 
     def get_group(self):
-        self.cursor.execute("""
+        query = """
                     SELECT
                             InbredSet.Name, InbredSet.Id
                     FROM
@@ -1107,9 +1147,9 @@ class TempDataSet(DataSet):
                     WHERE
                             Temp.InbredSetId = InbredSet.Id AND
                             Temp.Name = "%s"
-            """, self.name)
-        self.group, self.group_id = self.cursor.fetchone()
-        #return self.group
+            """ % escape(self.name)
+        logger.sql(query)
+        self.group, self.group_id = g.db.execute(query).fetchone()
 
     def retrieve_sample_data(self, trait):
         query = """
@@ -1125,17 +1165,18 @@ class TempDataSet(DataSet):
                         Strain.Name
                 """ % escape(trait.name)
 
+        logger.sql(query)
         results = g.db.execute(query).fetchall()
 
 
 def geno_mrna_confidentiality(ob):
     dataset_table = ob.type + "Freeze"
-    #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+    #logger.debug("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
 
     query = '''SELECT Id, Name, FullName, confidentiality,
                         AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)
-
+    logger.sql(query)
     result = g.db.execute(query, ob.name)
 
     (dataset_id,
      name,
@@ -1145,4 +1186,3 @@ def geno_mrna_confidentiality(ob):
 
     if confidential:
         return True
-