about summary refs log tree commit diff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
authorPjotr Prins2016-02-24 11:01:29 +0000
committerPjotr Prins2016-04-20 09:13:26 +0000
commit5fffa1aa4260af6663c1f9b4cf7494d3ebc6995d (patch)
treea68c6d32ab7db60db2350c5a0ba8c0d4abc7eaf2 /wqflask/base/data_set.py
parent92fda7557645199276bf4f8a65c40252c4d83c21 (diff)
downloadgenenetwork2-5fffa1aa4260af6663c1f9b4cf7494d3ebc6995d.tar.gz
[PATCH 037/100] WebQtlConfig: sanitizing naming and used vars
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-xwqflask/base/data_set.py194
1 files changed, 97 insertions, 97 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index ce13dd77..6527657a 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -76,22 +76,22 @@ def create_dataset(dataset_name, dataset_type = None, get_samplelist = True):
 
 
 #def get_dataset_type_from_json(dataset_name):
-    
+
 class Dataset_Types(object):
-    
+
     def __init__(self):
         self.datasets = {}
         file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
         with open(file_name, 'r') as fh:
             data = json.load(fh)
-        
+
         print("*" * 70)
         for species in data['datasets']:
             for group in data['datasets'][species]:
                 for dataset_type in data['datasets'][species][group]:
                     for dataset in data['datasets'][species][group][dataset_type]:
                         #print("dataset is:", dataset)
-                        
+
                         short_dataset_name = dataset[1]
                         if dataset_type == "Phenotypes":
                             new_type = "Publish"
@@ -100,32 +100,32 @@ class Dataset_Types(object):
                         else:
                             new_type = "ProbeSet"
                         self.datasets[short_dataset_name] = new_type
-                            
+
     def __call__(self, name):
         return self.datasets[name]
-    
+
 # Do the intensive work at startup one time only
 Dataset_Getter = Dataset_Types()
 
 #
 #print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611"))
-                    
+
 
 def create_datasets_list():
     key = "all_datasets"
     result = Redis.get(key)
-    
+
     if result:
         print("Cache hit!!!")
         datasets = pickle.loads(result)
-        
+
     else:
         datasets = list()
         with Bench("Creating DataSets object"):
             type_dict = {'Publish': 'PublishFreeze',
                          'ProbeSet': 'ProbeSetFreeze',
                          'Geno': 'GenoFreeze'}
-        
+
             for dataset_type in type_dict:
                 query = "SELECT Name FROM {}".format(type_dict[dataset_type])
                 for result in g.db.execute(query).fetchall():
@@ -134,10 +134,10 @@ def create_datasets_list():
                     #print("type: {}\tname: {}".format(dataset_type, result.Name))
                     dataset = create_dataset(result.Name, dataset_type)
                     datasets.append(dataset)
-            
+
         Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL))
         Redis.expire(key, 60*60)
-    
+
     return datasets
 
 
@@ -158,30 +158,30 @@ def mescape(*items):
 class Markers(object):
     """Todo: Build in cacheing so it saves us reading the same file more than once"""
     def __init__(self, name):
-        json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json'))
+        json_data_fh = open(locate(name + '.json','genotype/json'))
         try:
             markers = json.load(json_data_fh)
         except:
             markers = []
-    
+
         for marker in markers:
             if (marker['chr'] != "X") and (marker['chr'] != "Y"):
                 marker['chr'] = int(marker['chr'])
             marker['Mb'] = float(marker['Mb'])
-            
+
         self.markers = markers
         #print("self.markers:", self.markers)
-    
-    
+
+
     def add_pvalues(self, p_values):
         print("length of self.markers:", len(self.markers))
         print("length of p_values:", len(p_values))
-        
+
         if type(p_values) is list:
             # THIS IS only needed for the case when we are limiting the number of p-values calculated
             #if len(self.markers) > len(p_values):
             #    self.markers = self.markers[:len(p_values)]
-            
+
             for marker, p_value in itertools.izip(self.markers, p_values):
                 if not p_value:
                     continue
@@ -214,7 +214,7 @@ class Markers(object):
                     #self.markers.remove(marker)
                     #del self.markers[i]
             self.markers = filtered_markers
-            
+
 
         #for i, marker in enumerate(self.markers):
         #    if not 'p_value' in marker:
@@ -223,9 +223,9 @@ class Markers(object):
         #        #self.markers.remove(self.markers[i])
 
 class HumanMarkers(Markers):
-    
+
     def __init__(self, name, specified_markers = []):
-        marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim'))
+        marker_data_fh = open(locate('genotype') + '/' + name + '.bim')
         self.markers = []
         for line in marker_data_fh:
             splat = line.strip().split()
@@ -244,7 +244,7 @@ class HumanMarkers(Markers):
                 marker['name'] = splat[1]
                 marker['Mb'] = float(splat[3]) / 1000000
             self.markers.append(marker)
-            
+
         #print("markers is: ", pf(self.markers))
 
 
@@ -257,26 +257,26 @@ class HumanMarkers(Markers):
         #    marker['lod_score'] = -math.log10(marker['p_value'])
         #    #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
         #    marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
-        
+
         #print("p_values2:", pf(p_values))
         super(HumanMarkers, self).add_pvalues(p_values)
-        
+
         #with Bench("deleting markers"):
         #    markers = []
         #    for marker in self.markers:
         #        if not marker['Mb'] <= 0 and not marker['chr'] == 0:
         #            markers.append(marker)
         #    self.markers = markers
-        
-    
+
+
 
 class DatasetGroup(object):
     """
     Each group has multiple datasets; each species has multiple groups.
-    
+
     For example, Mouse has multiple groups (BXD, BXA, etc), and each group
     has multiple datasets associated with it.
-    
+
     """
     def __init__(self, dataset):
         """This sets self.group and self.group_id"""
@@ -284,14 +284,14 @@ class DatasetGroup(object):
         self.name, self.id = g.db.execute(dataset.query_for_group).fetchone()
         if self.name == 'BXD300':
             self.name = "BXD"
-        
+
         self.f1list = None
         self.parlist = None
         self.get_f1_parent_strains()
         #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
-        
+
         self.species = webqtlDatabaseFunction.retrieve_species(self.name)
-        
+
         self.incparentsf1 = False
         self.allsamples = None
         self._datasets = None
@@ -302,7 +302,7 @@ class DatasetGroup(object):
     def get_markers(self):
         #print("self.species is:", self.species)
         if self.species == "human":
-            marker_class = HumanMarkers 
+            marker_class = HumanMarkers
         else:
             marker_class = Markers
 
@@ -356,7 +356,7 @@ class DatasetGroup(object):
                 dataset_menu.append(dict(tissue=None, datasets=[(dataset, dataset_short)]))
             else:
                 dataset_sub_menu = [item[1:] for item in dataset]
-                
+
                 tissue_already_exists = False
                 tissue_position = None
                 for i, tissue_dict in enumerate(dataset_menu):
@@ -384,7 +384,7 @@ class DatasetGroup(object):
             f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
         except KeyError:
             f1 = f12 = maternal = paternal = None
-            
+
         if f1 and f12:
             self.f1list = [f1, f12]
         if maternal and paternal:
@@ -455,18 +455,18 @@ class DatasetGroup(object):
 
         #self.samplelist = list(self.genotype.prgy)
         self.samplelist = list(genotype.prgy)
-        
+
         return genotype
 
 
 #class DataSets(object):
 #    """Builds a list of DataSets"""
-#    
+#
 #    def __init__(self):
 #        self.datasets = list()
-#        
+#
+
 
-        
         #query = """SELECT Name FROM ProbeSetFreeze
         #           UNION
         #           SELECT Name From PublishFreeze
@@ -501,7 +501,7 @@ class DataSet(object):
         self.check_confidentiality()
 
         self.retrieve_other_names()
-        
+
         self.group = DatasetGroup(self)   # sets self.group and self.group_id and gets genotype
         if get_samplelist == True:
              self.group.get_samplelist()
@@ -511,30 +511,30 @@ class DataSet(object):
     def get_desc(self):
         """Gets overridden later, at least for Temp...used by trait's get_given_name"""
         return None
-    
+
     #@staticmethod
     #def get_by_trait_id(trait_id):
     #    """Gets the dataset object given the trait id"""
-    #    
-    #    
     #
-    #    name = g.db.execute(""" SELECT 
-    #                        
+    #
+    #
+    #    name = g.db.execute(""" SELECT
+    #
     #                        """)
-    #    
+    #
     #    return DataSet(name)
 
     # Delete this eventually
     @property
     def riset():
         Weve_Renamed_This_As_Group
-        
-        
+
+
     #@property
     #def group(self):
     #    if not self._group:
     #        self.get_group()
-    #        
+    #
     #    return self._group
 
 
@@ -546,7 +546,7 @@ class DataSet(object):
         This is not meant to retrieve the data set info if no name at all is passed.
 
         """
-        
+
         try:
             if self.type == "ProbeSet":
                 query_args = tuple(escape(x) for x in (
@@ -582,17 +582,17 @@ class DataSet(object):
         except TypeError:
             print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
             pass
-        
+
     def get_trait_data(self, sample_list=None):
         if sample_list:
             self.samplelist = sample_list
         else:
             self.samplelist = self.group.samplelist
-            
+
         if self.group.parlist != None and self.group.f1list != None:
             if (self.group.parlist + self.group.f1list) in self.samplelist:
                 self.samplelist += self.group.parlist + self.group.f1list
-        
+
         query = """
             SELECT Strain.Name, Strain.Id FROM Strain, Species
             WHERE Strain.Name IN {}
@@ -610,9 +610,9 @@ class DataSet(object):
         trait_sample_data = []
         for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
 
-        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+        #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
         #tempTable = None
-        #if GeneId and db.type == "ProbeSet": 
+        #if GeneId and db.type == "ProbeSet":
         #    if method == "3":
         #        tempTable = self.getTempLiteratureTable(species=species,
         #                                                input_species_geneid=GeneId,
@@ -623,7 +623,7 @@ class DataSet(object):
         #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
         #                                        method=method,
         #                                        returnNumber=returnNumber)
-        
+
             if self.type == "Publish":
                 dataset_type = "Phenotype"
             else:
@@ -644,7 +644,7 @@ class DataSet(object):
                         left join {}Data as T{} on T{}.Id = {}XRef.DataId
                         and T{}.StrainId={}\n
                         """.format(*mescape(self.type, item, item, self.type, item, item))
-                        
+
             if self.type == "Publish":
                 query += """
                         WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
@@ -661,16 +661,16 @@ class DataSet(object):
                         order by {}.Id
                         """.format(*mescape(self.type, self.type, self.type, self.type,
                                    self.name, dataset_type, self.type, self.type, dataset_type))
-                        
+
             #print("trait data query: ", query)
-            
+
             results = g.db.execute(query).fetchall()
             #print("query results:", results)
             trait_sample_data.append(results)
 
         trait_count = len(trait_sample_data[0])
         self.trait_data = collections.defaultdict(list)
-        
+
         # put all of the separate data together into a dictionary where the keys are
         # trait names and values are lists of sample values
         for trait_counter in range(trait_count):
@@ -683,9 +683,9 @@ class PhenotypeDataSet(DataSet):
     DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
 
     def setup(self):
-        
+
         #print("IS A PHENOTYPEDATASET")
-        
+
         # Fields in the database table
         self.search_fields = ['Phenotype.Post_publication_description',
                             'Phenotype.Pre_publication_description',
@@ -756,26 +756,26 @@ class PhenotypeDataSet(DataSet):
 
     def get_trait_info(self, trait_list, species = ''):
         for this_trait in trait_list:
-            
+
             if not this_trait.haveinfo:
                 this_trait.retrieve_info(get_qtl_info=True)
 
             description = this_trait.post_publication_description
-            
+
             #If the dataset is confidential and the user has access to confidential
             #phenotype traits, then display the pre-publication description instead
             #of the post-publication description
             if this_trait.confidential:
                 this_trait.description_display = ""
                 continue   # for now
-            
+
                 if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(
                         privilege=self.privilege,
                         userName=self.userName,
                         authorized_users=this_trait.authorized_users):
-                        
+
                     description = this_trait.pre_publication_description
-            
+
             if len(description) > 0:
                 this_trait.description_display = description.strip()
             else:
@@ -820,7 +820,7 @@ class PhenotypeDataSet(DataSet):
                         this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs
                         this_trait.LRS_score_value = LRS_score_value = this_trait.lrs
                         this_trait.LRS_location_repr = LRS_location_repr = 'Chr%s: %.6f' % (LRS_Chr, float(LRS_Mb))
-                        
+
     def retrieve_sample_data(self, trait):
         query = """
                     SELECT
@@ -878,7 +878,7 @@ class GenotypeDataSet(DataSet):
 
     def check_confidentiality(self):
         return geno_mrna_confidentiality(self)
-    
+
     def get_trait_list(self):
         query = """
             select Geno.Name
@@ -912,7 +912,7 @@ class GenotypeDataSet(DataSet):
 
                 this_trait.location_repr = 'Chr%s: %.6f' % (this_trait.chr, float(this_trait.mb) )
                 this_trait.location_value = trait_location_value
-                
+
     def retrieve_sample_data(self, trait):
         query = """
                     SELECT
@@ -1004,7 +1004,7 @@ class MrnaAssayDataSet(DataSet):
 
     def check_confidentiality(self):
         return geno_mrna_confidentiality(self)
-        
+
     def get_trait_list_1(self):
         query = """
             select ProbeSet.Name
@@ -1020,7 +1020,7 @@ class MrnaAssayDataSet(DataSet):
             trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
         #print("After retrieve_sample_data")
         return trait_data
-    
+
     #def get_trait_data(self):
     #    self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
     #    query = """
@@ -1040,9 +1040,9 @@ class MrnaAssayDataSet(DataSet):
     #    trait_sample_data = []
     #    for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
     #
-    #    #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId 
+    #    #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
     #    #tempTable = None
-    #    #if GeneId and db.type == "ProbeSet": 
+    #    #if GeneId and db.type == "ProbeSet":
     #    #    if method == "3":
     #    #        tempTable = self.getTempLiteratureTable(species=species,
     #    #                                                input_species_geneid=GeneId,
@@ -1053,7 +1053,7 @@ class MrnaAssayDataSet(DataSet):
     #    #                                        TissueProbeSetFreezeId=tissueProbeSetFreezeId,
     #    #                                        method=method,
     #    #                                        returnNumber=returnNumber)
-    #    
+    #
     #        temp = ['T%s.value' % item for item in sample_ids_step]
     #        query = "SELECT {}.Name,".format(escape(self.type))
     #        data_start_pos = 1
@@ -1067,7 +1067,7 @@ class MrnaAssayDataSet(DataSet):
     #                    left join {}Data as T{} on T{}.Id = {}XRef.DataId
     #                    and T{}.StrainId={}\n
     #                    """.format(*mescape(self.type, item, item, self.type, item, item))
-    #                    
+    #
     #        query += """
     #                WHERE {}XRef.{}FreezeId = {}Freeze.Id
     #                and {}Freeze.Name = '{}'
@@ -1080,7 +1080,7 @@ class MrnaAssayDataSet(DataSet):
     #
     #    trait_count = len(trait_sample_data[0])
     #    self.trait_data = collections.defaultdict(list)
-    #    
+    #
     #    # put all of the separate data together into a dictionary where the keys are
     #    # trait names and values are lists of sample values
     #    for trait_counter in range(trait_count):
@@ -1088,11 +1088,11 @@ class MrnaAssayDataSet(DataSet):
     #        for chunk_counter in range(int(number_chunks)):
     #            self.trait_data[trait_name] += (
     #                trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
-    
+
 
     def get_trait_info(self, trait_list=None, species=''):
 
-        #  Note: setting trait_list to [] is probably not a great idea. 
+        #  Note: setting trait_list to [] is probably not a great idea.
         if not trait_list:
             trait_list = []
 
@@ -1155,7 +1155,7 @@ class MrnaAssayDataSet(DataSet):
             #print("query is:", pf(query))
 
             result = g.db.execute(query).fetchone()
-            
+
             mean = result[0] if result else 0
 
             if mean:
@@ -1176,7 +1176,7 @@ class MrnaAssayDataSet(DataSet):
                         Geno.SpeciesId = Species.Id
                 """.format(species, this_trait.locus)
                 result = g.db.execute(query).fetchone()
-                
+
                 if result:
                     #if result[0] and result[1]:
                     #    lrs_chr = result[0]
@@ -1184,7 +1184,7 @@ class MrnaAssayDataSet(DataSet):
                     lrs_chr, lrs_mb = result
                     #XZ: LRS_location_value is used for sorting
                     lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb)
-                    
+
                     #try:
                     #    lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb)
                     #except:
@@ -1197,7 +1197,7 @@ class MrnaAssayDataSet(DataSet):
                     this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs
                     this_trait.LRS_score_value = this_trait.lrs
                     this_trait.LRS_location_repr = 'Chr%s: %.6f' % (lrs_chr, float(lrs_mb))
-      
+
 
     def convert_location_to_value(self, chromosome, mb):
         try:
@@ -1208,7 +1208,7 @@ class MrnaAssayDataSet(DataSet):
             else:
                 location_value = (ord(str(chromosome).upper()[0])*1000 +
                                   float(mb))
-        
+
         return location_value
 
     def get_sequence(self):
@@ -1225,7 +1225,7 @@ class MrnaAssayDataSet(DataSet):
                 """ % (escape(self.name), escape(self.dataset.name))
         results = g.db.execute(query).fetchone()
         return results[0]
-   
+
     def retrieve_sample_data(self, trait):
         query = """
                     SELECT
@@ -1246,8 +1246,8 @@ class MrnaAssayDataSet(DataSet):
         results = g.db.execute(query).fetchall()
         #print("RETRIEVED RESULTS HERE:", results)
         return results
-    
-    
+
+
     def retrieve_genes(self, column_name):
         query = """
                     select ProbeSet.Name, ProbeSet.%s
@@ -1256,7 +1256,7 @@ class MrnaAssayDataSet(DataSet):
                     ProbeSetXRef.ProbeSetId=ProbeSet.Id;
                 """ % (column_name, escape(str(self.id)))
         results = g.db.execute(query).fetchall()
-        
+
         return dict(results)
 
     #def retrieve_gene_symbols(self):
@@ -1285,8 +1285,8 @@ class MrnaAssayDataSet(DataSet):
     #    for item in results:
     #        symbol_dict[item[0]] = item[1]
     #    return symbol_dict
-    
-    
+
+
 
 
 class TempDataSet(DataSet):
@@ -1308,8 +1308,8 @@ class TempDataSet(DataSet):
         self.id = 1
         self.fullname = 'Temporary Storage'
         self.shortname = 'Temp'
-        
-       
+
+
     @staticmethod
     def handle_pca(desc):
         if 'PCA' in desc:
@@ -1318,13 +1318,13 @@ class TempDataSet(DataSet):
         else:
             desc = desc[:desc.index('entered')].strip()
         return desc
-        
+
     def get_desc(self):
         g.db.execute('SELECT description FROM Temp WHERE Name=%s', self.name)
         desc = g.db.fetchone()[0]
         desc = self.handle_pca(desc)
-        return desc    
-        
+        return desc
+
     def get_group(self):
         self.cursor.execute("""
                     SELECT
@@ -1337,7 +1337,7 @@ class TempDataSet(DataSet):
             """, self.name)
         self.group, self.group_id = self.cursor.fetchone()
         #return self.group
-        
+
     def retrieve_sample_data(self, trait):
         query = """
                 SELECT
@@ -1351,7 +1351,7 @@ class TempDataSet(DataSet):
                 Order BY
                         Strain.Name
                 """ % escape(trait.name)
-                
+
         results = g.db.execute(query).fetchall()