Diffstat (limited to 'wqflask/base/data_set.py')
-rw-r--r-- | wqflask/base/data_set.py | 204 |
1 files changed, 122 insertions, 82 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 4953e728..c70738f7 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -40,7 +40,7 @@ import reaper
 
 from base import webqtlConfig
 from base import species
-from dbFunction import webqtlDatabaseFunction
+from db import webqtlDatabaseFunction
 from utility import webqtlUtil
 from utility.benchmark import Bench
 from utility import chunks
@@ -50,6 +50,12 @@ from maintenance import get_group_samplelists
 from MySQLdb import escape_string as escape
 from pprint import pformat as pf
 
+from db.gn_server import menu_main
+from db.call import fetchall,fetchone,fetch1
+
+from utility.tools import USE_GN_SERVER, USE_REDIS
+from utility.logger import getLogger
+logger = getLogger(__name__ )
 # Used by create_database to instantiate objects
 # Each subclass will add to this
@@ -58,8 +64,7 @@ DS_NAME_MAP = {}
 def create_dataset(dataset_name, dataset_type = None, get_samplelist = True):
     if not dataset_type:
         dataset_type = Dataset_Getter(dataset_name)
-
-    print("dataset_type is:", dataset_type)
+    logger.debug("dataset_type", dataset_type)
 
     dataset_ob = DS_NAME_MAP[dataset_type]
     dataset_class = globals()[dataset_ob]
@@ -68,12 +73,28 @@ def create_dataset(dataset_name, dataset_type = None, get_samplelist = True):
 class Dataset_Types(object):
 
     def __init__(self):
+        """Create a dictionary of samples where the value is set to Geno,
+Publish or ProbeSet. E.g.
+
+        {'AD-cases-controls-MyersGeno': 'Geno',
+         'AD-cases-controls-MyersPublish': 'Publish',
+         'AKXDGeno': 'Geno',
+         'AXBXAGeno': 'Geno',
+         'AXBXAPublish': 'Publish',
+         'Aging-Brain-UCIPublish': 'Publish',
+         'All Phenotypes': 'Publish',
+         'B139_K_1206_M': 'ProbeSet',
+         'B139_K_1206_R': 'ProbeSet' ...
+
+        """
         self.datasets = {}
-        file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
-        with open(file_name, 'r') as fh:
-            data = json.load(fh)
+        if USE_GN_SERVER:
+            data = menu_main()
+        else:
+            file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
+            with open(file_name, 'r') as fh:
+                data = json.load(fh)
 
-        print("*" * 70)
         for species in data['datasets']:
             for group in data['datasets'][species]:
                 for dataset_type in data['datasets'][species][group]:
@@ -85,7 +106,8 @@ class Dataset_Types(object):
                             new_type = "Geno"
                         else:
                             new_type = "ProbeSet"
-                        self.datasets[short_dataset_name] = new_type
+                        self.datasets[short_dataset_name] = new_type
+        logger.info("datasets",self.datasets)
 
     def __call__(self, name):
         return self.datasets[name]
@@ -94,14 +116,15 @@ class Dataset_Types(object):
 Dataset_Getter = Dataset_Types()
 
 def create_datasets_list():
-    key = "all_datasets"
-    result = Redis.get(key)
+    if USE_REDIS:
+        key = "all_datasets"
+        result = Redis.get(key)
 
-    if result:
-        print("Cache hit!!!")
-        datasets = pickle.loads(result)
+        if result:
+            logger.debug("Redis cache hit")
+            datasets = pickle.loads(result)
 
-    else:
+    if result is None:
         datasets = list()
         with Bench("Creating DataSets object"):
             type_dict = {'Publish': 'PublishFreeze',
@@ -110,15 +133,17 @@ def create_datasets_list():
 
             for dataset_type in type_dict:
                 query = "SELECT Name FROM {}".format(type_dict[dataset_type])
-                for result in g.db.execute(query).fetchall():
-                    #The query at the beginning of this function isn't necessary here, but still would
-                    #rather just reuse it
-                    #print("type: {}\tname: {}".format(dataset_type, result.Name))
+                for result in fetchall(query):
+                    #The query at the beginning of this function isn't
+                    #necessary here, but still would rather just reuse
+                    #it logger.debug("type: {}\tname:
#{}".format(dataset_type, result.Name)) dataset = create_dataset(result.Name, dataset_type) datasets.append(dataset) - Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL)) - Redis.expire(key, 60*60) + if USE_REDIS: + Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL)) + Redis.expire(key, 60*60) return datasets @@ -133,7 +158,7 @@ def create_in_clause(items): def mescape(*items): """Multiple escape""" escaped = [escape(str(item)) for item in items] - #print("escaped is:", escaped) + #logger.debug("escaped is:", escaped) return escaped @@ -152,12 +177,12 @@ class Markers(object): marker['Mb'] = float(marker['Mb']) self.markers = markers - #print("self.markers:", self.markers) + #logger.debug("self.markers:", self.markers) def add_pvalues(self, p_values): - print("length of self.markers:", len(self.markers)) - print("length of p_values:", len(p_values)) + logger.debug("length of self.markers:", len(self.markers)) + logger.debug("length of p_values:", len(p_values)) if type(p_values) is list: # THIS IS only needed for the case when we are limiting the number of p-values calculated @@ -178,10 +203,10 @@ class Markers(object): elif type(p_values) is dict: filtered_markers = [] for marker in self.markers: - #print("marker[name]", marker['name']) - #print("p_values:", p_values) + #logger.debug("marker[name]", marker['name']) + #logger.debug("p_values:", p_values) if marker['name'] in p_values: - #print("marker {} IS in p_values".format(i)) + #logger.debug("marker {} IS in p_values".format(i)) marker['p_value'] = p_values[marker['name']] if math.isnan(marker['p_value']) or (marker['p_value'] <= 0): marker['lod_score'] = 0 @@ -192,7 +217,7 @@ class Markers(object): marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61 filtered_markers.append(marker) #else: - #print("marker {} NOT in p_values".format(i)) + #logger.debug("marker {} NOT in p_values".format(i)) #self.markers.remove(marker) #del self.markers[i] self.markers = filtered_markers @@ -204,7 +229,7 @@ class HumanMarkers(Markers): self.markers = [] for line in marker_data_fh: splat = line.strip().split() - #print("splat:", splat) + #logger.debug("splat:", splat) if len(specified_markers) > 0: if splat[1] in specified_markers: marker = {} @@ -220,7 +245,7 @@ class HumanMarkers(Markers): marker['Mb'] = float(splat[3]) / 1000000 self.markers.append(marker) - #print("markers is: ", pf(self.markers)) + #logger.debug("markers is: ", pf(self.markers)) def add_pvalues(self, p_values): @@ -237,15 +262,15 @@ class DatasetGroup(object): """ def __init__(self, dataset): """This sets self.group and self.group_id""" - #print("DATASET NAME2:", dataset.name) - self.name, self.id = g.db.execute(dataset.query_for_group).fetchone() + #logger.debug("DATASET NAME2:", dataset.name) + self.name, self.id = fetchone(dataset.query_for_group) if self.name == 'BXD300': self.name = "BXD" self.f1list = None self.parlist = None self.get_f1_parent_strains() - #print("parents/f1s: {}:{}".format(self.parlist, self.f1list)) + #logger.debug("parents/f1s: {}:{}".format(self.parlist, self.f1list)) self.species = webqtlDatabaseFunction.retrieve_species(self.name) @@ -257,7 +282,7 @@ class DatasetGroup(object): self.markers = HumanMarkers(self.name, markers) def get_markers(self): - #print("self.species is:", self.species) + #logger.debug("self.species is:", self.species) if self.species == "human": marker_class = HumanMarkers else: @@ -267,21 +292,21 @@ class DatasetGroup(object): def datasets(self): key = "group_dataset_menu:v2:" + self.name - 
print("key is2:", key) + logger.debug("key is2:", key) dataset_menu = [] - print("[tape4] webqtlConfig.PUBLICTHRESH:", webqtlConfig.PUBLICTHRESH) - print("[tape4] type webqtlConfig.PUBLICTHRESH:", type(webqtlConfig.PUBLICTHRESH)) - results = g.db.execute(''' + logger.debug("[tape4] webqtlConfig.PUBLICTHRESH:", webqtlConfig.PUBLICTHRESH) + logger.debug("[tape4] type webqtlConfig.PUBLICTHRESH:", type(webqtlConfig.PUBLICTHRESH)) + the_results = fetchall(''' (SELECT '#PublishFreeze',PublishFreeze.FullName,PublishFreeze.Name FROM PublishFreeze,InbredSet WHERE PublishFreeze.InbredSetId = InbredSet.Id - and InbredSet.Name = %s + and InbredSet.Name = '%s' and PublishFreeze.public > %s) UNION (SELECT '#GenoFreeze',GenoFreeze.FullName,GenoFreeze.Name FROM GenoFreeze, InbredSet WHERE GenoFreeze.InbredSetId = InbredSet.Id - and InbredSet.Name = %s + and InbredSet.Name = '%s' and GenoFreeze.public > %s) UNION (SELECT Tissue.Name, ProbeSetFreeze.FullName,ProbeSetFreeze.Name @@ -292,11 +317,9 @@ class DatasetGroup(object): and InbredSet.Name like %s and ProbeSetFreeze.public > %s ORDER BY Tissue.Name, ProbeSetFreeze.CreateTime desc, ProbeSetFreeze.AvgId) - ''', (self.name, webqtlConfig.PUBLICTHRESH, + ''' % (self.name, webqtlConfig.PUBLICTHRESH, self.name, webqtlConfig.PUBLICTHRESH, - "%" + self.name + "%", webqtlConfig.PUBLICTHRESH)) - - the_results = results.fetchall() + "'" + self.name + "'", webqtlConfig.PUBLICTHRESH)) #for tissue_name, dataset in itertools.groupby(the_results, itemgetter(0)): for dataset_item in the_results: @@ -317,14 +340,15 @@ class DatasetGroup(object): break if tissue_already_exists: - #print("dataset_menu:", dataset_menu[i]['datasets']) + #logger.debug("dataset_menu:", dataset_menu[i]['datasets']) dataset_menu[i]['datasets'].append((dataset, dataset_short)) else: dataset_menu.append(dict(tissue=tissue_name, datasets=[(dataset, dataset_short)])) - Redis.set(key, pickle.dumps(dataset_menu, pickle.HIGHEST_PROTOCOL)) - Redis.expire(key, 60*5) + if USE_REDIS: + Redis.set(key, pickle.dumps(dataset_menu, pickle.HIGHEST_PROTOCOL)) + Redis.expire(key, 60*5) self._datasets = dataset_menu return self._datasets @@ -342,19 +366,19 @@ class DatasetGroup(object): self.parlist = [maternal, paternal] def get_samplelist(self): + result = None key = "samplelist:v2:" + self.name - #print("key is:", key) - #with Bench("Loading cache"): - result = Redis.get(key) + if USE_REDIS: + result = Redis.get(key) - if result: - #print("Sample List Cache hit!!!") - #print("Before unjsonifying {}: {}".format(type(result), result)) + if result is not None: + #logger.debug("Sample List Cache hit!!!") + #logger.debug("Before unjsonifying {}: {}".format(type(result), result)) self.samplelist = json.loads(result) - #print(" type: ", type(self.samplelist)) - #print(" self.samplelist: ", self.samplelist) + #logger.debug(" type: ", type(self.samplelist)) + #logger.debug(" self.samplelist: ", self.samplelist) else: - print("Cache not hit") + logger.debug("Cache not hit") genotype_fn = locate_ignore_error(self.name+".geno",'genotype') mapping_fn = locate_ignore_error(self.name+".fam",'mapping') @@ -364,9 +388,10 @@ class DatasetGroup(object): self.samplelist = get_group_samplelists.get_samplelist("geno", genotype_fn) else: self.samplelist = None - print("Sample list: ",self.samplelist) - Redis.set(key, json.dumps(self.samplelist)) - Redis.expire(key, 60*5) + logger.debug("Sample list: ",self.samplelist) + if USE_REDIS: + Redis.set(key, json.dumps(self.samplelist)) + Redis.expire(key, 60*5) def all_samples_ordered(self): 
        result = []
@@ -457,14 +482,14 @@ class DataSet(object):
                    self.name,
                    self.name))
 
-                self.id, self.name, self.fullname, self.shortname, self.data_scale, self.tissue = g.db.execute("""
+                self.id, self.name, self.fullname, self.shortname, self.data_scale, self.tissue = fetchone("""
                        SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName, ProbeSetFreeze.ShortName, ProbeSetFreeze.DataScale, Tissue.Name
                        FROM ProbeSetFreeze, ProbeFreeze, Tissue
                        WHERE ProbeSetFreeze.public > %s
                        AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id AND ProbeFreeze.TissueId = Tissue.Id
                        AND (ProbeSetFreeze.Name = '%s' OR ProbeSetFreeze.FullName = '%s' OR ProbeSetFreeze.ShortName = '%s')
-                  """ % (query_args)).fetchone()
+                  """ % (query_args))
 
            else:
                query_args = tuple(escape(x) for x in (
                    (self.type + "Freeze"),
@@ -474,15 +499,15 @@ class DataSet(object):
                    self.name))
 
                self.tissue = "N/A"
-                self.id, self.name, self.fullname, self.shortname = g.db.execute("""
+                self.id, self.name, self.fullname, self.shortname = fetchone("""
                        SELECT Id, Name, FullName, ShortName
                        FROM %s
                        WHERE public > %s AND
                             (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
-                  """ % (query_args)).fetchone()
+                  """ % (query_args))
 
        except TypeError:
-            print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
+            logger.debug("Dataset {} is not yet available in GeneNetwork.".format(self.name))
            pass
 
    def get_trait_data(self, sample_list=None):
@@ -501,6 +526,7 @@ class DataSet(object):
                and Strain.SpeciesId=Species.Id
                and Species.name = '{}'
                """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+        logger.sql(query)
        results = dict(g.db.execute(query).fetchall())
        sample_ids = [results[item] for item in self.samplelist]
 
@@ -549,10 +575,11 @@ class DataSet(object):
                        """.format(*mescape(self.type, self.type, self.type, self.type,
                                    self.name, dataset_type, self.type, self.type, dataset_type))
 
-            #print("trait data query: ", query)
+            #logger.debug("trait data query: ", query)
+            logger.sql(query)
            results = g.db.execute(query).fetchall()
-            #print("query results:", results)
+            #logger.debug("query results:", results)
            trait_sample_data.append(results)
 
        trait_count = len(trait_sample_data[0])
@@ -571,7 +598,7 @@ class PhenotypeDataSet(DataSet):
 
    def setup(self):
-        #print("IS A PHENOTYPEDATASET")
+        #logger.debug("IS A PHENOTYPEDATASET")
 
        # Fields in the database table
        self.search_fields = ['Phenotype.Post_publication_description',
@@ -635,6 +662,7 @@ class PhenotypeDataSet(DataSet):
                    where PublishFreeze.InbredSetId=PublishXRef.InbredSetId
                    and PublishFreeze.Id = {}
                    """.format(escape(str(self.id)))
+        logger.sql(query)
        results = g.db.execute(query).fetchall()
        trait_data = {}
        for trait in results:
@@ -683,12 +711,14 @@ class PhenotypeDataSet(DataSet):
                this_trait.LRS_location_value = 1000000
 
            if this_trait.lrs:
-                result = g.db.execute("""
+                query = """
                    select Geno.Chr, Geno.Mb from Geno, Species
                    where Species.Name = %s and Geno.Name = %s and Geno.SpeciesId = Species.Id
-                """, (species, this_trait.locus)).fetchone()
+                """ % (species, this_trait.locus)
+                logger.sql(query)
+                result = g.db.execute(query).fetchone()
 
                if result:
                    if result[0] and result[1]:
@@ -726,6 +756,7 @@ class PhenotypeDataSet(DataSet):
                    Order BY
                        Strain.Name
                    """
+        logger.sql(query)
        results = g.db.execute(query, (trait, self.id)).fetchall()
        return results
 
@@ -773,6 +804,7 @@ class GenotypeDataSet(DataSet):
                where GenoXRef.GenoId = Geno.Id
                and GenoFreezeId = {}
                """.format(escape(str(self.id)))
+        logger.sql(query)
        results = g.db.execute(query).fetchall()
        trait_data = {}
        for trait in results:
@@ -817,6 +849,7 @@ class GenotypeDataSet(DataSet):
                    Order BY
                        Strain.Name
                    """
+        logger.sql(query)
        results = g.db.execute(query,
                               (webqtlDatabaseFunction.retrieve_species_id(self.group.name),
                                trait, self.name)).fetchall()
 
@@ -899,6 +932,7 @@ class MrnaAssayDataSet(DataSet):
                where ProbeSetXRef.ProbeSetId = ProbeSet.Id
                and ProbeSetFreezeId = {}
                """.format(escape(str(self.id)))
+        logger.sql(query)
        results = g.db.execute(query).fetchall()
        trait_data = {}
        for trait in results:
@@ -967,8 +1001,8 @@ class MrnaAssayDataSet(DataSet):
                """ % (escape(str(this_trait.dataset.id)),
                       escape(this_trait.name)))
 
-        #print("query is:", pf(query))
-
+        #logger.debug("query is:", pf(query))
+        logger.sql(query)
        result = g.db.execute(query).fetchone()
 
        mean = result[0] if result else 0
@@ -990,6 +1024,7 @@ class MrnaAssayDataSet(DataSet):
                        Geno.Name = '{}' and
                        Geno.SpeciesId = Species.Id
                        """.format(species, this_trait.locus)
+                logger.sql(query)
                result = g.db.execute(query).fetchone()
 
                if result:
@@ -1025,6 +1060,7 @@ class MrnaAssayDataSet(DataSet):
                    ProbeSet.Name = %s
                    ProbeSetFreeze.Name = %s
                """ % (escape(self.name), escape(self.dataset.name))
+        logger.sql(query)
        results = g.db.execute(query).fetchone()
 
        return results[0]
@@ -1045,8 +1081,9 @@ class MrnaAssayDataSet(DataSet):
                    Order BY
                        Strain.Name
                    """ % (escape(trait), escape(self.name))
+        logger.sql(query)
        results = g.db.execute(query).fetchall()
-        #print("RETRIEVED RESULTS HERE:", results)
+        #logger.debug("RETRIEVED RESULTS HERE:", results)
        return results
 
@@ -1057,6 +1094,7 @@ class MrnaAssayDataSet(DataSet):
                where ProbeSetXRef.ProbeSetFreezeId = %s and
                    ProbeSetXRef.ProbeSetId=ProbeSet.Id;
                """ % (column_name, escape(str(self.id)))
+        logger.sql(query)
        results = g.db.execute(query).fetchall()
 
        return dict(results)
 
@@ -1093,13 +1131,15 @@ class TempDataSet(DataSet):
        return desc
 
    def get_desc(self):
-        g.db.execute('SELECT description FROM Temp WHERE Name=%s', self.name)
+        query = 'SELECT description FROM Temp WHERE Name=%s' % self.name
+        logger.sql(query)
+        g.db.execute(query)
        desc = g.db.fetchone()[0]
        desc = self.handle_pca(desc)
        return desc
 
    def get_group(self):
-        self.cursor.execute("""
+        query = """
              SELECT
                    InbredSet.Name, InbredSet.Id
              FROM
                    Temp, InbredSet
              WHERE
                    Temp.InbredSetId = InbredSet.Id AND
                    Temp.Name = "%s"
-              """, self.name)
-        self.group, self.group_id = self.cursor.fetchone()
-        #return self.group
+              """ % self.name
+        logger.sql(query)
+        self.group, self.group_id = g.db.execute(query).fetchone()
 
    def retrieve_sample_data(self, trait):
        query = """
@@ -1125,17 +1165,18 @@ class TempDataSet(DataSet):
                    Strain.Name
                    """ % escape(trait.name)
+        logger.sql(query)
        results = g.db.execute(query).fetchall()
 
 
 def geno_mrna_confidentiality(ob):
    dataset_table = ob.type + "Freeze"
-    #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+    #logger.debug("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
 
    query = '''SELECT Id, Name, FullName, confidentiality,
-                        AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)
-
-    result = g.db.execute(query, ob.name)
+                        AuthorisedUsers FROM %s WHERE Name = "%s"''' % (dataset_table,ob.name)
+    logger.sql(query)
+    result = g.db.execute(query)
 
    (dataset_id,
     name,
@@ -1145,4 +1186,3 @@ def geno_mrna_confidentiality(ob):
 
    if confidential:
        return True
-
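For readers who want to see the dispatch that create_dataset() relies on in isolation, here is a minimal, self-contained sketch of the DS_NAME_MAP lookup followed by globals() resolution. The class names and the demo at the bottom are illustrative placeholders, not GeneNetwork's real dataset classes:

# Illustrative sketch of the DS_NAME_MAP / create_dataset dispatch pattern.
DS_NAME_MAP = {}

class DataSet(object):
    def __init__(self, name):
        self.name = name

class PhenotypeDataSet(DataSet):
    pass

class GenoDataSet(DataSet):
    pass

# Each subclass registers itself under the short type key it serves.
DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
DS_NAME_MAP['Geno'] = 'GenoDataSet'

def create_dataset(dataset_name, dataset_type):
    # Look up the class *name*, then resolve it in the module namespace,
    # the same two-step dispatch the real create_dataset performs.
    dataset_class = globals()[DS_NAME_MAP[dataset_type]]
    return dataset_class(dataset_name)

if __name__ == '__main__':
    ds = create_dataset('BXDPublish', 'Publish')
    print("{} {}".format(type(ds).__name__, ds.name))   # PhenotypeDataSet BXDPublish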
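The USE_REDIS changes in create_datasets_list() and get_samplelist() follow a cache-aside pattern: read the key only when Redis is enabled, rebuild on a miss, and write back with an expiry only when Redis is available. A minimal sketch of that pattern, assuming the redis-py package and a reachable local server; the cached() helper and the rebuild callback are illustrative names, not part of the codebase:

import pickle

from redis import Redis   # assumption: redis-py is installed

USE_REDIS = True           # in GN2 this flag comes from utility.tools
r = Redis()

def cached(key, rebuild, expire_seconds=60 * 60):
    """Return a cached value for key, rebuilding and re-caching on a miss."""
    result = None
    if USE_REDIS:
        result = r.get(key)
    if result is not None:
        return pickle.loads(result)                  # cache hit
    value = rebuild()                                # cache miss: do the expensive work
    if USE_REDIS:
        r.set(key, pickle.dumps(value, pickle.HIGHEST_PROTOCOL))
        r.expire(key, expire_seconds)                # same one-hour expiry as in the diff
    return value

# e.g. datasets = cached("all_datasets", build_all_datasets)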
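The fetchall()/fetchone() calls that replace g.db.execute(...) come from the new db.call module, which is not part of this diff. As a rough idea of the shape such helpers can take over a standard DB-API connection; the real GeneNetwork versions use a shared connection and log through logger.sql, so treat every name and signature below as an assumption:

import logging

logger = logging.getLogger(__name__)

def fetchall(conn, query):
    """Run query on a DB-API connection and return all rows, logging the SQL first."""
    logger.debug("SQL: %s", query)
    cur = conn.cursor()
    cur.execute(query)
    return cur.fetchall()

def fetchone(conn, query):
    """Run query and return a single row (or None)."""
    logger.debug("SQL: %s", query)
    cur = conn.cursor()
    cur.execute(query)
    return cur.fetchone()

# Usage with any DB-API driver, e.g. MySQLdb:
#   conn = MySQLdb.connect(host="localhost", user="...", passwd="...", db="db_webqtl")
#   row = fetchone(conn, "SELECT Name, Id FROM InbredSet LIMIT 1")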
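Calls such as logger.debug("key is2:", key) and logger.sql(query) suggest that utility.logger wraps the standard logging module rather than returning a bare Logger (the stdlib debug() would treat extra positional arguments as %-format parameters). A guessed minimal wrapper with that interface; this is not the actual utility/logger.py:

import logging

class GNLogger:
    """Thin wrapper so callers can pass arbitrary objects and log SQL separately."""
    def __init__(self, name):
        self._log = logging.getLogger(name)

    def debug(self, *args):
        # Join every argument into one message instead of %-formatting.
        self._log.debug(" ".join(str(a) for a in args))

    def info(self, *args):
        self._log.info(" ".join(str(a) for a in args))

    def sql(self, query):
        # Dedicated channel for SQL statements; here it simply logs at DEBUG level.
        self._log.debug("SQL: %s", query)

def getLogger(name):
    return GNLogger(name)

logging.basicConfig(level=logging.DEBUG)
logger = getLogger(__name__)
logger.debug("dataset_type", "ProbeSet")
logger.sql("SELECT Name FROM ProbeSetFreeze LIMIT 1")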
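Several queries in this file are built by escaping values and interpolating them into the SQL string, via mescape() and create_in_clause(). A small self-contained illustration of that style; escape() below is a trivial stand-in for MySQLdb.escape_string, create_in_clause() is a guessed equivalent of the real helper, and the sample query only mirrors the one in get_trait_data():

def escape(s):
    # Placeholder for MySQLdb.escape_string; only backslash-escapes quotes here.
    return s.replace("\\", "\\\\").replace("'", "\\'")

def mescape(*items):
    """Multiple escape, as defined in data_set.py."""
    return [escape(str(item)) for item in items]

def create_in_clause(items):
    """Build a quoted SQL IN (...) clause from a list of sample names."""
    return "( " + ", ".join("'{}'".format(escape(str(x))) for x in items) + " )"

samplelist = ["BXD1", "BXD2", "B6D2F1"]
query = """SELECT Strain.Name, Strain.Id FROM Strain, Species
           WHERE Strain.Name IN {} and Strain.SpeciesId = Species.Id
           and Species.name = '{}'""".format(create_in_clause(samplelist),
                                             *mescape("mouse"))
print(query)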