diff options
Diffstat (limited to 'wqflask/base/data_set.py')
-rw-r--r-- | wqflask/base/data_set.py | 329 |
1 files changed, 78 insertions, 251 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 379e5906..053b45fc 100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -44,6 +44,7 @@ from dbFunction import webqtlDatabaseFunction from utility import webqtlUtil from utility.benchmark import Bench from utility import chunks +from utility.tools import locate, locate_ignore_error from maintenance import get_group_samplelists @@ -57,40 +58,26 @@ DS_NAME_MAP = {} def create_dataset(dataset_name, dataset_type = None, get_samplelist = True): if not dataset_type: dataset_type = Dataset_Getter(dataset_name) - #dataset_type = get_dataset_type_from_json(dataset_name) print("dataset_type is:", dataset_type) - #query = """ - # SELECT DBType.Name - # FROM DBList, DBType - # WHERE DBList.Name = '{}' and - # DBType.Id = DBList.DBTypeId - # """.format(escape(dataset_name)) - #dataset_type = g.db.execute(query).fetchone().Name - dataset_ob = DS_NAME_MAP[dataset_type] dataset_class = globals()[dataset_ob] return dataset_class(dataset_name, get_samplelist) - -#def get_dataset_type_from_json(dataset_name): - class Dataset_Types(object): - + def __init__(self): self.datasets = {} file_name = "wqflask/static/new/javascript/dataset_menu_structure.json" with open(file_name, 'r') as fh: data = json.load(fh) - + print("*" * 70) for species in data['datasets']: for group in data['datasets'][species]: for dataset_type in data['datasets'][species][group]: for dataset in data['datasets'][species][group][dataset_type]: - #print("dataset is:", dataset) - short_dataset_name = dataset[1] if dataset_type == "Phenotypes": new_type = "Publish" @@ -99,32 +86,28 @@ class Dataset_Types(object): else: new_type = "ProbeSet" self.datasets[short_dataset_name] = new_type - + def __call__(self, name): return self.datasets[name] - + # Do the intensive work at startup one time only Dataset_Getter = Dataset_Types() -# -#print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611")) - - def create_datasets_list(): key = "all_datasets" result = Redis.get(key) - + if result: print("Cache hit!!!") datasets = pickle.loads(result) - + else: datasets = list() with Bench("Creating DataSets object"): type_dict = {'Publish': 'PublishFreeze', 'ProbeSet': 'ProbeSetFreeze', 'Geno': 'GenoFreeze'} - + for dataset_type in type_dict: query = "SELECT Name FROM {}".format(type_dict[dataset_type]) for result in g.db.execute(query).fetchall(): @@ -133,10 +116,10 @@ def create_datasets_list(): #print("type: {}\tname: {}".format(dataset_type, result.Name)) dataset = create_dataset(result.Name, dataset_type) datasets.append(dataset) - + Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL)) Redis.expire(key, 60*60) - + return datasets @@ -157,30 +140,30 @@ def mescape(*items): class Markers(object): """Todo: Build in cacheing so it saves us reading the same file more than once""" def __init__(self, name): - json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json')) + json_data_fh = open(locate(name + '.json','genotype/json')) try: markers = json.load(json_data_fh) except: markers = [] - + for marker in markers: if (marker['chr'] != "X") and (marker['chr'] != "Y"): marker['chr'] = int(marker['chr']) marker['Mb'] = float(marker['Mb']) - + self.markers = markers #print("self.markers:", self.markers) - - + + def add_pvalues(self, p_values): print("length of self.markers:", len(self.markers)) print("length of p_values:", len(p_values)) - + if type(p_values) is list: # THIS IS only needed for the case when we are limiting the number of p-values calculated #if len(self.markers) > len(p_values): # self.markers = self.markers[:len(p_values)] - + for marker, p_value in itertools.izip(self.markers, p_values): if not p_value: continue @@ -213,18 +196,11 @@ class Markers(object): #self.markers.remove(marker) #del self.markers[i] self.markers = filtered_markers - - - #for i, marker in enumerate(self.markers): - # if not 'p_value' in marker: - # #print("self.markers[i]", self.markers[i]) - # del self.markers[i] - # #self.markers.remove(self.markers[i]) class HumanMarkers(Markers): - + def __init__(self, name, specified_markers = []): - marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim')) + marker_data_fh = open(locate('genotype') + '/' + name + '.bim') self.markers = [] for line in marker_data_fh: splat = line.strip().split() @@ -243,39 +219,21 @@ class HumanMarkers(Markers): marker['name'] = splat[1] marker['Mb'] = float(splat[3]) / 1000000 self.markers.append(marker) - + #print("markers is: ", pf(self.markers)) def add_pvalues(self, p_values): - #for marker, p_value in itertools.izip(self.markers, p_values): - # if marker['Mb'] <= 0 and marker['chr'] == 0: - # continue - # marker['p_value'] = p_value - # print("p_value is:", marker['p_value']) - # marker['lod_score'] = -math.log10(marker['p_value']) - # #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values - # marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61 - - #print("p_values2:", pf(p_values)) super(HumanMarkers, self).add_pvalues(p_values) - - #with Bench("deleting markers"): - # markers = [] - # for marker in self.markers: - # if not marker['Mb'] <= 0 and not marker['chr'] == 0: - # markers.append(marker) - # self.markers = markers - - + class DatasetGroup(object): """ Each group has multiple datasets; each species has multiple groups. - + For example, Mouse has multiple groups (BXD, BXA, etc), and each group has multiple datasets associated with it. - + """ def __init__(self, dataset): """This sets self.group and self.group_id""" @@ -283,14 +241,14 @@ class DatasetGroup(object): self.name, self.id = g.db.execute(dataset.query_for_group).fetchone() if self.name == 'BXD300': self.name = "BXD" - + self.f1list = None self.parlist = None self.get_f1_parent_strains() #print("parents/f1s: {}:{}".format(self.parlist, self.f1list)) - + self.species = webqtlDatabaseFunction.retrieve_species(self.name) - + self.incparentsf1 = False self.allsamples = None self._datasets = None @@ -301,7 +259,7 @@ class DatasetGroup(object): def get_markers(self): #print("self.species is:", self.species) if self.species == "human": - marker_class = HumanMarkers + marker_class = HumanMarkers else: marker_class = Markers @@ -310,12 +268,6 @@ class DatasetGroup(object): def datasets(self): key = "group_dataset_menu:v2:" + self.name print("key is2:", key) - #with Bench("Loading cache"): - # result = Redis.get(key) - #if result: - # self._datasets = pickle.loads(result) - # return self._datasets - dataset_menu = [] print("[tape4] webqtlConfig.PUBLICTHRESH:", webqtlConfig.PUBLICTHRESH) print("[tape4] type webqtlConfig.PUBLICTHRESH:", type(webqtlConfig.PUBLICTHRESH)) @@ -355,7 +307,7 @@ class DatasetGroup(object): dataset_menu.append(dict(tissue=None, datasets=[(dataset, dataset_short)])) else: dataset_sub_menu = [item[1:] for item in dataset] - + tissue_already_exists = False tissue_position = None for i, tissue_dict in enumerate(dataset_menu): @@ -383,7 +335,7 @@ class DatasetGroup(object): f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name] except KeyError: f1 = f12 = maternal = paternal = None - + if f1 and f12: self.f1list = [f1, f12] if maternal and paternal: @@ -402,21 +354,17 @@ class DatasetGroup(object): #print(" type: ", type(self.samplelist)) #print(" self.samplelist: ", self.samplelist) else: - #print("Cache not hit") - - from utility.tools import plink_command - PLINK_PATH,PLINK_COMMAND = plink_command() - - geno_file_path = webqtlConfig.GENODIR+self.name+".geno" - plink_file_path = PLINK_PATH+"/"+self.name+".fam" - - if os.path.isfile(plink_file_path): - self.samplelist = get_group_samplelists.get_samplelist("plink", plink_file_path) - elif os.path.isfile(geno_file_path): - self.samplelist = get_group_samplelists.get_samplelist("geno", geno_file_path) + print("Cache not hit") + + genotype_fn = locate_ignore_error(self.name+".geno",'genotype') + mapping_fn = locate_ignore_error(self.name+".fam",'mapping') + if mapping_fn: + self.samplelist = get_group_samplelists.get_samplelist("plink", mapping_fn) + elif genotype_fn: + self.samplelist = get_group_samplelists.get_samplelist("geno", genotype_fn) else: self.samplelist = None - #print("after get_samplelist") + print("Sample list: ",self.samplelist) Redis.set(key, json.dumps(self.samplelist)) Redis.expire(key, 60*5) @@ -428,30 +376,14 @@ class DatasetGroup(object): def read_genotype_file(self): '''Read genotype from .geno file instead of database''' - #if self.group == 'BXD300': - # self.group = 'BXD' - # - #assert self.group, "self.group needs to be set" - #genotype_1 is Dataset Object without parents and f1 #genotype_2 is Dataset Object with parents and f1 (not for intercross) genotype_1 = reaper.Dataset() # reaper barfs on unicode filenames, so here we ensure it's a string - full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno')) - if os.path.isfile(full_filename): - #print("Reading file: ", full_filename) - genotype_1.read(full_filename) - #print("File read") - else: - try: - full_filename = str(os.path.join(webqtlConfig.TMPDIR, self.name + '.geno')) - #print("Reading file") - genotype_1.read(full_filename) - #print("File read") - except IOError: - print("File doesn't exist!") + full_filename = str(locate(self.name+'.geno','genotype')) + genotype_1.read(full_filename) if genotype_1.type == "group" and self.parlist: genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1) @@ -460,39 +392,15 @@ class DatasetGroup(object): #determine default genotype object if self.incparentsf1 and genotype_1.type != "intercross": - #self.genotype = genotype_2 genotype = genotype_2 else: self.incparentsf1 = 0 - #self.genotype = genotype_1 genotype = genotype_1 - #self.samplelist = list(self.genotype.prgy) self.samplelist = list(genotype.prgy) - - return genotype - -#class DataSets(object): -# """Builds a list of DataSets""" -# -# def __init__(self): -# self.datasets = list() -# - - - #query = """SELECT Name FROM ProbeSetFreeze - # UNION - # SELECT Name From PublishFreeze - # UNION - # SELECT Name From GenoFreeze""" - # - #for result in g.db.execute(query).fetchall(): - # dataset = DataSet(result.Name) - # self.datasets.append(dataset) + return genotype -#ds = DataSets() -#print("[orange] ds:", ds.datasets) class DataSet(object): """ @@ -516,7 +424,7 @@ class DataSet(object): self.check_confidentiality() self.retrieve_other_names() - + self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype if get_samplelist == True: self.group.get_samplelist() @@ -526,32 +434,11 @@ class DataSet(object): def get_desc(self): """Gets overridden later, at least for Temp...used by trait's get_given_name""" return None - - #@staticmethod - #def get_by_trait_id(trait_id): - # """Gets the dataset object given the trait id""" - # - # - # - # name = g.db.execute(""" SELECT - # - # """) - # - # return DataSet(name) # Delete this eventually @property def riset(): Weve_Renamed_This_As_Group - - - #@property - #def group(self): - # if not self._group: - # self.get_group() - # - # return self._group - def retrieve_other_names(self): """ @@ -561,7 +448,7 @@ class DataSet(object): This is not meant to retrieve the data set info if no name at all is passed. """ - + try: if self.type == "ProbeSet": query_args = tuple(escape(x) for x in ( @@ -597,17 +484,17 @@ class DataSet(object): except TypeError: print("Dataset {} is not yet available in GeneNetwork.".format(self.name)) pass - + def get_trait_data(self, sample_list=None): if sample_list: self.samplelist = sample_list else: self.samplelist = self.group.samplelist - + if self.group.parlist != None and self.group.f1list != None: if (self.group.parlist + self.group.f1list) in self.samplelist: self.samplelist += self.group.parlist + self.group.f1list - + query = """ SELECT Strain.Name, Strain.Id FROM Strain, Species WHERE Strain.Name IN {} @@ -624,21 +511,6 @@ class DataSet(object): number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) trait_sample_data = [] for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): - - #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId - #tempTable = None - #if GeneId and db.type == "ProbeSet": - # if method == "3": - # tempTable = self.getTempLiteratureTable(species=species, - # input_species_geneid=GeneId, - # returnNumber=returnNumber) - # - # if method == "4" or method == "5": - # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, - # TissueProbeSetFreezeId=tissueProbeSetFreezeId, - # method=method, - # returnNumber=returnNumber) - if self.type == "Publish": dataset_type = "Phenotype" else: @@ -659,7 +531,7 @@ class DataSet(object): left join {}Data as T{} on T{}.Id = {}XRef.DataId and T{}.StrainId={}\n """.format(*mescape(self.type, item, item, self.type, item, item)) - + if self.type == "Publish": query += """ WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId @@ -676,16 +548,16 @@ class DataSet(object): order by {}.Id """.format(*mescape(self.type, self.type, self.type, self.type, self.name, dataset_type, self.type, self.type, dataset_type)) - + #print("trait data query: ", query) - + results = g.db.execute(query).fetchall() #print("query results:", results) trait_sample_data.append(results) trait_count = len(trait_sample_data[0]) self.trait_data = collections.defaultdict(list) - + # put all of the separate data together into a dictionary where the keys are # trait names and values are lists of sample values for trait_counter in range(trait_count): @@ -698,9 +570,9 @@ class PhenotypeDataSet(DataSet): DS_NAME_MAP['Publish'] = 'PhenotypeDataSet' def setup(self): - + #print("IS A PHENOTYPEDATASET") - + # Fields in the database table self.search_fields = ['Phenotype.Post_publication_description', 'Phenotype.Pre_publication_description', @@ -771,26 +643,26 @@ class PhenotypeDataSet(DataSet): def get_trait_info(self, trait_list, species = ''): for this_trait in trait_list: - + if not this_trait.haveinfo: this_trait.retrieve_info(get_qtl_info=True) description = this_trait.post_publication_description - + #If the dataset is confidential and the user has access to confidential #phenotype traits, then display the pre-publication description instead #of the post-publication description if this_trait.confidential: this_trait.description_display = "" continue # for now - + if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait( privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users): - + description = this_trait.pre_publication_description - + if len(description) > 0: this_trait.description_display = description.strip() else: @@ -835,7 +707,7 @@ class PhenotypeDataSet(DataSet): this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs this_trait.LRS_score_value = LRS_score_value = this_trait.lrs this_trait.LRS_location_repr = LRS_location_repr = 'Chr%s: %.6f' % (LRS_Chr, float(LRS_Mb)) - + def retrieve_sample_data(self, trait): query = """ SELECT @@ -893,7 +765,7 @@ class GenotypeDataSet(DataSet): def check_confidentiality(self): return geno_mrna_confidentiality(self) - + def get_trait_list(self): query = """ select Geno.Name @@ -927,7 +799,7 @@ class GenotypeDataSet(DataSet): this_trait.location_repr = 'Chr%s: %.6f' % (this_trait.chr, float(this_trait.mb) ) this_trait.location_value = trait_location_value - + def retrieve_sample_data(self, trait): query = """ SELECT @@ -1019,7 +891,7 @@ class MrnaAssayDataSet(DataSet): def check_confidentiality(self): return geno_mrna_confidentiality(self) - + def get_trait_list_1(self): query = """ select ProbeSet.Name @@ -1028,17 +900,14 @@ class MrnaAssayDataSet(DataSet): and ProbeSetFreezeId = {} """.format(escape(str(self.id))) results = g.db.execute(query).fetchall() - #print("After get_trait_list query") trait_data = {} for trait in results: - #print("Retrieving sample_data for ", trait[0]) trait_data[trait[0]] = self.retrieve_sample_data(trait[0]) - #print("After retrieve_sample_data") return trait_data - + def get_trait_info(self, trait_list=None, species=''): - # Note: setting trait_list to [] is probably not a great idea. + # Note: setting trait_list to [] is probably not a great idea. if not trait_list: trait_list = [] @@ -1101,7 +970,7 @@ class MrnaAssayDataSet(DataSet): #print("query is:", pf(query)) result = g.db.execute(query).fetchone() - + mean = result[0] if result else 0 if mean: @@ -1122,28 +991,15 @@ class MrnaAssayDataSet(DataSet): Geno.SpeciesId = Species.Id """.format(species, this_trait.locus) result = g.db.execute(query).fetchone() - + if result: - #if result[0] and result[1]: - # lrs_chr = result[0] - # lrs_mb = result[1] lrs_chr, lrs_mb = result #XZ: LRS_location_value is used for sorting lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb) - - #try: - # lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb) - #except: - # if lrs_chr.upper() == 'X': - # lrs_location_value = 20*1000 + float(lrs_mb) - # else: - # lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 + - # float(lrs_mb)) - this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs this_trait.LRS_score_value = this_trait.lrs this_trait.LRS_location_repr = 'Chr%s: %.6f' % (lrs_chr, float(lrs_mb)) - + def convert_location_to_value(self, chromosome, mb): try: @@ -1154,7 +1010,7 @@ class MrnaAssayDataSet(DataSet): else: location_value = (ord(str(chromosome).upper()[0])*1000 + float(mb)) - + return location_value def get_sequence(self): @@ -1171,7 +1027,7 @@ class MrnaAssayDataSet(DataSet): """ % (escape(self.name), escape(self.dataset.name)) results = g.db.execute(query).fetchone() return results[0] - + def retrieve_sample_data(self, trait): query = """ SELECT @@ -1192,8 +1048,8 @@ class MrnaAssayDataSet(DataSet): results = g.db.execute(query).fetchall() #print("RETRIEVED RESULTS HERE:", results) return results - - + + def retrieve_genes(self, column_name): query = """ select ProbeSet.Name, ProbeSet.%s @@ -1202,37 +1058,8 @@ class MrnaAssayDataSet(DataSet): ProbeSetXRef.ProbeSetId=ProbeSet.Id; """ % (column_name, escape(str(self.id))) results = g.db.execute(query).fetchall() - - return dict(results) - #def retrieve_gene_symbols(self): - # query = """ - # select ProbeSet.Name, ProbeSet.Symbol, ProbeSet.GeneId - # from ProbeSet,ProbeSetXRef - # where ProbeSetXRef.ProbeSetFreezeId = %s and - # ProbeSetXRef.ProbeSetId=ProbeSet.Id; - # """ % (self.id) - # results = g.db.execute(query).fetchall() - # symbol_dict = {} - # for item in results: - # symbol_dict[item[0]] = item[1] - # return symbol_dict - # - #def retrieve_gene_ids(self): - # query = """ - # select ProbeSet.Name, ProbeSet.GeneId - # from ProbeSet,ProbeSetXRef - # where ProbeSetXRef.ProbeSetFreezeId = %s and - # ProbeSetXRef.ProbeSetId=ProbeSet.Id; - # """ % (self.id) - # return process_and_run_query(query) - # results = g.db.execute(query).fetchall() - # symbol_dict = {} - # for item in results: - # symbol_dict[item[0]] = item[1] - # return symbol_dict - - + return dict(results) class TempDataSet(DataSet): @@ -1254,8 +1081,8 @@ class TempDataSet(DataSet): self.id = 1 self.fullname = 'Temporary Storage' self.shortname = 'Temp' - - + + @staticmethod def handle_pca(desc): if 'PCA' in desc: @@ -1264,13 +1091,13 @@ class TempDataSet(DataSet): else: desc = desc[:desc.index('entered')].strip() return desc - + def get_desc(self): g.db.execute('SELECT description FROM Temp WHERE Name=%s', self.name) desc = g.db.fetchone()[0] desc = self.handle_pca(desc) - return desc - + return desc + def get_group(self): self.cursor.execute(""" SELECT @@ -1283,7 +1110,7 @@ class TempDataSet(DataSet): """, self.name) self.group, self.group_id = self.cursor.fetchone() #return self.group - + def retrieve_sample_data(self, trait): query = """ SELECT @@ -1297,7 +1124,7 @@ class TempDataSet(DataSet): Order BY Strain.Name """ % escape(trait.name) - + results = g.db.execute(query).fetchall() |