diff options
author | Lei Yan | 2016-06-16 18:21:01 +0000 |
---|---|---|
committer | Lei Yan | 2016-06-16 18:21:01 +0000 |
commit | 4fec0e6fc0772785a30451d417082bc189f2f6dd (patch) | |
tree | 6548c2d088d5a80561e23df076456caaeda195c7 /wqflask/base | |
parent | e55f38a72d47fbdf5f652a08e8da1db78f1dcdb5 (diff) | |
parent | d90dc3748557d1d6fbaa59f71fe676b8a7c393ca (diff) | |
download | genenetwork2-4fec0e6fc0772785a30451d417082bc189f2f6dd.tar.gz |
Merge /home/gn2/gene
Diffstat (limited to 'wqflask/base')
-rw-r--r-- | wqflask/base/GeneralObject.py | 68 | ||||
-rw-r--r--[-rwxr-xr-x] | wqflask/base/data_set.py | 435 | ||||
-rw-r--r--[-rwxr-xr-x] | wqflask/base/trait.py | 243 | ||||
-rwxr-xr-x | wqflask/base/webqtlCaseData.py | 3 | ||||
-rw-r--r--[-rwxr-xr-x] | wqflask/base/webqtlConfig.py | 50 | ||||
-rwxr-xr-x | wqflask/base/webqtlFormData.py | 2 |
6 files changed, 350 insertions, 451 deletions
diff --git a/wqflask/base/GeneralObject.py b/wqflask/base/GeneralObject.py new file mode 100644 index 00000000..02a1ef06 --- /dev/null +++ b/wqflask/base/GeneralObject.py @@ -0,0 +1,68 @@ +# Copyright (C) University of Tennessee Health Science Center, Memphis, TN. +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License +# as published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero General Public License for more details. +# +# This program is available from Source Forge: at GeneNetwork Project +# (sourceforge.net/projects/genenetwork/). +# +# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) +# at rwilliams@uthsc.edu and xzhou15@uthsc.edu +# +# +# +# This module is used by GeneNetwork project (www.genenetwork.org) +# +# Created by GeneNetwork Core Team 2010/08/10 +# +# Last updated by GeneNetwork Core Team 2010/10/20 + +class GeneralObject: + """ + Base class to define an Object. + a = [Spam(1, 4), Spam(9, 3), Spam(4,6)] + a.sort(lambda x, y: cmp(x.eggs, y.eggs)) + """ + + def __init__(self, *args, **kw): + self.contents = list(args) + for name, value in kw.items(): + setattr(self, name, value) + + def __setitem__(self, key, value): + setattr(self, key, value) + + def __getitem__(self, key): + return getattr(self, key) + + def __getattr__(self, key): + if key in self.__dict__.keys(): + return self.__dict__[key] + else: + return eval("self.__dict__.%s" % key) + + def __len__(self): + return len(self.__dict__) - 1 + + def __str__(self): + s = '' + for key in self.__dict__.keys(): + if key != 'contents': + s += '%s = %s\n' % (key,self.__dict__[key]) + return s + + def __repr__(self): + s = '' + for key in self.__dict__.keys(): + s += '%s = %s\n' % (key,self.__dict__[key]) + return s + + def __cmp__(self,other): + return len(self.__dict__.keys()).__cmp__(len(other.__dict__.keys()))
\ No newline at end of file diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 68a2a185..4953e728 100755..100644 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -44,6 +44,7 @@ from dbFunction import webqtlDatabaseFunction from utility import webqtlUtil from utility.benchmark import Bench from utility import chunks +from utility.tools import locate, locate_ignore_error from maintenance import get_group_samplelists @@ -54,43 +55,29 @@ from pprint import pformat as pf # Each subclass will add to this DS_NAME_MAP = {} -def create_dataset(dataset_name, dataset_type = None): +def create_dataset(dataset_name, dataset_type = None, get_samplelist = True): if not dataset_type: dataset_type = Dataset_Getter(dataset_name) - #dataset_type = get_dataset_type_from_json(dataset_name) print("dataset_type is:", dataset_type) - #query = """ - # SELECT DBType.Name - # FROM DBList, DBType - # WHERE DBList.Name = '{}' and - # DBType.Id = DBList.DBTypeId - # """.format(escape(dataset_name)) - #dataset_type = g.db.execute(query).fetchone().Name - dataset_ob = DS_NAME_MAP[dataset_type] dataset_class = globals()[dataset_ob] - return dataset_class(dataset_name) - + return dataset_class(dataset_name, get_samplelist) -#def get_dataset_type_from_json(dataset_name): - class Dataset_Types(object): - + def __init__(self): self.datasets = {} file_name = "wqflask/static/new/javascript/dataset_menu_structure.json" with open(file_name, 'r') as fh: data = json.load(fh) - + print("*" * 70) for species in data['datasets']: for group in data['datasets'][species]: for dataset_type in data['datasets'][species][group]: for dataset in data['datasets'][species][group][dataset_type]: - #print("dataset is:", dataset) - short_dataset_name = dataset[1] if dataset_type == "Phenotypes": new_type = "Publish" @@ -99,32 +86,28 @@ class Dataset_Types(object): else: new_type = "ProbeSet" self.datasets[short_dataset_name] = new_type - + def __call__(self, name): return self.datasets[name] - + # Do the intensive work at startup one time only Dataset_Getter = Dataset_Types() -# -#print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611")) - - def create_datasets_list(): key = "all_datasets" result = Redis.get(key) - + if result: print("Cache hit!!!") datasets = pickle.loads(result) - + else: datasets = list() with Bench("Creating DataSets object"): type_dict = {'Publish': 'PublishFreeze', 'ProbeSet': 'ProbeSetFreeze', 'Geno': 'GenoFreeze'} - + for dataset_type in type_dict: query = "SELECT Name FROM {}".format(type_dict[dataset_type]) for result in g.db.execute(query).fetchall(): @@ -133,10 +116,10 @@ def create_datasets_list(): #print("type: {}\tname: {}".format(dataset_type, result.Name)) dataset = create_dataset(result.Name, dataset_type) datasets.append(dataset) - + Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL)) Redis.expire(key, 60*60) - + return datasets @@ -157,31 +140,30 @@ def mescape(*items): class Markers(object): """Todo: Build in cacheing so it saves us reading the same file more than once""" def __init__(self, name): - json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json')) + json_data_fh = open(locate(name + '.json','genotype/json')) try: markers = json.load(json_data_fh) except: markers = [] - + for marker in markers: if (marker['chr'] != "X") and (marker['chr'] != "Y"): marker['chr'] = int(marker['chr']) - print("Mb:", marker['Mb']) marker['Mb'] = float(marker['Mb']) - + self.markers = markers #print("self.markers:", self.markers) - - + + def add_pvalues(self, p_values): print("length of self.markers:", len(self.markers)) print("length of p_values:", len(p_values)) - + if type(p_values) is list: # THIS IS only needed for the case when we are limiting the number of p-values calculated #if len(self.markers) > len(p_values): # self.markers = self.markers[:len(p_values)] - + for marker, p_value in itertools.izip(self.markers, p_values): if not p_value: continue @@ -214,18 +196,11 @@ class Markers(object): #self.markers.remove(marker) #del self.markers[i] self.markers = filtered_markers - - - #for i, marker in enumerate(self.markers): - # if not 'p_value' in marker: - # #print("self.markers[i]", self.markers[i]) - # del self.markers[i] - # #self.markers.remove(self.markers[i]) class HumanMarkers(Markers): - + def __init__(self, name, specified_markers = []): - marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim')) + marker_data_fh = open(locate('genotype') + '/' + name + '.bim') self.markers = [] for line in marker_data_fh: splat = line.strip().split() @@ -244,54 +219,36 @@ class HumanMarkers(Markers): marker['name'] = splat[1] marker['Mb'] = float(splat[3]) / 1000000 self.markers.append(marker) - + #print("markers is: ", pf(self.markers)) def add_pvalues(self, p_values): - #for marker, p_value in itertools.izip(self.markers, p_values): - # if marker['Mb'] <= 0 and marker['chr'] == 0: - # continue - # marker['p_value'] = p_value - # print("p_value is:", marker['p_value']) - # marker['lod_score'] = -math.log10(marker['p_value']) - # #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values - # marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61 - - #print("p_values2:", pf(p_values)) super(HumanMarkers, self).add_pvalues(p_values) - - #with Bench("deleting markers"): - # markers = [] - # for marker in self.markers: - # if not marker['Mb'] <= 0 and not marker['chr'] == 0: - # markers.append(marker) - # self.markers = markers - - + class DatasetGroup(object): """ Each group has multiple datasets; each species has multiple groups. - + For example, Mouse has multiple groups (BXD, BXA, etc), and each group has multiple datasets associated with it. - + """ def __init__(self, dataset): """This sets self.group and self.group_id""" - print("DATASET NAME2:", dataset.name) + #print("DATASET NAME2:", dataset.name) self.name, self.id = g.db.execute(dataset.query_for_group).fetchone() if self.name == 'BXD300': self.name = "BXD" - + self.f1list = None self.parlist = None self.get_f1_parent_strains() #print("parents/f1s: {}:{}".format(self.parlist, self.f1list)) - + self.species = webqtlDatabaseFunction.retrieve_species(self.name) - + self.incparentsf1 = False self.allsamples = None self._datasets = None @@ -302,7 +259,7 @@ class DatasetGroup(object): def get_markers(self): #print("self.species is:", self.species) if self.species == "human": - marker_class = HumanMarkers + marker_class = HumanMarkers else: marker_class = Markers @@ -311,12 +268,6 @@ class DatasetGroup(object): def datasets(self): key = "group_dataset_menu:v2:" + self.name print("key is2:", key) - #with Bench("Loading cache"): - # result = Redis.get(key) - #if result: - # self._datasets = pickle.loads(result) - # return self._datasets - dataset_menu = [] print("[tape4] webqtlConfig.PUBLICTHRESH:", webqtlConfig.PUBLICTHRESH) print("[tape4] type webqtlConfig.PUBLICTHRESH:", type(webqtlConfig.PUBLICTHRESH)) @@ -356,7 +307,7 @@ class DatasetGroup(object): dataset_menu.append(dict(tissue=None, datasets=[(dataset, dataset_short)])) else: dataset_sub_menu = [item[1:] for item in dataset] - + tissue_already_exists = False tissue_position = None for i, tissue_dict in enumerate(dataset_menu): @@ -366,7 +317,7 @@ class DatasetGroup(object): break if tissue_already_exists: - print("dataset_menu:", dataset_menu[i]['datasets']) + #print("dataset_menu:", dataset_menu[i]['datasets']) dataset_menu[i]['datasets'].append((dataset, dataset_short)) else: dataset_menu.append(dict(tissue=tissue_name, @@ -384,7 +335,7 @@ class DatasetGroup(object): f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name] except KeyError: f1 = f12 = maternal = paternal = None - + if f1 and f12: self.f1list = [f1, f12] if maternal and paternal: @@ -392,32 +343,28 @@ class DatasetGroup(object): def get_samplelist(self): key = "samplelist:v2:" + self.name - print("key is:", key) - with Bench("Loading cache"): - result = Redis.get(key) + #print("key is:", key) + #with Bench("Loading cache"): + result = Redis.get(key) if result: - print("Sample List Cache hit!!!") - print("Before unjsonifying {}: {}".format(type(result), result)) + #print("Sample List Cache hit!!!") + #print("Before unjsonifying {}: {}".format(type(result), result)) self.samplelist = json.loads(result) - print(" type: ", type(self.samplelist)) - print(" self.samplelist: ", self.samplelist) + #print(" type: ", type(self.samplelist)) + #print(" self.samplelist: ", self.samplelist) else: print("Cache not hit") - from utility.tools import plink_command - PLINK_PATH,PLINK_COMMAND = plink_command() - - geno_file_path = webqtlConfig.GENODIR+self.name+".geno" - plink_file_path = PLINK_PATH+"/"+self.name+".fam" - - if os.path.isfile(plink_file_path): - self.samplelist = get_group_samplelists.get_samplelist("plink", plink_file_path) - elif os.path.isfile(geno_file_path): - self.samplelist = get_group_samplelists.get_samplelist("geno", geno_file_path) + genotype_fn = locate_ignore_error(self.name+".geno",'genotype') + mapping_fn = locate_ignore_error(self.name+".fam",'mapping') + if mapping_fn: + self.samplelist = get_group_samplelists.get_samplelist("plink", mapping_fn) + elif genotype_fn: + self.samplelist = get_group_samplelists.get_samplelist("geno", genotype_fn) else: self.samplelist = None - print("after get_samplelist") + print("Sample list: ",self.samplelist) Redis.set(key, json.dumps(self.samplelist)) Redis.expire(key, 60*5) @@ -429,30 +376,14 @@ class DatasetGroup(object): def read_genotype_file(self): '''Read genotype from .geno file instead of database''' - #if self.group == 'BXD300': - # self.group = 'BXD' - # - #assert self.group, "self.group needs to be set" - #genotype_1 is Dataset Object without parents and f1 #genotype_2 is Dataset Object with parents and f1 (not for intercross) genotype_1 = reaper.Dataset() # reaper barfs on unicode filenames, so here we ensure it's a string - full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno')) - if os.path.isfile(full_filename): - print("Reading file: ", full_filename) - genotype_1.read(full_filename) - print("File read") - else: - try: - full_filename = str(os.path.join(webqtlConfig.TMPDIR, self.name + '.geno')) - #print("Reading file") - genotype_1.read(full_filename) - #print("File read") - except IOError: - print("File doesn't exist!") + full_filename = str(locate(self.name+'.geno','genotype')) + genotype_1.read(full_filename) if genotype_1.type == "group" and self.parlist: genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1) @@ -461,39 +392,15 @@ class DatasetGroup(object): #determine default genotype object if self.incparentsf1 and genotype_1.type != "intercross": - #self.genotype = genotype_2 genotype = genotype_2 else: self.incparentsf1 = 0 - #self.genotype = genotype_1 genotype = genotype_1 - #self.samplelist = list(self.genotype.prgy) self.samplelist = list(genotype.prgy) - - return genotype - -#class DataSets(object): -# """Builds a list of DataSets""" -# -# def __init__(self): -# self.datasets = list() -# - - - #query = """SELECT Name FROM ProbeSetFreeze - # UNION - # SELECT Name From PublishFreeze - # UNION - # SELECT Name From GenoFreeze""" - # - #for result in g.db.execute(query).fetchall(): - # dataset = DataSet(result.Name) - # self.datasets.append(dataset) + return genotype -#ds = DataSets() -#print("[orange] ds:", ds.datasets) class DataSet(object): """ @@ -502,7 +409,7 @@ class DataSet(object): """ - def __init__(self, name): + def __init__(self, name, get_samplelist = True): assert name, "Need a name" self.name = name @@ -510,49 +417,28 @@ class DataSet(object): self.shortname = None self.fullname = None self.type = None + self.data_scale = None #ZS: For example log2 self.setup() self.check_confidentiality() self.retrieve_other_names() - + self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype - self.group.get_samplelist() + if get_samplelist == True: + self.group.get_samplelist() self.species = species.TheSpecies(self) - print("TESTING!!!") - def get_desc(self): """Gets overridden later, at least for Temp...used by trait's get_given_name""" return None - - #@staticmethod - #def get_by_trait_id(trait_id): - # """Gets the dataset object given the trait id""" - # - # - # - # name = g.db.execute(""" SELECT - # - # """) - # - # return DataSet(name) # Delete this eventually @property def riset(): Weve_Renamed_This_As_Group - - - #@property - #def group(self): - # if not self._group: - # self.get_group() - # - # return self._group - def retrieve_other_names(self): """ @@ -562,7 +448,7 @@ class DataSet(object): This is not meant to retrieve the data set info if no name at all is passed. """ - + try: if self.type == "ProbeSet": query_args = tuple(escape(x) for x in ( @@ -571,8 +457,8 @@ class DataSet(object): self.name, self.name)) - self.id, self.name, self.fullname, self.shortname, self.tissue = g.db.execute(""" - SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName, ProbeSetFreeze.ShortName, Tissue.Name + self.id, self.name, self.fullname, self.shortname, self.data_scale, self.tissue = g.db.execute(""" + SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName, ProbeSetFreeze.ShortName, ProbeSetFreeze.DataScale, Tissue.Name FROM ProbeSetFreeze, ProbeFreeze, Tissue WHERE ProbeSetFreeze.public > %s AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id AND @@ -598,17 +484,17 @@ class DataSet(object): except TypeError: print("Dataset {} is not yet available in GeneNetwork.".format(self.name)) pass - + def get_trait_data(self, sample_list=None): if sample_list: self.samplelist = sample_list else: self.samplelist = self.group.samplelist - + if self.group.parlist != None and self.group.f1list != None: if (self.group.parlist + self.group.f1list) in self.samplelist: self.samplelist += self.group.parlist + self.group.f1list - + query = """ SELECT Strain.Name, Strain.Id FROM Strain, Species WHERE Strain.Name IN {} @@ -625,21 +511,6 @@ class DataSet(object): number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) trait_sample_data = [] for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): - - #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId - #tempTable = None - #if GeneId and db.type == "ProbeSet": - # if method == "3": - # tempTable = self.getTempLiteratureTable(species=species, - # input_species_geneid=GeneId, - # returnNumber=returnNumber) - # - # if method == "4" or method == "5": - # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, - # TissueProbeSetFreezeId=tissueProbeSetFreezeId, - # method=method, - # returnNumber=returnNumber) - if self.type == "Publish": dataset_type = "Phenotype" else: @@ -660,7 +531,7 @@ class DataSet(object): left join {}Data as T{} on T{}.Id = {}XRef.DataId and T{}.StrainId={}\n """.format(*mescape(self.type, item, item, self.type, item, item)) - + if self.type == "Publish": query += """ WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId @@ -677,16 +548,16 @@ class DataSet(object): order by {}.Id """.format(*mescape(self.type, self.type, self.type, self.type, self.name, dataset_type, self.type, self.type, dataset_type)) - + #print("trait data query: ", query) - + results = g.db.execute(query).fetchall() #print("query results:", results) trait_sample_data.append(results) trait_count = len(trait_sample_data[0]) self.trait_data = collections.defaultdict(list) - + # put all of the separate data together into a dictionary where the keys are # trait names and values are lists of sample values for trait_counter in range(trait_count): @@ -699,9 +570,9 @@ class PhenotypeDataSet(DataSet): DS_NAME_MAP['Publish'] = 'PhenotypeDataSet' def setup(self): - - print("IS A PHENOTYPEDATASET") - + + #print("IS A PHENOTYPEDATASET") + # Fields in the database table self.search_fields = ['Phenotype.Post_publication_description', 'Phenotype.Pre_publication_description', @@ -772,26 +643,26 @@ class PhenotypeDataSet(DataSet): def get_trait_info(self, trait_list, species = ''): for this_trait in trait_list: - + if not this_trait.haveinfo: this_trait.retrieve_info(get_qtl_info=True) description = this_trait.post_publication_description - + #If the dataset is confidential and the user has access to confidential #phenotype traits, then display the pre-publication description instead #of the post-publication description if this_trait.confidential: this_trait.description_display = "" continue # for now - + if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait( privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users): - + description = this_trait.pre_publication_description - + if len(description) > 0: this_trait.description_display = description.strip() else: @@ -836,11 +707,11 @@ class PhenotypeDataSet(DataSet): this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs this_trait.LRS_score_value = LRS_score_value = this_trait.lrs this_trait.LRS_location_repr = LRS_location_repr = 'Chr%s: %.6f' % (LRS_Chr, float(LRS_Mb)) - + def retrieve_sample_data(self, trait): query = """ SELECT - Strain.Name, PublishData.value, PublishSE.error, NStrain.count + Strain.Name, PublishData.value, PublishSE.error, NStrain.count, Strain.Name2 FROM (PublishData, Strain, PublishXRef, PublishFreeze) left join PublishSE on @@ -894,7 +765,7 @@ class GenotypeDataSet(DataSet): def check_confidentiality(self): return geno_mrna_confidentiality(self) - + def get_trait_list(self): query = """ select Geno.Name @@ -928,11 +799,11 @@ class GenotypeDataSet(DataSet): this_trait.location_repr = 'Chr%s: %.6f' % (this_trait.chr, float(this_trait.mb) ) this_trait.location_value = trait_location_value - + def retrieve_sample_data(self, trait): query = """ SELECT - Strain.Name, GenoData.value, GenoSE.error, GenoData.Id + Strain.Name, GenoData.value, GenoSE.error, GenoData.Id, Sample.Name2 FROM (GenoData, GenoFreeze, Strain, Geno, GenoXRef) left join GenoSE on @@ -1020,7 +891,7 @@ class MrnaAssayDataSet(DataSet): def check_confidentiality(self): return geno_mrna_confidentiality(self) - + def get_trait_list_1(self): query = """ select ProbeSet.Name @@ -1029,86 +900,14 @@ class MrnaAssayDataSet(DataSet): and ProbeSetFreezeId = {} """.format(escape(str(self.id))) results = g.db.execute(query).fetchall() - #print("After get_trait_list query") trait_data = {} for trait in results: - print("Retrieving sample_data for ", trait[0]) trait_data[trait[0]] = self.retrieve_sample_data(trait[0]) - #print("After retrieve_sample_data") return trait_data - - #def get_trait_data(self): - # self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list - # query = """ - # SELECT Strain.Name, Strain.Id FROM Strain, Species - # WHERE Strain.Name IN {} - # and Strain.SpeciesId=Species.Id - # and Species.name = '{}' - # """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) - # results = dict(g.db.execute(query).fetchall()) - # sample_ids = [results[item] for item in self.samplelist] - # - # # MySQL limits the number of tables that can be used in a join to 61, - # # so we break the sample ids into smaller chunks - # # Postgres doesn't have that limit, so we can get rid of this after we transition - # chunk_size = 50 - # number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) - # trait_sample_data = [] - # for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): - # - # #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId - # #tempTable = None - # #if GeneId and db.type == "ProbeSet": - # # if method == "3": - # # tempTable = self.getTempLiteratureTable(species=species, - # # input_species_geneid=GeneId, - # # returnNumber=returnNumber) - # # - # # if method == "4" or method == "5": - # # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, - # # TissueProbeSetFreezeId=tissueProbeSetFreezeId, - # # method=method, - # # returnNumber=returnNumber) - # - # temp = ['T%s.value' % item for item in sample_ids_step] - # query = "SELECT {}.Name,".format(escape(self.type)) - # data_start_pos = 1 - # query += string.join(temp, ', ') - # query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type, - # self.type, - # self.type)) - # - # for item in sample_ids_step: - # query += """ - # left join {}Data as T{} on T{}.Id = {}XRef.DataId - # and T{}.StrainId={}\n - # """.format(*mescape(self.type, item, item, self.type, item, item)) - # - # query += """ - # WHERE {}XRef.{}FreezeId = {}Freeze.Id - # and {}Freeze.Name = '{}' - # and {}.Id = {}XRef.{}Id - # order by {}.Id - # """.format(*mescape(self.type, self.type, self.type, self.type, - # self.name, self.type, self.type, self.type, self.type)) - # results = g.db.execute(query).fetchall() - # trait_sample_data.append(results) - # - # trait_count = len(trait_sample_data[0]) - # self.trait_data = collections.defaultdict(list) - # - # # put all of the separate data together into a dictionary where the keys are - # # trait names and values are lists of sample values - # for trait_counter in range(trait_count): - # trait_name = trait_sample_data[0][trait_counter][0] - # for chunk_counter in range(int(number_chunks)): - # self.trait_data[trait_name] += ( - # trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) - def get_trait_info(self, trait_list=None, species=''): - # Note: setting trait_list to [] is probably not a great idea. + # Note: setting trait_list to [] is probably not a great idea. if not trait_list: trait_list = [] @@ -1171,7 +970,7 @@ class MrnaAssayDataSet(DataSet): #print("query is:", pf(query)) result = g.db.execute(query).fetchone() - + mean = result[0] if result else 0 if mean: @@ -1192,28 +991,15 @@ class MrnaAssayDataSet(DataSet): Geno.SpeciesId = Species.Id """.format(species, this_trait.locus) result = g.db.execute(query).fetchone() - + if result: - #if result[0] and result[1]: - # lrs_chr = result[0] - # lrs_mb = result[1] lrs_chr, lrs_mb = result #XZ: LRS_location_value is used for sorting lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb) - - #try: - # lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb) - #except: - # if lrs_chr.upper() == 'X': - # lrs_location_value = 20*1000 + float(lrs_mb) - # else: - # lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 + - # float(lrs_mb)) - this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs this_trait.LRS_score_value = this_trait.lrs this_trait.LRS_location_repr = 'Chr%s: %.6f' % (lrs_chr, float(lrs_mb)) - + def convert_location_to_value(self, chromosome, mb): try: @@ -1224,7 +1010,7 @@ class MrnaAssayDataSet(DataSet): else: location_value = (ord(str(chromosome).upper()[0])*1000 + float(mb)) - + return location_value def get_sequence(self): @@ -1241,11 +1027,11 @@ class MrnaAssayDataSet(DataSet): """ % (escape(self.name), escape(self.dataset.name)) results = g.db.execute(query).fetchone() return results[0] - + def retrieve_sample_data(self, trait): query = """ SELECT - Strain.Name, ProbeSetData.value, ProbeSetSE.error, ProbeSetData.Id + Strain.Name, ProbeSetData.value, ProbeSetSE.error, ProbeSetData.Id, Strain.Name2 FROM (ProbeSetData, ProbeSetFreeze, Strain, ProbeSet, ProbeSetXRef) left join ProbeSetSE on @@ -1262,8 +1048,8 @@ class MrnaAssayDataSet(DataSet): results = g.db.execute(query).fetchall() #print("RETRIEVED RESULTS HERE:", results) return results - - + + def retrieve_genes(self, column_name): query = """ select ProbeSet.Name, ProbeSet.%s @@ -1272,37 +1058,8 @@ class MrnaAssayDataSet(DataSet): ProbeSetXRef.ProbeSetId=ProbeSet.Id; """ % (column_name, escape(str(self.id))) results = g.db.execute(query).fetchall() - - return dict(results) - #def retrieve_gene_symbols(self): - # query = """ - # select ProbeSet.Name, ProbeSet.Symbol, ProbeSet.GeneId - # from ProbeSet,ProbeSetXRef - # where ProbeSetXRef.ProbeSetFreezeId = %s and - # ProbeSetXRef.ProbeSetId=ProbeSet.Id; - # """ % (self.id) - # results = g.db.execute(query).fetchall() - # symbol_dict = {} - # for item in results: - # symbol_dict[item[0]] = item[1] - # return symbol_dict - # - #def retrieve_gene_ids(self): - # query = """ - # select ProbeSet.Name, ProbeSet.GeneId - # from ProbeSet,ProbeSetXRef - # where ProbeSetXRef.ProbeSetFreezeId = %s and - # ProbeSetXRef.ProbeSetId=ProbeSet.Id; - # """ % (self.id) - # return process_and_run_query(query) - # results = g.db.execute(query).fetchall() - # symbol_dict = {} - # for item in results: - # symbol_dict[item[0]] = item[1] - # return symbol_dict - - + return dict(results) class TempDataSet(DataSet): @@ -1324,8 +1081,8 @@ class TempDataSet(DataSet): self.id = 1 self.fullname = 'Temporary Storage' self.shortname = 'Temp' - - + + @staticmethod def handle_pca(desc): if 'PCA' in desc: @@ -1334,13 +1091,13 @@ class TempDataSet(DataSet): else: desc = desc[:desc.index('entered')].strip() return desc - + def get_desc(self): g.db.execute('SELECT description FROM Temp WHERE Name=%s', self.name) desc = g.db.fetchone()[0] desc = self.handle_pca(desc) - return desc - + return desc + def get_group(self): self.cursor.execute(""" SELECT @@ -1353,7 +1110,7 @@ class TempDataSet(DataSet): """, self.name) self.group, self.group_id = self.cursor.fetchone() #return self.group - + def retrieve_sample_data(self, trait): query = """ SELECT @@ -1367,7 +1124,7 @@ class TempDataSet(DataSet): Order BY Strain.Name """ % escape(trait.name) - + results = g.db.execute(query).fetchall() diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index ff80795c..a71d8157 100755..100644 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -2,7 +2,7 @@ from __future__ import absolute_import, division, print_function import string import resource - +import codecs from htmlgen import HTMLgen2 as HT @@ -31,16 +31,16 @@ class GeneralTrait(object): """ - def __init__(self, get_qtl_info=False, **kw): + def __init__(self, get_qtl_info=False, get_sample_info=True, **kw): # xor assertion assert bool(kw.get('dataset')) != bool(kw.get('dataset_name')), "Needs dataset ob. or name"; if kw.get('dataset_name'): self.dataset = create_dataset(kw.get('dataset_name')) - print(" in GeneralTrait created dataset:", self.dataset) + #print(" in GeneralTrait created dataset:", self.dataset) else: self.dataset = kw.get('dataset') self.name = kw.get('name') # Trait ID, ProbeSet ID, Published ID, etc. - print("THE NAME IS:", self.name) + #print("THE NAME IS:", self.name) self.cellid = kw.get('cellid') self.identification = kw.get('identification', 'un-named trait') self.haveinfo = kw.get('haveinfo', False) @@ -67,7 +67,8 @@ class GeneralTrait(object): # Todo: These two lines are necessary most of the time, but perhaps not all of the time # So we could add a simple if statement to short-circuit this if necessary self.retrieve_info(get_qtl_info=get_qtl_info) - self.retrieve_sample_data() + if get_sample_info != False: + self.retrieve_sample_data() def jsonable(self): @@ -179,13 +180,15 @@ class GeneralTrait(object): samples = [] vals = [] the_vars = [] + sample_aliases = [] for sample_name, sample_data in self.data.items(): if sample_data.value != None: if not include_variance or sample_data.variance != None: samples.append(sample_name) vals.append(sample_data.value) the_vars.append(sample_data.variance) - return samples, vals, the_vars + sample_aliases.append(sample_data.name2) + return samples, vals, the_vars, sample_aliases # @@ -220,32 +223,6 @@ class GeneralTrait(object): if samplelist == None: samplelist = [] - #assert self.dataset - - #if self.cellid: - # #Probe Data - # query = ''' - # SELECT - # Strain.Name, ProbeData.value, ProbeSE.error, ProbeData.Id - # FROM - # (ProbeData, ProbeFreeze, ProbeSetFreeze, ProbeXRef, - # Strain, Probe, ProbeSet) - # left join ProbeSE on - # (ProbeSE.DataId = ProbeData.Id AND ProbeSE.StrainId = ProbeData.StrainId) - # WHERE - # Probe.Name = '%s' AND ProbeSet.Name = '%s' AND - # Probe.ProbeSetId = ProbeSet.Id AND - # ProbeXRef.ProbeId = Probe.Id AND - # ProbeXRef.ProbeFreezeId = ProbeFreeze.Id AND - # ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id AND - # ProbeSetFreeze.Name = '%s' AND - # ProbeXRef.DataId = ProbeData.Id AND - # ProbeData.StrainId = Strain.Id - # Order BY - # Strain.Name - # ''' % (self.cellid, self.name, self.dataset.name) - # - #else: results = self.dataset.retrieve_sample_data(self.name) # Todo: is this necessary? If not remove @@ -255,19 +232,10 @@ class GeneralTrait(object): if results: for item in results: - name, value, variance, num_cases = item + name, value, variance, num_cases, name2 = item if not samplelist or (samplelist and name in samplelist): self.data[name] = webqtlCaseData(*item) #name, value, variance, num_cases) - #def keys(self): - # return self.__dict__.keys() - # - #def has_key(self, key): - # return self.__dict__.has_key(key) - # - #def items(self): - # return self.__dict__.items() - def retrieve_info(self, get_qtl_info=False): assert self.dataset, "Dataset doesn't exist" if self.dataset.type == 'Publish': @@ -290,10 +258,10 @@ class GeneralTrait(object): PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND PublishFreeze.Id = %s """ % (self.name, self.dataset.id) - - print("query is:", query) trait_info = g.db.execute(query).fetchone() + + #XZ, 05/08/2009: Xiaodong add this block to use ProbeSet.Id to find the probeset instead of just using ProbeSet.Name #XZ, 05/08/2009: to avoid the problem of same probeset name from different platforms. elif self.dataset.type == 'ProbeSet': @@ -328,7 +296,6 @@ class GeneralTrait(object): escape(self.dataset.name), escape(self.name)) trait_info = g.db.execute(query).fetchone() - #print("trait_info is: ", pf(trait_info)) else: #Temp type query = """SELECT %s FROM %s WHERE Name = %s""" trait_info = g.db.execute(query, @@ -339,54 +306,118 @@ class GeneralTrait(object): #XZ: assign SQL query result to trait attributes. for i, field in enumerate(self.dataset.display_fields): - #print(" mike: {} -> {} - {}".format(field, type(trait_info[i]), trait_info[i])) holder = trait_info[i] if isinstance(trait_info[i], basestring): holder = unicode(trait_info[i], "utf8", "ignore") setattr(self, field, holder) - + if self.dataset.type == 'Publish': self.confidential = 0 if self.pre_publication_description and not self.pubmed_id: self.confidential = 1 + + description = self.post_publication_description + + #If the dataset is confidential and the user has access to confidential + #phenotype traits, then display the pre-publication description instead + #of the post-publication description + if self.confidential: + self.description_display = "" + + #if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait( + # privilege=self.dataset.privilege, + # userName=self.dataset.userName, + # authorized_users=self.authorized_users): + # + # description = self.pre_publication_description + + if description: + self.description_display = description.strip() + else: + self.description_display = "" - self.homologeneid = None - - #print("self.geneid is:", self.geneid) - #print(" type:", type(self.geneid)) - #print("self.dataset.group.name is:", self.dataset.group.name) - if self.dataset.type == 'ProbeSet' and self.dataset.group and self.geneid: - #XZ, 05/26/2010: From time to time, this query get error message because some geneid values in database are not number. - #XZ: So I have to test if geneid is number before execute the query. - #XZ: The geneid values in database should be cleaned up. - #try: - # float(self.geneid) - # geneidIsNumber = True - #except ValueError: - # geneidIsNumber = False - - #if geneidIsNumber: - - - query = """ - SELECT - HomologeneId - FROM - Homologene, Species, InbredSet - WHERE - Homologene.GeneId =%s AND - InbredSet.Name = '%s' AND - InbredSet.SpeciesId = Species.Id AND - Species.TaxonomyId = Homologene.TaxonomyId - """ % (escape(str(self.geneid)), escape(self.dataset.group.name)) - result = g.db.execute(query).fetchone() - #else: - # result = None + if not self.year.isdigit(): + self.pubmed_text = "N/A" + else: + self.pubmed_text = self.year - if result: - self.homologeneid = result[0] + if self.pubmed_id: + self.pubmed_link = webqtlConfig.PUBMEDLINK_URL % self.pubmed_id + + + self.homologeneid = None + if self.dataset.type == 'ProbeSet' and self.dataset.group: + if self.geneid: + #XZ, 05/26/2010: From time to time, this query get error message because some geneid values in database are not number. + #XZ: So I have to test if geneid is number before execute the query. + #XZ: The geneid values in database should be cleaned up. + #try: + # float(self.geneid) + # geneidIsNumber = True + #except ValueError: + # geneidIsNumber = False + #if geneidIsNumber: + query = """ + SELECT + HomologeneId + FROM + Homologene, Species, InbredSet + WHERE + Homologene.GeneId =%s AND + InbredSet.Name = '%s' AND + InbredSet.SpeciesId = Species.Id AND + Species.TaxonomyId = Homologene.TaxonomyId + """ % (escape(str(self.geneid)), escape(self.dataset.group.name)) + result = g.db.execute(query).fetchone() + #else: + # result = None + + if result: + self.homologeneid = result[0] + + description_string = unicode(str(self.description).strip(codecs.BOM_UTF8), 'utf-8') + target_string = unicode(str(self.probe_target_description).strip(codecs.BOM_UTF8), 'utf-8') + + if len(description_string) > 1 and description_string != 'None': + description_display = description_string + else: + description_display = self.symbol + + if (len(description_display) > 1 and description_display != 'N/A' and + len(target_string) > 1 and target_string != 'None'): + description_display = description_display + '; ' + target_string.strip() + + # Save it for the jinja2 template + self.description_display = description_display + + #XZ: trait_location_value is used for sorting + trait_location_repr = 'N/A' + trait_location_value = 1000000 + + if self.chr and self.mb: + #Checks if the chromosome number can be cast to an int (i.e. isn't "X" or "Y") + #This is so we can convert the location to a number used for sorting + trait_location_value = convert_location_to_value(self.chr, self.mb) + #try: + # trait_location_value = int(self.chr)*1000 + self.mb + #except ValueError: + # if self.chr.upper() == 'X': + # trait_location_value = 20*1000 + self.mb + # else: + # trait_location_value = (ord(str(self.chr).upper()[0])*1000 + + # self.mb) + + #ZS: Put this in function currently called "convert_location_to_value" + self.location_repr = 'Chr%s: %.6f' % (self.chr, float(self.mb)) + self.location_value = trait_location_value + if get_qtl_info: + #LRS and its location + self.LRS_score_repr = "N/A" + self.LRS_score_value = 0 + self.LRS_location_repr = "N/A" + self.LRS_location_value = 1000000 if self.dataset.type == 'ProbeSet' and not self.cellid: query = """ SELECT @@ -399,12 +430,8 @@ class GeneralTrait(object): ProbeSetXRef.ProbeSetFreezeId ={} """.format(self.name, self.dataset.id) trait_qtl = g.db.execute(query).fetchone() - #self.cursor.execute(query) - #trait_qtl = self.cursor.fetchone() if trait_qtl: - print("trait_qtl:", trait_qtl) self.locus, self.lrs, self.pvalue, self.mean, self.additive= trait_qtl - print("self.locus:", self.locus) if self.locus: query = """ select Geno.Chr, Geno.Mb from Geno, Species @@ -417,9 +444,9 @@ class GeneralTrait(object): self.locus_chr = result[0] self.locus_mb = result[1] else: - self.locus = self.locus_chr = self.locus_mb = "" + self.locus = self.locus_chr = self.locus_mb = self.additive = "" else: - self.locus = self.locus_chr = self.locus_mb = "" + self.locus = self.locus_chr = self.locus_mb = self.additive = "" else: self.locus = self.locus_chr = self.locus_mb = self.lrs = self.pvalue = self.mean = self.additive = "" @@ -437,8 +464,38 @@ class GeneralTrait(object): """, (self.name, self.dataset.id)).fetchone() if trait_qtl: self.locus, self.lrs, self.additive = trait_qtl + if self.locus: + query = """ + select Geno.Chr, Geno.Mb from Geno, Species + where Species.Name = '{}' and + Geno.Name = '{}' and + Geno.SpeciesId = Species.Id + """.format(self.dataset.group.species, self.locus) + result = g.db.execute(query).fetchone() + if result: + self.locus_chr = result[0] + self.locus_mb = result[1] + else: + self.locus = self.locus_chr = self.locus_mb = self.additive = "" + else: + self.locus = self.locus_chr = self.locus_mb = self.additive = "" else: self.locus = self.lrs = self.additive = "" + + if (self.dataset.type == 'Publish' or self.dataset.type == "ProbeSet") and self.locus_chr != "" and self.locus_mb != "": + #XZ: LRS_location_value is used for sorting + try: + LRS_location_value = int(self.locus_chr)*1000 + float(self.locus_mb) + except: + if self.locus_chr.upper() == 'X': + LRS_location_value = 20*1000 + float(self.locus_mb) + else: + LRS_location_value = ord(str(self.locus_chr).upper()[0])*1000 + float(self.locus_mb) + + self.LRS_location_repr = LRS_location_repr = 'Chr%s: %.6f' % (self.locus_chr, float(self.locus_mb)) + if self.lrs != "": + self.LRS_score_repr = LRS_score_repr = '%3.1f' % self.lrs + self.LRS_score_value = LRS_score_value = self.lrs else: raise KeyError, `self.name`+' information is not found in the database.' @@ -646,7 +703,17 @@ class GeneralTrait(object): ZValue = ZValue*sqrt(self.overlap-3) self.p_value = 2.0*(1.0 - reaper.normp(abs(ZValue))) - +def convert_location_to_value(chromosome, mb): + try: + location_value = int(chromosome)*1000 + float(mb) + except ValueError: + if chromosome.upper() == 'X': + location_value = 20*1000 + float(mb) + else: + location_value = (ord(str(chromosome).upper()[0])*1000 + + float(mb)) + + return location_value @app.route("/trait/get_sample_data") def get_sample_data(): diff --git a/wqflask/base/webqtlCaseData.py b/wqflask/base/webqtlCaseData.py index 42763aed..99a34866 100755 --- a/wqflask/base/webqtlCaseData.py +++ b/wqflask/base/webqtlCaseData.py @@ -29,8 +29,9 @@ print("Mr. Mojo Risin 2") class webqtlCaseData(object): """one case data in one trait""" - def __init__(self, name, value=None, variance=None, num_cases=None): + def __init__(self, name, value=None, variance=None, num_cases=None, name2=None): self.name = name + self.name2 = name2 # Other name (for traits like BXD65a) self.value = value # Trait Value self.variance = variance # Trait Variance self.num_cases = num_cases # Number of individuals/cases diff --git a/wqflask/base/webqtlConfig.py b/wqflask/base/webqtlConfig.py index 3eaeb56e..f6140ac3 100755..100644 --- a/wqflask/base/webqtlConfig.py +++ b/wqflask/base/webqtlConfig.py @@ -1,7 +1,15 @@ #########################################' # Environment Variables - public +# +# Note: much of this needs to handled by the settings/environment +# scripts. But rather than migrating everything in one go, we'll +# take it a step at a time. First the hard coded paths get replaced +# with those in utility/tools.py +# ######################################### +from utility.tools import valid_path, mk_dir, assert_dir, flat_files, TEMPDIR + #Debug Level #1 for debug, mod python will reload import each time DEBUG = 1 @@ -48,30 +56,28 @@ UCSC_RUDI_TRACK_URL = " http://genome.cse.ucsc.edu/cgi-bin/hgTracks?org=%s&db=%s GENOMEBROWSER_URL="http://ucscbrowser.genenetwork.org/cgi-bin/hgTracks?clade=mammal&org=Mouse&db=mm9&position=%s&hgt.suggest=&pix=800&Submit=submit" ENSEMBLETRANSCRIPT_URL="http://useast.ensembl.org/Mus_musculus/Lucene/Details?species=Mus_musculus;idx=Transcript;end=1;q=%s" +# The following paths are no longer in use! +# HTMLPATH is replaced by GENODIR +# IMGDIR is replaced by GENERATED_IMAGE_DIR + +# Temporary storage: +TMPDIR = mk_dir(TEMPDIR+'/gn2/') +CACHEDIR = mk_dir(TEMPDIR+'/cache/') +# We can no longer write into the git tree: +GENERATED_IMAGE_DIR = mk_dir(TMPDIR+'/generated/') +GENERATED_TEXT_DIR = mk_dir(TMPDIR+'/generated_text/') + +# Flat file directories +GENODIR = flat_files('genotype')+'/' +JSON_GENODIR = flat_files('genotype/json')+'/' +if not valid_path(JSON_GENODIR): + # fall back on old location (move the dir, FIXME) + JSON_GENODIR = flat_files('json') +assert_dir(GENODIR) -GNROOT = "/home/zas1024/gene/" # Will remove this and dependent items later -SECUREDIR = GNROOT + 'secure/' -COMMON_LIB = GNROOT + 'support/admin' -HTMLPATH = GNROOT + 'genotype_files/' -PYLMM_PATH = '/home/zas1024/plink_gemma/' -SNP_PATH = '/home/zas1024/snps/' -IMGDIR = GNROOT + '/wqflask/wqflask/static/output/' -IMAGESPATH = HTMLPATH + 'images/' -UPLOADPATH = IMAGESPATH + 'upload/' -TMPDIR = '/home/zas1024/tmp/' # Will remove this and dependent items later -GENODIR = HTMLPATH + 'genotypes/' -NEWGENODIR = HTMLPATH + 'new_genotypes/' -GENO_ARCHIVE_DIR = GENODIR + 'archive/' -TEXTDIR = HTMLPATH + 'ProbeSetFreeze_DataMatrix/' -CMDLINEDIR = HTMLPATH + 'webqtl/cmdLine/' -ChangableHtmlPath = GNROOT + 'web/' - -SITENAME = 'GN' PORTADDR = "http://50.16.251.170" -BASEHREF = '<base href="http://50.16.251.170/">' + INFOPAGEHREF = '/dbdoc/%s.html' -GLOSSARYFILE = "/glossary.html" CGIDIR = '/webqtl/' #XZ: The variable name 'CGIDIR' should be changed to 'PYTHONDIR' SCRIPTFILE = 'main.py' -REFRESHSTR = '<meta http-equiv="refresh" content="5;url=%s' + SCRIPTFILE +'?sid=%s">' -REFRESHDIR = '%s' + SCRIPTFILE +'?sid=%s' + diff --git a/wqflask/base/webqtlFormData.py b/wqflask/base/webqtlFormData.py index 44fdcc3f..10251756 100755 --- a/wqflask/base/webqtlFormData.py +++ b/wqflask/base/webqtlFormData.py @@ -157,7 +157,7 @@ class webqtlFormData(object): self.genotype_1 = reaper.Dataset() - full_filename = os.path.join(webqtlConfig.GENODIR, self.group + '.geno') + full_filename = locate(self.group + '.geno','genotype') # reaper barfs on unicode filenames, so here we ensure it's a string full_filename = str(full_filename) |