diff options
author | Lei Yan | 2013-09-13 14:07:27 -0500 |
---|---|---|
committer | Lei Yan | 2013-09-13 14:07:27 -0500 |
commit | af24c0d610d9a2189f86677e4f23deb372ee2bf7 (patch) | |
tree | 53480351b97727670637a37dbd4c78e52446ae88 /wqflask/base | |
parent | 155e2997613c0750de30b734686f8977524956f9 (diff) | |
parent | c5fc931621707865357ace4b637db7481e0be552 (diff) | |
download | genenetwork2-af24c0d610d9a2189f86677e4f23deb372ee2bf7.tar.gz |
Merge https://github.com/zsloan/genenetwork
Resolved conflicts:
wqflask/base/trait.py
wqflask/wqflask/correlation/correlationFunction.py
wqflask/wqflask/correlation/correlation_function.py
wqflask/wqflask/correlation/correlation_functions.py
wqflask/wqflask/correlation/show_corr_results.py
Diffstat (limited to 'wqflask/base')
-rwxr-xr-x | wqflask/base/data_set.py | 366 | ||||
-rwxr-xr-x | wqflask/base/trait.py | 9 | ||||
-rwxr-xr-x | wqflask/base/webqtlConfig.py | 11 | ||||
-rwxr-xr-x | wqflask/base/webqtlConfigLocal.py | 20 |
4 files changed, 278 insertions, 128 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 03b24230..96e04df0 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -16,8 +16,6 @@ # Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) # at rwilliams@uthsc.edu and xzhou15@uthsc.edu # -#we -# # This module is used by GeneNetwork project (www.genenetwork.org) from __future__ import absolute_import, print_function, division @@ -27,6 +25,7 @@ import string import collections import json +import gzip import cPickle as pickle import itertools @@ -44,36 +43,71 @@ from utility import webqtlUtil from utility.benchmark import Bench from wqflask.my_pylmm.pyLMM import chunks +from maintenance import get_group_samplelists + from MySQLdb import escape_string as escape from pprint import pformat as pf # Used by create_database to instantiate objects +# Each subclass will add to this DS_NAME_MAP = {} def create_dataset(dataset_name, dataset_type = None): - #print("dataset_name:", dataset_name) - if not dataset_type: - query = """ - SELECT DBType.Name - FROM DBList, DBType - WHERE DBList.Name = '{}' and - DBType.Id = DBList.DBTypeId - """.format(escape(dataset_name)) - #print("query is: ", pf(query)) - dataset_type = g.db.execute(query).fetchone().Name + dataset_type = Dataset_Getter(dataset_name) + #dataset_type = get_dataset_type_from_json(dataset_name) - #dataset_type = cursor.fetchone()[0] - #print("[blubber] dataset_type:", pf(dataset_type)) + print("dataset_type is:", dataset_type) + #query = """ + # SELECT DBType.Name + # FROM DBList, DBType + # WHERE DBList.Name = '{}' and + # DBType.Id = DBList.DBTypeId + # """.format(escape(dataset_name)) + #dataset_type = g.db.execute(query).fetchone().Name - dataset_ob = DS_NAME_MAP[dataset_type] - #dataset_class = getattr(data_set, dataset_ob) - #print("dataset_ob:", dataset_ob) - #print("DS_NAME_MAP:", pf(DS_NAME_MAP)) + dataset_ob = DS_NAME_MAP[dataset_type] dataset_class = globals()[dataset_ob] return dataset_class(dataset_name) + +#def get_dataset_type_from_json(dataset_name): + +class Dataset_Types(object): + + def __init__(self): + self.datasets = {} + file_name = "wqflask/static/new/javascript/dataset_menu_structure.json" + with open(file_name, 'r') as fh: + data = json.load(fh) + + print("*" * 70) + for species in data['datasets']: + for group in data['datasets'][species]: + for dataset_type in data['datasets'][species][group]: + for dataset in data['datasets'][species][group][dataset_type]: + print("dataset is:", dataset) + + short_dataset_name = dataset[0] + if dataset_type == "Phenotypes": + new_type = "Publish" + elif dataset_type == "Genotypes": + new_type = "Geno" + else: + new_type = "ProbeSet" + self.datasets[short_dataset_name] = new_type + + def __call__(self, name): + return self.datasets[name] + +# Do the intensive work at startup one time only +Dataset_Getter = Dataset_Types() + +# +#print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611")) + + def create_datasets_list(): key = "all_datasets" result = Redis.get(key) @@ -94,7 +128,7 @@ def create_datasets_list(): for result in g.db.execute(query).fetchall(): #The query at the beginning of this function isn't necessary here, but still would #rather just reuse it - print("type: {}\tname: {}".format(dataset_type, result.Name)) + #print("type: {}\tname: {}".format(dataset_type, result.Name)) dataset = create_dataset(result.Name, dataset_type) datasets.append(dataset) @@ -134,12 +168,13 @@ class Markers(object): for marker, p_value in itertools.izip(self.markers, p_values): marker['p_value'] = p_value - print("p_value is:", marker['p_value']) - marker['lod_score'] = -math.log10(marker['p_value']) - #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values - marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61 - - + if marker['p_value'] == 0: + marker['lod_score'] = 0 + marker['lrs_value'] = 0 + else: + marker['lod_score'] = -math.log10(marker['p_value']) + #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values + marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61 class HumanMarkers(Markers): @@ -154,8 +189,6 @@ class HumanMarkers(Markers): marker['name'] = splat[1] marker['Mb'] = float(splat[3]) / 1000000 self.markers.append(marker) - - #print("markers is: ", pf(self.markers)) def add_pvalues(self, p_values): @@ -212,7 +245,7 @@ class DatasetGroup(object): marker_class = Markers self.markers = marker_class(self.name) - + def get_f1_parent_strains(self): try: @@ -225,7 +258,29 @@ class DatasetGroup(object): self.f1list = [f1, f12] if maternal and paternal: self.parlist = [maternal, paternal] - + + def get_samplelist(self): + key = "samplelist:v4:" + self.name + print("key is:", key) + with Bench("Loading cache"): + result = Redis.get(key) + + if result: + print("Sample List Cache hit!!!") + print("Before unjsonifying {}: {}".format(type(result), result)) + self.samplelist = json.loads(result) + print(" type: ", type(self.samplelist)) + print(" self.samplelist: ", self.samplelist) + else: + print("Cache not hit") + try: + self.samplelist = get_group_samplelists.get_samplelist(self.name + ".geno") + except IOError: + self.samplelist = None + print("after get_samplelist") + Redis.set(key, json.dumps(self.samplelist)) + Redis.expire(key, 60*5) + def read_genotype_file(self): '''Read genotype from .geno file instead of database''' #if self.group == 'BXD300': @@ -240,7 +295,18 @@ class DatasetGroup(object): # reaper barfs on unicode filenames, so here we ensure it's a string full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno')) - genotype_1.read(full_filename) + if os.path.isfile(full_filename): + print("Reading file: ", full_filename) + genotype_1.read(full_filename) + print("File read") + else: + try: + full_filename = str(os.path.join(webqtlConfig.TMPDIR, self.name + '.geno')) + #print("Reading file") + genotype_1.read(full_filename) + #print("File read") + except IOError: + print("File doesn't exist!") if genotype_1.type == "group" and self.parlist: genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1) @@ -301,7 +367,7 @@ class DataSet(object): self.retrieve_other_names() self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype - self.group.read_genotype_file() + self.group.get_samplelist() self.species = species.TheSpecies(self) @@ -335,7 +401,6 @@ class DataSet(object): # return self._group - def retrieve_other_names(self): """ If the data set name parameter is not found in the 'Name' field of the data set table, @@ -370,11 +435,98 @@ class DataSet(object): except TypeError: print("Dataset {} is not yet available in GeneNetwork.".format(self.name)) pass + + def get_trait_data(self): + self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list + query = """ + SELECT Strain.Name, Strain.Id FROM Strain, Species + WHERE Strain.Name IN {} + and Strain.SpeciesId=Species.Id + and Species.name = '{}' + """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) + results = dict(g.db.execute(query).fetchall()) + sample_ids = [results[item] for item in self.samplelist] + + # MySQL limits the number of tables that can be used in a join to 61, + # so we break the sample ids into smaller chunks + # Postgres doesn't have that limit, so we can get rid of this after we transition + chunk_size = 50 + number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) + trait_sample_data = [] + for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): + + #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId + #tempTable = None + #if GeneId and db.type == "ProbeSet": + # if method == "3": + # tempTable = self.getTempLiteratureTable(species=species, + # input_species_geneid=GeneId, + # returnNumber=returnNumber) + # + # if method == "4" or method == "5": + # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, + # TissueProbeSetFreezeId=tissueProbeSetFreezeId, + # method=method, + # returnNumber=returnNumber) + + if self.type == "Publish": + dataset_type = "Phenotype" + else: + dataset_type = self.type + temp = ['T%s.value' % item for item in sample_ids_step] + if self.type == "Publish": + query = "SELECT {}XRef.Id,".format(escape(self.type)) + else: + query = "SELECT {}.Name,".format(escape(dataset_type)) + data_start_pos = 1 + query += string.join(temp, ', ') + query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type, + self.type, + self.type)) + + for item in sample_ids_step: + query += """ + left join {}Data as T{} on T{}.Id = {}XRef.DataId + and T{}.StrainId={}\n + """.format(*mescape(self.type, item, item, self.type, item, item)) + + if self.type == "Publish": + query += """ + WHERE {}XRef.PublicationId = {}Freeze.Id + and {}Freeze.Name = '{}' + and {}.Id = {}XRef.{}Id + order by {}.Id + """.format(*mescape(self.type, self.type, self.type, self.type, + self.name, dataset_type, self.type, self.type, dataset_type)) + else: + query += """ + WHERE {}XRef.{}FreezeId = {}Freeze.Id + and {}Freeze.Name = '{}' + and {}.Id = {}XRef.{}Id + order by {}.Id + """.format(*mescape(self.type, self.type, self.type, self.type, + self.name, dataset_type, self.type, self.type, dataset_type)) + results = g.db.execute(query).fetchall() + trait_sample_data.append(results) + + trait_count = len(trait_sample_data[0]) + self.trait_data = collections.defaultdict(list) + + # put all of the separate data together into a dictionary where the keys are + # trait names and values are lists of sample values + for trait_counter in range(trait_count): + trait_name = trait_sample_data[0][trait_counter][0] + for chunk_counter in range(int(number_chunks)): + self.trait_data[trait_name] += ( + trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) class PhenotypeDataSet(DataSet): DS_NAME_MAP['Publish'] = 'PhenotypeDataSet' def setup(self): + + print("IS A PHENOTYPEDATASET") + # Fields in the database table self.search_fields = ['Phenotype.Post_publication_description', 'Phenotype.Pre_publication_description', @@ -445,14 +597,24 @@ class PhenotypeDataSet(DataSet): def get_trait_info(self, trait_list, species = ''): for this_trait in trait_list: if not this_trait.haveinfo: - this_trait.retrieveInfo(QTL=1) + this_trait.retrieve_info(get_qtl_info=True) description = this_trait.post_publication_description + + #If the dataset is confidential and the user has access to confidential + #phenotype traits, then display the pre-publication description instead + #of the post-publication description if this_trait.confidential: continue # for now - if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users): + + if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait( + privilege=self.privilege, + userName=self.userName, + authorized_users=this_trait.authorized_users): + description = this_trait.pre_publication_description - this_trait.description_display = unicode(description, "utf8") + + this_trait.description_display = description if not this_trait.year.isdigit(): this_trait.pubmed_text = "N/A" @@ -690,73 +852,73 @@ class MrnaAssayDataSet(DataSet): #print("After retrieve_sample_data") return trait_data - def get_trait_data(self): - self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list - query = """ - SELECT Strain.Name, Strain.Id FROM Strain, Species - WHERE Strain.Name IN {} - and Strain.SpeciesId=Species.Id - and Species.name = '{}' - """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) - results = dict(g.db.execute(query).fetchall()) - sample_ids = [results[item] for item in self.samplelist] - - # MySQL limits the number of tables that can be used in a join to 61, - # so we break the sample ids into smaller chunks - # Postgres doesn't have that limit, so we can get rid of this after we transition - chunk_size = 50 - number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) - trait_sample_data = [] - for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): - - #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId - #tempTable = None - #if GeneId and db.type == "ProbeSet": - # if method == "3": - # tempTable = self.getTempLiteratureTable(species=species, - # input_species_geneid=GeneId, - # returnNumber=returnNumber) - # - # if method == "4" or method == "5": - # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, - # TissueProbeSetFreezeId=tissueProbeSetFreezeId, - # method=method, - # returnNumber=returnNumber) - - temp = ['T%s.value' % item for item in sample_ids_step] - query = "SELECT {}.Name,".format(escape(self.type)) - data_start_pos = 1 - query += string.join(temp, ', ') - query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type, - self.type, - self.type)) - - for item in sample_ids_step: - query += """ - left join {}Data as T{} on T{}.Id = {}XRef.DataId - and T{}.StrainId={}\n - """.format(*mescape(self.type, item, item, self.type, item, item)) - - query += """ - WHERE {}XRef.{}FreezeId = {}Freeze.Id - and {}Freeze.Name = '{}' - and {}.Id = {}XRef.{}Id - order by {}.Id - """.format(*mescape(self.type, self.type, self.type, self.type, - self.name, self.type, self.type, self.type, self.type)) - results = g.db.execute(query).fetchall() - trait_sample_data.append(results) - - trait_count = len(trait_sample_data[0]) - self.trait_data = collections.defaultdict(list) - - # put all of the separate data together into a dictionary where the keys are - # trait names and values are lists of sample values - for trait_counter in range(trait_count): - trait_name = trait_sample_data[0][trait_counter][0] - for chunk_counter in range(int(number_chunks)): - self.trait_data[trait_name] += ( - trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) + #def get_trait_data(self): + # self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list + # query = """ + # SELECT Strain.Name, Strain.Id FROM Strain, Species + # WHERE Strain.Name IN {} + # and Strain.SpeciesId=Species.Id + # and Species.name = '{}' + # """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) + # results = dict(g.db.execute(query).fetchall()) + # sample_ids = [results[item] for item in self.samplelist] + # + # # MySQL limits the number of tables that can be used in a join to 61, + # # so we break the sample ids into smaller chunks + # # Postgres doesn't have that limit, so we can get rid of this after we transition + # chunk_size = 50 + # number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) + # trait_sample_data = [] + # for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): + # + # #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId + # #tempTable = None + # #if GeneId and db.type == "ProbeSet": + # # if method == "3": + # # tempTable = self.getTempLiteratureTable(species=species, + # # input_species_geneid=GeneId, + # # returnNumber=returnNumber) + # # + # # if method == "4" or method == "5": + # # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, + # # TissueProbeSetFreezeId=tissueProbeSetFreezeId, + # # method=method, + # # returnNumber=returnNumber) + # + # temp = ['T%s.value' % item for item in sample_ids_step] + # query = "SELECT {}.Name,".format(escape(self.type)) + # data_start_pos = 1 + # query += string.join(temp, ', ') + # query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type, + # self.type, + # self.type)) + # + # for item in sample_ids_step: + # query += """ + # left join {}Data as T{} on T{}.Id = {}XRef.DataId + # and T{}.StrainId={}\n + # """.format(*mescape(self.type, item, item, self.type, item, item)) + # + # query += """ + # WHERE {}XRef.{}FreezeId = {}Freeze.Id + # and {}Freeze.Name = '{}' + # and {}.Id = {}XRef.{}Id + # order by {}.Id + # """.format(*mescape(self.type, self.type, self.type, self.type, + # self.name, self.type, self.type, self.type, self.type)) + # results = g.db.execute(query).fetchall() + # trait_sample_data.append(results) + # + # trait_count = len(trait_sample_data[0]) + # self.trait_data = collections.defaultdict(list) + # + # # put all of the separate data together into a dictionary where the keys are + # # trait names and values are lists of sample values + # for trait_counter in range(trait_count): + # trait_name = trait_sample_data[0][trait_counter][0] + # for chunk_counter in range(int(number_chunks)): + # self.trait_data[trait_name] += ( + # trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) def get_trait_info(self, trait_list=None, species=''): diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index 401b729c..6a64eeaf 100755 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -318,7 +318,11 @@ class GeneralTrait(object): #XZ: assign SQL query result to trait attributes. for i, field in enumerate(self.dataset.display_fields): - setattr(self, field, trait_info[i]) + #print(" mike: {} -> {} - {}".format(field, type(trait_info[i]), trait_info[i])) + holder = trait_info[i] + if isinstance(trait_info[i], basestring): + holder = unicode(trait_info[i], "utf8", "ignore") + setattr(self, field, holder) if self.dataset.type == 'Publish': self.confidential = 0 @@ -327,6 +331,9 @@ class GeneralTrait(object): self.homologeneid = None + #print("self.geneid is:", self.geneid) + #print(" type:", type(self.geneid)) + #print("self.dataset.group.name is:", self.dataset.group.name) if self.dataset.type == 'ProbeSet' and self.dataset.group and self.geneid: #XZ, 05/26/2010: From time to time, this query get error message because some geneid values in database are not number. #XZ: So I have to test if geneid is number before execute the query. diff --git a/wqflask/base/webqtlConfig.py b/wqflask/base/webqtlConfig.py index a811c3cd..23d32233 100755 --- a/wqflask/base/webqtlConfig.py +++ b/wqflask/base/webqtlConfig.py @@ -1,4 +1,3 @@ -from webqtlConfigLocal import * #########################################' # Environment Variables - public ######################################### @@ -36,7 +35,7 @@ NCBI_LOCUSID = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrie UCSC_REFSEQ = "http://genome.cse.ucsc.edu/cgi-bin/hgGene?db=%s&hgg_gene=%s&hgg_chrom=chr%s&hgg_start=%s&hgg_end=%s" GENBANK_ID = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=Nucleotide&cmd=search&doptcmdl=DocSum&term=%s" OMIM_ID = "http://www.ncbi.nlm.nih.gov/omim/%s" -UNIGEN_ID = "http://www.ncbi.nlm.nih.gov/UniGene/clust.cgi?ORG=%s&CID=%s" +UNIGEN_ID = "http://www.ncbi.nlm.nih.gov/UniGene/clust.cgi?ORG=%s&CID=%s"; HOMOLOGENE_ID = "http://www.ncbi.nlm.nih.gov/sites/entrez?Db=homologene&Cmd=DetailsSearch&Term=%s" PUBMEDLINK_URL = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=%s&dopt=Abstract" UCSC_POS = "http://genome.ucsc.edu/cgi-bin/hgTracks?clade=mammal&org=%s&db=%s&position=chr%s:%s-%s&pix=800&Submit=submit" @@ -49,15 +48,17 @@ UCSC_RUDI_TRACK_URL = " http://genome.cse.ucsc.edu/cgi-bin/hgTracks?org=%s&db=%s GENOMEBROWSER_URL="http://ucscbrowser.genenetwork.org/cgi-bin/hgTracks?clade=mammal&org=Mouse&db=mm9&position=%s&hgt.suggest=&pix=800&Submit=submit" ENSEMBLETRANSCRIPT_URL="http://useast.ensembl.org/Mus_musculus/Lucene/Details?species=Mus_musculus;idx=Transcript;end=1;q=%s" + +GNROOT = "/home/zas1024/gene/" # Will remove this and dependent items later SECUREDIR = GNROOT + 'secure/' COMMON_LIB = GNROOT + 'support/admin' HTMLPATH = GNROOT + 'web/' -PYLMM_PATH = '/home/zas1024/' -SNP_PATH = '/mnt/xvdf1/snps/' +PYLMM_PATH = '/home/zas1024/plink/' +SNP_PATH = '/home/zas1024/snps/' IMGDIR = HTMLPATH +'image/' IMAGESPATH = HTMLPATH + 'images/' UPLOADPATH = IMAGESPATH + 'upload/' -TMPDIR = HTMLPATH + 'tmp/' +TMPDIR = '/tmp/' GENODIR = HTMLPATH + 'genotypes/' NEWGENODIR = HTMLPATH + 'new_genotypes/' GENO_ARCHIVE_DIR = GENODIR + 'archive/' diff --git a/wqflask/base/webqtlConfigLocal.py b/wqflask/base/webqtlConfigLocal.py deleted file mode 100755 index abaeff93..00000000 --- a/wqflask/base/webqtlConfigLocal.py +++ /dev/null @@ -1,20 +0,0 @@ -#########################################' -# Environment Variables - private -######################################### - -MYSQL_SERVER = 'gn.cazhbciu2y1i.us-east-1.rds.amazonaws.com' -DB_NAME = 'db_webqtl' -DB_USER = 'webqtl' -DB_PASSWD = 'f2ZypIflRM' - -MYSQL_UPDSERVER = 'gn.cazhbciu2y1i.us-east-1.rds.amazonaws.com' -DB_UPDNAME = 'db_webqtl' -DB_UPDUSER = 'webqtl' -DB_UPDPASSWD = 'f2ZypIflRM' - -GNROOT = '/home/zas1024/gene/' -ROOT_URL = 'http://50.16.251.170' -PythonPath = '/usr/bin/python' -PIDDLE_FONT_PATH = '/usr/lib/python2.4/site-packages/piddle/truetypefonts/' - -TEXTUI=1 # XZ: This is to protect GN production server. If set to 0, no text UI is allowed. |