diff options
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-x | wqflask/base/data_set.py | 290 |
1 files changed, 239 insertions, 51 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 50ef8f57..9b0a3dcc 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -22,10 +22,14 @@ from __future__ import absolute_import, print_function, division import os +import math +import string +import collections -from flask import Flask, g +import json +import itertools -from htmlgen import HTMLgen2 as HT +from flask import Flask, g import reaper @@ -33,6 +37,7 @@ from base import webqtlConfig from base import species from dbFunction import webqtlDatabaseFunction from utility import webqtlUtil +from utility.benchmark import Bench from MySQLdb import escape_string as escape from pprint import pformat as pf @@ -41,29 +46,95 @@ from pprint import pformat as pf DS_NAME_MAP = {} def create_dataset(dataset_name): - #cursor = db_conn.cursor() print("dataset_name:", dataset_name) query = """ SELECT DBType.Name FROM DBList, DBType - WHERE DBList.Name = '%s' and + WHERE DBList.Name = '{}' and DBType.Id = DBList.DBTypeId - """ % (escape(dataset_name)) - print("query is: ", pf(query)) + """.format(escape(dataset_name)) + #print("query is: ", pf(query)) dataset_type = g.db.execute(query).fetchone().Name #dataset_type = cursor.fetchone()[0] - print("[blubber] dataset_type:", pf(dataset_type)) + #print("[blubber] dataset_type:", pf(dataset_type)) dataset_ob = DS_NAME_MAP[dataset_type] #dataset_class = getattr(data_set, dataset_ob) - print("dataset_ob:", dataset_ob) - print("DS_NAME_MAP:", pf(DS_NAME_MAP)) + #print("dataset_ob:", dataset_ob) + #print("DS_NAME_MAP:", pf(DS_NAME_MAP)) dataset_class = globals()[dataset_ob] return dataset_class(dataset_name) +def mescape(*items): + """Multiple escape""" + escaped = [escape(item) for item in items] + print("escaped is:", escaped) + return escaped + + +class Markers(object): + """Todo: Build in cacheing so it saves us reading the same file more than once""" + def __init__(self, name): + json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json')) + self.markers = json.load(json_data_fh) + + def add_pvalues(self, p_values): + print("length of self.markers:", len(self.markers)) + print("length of p_values:", len(p_values)) + + # THIS IS only needed for the case when we are limiting the number of p-values calculated + if len(self.markers) < len(p_values): + self.markers = self.markers[:len(p_values)] + + for marker, p_value in itertools.izip(self.markers, p_values): + marker['p_value'] = p_value + print("p_value is:", marker['p_value']) + marker['lod_score'] = -math.log10(marker['p_value']) + #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values + marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61 + + + + +class HumanMarkers(Markers): + + def __init__(self, name): + marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim')) + self.markers = [] + for line in marker_data_fh: + splat = line.strip().split() + marker = {} + marker['chr'] = int(splat[0]) + marker['name'] = splat[1] + marker['Mb'] = float(splat[3]) / 1000000 + self.markers.append(marker) + + #print("markers is: ", pf(self.markers)) + + + def add_pvalues(self, p_values): + #for marker, p_value in itertools.izip(self.markers, p_values): + # if marker['Mb'] <= 0 and marker['chr'] == 0: + # continue + # marker['p_value'] = p_value + # print("p_value is:", marker['p_value']) + # marker['lod_score'] = -math.log10(marker['p_value']) + # #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values + # marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61 + + super(HumanMarkers, self).add_pvalues(p_values) + + with Bench("deleting markers"): + markers = [] + for marker in self.markers: + if not marker['Mb'] <= 0 and not marker['chr'] == 0: + markers.append(marker) + self.markers = markers + + class DatasetGroup(object): """ @@ -79,22 +150,41 @@ class DatasetGroup(object): if self.name == 'BXD300': self.name = "BXD" + self.f1list = None + self.parlist = None + self.get_f1_parent_strains() + print("parents/f1s: {}:{}".format(self.parlist, self.f1list)) + self.species = webqtlDatabaseFunction.retrieve_species(self.name) self.incparentsf1 = False - self.f1list = None - self.parlist = None self.allsamples = None + + + def get_markers(self): + print("self.species is:", self.species) + if self.species == "human": + marker_class = HumanMarkers + else: + marker_class = Markers + self.markers = marker_class(self.name) + - #def read_genotype(self): - # self.read_genotype_file() - # - # if not self.genotype: # Didn'd succeed, so we try method 2 - # self.read_genotype_data() + def get_f1_parent_strains(self): + try: + # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py; + f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name] + except KeyError: + f1 = f12 = maternal = paternal = None + + if f1 and f12: + self.f1list = [f1, f12] + if maternal and paternal: + self.parlist = [maternal, paternal] def read_genotype_file(self): - '''read genotype from .geno file instead of database''' + '''Read genotype from .geno file instead of database''' #if self.group == 'BXD300': # self.group = 'BXD' # @@ -104,38 +194,24 @@ class DatasetGroup(object): #genotype_2 is Dataset Object with parents and f1 (not for intercross) genotype_1 = reaper.Dataset() - + # reaper barfs on unicode filenames, so here we ensure it's a string full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno')) genotype_1.read(full_filename) - print("Got to after read") - - try: - # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py; - f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name] - except KeyError: - f1 = f12 = maternal = paternal = None - - - if genotype_1.type == "group" and maternal and paternal: - genotype_2 = genotype_1.add(Mat=maternal, Pat=paternal) #, F1=_f1) + if genotype_1.type == "group" and self.parlist: + genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1) else: genotype_2 = genotype_1 #determine default genotype object if self.incparentsf1 and genotype_1.type != "intercross": - self.genotype = genotype_2 + genotype = genotype_2 else: self.incparentsf1 = 0 - self.genotype = genotype_1 + genotype = genotype_1 - self.samplelist = list(self.genotype.prgy) - - if f1 and f12: - self.f1list = [f1, f12] - if maternal and paternal: - self.parlist = [maternal, paternal] + self.samplelist = list(genotype.prgy) class DataSet(object): @@ -160,9 +236,8 @@ class DataSet(object): self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype self.species = species.TheSpecies(self) - - - + + def get_desc(self): """Gets overridden later, at least for Temp...used by trait's get_given_name""" return None @@ -227,11 +302,7 @@ class DataSet(object): #self.cursor.execute(query) #self.id, self.name, self.fullname, self.shortname = self.cursor.fetchone() - - - #def genHTML(self, Class='c0dd'): - # return HT.Href(text = HT.Span('%s Database' % self.fullname, Class= "fwb " + Class), - # url= webqtlConfig.INFOPAGEHREF % self.name,target="_blank") + class PhenotypeDataSet(DataSet): DS_NAME_MAP['Publish'] = 'PhenotypeDataSet' @@ -291,6 +362,19 @@ class PhenotypeDataSet(DataSet): # (Urgently?) Need to write this pass + def get_trait_list(self): + query = """ + select PublishXRef.Id + from PublishXRef, PublishFreeze + where PublishFreeze.InbredSetId=PublishXRef.InbredSetId + and PublishFreeze.Id = {} + """.format(escape(str(self.id))) + results = g.db.execute(query).fetchall() + trait_data = {} + for trait in results: + trait_data[trait[0]] = self.retrieve_sample_data(trait[0]) + return trait_data + def get_trait_info(self, trait_list, species = ''): for this_trait in trait_list: if not this_trait.haveinfo: @@ -301,7 +385,7 @@ class PhenotypeDataSet(DataSet): continue # for now if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users): description = this_trait.pre_publication_description - this_trait.description_display = description + this_trait.description_display = unicode(description, "utf8") if not this_trait.year.isdigit(): this_trait.pubmed_text = "N/A" @@ -359,7 +443,7 @@ class PhenotypeDataSet(DataSet): PublishFreeze.Id = %d AND PublishData.StrainId = Strain.Id Order BY Strain.Name - """ % (trait.name, self.id) + """ % (trait, self.id) results = g.db.execute(query).fetchall() return results @@ -399,6 +483,19 @@ class GenotypeDataSet(DataSet): def check_confidentiality(self): return geno_mrna_confidentiality(self) + + def get_trait_list(self): + query = """ + select Geno.Name + from Geno, GenoXRef + where GenoXRef.GenoId = Geno.Id + and GenoFreezeId = {} + """.format(escape(str(self.id))) + results = g.db.execute(query).fetchall() + trait_data = {} + for trait in results: + trait_data[trait[0]] = self.retrieve_sample_data(trait[0]) + return trait_data def get_trait_info(self, trait_list, species=None): for this_trait in trait_list: @@ -437,7 +534,7 @@ class GenotypeDataSet(DataSet): GenoData.StrainId = Strain.Id Order BY Strain.Name - """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait.name, self.name) + """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait, self.name) results = g.db.execute(query).fetchall() return results @@ -509,7 +606,95 @@ class MrnaAssayDataSet(DataSet): def check_confidentiality(self): return geno_mrna_confidentiality(self) + + def get_trait_list_1(self): + query = """ + select ProbeSet.Name + from ProbeSet, ProbeSetXRef + where ProbeSetXRef.ProbeSetId = ProbeSet.Id + and ProbeSetFreezeId = {} + """.format(escape(str(self.id))) + results = g.db.execute(query).fetchall() + print("After get_trait_list query") + trait_data = {} + for trait in results: + print("Retrieving sample_data for ", trait[0]) + trait_data[trait[0]] = self.retrieve_sample_data(trait[0]) + print("After retrieve_sample_data") + return trait_data + + def get_trait_data(self): + sample_ids = [] + for sample in self.group.samplelist: + query = """ + SELECT Strain.Id FROM Strain, Species + WHERE Strain.Name = '{}' + and Strain.SpeciesId=Species.Id + and Species.name = '{}' + """.format(*mescape(sample, self.group.species)) + this_id = g.db.execute(query).fetchone()[0] + sample_ids.append('%d' % this_id) + print("sample_ids size: ", len(sample_ids)) + + # MySQL limits the number of tables that can be used in a join to 61, + # so we break the sample ids into smaller chunks + chunk_count = 50 + n = len(sample_ids) / chunk_count + if len(sample_ids) % chunk_count: + n += 1 + print("n: ", n) + #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId + #tempTable = None + #if GeneId and db.type == "ProbeSet": + # if method == "3": + # tempTable = self.getTempLiteratureTable(species=species, + # input_species_geneid=GeneId, + # returnNumber=returnNumber) + # + # if method == "4" or method == "5": + # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol, + # TissueProbeSetFreezeId=tissueProbeSetFreezeId, + # method=method, + # returnNumber=returnNumber) + trait_sample_data = [] + for step in range(int(n)): + temp = [] + sample_ids_step = sample_ids[step*chunk_count:min(len(sample_ids), (step+1)*chunk_count)] + for item in sample_ids_step: + temp.append('T%s.value' % item) + query = "SELECT {}.Name,".format(escape(self.type)) + data_start_pos = 1 + query += string.join(temp, ', ') + query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type, + self.type, + self.type)) + #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item) + for item in sample_ids_step: + query += """ + left join {}Data as T{} on T{}.Id = {}XRef.DataId + and T{}.StrainId={}\n + """.format(*mescape(self.type, item, item, self.type, item, item)) + query += """ + WHERE {}XRef.{}FreezeId = {}Freeze.Id + and {}Freeze.Name = '{}' + and {}.Id = {}XRef.{}Id + order by {}.Id + """.format(*mescape(self.type, self.type, self.type, self.type, + self.name, self.type, self.type, self.type, self.type)) + print("query: ", query) + results = g.db.execute(query).fetchall() + trait_sample_data.append(results) + + trait_count = len(trait_sample_data[0]) + self.trait_data = collections.defaultdict(list) + # put all of the separate data together into a dictionary where the keys are + # trait names and values are lists of sample values + for j in range(trait_count): + trait_name = trait_sample_data[0][j][0] + for i in range(int(n)): + self.trait_data[trait_name] += trait_sample_data[i][j][data_start_pos:] + def get_trait_info(self, trait_list=None, species=''): # Note: setting trait_list to [] is probably not a great idea. @@ -550,8 +735,10 @@ class MrnaAssayDataSet(DataSet): trait_location_value = 1000000 if this_trait.chr and this_trait.mb: + print("this_trait.chr is:", this_trait.chr) + print("this_trait.mb is:", this_trait.mb) try: - trait_location_value = int(this_trait.chr)*1000 + this_trait.mb + trait_location_value = float(this_trait.chr)*1000 + float(this_trait.mb) except: if this_trait.chr.upper() == 'X': trait_location_value = 20*1000 + this_trait.mb @@ -633,9 +820,9 @@ class MrnaAssayDataSet(DataSet): ProbeSetFreeze.Name = %s """ % (escape(self.name), escape(self.dataset.name)) results = g.db.execute(query).fetchone() - return results[0] + def retrieve_sample_data(self, trait): query = """ SELECT @@ -652,7 +839,7 @@ class MrnaAssayDataSet(DataSet): ProbeSetData.StrainId = Strain.Id Order BY Strain.Name - """ % (escape(trait.name), escape(self.name)) + """ % (escape(trait), escape(self.name)) results = g.db.execute(query).fetchall() return results @@ -741,3 +928,4 @@ def geno_mrna_confidentiality(ob): if confidential: # Allow confidential data later NoConfindetialDataForYouTodaySorry + |