diff options
Diffstat (limited to 'wqflask/base')
-rwxr-xr-x | wqflask/base/data_set.py | 212 | ||||
-rwxr-xr-x | wqflask/base/trait.py | 52 |
2 files changed, 152 insertions, 112 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 9b0a3dcc..07fe9cd9 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -38,6 +38,7 @@ from base import species from dbFunction import webqtlDatabaseFunction from utility import webqtlUtil from utility.benchmark import Bench +from wqflask.my_pylmm.pyLMM import chunks from MySQLdb import escape_string as escape from pprint import pformat as pf @@ -46,7 +47,7 @@ from pprint import pformat as pf DS_NAME_MAP = {} def create_dataset(dataset_name): - print("dataset_name:", dataset_name) + #print("dataset_name:", dataset_name) query = """ SELECT DBType.Name @@ -68,10 +69,17 @@ def create_dataset(dataset_name): dataset_class = globals()[dataset_ob] return dataset_class(dataset_name) +def create_in_clause(items): + """Create an in clause for mysql""" + in_clause = ', '.join("'{}'".format(x) for x in mescape(*items)) + in_clause = '( {} )'.format(in_clause) + return in_clause + + def mescape(*items): """Multiple escape""" - escaped = [escape(item) for item in items] - print("escaped is:", escaped) + escaped = [escape(str(item)) for item in items] + #print("escaped is:", escaped) return escaped @@ -82,8 +90,8 @@ class Markers(object): self.markers = json.load(json_data_fh) def add_pvalues(self, p_values): - print("length of self.markers:", len(self.markers)) - print("length of p_values:", len(p_values)) + #print("length of self.markers:", len(self.markers)) + #print("length of p_values:", len(p_values)) # THIS IS only needed for the case when we are limiting the number of p-values calculated if len(self.markers) < len(p_values): @@ -153,7 +161,7 @@ class DatasetGroup(object): self.f1list = None self.parlist = None self.get_f1_parent_strains() - print("parents/f1s: {}:{}".format(self.parlist, self.f1list)) + #print("parents/f1s: {}:{}".format(self.parlist, self.f1list)) self.species = webqtlDatabaseFunction.retrieve_species(self.name) @@ -162,7 +170,7 @@ class DatasetGroup(object): def get_markers(self): - print("self.species is:", self.species) + #print("self.species is:", self.species) if self.species == "human": marker_class = HumanMarkers else: @@ -235,6 +243,7 @@ class DataSet(object): self.retrieve_other_names() self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype + self.group.read_genotype_file() self.species = species.TheSpecies(self) @@ -284,14 +293,14 @@ class DataSet(object): self.name, self.name, self.name)) - print("query_args are:", query_args) + #print("query_args are:", query_args) - print(""" - SELECT Id, Name, FullName, ShortName - FROM %s - WHERE public > %s AND - (Name = '%s' OR FullName = '%s' OR ShortName = '%s') - """ % (query_args)) + #print(""" + # SELECT Id, Name, FullName, ShortName + # FROM %s + # WHERE public > %s AND + # (Name = '%s' OR FullName = '%s' OR ShortName = '%s') + # """ % (query_args)) self.id, self.name, self.fullname, self.shortname = g.db.execute(""" SELECT Id, Name, FullName, ShortName @@ -615,34 +624,33 @@ class MrnaAssayDataSet(DataSet): and ProbeSetFreezeId = {} """.format(escape(str(self.id))) results = g.db.execute(query).fetchall() - print("After get_trait_list query") + #print("After get_trait_list query") trait_data = {} for trait in results: print("Retrieving sample_data for ", trait[0]) trait_data[trait[0]] = self.retrieve_sample_data(trait[0]) - print("After retrieve_sample_data") + #print("After retrieve_sample_data") return trait_data def get_trait_data(self): - sample_ids = [] - for sample in self.group.samplelist: - query = """ - SELECT Strain.Id FROM Strain, Species - WHERE Strain.Name = '{}' - and Strain.SpeciesId=Species.Id - and Species.name = '{}' - """.format(*mescape(sample, self.group.species)) - this_id = g.db.execute(query).fetchone()[0] - sample_ids.append('%d' % this_id) - print("sample_ids size: ", len(sample_ids)) + self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list + query = """ + SELECT Strain.Name, Strain.Id FROM Strain, Species + WHERE Strain.Name IN {} + and Strain.SpeciesId=Species.Id + and Species.name = '{}' + """.format(create_in_clause(self.samplelist), *mescape(self.group.species)) + results = dict(g.db.execute(query).fetchall()) + sample_ids = [results[item] for item in self.samplelist] # MySQL limits the number of tables that can be used in a join to 61, # so we break the sample ids into smaller chunks - chunk_count = 50 - n = len(sample_ids) / chunk_count - if len(sample_ids) % chunk_count: - n += 1 - print("n: ", n) + # Postgres doesn't have that limit, so we can get rid of this after we transition + chunk_size = 50 + number_chunks = int(math.ceil(len(sample_ids) / chunk_size)) + trait_sample_data = [] + for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks): + #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId #tempTable = None #if GeneId and db.type == "ProbeSet": @@ -656,24 +664,21 @@ class MrnaAssayDataSet(DataSet): # TissueProbeSetFreezeId=tissueProbeSetFreezeId, # method=method, # returnNumber=returnNumber) - trait_sample_data = [] - for step in range(int(n)): - temp = [] - sample_ids_step = sample_ids[step*chunk_count:min(len(sample_ids), (step+1)*chunk_count)] - for item in sample_ids_step: - temp.append('T%s.value' % item) + + temp = ['T%s.value' % item for item in sample_ids_step] query = "SELECT {}.Name,".format(escape(self.type)) data_start_pos = 1 query += string.join(temp, ', ') query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type, self.type, self.type)) - #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item) + for item in sample_ids_step: query += """ left join {}Data as T{} on T{}.Id = {}XRef.DataId and T{}.StrainId={}\n """.format(*mescape(self.type, item, item, self.type, item, item)) + query += """ WHERE {}XRef.{}FreezeId = {}Freeze.Id and {}Freeze.Name = '{}' @@ -681,23 +686,24 @@ class MrnaAssayDataSet(DataSet): order by {}.Id """.format(*mescape(self.type, self.type, self.type, self.type, self.name, self.type, self.type, self.type, self.type)) - print("query: ", query) results = g.db.execute(query).fetchall() trait_sample_data.append(results) - + trait_count = len(trait_sample_data[0]) self.trait_data = collections.defaultdict(list) + # put all of the separate data together into a dictionary where the keys are # trait names and values are lists of sample values - for j in range(trait_count): - trait_name = trait_sample_data[0][j][0] - for i in range(int(n)): - self.trait_data[trait_name] += trait_sample_data[i][j][data_start_pos:] - + for trait_counter in range(trait_count): + trait_name = trait_sample_data[0][trait_counter][0] + for chunk_counter in range(int(number_chunks)): + self.trait_data[trait_name] += ( + trait_sample_data[chunk_counter][trait_counter][data_start_pos:]) + def get_trait_info(self, trait_list=None, species=''): - # Note: setting trait_list to [] is probably not a great idea. + # Note: setting trait_list to [] is probably not a great idea. if not trait_list: trait_list = [] @@ -706,9 +712,7 @@ class MrnaAssayDataSet(DataSet): if not this_trait.haveinfo: this_trait.retrieveInfo(QTL=1) - if this_trait.symbol: - pass - else: + if not this_trait.symbol: this_trait.symbol = "N/A" #XZ, 12/08/2008: description @@ -716,62 +720,56 @@ class MrnaAssayDataSet(DataSet): description_string = str(this_trait.description).strip() target_string = str(this_trait.probe_target_description).strip() - description_display = '' - if len(description_string) > 1 and description_string != 'None': description_display = description_string else: description_display = this_trait.symbol - if len(description_display) > 1 and description_display != 'N/A' and len(target_string) > 1 and target_string != 'None': + if (len(description_display) > 1 and description_display != 'N/A' and + len(target_string) > 1 and target_string != 'None'): description_display = description_display + '; ' + target_string.strip() # Save it for the jinja2 template this_trait.description_display = description_display - #print(" xxxxdd [%s]: %s" % (type(this_trait.description_display), description_display)) #XZ: trait_location_value is used for sorting trait_location_repr = 'N/A' trait_location_value = 1000000 if this_trait.chr and this_trait.mb: - print("this_trait.chr is:", this_trait.chr) - print("this_trait.mb is:", this_trait.mb) - try: - trait_location_value = float(this_trait.chr)*1000 + float(this_trait.mb) - except: - if this_trait.chr.upper() == 'X': - trait_location_value = 20*1000 + this_trait.mb - else: - trait_location_value = ord(str(this_trait.chr).upper()[0])*1000 + this_trait.mb - - this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr, float(this_trait.mb) ) + #Checks if the chromosome number can be cast to an int (i.e. isn't "X" or "Y") + #This is so we can convert the location to a number used for sorting + trait_location_value = self.convert_location_to_value(this_trait.chr, this_trait.mb) + #try: + # trait_location_value = int(this_trait.chr)*1000 + this_trait.mb + #except ValueError: + # if this_trait.chr.upper() == 'X': + # trait_location_value = 20*1000 + this_trait.mb + # else: + # trait_location_value = (ord(str(this_trait.chr).upper()[0])*1000 + + # this_trait.mb) + + #ZS: Put this in function currently called "convert_location_to_value" + this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr, + float(this_trait.mb)) this_trait.location_value = trait_location_value - #this_trait.trait_location_value = trait_location_value - #XZ, 01/12/08: This SQL query is much faster. + #Get mean expression value query = ( -"""select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet - where ProbeSetXRef.ProbeSetFreezeId = %s and - ProbeSet.Id = ProbeSetXRef.ProbeSetId and - ProbeSet.Name = '%s' + """select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet + where ProbeSetXRef.ProbeSetFreezeId = %s and + ProbeSet.Id = ProbeSetXRef.ProbeSetId and + ProbeSet.Name = '%s' """ % (escape(str(this_trait.dataset.id)), escape(this_trait.name))) - print("query is:", pf(query)) + #print("query is:", pf(query)) result = g.db.execute(query).fetchone() + + mean = result[0] if result else 0 - if result: - if result[0]: - mean = result[0] - else: - mean=0 - else: - mean = 0 - - #XZ, 06/05/2009: It is neccessary to turn on nowrap - this_trait.mean = repr = "%2.3f" % mean + this_trait.mean = "%2.3f" % mean #LRS and its location this_trait.LRS_score_repr = 'N/A' @@ -790,23 +788,39 @@ class MrnaAssayDataSet(DataSet): result = self.cursor.fetchone() if result: - if result[0] and result[1]: - LRS_Chr = result[0] - LRS_Mb = result[1] - - #XZ: LRS_location_value is used for sorting - try: - LRS_location_value = int(LRS_Chr)*1000 + float(LRS_Mb) - except: - if LRS_Chr.upper() == 'X': - LRS_location_value = 20*1000 + float(LRS_Mb) - else: - LRS_location_value = ord(str(LRS_chr).upper()[0])*1000 + float(LRS_Mb) + #if result[0] and result[1]: + # lrs_chr = result[0] + # lrs_mb = result[1] + lrs_chr, lrs_mb = result + #XZ: LRS_location_value is used for sorting + lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb) + + #try: + # lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb) + #except: + # if lrs_chr.upper() == 'X': + # lrs_location_value = 20*1000 + float(lrs_mb) + # else: + # lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 + + # float(lrs_mb)) + + this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs + this_trait.LRS_score_value = this_trait.lrs + this_trait.LRS_location_repr = 'Chr %s: %.4f Mb' % (lrs_chr, float(lrs_mb)) + + + def convert_location_to_value(self, chromosome, mb): + try: + location_value = int(chromosome)*1000 + float(mb) + except ValueError: + if chromosome.upper() == 'X': + location_value = 20*1000 + float(mb) + else: + location_value = (ord(str(chromosome).upper()[0])*1000 + + float(mb)) + + return location_value - this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs - this_trait.LRS_score_value = LRS_score_value = this_trait.lrs - this_trait.LRS_location_repr = LRS_location_repr = 'Chr %s: %.4f Mb' % (LRS_Chr, float(LRS_Mb) ) - def get_sequence(self): query = """ SELECT @@ -912,7 +926,7 @@ class TempDataSet(DataSet): def geno_mrna_confidentiality(ob): dataset_table = ob.type + "Freeze" - print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table)) + #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table)) query = '''SELECT Id, Name, FullName, confidentiality, AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table) diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py index dde8b8d8..38b3a625 100755 --- a/wqflask/base/trait.py +++ b/wqflask/base/trait.py @@ -1,6 +1,8 @@ from __future__ import absolute_import, division, print_function import string +import resource + from htmlgen import HTMLgen2 as HT @@ -15,22 +17,38 @@ from pprint import pformat as pf from flask import Flask, g -class GeneralTrait: +def print_mem(stage=""): + mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + print("{}: {}".format(stage, mem/1024)) + +class GeneralTrait(object): """ Trait class defines a trait in webqtl, can be either Microarray, Published phenotype, genotype, or user input trait """ - def __init__(self, **kw): - print("in GeneralTrait") - self.dataset = kw.get('dataset') # database name + def __init__(self, get_qtl_info=False, **kw): + # xor assertion + assert bool(kw.get('dataset')) != bool(kw.get('dataset_name')), "Needs dataset ob. xor name"; + if kw.get('dataset_name'): + self.dataset = create_dataset(kw.get('dataset_name')) + else: + self.dataset = kw.get('dataset') self.name = kw.get('name') # Trait ID, ProbeSet ID, Published ID, etc. self.cellid = kw.get('cellid') self.identification = kw.get('identification', 'un-named trait') self.haveinfo = kw.get('haveinfo', False) self.sequence = kw.get('sequence') # Blat sequence, available for ProbeSet self.data = kw.get('data', {}) + + # Sets defaultst + self.locus = None + self.lrs = None + self.pvalue = None + self.mean = None + self.num_overlap = None + if kw.get('fullname'): name2 = value.split("::") @@ -39,13 +57,12 @@ class GeneralTrait: # self.cellid is set to None above elif len(name2) == 3: self.dataset, self.name, self.cellid = name2 - - self.dataset = create_dataset(self.dataset) # Todo: These two lines are necessary most of the time, but perhaps not all of the time # So we could add a simple if statement to short-circuit this if necessary - self.retrieve_info() + self.retrieve_info(get_qtl_info=get_qtl_info) self.retrieve_sample_data() + def get_name(self): @@ -78,7 +95,7 @@ class GeneralTrait: #desc = self.handle_pca(desc) stringy = desc return stringy - + def display_name(self): @@ -229,7 +246,7 @@ class GeneralTrait: #def items(self): # return self.__dict__.items() - def retrieve_info(self, QTL=False): + def retrieve_info(self, get_qtl_info=False): assert self.dataset, "Dataset doesn't exist" if self.dataset.type == 'Publish': query = """ @@ -269,7 +286,7 @@ class GeneralTrait: escape(self.dataset.name), escape(self.name)) traitInfo = g.db.execute(query).fetchone() - print("traitInfo is: ", pf(traitInfo)) + #print("traitInfo is: ", pf(traitInfo)) #XZ, 05/08/2009: We also should use Geno.Id to find marker instead of just using Geno.Name # to avoid the problem of same marker name from different species. elif self.dataset.type == 'Geno': @@ -287,7 +304,7 @@ class GeneralTrait: escape(self.dataset.name), escape(self.name)) traitInfo = g.db.execute(query).fetchone() - print("traitInfo is: ", pf(traitInfo)) + #print("traitInfo is: ", pf(traitInfo)) else: #Temp type query = """SELECT %s FROM %s WHERE Name = %s """ % (string.join(self.dataset.display_fields,','), @@ -339,7 +356,7 @@ class GeneralTrait: if result: self.homologeneid = result[0] - if QTL: + if get_qtl_info: if self.dataset.type == 'ProbeSet' and not self.cellid: traitQTL = g.db.execute(""" SELECT @@ -355,8 +372,17 @@ class GeneralTrait: #traitQTL = self.cursor.fetchone() if traitQTL: self.locus, self.lrs, self.pvalue, self.mean = traitQTL + if self.locus: + result = g.db.execute(""" + select Geno.Chr, Geno.Mb from Geno, Species + where Species.Name = '%s' and + Geno.Name = '%s' and + Geno.SpeciesId = Species.Id + """, (species, self.locus)).fetchone() + self.locus_chr = result[0] + self.locus_mb = result[1] else: - self.locus = self.lrs = self.pvalue = self.mean = "" + self.locus = self.locus_chr = self.locus_mb = self.lrs = self.pvalue = self.mean = "" if self.dataset.type == 'Publish': traitQTL = g.db.execute(""" SELECT |