aboutsummaryrefslogtreecommitdiff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
authorZachary Sloan2013-06-20 22:20:23 +0000
committerZachary Sloan2013-06-20 22:20:23 +0000
commit4ffee373494170e708678039dca132f1bd729ab1 (patch)
treea9d6054380f5a9d612a4d1d88889f68ea4923a75 /wqflask/base/data_set.py
parent939058c4a3b668037974f2876b072c4be008da26 (diff)
parent52ac4b6e1c014801080cbbcad53df868058d2657 (diff)
downloadgenenetwork2-4ffee373494170e708678039dca132f1bd729ab1.tar.gz
Merge branch 'flask'
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-xwqflask/base/data_set.py420
1 files changed, 311 insertions, 109 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 50ef8f57..07fe9cd9 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -22,10 +22,14 @@
from __future__ import absolute_import, print_function, division
import os
+import math
+import string
+import collections
-from flask import Flask, g
+import json
+import itertools
-from htmlgen import HTMLgen2 as HT
+from flask import Flask, g
import reaper
@@ -33,6 +37,8 @@ from base import webqtlConfig
from base import species
from dbFunction import webqtlDatabaseFunction
from utility import webqtlUtil
+from utility.benchmark import Bench
+from wqflask.my_pylmm.pyLMM import chunks
from MySQLdb import escape_string as escape
from pprint import pformat as pf
@@ -41,29 +47,102 @@ from pprint import pformat as pf
DS_NAME_MAP = {}
def create_dataset(dataset_name):
- #cursor = db_conn.cursor()
- print("dataset_name:", dataset_name)
+ #print("dataset_name:", dataset_name)
query = """
SELECT DBType.Name
FROM DBList, DBType
- WHERE DBList.Name = '%s' and
+ WHERE DBList.Name = '{}' and
DBType.Id = DBList.DBTypeId
- """ % (escape(dataset_name))
- print("query is: ", pf(query))
+ """.format(escape(dataset_name))
+ #print("query is: ", pf(query))
dataset_type = g.db.execute(query).fetchone().Name
#dataset_type = cursor.fetchone()[0]
- print("[blubber] dataset_type:", pf(dataset_type))
+ #print("[blubber] dataset_type:", pf(dataset_type))
dataset_ob = DS_NAME_MAP[dataset_type]
#dataset_class = getattr(data_set, dataset_ob)
- print("dataset_ob:", dataset_ob)
- print("DS_NAME_MAP:", pf(DS_NAME_MAP))
+ #print("dataset_ob:", dataset_ob)
+ #print("DS_NAME_MAP:", pf(DS_NAME_MAP))
dataset_class = globals()[dataset_ob]
return dataset_class(dataset_name)
+def create_in_clause(items):
+ """Create an in clause for mysql"""
+ in_clause = ', '.join("'{}'".format(x) for x in mescape(*items))
+ in_clause = '( {} )'.format(in_clause)
+ return in_clause
+
+
+def mescape(*items):
+ """Multiple escape"""
+ escaped = [escape(str(item)) for item in items]
+ #print("escaped is:", escaped)
+ return escaped
+
+
+class Markers(object):
+ """Todo: Build in cacheing so it saves us reading the same file more than once"""
+ def __init__(self, name):
+ json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json'))
+ self.markers = json.load(json_data_fh)
+
+ def add_pvalues(self, p_values):
+ #print("length of self.markers:", len(self.markers))
+ #print("length of p_values:", len(p_values))
+
+ # THIS IS only needed for the case when we are limiting the number of p-values calculated
+ if len(self.markers) < len(p_values):
+ self.markers = self.markers[:len(p_values)]
+
+ for marker, p_value in itertools.izip(self.markers, p_values):
+ marker['p_value'] = p_value
+ print("p_value is:", marker['p_value'])
+ marker['lod_score'] = -math.log10(marker['p_value'])
+ #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+ marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+
+
+
+
+class HumanMarkers(Markers):
+
+ def __init__(self, name):
+ marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim'))
+ self.markers = []
+ for line in marker_data_fh:
+ splat = line.strip().split()
+ marker = {}
+ marker['chr'] = int(splat[0])
+ marker['name'] = splat[1]
+ marker['Mb'] = float(splat[3]) / 1000000
+ self.markers.append(marker)
+
+ #print("markers is: ", pf(self.markers))
+
+
+ def add_pvalues(self, p_values):
+ #for marker, p_value in itertools.izip(self.markers, p_values):
+ # if marker['Mb'] <= 0 and marker['chr'] == 0:
+ # continue
+ # marker['p_value'] = p_value
+ # print("p_value is:", marker['p_value'])
+ # marker['lod_score'] = -math.log10(marker['p_value'])
+ # #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+ # marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+
+ super(HumanMarkers, self).add_pvalues(p_values)
+
+ with Bench("deleting markers"):
+ markers = []
+ for marker in self.markers:
+ if not marker['Mb'] <= 0 and not marker['chr'] == 0:
+ markers.append(marker)
+ self.markers = markers
+
+
class DatasetGroup(object):
"""
@@ -79,22 +158,41 @@ class DatasetGroup(object):
if self.name == 'BXD300':
self.name = "BXD"
+ self.f1list = None
+ self.parlist = None
+ self.get_f1_parent_strains()
+ #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+
self.species = webqtlDatabaseFunction.retrieve_species(self.name)
self.incparentsf1 = False
- self.f1list = None
- self.parlist = None
self.allsamples = None
+
+
+ def get_markers(self):
+ #print("self.species is:", self.species)
+ if self.species == "human":
+ marker_class = HumanMarkers
+ else:
+ marker_class = Markers
+ self.markers = marker_class(self.name)
+
- #def read_genotype(self):
- # self.read_genotype_file()
- #
- # if not self.genotype: # Didn'd succeed, so we try method 2
- # self.read_genotype_data()
+ def get_f1_parent_strains(self):
+ try:
+ # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
+ f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
+ except KeyError:
+ f1 = f12 = maternal = paternal = None
+
+ if f1 and f12:
+ self.f1list = [f1, f12]
+ if maternal and paternal:
+ self.parlist = [maternal, paternal]
def read_genotype_file(self):
- '''read genotype from .geno file instead of database'''
+ '''Read genotype from .geno file instead of database'''
#if self.group == 'BXD300':
# self.group = 'BXD'
#
@@ -104,38 +202,24 @@ class DatasetGroup(object):
#genotype_2 is Dataset Object with parents and f1 (not for intercross)
genotype_1 = reaper.Dataset()
-
+
# reaper barfs on unicode filenames, so here we ensure it's a string
full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno'))
genotype_1.read(full_filename)
- print("Got to after read")
-
- try:
- # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
- f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
- except KeyError:
- f1 = f12 = maternal = paternal = None
-
-
- if genotype_1.type == "group" and maternal and paternal:
- genotype_2 = genotype_1.add(Mat=maternal, Pat=paternal) #, F1=_f1)
+ if genotype_1.type == "group" and self.parlist:
+ genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1)
else:
genotype_2 = genotype_1
#determine default genotype object
if self.incparentsf1 and genotype_1.type != "intercross":
- self.genotype = genotype_2
+ genotype = genotype_2
else:
self.incparentsf1 = 0
- self.genotype = genotype_1
-
- self.samplelist = list(self.genotype.prgy)
+ genotype = genotype_1
- if f1 and f12:
- self.f1list = [f1, f12]
- if maternal and paternal:
- self.parlist = [maternal, paternal]
+ self.samplelist = list(genotype.prgy)
class DataSet(object):
@@ -159,10 +243,10 @@ class DataSet(object):
self.retrieve_other_names()
self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype
+ self.group.read_genotype_file()
self.species = species.TheSpecies(self)
-
-
-
+
+
def get_desc(self):
"""Gets overridden later, at least for Temp...used by trait's get_given_name"""
return None
@@ -209,14 +293,14 @@ class DataSet(object):
self.name,
self.name,
self.name))
- print("query_args are:", query_args)
+ #print("query_args are:", query_args)
- print("""
- SELECT Id, Name, FullName, ShortName
- FROM %s
- WHERE public > %s AND
- (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
- """ % (query_args))
+ #print("""
+ # SELECT Id, Name, FullName, ShortName
+ # FROM %s
+ # WHERE public > %s AND
+ # (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
+ # """ % (query_args))
self.id, self.name, self.fullname, self.shortname = g.db.execute("""
SELECT Id, Name, FullName, ShortName
@@ -227,11 +311,7 @@ class DataSet(object):
#self.cursor.execute(query)
#self.id, self.name, self.fullname, self.shortname = self.cursor.fetchone()
-
-
- #def genHTML(self, Class='c0dd'):
- # return HT.Href(text = HT.Span('%s Database' % self.fullname, Class= "fwb " + Class),
- # url= webqtlConfig.INFOPAGEHREF % self.name,target="_blank")
+
class PhenotypeDataSet(DataSet):
DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -291,6 +371,19 @@ class PhenotypeDataSet(DataSet):
# (Urgently?) Need to write this
pass
+ def get_trait_list(self):
+ query = """
+ select PublishXRef.Id
+ from PublishXRef, PublishFreeze
+ where PublishFreeze.InbredSetId=PublishXRef.InbredSetId
+ and PublishFreeze.Id = {}
+ """.format(escape(str(self.id)))
+ results = g.db.execute(query).fetchall()
+ trait_data = {}
+ for trait in results:
+ trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+ return trait_data
+
def get_trait_info(self, trait_list, species = ''):
for this_trait in trait_list:
if not this_trait.haveinfo:
@@ -301,7 +394,7 @@ class PhenotypeDataSet(DataSet):
continue # for now
if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
description = this_trait.pre_publication_description
- this_trait.description_display = description
+ this_trait.description_display = unicode(description, "utf8")
if not this_trait.year.isdigit():
this_trait.pubmed_text = "N/A"
@@ -359,7 +452,7 @@ class PhenotypeDataSet(DataSet):
PublishFreeze.Id = %d AND PublishData.StrainId = Strain.Id
Order BY
Strain.Name
- """ % (trait.name, self.id)
+ """ % (trait, self.id)
results = g.db.execute(query).fetchall()
return results
@@ -399,6 +492,19 @@ class GenotypeDataSet(DataSet):
def check_confidentiality(self):
return geno_mrna_confidentiality(self)
+
+ def get_trait_list(self):
+ query = """
+ select Geno.Name
+ from Geno, GenoXRef
+ where GenoXRef.GenoId = Geno.Id
+ and GenoFreezeId = {}
+ """.format(escape(str(self.id)))
+ results = g.db.execute(query).fetchall()
+ trait_data = {}
+ for trait in results:
+ trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+ return trait_data
def get_trait_info(self, trait_list, species=None):
for this_trait in trait_list:
@@ -437,7 +543,7 @@ class GenotypeDataSet(DataSet):
GenoData.StrainId = Strain.Id
Order BY
Strain.Name
- """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait.name, self.name)
+ """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait, self.name)
results = g.db.execute(query).fetchall()
return results
@@ -509,10 +615,95 @@ class MrnaAssayDataSet(DataSet):
def check_confidentiality(self):
return geno_mrna_confidentiality(self)
+
+ def get_trait_list_1(self):
+ query = """
+ select ProbeSet.Name
+ from ProbeSet, ProbeSetXRef
+ where ProbeSetXRef.ProbeSetId = ProbeSet.Id
+ and ProbeSetFreezeId = {}
+ """.format(escape(str(self.id)))
+ results = g.db.execute(query).fetchall()
+ #print("After get_trait_list query")
+ trait_data = {}
+ for trait in results:
+ print("Retrieving sample_data for ", trait[0])
+ trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+ #print("After retrieve_sample_data")
+ return trait_data
+
+ def get_trait_data(self):
+ self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+ query = """
+ SELECT Strain.Name, Strain.Id FROM Strain, Species
+ WHERE Strain.Name IN {}
+ and Strain.SpeciesId=Species.Id
+ and Species.name = '{}'
+ """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+ results = dict(g.db.execute(query).fetchall())
+ sample_ids = [results[item] for item in self.samplelist]
+
+ # MySQL limits the number of tables that can be used in a join to 61,
+ # so we break the sample ids into smaller chunks
+ # Postgres doesn't have that limit, so we can get rid of this after we transition
+ chunk_size = 50
+ number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+ trait_sample_data = []
+ for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
+ #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+ #tempTable = None
+ #if GeneId and db.type == "ProbeSet":
+ # if method == "3":
+ # tempTable = self.getTempLiteratureTable(species=species,
+ # input_species_geneid=GeneId,
+ # returnNumber=returnNumber)
+ #
+ # if method == "4" or method == "5":
+ # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+ # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+ # method=method,
+ # returnNumber=returnNumber)
+
+ temp = ['T%s.value' % item for item in sample_ids_step]
+ query = "SELECT {}.Name,".format(escape(self.type))
+ data_start_pos = 1
+ query += string.join(temp, ', ')
+ query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+ self.type,
+ self.type))
+
+ for item in sample_ids_step:
+ query += """
+ left join {}Data as T{} on T{}.Id = {}XRef.DataId
+ and T{}.StrainId={}\n
+ """.format(*mescape(self.type, item, item, self.type, item, item))
+
+ query += """
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(self.type, self.type, self.type, self.type,
+ self.name, self.type, self.type, self.type, self.type))
+ results = g.db.execute(query).fetchall()
+ trait_sample_data.append(results)
+
+ trait_count = len(trait_sample_data[0])
+ self.trait_data = collections.defaultdict(list)
+
+ # put all of the separate data together into a dictionary where the keys are
+ # trait names and values are lists of sample values
+ for trait_counter in range(trait_count):
+ trait_name = trait_sample_data[0][trait_counter][0]
+ for chunk_counter in range(int(number_chunks)):
+ self.trait_data[trait_name] += (
+ trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+
def get_trait_info(self, trait_list=None, species=''):
- # Note: setting trait_list to [] is probably not a great idea.
+ # Note: setting trait_list to [] is probably not a great idea.
if not trait_list:
trait_list = []
@@ -521,9 +712,7 @@ class MrnaAssayDataSet(DataSet):
if not this_trait.haveinfo:
this_trait.retrieveInfo(QTL=1)
- if this_trait.symbol:
- pass
- else:
+ if not this_trait.symbol:
this_trait.symbol = "N/A"
#XZ, 12/08/2008: description
@@ -531,60 +720,56 @@ class MrnaAssayDataSet(DataSet):
description_string = str(this_trait.description).strip()
target_string = str(this_trait.probe_target_description).strip()
- description_display = ''
-
if len(description_string) > 1 and description_string != 'None':
description_display = description_string
else:
description_display = this_trait.symbol
- if len(description_display) > 1 and description_display != 'N/A' and len(target_string) > 1 and target_string != 'None':
+ if (len(description_display) > 1 and description_display != 'N/A' and
+ len(target_string) > 1 and target_string != 'None'):
description_display = description_display + '; ' + target_string.strip()
# Save it for the jinja2 template
this_trait.description_display = description_display
- #print(" xxxxdd [%s]: %s" % (type(this_trait.description_display), description_display))
#XZ: trait_location_value is used for sorting
trait_location_repr = 'N/A'
trait_location_value = 1000000
if this_trait.chr and this_trait.mb:
- try:
- trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
- except:
- if this_trait.chr.upper() == 'X':
- trait_location_value = 20*1000 + this_trait.mb
- else:
- trait_location_value = ord(str(this_trait.chr).upper()[0])*1000 + this_trait.mb
-
- this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr, float(this_trait.mb) )
+ #Checks if the chromosome number can be cast to an int (i.e. isn't "X" or "Y")
+ #This is so we can convert the location to a number used for sorting
+ trait_location_value = self.convert_location_to_value(this_trait.chr, this_trait.mb)
+ #try:
+ # trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
+ #except ValueError:
+ # if this_trait.chr.upper() == 'X':
+ # trait_location_value = 20*1000 + this_trait.mb
+ # else:
+ # trait_location_value = (ord(str(this_trait.chr).upper()[0])*1000 +
+ # this_trait.mb)
+
+ #ZS: Put this in function currently called "convert_location_to_value"
+ this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr,
+ float(this_trait.mb))
this_trait.location_value = trait_location_value
- #this_trait.trait_location_value = trait_location_value
- #XZ, 01/12/08: This SQL query is much faster.
+ #Get mean expression value
query = (
-"""select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
- where ProbeSetXRef.ProbeSetFreezeId = %s and
- ProbeSet.Id = ProbeSetXRef.ProbeSetId and
- ProbeSet.Name = '%s'
+ """select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
+ where ProbeSetXRef.ProbeSetFreezeId = %s and
+ ProbeSet.Id = ProbeSetXRef.ProbeSetId and
+ ProbeSet.Name = '%s'
""" % (escape(str(this_trait.dataset.id)),
escape(this_trait.name)))
- print("query is:", pf(query))
+ #print("query is:", pf(query))
result = g.db.execute(query).fetchone()
+
+ mean = result[0] if result else 0
- if result:
- if result[0]:
- mean = result[0]
- else:
- mean=0
- else:
- mean = 0
-
- #XZ, 06/05/2009: It is neccessary to turn on nowrap
- this_trait.mean = repr = "%2.3f" % mean
+ this_trait.mean = "%2.3f" % mean
#LRS and its location
this_trait.LRS_score_repr = 'N/A'
@@ -603,23 +788,39 @@ class MrnaAssayDataSet(DataSet):
result = self.cursor.fetchone()
if result:
- if result[0] and result[1]:
- LRS_Chr = result[0]
- LRS_Mb = result[1]
-
- #XZ: LRS_location_value is used for sorting
- try:
- LRS_location_value = int(LRS_Chr)*1000 + float(LRS_Mb)
- except:
- if LRS_Chr.upper() == 'X':
- LRS_location_value = 20*1000 + float(LRS_Mb)
- else:
- LRS_location_value = ord(str(LRS_chr).upper()[0])*1000 + float(LRS_Mb)
+ #if result[0] and result[1]:
+ # lrs_chr = result[0]
+ # lrs_mb = result[1]
+ lrs_chr, lrs_mb = result
+ #XZ: LRS_location_value is used for sorting
+ lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb)
+
+ #try:
+ # lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb)
+ #except:
+ # if lrs_chr.upper() == 'X':
+ # lrs_location_value = 20*1000 + float(lrs_mb)
+ # else:
+ # lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 +
+ # float(lrs_mb))
+
+ this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs
+ this_trait.LRS_score_value = this_trait.lrs
+ this_trait.LRS_location_repr = 'Chr %s: %.4f Mb' % (lrs_chr, float(lrs_mb))
+
+
+ def convert_location_to_value(self, chromosome, mb):
+ try:
+ location_value = int(chromosome)*1000 + float(mb)
+ except ValueError:
+ if chromosome.upper() == 'X':
+ location_value = 20*1000 + float(mb)
+ else:
+ location_value = (ord(str(chromosome).upper()[0])*1000 +
+ float(mb))
+
+ return location_value
- this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs
- this_trait.LRS_score_value = LRS_score_value = this_trait.lrs
- this_trait.LRS_location_repr = LRS_location_repr = 'Chr %s: %.4f Mb' % (LRS_Chr, float(LRS_Mb) )
-
def get_sequence(self):
query = """
SELECT
@@ -633,9 +834,9 @@ class MrnaAssayDataSet(DataSet):
ProbeSetFreeze.Name = %s
""" % (escape(self.name), escape(self.dataset.name))
results = g.db.execute(query).fetchone()
-
return results[0]
+
def retrieve_sample_data(self, trait):
query = """
SELECT
@@ -652,7 +853,7 @@ class MrnaAssayDataSet(DataSet):
ProbeSetData.StrainId = Strain.Id
Order BY
Strain.Name
- """ % (escape(trait.name), escape(self.name))
+ """ % (escape(trait), escape(self.name))
results = g.db.execute(query).fetchall()
return results
@@ -725,7 +926,7 @@ class TempDataSet(DataSet):
def geno_mrna_confidentiality(ob):
dataset_table = ob.type + "Freeze"
- print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+ #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
query = '''SELECT Id, Name, FullName, confidentiality,
AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)
@@ -741,3 +942,4 @@ def geno_mrna_confidentiality(ob):
if confidential:
# Allow confidential data later
NoConfindetialDataForYouTodaySorry
+