aboutsummaryrefslogtreecommitdiff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-xwqflask/base/data_set.py290
1 files changed, 239 insertions, 51 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 50ef8f57..9b0a3dcc 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -22,10 +22,14 @@
from __future__ import absolute_import, print_function, division
import os
+import math
+import string
+import collections
-from flask import Flask, g
+import json
+import itertools
-from htmlgen import HTMLgen2 as HT
+from flask import Flask, g
import reaper
@@ -33,6 +37,7 @@ from base import webqtlConfig
from base import species
from dbFunction import webqtlDatabaseFunction
from utility import webqtlUtil
+from utility.benchmark import Bench
from MySQLdb import escape_string as escape
from pprint import pformat as pf
@@ -41,29 +46,95 @@ from pprint import pformat as pf
DS_NAME_MAP = {}
def create_dataset(dataset_name):
- #cursor = db_conn.cursor()
print("dataset_name:", dataset_name)
query = """
SELECT DBType.Name
FROM DBList, DBType
- WHERE DBList.Name = '%s' and
+ WHERE DBList.Name = '{}' and
DBType.Id = DBList.DBTypeId
- """ % (escape(dataset_name))
- print("query is: ", pf(query))
+ """.format(escape(dataset_name))
+ #print("query is: ", pf(query))
dataset_type = g.db.execute(query).fetchone().Name
#dataset_type = cursor.fetchone()[0]
- print("[blubber] dataset_type:", pf(dataset_type))
+ #print("[blubber] dataset_type:", pf(dataset_type))
dataset_ob = DS_NAME_MAP[dataset_type]
#dataset_class = getattr(data_set, dataset_ob)
- print("dataset_ob:", dataset_ob)
- print("DS_NAME_MAP:", pf(DS_NAME_MAP))
+ #print("dataset_ob:", dataset_ob)
+ #print("DS_NAME_MAP:", pf(DS_NAME_MAP))
dataset_class = globals()[dataset_ob]
return dataset_class(dataset_name)
+def mescape(*items):
+ """Multiple escape"""
+ escaped = [escape(item) for item in items]
+ print("escaped is:", escaped)
+ return escaped
+
+
+class Markers(object):
+ """Todo: Build in cacheing so it saves us reading the same file more than once"""
+ def __init__(self, name):
+ json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json'))
+ self.markers = json.load(json_data_fh)
+
+ def add_pvalues(self, p_values):
+ print("length of self.markers:", len(self.markers))
+ print("length of p_values:", len(p_values))
+
+ # THIS IS only needed for the case when we are limiting the number of p-values calculated
+ if len(self.markers) < len(p_values):
+ self.markers = self.markers[:len(p_values)]
+
+ for marker, p_value in itertools.izip(self.markers, p_values):
+ marker['p_value'] = p_value
+ print("p_value is:", marker['p_value'])
+ marker['lod_score'] = -math.log10(marker['p_value'])
+ #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+ marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+
+
+
+
+class HumanMarkers(Markers):
+
+ def __init__(self, name):
+ marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim'))
+ self.markers = []
+ for line in marker_data_fh:
+ splat = line.strip().split()
+ marker = {}
+ marker['chr'] = int(splat[0])
+ marker['name'] = splat[1]
+ marker['Mb'] = float(splat[3]) / 1000000
+ self.markers.append(marker)
+
+ #print("markers is: ", pf(self.markers))
+
+
+ def add_pvalues(self, p_values):
+ #for marker, p_value in itertools.izip(self.markers, p_values):
+ # if marker['Mb'] <= 0 and marker['chr'] == 0:
+ # continue
+ # marker['p_value'] = p_value
+ # print("p_value is:", marker['p_value'])
+ # marker['lod_score'] = -math.log10(marker['p_value'])
+ # #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+ # marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
+
+ super(HumanMarkers, self).add_pvalues(p_values)
+
+ with Bench("deleting markers"):
+ markers = []
+ for marker in self.markers:
+ if not marker['Mb'] <= 0 and not marker['chr'] == 0:
+ markers.append(marker)
+ self.markers = markers
+
+
class DatasetGroup(object):
"""
@@ -79,22 +150,41 @@ class DatasetGroup(object):
if self.name == 'BXD300':
self.name = "BXD"
+ self.f1list = None
+ self.parlist = None
+ self.get_f1_parent_strains()
+ print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+
self.species = webqtlDatabaseFunction.retrieve_species(self.name)
self.incparentsf1 = False
- self.f1list = None
- self.parlist = None
self.allsamples = None
+
+
+ def get_markers(self):
+ print("self.species is:", self.species)
+ if self.species == "human":
+ marker_class = HumanMarkers
+ else:
+ marker_class = Markers
+ self.markers = marker_class(self.name)
+
- #def read_genotype(self):
- # self.read_genotype_file()
- #
- # if not self.genotype: # Didn'd succeed, so we try method 2
- # self.read_genotype_data()
+ def get_f1_parent_strains(self):
+ try:
+ # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
+ f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
+ except KeyError:
+ f1 = f12 = maternal = paternal = None
+
+ if f1 and f12:
+ self.f1list = [f1, f12]
+ if maternal and paternal:
+ self.parlist = [maternal, paternal]
def read_genotype_file(self):
- '''read genotype from .geno file instead of database'''
+ '''Read genotype from .geno file instead of database'''
#if self.group == 'BXD300':
# self.group = 'BXD'
#
@@ -104,38 +194,24 @@ class DatasetGroup(object):
#genotype_2 is Dataset Object with parents and f1 (not for intercross)
genotype_1 = reaper.Dataset()
-
+
# reaper barfs on unicode filenames, so here we ensure it's a string
full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno'))
genotype_1.read(full_filename)
- print("Got to after read")
-
- try:
- # NL, 07/27/2010. ParInfo has been moved from webqtlForm.py to webqtlUtil.py;
- f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
- except KeyError:
- f1 = f12 = maternal = paternal = None
-
-
- if genotype_1.type == "group" and maternal and paternal:
- genotype_2 = genotype_1.add(Mat=maternal, Pat=paternal) #, F1=_f1)
+ if genotype_1.type == "group" and self.parlist:
+ genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1)
else:
genotype_2 = genotype_1
#determine default genotype object
if self.incparentsf1 and genotype_1.type != "intercross":
- self.genotype = genotype_2
+ genotype = genotype_2
else:
self.incparentsf1 = 0
- self.genotype = genotype_1
+ genotype = genotype_1
- self.samplelist = list(self.genotype.prgy)
-
- if f1 and f12:
- self.f1list = [f1, f12]
- if maternal and paternal:
- self.parlist = [maternal, paternal]
+ self.samplelist = list(genotype.prgy)
class DataSet(object):
@@ -160,9 +236,8 @@ class DataSet(object):
self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype
self.species = species.TheSpecies(self)
-
-
-
+
+
def get_desc(self):
"""Gets overridden later, at least for Temp...used by trait's get_given_name"""
return None
@@ -227,11 +302,7 @@ class DataSet(object):
#self.cursor.execute(query)
#self.id, self.name, self.fullname, self.shortname = self.cursor.fetchone()
-
-
- #def genHTML(self, Class='c0dd'):
- # return HT.Href(text = HT.Span('%s Database' % self.fullname, Class= "fwb " + Class),
- # url= webqtlConfig.INFOPAGEHREF % self.name,target="_blank")
+
class PhenotypeDataSet(DataSet):
DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -291,6 +362,19 @@ class PhenotypeDataSet(DataSet):
# (Urgently?) Need to write this
pass
+ def get_trait_list(self):
+ query = """
+ select PublishXRef.Id
+ from PublishXRef, PublishFreeze
+ where PublishFreeze.InbredSetId=PublishXRef.InbredSetId
+ and PublishFreeze.Id = {}
+ """.format(escape(str(self.id)))
+ results = g.db.execute(query).fetchall()
+ trait_data = {}
+ for trait in results:
+ trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+ return trait_data
+
def get_trait_info(self, trait_list, species = ''):
for this_trait in trait_list:
if not this_trait.haveinfo:
@@ -301,7 +385,7 @@ class PhenotypeDataSet(DataSet):
continue # for now
if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
description = this_trait.pre_publication_description
- this_trait.description_display = description
+ this_trait.description_display = unicode(description, "utf8")
if not this_trait.year.isdigit():
this_trait.pubmed_text = "N/A"
@@ -359,7 +443,7 @@ class PhenotypeDataSet(DataSet):
PublishFreeze.Id = %d AND PublishData.StrainId = Strain.Id
Order BY
Strain.Name
- """ % (trait.name, self.id)
+ """ % (trait, self.id)
results = g.db.execute(query).fetchall()
return results
@@ -399,6 +483,19 @@ class GenotypeDataSet(DataSet):
def check_confidentiality(self):
return geno_mrna_confidentiality(self)
+
+ def get_trait_list(self):
+ query = """
+ select Geno.Name
+ from Geno, GenoXRef
+ where GenoXRef.GenoId = Geno.Id
+ and GenoFreezeId = {}
+ """.format(escape(str(self.id)))
+ results = g.db.execute(query).fetchall()
+ trait_data = {}
+ for trait in results:
+ trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+ return trait_data
def get_trait_info(self, trait_list, species=None):
for this_trait in trait_list:
@@ -437,7 +534,7 @@ class GenotypeDataSet(DataSet):
GenoData.StrainId = Strain.Id
Order BY
Strain.Name
- """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait.name, self.name)
+ """ % (webqtlDatabaseFunction.retrieve_species_id(self.group.name), trait, self.name)
results = g.db.execute(query).fetchall()
return results
@@ -509,7 +606,95 @@ class MrnaAssayDataSet(DataSet):
def check_confidentiality(self):
return geno_mrna_confidentiality(self)
+
+ def get_trait_list_1(self):
+ query = """
+ select ProbeSet.Name
+ from ProbeSet, ProbeSetXRef
+ where ProbeSetXRef.ProbeSetId = ProbeSet.Id
+ and ProbeSetFreezeId = {}
+ """.format(escape(str(self.id)))
+ results = g.db.execute(query).fetchall()
+ print("After get_trait_list query")
+ trait_data = {}
+ for trait in results:
+ print("Retrieving sample_data for ", trait[0])
+ trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
+ print("After retrieve_sample_data")
+ return trait_data
+
+ def get_trait_data(self):
+ sample_ids = []
+ for sample in self.group.samplelist:
+ query = """
+ SELECT Strain.Id FROM Strain, Species
+ WHERE Strain.Name = '{}'
+ and Strain.SpeciesId=Species.Id
+ and Species.name = '{}'
+ """.format(*mescape(sample, self.group.species))
+ this_id = g.db.execute(query).fetchone()[0]
+ sample_ids.append('%d' % this_id)
+ print("sample_ids size: ", len(sample_ids))
+
+ # MySQL limits the number of tables that can be used in a join to 61,
+ # so we break the sample ids into smaller chunks
+ chunk_count = 50
+ n = len(sample_ids) / chunk_count
+ if len(sample_ids) % chunk_count:
+ n += 1
+ print("n: ", n)
+ #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+ #tempTable = None
+ #if GeneId and db.type == "ProbeSet":
+ # if method == "3":
+ # tempTable = self.getTempLiteratureTable(species=species,
+ # input_species_geneid=GeneId,
+ # returnNumber=returnNumber)
+ #
+ # if method == "4" or method == "5":
+ # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+ # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+ # method=method,
+ # returnNumber=returnNumber)
+ trait_sample_data = []
+ for step in range(int(n)):
+ temp = []
+ sample_ids_step = sample_ids[step*chunk_count:min(len(sample_ids), (step+1)*chunk_count)]
+ for item in sample_ids_step:
+ temp.append('T%s.value' % item)
+ query = "SELECT {}.Name,".format(escape(self.type))
+ data_start_pos = 1
+ query += string.join(temp, ', ')
+ query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+ self.type,
+ self.type))
+ #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item)
+ for item in sample_ids_step:
+ query += """
+ left join {}Data as T{} on T{}.Id = {}XRef.DataId
+ and T{}.StrainId={}\n
+ """.format(*mescape(self.type, item, item, self.type, item, item))
+ query += """
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(self.type, self.type, self.type, self.type,
+ self.name, self.type, self.type, self.type, self.type))
+ print("query: ", query)
+ results = g.db.execute(query).fetchall()
+ trait_sample_data.append(results)
+
+ trait_count = len(trait_sample_data[0])
+ self.trait_data = collections.defaultdict(list)
+ # put all of the separate data together into a dictionary where the keys are
+ # trait names and values are lists of sample values
+ for j in range(trait_count):
+ trait_name = trait_sample_data[0][j][0]
+ for i in range(int(n)):
+ self.trait_data[trait_name] += trait_sample_data[i][j][data_start_pos:]
+
def get_trait_info(self, trait_list=None, species=''):
# Note: setting trait_list to [] is probably not a great idea.
@@ -550,8 +735,10 @@ class MrnaAssayDataSet(DataSet):
trait_location_value = 1000000
if this_trait.chr and this_trait.mb:
+ print("this_trait.chr is:", this_trait.chr)
+ print("this_trait.mb is:", this_trait.mb)
try:
- trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
+ trait_location_value = float(this_trait.chr)*1000 + float(this_trait.mb)
except:
if this_trait.chr.upper() == 'X':
trait_location_value = 20*1000 + this_trait.mb
@@ -633,9 +820,9 @@ class MrnaAssayDataSet(DataSet):
ProbeSetFreeze.Name = %s
""" % (escape(self.name), escape(self.dataset.name))
results = g.db.execute(query).fetchone()
-
return results[0]
+
def retrieve_sample_data(self, trait):
query = """
SELECT
@@ -652,7 +839,7 @@ class MrnaAssayDataSet(DataSet):
ProbeSetData.StrainId = Strain.Id
Order BY
Strain.Name
- """ % (escape(trait.name), escape(self.name))
+ """ % (escape(trait), escape(self.name))
results = g.db.execute(query).fetchall()
return results
@@ -741,3 +928,4 @@ def geno_mrna_confidentiality(ob):
if confidential:
# Allow confidential data later
NoConfindetialDataForYouTodaySorry
+