aboutsummaryrefslogtreecommitdiff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
authorLei Yan2013-09-13 14:07:27 -0500
committerLei Yan2013-09-13 14:07:27 -0500
commitaf24c0d610d9a2189f86677e4f23deb372ee2bf7 (patch)
tree53480351b97727670637a37dbd4c78e52446ae88 /wqflask/base/data_set.py
parent155e2997613c0750de30b734686f8977524956f9 (diff)
parentc5fc931621707865357ace4b637db7481e0be552 (diff)
downloadgenenetwork2-af24c0d610d9a2189f86677e4f23deb372ee2bf7.tar.gz
Merge https://github.com/zsloan/genenetwork
Resolved conflicts: wqflask/base/trait.py wqflask/wqflask/correlation/correlationFunction.py wqflask/wqflask/correlation/correlation_function.py wqflask/wqflask/correlation/correlation_functions.py wqflask/wqflask/correlation/show_corr_results.py
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-xwqflask/base/data_set.py366
1 files changed, 264 insertions, 102 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 03b24230..96e04df0 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -16,8 +16,6 @@
# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010)
# at rwilliams@uthsc.edu and xzhou15@uthsc.edu
#
-#we
-#
# This module is used by GeneNetwork project (www.genenetwork.org)
from __future__ import absolute_import, print_function, division
@@ -27,6 +25,7 @@ import string
import collections
import json
+import gzip
import cPickle as pickle
import itertools
@@ -44,36 +43,71 @@ from utility import webqtlUtil
from utility.benchmark import Bench
from wqflask.my_pylmm.pyLMM import chunks
+from maintenance import get_group_samplelists
+
from MySQLdb import escape_string as escape
from pprint import pformat as pf
# Used by create_database to instantiate objects
+# Each subclass will add to this
DS_NAME_MAP = {}
def create_dataset(dataset_name, dataset_type = None):
- #print("dataset_name:", dataset_name)
-
if not dataset_type:
- query = """
- SELECT DBType.Name
- FROM DBList, DBType
- WHERE DBList.Name = '{}' and
- DBType.Id = DBList.DBTypeId
- """.format(escape(dataset_name))
- #print("query is: ", pf(query))
- dataset_type = g.db.execute(query).fetchone().Name
+ dataset_type = Dataset_Getter(dataset_name)
+ #dataset_type = get_dataset_type_from_json(dataset_name)
- #dataset_type = cursor.fetchone()[0]
- #print("[blubber] dataset_type:", pf(dataset_type))
+ print("dataset_type is:", dataset_type)
+ #query = """
+ # SELECT DBType.Name
+ # FROM DBList, DBType
+ # WHERE DBList.Name = '{}' and
+ # DBType.Id = DBList.DBTypeId
+ # """.format(escape(dataset_name))
+ #dataset_type = g.db.execute(query).fetchone().Name
- dataset_ob = DS_NAME_MAP[dataset_type]
- #dataset_class = getattr(data_set, dataset_ob)
- #print("dataset_ob:", dataset_ob)
- #print("DS_NAME_MAP:", pf(DS_NAME_MAP))
+ dataset_ob = DS_NAME_MAP[dataset_type]
dataset_class = globals()[dataset_ob]
return dataset_class(dataset_name)
+
+#def get_dataset_type_from_json(dataset_name):
+
+class Dataset_Types(object):
+
+ def __init__(self):
+ self.datasets = {}
+ file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
+ with open(file_name, 'r') as fh:
+ data = json.load(fh)
+
+ print("*" * 70)
+ for species in data['datasets']:
+ for group in data['datasets'][species]:
+ for dataset_type in data['datasets'][species][group]:
+ for dataset in data['datasets'][species][group][dataset_type]:
+ print("dataset is:", dataset)
+
+ short_dataset_name = dataset[0]
+ if dataset_type == "Phenotypes":
+ new_type = "Publish"
+ elif dataset_type == "Genotypes":
+ new_type = "Geno"
+ else:
+ new_type = "ProbeSet"
+ self.datasets[short_dataset_name] = new_type
+
+ def __call__(self, name):
+ return self.datasets[name]
+
+# Do the intensive work at startup one time only
+Dataset_Getter = Dataset_Types()
+
+#
+#print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611"))
+
+
def create_datasets_list():
key = "all_datasets"
result = Redis.get(key)
@@ -94,7 +128,7 @@ def create_datasets_list():
for result in g.db.execute(query).fetchall():
#The query at the beginning of this function isn't necessary here, but still would
#rather just reuse it
- print("type: {}\tname: {}".format(dataset_type, result.Name))
+ #print("type: {}\tname: {}".format(dataset_type, result.Name))
dataset = create_dataset(result.Name, dataset_type)
datasets.append(dataset)
@@ -134,12 +168,13 @@ class Markers(object):
for marker, p_value in itertools.izip(self.markers, p_values):
marker['p_value'] = p_value
- print("p_value is:", marker['p_value'])
- marker['lod_score'] = -math.log10(marker['p_value'])
- #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
- marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
-
-
+ if marker['p_value'] == 0:
+ marker['lod_score'] = 0
+ marker['lrs_value'] = 0
+ else:
+ marker['lod_score'] = -math.log10(marker['p_value'])
+ #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
+ marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
class HumanMarkers(Markers):
@@ -154,8 +189,6 @@ class HumanMarkers(Markers):
marker['name'] = splat[1]
marker['Mb'] = float(splat[3]) / 1000000
self.markers.append(marker)
-
- #print("markers is: ", pf(self.markers))
def add_pvalues(self, p_values):
@@ -212,7 +245,7 @@ class DatasetGroup(object):
marker_class = Markers
self.markers = marker_class(self.name)
-
+
def get_f1_parent_strains(self):
try:
@@ -225,7 +258,29 @@ class DatasetGroup(object):
self.f1list = [f1, f12]
if maternal and paternal:
self.parlist = [maternal, paternal]
-
+
+ def get_samplelist(self):
+ key = "samplelist:v4:" + self.name
+ print("key is:", key)
+ with Bench("Loading cache"):
+ result = Redis.get(key)
+
+ if result:
+ print("Sample List Cache hit!!!")
+ print("Before unjsonifying {}: {}".format(type(result), result))
+ self.samplelist = json.loads(result)
+ print(" type: ", type(self.samplelist))
+ print(" self.samplelist: ", self.samplelist)
+ else:
+ print("Cache not hit")
+ try:
+ self.samplelist = get_group_samplelists.get_samplelist(self.name + ".geno")
+ except IOError:
+ self.samplelist = None
+ print("after get_samplelist")
+ Redis.set(key, json.dumps(self.samplelist))
+ Redis.expire(key, 60*5)
+
def read_genotype_file(self):
'''Read genotype from .geno file instead of database'''
#if self.group == 'BXD300':
@@ -240,7 +295,18 @@ class DatasetGroup(object):
# reaper barfs on unicode filenames, so here we ensure it's a string
full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno'))
- genotype_1.read(full_filename)
+ if os.path.isfile(full_filename):
+ print("Reading file: ", full_filename)
+ genotype_1.read(full_filename)
+ print("File read")
+ else:
+ try:
+ full_filename = str(os.path.join(webqtlConfig.TMPDIR, self.name + '.geno'))
+ #print("Reading file")
+ genotype_1.read(full_filename)
+ #print("File read")
+ except IOError:
+ print("File doesn't exist!")
if genotype_1.type == "group" and self.parlist:
genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1)
@@ -301,7 +367,7 @@ class DataSet(object):
self.retrieve_other_names()
self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype
- self.group.read_genotype_file()
+ self.group.get_samplelist()
self.species = species.TheSpecies(self)
@@ -335,7 +401,6 @@ class DataSet(object):
# return self._group
-
def retrieve_other_names(self):
"""
If the data set name parameter is not found in the 'Name' field of the data set table,
@@ -370,11 +435,98 @@ class DataSet(object):
except TypeError:
print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
pass
+
+ def get_trait_data(self):
+ self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+ query = """
+ SELECT Strain.Name, Strain.Id FROM Strain, Species
+ WHERE Strain.Name IN {}
+ and Strain.SpeciesId=Species.Id
+ and Species.name = '{}'
+ """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+ results = dict(g.db.execute(query).fetchall())
+ sample_ids = [results[item] for item in self.samplelist]
+
+ # MySQL limits the number of tables that can be used in a join to 61,
+ # so we break the sample ids into smaller chunks
+ # Postgres doesn't have that limit, so we can get rid of this after we transition
+ chunk_size = 50
+ number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+ trait_sample_data = []
+ for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
+ #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+ #tempTable = None
+ #if GeneId and db.type == "ProbeSet":
+ # if method == "3":
+ # tempTable = self.getTempLiteratureTable(species=species,
+ # input_species_geneid=GeneId,
+ # returnNumber=returnNumber)
+ #
+ # if method == "4" or method == "5":
+ # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+ # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+ # method=method,
+ # returnNumber=returnNumber)
+
+ if self.type == "Publish":
+ dataset_type = "Phenotype"
+ else:
+ dataset_type = self.type
+ temp = ['T%s.value' % item for item in sample_ids_step]
+ if self.type == "Publish":
+ query = "SELECT {}XRef.Id,".format(escape(self.type))
+ else:
+ query = "SELECT {}.Name,".format(escape(dataset_type))
+ data_start_pos = 1
+ query += string.join(temp, ', ')
+ query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type,
+ self.type,
+ self.type))
+
+ for item in sample_ids_step:
+ query += """
+ left join {}Data as T{} on T{}.Id = {}XRef.DataId
+ and T{}.StrainId={}\n
+ """.format(*mescape(self.type, item, item, self.type, item, item))
+
+ if self.type == "Publish":
+ query += """
+ WHERE {}XRef.PublicationId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(self.type, self.type, self.type, self.type,
+ self.name, dataset_type, self.type, self.type, dataset_type))
+ else:
+ query += """
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(self.type, self.type, self.type, self.type,
+ self.name, dataset_type, self.type, self.type, dataset_type))
+ results = g.db.execute(query).fetchall()
+ trait_sample_data.append(results)
+
+ trait_count = len(trait_sample_data[0])
+ self.trait_data = collections.defaultdict(list)
+
+ # put all of the separate data together into a dictionary where the keys are
+ # trait names and values are lists of sample values
+ for trait_counter in range(trait_count):
+ trait_name = trait_sample_data[0][trait_counter][0]
+ for chunk_counter in range(int(number_chunks)):
+ self.trait_data[trait_name] += (
+ trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
class PhenotypeDataSet(DataSet):
DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
def setup(self):
+
+ print("IS A PHENOTYPEDATASET")
+
# Fields in the database table
self.search_fields = ['Phenotype.Post_publication_description',
'Phenotype.Pre_publication_description',
@@ -445,14 +597,24 @@ class PhenotypeDataSet(DataSet):
def get_trait_info(self, trait_list, species = ''):
for this_trait in trait_list:
if not this_trait.haveinfo:
- this_trait.retrieveInfo(QTL=1)
+ this_trait.retrieve_info(get_qtl_info=True)
description = this_trait.post_publication_description
+
+ #If the dataset is confidential and the user has access to confidential
+ #phenotype traits, then display the pre-publication description instead
+ #of the post-publication description
if this_trait.confidential:
continue # for now
- if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(privilege=self.privilege, userName=self.userName, authorized_users=this_trait.authorized_users):
+
+ if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(
+ privilege=self.privilege,
+ userName=self.userName,
+ authorized_users=this_trait.authorized_users):
+
description = this_trait.pre_publication_description
- this_trait.description_display = unicode(description, "utf8")
+
+ this_trait.description_display = description
if not this_trait.year.isdigit():
this_trait.pubmed_text = "N/A"
@@ -690,73 +852,73 @@ class MrnaAssayDataSet(DataSet):
#print("After retrieve_sample_data")
return trait_data
- def get_trait_data(self):
- self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
- query = """
- SELECT Strain.Name, Strain.Id FROM Strain, Species
- WHERE Strain.Name IN {}
- and Strain.SpeciesId=Species.Id
- and Species.name = '{}'
- """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
- results = dict(g.db.execute(query).fetchall())
- sample_ids = [results[item] for item in self.samplelist]
-
- # MySQL limits the number of tables that can be used in a join to 61,
- # so we break the sample ids into smaller chunks
- # Postgres doesn't have that limit, so we can get rid of this after we transition
- chunk_size = 50
- number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
- trait_sample_data = []
- for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
-
- #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
- #tempTable = None
- #if GeneId and db.type == "ProbeSet":
- # if method == "3":
- # tempTable = self.getTempLiteratureTable(species=species,
- # input_species_geneid=GeneId,
- # returnNumber=returnNumber)
- #
- # if method == "4" or method == "5":
- # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
- # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
- # method=method,
- # returnNumber=returnNumber)
-
- temp = ['T%s.value' % item for item in sample_ids_step]
- query = "SELECT {}.Name,".format(escape(self.type))
- data_start_pos = 1
- query += string.join(temp, ', ')
- query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
- self.type,
- self.type))
-
- for item in sample_ids_step:
- query += """
- left join {}Data as T{} on T{}.Id = {}XRef.DataId
- and T{}.StrainId={}\n
- """.format(*mescape(self.type, item, item, self.type, item, item))
-
- query += """
- WHERE {}XRef.{}FreezeId = {}Freeze.Id
- and {}Freeze.Name = '{}'
- and {}.Id = {}XRef.{}Id
- order by {}.Id
- """.format(*mescape(self.type, self.type, self.type, self.type,
- self.name, self.type, self.type, self.type, self.type))
- results = g.db.execute(query).fetchall()
- trait_sample_data.append(results)
-
- trait_count = len(trait_sample_data[0])
- self.trait_data = collections.defaultdict(list)
-
- # put all of the separate data together into a dictionary where the keys are
- # trait names and values are lists of sample values
- for trait_counter in range(trait_count):
- trait_name = trait_sample_data[0][trait_counter][0]
- for chunk_counter in range(int(number_chunks)):
- self.trait_data[trait_name] += (
- trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+ #def get_trait_data(self):
+ # self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+ # query = """
+ # SELECT Strain.Name, Strain.Id FROM Strain, Species
+ # WHERE Strain.Name IN {}
+ # and Strain.SpeciesId=Species.Id
+ # and Species.name = '{}'
+ # """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+ # results = dict(g.db.execute(query).fetchall())
+ # sample_ids = [results[item] for item in self.samplelist]
+ #
+ # # MySQL limits the number of tables that can be used in a join to 61,
+ # # so we break the sample ids into smaller chunks
+ # # Postgres doesn't have that limit, so we can get rid of this after we transition
+ # chunk_size = 50
+ # number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+ # trait_sample_data = []
+ # for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+ #
+ # #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+ # #tempTable = None
+ # #if GeneId and db.type == "ProbeSet":
+ # # if method == "3":
+ # # tempTable = self.getTempLiteratureTable(species=species,
+ # # input_species_geneid=GeneId,
+ # # returnNumber=returnNumber)
+ # #
+ # # if method == "4" or method == "5":
+ # # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+ # # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+ # # method=method,
+ # # returnNumber=returnNumber)
+ #
+ # temp = ['T%s.value' % item for item in sample_ids_step]
+ # query = "SELECT {}.Name,".format(escape(self.type))
+ # data_start_pos = 1
+ # query += string.join(temp, ', ')
+ # query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+ # self.type,
+ # self.type))
+ #
+ # for item in sample_ids_step:
+ # query += """
+ # left join {}Data as T{} on T{}.Id = {}XRef.DataId
+ # and T{}.StrainId={}\n
+ # """.format(*mescape(self.type, item, item, self.type, item, item))
+ #
+ # query += """
+ # WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ # and {}Freeze.Name = '{}'
+ # and {}.Id = {}XRef.{}Id
+ # order by {}.Id
+ # """.format(*mescape(self.type, self.type, self.type, self.type,
+ # self.name, self.type, self.type, self.type, self.type))
+ # results = g.db.execute(query).fetchall()
+ # trait_sample_data.append(results)
+ #
+ # trait_count = len(trait_sample_data[0])
+ # self.trait_data = collections.defaultdict(list)
+ #
+ # # put all of the separate data together into a dictionary where the keys are
+ # # trait names and values are lists of sample values
+ # for trait_counter in range(trait_count):
+ # trait_name = trait_sample_data[0][trait_counter][0]
+ # for chunk_counter in range(int(number_chunks)):
+ # self.trait_data[trait_name] += (
+ # trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
def get_trait_info(self, trait_list=None, species=''):