aboutsummaryrefslogtreecommitdiff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-xwqflask/base/data_set.py194
1 files changed, 97 insertions, 97 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index ce13dd77..6527657a 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -76,22 +76,22 @@ def create_dataset(dataset_name, dataset_type = None, get_samplelist = True):
#def get_dataset_type_from_json(dataset_name):
-
+
class Dataset_Types(object):
-
+
def __init__(self):
self.datasets = {}
file_name = "wqflask/static/new/javascript/dataset_menu_structure.json"
with open(file_name, 'r') as fh:
data = json.load(fh)
-
+
print("*" * 70)
for species in data['datasets']:
for group in data['datasets'][species]:
for dataset_type in data['datasets'][species][group]:
for dataset in data['datasets'][species][group][dataset_type]:
#print("dataset is:", dataset)
-
+
short_dataset_name = dataset[1]
if dataset_type == "Phenotypes":
new_type = "Publish"
@@ -100,32 +100,32 @@ class Dataset_Types(object):
else:
new_type = "ProbeSet"
self.datasets[short_dataset_name] = new_type
-
+
def __call__(self, name):
return self.datasets[name]
-
+
# Do the intensive work at startup one time only: constructing Dataset_Types
# reads the dataset menu JSON file and fills its name -> type dictionary, so
# later calls to Dataset_Getter(name) are plain dictionary lookups.
Dataset_Getter = Dataset_Types()
#
#print("Running at startup:", get_dataset_type_from_json("HBTRC-MLPFC_0611"))
-
+
def create_datasets_list():
key = "all_datasets"
result = Redis.get(key)
-
+
if result:
print("Cache hit!!!")
datasets = pickle.loads(result)
-
+
else:
datasets = list()
with Bench("Creating DataSets object"):
type_dict = {'Publish': 'PublishFreeze',
'ProbeSet': 'ProbeSetFreeze',
'Geno': 'GenoFreeze'}
-
+
for dataset_type in type_dict:
query = "SELECT Name FROM {}".format(type_dict[dataset_type])
for result in g.db.execute(query).fetchall():
@@ -134,10 +134,10 @@ def create_datasets_list():
#print("type: {}\tname: {}".format(dataset_type, result.Name))
dataset = create_dataset(result.Name, dataset_type)
datasets.append(dataset)
-
+
Redis.set(key, pickle.dumps(datasets, pickle.HIGHEST_PROTOCOL))
Redis.expire(key, 60*60)
-
+
return datasets
@@ -158,30 +158,30 @@ def mescape(*items):
class Markers(object):
"""Todo: Build in cacheing so it saves us reading the same file more than once"""
def __init__(self, name):
- json_data_fh = open(os.path.join(webqtlConfig.NEWGENODIR + name + '.json'))
+ json_data_fh = open(locate(name + '.json','genotype/json'))
try:
markers = json.load(json_data_fh)
except:
markers = []
-
+
for marker in markers:
if (marker['chr'] != "X") and (marker['chr'] != "Y"):
marker['chr'] = int(marker['chr'])
marker['Mb'] = float(marker['Mb'])
-
+
self.markers = markers
#print("self.markers:", self.markers)
-
-
+
+
def add_pvalues(self, p_values):
print("length of self.markers:", len(self.markers))
print("length of p_values:", len(p_values))
-
+
if type(p_values) is list:
# THIS IS only needed for the case when we are limiting the number of p-values calculated
#if len(self.markers) > len(p_values):
# self.markers = self.markers[:len(p_values)]
-
+
for marker, p_value in itertools.izip(self.markers, p_values):
if not p_value:
continue
@@ -214,7 +214,7 @@ class Markers(object):
#self.markers.remove(marker)
#del self.markers[i]
self.markers = filtered_markers
-
+
#for i, marker in enumerate(self.markers):
# if not 'p_value' in marker:
@@ -223,9 +223,9 @@ class Markers(object):
# #self.markers.remove(self.markers[i])
class HumanMarkers(Markers):
-
+
def __init__(self, name, specified_markers = []):
- marker_data_fh = open(os.path.join(webqtlConfig.PYLMM_PATH + name + '.bim'))
+ marker_data_fh = open(locate('genotype') + '/' + name + '.bim')
self.markers = []
for line in marker_data_fh:
splat = line.strip().split()
@@ -244,7 +244,7 @@ class HumanMarkers(Markers):
marker['name'] = splat[1]
marker['Mb'] = float(splat[3]) / 1000000
self.markers.append(marker)
-
+
#print("markers is: ", pf(self.markers))
@@ -257,26 +257,26 @@ class HumanMarkers(Markers):
# marker['lod_score'] = -math.log10(marker['p_value'])
# #Using -log(p) for the LRS; need to ask Rob how he wants to get LRS from p-values
# marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61
-
+
#print("p_values2:", pf(p_values))
super(HumanMarkers, self).add_pvalues(p_values)
-
+
#with Bench("deleting markers"):
# markers = []
# for marker in self.markers:
# if not marker['Mb'] <= 0 and not marker['chr'] == 0:
# markers.append(marker)
# self.markers = markers
-
-
+
+
class DatasetGroup(object):
"""
Each group has multiple datasets; each species has multiple groups.
-
+
For example, Mouse has multiple groups (BXD, BXA, etc), and each group
has multiple datasets associated with it.
-
+
"""
def __init__(self, dataset):
"""This sets self.group and self.group_id"""
@@ -284,14 +284,14 @@ class DatasetGroup(object):
self.name, self.id = g.db.execute(dataset.query_for_group).fetchone()
if self.name == 'BXD300':
self.name = "BXD"
-
+
self.f1list = None
self.parlist = None
self.get_f1_parent_strains()
#print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
-
+
self.species = webqtlDatabaseFunction.retrieve_species(self.name)
-
+
self.incparentsf1 = False
self.allsamples = None
self._datasets = None
@@ -302,7 +302,7 @@ class DatasetGroup(object):
def get_markers(self):
#print("self.species is:", self.species)
if self.species == "human":
- marker_class = HumanMarkers
+ marker_class = HumanMarkers
else:
marker_class = Markers
@@ -356,7 +356,7 @@ class DatasetGroup(object):
dataset_menu.append(dict(tissue=None, datasets=[(dataset, dataset_short)]))
else:
dataset_sub_menu = [item[1:] for item in dataset]
-
+
tissue_already_exists = False
tissue_position = None
for i, tissue_dict in enumerate(dataset_menu):
@@ -384,7 +384,7 @@ class DatasetGroup(object):
f1, f12, maternal, paternal = webqtlUtil.ParInfo[self.name]
except KeyError:
f1 = f12 = maternal = paternal = None
-
+
if f1 and f12:
self.f1list = [f1, f12]
if maternal and paternal:
@@ -455,18 +455,18 @@ class DatasetGroup(object):
#self.samplelist = list(self.genotype.prgy)
self.samplelist = list(genotype.prgy)
-
+
return genotype
#class DataSets(object):
# """Builds a list of DataSets"""
-#
+#
# def __init__(self):
# self.datasets = list()
-#
+#
+
-
#query = """SELECT Name FROM ProbeSetFreeze
# UNION
# SELECT Name From PublishFreeze
@@ -501,7 +501,7 @@ class DataSet(object):
self.check_confidentiality()
self.retrieve_other_names()
-
+
self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype
if get_samplelist == True:
self.group.get_samplelist()
@@ -511,30 +511,30 @@ class DataSet(object):
def get_desc(self):
    """Return the default description, which is ``None``.

    Gets overridden later, at least for Temp; used by the trait's
    get_given_name.
    """
    description = None
    return description
-
+
#@staticmethod
#def get_by_trait_id(trait_id):
# """Gets the dataset object given the trait id"""
- #
- #
#
- # name = g.db.execute(""" SELECT
- #
+ #
+ #
+ # name = g.db.execute(""" SELECT
+ #
# """)
- #
+ #
# return DataSet(name)
# Delete this eventually
@property
def riset(self):
    """Tombstone for the old ``riset`` attribute, which is now ``group``.

    The original getter took no ``self`` argument, so any access failed
    with an unrelated ``TypeError`` about argument counts before the
    marker name in its body was ever evaluated. The getter now accepts
    ``self`` and fails with a message that tells the caller what to use.

    Raises:
        AttributeError: always; ``riset`` must not be used any more.
    """
    raise AttributeError(
        "'riset' has been renamed; use the 'group' attribute instead")
-
-
+
+
#@property
#def group(self):
# if not self._group:
# self.get_group()
- #
+ #
# return self._group
@@ -546,7 +546,7 @@ class DataSet(object):
This is not meant to retrieve the data set info if no name at all is passed.
"""
-
+
try:
if self.type == "ProbeSet":
query_args = tuple(escape(x) for x in (
@@ -582,17 +582,17 @@ class DataSet(object):
except TypeError:
print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
pass
-
+
def get_trait_data(self, sample_list=None):
if sample_list:
self.samplelist = sample_list
else:
self.samplelist = self.group.samplelist
-
+
if self.group.parlist != None and self.group.f1list != None:
if (self.group.parlist + self.group.f1list) in self.samplelist:
self.samplelist += self.group.parlist + self.group.f1list
-
+
query = """
SELECT Strain.Name, Strain.Id FROM Strain, Species
WHERE Strain.Name IN {}
@@ -610,9 +610,9 @@ class DataSet(object):
trait_sample_data = []
for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
- #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+ #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
#tempTable = None
- #if GeneId and db.type == "ProbeSet":
+ #if GeneId and db.type == "ProbeSet":
# if method == "3":
# tempTable = self.getTempLiteratureTable(species=species,
# input_species_geneid=GeneId,
@@ -623,7 +623,7 @@ class DataSet(object):
# TissueProbeSetFreezeId=tissueProbeSetFreezeId,
# method=method,
# returnNumber=returnNumber)
-
+
if self.type == "Publish":
dataset_type = "Phenotype"
else:
@@ -644,7 +644,7 @@ class DataSet(object):
left join {}Data as T{} on T{}.Id = {}XRef.DataId
and T{}.StrainId={}\n
""".format(*mescape(self.type, item, item, self.type, item, item))
-
+
if self.type == "Publish":
query += """
WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
@@ -661,16 +661,16 @@ class DataSet(object):
order by {}.Id
""".format(*mescape(self.type, self.type, self.type, self.type,
self.name, dataset_type, self.type, self.type, dataset_type))
-
+
#print("trait data query: ", query)
-
+
results = g.db.execute(query).fetchall()
#print("query results:", results)
trait_sample_data.append(results)
trait_count = len(trait_sample_data[0])
self.trait_data = collections.defaultdict(list)
-
+
# put all of the separate data together into a dictionary where the keys are
# trait names and values are lists of sample values
for trait_counter in range(trait_count):
@@ -683,9 +683,9 @@ class PhenotypeDataSet(DataSet):
DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
def setup(self):
-
+
#print("IS A PHENOTYPEDATASET")
-
+
# Fields in the database table
self.search_fields = ['Phenotype.Post_publication_description',
'Phenotype.Pre_publication_description',
@@ -756,26 +756,26 @@ class PhenotypeDataSet(DataSet):
def get_trait_info(self, trait_list, species = ''):
for this_trait in trait_list:
-
+
if not this_trait.haveinfo:
this_trait.retrieve_info(get_qtl_info=True)
description = this_trait.post_publication_description
-
+
#If the dataset is confidential and the user has access to confidential
#phenotype traits, then display the pre-publication description instead
#of the post-publication description
if this_trait.confidential:
this_trait.description_display = ""
continue # for now
-
+
if not webqtlUtil.hasAccessToConfidentialPhenotypeTrait(
privilege=self.privilege,
userName=self.userName,
authorized_users=this_trait.authorized_users):
-
+
description = this_trait.pre_publication_description
-
+
if len(description) > 0:
this_trait.description_display = description.strip()
else:
@@ -820,7 +820,7 @@ class PhenotypeDataSet(DataSet):
this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs
this_trait.LRS_score_value = LRS_score_value = this_trait.lrs
this_trait.LRS_location_repr = LRS_location_repr = 'Chr%s: %.6f' % (LRS_Chr, float(LRS_Mb))
-
+
def retrieve_sample_data(self, trait):
query = """
SELECT
@@ -878,7 +878,7 @@ class GenotypeDataSet(DataSet):
def check_confidentiality(self):
    """Run the shared genotype/mRNA confidentiality check on this dataset.

    Delegates to ``geno_mrna_confidentiality`` and returns whatever it
    returns.
    """
    outcome = geno_mrna_confidentiality(self)
    return outcome
-
+
def get_trait_list(self):
query = """
select Geno.Name
@@ -912,7 +912,7 @@ class GenotypeDataSet(DataSet):
this_trait.location_repr = 'Chr%s: %.6f' % (this_trait.chr, float(this_trait.mb) )
this_trait.location_value = trait_location_value
-
+
def retrieve_sample_data(self, trait):
query = """
SELECT
@@ -1004,7 +1004,7 @@ class MrnaAssayDataSet(DataSet):
def check_confidentiality(self):
    """Run the shared genotype/mRNA confidentiality check on this dataset.

    Delegates to ``geno_mrna_confidentiality`` and returns whatever it
    returns.
    """
    outcome = geno_mrna_confidentiality(self)
    return outcome
-
+
def get_trait_list_1(self):
query = """
select ProbeSet.Name
@@ -1020,7 +1020,7 @@ class MrnaAssayDataSet(DataSet):
trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
#print("After retrieve_sample_data")
return trait_data
-
+
#def get_trait_data(self):
# self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
# query = """
@@ -1040,9 +1040,9 @@ class MrnaAssayDataSet(DataSet):
# trait_sample_data = []
# for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
#
- # #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+ # #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
# #tempTable = None
- # #if GeneId and db.type == "ProbeSet":
+ # #if GeneId and db.type == "ProbeSet":
# # if method == "3":
# # tempTable = self.getTempLiteratureTable(species=species,
# # input_species_geneid=GeneId,
@@ -1053,7 +1053,7 @@ class MrnaAssayDataSet(DataSet):
# # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
# # method=method,
# # returnNumber=returnNumber)
- #
+ #
# temp = ['T%s.value' % item for item in sample_ids_step]
# query = "SELECT {}.Name,".format(escape(self.type))
# data_start_pos = 1
@@ -1067,7 +1067,7 @@ class MrnaAssayDataSet(DataSet):
# left join {}Data as T{} on T{}.Id = {}XRef.DataId
# and T{}.StrainId={}\n
# """.format(*mescape(self.type, item, item, self.type, item, item))
- #
+ #
# query += """
# WHERE {}XRef.{}FreezeId = {}Freeze.Id
# and {}Freeze.Name = '{}'
@@ -1080,7 +1080,7 @@ class MrnaAssayDataSet(DataSet):
#
# trait_count = len(trait_sample_data[0])
# self.trait_data = collections.defaultdict(list)
- #
+ #
# # put all of the separate data together into a dictionary where the keys are
# # trait names and values are lists of sample values
# for trait_counter in range(trait_count):
@@ -1088,11 +1088,11 @@ class MrnaAssayDataSet(DataSet):
# for chunk_counter in range(int(number_chunks)):
# self.trait_data[trait_name] += (
# trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
-
+
def get_trait_info(self, trait_list=None, species=''):
- # Note: setting trait_list to [] is probably not a great idea.
+ # Note: setting trait_list to [] is probably not a great idea.
if not trait_list:
trait_list = []
@@ -1155,7 +1155,7 @@ class MrnaAssayDataSet(DataSet):
#print("query is:", pf(query))
result = g.db.execute(query).fetchone()
-
+
mean = result[0] if result else 0
if mean:
@@ -1176,7 +1176,7 @@ class MrnaAssayDataSet(DataSet):
Geno.SpeciesId = Species.Id
""".format(species, this_trait.locus)
result = g.db.execute(query).fetchone()
-
+
if result:
#if result[0] and result[1]:
# lrs_chr = result[0]
@@ -1184,7 +1184,7 @@ class MrnaAssayDataSet(DataSet):
lrs_chr, lrs_mb = result
#XZ: LRS_location_value is used for sorting
lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb)
-
+
#try:
# lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb)
#except:
@@ -1197,7 +1197,7 @@ class MrnaAssayDataSet(DataSet):
this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs
this_trait.LRS_score_value = this_trait.lrs
this_trait.LRS_location_repr = 'Chr%s: %.6f' % (lrs_chr, float(lrs_mb))
-
+
def convert_location_to_value(self, chromosome, mb):
try:
@@ -1208,7 +1208,7 @@ class MrnaAssayDataSet(DataSet):
else:
location_value = (ord(str(chromosome).upper()[0])*1000 +
float(mb))
-
+
return location_value
def get_sequence(self):
@@ -1225,7 +1225,7 @@ class MrnaAssayDataSet(DataSet):
""" % (escape(self.name), escape(self.dataset.name))
results = g.db.execute(query).fetchone()
return results[0]
-
+
def retrieve_sample_data(self, trait):
query = """
SELECT
@@ -1246,8 +1246,8 @@ class MrnaAssayDataSet(DataSet):
results = g.db.execute(query).fetchall()
#print("RETRIEVED RESULTS HERE:", results)
return results
-
-
+
+
def retrieve_genes(self, column_name):
query = """
select ProbeSet.Name, ProbeSet.%s
@@ -1256,7 +1256,7 @@ class MrnaAssayDataSet(DataSet):
ProbeSetXRef.ProbeSetId=ProbeSet.Id;
""" % (column_name, escape(str(self.id)))
results = g.db.execute(query).fetchall()
-
+
return dict(results)
#def retrieve_gene_symbols(self):
@@ -1285,8 +1285,8 @@ class MrnaAssayDataSet(DataSet):
# for item in results:
# symbol_dict[item[0]] = item[1]
# return symbol_dict
-
-
+
+
class TempDataSet(DataSet):
@@ -1308,8 +1308,8 @@ class TempDataSet(DataSet):
self.id = 1
self.fullname = 'Temporary Storage'
self.shortname = 'Temp'
-
-
+
+
@staticmethod
def handle_pca(desc):
if 'PCA' in desc:
@@ -1318,13 +1318,13 @@ class TempDataSet(DataSet):
else:
desc = desc[:desc.index('entered')].strip()
return desc
-
+
def get_desc(self):
g.db.execute('SELECT description FROM Temp WHERE Name=%s', self.name)
desc = g.db.fetchone()[0]
desc = self.handle_pca(desc)
- return desc
-
+ return desc
+
def get_group(self):
self.cursor.execute("""
SELECT
@@ -1337,7 +1337,7 @@ class TempDataSet(DataSet):
""", self.name)
self.group, self.group_id = self.cursor.fetchone()
#return self.group
-
+
def retrieve_sample_data(self, trait):
query = """
SELECT
@@ -1351,7 +1351,7 @@ class TempDataSet(DataSet):
Order BY
Strain.Name
""" % escape(trait.name)
-
+
results = g.db.execute(query).fetchall()