about | summary | refs | log | tree | commit | diff
path: root/wqflask/base/data_set.py
diff options
context:
space:
mode:
author: Zachary Sloan, 2013-06-19 18:30:59 +0000
committer: Zachary Sloan, 2013-06-19 18:30:59 +0000
commit: 2ebacd207ee9c65d9239626fc18d2c1a50e08fbc (patch)
tree: 41627942dfce65608d376134ffb00a6765492676 /wqflask/base/data_set.py
parent: 0ce9a060640710b5a6e18f71b08d79e51ef71d8a (diff)
parent: 25bd2fa7ac229eb7862fe778fe03eb75ff34368c (diff)
download: genenetwork2-2ebacd207ee9c65d9239626fc18d2c1a50e08fbc.tar.gz
Merge branch 'flask' of git://github.com/leiyan/GeneNetwork2-Python into flask
Conflicts:
    wqflask/base/data_set.py
    wqflask/wqflask/views.py

Fixed a couple conflicts to merge Lei's code related to the correlation page
Diffstat (limited to 'wqflask/base/data_set.py')
-rwxr-xr-x  wqflask/base/data_set.py  212
1 files changed, 113 insertions, 99 deletions
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 9b0a3dcc..07fe9cd9 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -38,6 +38,7 @@ from base import species
from dbFunction import webqtlDatabaseFunction
from utility import webqtlUtil
from utility.benchmark import Bench
+from wqflask.my_pylmm.pyLMM import chunks
from MySQLdb import escape_string as escape
from pprint import pformat as pf
@@ -46,7 +47,7 @@ from pprint import pformat as pf
DS_NAME_MAP = {}
def create_dataset(dataset_name):
- print("dataset_name:", dataset_name)
+ #print("dataset_name:", dataset_name)
query = """
SELECT DBType.Name
@@ -68,10 +69,17 @@ def create_dataset(dataset_name):
dataset_class = globals()[dataset_ob]
return dataset_class(dataset_name)
+def create_in_clause(items):
+ """Create an in clause for mysql"""
+ in_clause = ', '.join("'{}'".format(x) for x in mescape(*items))
+ in_clause = '( {} )'.format(in_clause)
+ return in_clause
+
+
def mescape(*items):
"""Multiple escape"""
- escaped = [escape(item) for item in items]
- print("escaped is:", escaped)
+ escaped = [escape(str(item)) for item in items]
+ #print("escaped is:", escaped)
return escaped
@@ -82,8 +90,8 @@ class Markers(object):
self.markers = json.load(json_data_fh)
def add_pvalues(self, p_values):
- print("length of self.markers:", len(self.markers))
- print("length of p_values:", len(p_values))
+ #print("length of self.markers:", len(self.markers))
+ #print("length of p_values:", len(p_values))
# THIS IS only needed for the case when we are limiting the number of p-values calculated
if len(self.markers) < len(p_values):
@@ -153,7 +161,7 @@ class DatasetGroup(object):
self.f1list = None
self.parlist = None
self.get_f1_parent_strains()
- print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
+ #print("parents/f1s: {}:{}".format(self.parlist, self.f1list))
self.species = webqtlDatabaseFunction.retrieve_species(self.name)
@@ -162,7 +170,7 @@ class DatasetGroup(object):
def get_markers(self):
- print("self.species is:", self.species)
+ #print("self.species is:", self.species)
if self.species == "human":
marker_class = HumanMarkers
else:
@@ -235,6 +243,7 @@ class DataSet(object):
self.retrieve_other_names()
self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype
+ self.group.read_genotype_file()
self.species = species.TheSpecies(self)
@@ -284,14 +293,14 @@ class DataSet(object):
self.name,
self.name,
self.name))
- print("query_args are:", query_args)
+ #print("query_args are:", query_args)
- print("""
- SELECT Id, Name, FullName, ShortName
- FROM %s
- WHERE public > %s AND
- (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
- """ % (query_args))
+ #print("""
+ # SELECT Id, Name, FullName, ShortName
+ # FROM %s
+ # WHERE public > %s AND
+ # (Name = '%s' OR FullName = '%s' OR ShortName = '%s')
+ # """ % (query_args))
self.id, self.name, self.fullname, self.shortname = g.db.execute("""
SELECT Id, Name, FullName, ShortName
@@ -615,34 +624,33 @@ class MrnaAssayDataSet(DataSet):
and ProbeSetFreezeId = {}
""".format(escape(str(self.id)))
results = g.db.execute(query).fetchall()
- print("After get_trait_list query")
+ #print("After get_trait_list query")
trait_data = {}
for trait in results:
print("Retrieving sample_data for ", trait[0])
trait_data[trait[0]] = self.retrieve_sample_data(trait[0])
- print("After retrieve_sample_data")
+ #print("After retrieve_sample_data")
return trait_data
def get_trait_data(self):
- sample_ids = []
- for sample in self.group.samplelist:
- query = """
- SELECT Strain.Id FROM Strain, Species
- WHERE Strain.Name = '{}'
- and Strain.SpeciesId=Species.Id
- and Species.name = '{}'
- """.format(*mescape(sample, self.group.species))
- this_id = g.db.execute(query).fetchone()[0]
- sample_ids.append('%d' % this_id)
- print("sample_ids size: ", len(sample_ids))
+ self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+ query = """
+ SELECT Strain.Name, Strain.Id FROM Strain, Species
+ WHERE Strain.Name IN {}
+ and Strain.SpeciesId=Species.Id
+ and Species.name = '{}'
+ """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+ results = dict(g.db.execute(query).fetchall())
+ sample_ids = [results[item] for item in self.samplelist]
# MySQL limits the number of tables that can be used in a join to 61,
# so we break the sample ids into smaller chunks
- chunk_count = 50
- n = len(sample_ids) / chunk_count
- if len(sample_ids) % chunk_count:
- n += 1
- print("n: ", n)
+ # Postgres doesn't have that limit, so we can get rid of this after we transition
+ chunk_size = 50
+ number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+ trait_sample_data = []
+ for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
#XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
#tempTable = None
#if GeneId and db.type == "ProbeSet":
@@ -656,24 +664,21 @@ class MrnaAssayDataSet(DataSet):
# TissueProbeSetFreezeId=tissueProbeSetFreezeId,
# method=method,
# returnNumber=returnNumber)
- trait_sample_data = []
- for step in range(int(n)):
- temp = []
- sample_ids_step = sample_ids[step*chunk_count:min(len(sample_ids), (step+1)*chunk_count)]
- for item in sample_ids_step:
- temp.append('T%s.value' % item)
+
+ temp = ['T%s.value' % item for item in sample_ids_step]
query = "SELECT {}.Name,".format(escape(self.type))
data_start_pos = 1
query += string.join(temp, ', ')
query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
self.type,
self.type))
- #XZ, 03/04/2009: Xiaodong changed Data to %sData and changed parameters from %(item,item, db.type,item,item) to %(db.type, item,item, db.type,item,item)
+
for item in sample_ids_step:
query += """
left join {}Data as T{} on T{}.Id = {}XRef.DataId
and T{}.StrainId={}\n
""".format(*mescape(self.type, item, item, self.type, item, item))
+
query += """
WHERE {}XRef.{}FreezeId = {}Freeze.Id
and {}Freeze.Name = '{}'
@@ -681,23 +686,24 @@ class MrnaAssayDataSet(DataSet):
order by {}.Id
""".format(*mescape(self.type, self.type, self.type, self.type,
self.name, self.type, self.type, self.type, self.type))
- print("query: ", query)
results = g.db.execute(query).fetchall()
trait_sample_data.append(results)
-
+
trait_count = len(trait_sample_data[0])
self.trait_data = collections.defaultdict(list)
+
# put all of the separate data together into a dictionary where the keys are
# trait names and values are lists of sample values
- for j in range(trait_count):
- trait_name = trait_sample_data[0][j][0]
- for i in range(int(n)):
- self.trait_data[trait_name] += trait_sample_data[i][j][data_start_pos:]
-
+ for trait_counter in range(trait_count):
+ trait_name = trait_sample_data[0][trait_counter][0]
+ for chunk_counter in range(int(number_chunks)):
+ self.trait_data[trait_name] += (
+ trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+
def get_trait_info(self, trait_list=None, species=''):
- # Note: setting trait_list to [] is probably not a great idea.
+ # Note: setting trait_list to [] is probably not a great idea.
if not trait_list:
trait_list = []
@@ -706,9 +712,7 @@ class MrnaAssayDataSet(DataSet):
if not this_trait.haveinfo:
this_trait.retrieveInfo(QTL=1)
- if this_trait.symbol:
- pass
- else:
+ if not this_trait.symbol:
this_trait.symbol = "N/A"
#XZ, 12/08/2008: description
@@ -716,62 +720,56 @@ class MrnaAssayDataSet(DataSet):
description_string = str(this_trait.description).strip()
target_string = str(this_trait.probe_target_description).strip()
- description_display = ''
-
if len(description_string) > 1 and description_string != 'None':
description_display = description_string
else:
description_display = this_trait.symbol
- if len(description_display) > 1 and description_display != 'N/A' and len(target_string) > 1 and target_string != 'None':
+ if (len(description_display) > 1 and description_display != 'N/A' and
+ len(target_string) > 1 and target_string != 'None'):
description_display = description_display + '; ' + target_string.strip()
# Save it for the jinja2 template
this_trait.description_display = description_display
- #print(" xxxxdd [%s]: %s" % (type(this_trait.description_display), description_display))
#XZ: trait_location_value is used for sorting
trait_location_repr = 'N/A'
trait_location_value = 1000000
if this_trait.chr and this_trait.mb:
- print("this_trait.chr is:", this_trait.chr)
- print("this_trait.mb is:", this_trait.mb)
- try:
- trait_location_value = float(this_trait.chr)*1000 + float(this_trait.mb)
- except:
- if this_trait.chr.upper() == 'X':
- trait_location_value = 20*1000 + this_trait.mb
- else:
- trait_location_value = ord(str(this_trait.chr).upper()[0])*1000 + this_trait.mb
-
- this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr, float(this_trait.mb) )
+ #Checks if the chromosome number can be cast to an int (i.e. isn't "X" or "Y")
+ #This is so we can convert the location to a number used for sorting
+ trait_location_value = self.convert_location_to_value(this_trait.chr, this_trait.mb)
+ #try:
+ # trait_location_value = int(this_trait.chr)*1000 + this_trait.mb
+ #except ValueError:
+ # if this_trait.chr.upper() == 'X':
+ # trait_location_value = 20*1000 + this_trait.mb
+ # else:
+ # trait_location_value = (ord(str(this_trait.chr).upper()[0])*1000 +
+ # this_trait.mb)
+
+ #ZS: Put this in function currently called "convert_location_to_value"
+ this_trait.location_repr = 'Chr %s: %.4f Mb' % (this_trait.chr,
+ float(this_trait.mb))
this_trait.location_value = trait_location_value
- #this_trait.trait_location_value = trait_location_value
- #XZ, 01/12/08: This SQL query is much faster.
+ #Get mean expression value
query = (
-"""select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
- where ProbeSetXRef.ProbeSetFreezeId = %s and
- ProbeSet.Id = ProbeSetXRef.ProbeSetId and
- ProbeSet.Name = '%s'
+ """select ProbeSetXRef.mean from ProbeSetXRef, ProbeSet
+ where ProbeSetXRef.ProbeSetFreezeId = %s and
+ ProbeSet.Id = ProbeSetXRef.ProbeSetId and
+ ProbeSet.Name = '%s'
""" % (escape(str(this_trait.dataset.id)),
escape(this_trait.name)))
- print("query is:", pf(query))
+ #print("query is:", pf(query))
result = g.db.execute(query).fetchone()
+
+ mean = result[0] if result else 0
- if result:
- if result[0]:
- mean = result[0]
- else:
- mean=0
- else:
- mean = 0
-
- #XZ, 06/05/2009: It is neccessary to turn on nowrap
- this_trait.mean = repr = "%2.3f" % mean
+ this_trait.mean = "%2.3f" % mean
#LRS and its location
this_trait.LRS_score_repr = 'N/A'
@@ -790,23 +788,39 @@ class MrnaAssayDataSet(DataSet):
result = self.cursor.fetchone()
if result:
- if result[0] and result[1]:
- LRS_Chr = result[0]
- LRS_Mb = result[1]
-
- #XZ: LRS_location_value is used for sorting
- try:
- LRS_location_value = int(LRS_Chr)*1000 + float(LRS_Mb)
- except:
- if LRS_Chr.upper() == 'X':
- LRS_location_value = 20*1000 + float(LRS_Mb)
- else:
- LRS_location_value = ord(str(LRS_chr).upper()[0])*1000 + float(LRS_Mb)
+ #if result[0] and result[1]:
+ # lrs_chr = result[0]
+ # lrs_mb = result[1]
+ lrs_chr, lrs_mb = result
+ #XZ: LRS_location_value is used for sorting
+ lrs_location_value = self.convert_location_to_value(lrs_chr, lrs_mb)
+
+ #try:
+ # lrs_location_value = int(lrs_chr)*1000 + float(lrs_mb)
+ #except:
+ # if lrs_chr.upper() == 'X':
+ # lrs_location_value = 20*1000 + float(lrs_mb)
+ # else:
+ # lrs_location_value = (ord(str(LRS_chr).upper()[0])*1000 +
+ # float(lrs_mb))
+
+ this_trait.LRS_score_repr = '%3.1f' % this_trait.lrs
+ this_trait.LRS_score_value = this_trait.lrs
+ this_trait.LRS_location_repr = 'Chr %s: %.4f Mb' % (lrs_chr, float(lrs_mb))
+
+
+ def convert_location_to_value(self, chromosome, mb):
+ try:
+ location_value = int(chromosome)*1000 + float(mb)
+ except ValueError:
+ if chromosome.upper() == 'X':
+ location_value = 20*1000 + float(mb)
+ else:
+ location_value = (ord(str(chromosome).upper()[0])*1000 +
+ float(mb))
+
+ return location_value
- this_trait.LRS_score_repr = LRS_score_repr = '%3.1f' % this_trait.lrs
- this_trait.LRS_score_value = LRS_score_value = this_trait.lrs
- this_trait.LRS_location_repr = LRS_location_repr = 'Chr %s: %.4f Mb' % (LRS_Chr, float(LRS_Mb) )
-
def get_sequence(self):
query = """
SELECT
@@ -912,7 +926,7 @@ class TempDataSet(DataSet):
def geno_mrna_confidentiality(ob):
dataset_table = ob.type + "Freeze"
- print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
+ #print("dataset_table [%s]: %s" % (type(dataset_table), dataset_table))
query = '''SELECT Id, Name, FullName, confidentiality,
AuthorisedUsers FROM %s WHERE Name = %%s''' % (dataset_table)