From e31d163325d0d417bf266d1c3d9e52b6ff00f83b Mon Sep 17 00:00:00 2001 From: Lei Yan Date: Thu, 23 May 2013 20:53:11 +0000 Subject: Now calculates correlation values for traits, but not yet in template --- wqflask/base/data_set.py | 46 +++++++++++++------- wqflask/wqflask/correlation/show_corr_results.py | 55 ++++++++++++++++-------- 2 files changed, 67 insertions(+), 34 deletions(-) diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 1520b180..89bbf03d 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -46,7 +46,7 @@ from pprint import pformat as pf DS_NAME_MAP = {} def create_dataset(dataset_name): - print("dataset_name:", dataset_name) + #print("dataset_name:", dataset_name) query = """ SELECT DBType.Name @@ -71,7 +71,7 @@ def create_dataset(dataset_name): def mescape(*items): """Multiple escape""" escaped = [escape(item) for item in items] - print("escaped is:", escaped) + #print("escaped is:", escaped) return escaped @@ -235,6 +235,7 @@ class DataSet(object): self.retrieve_other_names() self.group = DatasetGroup(self) # sets self.group and self.group_id and gets genotype + self.group.read_genotype_file() self.species = species.TheSpecies(self) @@ -624,17 +625,34 @@ class MrnaAssayDataSet(DataSet): return trait_data def get_trait_data(self): + import pdb + pdb.set_trace() + #samplelist = [] + #samplelist += self.group.samplelist + #samplelist += self.group.parlist + #samplelist += self.group.f1list + #self.samplelist = samplelist + + self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list + sample_ids = [] - for sample in self.group.samplelist: - query = """ - SELECT Strain.Id FROM Strain, Species - WHERE Strain.Name = '{}' - and Strain.SpeciesId=Species.Id - and Species.name = '{}' - """.format(*mescape(sample, self.group.species)) - this_id = g.db.execute(query).fetchone()[0] - sample_ids.append('%d' % this_id) - print("sample_ids size: ", len(sample_ids)) + + where_clause = "" + for sample in self.samplelist: + if len(where_clause): + where_clause += " or " + where_clause += """'{}'""".format(*mescape(sample)) + + query = """ + SELECT Strain.Id, Strain.Name FROM Strain, Species + WHERE Strain.Name = '{}' + and Strain.SpeciesId=Species.Id + and Species.name = '{}' + """.format(*mescape(where_clause, self.group.species)) + result = g.db.execute(query).fetchall() + + print("[blueberry] result is:", pf(result)) + #sample_ids.append('%d' % this_id) # MySQL limits the number of tables that can be used in a join to 61, # so we break the sample ids into smaller chunks @@ -642,7 +660,6 @@ class MrnaAssayDataSet(DataSet): n = len(sample_ids) / chunk_count if len(sample_ids) % chunk_count: n += 1 - print("n: ", n) #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId #tempTable = None #if GeneId and db.type == "ProbeSet": @@ -681,10 +698,9 @@ class MrnaAssayDataSet(DataSet): order by {}.Id """.format(*mescape(self.type, self.type, self.type, self.type, self.name, self.type, self.type, self.type, self.type)) - print("query: ", query) results = g.db.execute(query).fetchall() trait_sample_data.append(results) - + trait_count = len(trait_sample_data[0]) self.trait_data = collections.defaultdict(list) # put all of the separate data together into a dictionary where the keys are diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py index 1d0368cc..ee732050 100644 --- a/wqflask/wqflask/correlation/show_corr_results.py +++ b/wqflask/wqflask/correlation/show_corr_results.py @@ -37,6 +37,7 @@ import time #import pyXLWriter as xl import pp import math +import collections from pprint import pformat as pf @@ -285,16 +286,15 @@ class CorrelationResults(object): # name=start_vars['trait_id'], # cellid=None) - print("start_vars: ", pf(start_vars)) + #print("start_vars: ", pf(start_vars)) helper_functions.get_species_dataset_trait(self, start_vars) self.dataset.group.read_genotype_file() - - self.samples = [] # Want only ones with values - self.vals = [] corr_samples_group = start_vars['corr_samples_group'] + self.sample_data = {} + #The two if statements below append samples to the sample list based upon whether the user #selected Primary Samples Only, Other Samples Only, or All Samples @@ -310,16 +310,24 @@ class CorrelationResults(object): self.dataset.group.f1list + self.dataset.group.samplelist) self.process_samples(start_vars, self.this_trait.data.keys(), primary_samples) - - #for i, sample in enumerate(self.samples): - # print("{} : {}".format(sample, self.vals[i])) - self.target_dataset = data_set.create_dataset(start_vars['corr_dataset']) self.target_dataset.get_trait_data() - print("trait_list: {}".format(pf(self.target_dataset.trait_data))) # Lei Yan todo + import pdb + pdb.set_trace() + correlation_data = collections.defaultdict(list) for trait, values in self.target_dataset.trait_data.iteritems(): - correlation = calCorrelation(values, ) + values_1 = [] + values_2 = [] + for index,sample in enumerate(self.target_dataset.samplelist): + target_value = values[index] + if sample in self.sample_data.keys(): + this_value = self.sample_data[sample] + values_1.append(this_value) + values_2.append(target_value) + correlation = calCorrelation(values_1, values_2) + correlation_data[trait] = correlation + print ('%s %s' % (trait, correlation)) #XZ, 09/18/2008: get all information about the user selected database. #target_db_name = fd.corr_dataset @@ -779,19 +787,28 @@ makeWebGestaltTree(thisForm, '%s', %d, 'edag_only.php'); """ + #def process_samples(self, start_vars, sample_names, excluded_samples): + # for sample in sample_names: + # if sample not in excluded_samples: + # value = start_vars['value:' + sample] + # variance = start_vars['variance:' + sample] + # if variance.strip().lower() == 'x': + # variance = 0 + # else: + # variance = float(variance) + # if value.strip().lower() != 'x': + # self.samples.append(str(sample)) + # self.vals.append(float(value)) + # #self.variances.append(variance) + def process_samples(self, start_vars, sample_names, excluded_samples): for sample in sample_names: if sample not in excluded_samples: value = start_vars['value:' + sample] - variance = start_vars['variance:' + sample] - if variance.strip().lower() == 'x': - variance = 0 + if value.strip().lower() == 'x': + self.sample_data[str(sample)] = None else: - variance = float(variance) - if value.strip().lower() != 'x': - self.samples.append(str(sample)) - self.vals.append(float(value)) - #self.variances.append(variance) + self.sample_data[str(sample)] = float(value) def getSortByValue(self, calculationMethod): @@ -2134,7 +2151,7 @@ Resorting this table
def calCorrelation(values_1, values_2): - N = Math.min(len(values_1), len(values_2)) + N = min(len(values_1), len(values_2)) X = [] Y = [] for i in range(N): -- cgit v1.2.3