From eef63adae30c1547f4c4189eb59a18d190c3aa08 Mon Sep 17 00:00:00 2001 From: pjotrp Date: Mon, 11 May 2015 17:03:42 -0500 Subject: Moving pylmm out of the tree --- wqflask/base/data_set.py | 2 +- wqflask/utility/chunks.py | 96 ++++ wqflask/wqflask/heatmap/heatmap.py | 635 +++++++++++---------- .../wqflask/marker_regression/marker_regression.py | 4 +- 4 files changed, 417 insertions(+), 320 deletions(-) create mode 100644 wqflask/utility/chunks.py (limited to 'wqflask') diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 489bd374..9f805fc3 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -42,7 +42,7 @@ from base import species from dbFunction import webqtlDatabaseFunction from utility import webqtlUtil from utility.benchmark import Bench -from wqflask.my_pylmm.pyLMM import chunks +from wqflask.utility import chunks from maintenance import get_group_samplelists diff --git a/wqflask/utility/chunks.py b/wqflask/utility/chunks.py new file mode 100644 index 00000000..9565fb96 --- /dev/null +++ b/wqflask/utility/chunks.py @@ -0,0 +1,96 @@ +from __future__ import absolute_import, print_function, division + +import math +import time + + +def divide_into_chunks(the_list, number_chunks): + """Divides a list into approximately number_chunks smaller lists + + >>> divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3) + [[1, 2, 7], [3, 22, 8], [5, 22, 333]] + >>> divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 4) + [[1, 2, 7], [3, 22, 8], [5, 22, 333]] + >>> divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 5) + [[1, 2], [7, 3], [22, 8], [5, 22], [333]] + >>> + + """ + length = len(the_list) + + if length == 0: + return [[]] + + if length <= number_chunks: + number_chunks = length + + chunksize = int(math.ceil(length / number_chunks)) + + chunks = [] + for counter in range(0, length, chunksize): + chunks.append(the_list[counter:counter+chunksize]) + + return chunks + +def _confirm_chunk(original, result): + all_chunked = [] + for chunk in result: + all_chunked.extend(chunk) + print("length of all chunked:", len(all_chunked)) + assert original == all_chunked, "You didn't chunk right" + + +def _chunk_test(divide_func): + import random + random.seed(7) + + number_exact = 0 + total_amount_off = 0 + + for test in range(1, 1001): + print("\n\ntest:", test) + number_chunks = random.randint(1, 20) + number_elements = random.randint(0, 100) + the_list = list(range(1, number_elements)) + result = divide_func(the_list, number_chunks) + + print("Dividing list of length {} into approximately {} chunks - got {} chunks".format( + len(the_list), number_chunks, len(result))) + print("result:", result) + + _confirm_chunk(the_list, result) + + amount_off = abs(number_chunks - len(result)) + if amount_off == 0: + number_exact += 1 + else: + total_amount_off += amount_off + + + print("\n{} exact out of {} [Total amount off: {}]".format(number_exact, + test, + total_amount_off)) + assert number_exact == 558 + assert total_amount_off == 1580 + return number_exact, total_amount_off + + +def _main(): + info = dict() + #funcs = (("sam", sam_divide_into_chunks), ("zach", zach_divide_into_chunks)) + funcs = (("only one", divide_into_chunks),) + for name, func in funcs: + start = time.time() + number_exact, total_amount_off = _chunk_test(func) + took = time.time() - start + info[name] = dict(number_exact=number_exact, + total_amount_off=total_amount_off, + took=took) + + print("info is:", info) + +if __name__ == '__main__': + _main() + print("\nConfirming doctests...") + import doctest + doctest.testmod() diff --git a/wqflask/wqflask/heatmap/heatmap.py b/wqflask/wqflask/heatmap/heatmap.py index 9b6b1b69..035736fd 100644 --- a/wqflask/wqflask/heatmap/heatmap.py +++ b/wqflask/wqflask/heatmap/heatmap.py @@ -1,317 +1,318 @@ -from __future__ import absolute_import, print_function, division - -import sys -sys.path.append(".") - -import gc -import string -import cPickle -import os -import datetime -import time -import pp -import math -import collections -import resource - -import scipy -import numpy as np -from scipy import linalg - -from pprint import pformat as pf - -from htmlgen import HTMLgen2 as HT -import reaper - -from base.trait import GeneralTrait -from base import data_set -from base import species -from base import webqtlConfig -from utility import webqtlUtil -from wqflask.my_pylmm.data import prep_data -from wqflask.my_pylmm.pyLMM import lmm -from wqflask.my_pylmm.pyLMM import input -from utility import helper_functions -from utility import Plot, Bunch -from utility import temp_data - -from MySQLdb import escape_string as escape - -import cPickle as pickle -import simplejson as json - -from pprint import pformat as pf - -from redis import Redis -Redis = Redis() - -from flask import Flask, g - -class Heatmap(object): - - def __init__(self, start_vars, temp_uuid): - - trait_db_list = [trait.strip() for trait in start_vars['trait_list'].split(',')] - - helper_functions.get_trait_db_obs(self, trait_db_list) - - self.temp_uuid = temp_uuid - self.num_permutations = 5000 - self.dataset = self.trait_list[0][1] - - self.json_data = {} #The dictionary that will be used to create the json object that contains all the data needed to create the figure - - self.all_sample_list = [] - self.traits = [] - - chrnames = [] - self.species = species.TheSpecies(dataset=self.trait_list[0][1]) - for key in self.species.chromosomes.chromosomes.keys(): - chrnames.append([self.species.chromosomes.chromosomes[key].name, self.species.chromosomes.chromosomes[key].mb_length]) - - for trait_db in self.trait_list: - - this_trait = trait_db[0] - self.traits.append(this_trait.name) - this_sample_data = this_trait.data - - for sample in this_sample_data: - if sample not in self.all_sample_list: - self.all_sample_list.append(sample) - - self.sample_data = [] - for trait_db in self.trait_list: - this_trait = trait_db[0] - this_sample_data = this_trait.data - - #self.sample_data[this_trait.name] = [] - this_trait_vals = [] - for sample in self.all_sample_list: - if sample in this_sample_data: - this_trait_vals.append(this_sample_data[sample].value) - #self.sample_data[this_trait.name].append(this_sample_data[sample].value) - else: - this_trait_vals.append('') - #self.sample_data[this_trait.name].append('') - self.sample_data.append(this_trait_vals) - - self.gen_reaper_results() - #self.gen_pylmm_results() - - #chrnames = [] - lodnames = [] - chr_pos = [] - pos = [] - markernames = [] - - for trait in self.trait_results.keys(): - lodnames.append(trait) - - for marker in self.dataset.group.markers.markers: - #if marker['chr'] not in chrnames: - # chr_ob = [marker['chr'], "filler"] - # chrnames.append(chr_ob) - chr_pos.append(marker['chr']) - pos.append(marker['Mb']) - markernames.append(marker['name']) - - self.json_data['chrnames'] = chrnames - self.json_data['lodnames'] = lodnames - self.json_data['chr'] = chr_pos - self.json_data['pos'] = pos - self.json_data['markernames'] = markernames - - for trait in self.trait_results: - self.json_data[trait] = self.trait_results[trait] - - self.js_data = dict( - json_data = self.json_data - ) - - print("self.js_data:", self.js_data) - - - def gen_reaper_results(self): - self.trait_results = {} - for trait_db in self.trait_list: - self.dataset.group.get_markers() - this_trait = trait_db[0] - #this_db = trait_db[1] - genotype = self.dataset.group.read_genotype_file() - samples, values, variances = this_trait.export_informative() - - trimmed_samples = [] - trimmed_values = [] - for i in range(0, len(samples)): - if samples[i] in self.dataset.group.samplelist: - trimmed_samples.append(samples[i]) - trimmed_values.append(values[i]) - - self.lrs_array = genotype.permutation(strains = trimmed_samples, - trait = trimmed_values, - nperm= self.num_permutations) - - #self.suggestive = self.lrs_array[int(self.num_permutations*0.37-1)] - #self.significant = self.lrs_array[int(self.num_permutations*0.95-1)] - - reaper_results = genotype.regression(strains = trimmed_samples, - trait = trimmed_values) - - - lrs_values = [float(qtl.lrs) for qtl in reaper_results] - print("lrs_values:", lrs_values) - #self.dataset.group.markers.add_pvalues(p_values) - - self.trait_results[this_trait.name] = [] - for qtl in reaper_results: - if qtl.additive > 0: - self.trait_results[this_trait.name].append(-float(qtl.lrs)) - else: - self.trait_results[this_trait.name].append(float(qtl.lrs)) - #for lrs in lrs_values: - # if - # self.trait_results[this_trait.name].append(lrs) - - - #this_db_samples = self.dataset.group.samplelist - #this_sample_data = this_trait.data - ##print("this_sample_data", this_sample_data) - #this_trait_vals = [] - #for index, sample in enumerate(this_db_samples): - # if sample in this_sample_data: - # sample_value = this_sample_data[sample].value - # this_trait_vals.append(sample_value) - # else: - # this_trait_vals.append("x") - - #pheno_vector = np.array([val == "x" and np.nan or float(val) for val in this_trait_vals]) - - #key = "pylmm:input:" + str(self.temp_uuid) - #print("key is:", pf(key)) - - #genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers] - - #no_val_samples = self.identify_empty_samples(this_trait_vals) - #trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) - - #genotype_matrix = np.array(trimmed_genotype_data).T - - #print("genotype_matrix:", str(genotype_matrix.tolist())) - #print("pheno_vector:", str(pheno_vector.tolist())) - - #params = dict(pheno_vector = pheno_vector.tolist(), - # genotype_matrix = genotype_matrix.tolist(), - # restricted_max_likelihood = True, - # refit = False, - # temp_uuid = str(self.temp_uuid), - # - # # meta data - # timestamp = datetime.datetime.now().isoformat(), - # ) - # - #json_params = json.dumps(params) - ##print("json_params:", json_params) - #Redis.set(key, json_params) - #Redis.expire(key, 60*60) - #print("before printing command") - # - #command = 'python /home/zas1024/gene/wqflask/wqflask/my_pylmm/pyLMM/lmm.py --key {} --species {}'.format(key, - # "other") - #print("command is:", command) - #print("after printing command") - # - #os.system(command) - # - #json_results = Redis.blpop("pylmm:results:" + str(self.temp_uuid), 45*60) - - def gen_pylmm_results(self): - self.trait_results = {} - for trait_db in self.trait_list: - this_trait = trait_db[0] - #this_db = trait_db[1] - self.dataset.group.get_markers() - - this_db_samples = self.dataset.group.samplelist - this_sample_data = this_trait.data - #print("this_sample_data", this_sample_data) - this_trait_vals = [] - for index, sample in enumerate(this_db_samples): - if sample in this_sample_data: - sample_value = this_sample_data[sample].value - this_trait_vals.append(sample_value) - else: - this_trait_vals.append("x") - - pheno_vector = np.array([val == "x" and np.nan or float(val) for val in this_trait_vals]) - - key = "pylmm:input:" + str(self.temp_uuid) - #print("key is:", pf(key)) - - genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers] - - no_val_samples = self.identify_empty_samples(this_trait_vals) - trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) - - genotype_matrix = np.array(trimmed_genotype_data).T - - #print("genotype_matrix:", str(genotype_matrix.tolist())) - #print("pheno_vector:", str(pheno_vector.tolist())) - - params = dict(pheno_vector = pheno_vector.tolist(), - genotype_matrix = genotype_matrix.tolist(), - restricted_max_likelihood = True, - refit = False, - temp_uuid = str(self.temp_uuid), - - # meta data - timestamp = datetime.datetime.now().isoformat(), - ) - - json_params = json.dumps(params) - #print("json_params:", json_params) - Redis.set(key, json_params) - Redis.expire(key, 60*60) - print("before printing command") - - command = 'python /home/zas1024/gene/wqflask/wqflask/my_pylmm/pyLMM/lmm.py --key {} --species {}'.format(key, - "other") - print("command is:", command) - print("after printing command") - - os.system(command) - - json_results = Redis.blpop("pylmm:results:" + str(self.temp_uuid), 45*60) - results = json.loads(json_results[1]) - p_values = [float(result) for result in results['p_values']] - #print("p_values:", p_values) - self.dataset.group.markers.add_pvalues(p_values) - - self.trait_results[this_trait.name] = [] - for marker in self.dataset.group.markers.markers: - self.trait_results[this_trait.name].append(marker['lod_score']) - - - def identify_empty_samples(self, values): - no_val_samples = [] - for sample_count, val in enumerate(values): - if val == "x": - no_val_samples.append(sample_count) - return no_val_samples - - def trim_genotypes(self, genotype_data, no_value_samples): - trimmed_genotype_data = [] - for marker in genotype_data: - new_genotypes = [] - for item_count, genotype in enumerate(marker): - if item_count in no_value_samples: - continue - try: - genotype = float(genotype) - except ValueError: - genotype = np.nan - pass - new_genotypes.append(genotype) - trimmed_genotype_data.append(new_genotypes) - return trimmed_genotype_data - - \ No newline at end of file +from __future__ import absolute_import, print_function, division + +import sys +sys.path.append(".") + +import gc +import string +import cPickle +import os +import datetime +import time +import pp +import math +import collections +import resource + +import scipy +import numpy as np +from scipy import linalg + +from pprint import pformat as pf + +from htmlgen import HTMLgen2 as HT +import reaper + +from base.trait import GeneralTrait +from base import data_set +from base import species +from base import webqtlConfig +from utility import webqtlUtil +from wqflask.my_pylmm.data import prep_data +# from wqflask.my_pylmm.pyLMM import lmm +# from wqflask.my_pylmm.pyLMM import input +from utility import helper_functions +from utility import Plot, Bunch +from utility import temp_data + +from MySQLdb import escape_string as escape + +import cPickle as pickle +import simplejson as json + +from pprint import pformat as pf + +from redis import Redis +Redis = Redis() + +from flask import Flask, g + +class Heatmap(object): + + def __init__(self, start_vars, temp_uuid): + + trait_db_list = [trait.strip() for trait in start_vars['trait_list'].split(',')] + + helper_functions.get_trait_db_obs(self, trait_db_list) + + self.temp_uuid = temp_uuid + self.num_permutations = 5000 + self.dataset = self.trait_list[0][1] + + self.json_data = {} #The dictionary that will be used to create the json object that contains all the data needed to create the figure + + self.all_sample_list = [] + self.traits = [] + + chrnames = [] + self.species = species.TheSpecies(dataset=self.trait_list[0][1]) + for key in self.species.chromosomes.chromosomes.keys(): + chrnames.append([self.species.chromosomes.chromosomes[key].name, self.species.chromosomes.chromosomes[key].mb_length]) + + for trait_db in self.trait_list: + + this_trait = trait_db[0] + self.traits.append(this_trait.name) + this_sample_data = this_trait.data + + for sample in this_sample_data: + if sample not in self.all_sample_list: + self.all_sample_list.append(sample) + + self.sample_data = [] + for trait_db in self.trait_list: + this_trait = trait_db[0] + this_sample_data = this_trait.data + + #self.sample_data[this_trait.name] = [] + this_trait_vals = [] + for sample in self.all_sample_list: + if sample in this_sample_data: + this_trait_vals.append(this_sample_data[sample].value) + #self.sample_data[this_trait.name].append(this_sample_data[sample].value) + else: + this_trait_vals.append('') + #self.sample_data[this_trait.name].append('') + self.sample_data.append(this_trait_vals) + + self.gen_reaper_results() + #self.gen_pylmm_results() + + #chrnames = [] + lodnames = [] + chr_pos = [] + pos = [] + markernames = [] + + for trait in self.trait_results.keys(): + lodnames.append(trait) + + for marker in self.dataset.group.markers.markers: + #if marker['chr'] not in chrnames: + # chr_ob = [marker['chr'], "filler"] + # chrnames.append(chr_ob) + chr_pos.append(marker['chr']) + pos.append(marker['Mb']) + markernames.append(marker['name']) + + self.json_data['chrnames'] = chrnames + self.json_data['lodnames'] = lodnames + self.json_data['chr'] = chr_pos + self.json_data['pos'] = pos + self.json_data['markernames'] = markernames + + for trait in self.trait_results: + self.json_data[trait] = self.trait_results[trait] + + self.js_data = dict( + json_data = self.json_data + ) + + print("self.js_data:", self.js_data) + + + def gen_reaper_results(self): + self.trait_results = {} + for trait_db in self.trait_list: + self.dataset.group.get_markers() + this_trait = trait_db[0] + #this_db = trait_db[1] + genotype = self.dataset.group.read_genotype_file() + samples, values, variances = this_trait.export_informative() + + trimmed_samples = [] + trimmed_values = [] + for i in range(0, len(samples)): + if samples[i] in self.dataset.group.samplelist: + trimmed_samples.append(samples[i]) + trimmed_values.append(values[i]) + + self.lrs_array = genotype.permutation(strains = trimmed_samples, + trait = trimmed_values, + nperm= self.num_permutations) + + #self.suggestive = self.lrs_array[int(self.num_permutations*0.37-1)] + #self.significant = self.lrs_array[int(self.num_permutations*0.95-1)] + + reaper_results = genotype.regression(strains = trimmed_samples, + trait = trimmed_values) + + + lrs_values = [float(qtl.lrs) for qtl in reaper_results] + print("lrs_values:", lrs_values) + #self.dataset.group.markers.add_pvalues(p_values) + + self.trait_results[this_trait.name] = [] + for qtl in reaper_results: + if qtl.additive > 0: + self.trait_results[this_trait.name].append(-float(qtl.lrs)) + else: + self.trait_results[this_trait.name].append(float(qtl.lrs)) + #for lrs in lrs_values: + # if + # self.trait_results[this_trait.name].append(lrs) + + + #this_db_samples = self.dataset.group.samplelist + #this_sample_data = this_trait.data + ##print("this_sample_data", this_sample_data) + #this_trait_vals = [] + #for index, sample in enumerate(this_db_samples): + # if sample in this_sample_data: + # sample_value = this_sample_data[sample].value + # this_trait_vals.append(sample_value) + # else: + # this_trait_vals.append("x") + + #pheno_vector = np.array([val == "x" and np.nan or float(val) for val in this_trait_vals]) + + #key = "pylmm:input:" + str(self.temp_uuid) + #print("key is:", pf(key)) + + #genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers] + + #no_val_samples = self.identify_empty_samples(this_trait_vals) + #trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) + + #genotype_matrix = np.array(trimmed_genotype_data).T + + #print("genotype_matrix:", str(genotype_matrix.tolist())) + #print("pheno_vector:", str(pheno_vector.tolist())) + + #params = dict(pheno_vector = pheno_vector.tolist(), + # genotype_matrix = genotype_matrix.tolist(), + # restricted_max_likelihood = True, + # refit = False, + # temp_uuid = str(self.temp_uuid), + # + # # meta data + # timestamp = datetime.datetime.now().isoformat(), + # ) + # + #json_params = json.dumps(params) + ##print("json_params:", json_params) + #Redis.set(key, json_params) + #Redis.expire(key, 60*60) + #print("before printing command") + # + #command = 'python /home/zas1024/gene/wqflask/wqflask/my_pylmm/pyLMM/lmm.py --key {} --species {}'.format(key, + # "other") + #print("command is:", command) + #print("after printing command") + # + #os.system(command) + # + #json_results = Redis.blpop("pylmm:results:" + str(self.temp_uuid), 45*60) + + def gen_pylmm_results(self): + # This function is NOT used. If it is, we should use a shared function with marker_regression.py + self.trait_results = {} + for trait_db in self.trait_list: + this_trait = trait_db[0] + #this_db = trait_db[1] + self.dataset.group.get_markers() + + this_db_samples = self.dataset.group.samplelist + this_sample_data = this_trait.data + #print("this_sample_data", this_sample_data) + this_trait_vals = [] + for index, sample in enumerate(this_db_samples): + if sample in this_sample_data: + sample_value = this_sample_data[sample].value + this_trait_vals.append(sample_value) + else: + this_trait_vals.append("x") + + pheno_vector = np.array([val == "x" and np.nan or float(val) for val in this_trait_vals]) + + key = "pylmm:input:" + str(self.temp_uuid) + #print("key is:", pf(key)) + + genotype_data = [marker['genotypes'] for marker in self.dataset.group.markers.markers] + + no_val_samples = self.identify_empty_samples(this_trait_vals) + trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) + + genotype_matrix = np.array(trimmed_genotype_data).T + + #print("genotype_matrix:", str(genotype_matrix.tolist())) + #print("pheno_vector:", str(pheno_vector.tolist())) + + params = dict(pheno_vector = pheno_vector.tolist(), + genotype_matrix = genotype_matrix.tolist(), + restricted_max_likelihood = True, + refit = False, + temp_uuid = str(self.temp_uuid), + + # meta data + timestamp = datetime.datetime.now().isoformat(), + ) + + json_params = json.dumps(params) + #print("json_params:", json_params) + Redis.set(key, json_params) + Redis.expire(key, 60*60) + print("before printing command") + + command = 'python /home/zas1024/gene/wqflask/wqflask/my_pylmm/pyLMM/lmm.py --key {} --species {}'.format(key, + "other") + print("command is:", command) + print("after printing command") + + os.system(command) + + json_results = Redis.blpop("pylmm:results:" + str(self.temp_uuid), 45*60) + results = json.loads(json_results[1]) + p_values = [float(result) for result in results['p_values']] + #print("p_values:", p_values) + self.dataset.group.markers.add_pvalues(p_values) + + self.trait_results[this_trait.name] = [] + for marker in self.dataset.group.markers.markers: + self.trait_results[this_trait.name].append(marker['lod_score']) + + + def identify_empty_samples(self, values): + no_val_samples = [] + for sample_count, val in enumerate(values): + if val == "x": + no_val_samples.append(sample_count) + return no_val_samples + + def trim_genotypes(self, genotype_data, no_value_samples): + trimmed_genotype_data = [] + for marker in genotype_data: + new_genotypes = [] + for item_count, genotype in enumerate(marker): + if item_count in no_value_samples: + continue + try: + genotype = float(genotype) + except ValueError: + genotype = np.nan + pass + new_genotypes.append(genotype) + trimmed_genotype_data.append(new_genotypes) + return trimmed_genotype_data + + diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py index 49521bd6..c5fab4ee 100755 --- a/wqflask/wqflask/marker_regression/marker_regression.py +++ b/wqflask/wqflask/marker_regression/marker_regression.py @@ -37,8 +37,8 @@ from utility import webqtlUtil from wqflask.marker_regression import gemma_mapping #from wqflask.marker_regression import rqtl_mapping from wqflask.my_pylmm.data import prep_data -from wqflask.my_pylmm.pyLMM import lmm -from wqflask.my_pylmm.pyLMM import input +# from wqflask.my_pylmm.pyLMM import lmm +# from wqflask.my_pylmm.pyLMM import input from utility import helper_functions from utility import Plot, Bunch from utility import temp_data -- cgit v1.2.3