From 72fb1ac4ade5b812ccafcc00076ae19dcad39b9e Mon Sep 17 00:00:00 2001 From: zsloan Date: Fri, 1 May 2015 22:30:00 +0000 Subject: Added Pjotr's latest pylmm changes Added GEMMA and necessary plink-format genotype files in gemma directory Added the GEMMA genotype files to git LFS Removed some unnecessary/duplicated code from interval_mapping.py Separated GEMMA mapping code into its own file (will also do this for other methods) and fixed and issue that caused it to not run Added the table to the pair scan results page and wrote some code adding its attributes to each marker object, but it still isn't working yet --- wqflask/runserver.py | 7 +- .../wqflask/interval_mapping/interval_mapping.py | 22 +- wqflask/wqflask/marker_regression/gemma_mapping.py | 44 +++ .../wqflask/marker_regression/marker_regression.py | 62 +++- wqflask/wqflask/model.py | 2 +- wqflask/wqflask/my_pylmm/README.md | 52 ++-- wqflask/wqflask/my_pylmm/pyLMM/__init__.py | 2 +- wqflask/wqflask/my_pylmm/pyLMM/benchmark.py | 44 +++ wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 10 +- wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 19 +- wqflask/wqflask/my_pylmm/pyLMM/gn2.py | 110 +++++++ wqflask/wqflask/my_pylmm/pyLMM/gwas.py | 80 +++-- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 83 +++-- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 341 +++++++++++++-------- wqflask/wqflask/my_pylmm/pyLMM/lmm2.py | 71 +++-- wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 37 ++- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 111 ++++--- wqflask/wqflask/my_pylmm/pyLMM/standalone.py | 110 +++++++ wqflask/wqflask/my_pylmm/pyLMM/temp_data.py | 25 ++ wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py | 48 ++- .../new/javascript/show_trait_mapping_tools.coffee | 13 +- .../new/javascript/show_trait_mapping_tools.js | 7 +- wqflask/wqflask/templates/pair_scan_results.html | 22 ++ wqflask/wqflask/templates/search_result_page.html | 2 +- .../templates/show_trait_mapping_tools.html | 12 +- 25 files changed, 956 insertions(+), 380 deletions(-) create mode 100644 wqflask/wqflask/marker_regression/gemma_mapping.py create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/benchmark.py create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/gn2.py create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/standalone.py create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/temp_data.py (limited to 'wqflask') diff --git a/wqflask/runserver.py b/wqflask/runserver.py index 9d5686a9..5a76d1e2 100755 --- a/wqflask/runserver.py +++ b/wqflask/runserver.py @@ -18,9 +18,10 @@ from wqflask import app #_ch = logging.StreamHandler() #_log.addHandler(_ch) +print app.config + import logging -#from themodule import TheHandlerYouWant -file_handler = logging.FileHandler("/tmp/flask_gn_log_danny_unsecure") +file_handler = logging.FileHandler(app.config['LOGFILE']) file_handler.setLevel(logging.DEBUG) app.logger.addHandler(file_handler) @@ -28,7 +29,7 @@ import logging_tree logging_tree.printout() app.run(host='0.0.0.0', - port=5003, + port=app.config['SERVER_PORT'], use_debugger=False, threaded=True, use_reloader=True) diff --git a/wqflask/wqflask/interval_mapping/interval_mapping.py b/wqflask/wqflask/interval_mapping/interval_mapping.py index 5eb901f7..5511826a 100755 --- a/wqflask/wqflask/interval_mapping/interval_mapping.py +++ b/wqflask/wqflask/interval_mapping/interval_mapping.py @@ -57,13 +57,8 @@ class IntervalMapping(object): self.set_options(start_vars) self.json_data = {} - - #if self.method == "qtl_reaper": self.json_data['lodnames'] = ['lod.hk'] self.gen_reaper_results(tempdata) - #else: - # 
self.gen_pylmm_results(tempdata) - #self.gen_qtl_results(tempdata) #Get chromosome lengths for drawing the interval map plot chromosome_mb_lengths = {} @@ -93,13 +88,7 @@ class IntervalMapping(object): def set_options(self, start_vars): """Sets various options (physical/genetic mapping, # permutations, which chromosome""" - #self.plot_scale = start_vars['scale'] - #if self.plotScale == 'physic' and not fd.genotype.Mbmap: - # self.plotScale = 'morgan' - #self.method = start_vars['mapping_method'] self.num_permutations = int(start_vars['num_perm']) - #self.do_bootstrap = start_vars['do_bootstrap'] - #self.selected_chr = start_vars['chromosome'] if start_vars['manhattan_plot'] == "true": self.manhattan_plot = True else: @@ -112,9 +101,6 @@ class IntervalMapping(object): self.control_locus = start_vars['control_locus'] else: self.control_locus = None - #self.weighted_regression = start_vars['weighted'] - #self.lrs_lod = start_vars['lrs_lod'] - def gen_qtl_results(self, tempdata): """Generates qtl results for plotting interval map""" @@ -134,7 +120,7 @@ class IntervalMapping(object): #if self.weighted_regression: # self.lrs_array = self.genotype.permutation(strains = trimmed_samples, # trait = trimmed_values, - # variance = _vars, + # variance = variances, # nperm=self.num_permutations) #else: self.lrs_array = genotype.permutation(strains = trimmed_samples, @@ -175,12 +161,6 @@ class IntervalMapping(object): self.json_data['markernames'] = [] for qtl in reaper_results: reaper_locus = qtl.locus - #if reaper_locus.chr == "20": - # print("changing to X") - # self.json_data['chr'].append("X") - #else: - # self.json_data['chr'].append(reaper_locus.chr) - ##self.json_data['chr'].append(reaper_locus.chr) self.json_data['pos'].append(reaper_locus.Mb) self.json_data['lod.hk'].append(qtl.lrs) self.json_data['markernames'].append(reaper_locus.name) diff --git a/wqflask/wqflask/marker_regression/gemma_mapping.py b/wqflask/wqflask/marker_regression/gemma_mapping.py new file mode 100644 index 00000000..b1ab780c --- /dev/null +++ b/wqflask/wqflask/marker_regression/gemma_mapping.py @@ -0,0 +1,44 @@ +import os + +from base import webqtlConfig + +def run_gemma(this_dataset, samples, vals): + """Generates p-values for each marker using GEMMA""" + + print("INSIDE GEMMA_MAPPING") + + gen_pheno_txt_file(this_dataset, samples, vals) + + os.chdir("{}gemma".format(webqtlConfig.HTMLPATH)) + + gemma_command = './gemma -bfile %s -k output_%s.cXX.txt -lmm 1 -o output/%s_output' % (this_dataset.group.name, + this_dataset.group.name, + this_dataset.group.name) + print("gemma_command:" + gemma_command) + + os.system(gemma_command) + + included_markers, p_values = parse_gemma_output(this_dataset) + + return included_markers, p_values + +def gen_pheno_txt_file(this_dataset, samples, vals): + """Generates phenotype file for GEMMA""" + + with open("{}gemma/{}.fam".format(webqtlConfig.HTMLPATH, this_dataset.group.name), "w") as outfile: + for i, sample in enumerate(samples): + outfile.write(str(sample) + " " + str(sample) + " 0 0 0 " + str(vals[i]) + "\n") + +def parse_gemma_output(this_dataset): + included_markers = [] + p_values = [] + with open("{}gemma/output/{}_output.assoc.txt".format(webqtlConfig.HTMLPATH, this_dataset.group.name)) as output_file: + for line in output_file: + if line.startswith("chr"): + continue + else: + included_markers.append(line.split("\t")[1]) + p_values.append(float(line.split("\t")[10])) + + print("p_values: ", p_values) + return included_markers, p_values \ No newline at end of file diff --git 
a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py index 7708356b..bed316d7 100755 --- a/wqflask/wqflask/marker_regression/marker_regression.py +++ b/wqflask/wqflask/marker_regression/marker_regression.py @@ -25,12 +25,17 @@ from redis import Redis Redis = Redis() from flask import Flask, g +from wqflask import app from base.trait import GeneralTrait from base import data_set from base import species from base import webqtlConfig from utility import webqtlUtil +from wqflask.marker_regression import qtl_reaper_mapping +from wqflask.marker_regression import plink_mapping +from wqflask.marker_regression import gemma_mapping +from wqflask.marker_regression import rqtl_mapping from wqflask.my_pylmm.data import prep_data from wqflask.my_pylmm.pyLMM import lmm from wqflask.my_pylmm.pyLMM import input @@ -40,6 +45,14 @@ from utility import temp_data from utility.benchmark import Bench +import os +if os.environ.get('PYLMM_PATH') is None: + PYLMM_PATH=app.config.get('PYLMM_PATH') + if PYLMM_PATH is None: + PYLMM_PATH=os.environ['HOME']+'/gene/wqflask/wqflask/my_pylmm/pyLMM' +if not os.path.isfile(PYLMM_PATH+'/lmm.py'): + raise 'PYLMM_PATH unknown or faulty' +PYLMM_COMMAND= 'python '+PYLMM_PATH+'/lmm.py' class MarkerRegression(object): @@ -73,7 +86,10 @@ class MarkerRegression(object): self.dataset.group.get_markers() if self.mapping_method == "gemma": - qtl_results = self.run_gemma() + included_markers, p_values = gemma_mapping.run_gemma(self.dataset, self.samples, self.vals) + self.dataset.group.get_specified_markers(markers = included_markers) + self.dataset.group.markers.add_pvalues(p_values) + qtl_results = self.dataset.group.markers.markers elif self.mapping_method == "rqtl_plink": qtl_results = self.run_rqtl_plink() elif self.mapping_method == "rqtl_geno": @@ -88,9 +104,7 @@ class MarkerRegression(object): if start_vars['pair_scan'] == "true": self.pair_scan = True - print("pair scan:", self.pair_scan) - print("DOING RQTL GENO") qtl_results = self.run_rqtl_geno() print("qtl_results:", qtl_results) elif self.mapping_method == "plink": @@ -126,8 +140,9 @@ class MarkerRegression(object): #Need to convert the QTL objects that qtl reaper returns into a json serializable dictionary self.qtl_results = [] - for qtl in self.filtered_markers: - print("lod score is:", qtl['lod_score']) + for index,qtl in enumerate(self.filtered_markers): + if index<40: + print("lod score is:", qtl['lod_score']) if qtl['chr'] == highest_chr and highest_chr != "X" and highest_chr != "X/Y": print("changing to X") self.json_data['chr'].append("X") @@ -144,7 +159,7 @@ class MarkerRegression(object): self.json_data['chrnames'].append([self.species.chromosomes.chromosomes[key].name, self.species.chromosomes.chromosomes[key].mb_length]) chromosome_mb_lengths[key] = self.species.chromosomes.chromosomes[key].mb_length - print("json_data:", self.json_data) + # print("json_data:", self.json_data) self.js_data = dict( @@ -156,11 +171,11 @@ class MarkerRegression(object): chromosomes = chromosome_mb_lengths, qtl_results = self.filtered_markers, ) + def run_gemma(self): """Generates p-values for each marker using GEMMA""" - #filename = webqtlUtil.genRandStr("{}_{}_".format(self.dataset.group.name, self.this_trait.name)) self.gen_pheno_txt_file() os.chdir("/home/zas1024/gene/web/gemma") @@ -272,7 +287,7 @@ class MarkerRegression(object): """) def run_rqtl_geno(self): - print("Calling R/qtl from python") + print("Calling R/qtl") self.geno_to_rqtl_function() @@ -381,9 +396,23 
@@ class MarkerRegression(object): return pheno_as_string def process_pair_scan_results(self, result): - results = [] + pair_scan_results = [] + + result = result[1] + output = [tuple([result[j][i] for j in range(result.ncol)]) for i in range(result.nrow)] + print("R/qtl scantwo output:", output) + + for i, line in enumerate(result.iter_row()): + marker = {} + marker['name'] = result.rownames[i] + marker['chr1'] = output[i][0] + marker['Mb'] = output[i][1] + marker['chr2'] = int(output[i][2]) + pair_scan_results.append(marker) + + print("pair_scan_results:", pair_scan_results) - return results + return pair_scan_results def process_rqtl_results(self, result): # TODO: how to make this a one liner and not copy the stuff in a loop qtl_results = [] @@ -655,8 +684,7 @@ class MarkerRegression(object): Redis.set(key, json_params) Redis.expire(key, 60*60) - command = 'python /home/zas1024/gene/wqflask/wqflask/my_pylmm/pyLMM/lmm.py --key {} --species {}'.format(key, - "other") + command = PYLMM_COMMAND+' --key {} --species {}'.format(key,"other") os.system(command) @@ -713,8 +741,8 @@ class MarkerRegression(object): # "refit": False, # "temp_data": tempdata} - print("genotype_matrix:", str(genotype_matrix.tolist())) - print("pheno_vector:", str(pheno_vector.tolist())) + # print("genotype_matrix:", str(genotype_matrix.tolist())) + # print("pheno_vector:", str(pheno_vector.tolist())) params = dict(pheno_vector = pheno_vector.tolist(), genotype_matrix = genotype_matrix.tolist(), @@ -732,7 +760,7 @@ class MarkerRegression(object): Redis.expire(key, 60*60) print("before printing command") - command = 'python /home/zas1024/gene/wqflask/wqflask/my_pylmm/pyLMM/lmm.py --key {} --species {}'.format(key, + command = PYLMM_COMMAND + ' --key {} --species {}'.format(key, "other") print("command is:", command) print("after printing command") @@ -745,7 +773,7 @@ class MarkerRegression(object): json_results = Redis.blpop("pylmm:results:" + temp_uuid, 45*60) results = json.loads(json_results[1]) p_values = [float(result) for result in results['p_values']] - print("p_values:", p_values) + print("p_values:", p_values[:10]) #p_values = self.trim_results(p_values) t_stats = results['t_stats'] @@ -806,7 +834,7 @@ class MarkerRegression(object): print("Before creating the command") - command = 'python /home/zas1024/gene/wqflask/wqflask/my_pylmm/pyLMM/lmm.py --key {} --species {}'.format(key, + command = PYLMM_COMMAND+' --key {} --species {}'.format(key, "human") print("command is:", command) diff --git a/wqflask/wqflask/model.py b/wqflask/wqflask/model.py index fa8c1aab..042cb8df 100755 --- a/wqflask/wqflask/model.py +++ b/wqflask/wqflask/model.py @@ -194,4 +194,4 @@ def display_collapsible(number): def user_uuid(): """Unique cookie for a user""" user_uuid = request.cookies.get('user_uuid') - \ No newline at end of file + diff --git a/wqflask/wqflask/my_pylmm/README.md b/wqflask/wqflask/my_pylmm/README.md index f6b0e72d..b844c845 100644 --- a/wqflask/wqflask/my_pylmm/README.md +++ b/wqflask/wqflask/my_pylmm/README.md @@ -1,21 +1,37 @@ -# RELEASE NOTES - -## 0.50-gn2-pre1 release - -This is the first test release of multi-core pylmm into GN2. Both -kinship calculation and GWAS have been made multi-threaded by -introducing the Python multiprocessing module. Note that only -run_other has been updated to use the new routines (so human is still -handled the old way). I have taken care that we can still run both -old-style and new-style LMM (through passing the 'new_code' -boolean). 
This could be an option in the web server for users to -select and test for any unexpected differences (of which there should -be none, naturally ;). - -The current version can handle missing phenotypes, but as they are -removed there is no way for GN2 to know what SNPs the P-values belong -to. A future version will pass a SNP index to allow for missing -phenotypes. +# Genenetwork2/pylmm RELEASE NOTES + +## 0.51-gn2 (April 19, 2015) + +- Improved GN2 integration +- Less matrix transposes +- Able to run pylmm standalone without Redis again (still requires + the modules) + +## 0.50-gn2 (April 2nd, 2015) + +- Replaced the GN2 genotype normalization + +## 0.50-gn2-pre2 (March 18, 2015) + +- Added abstractions for progress meter and info/debug statements; + Redis perc_complete is now updated through a lambda + +## 0.50-gn2-pre1 (release, March 17, 2015) + +- This is the first test release of multi-core pylmm into GN2. Both + kinship calculation and GWAS have been made multi-threaded by + introducing the Python multiprocessing module. Note that only + run_other has been updated to use the new routines (so human is + still handled the old way). I have taken care that we can still run + both old-style and new-style LMM (through passing the 'new_code' + boolean). This could be an option in the web server for users to + select and test for any unexpected differences (of which there + should be none, naturally ;). + +- The current version can handle missing phenotypes, but as they are + removed there is no way for GN2 to know what SNPs the P-values + belong to. A future version will pass a SNP index to allow for + missing phenotypes. \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py index c40c3221..f33c4e74 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py @@ -1 +1 @@ -PYLMM_VERSION="0.50-gn2-pre1" +PYLMM_VERSION="0.51-gn2" diff --git a/wqflask/wqflask/my_pylmm/pyLMM/benchmark.py b/wqflask/wqflask/my_pylmm/pyLMM/benchmark.py new file mode 100644 index 00000000..6c6c9f88 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/benchmark.py @@ -0,0 +1,44 @@ +from __future__ import print_function, division, absolute_import + +import collections +import inspect +import time + +class Bench(object): + entries = collections.OrderedDict() + + def __init__(self, name=None): + self.name = name + + def __enter__(self): + if self.name: + print("Starting benchmark: %s" % (self.name)) + else: + print("Starting benchmark at: %s [%i]" % (inspect.stack()[1][3], inspect.stack()[1][2])) + self.start_time = time.time() + + def __exit__(self, type, value, traceback): + if self.name: + name = self.name + else: + name = "That" + + time_taken = time.time() - self.start_time + print(" %s took: %f seconds" % (name, (time_taken))) + + if self.name: + Bench.entries[self.name] = Bench.entries.get(self.name, 0) + time_taken + + + @classmethod + def report(cls): + total_time = sum((time_taken for time_taken in cls.entries.itervalues())) + print("\nTiming report\n") + for name, time_taken in cls.entries.iteritems(): + percent = int(round((time_taken/total_time) * 100)) + print("[{}%] {}: {}".format(percent, name, time_taken)) + print() + + def reset(cls): + """Reset the entries""" + cls.entries = collections.OrderedDict() diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 3b6b5d70..4312fed0 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py 
+++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -1,5 +1,5 @@ -# This is a converter for common LMM formats, so as to keep complexity -# outside the main routines. +# This is a converter for common LMM formats, so as to keep file +# reader complexity outside the main routines. # Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) # @@ -31,6 +31,12 @@ python convertlmm.py [--plink] [--prefix out_basename] [--kinship kfile] [--phen Convert files for runlmm.py processing. Writes to stdout by default. try --help for more information + +Examples: + + python ./my_pylmm/pyLMM/convertlmm.py --plink --pheno data/test_snps.132k.clean.noX.fake.phenos > test.pheno + + python ./my_pylmm/pyLMM/convertlmm.py --plink --pheno data/test_snps.132k.clean.noX.fake.phenos --geno data/test_snps.132k.clean.noX > test.geno """ # if len(args) == 0: diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py index 315fd824..49f32e3a 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -37,14 +37,15 @@ def normalize(ind_g): Run for every SNP list (for one individual) and return normalized SNP genotype values with missing data filled in """ - g1 = np.copy(ind_g) # avoid side effects - x = True - np.isnan(ind_g) # Matrix of True/False - m = ind_g[x].mean() # Global mean value - s = np.sqrt(ind_g[x].var()) # Global stddev - g1[np.isnan(ind_g)] = m # Plug-in mean values for missing data - if s == 0: - g1 = g1 - m # Subtract the mean + g = np.copy(ind_g) # copy to avoid side effects + missing = np.isnan(g) + values = g[True - missing] + mean = values.mean() # Global mean value + stddev = np.sqrt(values.var()) # Global stddev + g[missing] = mean # Plug-in mean values for missing data + if stddev == 0: + g = g - mean # Subtract the mean else: - g1 = (g1 - m) / s # Normalize the deviation - return g1 + g = (g - mean) / stddev # Normalize the deviation + return g diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gn2.py b/wqflask/wqflask/my_pylmm/pyLMM/gn2.py new file mode 100644 index 00000000..821195c8 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/gn2.py @@ -0,0 +1,110 @@ +# Standalone specific methods and callback handler +# +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# Set the log level with +# +# logging.basicConfig(level=logging.DEBUG) + +from __future__ import absolute_import, print_function, division + +import numpy as np +import sys +import logging + +# logger = logging.getLogger(__name__) +logger = logging.getLogger('lmm2') +logging.basicConfig(level=logging.DEBUG) +np.set_printoptions(precision=3,suppress=True) + +progress_location = None +progress_current = None +progress_prev_perc = None + +def progress_default_func(location,count,total): + global progress_current + value = round(count*100.0/total) + progress_current = value + +progress_func = progress_default_func + +def progress_set_func(func): + global progress_func + progress_func = func + +def progress(location, count, total): + global progress_location + global progress_prev_perc + + perc = round(count*100.0/total) + if perc != progress_prev_perc and (location != progress_location or perc > 98 or perc > progress_prev_perc + 5): + progress_func(location, count, total) + logger.info("Progress: %s %d%%" % (location,perc)) + progress_location = location + progress_prev_perc = perc + +def mprint(msg,data): + """ + Array/matrix print function + """ + m = np.array(data) + if m.ndim == 1: + print(msg,m.shape,"=\n",m[0:3]," ... 
",m[-3:]) + if m.ndim == 2: + print(msg,m.shape,"=\n[", + m[0][0:3]," ... ",m[0][-3:],"\n ", + m[1][0:3]," ... ",m[1][-3:],"\n ...\n ", + m[-2][0:3]," ... ",m[-2][-3:],"\n ", + m[-1][0:3]," ... ",m[-1][-3:],"]") + +def fatal(msg): + logger.critical(msg) + raise Exception(msg) + +def callbacks(): + return dict( + write = sys.stdout.write, + writeln = print, + debug = logger.debug, + info = logger.info, + warning = logger.warning, + error = logger.error, + critical = logger.critical, + fatal = fatal, + progress = progress, + mprint = mprint + ) + +def uses(*funcs): + """ + Some sugar + """ + return [callbacks()[func] for func in funcs] + +# ----- Minor test cases: + +if __name__ == '__main__': + # logging.basicConfig(level=logging.DEBUG) + logging.debug("Test %i" % (1)) + d = callbacks()['debug'] + d("TEST") + wrln = callbacks()['writeln'] + wrln("Hello %i" % 34) + progress = callbacks()['progress'] + progress("I am half way",50,100) + list = [0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15, + 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15, + 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15, + 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15, + 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15] + mprint("list",list) + matrix = [[1,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15], + [2,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15], + [3,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15], + [4,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15], + [5,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15], + [6,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15]] + mprint("matrix",matrix) + ix,dx = uses("info","debug") + ix("ix") + dx("dx") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py index b901c0e2..247a8729 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py @@ -20,7 +20,6 @@ import pdb import time import sys -# from utility import temp_data import lmm2 import os @@ -32,16 +31,25 @@ from lmm2 import LMM2 import multiprocessing as mp # Multiprocessing is part of the Python stdlib import Queue +# ---- A trick to decide on the environment: +try: + from wqflask.my_pylmm.pyLMM import chunks + from gn2 import uses +except ImportError: + has_gn2=False + from standalone import uses + +progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal') + + def formatResult(id,beta,betaSD,ts,ps): return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n" def compute_snp(j,n,snp_ids,lmm2,REML,q = None): - # print("COMPUTE SNP",j,snp_ids,"\n") result = [] for snp_id in snp_ids: snp,id = snp_id x = snp.reshape((n,1)) # all the SNPs - # print "X=",x # if refit: # L.fit(X=snp,REML=REML) ts,ps,beta,betaVar = lmm2.association(x,REML=REML,returnBeta=True) @@ -51,33 +59,28 @@ def compute_snp(j,n,snp_ids,lmm2,REML,q = None): q = compute_snp.q q.put([j,result]) return j - # PS.append(ps) - # TS.append(ts) - # return len(result) - # compute.q.put(result) - # return None def f_init(q): compute_snp.q = q def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): """ - Execute a GWAS. The G matrix should be n inds (cols) x m snps (rows) + GWAS. 
The G matrix should be n inds (cols) x m snps (rows) """ + info("In gwas.gwas") matrix_initialize() cpu_num = mp.cpu_count() numThreads = None # for now use all available threads kfile2 = False reml = restricted_max_likelihood - sys.stderr.write(str(G.shape)+"\n") + mprint("G",G) n = G.shape[1] # inds inds = n m = G.shape[0] # snps snps = m - sys.stderr.write(str(m)+" SNPs\n") - # print "***** GWAS: G",G.shape,G - assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds) + info("%s SNPs",snps) + assert snps>=inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds) # CREATE LMM object for association # if not kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=verbose) @@ -85,19 +88,10 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): lmm2 = LMM2(Y,K) # ,Kva,Kve,X0,verbose=verbose) if not refit: - if verbose: sys.stderr.write("Computing fit for null model\n") + info("Computing fit for null model") lmm2.fit() # follow GN model in run_other - if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (lmm2.optH,lmm2.optSigma)) - - # outFile = "test.out" - # out = open(outFile,'w') - out = sys.stderr - - def outputResult(id,beta,betaSD,ts,ps): - out.write(formatResult(id,beta,betaSD,ts,ps)) - def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") - - # printOutHead() + info("heritability=%0.3f, sigma=%0.3f" % (lmm2.optH,lmm2.optSigma)) + res = [] # Set up the pool @@ -106,26 +100,24 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): p = mp.Pool(numThreads, f_init, [q]) collect = [] - # Buffers for pvalues and t-stats - # PS = [] - # TS = [] count = 0 job = 0 jobs_running = 0 + jobs_completed = 0 for snp in G: snp_id = (snp,'SNPID') count += 1 if count % 1000 == 0: job += 1 - if verbose: - sys.stderr.write("Job %d At SNP %d\n" % (job,count)) + debug("Job %d At SNP %d" % (job,count)) if numThreads == 1: - print "Running on 1 THREAD" + debug("Running on 1 THREAD") compute_snp(job,n,collect,lmm2,reml,q) collect = [] j,lst = q.get() - if verbose: - sys.stderr.write("Job "+str(j)+" finished\n") + debug("Job "+str(j)+" finished") + jobs_completed += 1 + progress("GWAS2",jobs_completed,snps/1000) res.append((j,lst)) else: p.apply_async(compute_snp,(job,n,collect,lmm2,reml)) @@ -134,8 +126,9 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): while jobs_running > cpu_num: try: j,lst = q.get_nowait() - if verbose: - sys.stderr.write("Job "+str(j)+" finished\n") + debug("Job "+str(j)+" finished") + jobs_completed += 1 + progress("GWAS2",jobs_completed,snps/1000) res.append((j,lst)) jobs_running -= 1 except Queue.Empty: @@ -150,24 +143,23 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): if numThreads==1 or count<1000 or len(collect)>0: job += 1 - print "Collect final batch size %i job %i @%i: " % (len(collect), job, count) + debug("Collect final batch size %i job %i @%i: " % (len(collect), job, count)) compute_snp(job,n,collect,lmm2,reml,q) collect = [] j,lst = q.get() res.append((j,lst)) - print "count=",count," running=",jobs_running," collect=",len(collect) + debug("count=%i running=%i collect=%i" % (count,jobs_running,len(collect))) for job in range(jobs_running): j,lst = q.get(True,15) # time out - if verbose: - sys.stderr.write("Job "+str(j)+" finished\n") + debug("Job "+str(j)+" finished") + jobs_completed += 1 + progress("GWAS2",jobs_completed,snps/1000) res.append((j,lst)) - print "Before sort",[res1[0] for res1 in 
res] + mprint("Before sort",[res1[0] for res1 in res]) res = sorted(res,key=lambda x: x[0]) - # if verbose: - # print "res=",res[0][0:10] - print "After sort",[res1[0] for res1 in res] - print [len(res1[1]) for res1 in res] + mprint("After sort",[res1[0] for res1 in res]) + info([len(res1[1]) for res1 in res]) ts = [item[0] for j,res1 in res for item in res1] ps = [item[1] for j,res1 in res for item in res1] return ts,ps diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index 0c43587e..1c157fd8 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -28,17 +28,30 @@ import time from optmatrix import matrix_initialize, matrixMultT +# ---- A trick to decide on the environment: +try: + from wqflask.my_pylmm.pyLMM import chunks + from gn2 import uses, progress_set_func +except ImportError: + has_gn2=False + import standalone as handlers + from standalone import uses, progress_set_func + +progress,debug,info,mprint = uses('progress','debug','info','mprint') + def kinship_full(G): """ Calculate the Kinship matrix using a full dot multiplication """ - print G.shape + # mprint("kinship_full G",G) m = G.shape[0] # snps n = G.shape[1] # inds - sys.stderr.write(str(m)+" SNPs\n") - assert m>n, "n should be larger than m (snps>inds)" - m = np.dot(G.T,G) + info("%d SNPs",m) + assert m>n, "n should be larger than m (%d snps > %d inds)" % (m,n) + # m = np.dot(G.T,G) + m = matrixMultT(G.T) m = m/G.shape[0] + # mprint("kinship_full K",m) return m def compute_W(job,G,n,snps,compute_size): @@ -74,46 +87,38 @@ def f_init(q): # Calculate the kinship matrix from G (SNPs as rows!), returns K # -def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True): - numThreads = None - if numThreads: - numThreads = int(numThreads) +def kinship(G,computeSize=1000,numThreads=None,useBLAS=False): + matrix_initialize(useBLAS) - - sys.stderr.write(str(G.shape)+"\n") + + mprint("G",G) n = G.shape[1] # inds inds = n m = G.shape[0] # snps snps = m - sys.stderr.write(str(m)+" SNPs\n") - assert snps>inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds) + info("%i SNPs" % (m)) + assert snps>=inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds) q = mp.Queue() p = mp.Pool(numThreads, f_init, [q]) cpu_num = mp.cpu_count() - print "CPU cores:",cpu_num - print snps,computeSize + info("CPU cores: %i" % cpu_num) iterations = snps/computeSize+1 - # if testing: - # iterations = 8 - # jobs = range(0,8) # range(0,iterations) results = [] - K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals completed = 0 for job in range(iterations): - if verbose: - sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*computeSize))) + info("Processing job %d first %d SNPs" % (job, ((job+1)*computeSize))) W = compute_W(job,G,n,snps,computeSize) if numThreads == 1: # Single-core compute_matrixMult(job,W,q) j,x = q.get() - if verbose: sys.stderr.write("Job "+str(j)+" finished\n") + debug("Job "+str(j)+" finished") + progress("kinship",j,iterations) K_j = x - # print j,K_j[:,0] K = K + K_j else: # Multi-core @@ -123,52 +128,38 @@ def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True): time.sleep(0.1) try: j,x = q.get_nowait() - if verbose: sys.stderr.write("Job "+str(j)+" finished\n") + debug("Job "+str(j)+" finished") K_j = x - # print j,K_j[:,0] K = K + K_j completed += 1 + progress("kinship",completed,iterations) except Queue.Empty: pass if 
numThreads == None or numThreads > 1: - # results contains the growing result set for job in range(len(results)-completed): j,x = q.get(True,15) - if verbose: sys.stderr.write("Job "+str(j)+" finished\n") + debug("Job "+str(j)+" finished") K_j = x - # print j,K_j[:,0] K = K + K_j completed += 1 + progress("kinship",completed,iterations) K = K / float(snps) - - # outFile = 'runtest.kin' - # if verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) - # np.savetxt(outFile,K) - - # if saveKvaKve: - # if verbose: sys.stderr.write("Obtaining Eigendecomposition\n") - # Kva,Kve = linalg.eigh(K) - # if verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) - # np.savetxt(outFile+".kva",Kva) - # np.savetxt(outFile+".kve",Kve) return K -def kvakve(K, verbose=True): +def kvakve(K): """ Obtain eigendecomposition for K and return Kva,Kve where Kva is cleaned of small values < 1e-6 (notably smaller than zero) """ - if verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) - + info("Obtaining eigendecomposition for %dx%d matrix" % (K.shape[0],K.shape[1]) ) Kva,Kve = linalg.eigh(K) - if verbose: - print("Kva is: ", Kva.shape, Kva) - print("Kve is: ", Kve.shape, Kve) + mprint("Kva",Kva) + mprint("Kve",Kve) - if sum(Kva < 1e-6): - if verbose: sys.stderr.write("Cleaning %d eigen values (Kva<0)\n" % (sum(Kva < 0))) + if sum(Kva < 0): + info("Cleaning %d eigen values (Kva<0)" % (sum(Kva < 0))) Kva[Kva < 1e-6] = 1e-6 return Kva,Kve diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 53689071..2a0c7fdc 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -19,49 +19,59 @@ from __future__ import absolute_import, print_function, division import sys import time -import argparse import uuid import numpy as np from scipy import linalg from scipy import optimize from scipy import stats -import pdb +# import pdb -import simplejson as json - -import gzip -import zlib +# import gzip +# import zlib import datetime -import cPickle as pickle -import simplejson as json - +# import cPickle as pickle from pprint import pformat as pf -from redis import Redis -Redis = Redis() - -import sys -sys.path.append("/home/zas1024/gene/wqflask/") - -has_gn2=True - -from utility.benchmark import Bench -from utility import temp_data - -sys.path.append("/home/zas1024/gene/wqflask/wqflask/my_pylmm/pyLMM/") +# Add local dir to PYTHONPATH +import os +cwd = os.path.dirname(__file__) +if sys.path[0] != cwd: + sys.path.insert(1,cwd) +# pylmm imports from kinship import kinship, kinship_full, kvakve import genotype import phenotype import gwas +from benchmark import Bench +# The following imports are for exchanging data with the webserver +import simplejson as json +from redis import Redis +Redis = Redis() +import temp_data + +has_gn2=None + +# sys.stderr.write("INFO: pylmm system path is "+":".join(sys.path)+"\n") +sys.stderr.write("INFO: pylmm file is "+__file__+"\n") + +# ---- A trick to decide on the environment: try: - from wqflask.my_pylmm.pyLMM import chunks + sys.stderr.write("INFO: lmm try loading module\n") + import utility.formatting # this is never used, just to check the environment + sys.stderr.write("INFO: This is a genenetwork2 environment\n") + from gn2 import uses, progress_set_func + has_gn2=True except ImportError: - print("WARNING: Standalone version missing the Genenetwork2 environment\n") + # Failed to load gn2 has_gn2=False - pass + import 
standalone as handlers + from standalone import uses, progress_set_func + sys.stderr.write("WARNING: LMM standalone version missing the Genenetwork2 environment\n") + +progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal') #np.seterr('raise') @@ -76,8 +86,7 @@ def run_human(pheno_vector, covariate_matrix, plink_input_file, kinship_matrix, - refit=False, - tempdata=None): + refit=False): v = np.isnan(pheno_vector) keep = True - v @@ -169,10 +178,7 @@ def run_human(pheno_vector, #if count > 1000: # break count += 1 - - percent_complete = (float(count) / total_snps) * 100 - #print("percent_complete: ", percent_complete) - tempdata.store("percent_complete", percent_complete) + progress("human",count,total_snps) #with Bench("actual association"): ps, ts = human_association(snp, @@ -260,23 +266,19 @@ def human_association(snp, def run_other_old(pheno_vector, genotype_matrix, restricted_max_likelihood=True, - refit=False, - tempdata=None # <---- can not be None - ): + refit=False): """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics restricted_max_likelihood -- whether to use restricted max likelihood; True or False refit -- whether to refit the variance component for each marker - temp_data -- TempData object that stores the progress for each major step of the - calculations ("calculate_kinship" and "GWAS" take the majority of time) """ print("Running the original LMM engine in run_other (old)") print("REML=",restricted_max_likelihood," REFIT=",refit) with Bench("Calculate Kinship"): - kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata) + kinship_matrix,genotype_matrix = calculate_kinship_new(genotype_matrix) print("kinship_matrix: ", pf(kinship_matrix)) print("kinship_matrix.shape: ", pf(kinship_matrix.shape)) @@ -292,27 +294,22 @@ def run_other_old(pheno_vector, with Bench("Doing GWAS"): t_stats, p_values = GWAS(pheno_vector, - genotype_matrix, + genotype_matrix.T, kinship_matrix, restricted_max_likelihood=True, - refit=False, - temp_data=tempdata) + refit=False) Bench().report() return p_values, t_stats -def run_other_new(pheno_vector, - genotype_matrix, +def run_other_new(n,m,pheno_vector, + geno, restricted_max_likelihood=True, - refit=False, - tempdata=None # <---- can not be None - ): + refit=False): """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics restricted_max_likelihood -- whether to use restricted max likelihood; True or False refit -- whether to refit the variance component for each marker - temp_data -- TempData object that stores the progress for each major step of the - calculations ("calculate_kinship" and "GWAS" take the majority of time) """ @@ -320,8 +317,7 @@ def run_other_new(pheno_vector, print("REML=",restricted_max_likelihood," REFIT=",refit) # Adjust phenotypes - Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True) - print("Removed missing phenotypes",Y.shape) + n,Y,keep = phenotype.remove_missing_new(n,pheno_vector) # if options.maf_normalization: # G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) @@ -329,8 +325,9 @@ def run_other_new(pheno_vector, # if not options.skip_genotype_normalization: # G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) + geno = geno[:,keep] with Bench("Calculate Kinship"): - K,G = calculate_kinship(G, tempdata) + K,G = calculate_kinship_new(geno) print("kinship_matrix: ", pf(K)) print("kinship_matrix.shape: ", pf(K.shape)) @@ -345,7 
+342,7 @@ def run_other_new(pheno_vector, with Bench("Doing GWAS"): t_stats, p_values = gwas.gwas(Y, - G.T, + G, K, restricted_max_likelihood=True, refit=False,verbose=True) @@ -385,18 +382,30 @@ def matrixMult(A,B): return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) - -def calculate_kinship_new(genotype_matrix, temp_data=None): +def calculate_kinship_new(genotype_matrix): + """ + Call the new kinship calculation where genotype_matrix contains + inds (columns) by snps (rows). + """ + assert type(genotype_matrix) is np.ndarray + info("call genotype.normalize") + G = np.apply_along_axis( genotype.normalize, axis=1, arr=genotype_matrix) + mprint("G",genotype_matrix) + info("call calculate_kinship_new") + return kinship(G),G # G gets transposed, we'll turn this into an iterator (FIXME) + +def calculate_kinship_iter(geno): """ Call the new kinship calculation where genotype_matrix contains inds (columns) by snps (rows). """ - print("call genotype.normalize") + assert type(genotype_matrix) is iter + info("call genotype.normalize") G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix) - print("call calculate_kinship_new") - return kinship(G.T),G # G gets transposed, we'll turn this into an iterator (FIXME) + info("call calculate_kinship_new") + return kinship(G) -def calculate_kinship_old(genotype_matrix, temp_data=None): +def calculate_kinship_old(genotype_matrix): """ genotype_matrix is an n x m matrix encoding SNP minor alleles. @@ -404,13 +413,15 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): normalizes the resulting vectors and returns the RRM matrix. """ - print("call calculate_kinship_old") + info("call calculate_kinship_old") + fatal("THE FUNCTION calculate_kinship_old IS OBSOLETE, use calculate_kinship_new instead - see Genotype Normalization Problem on Pjotr's blog") n = genotype_matrix.shape[0] m = genotype_matrix.shape[1] - print("genotype 2D matrix n (inds) is:", n) - print("genotype 2D matrix m (snps) is:", m) + info("genotype 2D matrix n (inds) is: %d" % (n)) + info("genotype 2D matrix m (snps) is: %d" % (m)) assert m>n, "n should be larger than m (snps>inds)" keep = [] + mprint("G (before old normalize)",genotype_matrix) for counter in range(m): #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter])) #Checks if any values in column are not numbers @@ -429,17 +440,13 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): continue keep.append(counter) genotype_matrix[:,counter] = (genotype_matrix[:,counter] - values_mean) / np.sqrt(vr) - - percent_complete = int(round((counter/m)*45)) - if temp_data != None: - temp_data.store("percent_complete", percent_complete) + progress('kinship_old normalize genotype',counter,m) genotype_matrix = genotype_matrix[:,keep] - print("After kinship (old) genotype_matrix: ", pf(genotype_matrix)) + mprint("G (after old normalize)",genotype_matrix.T) kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m) return kinship_matrix,genotype_matrix - -calculate_kinship = calculate_kinship_new # alias + # return kinship_full(genotype_matrix.T),genotype_matrix def GWAS(pheno_vector, genotype_matrix, @@ -465,9 +472,9 @@ def GWAS(pheno_vector, refit - refit the variance component for each SNP """ - if kinship_eigen_vals == None: + if kinship_eigen_vals is None: kinship_eigen_vals = [] - if kinship_eigen_vectors == None: + if kinship_eigen_vectors is None: kinship_eigen_vectors = [] n = genotype_matrix.shape[0] @@ -537,9 +544,8 @@ def GWAS(pheno_vector, 
lmm_ob.fit(X=x) ts, ps, beta, betaVar = lmm_ob.association(x, REML=restricted_max_likelihood) - percent_complete = 45 + int(round((counter/m)*55)) - temp_data.store("percent_complete", percent_complete) - + progress("gwas_old",counter,m) + p_values.append(ps) t_statistics.append(ts) @@ -572,7 +578,7 @@ class LMM: When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. """ - if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1) + if X0 is None: X0 = np.ones(len(Y)).reshape(len(Y),1) self.verbose = verbose #x = Y != -9 @@ -665,7 +671,7 @@ class LMM: REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True. """ - if X == None: + if X is None: X = self.X0t elif stack: self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] @@ -804,48 +810,96 @@ class LMM: pl.ylabel("Probability of data") pl.title(title) - -def gn2_redis(key,species,new_code=True): +def run_gwas(species,n,m,k,y,geno,cov=None,reml=True,refit=False,inputfn=None,new_code=True): """ - Invoke pylmm using Redis as a container. new_code runs the new - version + Invoke pylmm using genotype as a matrix or as a (SNP) iterator. """ - json_params = Redis.get(key) - - params = json.loads(json_params) - - tempdata = temp_data.TempData(params['temp_uuid']) - - - print('pheno', np.array(params['pheno_vector'])) + info("run_gwas") + print('pheno', y) if species == "human" : - print('kinship', np.array(params['kinship_matrix'])) - ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']), - covariate_matrix = np.array(params['covariate_matrix']), - plink_input_file = params['input_file_name'], - kinship_matrix = np.array(params['kinship_matrix']), - refit = params['refit'], - tempdata = tempdata) + print('kinship', k ) + ps, ts = run_human(pheno_vector = y, + covariate_matrix = cov, + plink_input_file = inputfn, + kinship_matrix = k, + refit = refit) else: - geno = np.array(params['genotype_matrix']) print('geno', geno.shape, geno) if new_code: - ps, ts = run_other_new(pheno_vector = np.array(params['pheno_vector']), - genotype_matrix = geno, - restricted_max_likelihood = params['restricted_max_likelihood'], - refit = params['refit'], - tempdata = tempdata) + ps, ts = run_other_new(n,m,pheno_vector = y, + geno = geno, + restricted_max_likelihood = reml, + refit = refit) else: - ps, ts = run_other_old(pheno_vector = np.array(params['pheno_vector']), + ps, ts = run_other_old(pheno_vector = y, genotype_matrix = geno, - restricted_max_likelihood = params['restricted_max_likelihood'], - refit = params['refit'], - tempdata = tempdata) + restricted_max_likelihood = reml, + refit = refit) + return ps,ts + +def gwas_with_redis(key,species,new_code=True): + """ + Invoke pylmm using Redis as a container. new_code runs the new + version. All the Redis code goes here! + """ + json_params = Redis.get(key) + + params = json.loads(json_params) + + tempdata = temp_data.TempData(params['temp_uuid']) + def update_tempdata(loc,i,total): + """ + This is the single method that updates Redis for percentage complete! 
+ """ + tempdata.store("percent_complete",round(i*100.0/total)) + debug("Updating REDIS percent_complete=%d" % (round(i*100.0/total))) + progress_set_func(update_tempdata) + + def narray(t): + info("Type is "+t) + v = params.get(t) + if v is not None: + # Note input values can be array of string or float + v1 = [x if x != 'NA' else 'nan' for x in v] + v = np.array(v1).astype(np.float) + return v + + def marray(t): + info("Type is "+t) + v = params.get(t) + if v is not None: + m = [] + for r in v: + # Note input values can be array of string or float + r1 = [x if x != 'NA' else 'nan' for x in r] + m.append(np.array(r1).astype(np.float)) + return np.array(m) + return np.array(v) + + def marrayT(t): + m = marray(t) + if m is not None: + return m.T + return m + + # We are transposing before we enter run_gwas - this should happen on the webserver + # side (or when reading data from file) + k = marray('kinship_matrix') + g = marrayT('genotype_matrix') + mprint("geno",g) + y = narray('pheno_vector') + n = len(y) + m = params.get('num_genotypes') + if m is None: + m = g.shape[0] + info("m=%d,n=%d" % (m,n)) + ps,ts = run_gwas(species,n,m,k,y,g,narray('covariate_matrix'),params['restricted_max_likelihood'],params['refit'],params.get('input_file_name'),new_code) results_key = "pylmm:results:" + params['temp_uuid'] + # fatal(results_key) json_results = json.dumps(dict(p_values = ps, t_stats = ts)) @@ -854,28 +908,56 @@ def gn2_redis(key,species,new_code=True): Redis.expire(results_key, 60*60) return ps, ts -# This is the main function used by Genenetwork2 (with environment) -def gn2_main(): - parser = argparse.ArgumentParser(description='Run pyLMM') - parser.add_argument('-k', '--key') - parser.add_argument('-s', '--species') - - opts = parser.parse_args() - - key = opts.key - species = opts.species +def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True): + """ + This function emulates current GN2 behaviour by pre-loading Redis (note the input + genotype is transposed to emulate GN2 (FIXME!) 
+ """ + info("Loading Redis from parsed data") + if kinship == None: + k = None + else: + k = kinship.tolist() + params = dict(pheno_vector = pheno.tolist(), + genotype_matrix = geno.T.tolist(), + num_genotypes = geno.shape[0], + kinship_matrix = k, + covariate_matrix = None, + input_file_name = None, + restricted_max_likelihood = True, + refit = False, + temp_uuid = "testrun_temp_uuid", + + # meta data + timestamp = datetime.datetime.now().isoformat()) + + json_params = json.dumps(params) + Redis.set(key, json_params) + Redis.expire(key, 60*60) - gn2_redis(key,species) + return gwas_with_redis(key,species,new_code) -def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True): - print("Loading Redis from parsed data") +def gn2_load_redis_iter(key,species,kinship,pheno,geno_iterator): + """ + This function emulates GN2 behaviour by pre-loading Redis with + a SNP iterator, for this it sets a key for every genotype (SNP) + """ + print("Loading Redis using a SNP iterator") + for i,genotypes in enumerate(geno_iterator): + gkey = key+'_geno_'+str(i) + Redis.set(gkey, genotypes) + Redis.expire(gkey, 60*60) + if kinship == None: k = None else: k = kinship.tolist() params = dict(pheno_vector = pheno.tolist(), - genotype_matrix = geno.tolist(), - kinship_matrix= k, + genotype_matrix = "iterator", + num_genotypes = i, + kinship_matrix = k, + covariate_matrix = None, + input_file_name = None, restricted_max_likelihood = True, refit = False, temp_uuid = "testrun_temp_uuid", @@ -887,14 +969,27 @@ def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True): json_params = json.dumps(params) Redis.set(key, json_params) Redis.expire(key, 60*60) + return gwas_with_redis(key,species) - return gn2_redis(key,species,new_code) +# This is the main function used by Genenetwork2 (with environment) +# +# Note that this calling route will become OBSOLETE (we should use runlmm.py +# instead) +def gn2_main(): + import argparse + parser = argparse.ArgumentParser(description='Run pyLMM') + parser.add_argument('-k', '--key') + parser.add_argument('-s', '--species') + opts = parser.parse_args() + + key = opts.key + species = opts.species + + gwas_with_redis(key,species) + + if __name__ == '__main__': print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!") - if has_gn2: - gn2_main() - else: - print("Run from runlmm.py instead") - + gn2_main() diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py index d4b3ac82..d871d8d2 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py @@ -24,6 +24,25 @@ from scipy import optimize from optmatrix import matrixMult import kinship +sys.stderr.write("INFO: pylmm (lmm2) system path is "+":".join(sys.path)+"\n") +sys.stderr.write("INFO: pylmm (lmm2) file is "+__file__+"\n") + +# ---- A trick to decide on the environment: +try: + sys.stderr.write("INFO: lmm2 try loading module\n") + import utility.formatting # this is never used, just to check the environment + sys.stderr.write("INFO: This is a genenetwork2 environment (lmm2)\n") + from gn2 import uses, progress_set_func +except ImportError: + # Failed to load gn2 + has_gn2=False + import standalone as handlers + from standalone import uses, progress_set_func + sys.stderr.write("WARNING: LMM2 standalone version missing the Genenetwork2 environment\n") + +progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal') + + def calculateKinship(W,center=False): """ W is an n x m matrix encoding SNP minor 
alleles. @@ -75,7 +94,7 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): print("genotype matrix n is:", n) print("genotype matrix m is:", m) - if X0 == None: + if X0 is None: X0 = np.ones((n,1)) # Remove missing values in Y and adjust associated parameters @@ -139,31 +158,35 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): class LMM2: - """ - This is a simple version of EMMA/fastLMM. - The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K) - and to optimize this model by finding the maximum-likelihood estimates for the model parameters. - There are three model parameters: heritability (h), covariate coefficients (beta) and the total - phenotypic variance (sigma). - Heritability as defined here is the proportion of the total variance (sigma) that is attributed to - the kinship matrix. - - For simplicity, we assume that everything being input is a numpy array. - If this is not the case, the module may throw an error as conversion from list to numpy array - is not done consistently. + """This is a simple version of EMMA/fastLMM. + + The main purpose of this module is to take a phenotype vector (Y), + a set of covariates (X) and a kinship matrix (K) and to optimize + this model by finding the maximum-likelihood estimates for the + model parameters. There are three model parameters: heritability + (h), covariate coefficients (beta) and the total phenotypic + variance (sigma). Heritability as defined here is the proportion + of the total variance (sigma) that is attributed to the kinship + matrix. + + For simplicity, we assume that everything being input is a numpy + array. If this is not the case, the module may throw an error as + conversion from list to numpy array is not done consistently. """ def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False): - """ - The constructor takes a phenotype vector or array Y of size n. - It takes a kinship matrix K of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached. - If they are not provided, the constructor will calculate them. - X0 is an optional covariate matrix of size n x q, where there are q covariates. - When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. + """The constructor takes a phenotype vector or array Y of size n. It + takes a kinship matrix K of size n x n. Kva and Kve can be + computed as Kva,Kve = linalg.eigh(K) and cached. If they are + not provided, the constructor will calculate them. X0 is an + optional covariate matrix of size n x q, where there are q + covariates. When this parameter is not provided, the + constructor will set X0 to an n x 1 matrix of all ones to + represent a mean effect. """ - if X0 == None: + if X0 is None: X0 = np.ones(len(Y)).reshape(len(Y),1) self.verbose = verbose @@ -250,7 +273,7 @@ class LMM2: REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True. """ - if X == None: X = self.X0t + if X is None: X = self.X0t elif stack: self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] X = self.X0t_stack @@ -306,7 +329,7 @@ class LMM2: Given this optimum, the function computes the LL and associated ML solutions. 
""" - if X == None: X = self.X0t + if X is None: X = self.X0t else: #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)]) self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] @@ -330,7 +353,7 @@ class LMM2: def association(self,X,h=None,stack=True,REML=True,returnBeta=False): """ Calculates association statitics for the SNPs encoded in the vector X of size n. - If h == None, the optimal h stored in optH is used. + If h is None, the optimal h stored in optH is used. """ if False: @@ -348,7 +371,7 @@ class LMM2: self.X0t_stack[:,(self.q)] = m X = self.X0t_stack - if h == None: h = self.optH + if h is None: h = self.optH L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML) q = len(beta) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py index 682ba371..7b652515 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -19,22 +19,47 @@ import sys import numpy as np -def remove_missing(y,g,verbose=False): +# ---- A trick to decide on the environment: +try: + from wqflask.my_pylmm.pyLMM import chunks + from gn2 import uses, progress_set_func +except ImportError: + has_gn2=False + import standalone as handlers + from standalone import uses, progress_set_func + +progress,debug,info,mprint = uses('progress','debug','info','mprint') + +def remove_missing(n,y,g): """ Remove missing data from matrices, make sure the genotype data has individuals as rows """ - assert(y!=None) - assert(y.shape[0] == g.shape[0]) + assert(y is not None) + assert y.shape[0] == g.shape[0],"y (n) %d, g (n,m) %s" % (y.shape[0],g.shape) y1 = y g1 = g v = np.isnan(y) keep = True - v if v.sum(): - if verbose: - sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) + info("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) y1 = y[keep] g1 = g[keep,:] - return y1,g1,keep + n = y1.shape[0] + return n,y1,g1,keep + +def remove_missing_new(n,y): + """ + Remove missing data. 
Returns new n,y,keep + """ + assert(y is not None) + y1 = y + v = np.isnan(y) + keep = True - v + if v.sum(): + info("runlmm.py: Cleaning the phenotype vector by removing %d individuals" % (v.sum())) + y1 = y[keep] + n = y1.shape[0] + return n,y1,keep diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 324c4f2c..6b241cd6 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -21,10 +21,21 @@ from optparse import OptionParser import sys import tsvreader import numpy as np -from lmm import gn2_load_redis, calculate_kinship_old + +# Add local dir to PYTHONPATH +import os +cwd = os.path.dirname(__file__) +if sys.path[0] != cwd: + sys.path.insert(1,cwd) + +# pylmm modules +from lmm import gn2_load_redis, gn2_load_redis_iter, calculate_kinship_new, run_gwas from kinship import kinship, kinship_full import genotype import phenotype +from standalone import uses + +progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal') usage = """ python runlmm.py [options] command @@ -98,44 +109,64 @@ if options.pheno: y = tsvreader.pheno(options.pheno) print y.shape -if options.geno: +if options.geno and cmd != 'iterator': g = tsvreader.geno(options.geno) print g.shape -if cmd == 'redis_new': - # The main difference between redis_new and redis is that missing - # phenotypes are handled by the first - Y = y - G = g - print "Original G",G.shape, "\n", G - - gt = G.T - G = None - ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True) +def check_results(ps,ts): print np.array(ps) print len(ps),sum(ps) - # Test results p1 = round(ps[0],4) p2 = round(ps[-1],4) - sys.stderr.write(options.geno+"\n") if options.geno == 'data/small.geno': - assert p1==0.0708, "p1=%f" % p1 - assert p2==0.1417, "p2=%f" % p2 + info("Validating results for "+options.geno) + assert p1==0.7387, "p1=%f" % p1 + assert p2==0.7387, "p2=%f" % p2 if options.geno == 'data/small_na.geno': - assert p1==0.0897, "p1=%f" % p1 - assert p2==0.0405, "p2=%f" % p2 + info("Validating results for "+options.geno) + assert p1==0.062, "p1=%f" % p1 + assert p2==0.062, "p2=%f" % p2 if options.geno == 'data/test8000.geno': - # assert p1==0.8984, "p1=%f" % p1 - # assert p2==0.9621, "p2=%f" % p2 + info("Validating results for "+options.geno) assert round(sum(ps)) == 4070 assert len(ps) == 8000 + info("Run completed") + +if y is not None: + n = y.shape[0] + +if cmd == 'run': + if options.remove_missing_phenotypes: + raise Exception('Can not use --remove-missing-phenotypes with LMM2') + n = len(y) + m = g.shape[1] + ps, ts = run_gwas('other',n,m,k,y,g) # <--- pass in geno by SNP + check_results(ps,ts) +elif cmd == 'iterator': + if options.remove_missing_phenotypes: + raise Exception('Can not use --remove-missing-phenotypes with LMM2') + geno_iterator = tsvreader.geno_iter(options.geno) + ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,geno_iterator) + check_results(ps,ts) +elif cmd == 'redis_new': + # The main difference between redis_new and redis is that missing + # phenotypes are handled by the first + if options.remove_missing_phenotypes: + raise Exception('Can not use --remove-missing-phenotypes with LMM2') + Y = y + G = g + print "Original G",G.shape, "\n", G + # gt = G.T + # G = None + ps, ts = gn2_load_redis('testrun','other',k,Y,G,new_code=True) + check_results(ps,ts) elif cmd == 'redis': # Emulating the redis setup of GN2 G = g print "Original G",G.shape, "\n", G - if y != None and options.remove_missing_phenotypes: + if 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 324c4f2c..6b241cd6 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -21,10 +21,21 @@ from optparse import OptionParser
 import sys
 import tsvreader
 import numpy as np
-from lmm import gn2_load_redis, calculate_kinship_old
+
+# Add local dir to PYTHONPATH
+import os
+cwd = os.path.dirname(__file__)
+if sys.path[0] != cwd:
+    sys.path.insert(1,cwd)
+
+# pylmm modules
+from lmm import gn2_load_redis, gn2_load_redis_iter, calculate_kinship_new, run_gwas
 from kinship import kinship, kinship_full
 import genotype
 import phenotype
+from standalone import uses
+
+progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal')
 
 usage = """
 python runlmm.py [options] command
@@ -98,44 +109,64 @@ if options.pheno:
     y = tsvreader.pheno(options.pheno)
     print y.shape
 
-if options.geno:
+if options.geno and cmd != 'iterator':
     g = tsvreader.geno(options.geno)
     print g.shape
 
-if cmd == 'redis_new':
-    # The main difference between redis_new and redis is that missing
-    # phenotypes are handled by the first
-    Y = y
-    G = g
-    print "Original G",G.shape, "\n", G
-
-    gt = G.T
-    G = None
-    ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True)
+def check_results(ps,ts):
     print np.array(ps)
     print len(ps),sum(ps)
-
-    # Test results
     p1 = round(ps[0],4)
     p2 = round(ps[-1],4)
-    sys.stderr.write(options.geno+"\n")
     if options.geno == 'data/small.geno':
-        assert p1==0.0708, "p1=%f" % p1
-        assert p2==0.1417, "p2=%f" % p2
+        info("Validating results for "+options.geno)
+        assert p1==0.7387, "p1=%f" % p1
+        assert p2==0.7387, "p2=%f" % p2
     if options.geno == 'data/small_na.geno':
-        assert p1==0.0897, "p1=%f" % p1
-        assert p2==0.0405, "p2=%f" % p2
+        info("Validating results for "+options.geno)
+        assert p1==0.062, "p1=%f" % p1
+        assert p2==0.062, "p2=%f" % p2
     if options.geno == 'data/test8000.geno':
-        # assert p1==0.8984, "p1=%f" % p1
-        # assert p2==0.9621, "p2=%f" % p2
+        info("Validating results for "+options.geno)
         assert round(sum(ps)) == 4070
         assert len(ps) == 8000
+    info("Run completed")
+
+if y is not None:
+    n = y.shape[0]
+
+if cmd == 'run':
+    if options.remove_missing_phenotypes:
+        raise Exception('Can not use --remove-missing-phenotypes with LMM2')
+    n = len(y)
+    m = g.shape[1]
+    ps, ts = run_gwas('other',n,m,k,y,g) # <--- pass in geno by SNP
+    check_results(ps,ts)
+elif cmd == 'iterator':
+    if options.remove_missing_phenotypes:
+        raise Exception('Can not use --remove-missing-phenotypes with LMM2')
+    geno_iterator = tsvreader.geno_iter(options.geno)
+    ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,geno_iterator)
+    check_results(ps,ts)
+elif cmd == 'redis_new':
+    # The main difference between redis_new and redis is that missing
+    # phenotypes are handled by the first
+    if options.remove_missing_phenotypes:
+        raise Exception('Can not use --remove-missing-phenotypes with LMM2')
+    Y = y
+    G = g
+    print "Original G",G.shape, "\n", G
+    # gt = G.T
+    # G = None
+    ps, ts = gn2_load_redis('testrun','other',k,Y,G,new_code=True)
+    check_results(ps,ts)
 elif cmd == 'redis':
     # Emulating the redis setup of GN2
     G = g
     print "Original G",G.shape, "\n", G
-    if y != None and options.remove_missing_phenotypes:
+    if y is not None and options.remove_missing_phenotypes:
         gnt = np.array(g).T
-        Y,g,keep = phenotype.remove_missing(y,g.T,options.verbose)
+        n,Y,g,keep = phenotype.remove_missing(n,y,gnt)
         G = g.T
         print "Removed missing phenotypes",G.shape, "\n", G
     else:
@@ -148,30 +179,16 @@ elif cmd == 'redis':
         g = None
         gnt = None
 
-    gt = G.T
-    G = None
-    ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False)
-    print np.array(ps)
-    print len(ps),sum(ps)
-
-    # Test results 4070.02346579
-    p1 = round(ps[0],4)
-    p2 = round(ps[-1],4)
-    sys.stderr.write(options.geno+"\n")
-    if options.geno == 'data/small.geno':
-        assert p1==0.0708, "p1=%f" % p1
-        assert p2==0.1417, "p2=%f" % p2
-    if options.geno == 'data/small_na.geno':
-        assert p1==0.0897, "p1=%f" % p1
-        assert p2==0.0405, "p2=%f" % p2
-    if options.geno == 'data/test8000.geno':
-        assert round(sum(ps)) == 4070
-        assert len(ps) == 8000
+    # gt = G.T
+    # G = None
+    ps, ts = gn2_load_redis('testrun','other',k,Y,G, new_code=False)
+    check_results(ps,ts)
 elif cmd == 'kinship':
     G = g
     print "Original G",G.shape, "\n", G
     if y != None and options.remove_missing_phenotypes:
         gnt = np.array(g).T
-        Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+        n,Y,g,keep = phenotype.remove_missing(n,y,g.T)
         G = g.T
         print "Removed missing phenotypes",G.shape, "\n", G
     if options.maf_normalization:
@@ -187,20 +204,20 @@
     print "Genotype",G.shape, "\n", G
     print "first Kinship method",K.shape,"\n",K
     k1 = round(K[0][0],4)
-    K2,G = calculate_kinship_old(np.copy(G).T,temp_data=None)
+    K2,G = calculate_kinship_new(np.copy(G))
     print "Genotype",G.shape, "\n", G
     print "GN2 Kinship method",K2.shape,"\n",K2
     k2 = round(K2[0][0],4)
 
     print "Genotype",G.shape, "\n", G
-    K3 = kinship(G.T)
+    K3 = kinship(G)
     print "third Kinship method",K3.shape,"\n",K3
     sys.stderr.write(options.geno+"\n")
     k3 = round(K3[0][0],4)
     if options.geno == 'data/small.geno':
-        assert k1==0.8, "k1=%f" % k1
-        assert k2==0.7939, "k2=%f" % k2
-        assert k3==0.7939, "k3=%f" % k3
+        assert k1==0.8333, "k1=%f" % k1
+        assert k2==0.9375, "k2=%f" % k2
+        assert k3==0.9375, "k3=%f" % k3
     if options.geno == 'data/small_na.geno':
         assert k1==0.8333, "k1=%f" % k1
         assert k2==0.7172, "k2=%f" % k2
@@ -209,4 +226,4 @@
         assert k3==1.4352, "k3=%f" % k3
 
 else:
-    print "Doing nothing"
+    fatal("Doing nothing")
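The reworked command dispatch above can also be driven directly from Python. Below is a small sketch of the 'run' path. It mirrors the calls shown in the patch (tsvreader.pheno/geno, run_gwas('other', n, m, k, y, g) with genotypes passed per SNP), but it computes the kinship matrix in-process with the same kinship() call the 'kinship' command uses, skipping the MAF/genotype normalization steps that command can apply, and the phenotype file name is a placeholder.

    from __future__ import print_function
    import numpy as np
    import tsvreader
    from kinship import kinship
    from lmm import run_gwas

    # data/small.geno is one of the test files referenced above;
    # the phenotype file name is an assumption
    y = tsvreader.pheno("data/small.pheno")
    g = tsvreader.geno("data/small.geno")      # SNP-major matrix, as tsvreader returns it

    k = kinship(np.array(g))                   # same call the 'kinship' command makes (K3)
    n = len(y)
    m = g.shape[1]
    ps, ts = run_gwas('other', n, m, k, y, g)  # <--- pass in geno by SNP, as in the patch
    print(len(ps), sum(ps))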
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/standalone.py b/wqflask/wqflask/my_pylmm/pyLMM/standalone.py
new file mode 100644
index 00000000..40b2021d
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/standalone.py
@@ -0,0 +1,110 @@
+# Standalone specific methods and callback handler
+#
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# Set the log level with
+#
+#   logging.basicConfig(level=logging.DEBUG)
+
+from __future__ import absolute_import, print_function, division
+
+import numpy as np
+import sys
+import logging
+
+# logger = logging.getLogger(__name__)
+logger = logging.getLogger('lmm2')
+logging.basicConfig(level=logging.DEBUG)
+np.set_printoptions(precision=3,suppress=True)
+
+progress_location = None
+progress_current = None
+progress_prev_perc = None
+
+def progress_default_func(location,count,total):
+    global progress_current
+    value = round(count*100.0/total)
+    progress_current = value
+
+progress_func = progress_default_func
+
+def progress_set_func(func):
+    global progress_func
+    progress_func = func
+
+def progress(location, count, total):
+    global progress_location
+    global progress_prev_perc
+
+    perc = round(count*100.0/total)
+    if perc != progress_prev_perc and (location != progress_location or perc > 98 or perc > progress_prev_perc + 5):
+        progress_func(location, count, total)
+        logger.info("Progress: %s %d%%" % (location,perc))
+        progress_location = location
+        progress_prev_perc = perc
+
+def mprint(msg,data):
+    """
+    Array/matrix print function
+    """
+    m = np.array(data)
+    if m.ndim == 1:
+        print(msg,m.shape,"=\n",m[0:3]," ... ",m[-3:])
+    if m.ndim == 2:
+        print(msg,m.shape,"=\n[",
+              m[0][0:3]," ... ",m[0][-3:],"\n  ",
+              m[1][0:3]," ... ",m[1][-3:],"\n  ...\n  ",
+              m[-2][0:3]," ... ",m[-2][-3:],"\n  ",
+              m[-1][0:3]," ... ",m[-1][-3:],"]")
+
+def fatal(msg):
+    logger.critical(msg)
+    raise Exception(msg)
+
+def callbacks():
+    return dict(
+        write = sys.stdout.write,
+        writeln = print,
+        debug = logger.debug,
+        info = logger.info,
+        warning = logger.warning,
+        error = logger.error,
+        critical = logger.critical,
+        fatal = fatal,
+        progress = progress,
+        mprint = mprint
+    )
+
+def uses(*funcs):
+    """
+    Some sugar
+    """
+    return [callbacks()[func] for func in funcs]
+
+# ----- Minor test cases:
+
+if __name__ == '__main__':
+    # logging.basicConfig(level=logging.DEBUG)
+    logging.debug("Test %i" % (1))
+    d = callbacks()['debug']
+    d("TEST")
+    wrln = callbacks()['writeln']
+    wrln("Hello %i" % 34)
+    progress = callbacks()['progress']
+    progress("I am half way",50,100)
+    list = [0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15,
+            0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15,
+            0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15,
+            0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15,
+            0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15]
+    mprint("list",list)
+    matrix = [[1,0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15],
+              [2,0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15],
+              [3,0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15],
+              [4,0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15],
+              [5,0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15],
+              [6,0.5,0.6096595 , -0.31559815, -0.52793285,  1.16573418e-15]]
+    mprint("matrix",matrix)
+    ix,dx = uses("info","debug")
+    ix("ix")
+    dx("dx")
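standalone.py above provides the callback handler behind the uses() helper that runlmm.py imports. A short sketch of how a caller might pull in the callbacks and reroute progress reporting; the custom handler is illustrative only and mirrors the module's own test block.

    from __future__ import print_function
    from standalone import uses, progress_set_func

    progress, info, mprint = uses('progress', 'info', 'mprint')

    # Replace the default progress handler; the callback receives (location, count, total)
    def my_progress(location, count, total):
        print("%s: %d/%d done" % (location, count, total))

    progress_set_func(my_progress)

    info("kinship calculation starting")     # routed through the 'lmm2' logger
    progress("kinship", 50, 100)             # throttled: only big enough steps are reported
    mprint("K", [[1.0, 0.5], [0.5, 1.0]])    # compact matrix printer defined above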
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/temp_data.py b/wqflask/wqflask/my_pylmm/pyLMM/temp_data.py
new file mode 100644
index 00000000..004d45c6
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/temp_data.py
@@ -0,0 +1,25 @@
+from __future__ import print_function, division, absolute_import
+from redis import Redis
+
+import simplejson as json
+
+class TempData(object):
+
+    def __init__(self, temp_uuid):
+        self.temp_uuid = temp_uuid
+        self.redis = Redis()
+        self.key = "tempdata:{}".format(self.temp_uuid)
+
+    def store(self, field, value):
+        self.redis.hset(self.key, field, value)
+        self.redis.expire(self.key, 60*15)   # Expire in 15 minutes
+
+    def get_all(self):
+        return self.redis.hgetall(self.key)
+
+
+if __name__ == "__main__":
+    redis = Redis()
+    for key in redis.keys():
+        for field in redis.hkeys(key):
+            print("{}.{}={}".format(key, field, redis.hget(key, field)))
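TempData above stores per-run key/value state in Redis under a tempdata:<uuid> hash. A minimal usage sketch, assuming a Redis server is reachable on the default localhost port; the field name is an arbitrary example.

    from __future__ import print_function
    import uuid
    from temp_data import TempData

    temp = TempData(uuid.uuid4())          # state lives under tempdata:<uuid>
    temp.store("percent_complete", 42)     # hash field/value; resets the 15 minute expiry
    print(temp.get_all())                  # e.g. {'percent_complete': '42'}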
$("#plink_compute").click((function(_this) { $("#gemma_compute").click((function(_this) { return function() { var form_data, url; + console.log("RUNNING GEMMA"); $("#static_progress_bar_container").modal(); url = "/marker_regression"; $('input[name=method]').val("gemma"); - $('input[name=mapping_display_all]').val($('input[name=display_all_gemma]').val()); - $('input[name=suggestive]').val($('input[name=suggestive_gemma]').val()); $('input[name=maf]').val($('input[name=maf_gemma]').val()); form_data = $('#trait_data_form').serialize(); console.log("form_data is:", form_data); diff --git a/wqflask/wqflask/templates/pair_scan_results.html b/wqflask/wqflask/templates/pair_scan_results.html index 869dabed..f46d7cbf 100644 --- a/wqflask/wqflask/templates/pair_scan_results.html +++ b/wqflask/wqflask/templates/pair_scan_results.html @@ -33,6 +33,28 @@
diff --git a/wqflask/wqflask/static/new/javascript/show_trait_mapping_tools.coffee b/wqflask/wqflask/static/new/javascript/show_trait_mapping_tools.coffee
index a87d3537..881ea74d 100755
--- a/wqflask/wqflask/static/new/javascript/show_trait_mapping_tools.coffee
+++ b/wqflask/wqflask/static/new/javascript/show_trait_mapping_tools.coffee
@@ -79,8 +79,8 @@ open_mapping_results = (data) ->
     $.colorbox(
         html: data
         href: "#mapping_results_holder"
-        height: "80%"
-        width: "80%"
+        height: "90%"
+        width: "90%"
     )
 
 showalert = (message,alerttype) ->
@@ -154,8 +154,8 @@ $("#plink_compute").click(() =>
     $("#static_progress_bar_container").modal()
     url = "/marker_regression"
     $('input[name=method]').val("plink")
-    $('input[name=mapping_display_all]').val($('input[name=display_all_plink]').val())
-    $('input[name=suggestive]').val($('input[name=suggestive_plink]').val())
+    #$('input[name=mapping_display_all]').val($('input[name=display_all_plink]').val())
+    #$('input[name=suggestive]').val($('input[name=suggestive_plink]').val())
     $('input[name=maf]').val($('input[name=maf_plink]').val())
     form_data = $('#trait_data_form').serialize()
     console.log("form_data is:", form_data)
@@ -164,11 +164,12 @@ $("#plink_compute").click(() =>
 $("#gemma_compute").click(() =>
+    console.log("RUNNING GEMMA")
     $("#static_progress_bar_container").modal()
     url = "/marker_regression"
     $('input[name=method]').val("gemma")
-    $('input[name=mapping_display_all]').val($('input[name=display_all_gemma]').val())
-    $('input[name=suggestive]').val($('input[name=suggestive_gemma]').val())
+    #$('input[name=mapping_display_all]').val($('input[name=display_all_gemma]').val())
+    #$('input[name=suggestive]').val($('input[name=suggestive_gemma]').val())
     $('input[name=maf]').val($('input[name=maf_gemma]').val())
     form_data = $('#trait_data_form').serialize()
     console.log("form_data is:", form_data)
diff --git a/wqflask/wqflask/static/new/javascript/show_trait_mapping_tools.js b/wqflask/wqflask/static/new/javascript/show_trait_mapping_tools.js
index c8988cdc..1779df4b 100755
--- a/wqflask/wqflask/static/new/javascript/show_trait_mapping_tools.js
+++ b/wqflask/wqflask/static/new/javascript/show_trait_mapping_tools.js
@@ -99,8 +99,8 @@ open_mapping_results = function(data) {
     return $.colorbox({
       html: data,
       href: "#mapping_results_holder",
-      height: "80%",
-      width: "80%"
+      height: "90%",
+      width: "90%"
     });
   };
 
@@ -187,11 +187,10 @@ $("#plink_compute").click((function(_this) {
   $("#gemma_compute").click((function(_this) {
     return function() {
       var form_data, url;
+      console.log("RUNNING GEMMA");
       $("#static_progress_bar_container").modal();
       url = "/marker_regression";
       $('input[name=method]').val("gemma");
-      $('input[name=mapping_display_all]').val($('input[name=display_all_gemma]').val());
-      $('input[name=suggestive]').val($('input[name=suggestive_gemma]').val());
       $('input[name=maf]').val($('input[name=maf_gemma]').val());
       form_data = $('#trait_data_form').serialize();
       console.log("form_data is:", form_data);
diff --git a/wqflask/wqflask/templates/pair_scan_results.html b/wqflask/wqflask/templates/pair_scan_results.html
index 869dabed..f46d7cbf 100644
--- a/wqflask/wqflask/templates/pair_scan_results.html
+++ b/wqflask/wqflask/templates/pair_scan_results.html
@@ -33,6 +33,28 @@
+          <tr>
+            <th>Index</th>
+            <th>Locus</th>
+            <th>Chr 1</th>
+            <th>Mb</th>
+            <th>Chr 2</th>
+          </tr>
+          <tr>
+            <td>{{loop.index}}</td>
+            <td>{{marker.name}}</td>
+            <td>{{marker.chr1}}</td>
+            <td>{{marker.Mb}}</td>
+            <td>{{marker.chr2}}</td>
+          </tr>
+