From d85ca24cfa021785504ad1d84b712880c8dfbcd1 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 20 Feb 2015 14:25:15 +0300 Subject: Split out behaviour for calling lmm from GN2 or (testing) CLI --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 369fcbcc..8de3b3a7 100755 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -45,11 +45,16 @@ import sys sys.path.append("/home/zas1024/gene/wqflask/") print("sys.path2:", sys.path) -from utility.benchmark import Bench -from utility import temp_data - -from wqflask.my_pylmm.pyLMM import chunks +has_gn2=True +try: + from utility.benchmark import Bench + from utility import temp_data + from wqflask.my_pylmm.pyLMM import chunks +except ImportError: + print("WARNING: Standalone version\n") + has_gn2=False + pass #np.seterr('raise') @@ -288,6 +293,7 @@ def run_other(pheno_vector, def matrixMult(A,B): # If there is no fblas then we will revert to np.dot() + try: linalg.fblas except AttributeError: @@ -708,7 +714,7 @@ class LMM: pl.ylabel("Probability of data") pl.title(title) -def main(): +def gn2_main(): parser = argparse.ArgumentParser(description='Run pyLMM') parser.add_argument('-k', '--key') parser.add_argument('-s', '--species') @@ -750,7 +756,10 @@ def main(): Redis.expire(results_key, 60*60) if __name__ == '__main__': - main() + if has_gn2: + gn2_main() + else: + cli_main() -- cgit v1.2.3 From 2a2021e491b7c4aceebec95e4d184c98187d7201 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 20 Feb 2015 14:27:55 +0300 Subject: Remove empty dir --- wqflask/wqflask/pylmm | 1 - 1 file changed, 1 deletion(-) delete mode 160000 wqflask/wqflask/pylmm (limited to 'wqflask') diff --git a/wqflask/wqflask/pylmm b/wqflask/wqflask/pylmm deleted file mode 160000 index cede848b..00000000 --- a/wqflask/wqflask/pylmm +++ /dev/null @@ -1 +0,0 @@ -Subproject commit cede848b7ce648366c1bdd7bc5df43c633eeb0d7 -- cgit v1.2.3 From 61b8ae229687612b4c9d4a0b089a9ef0ef47ff2c Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 20 Feb 2015 14:40:55 +0300 Subject: Moved pylmm scripts that are not used by GN2 --- wqflask/wqflask/my_pylmm/example.py | 58 ---- wqflask/wqflask/my_pylmm/pylmmGWAS.py | 346 ----------------------- wqflask/wqflask/my_pylmm/pylmmKinship.py | 186 ------------ wqflask/wqflask/my_pylmm/run_pylmm.py | 77 ----- wqflask/wqflask/my_pylmm/scripts/example.py | 58 ++++ wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py | 346 +++++++++++++++++++++++ wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py | 186 ++++++++++++ wqflask/wqflask/my_pylmm/scripts/run_pylmm.py | 77 +++++ 8 files changed, 667 insertions(+), 667 deletions(-) delete mode 100755 wqflask/wqflask/my_pylmm/example.py delete mode 100755 wqflask/wqflask/my_pylmm/pylmmGWAS.py delete mode 100755 wqflask/wqflask/my_pylmm/pylmmKinship.py delete mode 100755 wqflask/wqflask/my_pylmm/run_pylmm.py create mode 100755 wqflask/wqflask/my_pylmm/scripts/example.py create mode 100755 wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py create mode 100755 wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py create mode 100755 wqflask/wqflask/my_pylmm/scripts/run_pylmm.py (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/example.py b/wqflask/wqflask/my_pylmm/example.py deleted file mode 100755 index 8b30debd..00000000 --- a/wqflask/wqflask/my_pylmm/example.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/python - -from __future__ import absolute_import, print_function, division - -import sys -import time - -import numpy as np -from pyLMM import lmm - -from pprint import pformat as pf - - -Y = np.genfromtxt('data/mdp.exprs.1.new') -print("exprs is:", pf(Y.shape)) - -# Loading npdump and first 1000 snps for speed -#K = np.load('data/hmdp.liver.K.npdump') -#snps = np.load('data/hmdp.liver.snps.1000.npdump').T - -# These three lines will load all SNPs (from npdump or from txt) and -# calculate the kinship -snps = np.genfromtxt('/home/zas1024/gene/web/new_genotypers/mdp.snps.1000.new').T -print("snps is:", pf(snps.shape)) -#snps = snps[~np.isnan(snps).all(axis=1)] -#print ("snps is now:", pf(snps)) -np.savetxt("/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.snps.trimmed", snps, fmt='%s', delimiter=' ') -#snps = np.load('data/hmdp.liver.snps.npdump').T -K = lmm.calculateKinship(snps) -#print("K is:", pf(K)) -#print("Y is:", pf(Y.shape)) - -# Instantiate a LMM object for the phentoype Y and fit the null model -L = lmm.LMM(Y,K) -L.fit() - -# Manually calculate the association at one SNP -X = snps[:,0] -X[np.isnan(X)] = X[True - np.isnan(X)].mean() # Fill missing with MAF -X = X.reshape(len(X),1) -if X.var() == 0: ts,ps = (np.nan,np.nan) -else: ts,ps = L.association(X) - -# If I want to refit the variance component -L.fit(X=X) -ts,ps = L.association(X) - -# If I want to do a genome-wide scan over the 1000 SNPs. -# This call will use REML (REML = False means use ML). -# It will also refit the variance components for each SNP. -# Setting refit = False will cause the program to fit the model once -# and hold those variance component estimates for each SNP. -begin = time.time() -TS,PS = lmm.GWAS(Y,snps,K,REML=True,refit=False) -print("TS is:", pf(TS)) -print("PS is:", pf(PS)) -end = time.time() -sys.stderr.write("Total time for 1000 SNPs: %0.3f\n" % (end- begin)) \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/pylmmGWAS.py deleted file mode 100755 index 74b58dd6..00000000 --- a/wqflask/wqflask/my_pylmm/pylmmGWAS.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/python - -# pylmm is a python-based linear mixed-model solver with applications to GWAS - -# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) - -#The program is free for academic use. Please contact Nick Furlotte -# if you are interested in using the software for -#commercial purposes. - -#The software must not be modified and distributed without prior -#permission of the author. - -#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pdb -import time -import sys - -def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") - -def formatResult(id,beta,betaSD,ts,ps): - return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n" - -def outputResult(id,beta,betaSD,ts,ps): - out.write(formatResult(id,beta,betaSD,ts,ps)) - -from optparse import OptionParser,OptionGroup -usage = """usage: %prog [options] --kfile kinshipFile --[tfile | bfile] plinkFileBase outfileBase - -This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value. -The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values. - -Basic usage: - - python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile - - """ -parser = OptionParser(usage=usage) - -basicGroup = OptionGroup(parser, "Basic Options") -advancedGroup = OptionGroup(parser, "Advanced Options") -experimentalGroup = OptionGroup(parser, "Experimental Options") - -#basicGroup.add_option("--pfile", dest="pfile", -# help="The base for a PLINK ped file") -basicGroup.add_option("--tfile", dest="tfile", - help="The base for a PLINK tped file") -basicGroup.add_option("--bfile", dest="bfile", - help="The base for a PLINK binary bed file") -basicGroup.add_option("--phenofile", dest="phenoFile", default=None, - help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. This file should be in plink format. ") - -# EMMA Options -basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, - help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the columns and snps on the rows.") -basicGroup.add_option("--emmaPHENO", dest="emmaPheno", default=None, - help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each phenotype as one row.") -basicGroup.add_option("--emmaCOV", dest="emmaCov", default=None, - help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each covariate as one row.") - -basicGroup.add_option("--kfile", dest="kfile", - help="The location of a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") -basicGroup.add_option("--covfile", dest="covfile", - help="The location of a covariate file file. This is a plink formatted covariate file.") -basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0) - - - -advancedGroup.add_option("--removeMissingGenotypes", - action="store_false", dest="normalizeGenotype", default=True, - help="By default the program replaces missing genotypes with the minor allele frequency. This option overrides that behavior making the program remove missing individuals. NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.") -advancedGroup.add_option("--refit", - action="store_true", dest="refit", default=False, - help="Refit the variance components at each SNP (default is to lock in the variance components under the null).") - -advancedGroup.add_option("--REML", - action="store_true", dest="REML", default=False, - help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).") -#advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") -advancedGroup.add_option("--eigen", dest="eigenfile", - help="The location of the precomputed eigendecomposition for the kinship file. These can be computed with pylmmKinship.py.") -advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true", - help="This option only applies when --cofile is used. When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.") - -basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") - -advancedGroup.add_option("-v", "--verbose", - action="store_true", dest="verbose", default=False, - help="Print extra info") - -# Experimental Group Options -experimentalGroup.add_option("--kfile2", dest="kfile2", - help="The location of a second kinship file. This file has the same format as the first kinship. This might be used if you want to correct for another form of confounding.") - -parser.add_option_group(basicGroup) -parser.add_option_group(advancedGroup) -parser.add_option_group(experimentalGroup) - -(options, args) = parser.parse_args() - -import sys -import os -import numpy as np -from scipy import linalg -from pylmm.lmm import LMM -from pylmm import input - -import multiprocessing as mp # Multiprocessing is part of the Python stdlib -import Queue - -if len(args) != 1: - parser.print_help() - sys.exit() - -outFile = args[0] - -if not options.tfile and not options.bfile and not options.emmaFile: -#if not options.pfile and not options.tfile and not options.bfile: - parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an EMMA formatted file (--emmaSNP).") -if not options.kfile: - parser.error("Please provide a pre-computed kinship file") - -numThreads = None -if options.numThreads: - numThreads = int(options.numThreads) - -# READING PLINK input -if options.verbose: sys.stderr.write("Reading SNP input...\n") -if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -#elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -elif options.emmaFile: IN = input.plink(options.emmaFile,type='emma', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -else: parser.error("You must provide at least one PLINK input file base") - -if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos') and not os.path.isfile(options.emmaPheno): - parser.error("No .pheno file exist for %s. Please provide a phenotype file using the --phenofile or --emmaPHENO argument." % (options.phenoFile or IN.fbase + '.phenos')) - -# Read the emma phenotype file if provided. -# Format should be rows are phenotypes and columns are individuals. -if options.emmaPheno: - f = open(options.emmaPheno,'r') - P = [] - for line in f: - v = line.strip().split() - p = [] - for x in v: - try: - p.append(float(x)) - except: p.append(np.nan) - P.append(p) - f.close() - IN.phenos = np.array(P).T - -# READING Covariate File -if options.covfile: - if options.verbose: - sys.stderr.write("Reading covariate file...\n") - P = IN.getCovariates(options.covfile) - if options.noMean: - X0 = P - else: - X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) -elif options.emmaCov: - if options.verbose: - sys.stderr.write("Reading covariate file...\n") - P = IN.getCovariatesEMMA(options.emmaCov) - if options.noMean: - X0 = P - else: - X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) -else: - X0 = np.ones((IN.phenos.shape[0],1)) - -if np.isnan(X0).sum(): - parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.") - -# READING Kinship - major bottleneck for large datasets -if options.verbose: sys.stderr.write("Reading kinship...\n") -begin = time.time() -# This method seems to be the fastest and works if you already know the size of the matrix -if options.kfile[-3:] == '.gz': - import gzip - f = gzip.open(options.kfile,'r') - F = f.read() # might exhaust mem if the file is huge - K = np.fromstring(F,sep=' ') # Assume that space separated - f.close() -else: - K = np.fromfile(open(options.kfile,'r'),sep=" ") -K.resize((len(IN.indivs),len(IN.indivs))) -end = time.time() -# Other slower ways -#K = np.loadtxt(options.kfile) -#K = np.genfromtxt(options.kfile) -if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin)) - -if options.kfile2: - if options.verbose: sys.stderr.write("Reading second kinship...\n") - begin = time.time() - # This method seems to be the fastest and works if you already know the size of the matrix - if options.kfile2[-3:] == '.gz': - import gzip - f = gzip.open(options.kfile2,'r') - F = f.read() # might exhaust mem if the file is huge - K2 = np.fromstring(F,sep=' ') # Assume that space separated - f.close() - else: K2 = np.fromfile(open(options.kfile2,'r'),sep=" ") - K2.resize((len(IN.indivs),len(IN.indivs))) - end = time.time() - if options.verbose: sys.stderr.write("Read the %d x %d second kinship matrix in %0.3fs \n" % (K2.shape[0],K2.shape[1],end-begin)) - -# PROCESS the phenotype data -- Remove missing phenotype values -# Keep will now index into the "full" data to select what we keep (either everything or a subset of non missing data -Y = IN.phenos[:,options.pheno] -v = np.isnan(Y) -keep = True - v -if v.sum(): - if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) - Y = Y[keep] - X0 = X0[keep,:] - K = K[keep,:][:,keep] - if options.kfile2: K2 = K2[keep,:][:,keep] - Kva = [] - Kve = [] - -# Only load the decomposition if we did not remove individuals. -# Otherwise it would not be correct and we would have to compute it again. -if not v.sum() and options.eigenfile: - if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n") - Kva = np.load(options.eigenfile + ".Kva") - Kve = np.load(options.eigenfile + ".Kve") -else: - Kva = [] - Kve = [] - -# CREATE LMM object for association -n = K.shape[0] -if not options.kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose) -else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=options.verbose,K2=K2) - -# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here -if not options.refit: - if options.verbose: sys.stderr.write("Computing fit for null model\n") - L.fit() - if options.verbose and not options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma)) - if options.verbose and options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f, w=%0.3f\n" % (L.optH,L.optSigma,L.optW)) - -def compute_snp(collect): - snp = collect[0] - id = collect[1] - # result = [] - # Check SNPs for missing values - x = snp[keep].reshape((n,1)) # all the SNPs - v = np.isnan(x).reshape((-1,)) - if v.sum(): - keeps = True - v - xs = x[keeps,:] - if keeps.sum() <= 1 or xs.var() <= 1e-6: - # PS.append(np.nan) - # TS.append(np.nan) - # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) - # continue - return formatResult(id,np.nan,np.nan,np.nan,np.nan) - - # Its ok to center the genotype - I used options.normalizeGenotype to - # force the removal of missing genotypes as opposed to replacing them with MAF. - if not options.normalizeGenotype: - xs = (xs - xs.mean()) / np.sqrt(xs.var()) - Ys = Y[keeps] - X0s = X0[keeps,:] - Ks = K[keeps,:][:,keeps] - if options.kfile2: - K2s = K2[keeps,:][:,keeps] - Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=options.verbose,K2=K2s) - else: - Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose) - if options.refit: - Ls.fit(X=xs,REML=options.REML) - else: - #try: - Ls.fit(REML=options.REML) - #except: pdb.set_trace() - ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True) - else: - if x.var() == 0: - # PS.append(np.nan) - # TS.append(np.nan) - # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values - return formatResult(id,np.nan,np.nan,np.nan,np.nan) - # continue - - if options.refit: - L.fit(X=x,REML=options.REML) - # This is where it happens - ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True) - - return formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps) - # PS.append(ps) - # TS.append(ts) - # return len(result) - # compute.q.put(result) - # return None - -def f_init(q): - compute_snp.q = q - -# Set up the pool -# mp.set_start_method('spawn') -q = mp.Queue() -p = mp.Pool(numThreads, f_init, [q]) -collect = [] - -# Buffers for pvalues and t-stats -# PS = [] -# TS = [] -count = 0 -out = open(outFile,'w') -printOutHead() - -for snp_id in IN: - count += 1 - if count % 1000 == 0: - if options.verbose: - sys.stderr.write("At SNP %d\n" % count) - # if count>8000 : - # break # for testing only - if count % 100 == 0: - for line in p.imap(compute_snp,collect): - out.write(line) - collect = [] - - collect.append(snp_id) -for line in p.imap(compute_snp,collect): - out.write(line) - - diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/pylmmKinship.py deleted file mode 100755 index c9a73ece..00000000 --- a/wqflask/wqflask/my_pylmm/pylmmKinship.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/python - -# pylmm is a python-based linear mixed-model solver with applications to GWAS - -# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) - -#The program is free for academic use. Please contact Nick Furlotte -# if you are interested in using the software for -#commercial purposes. - -#The software must not be modified and distributed without prior -#permission of the author. - -#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys -import pdb - -from optparse import OptionParser,OptionGroup -usage = """usage: %prog [options] --[tfile | bfile] plinkFileBase outfile - -""" - -parser = OptionParser(usage=usage) - -basicGroup = OptionGroup(parser, "Basic Options") -#advancedGroup = OptionGroup(parser, "Advanced Options") - -#basicGroup.add_option("--pfile", dest="pfile", -# help="The base for a PLINK ped file") -basicGroup.add_option("--tfile", dest="tfile", - help="The base for a PLINK tped file") -basicGroup.add_option("--bfile", dest="bfile", - help="The base for a PLINK binary ped file") -basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, - help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the rows and snps on the columns.") -basicGroup.add_option("--emmaNumSNPs", dest="numSNPs", type="int", default=0, - help="When providing the emmaSNP file you need to specify how many snps are in the file") - -basicGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") -basicGroup.add_option("-n", default=1000,dest="computeSize", type="int", help="The maximum number of SNPs to read into memory at once (default 1000). This is important when there is a large number of SNPs, because memory could be an issue.") -basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") - -basicGroup.add_option("-v", "--verbose", - action="store_true", dest="verbose", default=False, - help="Print extra info") - -parser.add_option_group(basicGroup) -#parser.add_option_group(advancedGroup) - -(options, args) = parser.parse_args() -if len(args) != 1: - parser.print_help() - sys.exit() - -outFile = args[0] - -import sys -import os -import numpy as np -from scipy import linalg -from pylmm.lmm import calculateKinship -from pylmm import input -import multiprocessing as mp # Multiprocessing is part of the Python stdlib -import Queue - -if not options.tfile and not options.bfile and not options.emmaFile: - parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") - -numThreads = None -if options.numThreads: - numThreads = int(options.numThreads) - -if options.verbose: sys.stderr.write("Reading PLINK input...\n") -if options.bfile: IN = input.plink(options.bfile,type='b') -elif options.tfile: IN = input.plink(options.tfile,type='t') -#elif options.pfile: IN = input.plink(options.pfile,type='p') -elif options.emmaFile: - if not options.numSNPs: parser.error("You must provide the number of SNPs when specifying an emma formatted file.") - IN = input.plink(options.emmaFile,type='emma') -else: parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") - -def compute_W(job): - """ - Read 1000 SNPs at a time into matrix and return the result - """ - n = len(IN.indivs) - m = options.computeSize - W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) - for j in range(0,options.computeSize): - row = job*m + j - if row >= IN.numSNPs: - W = W[:,range(0,j)] - break - snp,id = IN.next() - if snp.var() == 0: - continue - W[:,j] = snp # set row to list of SNPs - return W - -def compute_dgemm(job,W): - """ - Compute Kinship(W)*j - - For every set of SNPs dgemm is used to multiply matrices T(W)*W - """ - res = None - try: - res = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) - except AttributeError: - res = np.dot(W,W.T) - compute_dgemm.q.put([job,res]) - return job - -def f_init(q): - compute_dgemm.q = q - -n = len(IN.indivs) -# m = options.computeSize -# jobsize=m - -IN.getSNPIterator() -# Annoying hack to get around the fact that it is expensive to determine the number of SNPs in an emma file -if options.emmaFile: IN.numSNPs = options.numSNPs -# i = 0 - -# mp.set_start_method('spawn') -q = mp.Queue() -p = mp.Pool(numThreads, f_init, [q]) -iterations = IN.numSNPs/options.computeSize+1 -# jobs = range(0,8) # range(0,iterations) - -results = [] - -K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals - -completed = 0 - -# for job in range(8): -for job in range(iterations): - if options.verbose: - sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) - W = compute_W(job) - results.append(p.apply_async(compute_dgemm, (job,W))) - # Do we have a result? - try: - j,x = q.get_nowait() - if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") - K_j = x - # print j,K_j[:,0] - K = K + K_j - completed += 1 - except Queue.Empty: - pass - -for job in range(len(results)-completed): - j,x = q.get() - if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") - K_j = x - # print j,K_j[:,0] - K = K + K_j - -K = K / float(IN.numSNPs) -if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) -np.savetxt(outFile,K) - -if options.saveEig: - if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") - Kva,Kve = linalg.eigh(K) - if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) - np.savetxt(outFile+".kva",Kva) - np.savetxt(outFile+".kve",Kve) - - - - diff --git a/wqflask/wqflask/my_pylmm/run_pylmm.py b/wqflask/wqflask/my_pylmm/run_pylmm.py deleted file mode 100755 index 0c96d986..00000000 --- a/wqflask/wqflask/my_pylmm/run_pylmm.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import absolute_import, print_function, division - -from base import data_set -from base.species import TheSpecies - - def run(dataset_name, vals, temp_uuid): - """Generates p-values for each marker""" - - tempdata = temp_data.TempData(temp_uuid) - - dataset = data_set.create_dataset(dataset_name) - species = TheSpecies(dataset=dataset) - - samples = [] # Want only ones with values - vals = vals - - for sample in dataset.group.samplelist: - samples.append(str(sample)) - - gen_data(dataset, vals, tempdata) - - - def gen_data(dataset, vals) - dataset.group.get_markers() - - pheno_vector = np.array([val == "x" and np.nan or float(val) for val in vals]) - - if dataset.group.species == "human": - p_values, t_stats = gen_human_results(pheno_vector, tempdata) - else: - genotype_data = [marker['genotypes'] for marker in dataset.group.markers.markers] - - no_val_samples = self.identify_empty_samples() - trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) - - genotype_matrix = np.array(trimmed_genotype_data).T - - #print("pheno_vector: ", pf(pheno_vector)) - #print("genotype_matrix: ", pf(genotype_matrix)) - #print("genotype_matrix.shape: ", pf(genotype_matrix.shape)) - - t_stats, p_values = lmm.run( - pheno_vector, - genotype_matrix, - restricted_max_likelihood=True, - refit=False, - temp_data=tempdata - ) - #print("p_values:", p_values) - - self.dataset.group.markers.add_pvalues(p_values) - return self.dataset.group.markers.markers - - - def gen_human_results(self, pheno_vector, tempdata): - file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name) - - plink_input = input.plink(file_base, type='b') - input_file_name = os.path.join(webqtlConfig.SNP_PATH, self.dataset.group.name + ".snps.gz") - - pheno_vector = pheno_vector.reshape((len(pheno_vector), 1)) - covariate_matrix = np.ones((pheno_vector.shape[0],1)) - kinship_matrix = np.fromfile(open(file_base + '.kin','r'),sep=" ") - kinship_matrix.resize((len(plink_input.indivs),len(plink_input.indivs))) - - p_values, t_stats = lmm.run_human( - pheno_vector, - covariate_matrix, - input_file_name, - kinship_matrix, - loading_progress=tempdata - ) - - return p_values, t_stats - -if __name__ == '__main__': - run(dataset_name, vals, temp_uuid) \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/scripts/example.py b/wqflask/wqflask/my_pylmm/scripts/example.py new file mode 100755 index 00000000..8b30debd --- /dev/null +++ b/wqflask/wqflask/my_pylmm/scripts/example.py @@ -0,0 +1,58 @@ +#!/usr/bin/python + +from __future__ import absolute_import, print_function, division + +import sys +import time + +import numpy as np +from pyLMM import lmm + +from pprint import pformat as pf + + +Y = np.genfromtxt('data/mdp.exprs.1.new') +print("exprs is:", pf(Y.shape)) + +# Loading npdump and first 1000 snps for speed +#K = np.load('data/hmdp.liver.K.npdump') +#snps = np.load('data/hmdp.liver.snps.1000.npdump').T + +# These three lines will load all SNPs (from npdump or from txt) and +# calculate the kinship +snps = np.genfromtxt('/home/zas1024/gene/web/new_genotypers/mdp.snps.1000.new').T +print("snps is:", pf(snps.shape)) +#snps = snps[~np.isnan(snps).all(axis=1)] +#print ("snps is now:", pf(snps)) +np.savetxt("/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.snps.trimmed", snps, fmt='%s', delimiter=' ') +#snps = np.load('data/hmdp.liver.snps.npdump').T +K = lmm.calculateKinship(snps) +#print("K is:", pf(K)) +#print("Y is:", pf(Y.shape)) + +# Instantiate a LMM object for the phentoype Y and fit the null model +L = lmm.LMM(Y,K) +L.fit() + +# Manually calculate the association at one SNP +X = snps[:,0] +X[np.isnan(X)] = X[True - np.isnan(X)].mean() # Fill missing with MAF +X = X.reshape(len(X),1) +if X.var() == 0: ts,ps = (np.nan,np.nan) +else: ts,ps = L.association(X) + +# If I want to refit the variance component +L.fit(X=X) +ts,ps = L.association(X) + +# If I want to do a genome-wide scan over the 1000 SNPs. +# This call will use REML (REML = False means use ML). +# It will also refit the variance components for each SNP. +# Setting refit = False will cause the program to fit the model once +# and hold those variance component estimates for each SNP. +begin = time.time() +TS,PS = lmm.GWAS(Y,snps,K,REML=True,refit=False) +print("TS is:", pf(TS)) +print("PS is:", pf(PS)) +end = time.time() +sys.stderr.write("Total time for 1000 SNPs: %0.3f\n" % (end- begin)) \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py new file mode 100755 index 00000000..74b58dd6 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py @@ -0,0 +1,346 @@ +#!/usr/bin/python + +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) + +#The program is free for academic use. Please contact Nick Furlotte +# if you are interested in using the software for +#commercial purposes. + +#The software must not be modified and distributed without prior +#permission of the author. + +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pdb +import time +import sys + +def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") + +def formatResult(id,beta,betaSD,ts,ps): + return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n" + +def outputResult(id,beta,betaSD,ts,ps): + out.write(formatResult(id,beta,betaSD,ts,ps)) + +from optparse import OptionParser,OptionGroup +usage = """usage: %prog [options] --kfile kinshipFile --[tfile | bfile] plinkFileBase outfileBase + +This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value. +The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values. + +Basic usage: + + python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile + + """ +parser = OptionParser(usage=usage) + +basicGroup = OptionGroup(parser, "Basic Options") +advancedGroup = OptionGroup(parser, "Advanced Options") +experimentalGroup = OptionGroup(parser, "Experimental Options") + +#basicGroup.add_option("--pfile", dest="pfile", +# help="The base for a PLINK ped file") +basicGroup.add_option("--tfile", dest="tfile", + help="The base for a PLINK tped file") +basicGroup.add_option("--bfile", dest="bfile", + help="The base for a PLINK binary bed file") +basicGroup.add_option("--phenofile", dest="phenoFile", default=None, + help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. This file should be in plink format. ") + +# EMMA Options +basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the columns and snps on the rows.") +basicGroup.add_option("--emmaPHENO", dest="emmaPheno", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each phenotype as one row.") +basicGroup.add_option("--emmaCOV", dest="emmaCov", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each covariate as one row.") + +basicGroup.add_option("--kfile", dest="kfile", + help="The location of a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") +basicGroup.add_option("--covfile", dest="covfile", + help="The location of a covariate file file. This is a plink formatted covariate file.") +basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0) + + + +advancedGroup.add_option("--removeMissingGenotypes", + action="store_false", dest="normalizeGenotype", default=True, + help="By default the program replaces missing genotypes with the minor allele frequency. This option overrides that behavior making the program remove missing individuals. NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.") +advancedGroup.add_option("--refit", + action="store_true", dest="refit", default=False, + help="Refit the variance components at each SNP (default is to lock in the variance components under the null).") + +advancedGroup.add_option("--REML", + action="store_true", dest="REML", default=False, + help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).") +#advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") +advancedGroup.add_option("--eigen", dest="eigenfile", + help="The location of the precomputed eigendecomposition for the kinship file. These can be computed with pylmmKinship.py.") +advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true", + help="This option only applies when --cofile is used. When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.") + +basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") + +advancedGroup.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="Print extra info") + +# Experimental Group Options +experimentalGroup.add_option("--kfile2", dest="kfile2", + help="The location of a second kinship file. This file has the same format as the first kinship. This might be used if you want to correct for another form of confounding.") + +parser.add_option_group(basicGroup) +parser.add_option_group(advancedGroup) +parser.add_option_group(experimentalGroup) + +(options, args) = parser.parse_args() + +import sys +import os +import numpy as np +from scipy import linalg +from pylmm.lmm import LMM +from pylmm import input + +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +if len(args) != 1: + parser.print_help() + sys.exit() + +outFile = args[0] + +if not options.tfile and not options.bfile and not options.emmaFile: +#if not options.pfile and not options.tfile and not options.bfile: + parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an EMMA formatted file (--emmaSNP).") +if not options.kfile: + parser.error("Please provide a pre-computed kinship file") + +numThreads = None +if options.numThreads: + numThreads = int(options.numThreads) + +# READING PLINK input +if options.verbose: sys.stderr.write("Reading SNP input...\n") +if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +#elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +elif options.emmaFile: IN = input.plink(options.emmaFile,type='emma', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +else: parser.error("You must provide at least one PLINK input file base") + +if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos') and not os.path.isfile(options.emmaPheno): + parser.error("No .pheno file exist for %s. Please provide a phenotype file using the --phenofile or --emmaPHENO argument." % (options.phenoFile or IN.fbase + '.phenos')) + +# Read the emma phenotype file if provided. +# Format should be rows are phenotypes and columns are individuals. +if options.emmaPheno: + f = open(options.emmaPheno,'r') + P = [] + for line in f: + v = line.strip().split() + p = [] + for x in v: + try: + p.append(float(x)) + except: p.append(np.nan) + P.append(p) + f.close() + IN.phenos = np.array(P).T + +# READING Covariate File +if options.covfile: + if options.verbose: + sys.stderr.write("Reading covariate file...\n") + P = IN.getCovariates(options.covfile) + if options.noMean: + X0 = P + else: + X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) +elif options.emmaCov: + if options.verbose: + sys.stderr.write("Reading covariate file...\n") + P = IN.getCovariatesEMMA(options.emmaCov) + if options.noMean: + X0 = P + else: + X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) +else: + X0 = np.ones((IN.phenos.shape[0],1)) + +if np.isnan(X0).sum(): + parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.") + +# READING Kinship - major bottleneck for large datasets +if options.verbose: sys.stderr.write("Reading kinship...\n") +begin = time.time() +# This method seems to be the fastest and works if you already know the size of the matrix +if options.kfile[-3:] == '.gz': + import gzip + f = gzip.open(options.kfile,'r') + F = f.read() # might exhaust mem if the file is huge + K = np.fromstring(F,sep=' ') # Assume that space separated + f.close() +else: + K = np.fromfile(open(options.kfile,'r'),sep=" ") +K.resize((len(IN.indivs),len(IN.indivs))) +end = time.time() +# Other slower ways +#K = np.loadtxt(options.kfile) +#K = np.genfromtxt(options.kfile) +if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin)) + +if options.kfile2: + if options.verbose: sys.stderr.write("Reading second kinship...\n") + begin = time.time() + # This method seems to be the fastest and works if you already know the size of the matrix + if options.kfile2[-3:] == '.gz': + import gzip + f = gzip.open(options.kfile2,'r') + F = f.read() # might exhaust mem if the file is huge + K2 = np.fromstring(F,sep=' ') # Assume that space separated + f.close() + else: K2 = np.fromfile(open(options.kfile2,'r'),sep=" ") + K2.resize((len(IN.indivs),len(IN.indivs))) + end = time.time() + if options.verbose: sys.stderr.write("Read the %d x %d second kinship matrix in %0.3fs \n" % (K2.shape[0],K2.shape[1],end-begin)) + +# PROCESS the phenotype data -- Remove missing phenotype values +# Keep will now index into the "full" data to select what we keep (either everything or a subset of non missing data +Y = IN.phenos[:,options.pheno] +v = np.isnan(Y) +keep = True - v +if v.sum(): + if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) + Y = Y[keep] + X0 = X0[keep,:] + K = K[keep,:][:,keep] + if options.kfile2: K2 = K2[keep,:][:,keep] + Kva = [] + Kve = [] + +# Only load the decomposition if we did not remove individuals. +# Otherwise it would not be correct and we would have to compute it again. +if not v.sum() and options.eigenfile: + if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n") + Kva = np.load(options.eigenfile + ".Kva") + Kve = np.load(options.eigenfile + ".Kve") +else: + Kva = [] + Kve = [] + +# CREATE LMM object for association +n = K.shape[0] +if not options.kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose) +else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=options.verbose,K2=K2) + +# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here +if not options.refit: + if options.verbose: sys.stderr.write("Computing fit for null model\n") + L.fit() + if options.verbose and not options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma)) + if options.verbose and options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f, w=%0.3f\n" % (L.optH,L.optSigma,L.optW)) + +def compute_snp(collect): + snp = collect[0] + id = collect[1] + # result = [] + # Check SNPs for missing values + x = snp[keep].reshape((n,1)) # all the SNPs + v = np.isnan(x).reshape((-1,)) + if v.sum(): + keeps = True - v + xs = x[keeps,:] + if keeps.sum() <= 1 or xs.var() <= 1e-6: + # PS.append(np.nan) + # TS.append(np.nan) + # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) + # continue + return formatResult(id,np.nan,np.nan,np.nan,np.nan) + + # Its ok to center the genotype - I used options.normalizeGenotype to + # force the removal of missing genotypes as opposed to replacing them with MAF. + if not options.normalizeGenotype: + xs = (xs - xs.mean()) / np.sqrt(xs.var()) + Ys = Y[keeps] + X0s = X0[keeps,:] + Ks = K[keeps,:][:,keeps] + if options.kfile2: + K2s = K2[keeps,:][:,keeps] + Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=options.verbose,K2=K2s) + else: + Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose) + if options.refit: + Ls.fit(X=xs,REML=options.REML) + else: + #try: + Ls.fit(REML=options.REML) + #except: pdb.set_trace() + ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True) + else: + if x.var() == 0: + # PS.append(np.nan) + # TS.append(np.nan) + # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values + return formatResult(id,np.nan,np.nan,np.nan,np.nan) + # continue + + if options.refit: + L.fit(X=x,REML=options.REML) + # This is where it happens + ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True) + + return formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps) + # PS.append(ps) + # TS.append(ts) + # return len(result) + # compute.q.put(result) + # return None + +def f_init(q): + compute_snp.q = q + +# Set up the pool +# mp.set_start_method('spawn') +q = mp.Queue() +p = mp.Pool(numThreads, f_init, [q]) +collect = [] + +# Buffers for pvalues and t-stats +# PS = [] +# TS = [] +count = 0 +out = open(outFile,'w') +printOutHead() + +for snp_id in IN: + count += 1 + if count % 1000 == 0: + if options.verbose: + sys.stderr.write("At SNP %d\n" % count) + # if count>8000 : + # break # for testing only + if count % 100 == 0: + for line in p.imap(compute_snp,collect): + out.write(line) + collect = [] + + collect.append(snp_id) +for line in p.imap(compute_snp,collect): + out.write(line) + + diff --git a/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py b/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py new file mode 100755 index 00000000..c9a73ece --- /dev/null +++ b/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py @@ -0,0 +1,186 @@ +#!/usr/bin/python + +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) + +#The program is free for academic use. Please contact Nick Furlotte +# if you are interested in using the software for +#commercial purposes. + +#The software must not be modified and distributed without prior +#permission of the author. + +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys +import pdb + +from optparse import OptionParser,OptionGroup +usage = """usage: %prog [options] --[tfile | bfile] plinkFileBase outfile + +""" + +parser = OptionParser(usage=usage) + +basicGroup = OptionGroup(parser, "Basic Options") +#advancedGroup = OptionGroup(parser, "Advanced Options") + +#basicGroup.add_option("--pfile", dest="pfile", +# help="The base for a PLINK ped file") +basicGroup.add_option("--tfile", dest="tfile", + help="The base for a PLINK tped file") +basicGroup.add_option("--bfile", dest="bfile", + help="The base for a PLINK binary ped file") +basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the rows and snps on the columns.") +basicGroup.add_option("--emmaNumSNPs", dest="numSNPs", type="int", default=0, + help="When providing the emmaSNP file you need to specify how many snps are in the file") + +basicGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") +basicGroup.add_option("-n", default=1000,dest="computeSize", type="int", help="The maximum number of SNPs to read into memory at once (default 1000). This is important when there is a large number of SNPs, because memory could be an issue.") +basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") + +basicGroup.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="Print extra info") + +parser.add_option_group(basicGroup) +#parser.add_option_group(advancedGroup) + +(options, args) = parser.parse_args() +if len(args) != 1: + parser.print_help() + sys.exit() + +outFile = args[0] + +import sys +import os +import numpy as np +from scipy import linalg +from pylmm.lmm import calculateKinship +from pylmm import input +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +if not options.tfile and not options.bfile and not options.emmaFile: + parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") + +numThreads = None +if options.numThreads: + numThreads = int(options.numThreads) + +if options.verbose: sys.stderr.write("Reading PLINK input...\n") +if options.bfile: IN = input.plink(options.bfile,type='b') +elif options.tfile: IN = input.plink(options.tfile,type='t') +#elif options.pfile: IN = input.plink(options.pfile,type='p') +elif options.emmaFile: + if not options.numSNPs: parser.error("You must provide the number of SNPs when specifying an emma formatted file.") + IN = input.plink(options.emmaFile,type='emma') +else: parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") + +def compute_W(job): + """ + Read 1000 SNPs at a time into matrix and return the result + """ + n = len(IN.indivs) + m = options.computeSize + W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) + for j in range(0,options.computeSize): + row = job*m + j + if row >= IN.numSNPs: + W = W[:,range(0,j)] + break + snp,id = IN.next() + if snp.var() == 0: + continue + W[:,j] = snp # set row to list of SNPs + return W + +def compute_dgemm(job,W): + """ + Compute Kinship(W)*j + + For every set of SNPs dgemm is used to multiply matrices T(W)*W + """ + res = None + try: + res = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) + except AttributeError: + res = np.dot(W,W.T) + compute_dgemm.q.put([job,res]) + return job + +def f_init(q): + compute_dgemm.q = q + +n = len(IN.indivs) +# m = options.computeSize +# jobsize=m + +IN.getSNPIterator() +# Annoying hack to get around the fact that it is expensive to determine the number of SNPs in an emma file +if options.emmaFile: IN.numSNPs = options.numSNPs +# i = 0 + +# mp.set_start_method('spawn') +q = mp.Queue() +p = mp.Pool(numThreads, f_init, [q]) +iterations = IN.numSNPs/options.computeSize+1 +# jobs = range(0,8) # range(0,iterations) + +results = [] + +K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals + +completed = 0 + +# for job in range(8): +for job in range(iterations): + if options.verbose: + sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) + W = compute_W(job) + results.append(p.apply_async(compute_dgemm, (job,W))) + # Do we have a result? + try: + j,x = q.get_nowait() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + except Queue.Empty: + pass + +for job in range(len(results)-completed): + j,x = q.get() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + +K = K / float(IN.numSNPs) +if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) +np.savetxt(outFile,K) + +if options.saveEig: + if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") + Kva,Kve = linalg.eigh(K) + if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + np.savetxt(outFile+".kva",Kva) + np.savetxt(outFile+".kve",Kve) + + + + diff --git a/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py b/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py new file mode 100755 index 00000000..0c96d986 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py @@ -0,0 +1,77 @@ +from __future__ import absolute_import, print_function, division + +from base import data_set +from base.species import TheSpecies + + def run(dataset_name, vals, temp_uuid): + """Generates p-values for each marker""" + + tempdata = temp_data.TempData(temp_uuid) + + dataset = data_set.create_dataset(dataset_name) + species = TheSpecies(dataset=dataset) + + samples = [] # Want only ones with values + vals = vals + + for sample in dataset.group.samplelist: + samples.append(str(sample)) + + gen_data(dataset, vals, tempdata) + + + def gen_data(dataset, vals) + dataset.group.get_markers() + + pheno_vector = np.array([val == "x" and np.nan or float(val) for val in vals]) + + if dataset.group.species == "human": + p_values, t_stats = gen_human_results(pheno_vector, tempdata) + else: + genotype_data = [marker['genotypes'] for marker in dataset.group.markers.markers] + + no_val_samples = self.identify_empty_samples() + trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) + + genotype_matrix = np.array(trimmed_genotype_data).T + + #print("pheno_vector: ", pf(pheno_vector)) + #print("genotype_matrix: ", pf(genotype_matrix)) + #print("genotype_matrix.shape: ", pf(genotype_matrix.shape)) + + t_stats, p_values = lmm.run( + pheno_vector, + genotype_matrix, + restricted_max_likelihood=True, + refit=False, + temp_data=tempdata + ) + #print("p_values:", p_values) + + self.dataset.group.markers.add_pvalues(p_values) + return self.dataset.group.markers.markers + + + def gen_human_results(self, pheno_vector, tempdata): + file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name) + + plink_input = input.plink(file_base, type='b') + input_file_name = os.path.join(webqtlConfig.SNP_PATH, self.dataset.group.name + ".snps.gz") + + pheno_vector = pheno_vector.reshape((len(pheno_vector), 1)) + covariate_matrix = np.ones((pheno_vector.shape[0],1)) + kinship_matrix = np.fromfile(open(file_base + '.kin','r'),sep=" ") + kinship_matrix.resize((len(plink_input.indivs),len(plink_input.indivs))) + + p_values, t_stats = lmm.run_human( + pheno_vector, + covariate_matrix, + input_file_name, + kinship_matrix, + loading_progress=tempdata + ) + + return p_values, t_stats + +if __name__ == '__main__': + run(dataset_name, vals, temp_uuid) \ No newline at end of file -- cgit v1.2.3 From 7fee695b79984912dd610a55e84db602780becac Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 20 Feb 2015 15:02:30 +0300 Subject: Splitting out handlers --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 8de3b3a7..d7d60e6e 100755 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -714,6 +714,7 @@ class LMM: pl.ylabel("Probability of data") pl.title(title) +# This is the main function used by Genenetwork2 (with environment) def gn2_main(): parser = argparse.ArgumentParser(description='Run pyLMM') parser.add_argument('-k', '--key') @@ -755,6 +756,15 @@ def gn2_main(): Redis.rpush(results_key, json_results) Redis.expire(results_key, 60*60) +# This is the main version used without Genenetwork2's environment +def cli_main(): + ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']), + covariate_matrix = np.array(params['covariate_matrix']), + plink_input_file = params['input_file_name'], + kinship_matrix = np.array(params['kinship_matrix']), + refit = params['refit'], + tempdata = tempdata) + if __name__ == '__main__': if has_gn2: gn2_main() -- cgit v1.2.3 From d73ef7975657ed937d4a603ca32d6ca547292ec5 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 28 Feb 2015 12:31:46 +0300 Subject: Started adaptations for introducing multi-core version --- wqflask/wqflask/my_pylmm/pyLMM/input.py | 2 +- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 46 +++++++++++------------- wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py | 55 +++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 26 deletions(-) create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py index dfe28a4e..fdf02e42 100755 --- a/wqflask/wqflask/my_pylmm/pyLMM/input.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py @@ -260,4 +260,4 @@ class plink: L.append(D[self.indivs[i]]) P = P[L,:] - return P \ No newline at end of file + return P diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index d7d60e6e..54a40e1f 100755 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -47,12 +47,13 @@ print("sys.path2:", sys.path) has_gn2=True +from utility.benchmark import Bench +from utility import temp_data + try: - from utility.benchmark import Bench - from utility import temp_data from wqflask.my_pylmm.pyLMM import chunks except ImportError: - print("WARNING: Standalone version\n") + print("WARNING: Standalone version missing the Genenetwork2 environment\n") has_gn2=False pass @@ -253,20 +254,21 @@ def run_other(pheno_vector, genotype_matrix, restricted_max_likelihood=True, refit=False, - tempdata=None): + tempdata=None, # <---- can not be None + is_testing=False): + """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics restricted_max_likelihood -- whether to use restricted max likelihood; True or False refit -- whether to refit the variance component for each marker temp_data -- TempData object that stores the progress for each major step of the - calculations ("calculate_kinship" and "GWAS" take the majority of time) + calculations ("calculate_kinship" and "GWAS" take the majority of time) """ - print("In run_other") with Bench("Calculate Kinship"): - kinship_matrix = calculate_kinship(genotype_matrix, tempdata) + kinship_matrix = calculate_kinship(genotype_matrix, tempdata, is_testing) print("kinship_matrix: ", pf(kinship_matrix)) print("kinship_matrix.shape: ", pf(kinship_matrix.shape)) @@ -321,7 +323,7 @@ def matrixMult(A,B): return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) -def calculate_kinship(genotype_matrix, temp_data): +def calculate_kinship(genotype_matrix, temp_data, is_testing=False): """ genotype_matrix is an n x m matrix encoding SNP minor alleles. @@ -331,10 +333,12 @@ def calculate_kinship(genotype_matrix, temp_data): """ n = genotype_matrix.shape[0] m = genotype_matrix.shape[1] - print("n is:", n) - print("m is:", m) + print("genotype matrix n is:", n) + print("genotype matrix m is:", m) keep = [] for counter in range(m): + if is_testing and counter>8: + break #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter])) #Checks if any values in column are not numbers not_number = np.isnan(genotype_matrix[:,counter]) @@ -478,7 +482,7 @@ class LMM: is not done consistently. """ - def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True): + def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True,is_testing=False): """ The constructor takes a phenotype vector or array of size n. @@ -487,7 +491,8 @@ class LMM: X0 is an optional covariate matrix of size n x q, where there are q covariates. When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. """ - + + self.is_testing = True if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1) self.verbose = verbose @@ -507,14 +512,14 @@ class LMM: self.nonmissing = x print("this K is:", pf(K)) - + if len(Kva) == 0 or len(Kve) == 0: if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) begin = time.time() Kva,Kve = linalg.eigh(K) end = time.time() if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) - + self.K = K self.Kva = Kva self.Kve = Kve @@ -745,7 +750,7 @@ def gn2_main(): genotype_matrix = np.array(params['genotype_matrix']), restricted_max_likelihood = params['restricted_max_likelihood'], refit = params['refit'], - tempdata = tempdata) + tempdata = tempdata, is_testing=False) results_key = "pylmm:results:" + params['temp_uuid'] @@ -755,21 +760,12 @@ def gn2_main(): #Pushing json_results into a list where it is the only item because blpop needs a list Redis.rpush(results_key, json_results) Redis.expire(results_key, 60*60) - -# This is the main version used without Genenetwork2's environment -def cli_main(): - ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']), - covariate_matrix = np.array(params['covariate_matrix']), - plink_input_file = params['input_file_name'], - kinship_matrix = np.array(params['kinship_matrix']), - refit = params['refit'], - tempdata = tempdata) if __name__ == '__main__': if has_gn2: gn2_main() else: - cli_main() + print("Run from pylmmGWAS.py instead") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py new file mode 100644 index 00000000..abb72081 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py @@ -0,0 +1,55 @@ +import sys +import time +import numpy as np +from numpy.distutils.system_info import get_info; +from scipy import linalg +from scipy import optimize +from scipy import stats + +useNumpy = None +hasBLAS = None + +def matrix_initialize(useBLAS): + global useNumpy # module based variable + if useBLAS and useNumpy == None: + print get_info('blas_opt') + try: + linalg.fblas + sys.stderr.write("INFO: using linalg.fblas\n") + useNumpy = False + hasBLAS = True + except AttributeError: + sys.stderr.write("WARNING: linalg.fblas not found, using numpy.dot instead!\n") + useNumpy = True + else: + sys.stderr.write("INFO: using numpy.dot\n") + useNumpy=True + +def matrixMult(A,B): + global useNumpy # module based variable + + if useNumpy: + return np.dot(A,B) + + # If the matrices are in Fortran order then the computations will be faster + # when using dgemm. Otherwise, the function will copy the matrix and that takes time. + if not A.flags['F_CONTIGUOUS']: + AA = A.T + transA = True + else: + AA = A + transA = False + + if not B.flags['F_CONTIGUOUS']: + BB = B.T + transB = True + else: + BB = B + transB = False + + return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) + +def matrixMultT(M): + # res = np.dot(W,W.T) + # return linalg.fblas.dgemm(alpha=1.,a=M.T,b=M.T,trans_a=True,trans_b=False) + return matrixMult(M,M.T) -- cgit v1.2.3 From c523f48392d7adf1ec2f5cb84a4e5ab3a594e569 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 1 Mar 2015 12:27:41 +0300 Subject: Add warning to running lmm.py from the command line --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 1 + 1 file changed, 1 insertion(+) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 54a40e1f..d3214d6c 100755 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -762,6 +762,7 @@ def gn2_main(): Redis.expire(results_key, 60*60) if __name__ == '__main__': + print "WARNING: Calling pylmm from lmm.py will become OBSOLETE, use run_pylmm instead!" if has_gn2: gn2_main() else: -- cgit v1.2.3 From 52204e9f7ec8fce00c0244ce5286fa394615604c Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 1 Mar 2015 13:15:30 +0300 Subject: Warning --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index d3214d6c..f69d4958 100755 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -762,7 +762,7 @@ def gn2_main(): Redis.expire(results_key, 60*60) if __name__ == '__main__': - print "WARNING: Calling pylmm from lmm.py will become OBSOLETE, use run_pylmm instead!" + print "WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!" if has_gn2: gn2_main() else: -- cgit v1.2.3 From b8fd98ed442fd7e0188652077d3d0b807c50417f Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 3 Mar 2015 17:19:39 +0300 Subject: fix print --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index f69d4958..3cbab149 100755 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -762,7 +762,7 @@ def gn2_main(): Redis.expire(results_key, 60*60) if __name__ == '__main__': - print "WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!" + print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!") if has_gn2: gn2_main() else: -- cgit v1.2.3 From d422300b8781049a903c68d9cc342c4596079ccd Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 5 Mar 2015 14:56:25 +0300 Subject: Removed execution flag x from files --- wqflask/wqflask/my_pylmm/pyLMM/__init__.py | 0 wqflask/wqflask/my_pylmm/pyLMM/chunks.py | 0 wqflask/wqflask/my_pylmm/pyLMM/input.py | 0 wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 0 wqflask/wqflask/my_pylmm/pyLMM/process_plink.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 wqflask/wqflask/my_pylmm/pyLMM/__init__.py mode change 100755 => 100644 wqflask/wqflask/my_pylmm/pyLMM/chunks.py mode change 100755 => 100644 wqflask/wqflask/my_pylmm/pyLMM/input.py mode change 100755 => 100644 wqflask/wqflask/my_pylmm/pyLMM/lmm.py mode change 100755 => 100644 wqflask/wqflask/my_pylmm/pyLMM/process_plink.py (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py old mode 100755 new mode 100644 diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py old mode 100755 new mode 100644 diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py old mode 100755 new mode 100644 diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py old mode 100755 new mode 100644 diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py old mode 100755 new mode 100644 -- cgit v1.2.3 From 8053869aa61fb430b89a32b0f8024fc133cea28f Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 5 Mar 2015 16:12:14 +0300 Subject: Helpers to split out functionality --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 36 ++++++++++++++++++++++++++++ wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 20 ++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py new file mode 100644 index 00000000..4147d90b --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -0,0 +1,36 @@ +# This is a converter for common LMM formats, so as to keep complexity +# outside the main routines. + +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from optparse import OptionParser + +usage = """ +Convert files for runlmm.py processing +""" + +parser = OptionParser(usage=usage) +# parser.add_option("-f", "--file", dest="input file", +# help="In", metavar="FILE") +parser.add_option("--kinship", + help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") +parser.add_option("-q", "--quiet", + action="store_false", dest="verbose", default=True, + help="don't print status messages to stdout") + +(options, args) = parser.parse_args() + + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py new file mode 100644 index 00000000..df86f718 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -0,0 +1,20 @@ +# This is the LMM runner that calls the possible methods using command line +# switches. It acts as a multiplexer where all the invocation complexity +# is kept outside the main LMM routines. +# +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + -- cgit v1.2.3 From 0754dbf77f4362beaef45b0ac651676d47659b70 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 5 Mar 2015 16:50:14 +0300 Subject: Convert textual kinship file --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 35 ++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 4147d90b..35fccef4 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -17,15 +17,25 @@ # along with this program. If not, see . from optparse import OptionParser +import sys +import os +import numpy as np +# from lmm import LMM, run_other +import input + usage = """ -Convert files for runlmm.py processing +python convertlmm.py [--kinship] infile + + Convert files for runlmm.py processing. Writes to stdout. + + try --help for more information """ parser = OptionParser(usage=usage) # parser.add_option("-f", "--file", dest="input file", # help="In", metavar="FILE") -parser.add_option("--kinship", +parser.add_option("--kinship",action="store_false", dest="kinship", default=True, help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, @@ -33,4 +43,25 @@ parser.add_option("-q", "--quiet", (options, args) = parser.parse_args() +if len(args) == 0: + print usage + sys.exit(1) +if options.kinship: + is_header = True + assert(len(args)==1) + count = 0 + for line in open(args[0],'r'): + count += 1 + if is_header: + size = len(line.split()) + print "# Kinship format version 1.0" + print "# Size=",size + for i in range(size): + sys.stdout.write("\t"+str(i+1)) + sys.stdout.write("\n") + is_header = False + sys.stdout.write(str(count)) + sys.stdout.write("\t") + sys.stdout.write("\t".join(line.split())) + sys.stdout.write("\n") -- cgit v1.2.3 From b9d8057c2017ade6dd6ce54c45d718beb7c25c6a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 5 Mar 2015 17:27:36 +0300 Subject: Kinship file format support (reader and writer) --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 1 - wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 49 ++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 35fccef4..012e96b2 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -23,7 +23,6 @@ import numpy as np # from lmm import LMM, run_other import input - usage = """ python convertlmm.py [--kinship] infile diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index df86f718..4a640d6d 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -17,4 +17,53 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +from optparse import OptionParser +import sys +import os +import numpy as np +# from lmm import LMM, run_other +import csv + +usage = """ +python runlmm.py [options] command + + runlmm.py processing multiplexer reads standard input types and calls the routines + + Current commands are: + + parse : only parse input files + + try --help for more information +""" + +parser = OptionParser(usage=usage) +# parser.add_option("-f", "--file", dest="input file", +# help="In", metavar="FILE") +parser.add_option("--kinship",dest="kinship", + help="Kinship file format") +parser.add_option("-q", "--quiet", + action="store_false", dest="verbose", default=True, + help="don't print status messages to stdout") + +(options, args) = parser.parse_args() + +if len(args) != 1: + print usage + sys.exit(1) + +cmd = args[0] +print "Command: ",cmd + +if options.kinship: + K1 = [] + print options.kinship + with open(options.kinship,'r') as tsvin: + assert(tsvin.readline().strip() == "# Kinship format version 1.0") + tsvin.readline() + tsvin.readline() + tsv = csv.reader(tsvin, delimiter='\t') + for row in tsv: + ns = np.genfromtxt(row[1:]) + K1.append(ns) # <--- slow + K = np.array(K1) -- cgit v1.2.3 From 15b3b5b103fdee72a8dc0dac1c9c29e5680fbef9 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 7 Mar 2015 11:02:00 +0300 Subject: Kinship converter --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 35 +++++++++++++++++----------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 012e96b2..2af84477 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -24,33 +24,38 @@ import numpy as np import input usage = """ -python convertlmm.py [--kinship] infile +python convertlmm.py [--kinship kfile] Convert files for runlmm.py processing. Writes to stdout. try --help for more information """ -parser = OptionParser(usage=usage) -# parser.add_option("-f", "--file", dest="input file", -# help="In", metavar="FILE") -parser.add_option("--kinship",action="store_false", dest="kinship", default=True, - help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") -parser.add_option("-q", "--quiet", +# if len(args) == 0: +# print usage +# sys.exit(1) + +option_parser = OptionParser(usage=usage) +option_parser.add_option("--kinship", dest="kinship", + help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program") +# option_parser.add_option("--kinship",action="store_false", dest="kinship", default=True, +# help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") +option_parser.add_option("--prefix", dest="prefix", + help="Output prefix for output file(s)") +option_parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") +option_parser.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="Print extra info") -(options, args) = parser.parse_args() - -if len(args) == 0: - print usage - sys.exit(1) +(options, args) = option_parser.parse_args() if options.kinship: is_header = True - assert(len(args)==1) count = 0 - for line in open(args[0],'r'): + sys.stderr.write("Converting "+options.kinship+"\n") + for line in open(options.kinship,'r'): count += 1 if is_header: size = len(line.split()) @@ -64,3 +69,5 @@ if options.kinship: sys.stdout.write("\t") sys.stdout.write("\t".join(line.split())) sys.stdout.write("\n") + +sys.stderr.write("Converting done\n") -- cgit v1.2.3 From 11856132296916f7788437681bea9a2c71fe1e50 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 7 Mar 2015 12:21:29 +0300 Subject: Write phenotypes --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 72 ++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 10 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 2af84477..0604762e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -38,6 +38,8 @@ python convertlmm.py [--kinship kfile] option_parser = OptionParser(usage=usage) option_parser.add_option("--kinship", dest="kinship", help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program") +option_parser.add_option("--plink", dest="plink", + help="Parse a phenotype file (PLINK style)") # option_parser.add_option("--kinship",action="store_false", dest="kinship", default=True, # help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") option_parser.add_option("--prefix", dest="prefix", @@ -51,23 +53,73 @@ option_parser.add_option("-v", "--verbose", (options, args) = option_parser.parse_args() +writer = None + +def msg(s): + sys.stderr.write("INFO: ") + sys.stderr.write(s) + sys.stderr.write("\n") + +def wr(s): + if writer: + writer.write(s) + else: + sys.stdout.write(s) + +def wrln(s): + wr(s) + wr("\n") + if options.kinship: is_header = True count = 0 - sys.stderr.write("Converting "+options.kinship+"\n") + msg("Converting "+options.kinship) + if options.prefix: + writer = open(options.prefix+".kin","w") for line in open(options.kinship,'r'): count += 1 if is_header: size = len(line.split()) - print "# Kinship format version 1.0" - print "# Size=",size + wrln("# Kinship format version 1.0") + wrln("# Size="+str(size)) for i in range(size): - sys.stdout.write("\t"+str(i+1)) - sys.stdout.write("\n") + wr("\t"+str(i+1)) + wr("\n") is_header = False - sys.stdout.write(str(count)) - sys.stdout.write("\t") - sys.stdout.write("\t".join(line.split())) - sys.stdout.write("\n") + wr(str(count)) + wr("\t") + wr("\t".join(line.split())) + wr("\n") + msg(str(count)+" lines written") + +if options.plink: + # Because plink does not track size we need to read the whole thing first + msg("Converting "+options.plink) + phenos = [] + count = 0 + count_pheno = None + for line in open(options.plink,'r'): + count += 1 + list = line.split() + pcount = len(list)-2 + assert(pcount > 0) + if count_pheno == None: + count_pheno = pcount + assert(count_pheno == pcount) + row = [list[0]]+list[2:] + phenos.append(row) -sys.stderr.write("Converting done\n") + if options.prefix: + writer = open(options.prefix+".pheno","w") + wrln("# Phenotype format version 1.0") + wrln("# Individuals = "+str(count)) + wrln("# Phenotypes = "+str(count_pheno)) + for i in range(count_pheno): + wr("\t"+str(i+1)) + wr("\n") + for i in range(count): + wr("\t".join(phenos[i])) + wr("\n") + msg(str(count)+" lines written") + +msg("Converting done") -- cgit v1.2.3 From 9485cbbad2596b3105ad995e15ac3cb3dca6615d Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 08:43:12 +0300 Subject: Parse new phenotype format --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 4a640d6d..27afd6f4 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -41,7 +41,9 @@ parser = OptionParser(usage=usage) # parser.add_option("-f", "--file", dest="input file", # help="In", metavar="FILE") parser.add_option("--kinship",dest="kinship", - help="Kinship file format") + help="Kinship file format 1.0") +parser.add_option("--pheno",dest="pheno", + help="Phenotype file format 1.0") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") @@ -67,3 +69,19 @@ if options.kinship: ns = np.genfromtxt(row[1:]) K1.append(ns) # <--- slow K = np.array(K1) + +if options.pheno: + Y1 = [] + print options.pheno + with open(options.pheno,'r') as tsvin: + assert(tsvin.readline().strip() == "# Phenotype format version 1.0") + tsvin.readline() + tsvin.readline() + tsvin.readline() + tsv = csv.reader(tsvin, delimiter='\t') + for row in tsv: + ns = np.genfromtxt(row[1:]) + Y1.append(ns) # <--- slow + Y = np.array(Y1) + +print Y -- cgit v1.2.3 From d76d0fa9057aa774cc9675584b4737dda4dbf3a1 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 11:35:04 +0300 Subject: Parse new phenotype format --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 0604762e..3e216475 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -24,9 +24,9 @@ import numpy as np import input usage = """ -python convertlmm.py [--kinship kfile] +python convertlmm.py [--plink] [--prefix basename] [--kinship kfile] [--pheno] [--geno] - Convert files for runlmm.py processing. Writes to stdout. + Convert files for runlmm.py processing. Writes to stdout by default. try --help for more information """ @@ -38,8 +38,12 @@ python convertlmm.py [--kinship kfile] option_parser = OptionParser(usage=usage) option_parser.add_option("--kinship", dest="kinship", help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program") -option_parser.add_option("--plink", dest="plink", - help="Parse a phenotype file (PLINK style)") +option_parser.add_option("--pheno", dest="pheno", + help="Parse a phenotype file (use with --plink only)") +option_parser.add_option("--geno", dest="geno", + help="Parse a genotype file (use with --plink only)") +option_parser.add_option("--plink", dest="plink", default=False, + help="Parse PLINK style") # option_parser.add_option("--kinship",action="store_false", dest="kinship", default=True, # help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") option_parser.add_option("--prefix", dest="prefix", @@ -92,7 +96,9 @@ if options.kinship: wr("\n") msg(str(count)+" lines written") -if options.plink: +if options.pheno: + if not options.plink: + raise Exception("Use --plink switch") # Because plink does not track size we need to read the whole thing first msg("Converting "+options.plink) phenos = [] @@ -121,5 +127,6 @@ if options.plink: wr("\t".join(phenos[i])) wr("\n") msg(str(count)+" lines written") - + + msg("Converting done") -- cgit v1.2.3 From b955d8c4d2b1c6857905513a5de77039892fe18a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 11:44:17 +0300 Subject: convertlmm.py: plink support --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 3e216475..3932a4d9 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -24,7 +24,7 @@ import numpy as np import input usage = """ -python convertlmm.py [--plink] [--prefix basename] [--kinship kfile] [--pheno] [--geno] +python convertlmm.py [--plink] [--prefix out_basename] [--kinship kfile] [--pheno pname] [--geno gname] Convert files for runlmm.py processing. Writes to stdout by default. @@ -42,7 +42,7 @@ option_parser.add_option("--pheno", dest="pheno", help="Parse a phenotype file (use with --plink only)") option_parser.add_option("--geno", dest="geno", help="Parse a genotype file (use with --plink only)") -option_parser.add_option("--plink", dest="plink", default=False, +option_parser.add_option("--plink", dest="plink", action="store_true", default=False, help="Parse PLINK style") # option_parser.add_option("--kinship",action="store_false", dest="kinship", default=True, # help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") @@ -78,6 +78,7 @@ if options.kinship: is_header = True count = 0 msg("Converting "+options.kinship) + writer = None if options.prefix: writer = open(options.prefix+".kin","w") for line in open(options.kinship,'r'): @@ -94,17 +95,17 @@ if options.kinship: wr("\t") wr("\t".join(line.split())) wr("\n") - msg(str(count)+" lines written") + msg(str(count)+" kinship lines written") if options.pheno: if not options.plink: raise Exception("Use --plink switch") # Because plink does not track size we need to read the whole thing first - msg("Converting "+options.plink) + msg("Converting "+options.pheno) phenos = [] count = 0 count_pheno = None - for line in open(options.plink,'r'): + for line in open(options.pheno,'r'): count += 1 list = line.split() pcount = len(list)-2 @@ -115,6 +116,7 @@ if options.pheno: row = [list[0]]+list[2:] phenos.append(row) + writer = None if options.prefix: writer = open(options.prefix+".pheno","w") wrln("# Phenotype format version 1.0") @@ -126,7 +128,7 @@ if options.pheno: for i in range(count): wr("\t".join(phenos[i])) wr("\n") - msg(str(count)+" lines written") + msg(str(count)+" pheno lines written") msg("Converting done") -- cgit v1.2.3 From 0565d021e3c65256d53cfdfb32ccf55dacd1ecdf Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 12:41:32 +0300 Subject: convertlmm.py: options.geno --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 3932a4d9..25011b18 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -21,7 +21,8 @@ import sys import os import numpy as np # from lmm import LMM, run_other -import input +# import input +import plink usage = """ python convertlmm.py [--plink] [--prefix out_basename] [--kinship kfile] [--pheno pname] [--geno gname] @@ -77,7 +78,7 @@ def wrln(s): if options.kinship: is_header = True count = 0 - msg("Converting "+options.kinship) + msg("Converting kinship "+options.kinship) writer = None if options.prefix: writer = open(options.prefix+".kin","w") @@ -101,7 +102,7 @@ if options.pheno: if not options.plink: raise Exception("Use --plink switch") # Because plink does not track size we need to read the whole thing first - msg("Converting "+options.pheno) + msg("Converting pheno "+options.pheno) phenos = [] count = 0 count_pheno = None @@ -130,5 +131,12 @@ if options.pheno: wr("\n") msg(str(count)+" pheno lines written") +if options.geno: + if not options.plink: + raise Exception("Use --plink switch") + # msg("Converting geno "+options.geno) + # plink.readbim(options.geno+'.bim') + msg("Converting geno "+options.geno+'.bed') + plink.readbed(options.geno+'.bed',1000,8) msg("Converting done") -- cgit v1.2.3 From eca1af83ecbca6c0ac018514f322cc5b49cfa405 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 12:43:54 +0300 Subject: plink support --- wqflask/wqflask/my_pylmm/pyLMM/plink.py | 106 ++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/plink.py (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py new file mode 100644 index 00000000..53151b21 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py @@ -0,0 +1,106 @@ +# Plink module +# +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# Some of the BED file parsing came from pylmm: +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +# According to the PLINK information + +# Parse a textual BIM file and return the contents as a list of tuples +# +# Extended variant information file accompanying a .bed binary genotype table. +# +# A text file with no header line, and one line per variant with the following six fields: +# +# Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name +# Variant identifier +# Position in morgans or centimorgans (safe to use dummy value of '0') +# Base-pair coordinate (normally 1-based, but 0 ok; limited to 231-2) +# Allele 1 (corresponding to clear bits in .bed; usually minor) +# Allele 2 (corresponding to set bits in .bed; usually major) +# +# Allele codes can contain more than one character. Variants with negative bp coordinates are ignored by PLINK. Example +# +# 1 mm37-1-3125499 0 3125499 1 2 +# 1 mm37-1-3125701 0 3125701 1 2 +# 1 mm37-1-3187481 0 3187481 1 2 + +import struct +import numpy as np + +def readbim(fn): + res = [] + for line in open(fn): + list = line.split() + if len([True for e in list if e == 'nan']) == 0: + res.append( (list[0],list[1],int(list[2]),int(list[3]),int(list[4]),int(list[5])) ) + return res + +# .bed (PLINK binary biallelic genotype table) +# +# Primary representation of genotype calls at biallelic variants. Must be accompanied by .bim and .fam files. +# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bed +# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam +# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim + +def readbed(fn,inds,snps): + bytes = inds / 4 + (inds % 4 and 1 or 0) + format = 'c'*bytes + f = open(fn,'rb') + magic = f.read(3) + assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01") + + def formatBinaryGenotypes(X): + D = { \ + '00': 0.0, \ + '10': 0.5, \ + '11': 1.0, \ + '01': np.nan \ + } + + D_tped = { \ + '00': '1 1', \ + '10': '1 2', \ + '11': '2 2', \ + '01': '0 0' \ + } + + #D = D_tped + + G = [] + for x in X: + if not len(x) == 10: + xx = x[2:] + x = '0b' + '0'*(8 - len(xx)) + xx + a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4]) + L = [D[y] for y in [a,b,c,d]] + G += L + # only take the leading values because whatever is left should be null + G = G[:inds] + G = np.array(G) + # if normalize: + # G = self.normalizeGenotype(G) + return G + + def next(): + X = f.read(bytes) + XX = [bin(ord(x)) for x in struct.unpack(format,X)] + # snp = 0 + # snp = self.snpFileHandle.readline().strip().split()[1] + return formatBinaryGenotypes(XX) + + for x in range(snps): + print next() -- cgit v1.2.3 From aaf0f336b69fb1fb77c9dd3782ba4db0d732106c Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 13:38:03 +0300 Subject: plink: no need to know the number of SNPs, but you need INDs --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 17 +++++++-- wqflask/wqflask/my_pylmm/pyLMM/plink.py | 56 ++++++++++++++-------------- 2 files changed, 41 insertions(+), 32 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 25011b18..d374407e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -59,6 +59,7 @@ option_parser.add_option("-v", "--verbose", (options, args) = option_parser.parse_args() writer = None +num_inds = None def msg(s): sys.stderr.write("INFO: ") @@ -96,6 +97,7 @@ if options.kinship: wr("\t") wr("\t".join(line.split())) wr("\n") + num_inds = count msg(str(count)+" kinship lines written") if options.pheno: @@ -129,14 +131,23 @@ if options.pheno: for i in range(count): wr("\t".join(phenos[i])) wr("\n") + num_inds = count msg(str(count)+" pheno lines written") if options.geno: if not options.plink: raise Exception("Use --plink switch") + if not num_inds: + raise Exception("Can not figure out the number of individuals, use --pheno or --kinship") # msg("Converting geno "+options.geno) - # plink.readbim(options.geno+'.bim') + + snps = plink.readbim(options.geno+'.bim') msg("Converting geno "+options.geno+'.bed') - plink.readbed(options.geno+'.bed',1000,8) - + + def out(i,x): + print i,x + + snps = plink.readbed(options.geno+'.bed',num_inds, out) + msg(str(count)+" geno lines written (with "+str(snps)+" snps)") + msg("Converting done") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py index 53151b21..3e850c9b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/plink.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py @@ -51,19 +51,24 @@ def readbim(fn): # .bed (PLINK binary biallelic genotype table) # -# Primary representation of genotype calls at biallelic variants. Must be accompanied by .bim and .fam files. +# Primary representation of genotype calls at biallelic variants. Must +# be accompanied by .bim and .fam files. Basically contains num SNP +# blocks containing IND (compressed 4 IND into a byte) +# +# Since it is a biallelic format it supports for every individual +# whether the first allele is homozygous (b00), the second allele is +# homozygous (b11), it is heterozygous (b10) or that it is missing +# (b01). + # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bed # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim -def readbed(fn,inds,snps): - bytes = inds / 4 + (inds % 4 and 1 or 0) - format = 'c'*bytes - f = open(fn,'rb') - magic = f.read(3) - assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01") +def readbed(fn,inds,func=None): - def formatBinaryGenotypes(X): + # For every SNP block fetch the individual genotypes using values + # 0.0 and 1.0 for homozygous and 0.5 for heterozygous alleles + def fetchGenotypes(X): D = { \ '00': 0.0, \ '10': 0.5, \ @@ -71,15 +76,6 @@ def readbed(fn,inds,snps): '01': np.nan \ } - D_tped = { \ - '00': '1 1', \ - '10': '1 2', \ - '11': '2 2', \ - '01': '0 0' \ - } - - #D = D_tped - G = [] for x in X: if not len(x) == 10: @@ -88,19 +84,21 @@ def readbed(fn,inds,snps): a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4]) L = [D[y] for y in [a,b,c,d]] G += L - # only take the leading values because whatever is left should be null G = G[:inds] G = np.array(G) - # if normalize: - # G = self.normalizeGenotype(G) return G - def next(): - X = f.read(bytes) - XX = [bin(ord(x)) for x in struct.unpack(format,X)] - # snp = 0 - # snp = self.snpFileHandle.readline().strip().split()[1] - return formatBinaryGenotypes(XX) - - for x in range(snps): - print next() + bytes = inds / 4 + (inds % 4 and 1 or 0) + format = 'c'*bytes + count = 0 + with open(fn,'rb') as f: + magic = f.read(3) + assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01") + while True: + count += 1 + X = f.read(bytes) + if not X: + return(count-1) + XX = [bin(ord(x)) for x in struct.unpack(format,X)] + xs = fetchGenotypes(XX) + func(count,xs) -- cgit v1.2.3 From 0919dd489deeb736b08c50d4b7636b6af00d92c8 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 13:51:13 +0300 Subject: Not using numpy - is part of the caller --- wqflask/wqflask/my_pylmm/pyLMM/plink.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py index 3e850c9b..75d51c3b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/plink.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py @@ -39,7 +39,7 @@ # 1 mm37-1-3187481 0 3187481 1 2 import struct -import numpy as np +# import numpy as np def readbim(fn): res = [] @@ -73,7 +73,7 @@ def readbed(fn,inds,func=None): '00': 0.0, \ '10': 0.5, \ '11': 1.0, \ - '01': np.nan \ + '01': float('nan') \ } G = [] @@ -85,7 +85,7 @@ def readbed(fn,inds,func=None): L = [D[y] for y in [a,b,c,d]] G += L G = G[:inds] - G = np.array(G) + # G = np.array(G) return G bytes = inds / 4 + (inds % 4 and 1 or 0) -- cgit v1.2.3 From 60ef29c2b003f9d9a76d901ca4107b8cce3b9799 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 13:54:24 +0300 Subject: Now we can use print in lamdba --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index d374407e..173bae2e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -16,6 +16,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +from __future__ import print_function from optparse import OptionParser import sys import os @@ -144,10 +145,10 @@ if options.geno: snps = plink.readbim(options.geno+'.bim') msg("Converting geno "+options.geno+'.bed') - def out(i,x): - print i,x + # def out(i,x): + # print i,x - snps = plink.readbed(options.geno+'.bed',num_inds, out) + snps = plink.readbed(options.geno+'.bed',num_inds, lambda i,x: print(i,x)) msg(str(count)+" geno lines written (with "+str(snps)+" snps)") msg("Converting done") -- cgit v1.2.3 From d924bc5842d2813aad8359b9cb614c2a87df1ee9 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 14:37:04 +0300 Subject: plink: missing values in bim --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 27 +++++++++++++++++++-------- wqflask/wqflask/my_pylmm/pyLMM/plink.py | 2 ++ 2 files changed, 21 insertions(+), 8 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 173bae2e..d04f9b94 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -136,19 +136,30 @@ if options.pheno: msg(str(count)+" pheno lines written") if options.geno: + msg("Converting geno "+options.geno+'.bed') if not options.plink: raise Exception("Use --plink switch") if not num_inds: raise Exception("Can not figure out the number of individuals, use --pheno or --kinship") - # msg("Converting geno "+options.geno) - - snps = plink.readbim(options.geno+'.bim') - msg("Converting geno "+options.geno+'.bed') - - # def out(i,x): - # print i,x + num_snps = plink.readbim(options.geno+'.bim') + writer = None + if options.prefix: + writer = open(options.prefix+".geno","w") + wrln("# Genotype format version 1.0") + wrln("# Individuals = "+str(num_inds)) + wrln("# Phenotypes = "+str(len(num_snps))) + + def out(i,x): + # print(i,x) + pass - snps = plink.readbed(options.geno+'.bed',num_inds, lambda i,x: print(i,x)) + snps = plink.readbed(options.geno+'.bed',num_inds, out) + # for i in range(num_snps): + # wr("\t"+str(i+1)) + # wr("\n") + # for i in range(count): + # wr("\t".join(phenos[i])) + # wr("\n") msg(str(count)+" geno lines written (with "+str(snps)+" snps)") msg("Converting done") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py index 75d51c3b..4ad2c5f7 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/plink.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py @@ -47,6 +47,8 @@ def readbim(fn): list = line.split() if len([True for e in list if e == 'nan']) == 0: res.append( (list[0],list[1],int(list[2]),int(list[3]),int(list[4]),int(list[5])) ) + else: + res.append( (list[0],list[1],list[2],float('nan'),float('nan'),float('nan')) ) return res # .bed (PLINK binary biallelic genotype table) -- cgit v1.2.3 From f92657b08cb2dab8b24c69dee525f52c125713a3 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 15:34:16 +0300 Subject: Write genotype file (start) --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index d04f9b94..7376a1d5 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -150,8 +150,10 @@ if options.geno: wrln("# Phenotypes = "+str(len(num_snps))) def out(i,x): - # print(i,x) - pass + wr(str(i)+"\t") + for v in x: + wr("\t".join(str(v))) + wr("\n") snps = plink.readbed(options.geno+'.bed',num_inds, out) # for i in range(num_snps): -- cgit v1.2.3 From 02cb9710a83bc81abe198cb089dc5a70baf1b657 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 15:37:39 +0300 Subject: Write genotype file (start) --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 7376a1d5..a3728bd2 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -151,8 +151,8 @@ if options.geno: def out(i,x): wr(str(i)+"\t") - for v in x: - wr("\t".join(str(v))) + lst = [str(v) for v in x] + wr("\t".join(lst)) wr("\n") snps = plink.readbed(options.geno+'.bed',num_inds, out) -- cgit v1.2.3 From 749cf072da1849d926ab4d74e288ddf582e84c5a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 16:47:33 +0300 Subject: Write genotype file non-transposed --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 12 ++++++------ wqflask/wqflask/my_pylmm/pyLMM/plink.py | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index a3728bd2..f35cc47c 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -148,17 +148,17 @@ if options.geno: wrln("# Genotype format version 1.0") wrln("# Individuals = "+str(num_inds)) wrln("# Phenotypes = "+str(len(num_snps))) + for i in range(len(num_snps)): + wr("\t"+str(i+1)) + wr("\n") + m = [] def out(i,x): wr(str(i)+"\t") - lst = [str(v) for v in x] - wr("\t".join(lst)) + wr("\t".join(x)) wr("\n") - snps = plink.readbed(options.geno+'.bed',num_inds, out) - # for i in range(num_snps): - # wr("\t"+str(i+1)) - # wr("\n") + snps = plink.readbed(options.geno+'.bed',num_inds, ('A','H','B','NA'), out) # for i in range(count): # wr("\t".join(phenos[i])) # wr("\n") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py index 4ad2c5f7..7bd2df91 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/plink.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py @@ -66,25 +66,26 @@ def readbim(fn): # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim -def readbed(fn,inds,func=None): +def readbed(fn,inds,encoding,func=None): # For every SNP block fetch the individual genotypes using values # 0.0 and 1.0 for homozygous and 0.5 for heterozygous alleles def fetchGenotypes(X): - D = { \ - '00': 0.0, \ - '10': 0.5, \ - '11': 1.0, \ - '01': float('nan') \ - } - + # D = { \ + # '00': 0.0, \ + # '10': 0.5, \ + # '11': 1.0, \ + # '01': float('nan') \ + # } + + Didx = { '00': 0, '10': 1, '11': 2, '01': 3 } G = [] for x in X: if not len(x) == 10: xx = x[2:] x = '0b' + '0'*(8 - len(xx)) + xx a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4]) - L = [D[y] for y in [a,b,c,d]] + L = [encoding[Didx[y]] for y in [a,b,c,d]] G += L G = G[:inds] # G = np.array(G) -- cgit v1.2.3 From f7668a8b9fe552b9d908e2c68367e660d3a81482 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 17:14:19 +0300 Subject: Add HAB encoding --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 24 +++++++++++++++--------- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 20 +++++++++++++++++++- 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index f35cc47c..8a1f03ad 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -148,20 +148,26 @@ if options.geno: wrln("# Genotype format version 1.0") wrln("# Individuals = "+str(num_inds)) wrln("# Phenotypes = "+str(len(num_snps))) - for i in range(len(num_snps)): + for i in range(num_inds): wr("\t"+str(i+1)) - wr("\n") + wr("\n") m = [] def out(i,x): - wr(str(i)+"\t") - wr("\t".join(x)) - wr("\n") + # wr(str(i)+"\t") + # wr("\t".join(x)) + # wr("\n") + m.append(x) - snps = plink.readbed(options.geno+'.bed',num_inds, ('A','H','B','NA'), out) - # for i in range(count): - # wr("\t".join(phenos[i])) - # wr("\n") + snps = plink.readbed(options.geno+'.bed',num_inds, ('A','H','B','-'), out) + + msg("Write transposed genotype matrix") + for g in range(len(num_snps)): + wr(str(g+1)+"\t") + for i in range(num_inds): + wr(m[g][i]) + wr("\n") + msg(str(count)+" geno lines written (with "+str(snps)+" snps)") msg("Converting done") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 27afd6f4..4398926f 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -44,6 +44,8 @@ parser.add_option("--kinship",dest="kinship", help="Kinship file format 1.0") parser.add_option("--pheno",dest="pheno", help="Phenotype file format 1.0") +parser.add_option("--geno",dest="geno", + help="Genotype file format 1.0") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") @@ -84,4 +86,20 @@ if options.pheno: Y1.append(ns) # <--- slow Y = np.array(Y1) -print Y +if options.geno: + G1 = [] + print options.geno + with open(options.geno,'r') as tsvin: + assert(tsvin.readline().strip() == "# Genotype format version 1.0") + tsvin.readline() + tsvin.readline() + tsvin.readline() + tsvin.readline() + tsv = csv.reader(tsvin, delimiter='\t') + for row in tsv: + print(row) + # ns = np.genfromtxt(row[1:]) + G1.append(ns) # <--- slow + G = np.array(G1) + +print G -- cgit v1.2.3 From 5528b84927e8e27ad4c13621272bb4bee4a9d694 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Mar 2015 11:17:31 +0300 Subject: Turn HAB encoding into pylmm genotyping --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 3 ++- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 8a1f03ad..89c09b1e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -147,7 +147,8 @@ if options.geno: writer = open(options.prefix+".geno","w") wrln("# Genotype format version 1.0") wrln("# Individuals = "+str(num_inds)) - wrln("# Phenotypes = "+str(len(num_snps))) + wrln("# SNPs = "+str(len(num_snps))) + wrln("# Encoding = HAB") for i in range(num_inds): wr("\t"+str(i+1)) wr("\n") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 4398926f..ce8e32be 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -28,7 +28,8 @@ import csv usage = """ python runlmm.py [options] command - runlmm.py processing multiplexer reads standard input types and calls the routines + runlmm.py processing multiplexer reads standardised input formats + and calls the different routines Current commands are: @@ -88,6 +89,9 @@ if options.pheno: if options.geno: G1 = [] + hab_mapper = {'A':0,'H':1,'B':2,'-':3} + pylmm_mapper = [ 0.0, 0.5, 1.0, float('nan') ] + print options.geno with open(options.geno,'r') as tsvin: assert(tsvin.readline().strip() == "# Genotype format version 1.0") @@ -97,9 +101,14 @@ if options.geno: tsvin.readline() tsv = csv.reader(tsvin, delimiter='\t') for row in tsv: - print(row) + # print(row) + id = row[0] + gs = list(row[1]) + # print id,gs + gs2 = [pylmm_mapper[hab_mapper[g]] for g in gs] + # print id,gs2 # ns = np.genfromtxt(row[1:]) - G1.append(ns) # <--- slow + G1.append(gs2) # <--- slow G = np.array(G1) print G -- cgit v1.2.3 From 5489f84854c6197d4cf532238de8d90b3d6fcb5e Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Mar 2015 11:38:35 +0300 Subject: convertlmm.py: add SNP names --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 58 +++++++++++++++------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 89c09b1e..3b6b5d70 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -61,6 +61,8 @@ option_parser.add_option("-v", "--verbose", writer = None num_inds = None +snp_names = [] +ind_names = [] def msg(s): sys.stderr.write("INFO: ") @@ -77,29 +79,6 @@ def wrln(s): wr(s) wr("\n") -if options.kinship: - is_header = True - count = 0 - msg("Converting kinship "+options.kinship) - writer = None - if options.prefix: - writer = open(options.prefix+".kin","w") - for line in open(options.kinship,'r'): - count += 1 - if is_header: - size = len(line.split()) - wrln("# Kinship format version 1.0") - wrln("# Size="+str(size)) - for i in range(size): - wr("\t"+str(i+1)) - wr("\n") - is_header = False - wr(str(count)) - wr("\t") - wr("\t".join(line.split())) - wr("\n") - num_inds = count - msg(str(count)+" kinship lines written") if options.pheno: if not options.plink: @@ -135,19 +114,44 @@ if options.pheno: num_inds = count msg(str(count)+" pheno lines written") +if options.kinship: + is_header = True + count = 0 + msg("Converting kinship "+options.kinship) + writer = None + if options.prefix: + writer = open(options.prefix+".kin","w") + for line in open(options.kinship,'r'): + count += 1 + if is_header: + size = len(line.split()) + wrln("# Kinship format version 1.0") + wrln("# Size="+str(size)) + for i in range(size): + wr("\t"+str(i+1)) + wr("\n") + is_header = False + wr(str(count)) + wr("\t") + wr("\t".join(line.split())) + wr("\n") + num_inds = count + msg(str(count)+" kinship lines written") + if options.geno: msg("Converting geno "+options.geno+'.bed') if not options.plink: raise Exception("Use --plink switch") if not num_inds: raise Exception("Can not figure out the number of individuals, use --pheno or --kinship") - num_snps = plink.readbim(options.geno+'.bim') + bim_snps = plink.readbim(options.geno+'.bim') + num_snps = len(bim_snps) writer = None if options.prefix: writer = open(options.prefix+".geno","w") wrln("# Genotype format version 1.0") wrln("# Individuals = "+str(num_inds)) - wrln("# SNPs = "+str(len(num_snps))) + wrln("# SNPs = "+str(num_snps)) wrln("# Encoding = HAB") for i in range(num_inds): wr("\t"+str(i+1)) @@ -163,8 +167,8 @@ if options.geno: snps = plink.readbed(options.geno+'.bed',num_inds, ('A','H','B','-'), out) msg("Write transposed genotype matrix") - for g in range(len(num_snps)): - wr(str(g+1)+"\t") + for g in range(num_snps): + wr(bim_snps[g][1]+"\t") for i in range(num_inds): wr(m[g][i]) wr("\n") -- cgit v1.2.3 From 7d566c7bed8887dc18c1311ee84d22a8dc520889 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Mar 2015 12:00:17 +0300 Subject: Factoring out tsvreaders --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 58 ++--------------------- wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py | 73 +++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 53 deletions(-) create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index ce8e32be..f5ecddb9 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -18,12 +18,7 @@ # along with this program. If not, see . from optparse import OptionParser -import sys -import os -import numpy as np -# from lmm import LMM, run_other -import csv - +import tsvreader usage = """ python runlmm.py [options] command @@ -38,6 +33,7 @@ python runlmm.py [options] command try --help for more information """ + parser = OptionParser(usage=usage) # parser.add_option("-f", "--file", dest="input file", # help="In", metavar="FILE") @@ -61,54 +57,10 @@ cmd = args[0] print "Command: ",cmd if options.kinship: - K1 = [] - print options.kinship - with open(options.kinship,'r') as tsvin: - assert(tsvin.readline().strip() == "# Kinship format version 1.0") - tsvin.readline() - tsvin.readline() - tsv = csv.reader(tsvin, delimiter='\t') - for row in tsv: - ns = np.genfromtxt(row[1:]) - K1.append(ns) # <--- slow - K = np.array(K1) + k = tsvreader.kinship() if options.pheno: - Y1 = [] - print options.pheno - with open(options.pheno,'r') as tsvin: - assert(tsvin.readline().strip() == "# Phenotype format version 1.0") - tsvin.readline() - tsvin.readline() - tsvin.readline() - tsv = csv.reader(tsvin, delimiter='\t') - for row in tsv: - ns = np.genfromtxt(row[1:]) - Y1.append(ns) # <--- slow - Y = np.array(Y1) + y = tsvreader.pheno() if options.geno: - G1 = [] - hab_mapper = {'A':0,'H':1,'B':2,'-':3} - pylmm_mapper = [ 0.0, 0.5, 1.0, float('nan') ] - - print options.geno - with open(options.geno,'r') as tsvin: - assert(tsvin.readline().strip() == "# Genotype format version 1.0") - tsvin.readline() - tsvin.readline() - tsvin.readline() - tsvin.readline() - tsv = csv.reader(tsvin, delimiter='\t') - for row in tsv: - # print(row) - id = row[0] - gs = list(row[1]) - # print id,gs - gs2 = [pylmm_mapper[hab_mapper[g]] for g in gs] - # print id,gs2 - # ns = np.genfromtxt(row[1:]) - G1.append(gs2) # <--- slow - G = np.array(G1) - -print G + g = tsvreader.geno() diff --git a/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py new file mode 100644 index 00000000..7cea976e --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py @@ -0,0 +1,73 @@ +# Standard file readers +# +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import sys +import os +import numpy as np +import csv + +def kinship(): + K1 = [] + print options.kinship + with open(options.kinship,'r') as tsvin: + assert(tsvin.readline().strip() == "# Kinship format version 1.0") + tsvin.readline() + tsvin.readline() + tsv = csv.reader(tsvin, delimiter='\t') + for row in tsv: + ns = np.genfromtxt(row[1:]) + K1.append(ns) # <--- slow + K = np.array(K1) + +def pheno(): + Y1 = [] + print options.pheno + with open(options.pheno,'r') as tsvin: + assert(tsvin.readline().strip() == "# Phenotype format version 1.0") + tsvin.readline() + tsvin.readline() + tsvin.readline() + tsv = csv.reader(tsvin, delimiter='\t') + for row in tsv: + ns = np.genfromtxt(row[1:]) + Y1.append(ns) # <--- slow + Y = np.array(Y1) + +def geno(): + G1 = [] + hab_mapper = {'A':0,'H':1,'B':2,'-':3} + pylmm_mapper = [ 0.0, 0.5, 1.0, float('nan') ] + + print options.geno + with open(options.geno,'r') as tsvin: + assert(tsvin.readline().strip() == "# Genotype format version 1.0") + tsvin.readline() + tsvin.readline() + tsvin.readline() + tsvin.readline() + tsv = csv.reader(tsvin, delimiter='\t') + for row in tsv: + # print(row) + id = row[0] + gs = list(row[1]) + # print id,gs + gs2 = [pylmm_mapper[hab_mapper[g]] for g in gs] + # print id,gs2 + # ns = np.genfromtxt(row[1:]) + G1.append(gs2) # <--- slow + G = np.array(G1) + -- cgit v1.2.3 From 926225cf75991142e82d95b2505c59446fa1a8c4 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Mar 2015 12:05:50 +0300 Subject: Refactored tsvreader out --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 9 ++++++--- wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py | 21 ++++++++++++--------- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index f5ecddb9..bed2e39e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -57,10 +57,13 @@ cmd = args[0] print "Command: ",cmd if options.kinship: - k = tsvreader.kinship() + k = tsvreader.kinship(options.kinship) + print len(k) if options.pheno: - y = tsvreader.pheno() + y = tsvreader.pheno(options.pheno) + print len(y) if options.geno: - g = tsvreader.geno() + g = tsvreader.geno(options.geno) + print len(g) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py index 7cea976e..b4027fa3 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py @@ -20,10 +20,10 @@ import os import numpy as np import csv -def kinship(): +def kinship(fn): K1 = [] - print options.kinship - with open(options.kinship,'r') as tsvin: + print fn + with open(fn,'r') as tsvin: assert(tsvin.readline().strip() == "# Kinship format version 1.0") tsvin.readline() tsvin.readline() @@ -32,11 +32,12 @@ def kinship(): ns = np.genfromtxt(row[1:]) K1.append(ns) # <--- slow K = np.array(K1) + return K -def pheno(): +def pheno(fn): Y1 = [] - print options.pheno - with open(options.pheno,'r') as tsvin: + print fn + with open(fn,'r') as tsvin: assert(tsvin.readline().strip() == "# Phenotype format version 1.0") tsvin.readline() tsvin.readline() @@ -46,14 +47,15 @@ def pheno(): ns = np.genfromtxt(row[1:]) Y1.append(ns) # <--- slow Y = np.array(Y1) + return Y -def geno(): +def geno(fn): G1 = [] hab_mapper = {'A':0,'H':1,'B':2,'-':3} pylmm_mapper = [ 0.0, 0.5, 1.0, float('nan') ] - print options.geno - with open(options.geno,'r') as tsvin: + print fn + with open(fn,'r') as tsvin: assert(tsvin.readline().strip() == "# Genotype format version 1.0") tsvin.readline() tsvin.readline() @@ -70,4 +72,5 @@ def geno(): # ns = np.genfromtxt(row[1:]) G1.append(gs2) # <--- slow G = np.array(G1) + return G -- cgit v1.2.3 From 4902e2a8f4a4c9e08272dbd752eeb61f0e6b60f2 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Mar 2015 12:47:27 +0300 Subject: Populationg Redis from TSV data --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 49 ++++++++++++++++++++++---------- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 11 +++++-- 2 files changed, 42 insertions(+), 18 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 3cbab149..8c6e30e8 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -719,23 +719,11 @@ class LMM: pl.ylabel("Probability of data") pl.title(title) -# This is the main function used by Genenetwork2 (with environment) -def gn2_main(): - parser = argparse.ArgumentParser(description='Run pyLMM') - parser.add_argument('-k', '--key') - parser.add_argument('-s', '--species') - - opts = parser.parse_args() - - key = opts.key - species = opts.species - + +def gn2_redis(key,species): json_params = Redis.get(key) params = json.loads(json_params) - #print("params:", params) - - #print("kinship_matrix:", params['kinship_matrix']) tempdata = temp_data.TempData(params['temp_uuid']) if species == "human" : @@ -760,7 +748,38 @@ def gn2_main(): #Pushing json_results into a list where it is the only item because blpop needs a list Redis.rpush(results_key, json_results) Redis.expire(results_key, 60*60) - + +# This is the main function used by Genenetwork2 (with environment) +def gn2_main(): + parser = argparse.ArgumentParser(description='Run pyLMM') + parser.add_argument('-k', '--key') + parser.add_argument('-s', '--species') + + opts = parser.parse_args() + + key = opts.key + species = opts.species + + gn2_redis(key,species) + +def gn2_load_redis(key,species,kinship,pheno,geno): + print("Loading Redis from parsed data") + params = dict(pheno_vector = pheno.tolist(), + genotype_matrix = geno.tolist(), + restricted_max_likelihood = True, + refit = False, + temp_uuid = "testrun_temp_uuid", + + # meta data + timestamp = datetime.datetime.now().isoformat(), + ) + + json_params = json.dumps(params) + Redis.set(key, json_params) + Redis.expire(key, 60*60) + + gn2_redis(key,species) + if __name__ == '__main__': print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!") if has_gn2: diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index bed2e39e..525c43fc 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -19,6 +19,7 @@ from optparse import OptionParser import tsvreader +from lmm import gn2_load_redis usage = """ python runlmm.py [options] command @@ -29,6 +30,7 @@ python runlmm.py [options] command Current commands are: parse : only parse input files + redis : use Redis to call into GN2 try --help for more information """ @@ -58,12 +60,15 @@ print "Command: ",cmd if options.kinship: k = tsvreader.kinship(options.kinship) - print len(k) + print k.shape if options.pheno: y = tsvreader.pheno(options.pheno) - print len(y) + print y.shape if options.geno: g = tsvreader.geno(options.geno) - print len(g) + print g.shape + +if cmd == 'redis': + gn2_load_redis('testrun','other',k,y,g.T) -- cgit v1.2.3 From d6a1f63d08645ff680dad9880030f58881a127d9 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Mar 2015 15:15:48 +0300 Subject: Missing genotypes and normalize --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 9 ++++++++- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 8c6e30e8..00bbf144 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -726,6 +726,11 @@ def gn2_redis(key,species): params = json.loads(json_params) tempdata = temp_data.TempData(params['temp_uuid']) + + print('kinship', np.array(params['kinship_matrix'][0:10][0:10])) + print('pheno', params['pheno_vector'][0:10]) + print('geno', params['genotype_matrix'][0:10][0:10]) + if species == "human" : ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']), covariate_matrix = np.array(params['covariate_matrix']), @@ -748,6 +753,7 @@ def gn2_redis(key,species): #Pushing json_results into a list where it is the only item because blpop needs a list Redis.rpush(results_key, json_results) Redis.expire(results_key, 60*60) + return ps, ts # This is the main function used by Genenetwork2 (with environment) def gn2_main(): @@ -766,6 +772,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno): print("Loading Redis from parsed data") params = dict(pheno_vector = pheno.tolist(), genotype_matrix = geno.tolist(), + kinship_matrix= kinship.tolist(), restricted_max_likelihood = True, refit = False, temp_uuid = "testrun_temp_uuid", @@ -778,7 +785,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno): Redis.set(key, json_params) Redis.expire(key, 60*60) - gn2_redis(key,species) + return gn2_redis(key,species) if __name__ == '__main__': print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 525c43fc..6db1bdbc 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -18,7 +18,9 @@ # along with this program. If not, see . from optparse import OptionParser +import sys import tsvreader +import numpy as np from lmm import gn2_load_redis usage = """ @@ -70,5 +72,26 @@ if options.geno: g = tsvreader.geno(options.geno) print g.shape +def normalizeGenotype(G): + x = True - np.isnan(G) + m = G[x].mean() + s = np.sqrt(G[x].var()) + G[np.isnan(G)] = m + if s == 0: G = G - m + else: G = (G - m) / s + return G + +gT = normalizeGenotype(g.T) + +# Remove individuals with missing phenotypes +v = np.isnan(y) +keep = True - v +if v.sum(): + if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) + y = y[keep] + gT = gT[keep,:] + k = k[keep,:][:,keep] + if cmd == 'redis': - gn2_load_redis('testrun','other',k,y,g.T) + ps, ts = gn2_load_redis('testrun','other',k,y,gT) + print ps[0:10],ps[-10:] -- cgit v1.2.3 From f116f59923ebe3fc52c27fac8fd571a70e323a68 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Mar 2015 15:25:00 +0300 Subject: lmm.py: debugging output --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 00bbf144..c2271611 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -727,9 +727,9 @@ def gn2_redis(key,species): tempdata = temp_data.TempData(params['temp_uuid']) - print('kinship', np.array(params['kinship_matrix'][0:10][0:10])) - print('pheno', params['pheno_vector'][0:10]) - print('geno', params['genotype_matrix'][0:10][0:10]) + print('kinship', np.array(params['kinship_matrix'])) + print('pheno', np.array(params['pheno_vector'])) + print('geno', np.array(params['genotype_matrix'])) if species == "human" : ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']), -- cgit v1.2.3 From 6c165a4b9ec848eae0baeb4e4e3382bde4e9c1be Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Mar 2015 17:18:26 +0300 Subject: Normalisation --- wqflask/wqflask/my_pylmm/pyLMM/input.py | 4 +++- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 5 +++-- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 38 +++++++++++++++++++++++--------- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py index fdf02e42..f7b556a5 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/input.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py @@ -133,13 +133,15 @@ class plink: return G def normalizeGenotype(self,G): + # print "Before",G + # print G.shape x = True - np.isnan(G) m = G[x].mean() s = np.sqrt(G[x].var()) G[np.isnan(G)] = m if s == 0: G = G - m else: G = (G - m) / s - + # print "After",G return G def getPhenos(self,phenoFile=None): diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index c2271611..ab19bf08 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -333,8 +333,8 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False): """ n = genotype_matrix.shape[0] m = genotype_matrix.shape[1] - print("genotype matrix n is:", n) - print("genotype matrix m is:", m) + print("genotype 2D matrix n (inds) is:", n) + print("genotype 2D matrix m (snps) is:", m) keep = [] for counter in range(m): if is_testing and counter>8: @@ -730,6 +730,7 @@ def gn2_redis(key,species): print('kinship', np.array(params['kinship_matrix'])) print('pheno', np.array(params['pheno_vector'])) print('geno', np.array(params['genotype_matrix'])) + # sys.exit(1) if species == "human" : ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']), diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 6db1bdbc..907a6835 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -73,15 +73,33 @@ if options.geno: print g.shape def normalizeGenotype(G): - x = True - np.isnan(G) - m = G[x].mean() - s = np.sqrt(G[x].var()) - G[np.isnan(G)] = m - if s == 0: G = G - m - else: G = (G - m) / s + # Run for every SNP list (num individuals) + x = True - np.isnan(G) # Matrix of True/False + m = G[x].mean() # Global mean value + print m + s = np.sqrt(G[x].var()) # Global stddev + print s + G[np.isnan(G)] = m # Plug-in mean values for missing + if s == 0: + G = G - m # Subtract the mean + else: + G = (G - m) / s # Normalize the deviation return G -gT = normalizeGenotype(g.T) +# g = g.reshape((1, -1))[0] +print("Before",g) +gn = [] +for snp in g: + gn.append( normalizeGenotype(snp) ) + +gn = np.array(gn) +print("After1",gn) +gnT = gn.T +print("After",gnT) +# G = gnT +G = gnT +print "G shape",G.shape +# assert(G[0,0]==-2.25726341) # Remove individuals with missing phenotypes v = np.isnan(y) @@ -89,9 +107,9 @@ keep = True - v if v.sum(): if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) y = y[keep] - gT = gT[keep,:] + G = G[keep,:] k = k[keep,:][:,keep] if cmd == 'redis': - ps, ts = gn2_load_redis('testrun','other',k,y,gT) - print ps[0:10],ps[-10:] + ps, ts = gn2_load_redis('testrun','other',k,y,G) + print np.array(ps) -- cgit v1.2.3 From a5d0d42f6977c4473c28f12999a62ad360750041 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 11 Mar 2015 12:07:54 +0300 Subject: & Added testing option, now the results match --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 16 ++++++++++------ wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index ab19bf08..5ffb9a4c 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -267,6 +267,7 @@ def run_other(pheno_vector, """ print("In run_other") + print("REML=",restricted_max_likelihood," REFIT=",refit, "TESTING=",is_testing) with Bench("Calculate Kinship"): kinship_matrix = calculate_kinship(genotype_matrix, tempdata, is_testing) @@ -280,6 +281,7 @@ def run_other(pheno_vector, lmm_ob.fit() print("genotype_matrix: ", genotype_matrix.shape) + print(genotype_matrix) with Bench("Doing GWAS"): t_stats, p_values = GWAS(pheno_vector, @@ -720,7 +722,7 @@ class LMM: pl.title(title) -def gn2_redis(key,species): +def gn2_redis(key,species,is_testing=False): json_params = Redis.get(key) params = json.loads(json_params) @@ -729,7 +731,8 @@ def gn2_redis(key,species): print('kinship', np.array(params['kinship_matrix'])) print('pheno', np.array(params['pheno_vector'])) - print('geno', np.array(params['genotype_matrix'])) + geno = np.array(params['genotype_matrix']) + print('geno', geno.shape, geno) # sys.exit(1) if species == "human" : @@ -741,10 +744,11 @@ def gn2_redis(key,species): tempdata = tempdata) else: ps, ts = run_other(pheno_vector = np.array(params['pheno_vector']), - genotype_matrix = np.array(params['genotype_matrix']), + genotype_matrix = geno, restricted_max_likelihood = params['restricted_max_likelihood'], refit = params['refit'], - tempdata = tempdata, is_testing=False) + tempdata = tempdata, + is_testing=is_testing) results_key = "pylmm:results:" + params['temp_uuid'] @@ -769,7 +773,7 @@ def gn2_main(): gn2_redis(key,species) -def gn2_load_redis(key,species,kinship,pheno,geno): +def gn2_load_redis(key,species,kinship,pheno,geno,is_testing): print("Loading Redis from parsed data") params = dict(pheno_vector = pheno.tolist(), genotype_matrix = geno.tolist(), @@ -786,7 +790,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno): Redis.set(key, json_params) Redis.expire(key, 60*60) - return gn2_redis(key,species) + return gn2_redis(key,species,is_testing) if __name__ == '__main__': print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 907a6835..738268be 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -50,6 +50,9 @@ parser.add_option("--geno",dest="geno", parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") +parser.add_option("--test", + action="store_true", dest="testing", default=False, + help="Testing mode") (options, args) = parser.parse_args() @@ -76,9 +79,9 @@ def normalizeGenotype(G): # Run for every SNP list (num individuals) x = True - np.isnan(G) # Matrix of True/False m = G[x].mean() # Global mean value - print m + # print m s = np.sqrt(G[x].var()) # Global stddev - print s + # print s G[np.isnan(G)] = m # Plug-in mean values for missing if s == 0: G = G - m # Subtract the mean @@ -92,6 +95,7 @@ gn = [] for snp in g: gn.append( normalizeGenotype(snp) ) +gn = g gn = np.array(gn) print("After1",gn) gnT = gn.T @@ -99,6 +103,7 @@ print("After",gnT) # G = gnT G = gnT print "G shape",G.shape +# sys.exit(1) # assert(G[0,0]==-2.25726341) # Remove individuals with missing phenotypes @@ -111,5 +116,5 @@ if v.sum(): k = k[keep,:][:,keep] if cmd == 'redis': - ps, ts = gn2_load_redis('testrun','other',k,y,G) + ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) print np.array(ps) -- cgit v1.2.3 From 084c8fe8c1c3255d59bd1da59b7bebb5c8811ba0 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 11 Mar 2015 12:11:27 +0300 Subject: lmm.py: change message on running from CLI --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 5ffb9a4c..58d7593d 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -797,7 +797,7 @@ if __name__ == '__main__': if has_gn2: gn2_main() else: - print("Run from pylmmGWAS.py instead") + print("Run from runlmm.py instead") -- cgit v1.2.3