From 61b8ae229687612b4c9d4a0b089a9ef0ef47ff2c Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 20 Feb 2015 14:40:55 +0300 Subject: Moved pylmm scripts that are not used by GN2 --- wqflask/wqflask/my_pylmm/example.py | 58 ---- wqflask/wqflask/my_pylmm/pylmmGWAS.py | 346 ----------------------- wqflask/wqflask/my_pylmm/pylmmKinship.py | 186 ------------ wqflask/wqflask/my_pylmm/run_pylmm.py | 77 ----- wqflask/wqflask/my_pylmm/scripts/example.py | 58 ++++ wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py | 346 +++++++++++++++++++++++ wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py | 186 ++++++++++++ wqflask/wqflask/my_pylmm/scripts/run_pylmm.py | 77 +++++ 8 files changed, 667 insertions(+), 667 deletions(-) delete mode 100755 wqflask/wqflask/my_pylmm/example.py delete mode 100755 wqflask/wqflask/my_pylmm/pylmmGWAS.py delete mode 100755 wqflask/wqflask/my_pylmm/pylmmKinship.py delete mode 100755 wqflask/wqflask/my_pylmm/run_pylmm.py create mode 100755 wqflask/wqflask/my_pylmm/scripts/example.py create mode 100755 wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py create mode 100755 wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py create mode 100755 wqflask/wqflask/my_pylmm/scripts/run_pylmm.py diff --git a/wqflask/wqflask/my_pylmm/example.py b/wqflask/wqflask/my_pylmm/example.py deleted file mode 100755 index 8b30debd..00000000 --- a/wqflask/wqflask/my_pylmm/example.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/python - -from __future__ import absolute_import, print_function, division - -import sys -import time - -import numpy as np -from pyLMM import lmm - -from pprint import pformat as pf - - -Y = np.genfromtxt('data/mdp.exprs.1.new') -print("exprs is:", pf(Y.shape)) - -# Loading npdump and first 1000 snps for speed -#K = np.load('data/hmdp.liver.K.npdump') -#snps = np.load('data/hmdp.liver.snps.1000.npdump').T - -# These three lines will load all SNPs (from npdump or from txt) and -# calculate the kinship -snps = np.genfromtxt('/home/zas1024/gene/web/new_genotypers/mdp.snps.1000.new').T -print("snps is:", pf(snps.shape)) -#snps = snps[~np.isnan(snps).all(axis=1)] -#print ("snps is now:", pf(snps)) -np.savetxt("/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.snps.trimmed", snps, fmt='%s', delimiter=' ') -#snps = np.load('data/hmdp.liver.snps.npdump').T -K = lmm.calculateKinship(snps) -#print("K is:", pf(K)) -#print("Y is:", pf(Y.shape)) - -# Instantiate a LMM object for the phentoype Y and fit the null model -L = lmm.LMM(Y,K) -L.fit() - -# Manually calculate the association at one SNP -X = snps[:,0] -X[np.isnan(X)] = X[True - np.isnan(X)].mean() # Fill missing with MAF -X = X.reshape(len(X),1) -if X.var() == 0: ts,ps = (np.nan,np.nan) -else: ts,ps = L.association(X) - -# If I want to refit the variance component -L.fit(X=X) -ts,ps = L.association(X) - -# If I want to do a genome-wide scan over the 1000 SNPs. -# This call will use REML (REML = False means use ML). -# It will also refit the variance components for each SNP. -# Setting refit = False will cause the program to fit the model once -# and hold those variance component estimates for each SNP. -begin = time.time() -TS,PS = lmm.GWAS(Y,snps,K,REML=True,refit=False) -print("TS is:", pf(TS)) -print("PS is:", pf(PS)) -end = time.time() -sys.stderr.write("Total time for 1000 SNPs: %0.3f\n" % (end- begin)) \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/pylmmGWAS.py deleted file mode 100755 index 74b58dd6..00000000 --- a/wqflask/wqflask/my_pylmm/pylmmGWAS.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/python - -# pylmm is a python-based linear mixed-model solver with applications to GWAS - -# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) - -#The program is free for academic use. Please contact Nick Furlotte -# if you are interested in using the software for -#commercial purposes. - -#The software must not be modified and distributed without prior -#permission of the author. - -#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pdb -import time -import sys - -def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") - -def formatResult(id,beta,betaSD,ts,ps): - return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n" - -def outputResult(id,beta,betaSD,ts,ps): - out.write(formatResult(id,beta,betaSD,ts,ps)) - -from optparse import OptionParser,OptionGroup -usage = """usage: %prog [options] --kfile kinshipFile --[tfile | bfile] plinkFileBase outfileBase - -This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value. -The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values. - -Basic usage: - - python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile - - """ -parser = OptionParser(usage=usage) - -basicGroup = OptionGroup(parser, "Basic Options") -advancedGroup = OptionGroup(parser, "Advanced Options") -experimentalGroup = OptionGroup(parser, "Experimental Options") - -#basicGroup.add_option("--pfile", dest="pfile", -# help="The base for a PLINK ped file") -basicGroup.add_option("--tfile", dest="tfile", - help="The base for a PLINK tped file") -basicGroup.add_option("--bfile", dest="bfile", - help="The base for a PLINK binary bed file") -basicGroup.add_option("--phenofile", dest="phenoFile", default=None, - help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. This file should be in plink format. ") - -# EMMA Options -basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, - help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the columns and snps on the rows.") -basicGroup.add_option("--emmaPHENO", dest="emmaPheno", default=None, - help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each phenotype as one row.") -basicGroup.add_option("--emmaCOV", dest="emmaCov", default=None, - help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each covariate as one row.") - -basicGroup.add_option("--kfile", dest="kfile", - help="The location of a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") -basicGroup.add_option("--covfile", dest="covfile", - help="The location of a covariate file file. This is a plink formatted covariate file.") -basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0) - - - -advancedGroup.add_option("--removeMissingGenotypes", - action="store_false", dest="normalizeGenotype", default=True, - help="By default the program replaces missing genotypes with the minor allele frequency. This option overrides that behavior making the program remove missing individuals. NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.") -advancedGroup.add_option("--refit", - action="store_true", dest="refit", default=False, - help="Refit the variance components at each SNP (default is to lock in the variance components under the null).") - -advancedGroup.add_option("--REML", - action="store_true", dest="REML", default=False, - help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).") -#advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") -advancedGroup.add_option("--eigen", dest="eigenfile", - help="The location of the precomputed eigendecomposition for the kinship file. These can be computed with pylmmKinship.py.") -advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true", - help="This option only applies when --cofile is used. When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.") - -basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") - -advancedGroup.add_option("-v", "--verbose", - action="store_true", dest="verbose", default=False, - help="Print extra info") - -# Experimental Group Options -experimentalGroup.add_option("--kfile2", dest="kfile2", - help="The location of a second kinship file. This file has the same format as the first kinship. This might be used if you want to correct for another form of confounding.") - -parser.add_option_group(basicGroup) -parser.add_option_group(advancedGroup) -parser.add_option_group(experimentalGroup) - -(options, args) = parser.parse_args() - -import sys -import os -import numpy as np -from scipy import linalg -from pylmm.lmm import LMM -from pylmm import input - -import multiprocessing as mp # Multiprocessing is part of the Python stdlib -import Queue - -if len(args) != 1: - parser.print_help() - sys.exit() - -outFile = args[0] - -if not options.tfile and not options.bfile and not options.emmaFile: -#if not options.pfile and not options.tfile and not options.bfile: - parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an EMMA formatted file (--emmaSNP).") -if not options.kfile: - parser.error("Please provide a pre-computed kinship file") - -numThreads = None -if options.numThreads: - numThreads = int(options.numThreads) - -# READING PLINK input -if options.verbose: sys.stderr.write("Reading SNP input...\n") -if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -#elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -elif options.emmaFile: IN = input.plink(options.emmaFile,type='emma', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -else: parser.error("You must provide at least one PLINK input file base") - -if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos') and not os.path.isfile(options.emmaPheno): - parser.error("No .pheno file exist for %s. Please provide a phenotype file using the --phenofile or --emmaPHENO argument." % (options.phenoFile or IN.fbase + '.phenos')) - -# Read the emma phenotype file if provided. -# Format should be rows are phenotypes and columns are individuals. -if options.emmaPheno: - f = open(options.emmaPheno,'r') - P = [] - for line in f: - v = line.strip().split() - p = [] - for x in v: - try: - p.append(float(x)) - except: p.append(np.nan) - P.append(p) - f.close() - IN.phenos = np.array(P).T - -# READING Covariate File -if options.covfile: - if options.verbose: - sys.stderr.write("Reading covariate file...\n") - P = IN.getCovariates(options.covfile) - if options.noMean: - X0 = P - else: - X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) -elif options.emmaCov: - if options.verbose: - sys.stderr.write("Reading covariate file...\n") - P = IN.getCovariatesEMMA(options.emmaCov) - if options.noMean: - X0 = P - else: - X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) -else: - X0 = np.ones((IN.phenos.shape[0],1)) - -if np.isnan(X0).sum(): - parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.") - -# READING Kinship - major bottleneck for large datasets -if options.verbose: sys.stderr.write("Reading kinship...\n") -begin = time.time() -# This method seems to be the fastest and works if you already know the size of the matrix -if options.kfile[-3:] == '.gz': - import gzip - f = gzip.open(options.kfile,'r') - F = f.read() # might exhaust mem if the file is huge - K = np.fromstring(F,sep=' ') # Assume that space separated - f.close() -else: - K = np.fromfile(open(options.kfile,'r'),sep=" ") -K.resize((len(IN.indivs),len(IN.indivs))) -end = time.time() -# Other slower ways -#K = np.loadtxt(options.kfile) -#K = np.genfromtxt(options.kfile) -if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin)) - -if options.kfile2: - if options.verbose: sys.stderr.write("Reading second kinship...\n") - begin = time.time() - # This method seems to be the fastest and works if you already know the size of the matrix - if options.kfile2[-3:] == '.gz': - import gzip - f = gzip.open(options.kfile2,'r') - F = f.read() # might exhaust mem if the file is huge - K2 = np.fromstring(F,sep=' ') # Assume that space separated - f.close() - else: K2 = np.fromfile(open(options.kfile2,'r'),sep=" ") - K2.resize((len(IN.indivs),len(IN.indivs))) - end = time.time() - if options.verbose: sys.stderr.write("Read the %d x %d second kinship matrix in %0.3fs \n" % (K2.shape[0],K2.shape[1],end-begin)) - -# PROCESS the phenotype data -- Remove missing phenotype values -# Keep will now index into the "full" data to select what we keep (either everything or a subset of non missing data -Y = IN.phenos[:,options.pheno] -v = np.isnan(Y) -keep = True - v -if v.sum(): - if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) - Y = Y[keep] - X0 = X0[keep,:] - K = K[keep,:][:,keep] - if options.kfile2: K2 = K2[keep,:][:,keep] - Kva = [] - Kve = [] - -# Only load the decomposition if we did not remove individuals. -# Otherwise it would not be correct and we would have to compute it again. -if not v.sum() and options.eigenfile: - if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n") - Kva = np.load(options.eigenfile + ".Kva") - Kve = np.load(options.eigenfile + ".Kve") -else: - Kva = [] - Kve = [] - -# CREATE LMM object for association -n = K.shape[0] -if not options.kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose) -else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=options.verbose,K2=K2) - -# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here -if not options.refit: - if options.verbose: sys.stderr.write("Computing fit for null model\n") - L.fit() - if options.verbose and not options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma)) - if options.verbose and options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f, w=%0.3f\n" % (L.optH,L.optSigma,L.optW)) - -def compute_snp(collect): - snp = collect[0] - id = collect[1] - # result = [] - # Check SNPs for missing values - x = snp[keep].reshape((n,1)) # all the SNPs - v = np.isnan(x).reshape((-1,)) - if v.sum(): - keeps = True - v - xs = x[keeps,:] - if keeps.sum() <= 1 or xs.var() <= 1e-6: - # PS.append(np.nan) - # TS.append(np.nan) - # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) - # continue - return formatResult(id,np.nan,np.nan,np.nan,np.nan) - - # Its ok to center the genotype - I used options.normalizeGenotype to - # force the removal of missing genotypes as opposed to replacing them with MAF. - if not options.normalizeGenotype: - xs = (xs - xs.mean()) / np.sqrt(xs.var()) - Ys = Y[keeps] - X0s = X0[keeps,:] - Ks = K[keeps,:][:,keeps] - if options.kfile2: - K2s = K2[keeps,:][:,keeps] - Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=options.verbose,K2=K2s) - else: - Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose) - if options.refit: - Ls.fit(X=xs,REML=options.REML) - else: - #try: - Ls.fit(REML=options.REML) - #except: pdb.set_trace() - ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True) - else: - if x.var() == 0: - # PS.append(np.nan) - # TS.append(np.nan) - # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values - return formatResult(id,np.nan,np.nan,np.nan,np.nan) - # continue - - if options.refit: - L.fit(X=x,REML=options.REML) - # This is where it happens - ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True) - - return formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps) - # PS.append(ps) - # TS.append(ts) - # return len(result) - # compute.q.put(result) - # return None - -def f_init(q): - compute_snp.q = q - -# Set up the pool -# mp.set_start_method('spawn') -q = mp.Queue() -p = mp.Pool(numThreads, f_init, [q]) -collect = [] - -# Buffers for pvalues and t-stats -# PS = [] -# TS = [] -count = 0 -out = open(outFile,'w') -printOutHead() - -for snp_id in IN: - count += 1 - if count % 1000 == 0: - if options.verbose: - sys.stderr.write("At SNP %d\n" % count) - # if count>8000 : - # break # for testing only - if count % 100 == 0: - for line in p.imap(compute_snp,collect): - out.write(line) - collect = [] - - collect.append(snp_id) -for line in p.imap(compute_snp,collect): - out.write(line) - - diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/pylmmKinship.py deleted file mode 100755 index c9a73ece..00000000 --- a/wqflask/wqflask/my_pylmm/pylmmKinship.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/python - -# pylmm is a python-based linear mixed-model solver with applications to GWAS - -# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) - -#The program is free for academic use. Please contact Nick Furlotte -# if you are interested in using the software for -#commercial purposes. - -#The software must not be modified and distributed without prior -#permission of the author. - -#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys -import pdb - -from optparse import OptionParser,OptionGroup -usage = """usage: %prog [options] --[tfile | bfile] plinkFileBase outfile - -""" - -parser = OptionParser(usage=usage) - -basicGroup = OptionGroup(parser, "Basic Options") -#advancedGroup = OptionGroup(parser, "Advanced Options") - -#basicGroup.add_option("--pfile", dest="pfile", -# help="The base for a PLINK ped file") -basicGroup.add_option("--tfile", dest="tfile", - help="The base for a PLINK tped file") -basicGroup.add_option("--bfile", dest="bfile", - help="The base for a PLINK binary ped file") -basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, - help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the rows and snps on the columns.") -basicGroup.add_option("--emmaNumSNPs", dest="numSNPs", type="int", default=0, - help="When providing the emmaSNP file you need to specify how many snps are in the file") - -basicGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") -basicGroup.add_option("-n", default=1000,dest="computeSize", type="int", help="The maximum number of SNPs to read into memory at once (default 1000). This is important when there is a large number of SNPs, because memory could be an issue.") -basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") - -basicGroup.add_option("-v", "--verbose", - action="store_true", dest="verbose", default=False, - help="Print extra info") - -parser.add_option_group(basicGroup) -#parser.add_option_group(advancedGroup) - -(options, args) = parser.parse_args() -if len(args) != 1: - parser.print_help() - sys.exit() - -outFile = args[0] - -import sys -import os -import numpy as np -from scipy import linalg -from pylmm.lmm import calculateKinship -from pylmm import input -import multiprocessing as mp # Multiprocessing is part of the Python stdlib -import Queue - -if not options.tfile and not options.bfile and not options.emmaFile: - parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") - -numThreads = None -if options.numThreads: - numThreads = int(options.numThreads) - -if options.verbose: sys.stderr.write("Reading PLINK input...\n") -if options.bfile: IN = input.plink(options.bfile,type='b') -elif options.tfile: IN = input.plink(options.tfile,type='t') -#elif options.pfile: IN = input.plink(options.pfile,type='p') -elif options.emmaFile: - if not options.numSNPs: parser.error("You must provide the number of SNPs when specifying an emma formatted file.") - IN = input.plink(options.emmaFile,type='emma') -else: parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") - -def compute_W(job): - """ - Read 1000 SNPs at a time into matrix and return the result - """ - n = len(IN.indivs) - m = options.computeSize - W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) - for j in range(0,options.computeSize): - row = job*m + j - if row >= IN.numSNPs: - W = W[:,range(0,j)] - break - snp,id = IN.next() - if snp.var() == 0: - continue - W[:,j] = snp # set row to list of SNPs - return W - -def compute_dgemm(job,W): - """ - Compute Kinship(W)*j - - For every set of SNPs dgemm is used to multiply matrices T(W)*W - """ - res = None - try: - res = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) - except AttributeError: - res = np.dot(W,W.T) - compute_dgemm.q.put([job,res]) - return job - -def f_init(q): - compute_dgemm.q = q - -n = len(IN.indivs) -# m = options.computeSize -# jobsize=m - -IN.getSNPIterator() -# Annoying hack to get around the fact that it is expensive to determine the number of SNPs in an emma file -if options.emmaFile: IN.numSNPs = options.numSNPs -# i = 0 - -# mp.set_start_method('spawn') -q = mp.Queue() -p = mp.Pool(numThreads, f_init, [q]) -iterations = IN.numSNPs/options.computeSize+1 -# jobs = range(0,8) # range(0,iterations) - -results = [] - -K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals - -completed = 0 - -# for job in range(8): -for job in range(iterations): - if options.verbose: - sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) - W = compute_W(job) - results.append(p.apply_async(compute_dgemm, (job,W))) - # Do we have a result? - try: - j,x = q.get_nowait() - if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") - K_j = x - # print j,K_j[:,0] - K = K + K_j - completed += 1 - except Queue.Empty: - pass - -for job in range(len(results)-completed): - j,x = q.get() - if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") - K_j = x - # print j,K_j[:,0] - K = K + K_j - -K = K / float(IN.numSNPs) -if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) -np.savetxt(outFile,K) - -if options.saveEig: - if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") - Kva,Kve = linalg.eigh(K) - if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) - np.savetxt(outFile+".kva",Kva) - np.savetxt(outFile+".kve",Kve) - - - - diff --git a/wqflask/wqflask/my_pylmm/run_pylmm.py b/wqflask/wqflask/my_pylmm/run_pylmm.py deleted file mode 100755 index 0c96d986..00000000 --- a/wqflask/wqflask/my_pylmm/run_pylmm.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import absolute_import, print_function, division - -from base import data_set -from base.species import TheSpecies - - def run(dataset_name, vals, temp_uuid): - """Generates p-values for each marker""" - - tempdata = temp_data.TempData(temp_uuid) - - dataset = data_set.create_dataset(dataset_name) - species = TheSpecies(dataset=dataset) - - samples = [] # Want only ones with values - vals = vals - - for sample in dataset.group.samplelist: - samples.append(str(sample)) - - gen_data(dataset, vals, tempdata) - - - def gen_data(dataset, vals) - dataset.group.get_markers() - - pheno_vector = np.array([val == "x" and np.nan or float(val) for val in vals]) - - if dataset.group.species == "human": - p_values, t_stats = gen_human_results(pheno_vector, tempdata) - else: - genotype_data = [marker['genotypes'] for marker in dataset.group.markers.markers] - - no_val_samples = self.identify_empty_samples() - trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) - - genotype_matrix = np.array(trimmed_genotype_data).T - - #print("pheno_vector: ", pf(pheno_vector)) - #print("genotype_matrix: ", pf(genotype_matrix)) - #print("genotype_matrix.shape: ", pf(genotype_matrix.shape)) - - t_stats, p_values = lmm.run( - pheno_vector, - genotype_matrix, - restricted_max_likelihood=True, - refit=False, - temp_data=tempdata - ) - #print("p_values:", p_values) - - self.dataset.group.markers.add_pvalues(p_values) - return self.dataset.group.markers.markers - - - def gen_human_results(self, pheno_vector, tempdata): - file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name) - - plink_input = input.plink(file_base, type='b') - input_file_name = os.path.join(webqtlConfig.SNP_PATH, self.dataset.group.name + ".snps.gz") - - pheno_vector = pheno_vector.reshape((len(pheno_vector), 1)) - covariate_matrix = np.ones((pheno_vector.shape[0],1)) - kinship_matrix = np.fromfile(open(file_base + '.kin','r'),sep=" ") - kinship_matrix.resize((len(plink_input.indivs),len(plink_input.indivs))) - - p_values, t_stats = lmm.run_human( - pheno_vector, - covariate_matrix, - input_file_name, - kinship_matrix, - loading_progress=tempdata - ) - - return p_values, t_stats - -if __name__ == '__main__': - run(dataset_name, vals, temp_uuid) \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/scripts/example.py b/wqflask/wqflask/my_pylmm/scripts/example.py new file mode 100755 index 00000000..8b30debd --- /dev/null +++ b/wqflask/wqflask/my_pylmm/scripts/example.py @@ -0,0 +1,58 @@ +#!/usr/bin/python + +from __future__ import absolute_import, print_function, division + +import sys +import time + +import numpy as np +from pyLMM import lmm + +from pprint import pformat as pf + + +Y = np.genfromtxt('data/mdp.exprs.1.new') +print("exprs is:", pf(Y.shape)) + +# Loading npdump and first 1000 snps for speed +#K = np.load('data/hmdp.liver.K.npdump') +#snps = np.load('data/hmdp.liver.snps.1000.npdump').T + +# These three lines will load all SNPs (from npdump or from txt) and +# calculate the kinship +snps = np.genfromtxt('/home/zas1024/gene/web/new_genotypers/mdp.snps.1000.new').T +print("snps is:", pf(snps.shape)) +#snps = snps[~np.isnan(snps).all(axis=1)] +#print ("snps is now:", pf(snps)) +np.savetxt("/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.snps.trimmed", snps, fmt='%s', delimiter=' ') +#snps = np.load('data/hmdp.liver.snps.npdump').T +K = lmm.calculateKinship(snps) +#print("K is:", pf(K)) +#print("Y is:", pf(Y.shape)) + +# Instantiate a LMM object for the phentoype Y and fit the null model +L = lmm.LMM(Y,K) +L.fit() + +# Manually calculate the association at one SNP +X = snps[:,0] +X[np.isnan(X)] = X[True - np.isnan(X)].mean() # Fill missing with MAF +X = X.reshape(len(X),1) +if X.var() == 0: ts,ps = (np.nan,np.nan) +else: ts,ps = L.association(X) + +# If I want to refit the variance component +L.fit(X=X) +ts,ps = L.association(X) + +# If I want to do a genome-wide scan over the 1000 SNPs. +# This call will use REML (REML = False means use ML). +# It will also refit the variance components for each SNP. +# Setting refit = False will cause the program to fit the model once +# and hold those variance component estimates for each SNP. +begin = time.time() +TS,PS = lmm.GWAS(Y,snps,K,REML=True,refit=False) +print("TS is:", pf(TS)) +print("PS is:", pf(PS)) +end = time.time() +sys.stderr.write("Total time for 1000 SNPs: %0.3f\n" % (end- begin)) \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py new file mode 100755 index 00000000..74b58dd6 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py @@ -0,0 +1,346 @@ +#!/usr/bin/python + +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) + +#The program is free for academic use. Please contact Nick Furlotte +# if you are interested in using the software for +#commercial purposes. + +#The software must not be modified and distributed without prior +#permission of the author. + +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pdb +import time +import sys + +def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") + +def formatResult(id,beta,betaSD,ts,ps): + return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n" + +def outputResult(id,beta,betaSD,ts,ps): + out.write(formatResult(id,beta,betaSD,ts,ps)) + +from optparse import OptionParser,OptionGroup +usage = """usage: %prog [options] --kfile kinshipFile --[tfile | bfile] plinkFileBase outfileBase + +This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value. +The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values. + +Basic usage: + + python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile + + """ +parser = OptionParser(usage=usage) + +basicGroup = OptionGroup(parser, "Basic Options") +advancedGroup = OptionGroup(parser, "Advanced Options") +experimentalGroup = OptionGroup(parser, "Experimental Options") + +#basicGroup.add_option("--pfile", dest="pfile", +# help="The base for a PLINK ped file") +basicGroup.add_option("--tfile", dest="tfile", + help="The base for a PLINK tped file") +basicGroup.add_option("--bfile", dest="bfile", + help="The base for a PLINK binary bed file") +basicGroup.add_option("--phenofile", dest="phenoFile", default=None, + help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. This file should be in plink format. ") + +# EMMA Options +basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the columns and snps on the rows.") +basicGroup.add_option("--emmaPHENO", dest="emmaPheno", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each phenotype as one row.") +basicGroup.add_option("--emmaCOV", dest="emmaCov", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each covariate as one row.") + +basicGroup.add_option("--kfile", dest="kfile", + help="The location of a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") +basicGroup.add_option("--covfile", dest="covfile", + help="The location of a covariate file file. This is a plink formatted covariate file.") +basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0) + + + +advancedGroup.add_option("--removeMissingGenotypes", + action="store_false", dest="normalizeGenotype", default=True, + help="By default the program replaces missing genotypes with the minor allele frequency. This option overrides that behavior making the program remove missing individuals. NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.") +advancedGroup.add_option("--refit", + action="store_true", dest="refit", default=False, + help="Refit the variance components at each SNP (default is to lock in the variance components under the null).") + +advancedGroup.add_option("--REML", + action="store_true", dest="REML", default=False, + help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).") +#advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") +advancedGroup.add_option("--eigen", dest="eigenfile", + help="The location of the precomputed eigendecomposition for the kinship file. These can be computed with pylmmKinship.py.") +advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true", + help="This option only applies when --cofile is used. When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.") + +basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") + +advancedGroup.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="Print extra info") + +# Experimental Group Options +experimentalGroup.add_option("--kfile2", dest="kfile2", + help="The location of a second kinship file. This file has the same format as the first kinship. This might be used if you want to correct for another form of confounding.") + +parser.add_option_group(basicGroup) +parser.add_option_group(advancedGroup) +parser.add_option_group(experimentalGroup) + +(options, args) = parser.parse_args() + +import sys +import os +import numpy as np +from scipy import linalg +from pylmm.lmm import LMM +from pylmm import input + +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +if len(args) != 1: + parser.print_help() + sys.exit() + +outFile = args[0] + +if not options.tfile and not options.bfile and not options.emmaFile: +#if not options.pfile and not options.tfile and not options.bfile: + parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an EMMA formatted file (--emmaSNP).") +if not options.kfile: + parser.error("Please provide a pre-computed kinship file") + +numThreads = None +if options.numThreads: + numThreads = int(options.numThreads) + +# READING PLINK input +if options.verbose: sys.stderr.write("Reading SNP input...\n") +if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +#elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +elif options.emmaFile: IN = input.plink(options.emmaFile,type='emma', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +else: parser.error("You must provide at least one PLINK input file base") + +if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos') and not os.path.isfile(options.emmaPheno): + parser.error("No .pheno file exist for %s. Please provide a phenotype file using the --phenofile or --emmaPHENO argument." % (options.phenoFile or IN.fbase + '.phenos')) + +# Read the emma phenotype file if provided. +# Format should be rows are phenotypes and columns are individuals. +if options.emmaPheno: + f = open(options.emmaPheno,'r') + P = [] + for line in f: + v = line.strip().split() + p = [] + for x in v: + try: + p.append(float(x)) + except: p.append(np.nan) + P.append(p) + f.close() + IN.phenos = np.array(P).T + +# READING Covariate File +if options.covfile: + if options.verbose: + sys.stderr.write("Reading covariate file...\n") + P = IN.getCovariates(options.covfile) + if options.noMean: + X0 = P + else: + X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) +elif options.emmaCov: + if options.verbose: + sys.stderr.write("Reading covariate file...\n") + P = IN.getCovariatesEMMA(options.emmaCov) + if options.noMean: + X0 = P + else: + X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) +else: + X0 = np.ones((IN.phenos.shape[0],1)) + +if np.isnan(X0).sum(): + parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.") + +# READING Kinship - major bottleneck for large datasets +if options.verbose: sys.stderr.write("Reading kinship...\n") +begin = time.time() +# This method seems to be the fastest and works if you already know the size of the matrix +if options.kfile[-3:] == '.gz': + import gzip + f = gzip.open(options.kfile,'r') + F = f.read() # might exhaust mem if the file is huge + K = np.fromstring(F,sep=' ') # Assume that space separated + f.close() +else: + K = np.fromfile(open(options.kfile,'r'),sep=" ") +K.resize((len(IN.indivs),len(IN.indivs))) +end = time.time() +# Other slower ways +#K = np.loadtxt(options.kfile) +#K = np.genfromtxt(options.kfile) +if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin)) + +if options.kfile2: + if options.verbose: sys.stderr.write("Reading second kinship...\n") + begin = time.time() + # This method seems to be the fastest and works if you already know the size of the matrix + if options.kfile2[-3:] == '.gz': + import gzip + f = gzip.open(options.kfile2,'r') + F = f.read() # might exhaust mem if the file is huge + K2 = np.fromstring(F,sep=' ') # Assume that space separated + f.close() + else: K2 = np.fromfile(open(options.kfile2,'r'),sep=" ") + K2.resize((len(IN.indivs),len(IN.indivs))) + end = time.time() + if options.verbose: sys.stderr.write("Read the %d x %d second kinship matrix in %0.3fs \n" % (K2.shape[0],K2.shape[1],end-begin)) + +# PROCESS the phenotype data -- Remove missing phenotype values +# Keep will now index into the "full" data to select what we keep (either everything or a subset of non missing data +Y = IN.phenos[:,options.pheno] +v = np.isnan(Y) +keep = True - v +if v.sum(): + if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) + Y = Y[keep] + X0 = X0[keep,:] + K = K[keep,:][:,keep] + if options.kfile2: K2 = K2[keep,:][:,keep] + Kva = [] + Kve = [] + +# Only load the decomposition if we did not remove individuals. +# Otherwise it would not be correct and we would have to compute it again. +if not v.sum() and options.eigenfile: + if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n") + Kva = np.load(options.eigenfile + ".Kva") + Kve = np.load(options.eigenfile + ".Kve") +else: + Kva = [] + Kve = [] + +# CREATE LMM object for association +n = K.shape[0] +if not options.kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose) +else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=options.verbose,K2=K2) + +# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here +if not options.refit: + if options.verbose: sys.stderr.write("Computing fit for null model\n") + L.fit() + if options.verbose and not options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma)) + if options.verbose and options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f, w=%0.3f\n" % (L.optH,L.optSigma,L.optW)) + +def compute_snp(collect): + snp = collect[0] + id = collect[1] + # result = [] + # Check SNPs for missing values + x = snp[keep].reshape((n,1)) # all the SNPs + v = np.isnan(x).reshape((-1,)) + if v.sum(): + keeps = True - v + xs = x[keeps,:] + if keeps.sum() <= 1 or xs.var() <= 1e-6: + # PS.append(np.nan) + # TS.append(np.nan) + # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) + # continue + return formatResult(id,np.nan,np.nan,np.nan,np.nan) + + # Its ok to center the genotype - I used options.normalizeGenotype to + # force the removal of missing genotypes as opposed to replacing them with MAF. + if not options.normalizeGenotype: + xs = (xs - xs.mean()) / np.sqrt(xs.var()) + Ys = Y[keeps] + X0s = X0[keeps,:] + Ks = K[keeps,:][:,keeps] + if options.kfile2: + K2s = K2[keeps,:][:,keeps] + Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=options.verbose,K2=K2s) + else: + Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose) + if options.refit: + Ls.fit(X=xs,REML=options.REML) + else: + #try: + Ls.fit(REML=options.REML) + #except: pdb.set_trace() + ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True) + else: + if x.var() == 0: + # PS.append(np.nan) + # TS.append(np.nan) + # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values + return formatResult(id,np.nan,np.nan,np.nan,np.nan) + # continue + + if options.refit: + L.fit(X=x,REML=options.REML) + # This is where it happens + ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True) + + return formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps) + # PS.append(ps) + # TS.append(ts) + # return len(result) + # compute.q.put(result) + # return None + +def f_init(q): + compute_snp.q = q + +# Set up the pool +# mp.set_start_method('spawn') +q = mp.Queue() +p = mp.Pool(numThreads, f_init, [q]) +collect = [] + +# Buffers for pvalues and t-stats +# PS = [] +# TS = [] +count = 0 +out = open(outFile,'w') +printOutHead() + +for snp_id in IN: + count += 1 + if count % 1000 == 0: + if options.verbose: + sys.stderr.write("At SNP %d\n" % count) + # if count>8000 : + # break # for testing only + if count % 100 == 0: + for line in p.imap(compute_snp,collect): + out.write(line) + collect = [] + + collect.append(snp_id) +for line in p.imap(compute_snp,collect): + out.write(line) + + diff --git a/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py b/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py new file mode 100755 index 00000000..c9a73ece --- /dev/null +++ b/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py @@ -0,0 +1,186 @@ +#!/usr/bin/python + +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) + +#The program is free for academic use. Please contact Nick Furlotte +# if you are interested in using the software for +#commercial purposes. + +#The software must not be modified and distributed without prior +#permission of the author. + +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys +import pdb + +from optparse import OptionParser,OptionGroup +usage = """usage: %prog [options] --[tfile | bfile] plinkFileBase outfile + +""" + +parser = OptionParser(usage=usage) + +basicGroup = OptionGroup(parser, "Basic Options") +#advancedGroup = OptionGroup(parser, "Advanced Options") + +#basicGroup.add_option("--pfile", dest="pfile", +# help="The base for a PLINK ped file") +basicGroup.add_option("--tfile", dest="tfile", + help="The base for a PLINK tped file") +basicGroup.add_option("--bfile", dest="bfile", + help="The base for a PLINK binary ped file") +basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the rows and snps on the columns.") +basicGroup.add_option("--emmaNumSNPs", dest="numSNPs", type="int", default=0, + help="When providing the emmaSNP file you need to specify how many snps are in the file") + +basicGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") +basicGroup.add_option("-n", default=1000,dest="computeSize", type="int", help="The maximum number of SNPs to read into memory at once (default 1000). This is important when there is a large number of SNPs, because memory could be an issue.") +basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") + +basicGroup.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="Print extra info") + +parser.add_option_group(basicGroup) +#parser.add_option_group(advancedGroup) + +(options, args) = parser.parse_args() +if len(args) != 1: + parser.print_help() + sys.exit() + +outFile = args[0] + +import sys +import os +import numpy as np +from scipy import linalg +from pylmm.lmm import calculateKinship +from pylmm import input +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +if not options.tfile and not options.bfile and not options.emmaFile: + parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") + +numThreads = None +if options.numThreads: + numThreads = int(options.numThreads) + +if options.verbose: sys.stderr.write("Reading PLINK input...\n") +if options.bfile: IN = input.plink(options.bfile,type='b') +elif options.tfile: IN = input.plink(options.tfile,type='t') +#elif options.pfile: IN = input.plink(options.pfile,type='p') +elif options.emmaFile: + if not options.numSNPs: parser.error("You must provide the number of SNPs when specifying an emma formatted file.") + IN = input.plink(options.emmaFile,type='emma') +else: parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") + +def compute_W(job): + """ + Read 1000 SNPs at a time into matrix and return the result + """ + n = len(IN.indivs) + m = options.computeSize + W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) + for j in range(0,options.computeSize): + row = job*m + j + if row >= IN.numSNPs: + W = W[:,range(0,j)] + break + snp,id = IN.next() + if snp.var() == 0: + continue + W[:,j] = snp # set row to list of SNPs + return W + +def compute_dgemm(job,W): + """ + Compute Kinship(W)*j + + For every set of SNPs dgemm is used to multiply matrices T(W)*W + """ + res = None + try: + res = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) + except AttributeError: + res = np.dot(W,W.T) + compute_dgemm.q.put([job,res]) + return job + +def f_init(q): + compute_dgemm.q = q + +n = len(IN.indivs) +# m = options.computeSize +# jobsize=m + +IN.getSNPIterator() +# Annoying hack to get around the fact that it is expensive to determine the number of SNPs in an emma file +if options.emmaFile: IN.numSNPs = options.numSNPs +# i = 0 + +# mp.set_start_method('spawn') +q = mp.Queue() +p = mp.Pool(numThreads, f_init, [q]) +iterations = IN.numSNPs/options.computeSize+1 +# jobs = range(0,8) # range(0,iterations) + +results = [] + +K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals + +completed = 0 + +# for job in range(8): +for job in range(iterations): + if options.verbose: + sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) + W = compute_W(job) + results.append(p.apply_async(compute_dgemm, (job,W))) + # Do we have a result? + try: + j,x = q.get_nowait() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + except Queue.Empty: + pass + +for job in range(len(results)-completed): + j,x = q.get() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + +K = K / float(IN.numSNPs) +if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) +np.savetxt(outFile,K) + +if options.saveEig: + if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") + Kva,Kve = linalg.eigh(K) + if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + np.savetxt(outFile+".kva",Kva) + np.savetxt(outFile+".kve",Kve) + + + + diff --git a/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py b/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py new file mode 100755 index 00000000..0c96d986 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py @@ -0,0 +1,77 @@ +from __future__ import absolute_import, print_function, division + +from base import data_set +from base.species import TheSpecies + + def run(dataset_name, vals, temp_uuid): + """Generates p-values for each marker""" + + tempdata = temp_data.TempData(temp_uuid) + + dataset = data_set.create_dataset(dataset_name) + species = TheSpecies(dataset=dataset) + + samples = [] # Want only ones with values + vals = vals + + for sample in dataset.group.samplelist: + samples.append(str(sample)) + + gen_data(dataset, vals, tempdata) + + + def gen_data(dataset, vals) + dataset.group.get_markers() + + pheno_vector = np.array([val == "x" and np.nan or float(val) for val in vals]) + + if dataset.group.species == "human": + p_values, t_stats = gen_human_results(pheno_vector, tempdata) + else: + genotype_data = [marker['genotypes'] for marker in dataset.group.markers.markers] + + no_val_samples = self.identify_empty_samples() + trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) + + genotype_matrix = np.array(trimmed_genotype_data).T + + #print("pheno_vector: ", pf(pheno_vector)) + #print("genotype_matrix: ", pf(genotype_matrix)) + #print("genotype_matrix.shape: ", pf(genotype_matrix.shape)) + + t_stats, p_values = lmm.run( + pheno_vector, + genotype_matrix, + restricted_max_likelihood=True, + refit=False, + temp_data=tempdata + ) + #print("p_values:", p_values) + + self.dataset.group.markers.add_pvalues(p_values) + return self.dataset.group.markers.markers + + + def gen_human_results(self, pheno_vector, tempdata): + file_base = os.path.join(webqtlConfig.PYLMM_PATH, self.dataset.group.name) + + plink_input = input.plink(file_base, type='b') + input_file_name = os.path.join(webqtlConfig.SNP_PATH, self.dataset.group.name + ".snps.gz") + + pheno_vector = pheno_vector.reshape((len(pheno_vector), 1)) + covariate_matrix = np.ones((pheno_vector.shape[0],1)) + kinship_matrix = np.fromfile(open(file_base + '.kin','r'),sep=" ") + kinship_matrix.resize((len(plink_input.indivs),len(plink_input.indivs))) + + p_values, t_stats = lmm.run_human( + pheno_vector, + covariate_matrix, + input_file_name, + kinship_matrix, + loading_progress=tempdata + ) + + return p_values, t_stats + +if __name__ == '__main__': + run(dataset_name, vals, temp_uuid) \ No newline at end of file -- cgit v1.2.3