From a71c3400f78fe4f63526308a0a7356b3c98e7b27 Mon Sep 17 00:00:00 2001 From: Zachary Sloan Date: Tue, 19 Mar 2013 22:54:36 +0000 Subject: In the process of trying to use Nick's pylmmKinship.py code to precompute the kinship matrix for the human datasets we have plink .bed genotype data for --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 72 +++++----- wqflask/wqflask/my_pylmm/pylmmGWAS.py | 223 +++++++++++++++++++++++++++++++ wqflask/wqflask/my_pylmm/pylmmKinship.py | 129 ++++++++++++++++++ 3 files changed, 388 insertions(+), 36 deletions(-) create mode 100644 wqflask/wqflask/my_pylmm/pylmmGWAS.py create mode 100644 wqflask/wqflask/my_pylmm/pylmmKinship.py diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index ffc6283c..66830662 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -26,42 +26,42 @@ from scipy import stats from pprint import pformat as pf -from utility.benchmark import Bench - -#np.seterr('raise') - -def run(pheno_vector, - genotype_matrix, - restricted_max_likelihood=True, - refit=False, - temp_data=None): - """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics - - restricted_max_likelihood -- whether to use restricted max likelihood; True or False - refit -- whether to refit the variance component for each marker - temp_data -- TempData object that stores the progress for each major step of the - calculations ("calculate_kinship" and "GWAS" take the majority of time) - - """ - - with Bench("Calculate Kinship"): - kinship_matrix = calculate_kinship(genotype_matrix, temp_data) - - with Bench("Create LMM object"): - lmm_ob = LMM(pheno_vector, kinship_matrix) - - with Bench("LMM_ob fitting"): - lmm_ob.fit() - - with Bench("Doing GWAS"): - t_stats, p_values = GWAS(pheno_vector, - genotype_matrix, - kinship_matrix, - restricted_max_likelihood=True, - refit=False, - temp_data=temp_data) - Bench().report() - return t_stats, p_values +#from utility.benchmark import Bench +# +##np.seterr('raise') +# +#def run(pheno_vector, +# genotype_matrix, +# restricted_max_likelihood=True, +# refit=False, +# temp_data=None): +# """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics +# +# restricted_max_likelihood -- whether to use restricted max likelihood; True or False +# refit -- whether to refit the variance component for each marker +# temp_data -- TempData object that stores the progress for each major step of the +# calculations ("calculate_kinship" and "GWAS" take the majority of time) +# +# """ +# +# with Bench("Calculate Kinship"): +# kinship_matrix = calculate_kinship(genotype_matrix, temp_data) +# +# with Bench("Create LMM object"): +# lmm_ob = LMM(pheno_vector, kinship_matrix) +# +# with Bench("LMM_ob fitting"): +# lmm_ob.fit() +# +# with Bench("Doing GWAS"): +# t_stats, p_values = GWAS(pheno_vector, +# genotype_matrix, +# kinship_matrix, +# restricted_max_likelihood=True, +# refit=False, +# temp_data=temp_data) +# Bench().report() +# return t_stats, p_values def matrixMult(A,B): diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/pylmmGWAS.py new file mode 100644 index 00000000..487949f0 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pylmmGWAS.py @@ -0,0 +1,223 @@ +#!/usr/bin/python + +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import pdb +import time + +def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") +def outputResult(id,beta,betaSD,ts,ps): + out.write("\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n") + +from optparse import OptionParser,OptionGroup +usage = """usage: %prog [options] --kfile kinshipFile --[t | b | p]file plinkFileBase outfileBase + +This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value. +The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values. + +Basic usage: + + python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile + +NOTE: The current running version only supports binary PED files (PLINK). It is simple to convert between ped or tped and bed using PLINK. Sorry for the inconvinience. + + """ +parser = OptionParser(usage=usage) + +basicGroup = OptionGroup(parser, "Basic Options") +advancedGroup = OptionGroup(parser, "Advanced Options") + +basicGroup.add_option("--pfile", dest="pfile", + help="The base for a PLINK ped file") +basicGroup.add_option("--tfile", dest="tfile", + help="The base for a PLINK tped file") +basicGroup.add_option("--bfile", dest="bfile", + help="The base for a PLINK binary bed file") +basicGroup.add_option("--phenofile", dest="phenoFile", default=None, + help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. The order does not matter for this file. ") +basicGroup.add_option("--kfile", dest="kfile", + help="The location of a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") +basicGroup.add_option("--covfile", dest="covfile", + help="The location of a covariate file file. This is a plink formatted covariate file.") +basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0) + +advancedGroup.add_option("--removeMissingGenotypes", + action="store_false", dest="normalizeGenotype", default=True, + help="By default the program replaces missing genotypes with the minor allele frequency. This option overrides that behavior making the program remove missing individuals. NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.") +advancedGroup.add_option("--refit", + action="store_true", dest="refit", default=False, + help="Refit the variance components at each SNP (default is to lock in the variance components under the null).") + +advancedGroup.add_option("--REML", + action="store_true", dest="REML", default=False, + help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).") +#advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") +advancedGroup.add_option("--eigen", dest="eigenfile", + help="The location of the precomputed eigendecomposition for the kinship file") +advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true", + help="This option only applies when --cofile is used. When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.") + +advancedGroup.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="Print extra info") + +parser.add_option_group(basicGroup) +parser.add_option_group(advancedGroup) + +(options, args) = parser.parse_args() + +import sys +import os +import numpy as np +from scipy import linalg +from pylmm.lmm import LMM +from pylmm import input + +if len(args) != 1: parser.error("Incorrect number of arguments") +outFile = args[0] + +if not options.pfile and not options.tfile and not options.bfile: + parser.error("You must provide at least one PLINK input file base") +if not options.kfile: + parser.error("Please provide a pre-computed kinship file") + +# READING PLINK input +if options.verbose: sys.stderr.write("Reading PLINK input...\n") +if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +else: parser.error("You must provide at least one PLINK input file base") + +if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos'): + parser.error("No .pheno file exist for %s" % (options.phenoFile or IN.fbase + '.phenos')) + +# READING Covariate File +if options.covfile: + if options.verbose: sys.stderr.write("Reading covariate file...\n") + # Read the covariate file -- write this into input.plink + P = IN.getCovariates(options.covfile) + + if options.noMean: X0 = P + else: X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) + + if np.isnan(X0).sum(): + parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.") +else: X0 = np.ones((IN.phenos.shape[0],1)) + +# READING Kinship - major bottleneck for large datasets +if options.verbose: sys.stderr.write("Reading kinship...\n") +begin = time.time() +# This method seems to be the fastest and works if you already know the size of the matrix +if options.kfile[-3:] == '.gz': + import gzip + f = gzip.open(options.kfile,'r') + F = f.read() # might exhaust mem if the file is huge + K = np.fromstring(F,sep=' ') # Assume that space separated + f.close() +else: K = np.fromfile(open(options.kfile,'r'),sep=" ") +K.resize((len(IN.indivs),len(IN.indivs))) +end = time.time() +# Other slower ways +#K = np.loadtxt(options.kfile) +#K = np.genfromtxt(options.kfile) +if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin)) + + +# PROCESS the phenotype data -- Remove missing phenotype values +# Keep will now index into the "full" data to select what we keep (either everything or a subset of non missing data +Y = IN.phenos[:,options.pheno] +v = np.isnan(Y) +keep = True - v +if v.sum(): + if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) + Y = Y[keep] + X0 = X0[keep,:] + K = K[keep,:][:,keep] + Kva = [] + Kve = [] + +# Only load the decomposition if we did not remove individuals. +# Otherwise it would not be correct and we would have to compute it again. +if not v.sum() and options.eigenfile: + if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n") + Kva = np.load(options.eigenfile + ".Kva") + Kve = np.load(options.eigenfile + ".Kve") +else: + Kva = [] + Kve = [] + +# CREATE LMM object for association +n = K.shape[0] +L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose) +# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here +if not options.refit: + if options.verbose: sys.stderr.write("Computing fit for null model\n") + L.fit() + if options.verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma)) + +# Buffers for pvalues and t-stats +PS = [] +TS = [] +count = 0 +out = open(outFile,'w') +printOutHead() + +for snp,id in IN: + count += 1 + if options.verbose and count % 1000 == 0: + sys.stderr.write("At SNP %d\n" % count) + + x = snp[keep].reshape((n,1)) + #x[[1,50,100,200,3000],:] = np.nan + v = np.isnan(x).reshape((-1,)) + # Check SNPs for missing values + if v.sum(): + keeps = True - v + xs = x[keeps,:] + if keeps.sum() <= 1 or xs.var() <= 1e-6: + PS.append(np.nan) + TS.append(np.nan) + outputResult(id,np.nan,np.nan,np.nan,np.nan) + continue + + # Its ok to center the genotype - I used options.normalizeGenotype to + # force the removal of missing genotypes as opposed to replacing them with MAF. + if not options.normalizeGenotype: xs = (xs - xs.mean()) / np.sqrt(xs.var()) + Ys = Y[keeps] + X0s = X0[keeps,:] + Ks = K[keeps,:][:,keeps] + Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose) + if options.refit: Ls.fit(X=xs,REML=options.REML) + else: + #try: + Ls.fit(REML=options.REML) + #except: pdb.set_trace() + ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True) + else: + if x.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + outputResult(id,np.nan,np.nan,np.nan,np.nan) + continue + + if options.refit: L.fit(X=x,REML=options.REML) + ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True) + + outputResult(id,beta,np.sqrt(betaVar).sum(),ts,ps) + PS.append(ps) + TS.append(ts) diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/pylmmKinship.py new file mode 100644 index 00000000..403e739d --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pylmmKinship.py @@ -0,0 +1,129 @@ +#!/usr/bin/python + +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from optparse import OptionParser,OptionGroup +usage = """usage: %prog [options] --[t | b | p]file plinkFileBase outfile + +NOTE: The current running version only supports binary PED files (PLINK). It is simple to convert between ped or tped and bed using PLINK. Sorry for the inconvinience. +""" + +parser = OptionParser(usage=usage) + +basicGroup = OptionGroup(parser, "Basic Options") +#advancedGroup = OptionGroup(parser, "Advanced Options") + +basicGroup.add_option("--pfile", + dest="pfile", + help="The base for a PLINK ped file") +basicGroup.add_option("--tfile", + dest="tfile", + help="The base for a PLINK tped file") +basicGroup.add_option("--bfile", + dest="bfile", + help="The base for a PLINK binary ped file") + +basicGroup.add_option("-e", + "--efile", + dest="saveEig", + help="Save eigendecomposition to this file.") +basicGroup.add_option("-n", + default=1000, + dest="computeSize", + type="int", + help="""The maximum number of SNPs to read into memory at once (default 1000). + This is important when there is a large number of SNPs, because memory could + be an issue.""") + +basicGroup.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="Print extra info") + +parser.add_option_group(basicGroup) +#parser.add_option_group(advancedGroup) + +(options, args) = parser.parse_args() +if len(args) != 1: + parser.error("Incorrect number of arguments") +outFile = args[0] + +import sys +import os +import numpy as np +from scipy import linalg +from pyLMM.lmm import calculate_kinship +from pyLMM import input + +if not options.pfile and not options.tfile and not options.bfile: + parser.error("You must provide at least one PLINK input file base") + +if options.verbose: + sys.stderr.write("Reading PLINK input...\n") +if options.bfile: + IN = input.plink(options.bfile,type='b') +elif options.tfile: + IN = input.plink(options.tfile,type='t') +elif options.pfile: + IN = input.plink(options.pfile,type='p') +else: + parser.error("You must provide at least one PLINK input file base") + +n = len(IN.indivs) +m = options.computeSize +W = np.ones((n,m)) * np.nan + +IN.getSNPIterator() +i = 0 +K = None +while i < IN.numSNPs: + j = 0 + while j < options.computeSize and i < IN.numSNPs: + snp,id = IN.next() + if snp.var() == 0: + i += 1 + continue + W[:,j] = snp + + i += 1 + j += 1 + if j < options.computeSize: + W = W[:,range(0,j)] + + if options.verbose: + sys.stderr.write("Processing first %d SNPs\n" % i) + if K == None: + K = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculate_kinship(W) * j + #if K == None: + # K = np.dot(W,W.T) # calculate_kinship(W) * j + else: + K_j = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculate_kinship(W) * j + K = K + K_j + +K = K / float(IN.numSNPs) +if options.verbose: + sys.stderr.write("Saving Kinship file to %s\n" % outFile) +np.savetxt(outFile,K) + +if options.saveEig: + if options.verbose: + sys.stderr.write("Obtaining Eigendecomposition\n") + Kva,Kve = linalg.eigh(K) + if options.verbose: + sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + np.savetxt(outFile+".kva",Kva) + np.savetxt(outFile+".kve",Kve) \ No newline at end of file -- cgit v1.2.3