From a71c3400f78fe4f63526308a0a7356b3c98e7b27 Mon Sep 17 00:00:00 2001
From: Zachary Sloan
Date: Tue, 19 Mar 2013 22:54:36 +0000
Subject: In the process of trying to use Nick's pylmmKinship.py code to
 precompute the kinship matrix for the human datasets we have plink .bed
 genotype data for

---
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py    |  72 +++++-----
 wqflask/wqflask/my_pylmm/pylmmGWAS.py    | 223 +++++++++++++++++++++++++++++++
 wqflask/wqflask/my_pylmm/pylmmKinship.py | 129 ++++++++++++++++++
 3 files changed, 388 insertions(+), 36 deletions(-)
 create mode 100644 wqflask/wqflask/my_pylmm/pylmmGWAS.py
 create mode 100644 wqflask/wqflask/my_pylmm/pylmmKinship.py

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index ffc6283c..66830662 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -26,42 +26,42 @@ from scipy import stats
 
 from pprint import pformat as pf
 
-from utility.benchmark import Bench
-
-#np.seterr('raise')
-
-def run(pheno_vector,
-        genotype_matrix,
-        restricted_max_likelihood=True,
-        refit=False,
-        temp_data=None):
-    """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
-    
-    restricted_max_likelihood -- whether to use restricted max likelihood; True or False
-    refit -- whether to refit the variance component for each marker
-    temp_data -- TempData object that stores the progress for each major step of the
-    calculations ("calculate_kinship" and "GWAS" take the majority of time)
-    
-    """
-    
-    with Bench("Calculate Kinship"):
-        kinship_matrix = calculate_kinship(genotype_matrix, temp_data)
-    
-    with Bench("Create LMM object"):
-        lmm_ob = LMM(pheno_vector, kinship_matrix)
-    
-    with Bench("LMM_ob fitting"):
-        lmm_ob.fit()
-
-    with Bench("Doing GWAS"):
-        t_stats, p_values = GWAS(pheno_vector,
-                                genotype_matrix,
-                                kinship_matrix,
-                                restricted_max_likelihood=True,
-                                refit=False,
-                                temp_data=temp_data)
-    Bench().report()
-    return t_stats, p_values
+#from utility.benchmark import Bench
+#
+##np.seterr('raise')
+#
+#def run(pheno_vector,
+#        genotype_matrix,
+#        restricted_max_likelihood=True,
+#        refit=False,
+#        temp_data=None):
+#    """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
+#    
+#    restricted_max_likelihood -- whether to use restricted max likelihood; True or False
+#    refit -- whether to refit the variance component for each marker
+#    temp_data -- TempData object that stores the progress for each major step of the
+#    calculations ("calculate_kinship" and "GWAS" take the majority of time)
+#    
+#    """
+#    
+#    with Bench("Calculate Kinship"):
+#        kinship_matrix = calculate_kinship(genotype_matrix, temp_data)
+#    
+#    with Bench("Create LMM object"):
+#        lmm_ob = LMM(pheno_vector, kinship_matrix)
+#    
+#    with Bench("LMM_ob fitting"):
+#        lmm_ob.fit()
+#
+#    with Bench("Doing GWAS"):
+#        t_stats, p_values = GWAS(pheno_vector,
+#                                genotype_matrix,
+#                                kinship_matrix,
+#                                restricted_max_likelihood=True,
+#                                refit=False,
+#                                temp_data=temp_data)
+#    Bench().report()
+#    return t_stats, p_values
 
 
 def matrixMult(A,B):
diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/pylmmGWAS.py
new file mode 100644
index 00000000..487949f0
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pylmmGWAS.py
@@ -0,0 +1,223 @@
+#!/usr/bin/python
+
+# pylmm is a python-based linear mixed-model solver with applications to GWAS
+
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import pdb
+import time
+
+def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
+def outputResult(id,beta,betaSD,ts,ps):
+    out.write("\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n")
+
+from optparse import OptionParser,OptionGroup
+usage = """usage: %prog [options] --kfile kinshipFile --[t | b | p]file plinkFileBase outfileBase
+
+This program provides basic genome-wide association (GWAS) functionality.  You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value.
+The input file are all standard plink formatted with the first two columns specifiying the individual and family ID.  For the phenotype file, we accept either NA or -9 to denote missing values.
+
+Basic usage:
+
+      python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile
+
+NOTE: The current running version only supports binary PED files (PLINK).  It is simple to convert between ped or tped and bed using PLINK.  Sorry for the inconvinience.
+
+            """
+parser = OptionParser(usage=usage)
+
+basicGroup = OptionGroup(parser, "Basic Options")
+advancedGroup = OptionGroup(parser, "Advanced Options")
+
+basicGroup.add_option("--pfile", dest="pfile",
+                  help="The base for a PLINK ped file")
+basicGroup.add_option("--tfile", dest="tfile",
+                  help="The base for a PLINK tped file")
+basicGroup.add_option("--bfile", dest="bfile",
+                  help="The base for a PLINK binary bed file")
+basicGroup.add_option("--phenofile", dest="phenoFile", default=None,
+                  help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root.  If you want to specify an alternative phenotype file, then use this argument.  The order does not matter for this file. ")
+basicGroup.add_option("--kfile", dest="kfile",
+                  help="The location of a kinship file.  This is an nxn plain text file and can be computed with the pylmmKinship program.")
+basicGroup.add_option("--covfile", dest="covfile",
+                  help="The location of a covariate file file.  This is a plink formatted covariate file.")
+basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0)
+
+advancedGroup.add_option("--removeMissingGenotypes",
+                  action="store_false", dest="normalizeGenotype", default=True,
+                  help="By default the program replaces missing genotypes with the minor allele frequency.  This option overrides that behavior making the program remove missing individuals.  NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.")
+advancedGroup.add_option("--refit",
+                  action="store_true", dest="refit", default=False,
+                  help="Refit the variance components at each SNP (default is to lock in the variance components under the null).")
+
+advancedGroup.add_option("--REML",
+                  action="store_true", dest="REML", default=False,
+                  help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).")
+#advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.")
+advancedGroup.add_option("--eigen", dest="eigenfile",
+                  help="The location of the precomputed eigendecomposition for the kinship file")
+advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true",
+                  help="This option only applies when --cofile is used.  When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.")
+
+advancedGroup.add_option("-v", "--verbose",
+                  action="store_true", dest="verbose", default=False,
+                  help="Print extra info")
+
+parser.add_option_group(basicGroup)
+parser.add_option_group(advancedGroup)
+
+(options, args) = parser.parse_args()
+
+import sys
+import os
+import numpy as np
+from scipy import linalg
+from pylmm.lmm import LMM
+from pylmm import input
+
+if len(args) != 1: parser.error("Incorrect number of arguments")
+outFile = args[0]
+
+if not options.pfile and not options.tfile and not options.bfile:
+    parser.error("You must provide at least one PLINK input file base")
+if not options.kfile:
+    parser.error("Please provide a pre-computed kinship file")
+
+# READING PLINK input
+if options.verbose: sys.stderr.write("Reading PLINK input...\n")
+if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+else: parser.error("You must provide at least one PLINK input file base")
+
+if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos'):
+    parser.error("No .pheno file exist for %s" % (options.phenoFile or IN.fbase + '.phenos'))
+
+# READING Covariate File
+if options.covfile:
+    if options.verbose: sys.stderr.write("Reading covariate file...\n")
+    # Read the covariate file -- write this into input.plink
+    P = IN.getCovariates(options.covfile)
+
+    if options.noMean: X0 = P
+    else: X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P])
+
+    if np.isnan(X0).sum():
+        parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case.  Either remove those individuals with missing values or replace them in some way.")
+else: X0 = np.ones((IN.phenos.shape[0],1))
+
+# READING Kinship - major bottleneck for large datasets
+if options.verbose: sys.stderr.write("Reading kinship...\n")
+begin = time.time()
+# This method seems to be the fastest and works if you already know the size of the matrix
+if options.kfile[-3:] == '.gz':
+    import gzip
+    f = gzip.open(options.kfile,'r')
+    F = f.read() # might exhaust mem if the file is huge
+    K = np.fromstring(F,sep=' ') # Assume that space separated
+    f.close()
+else: K = np.fromfile(open(options.kfile,'r'),sep=" ")
+K.resize((len(IN.indivs),len(IN.indivs)))
+end = time.time()
+# Other slower ways
+#K = np.loadtxt(options.kfile)
+#K = np.genfromtxt(options.kfile)
+if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin))
+
+
+# PROCESS the phenotype data -- Remove missing phenotype values
+# Keep will now index into the "full" data to select what we keep (either everything or a subset of non missing data
+Y = IN.phenos[:,options.pheno]
+v = np.isnan(Y)
+keep = True - v
+if v.sum():
+    if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
+    Y = Y[keep]
+    X0 = X0[keep,:]
+    K = K[keep,:][:,keep]
+    Kva = []
+    Kve = []
+
+# Only load the decomposition if we did not remove individuals.
+# Otherwise it would not be correct and we would have to compute it again.
+if not v.sum() and options.eigenfile:
+    if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n")
+    Kva = np.load(options.eigenfile + ".Kva")
+    Kve = np.load(options.eigenfile + ".Kve")
+else:
+    Kva = []
+    Kve = []
+
+# CREATE LMM object for association
+n = K.shape[0]
+L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose)
+# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here
+if not options.refit:
+    if options.verbose: sys.stderr.write("Computing fit for null model\n")
+    L.fit()
+    if options.verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma))
+
+# Buffers for pvalues and t-stats
+PS = []
+TS = []
+count = 0
+out = open(outFile,'w')
+printOutHead()
+
+for snp,id in IN:
+    count += 1
+    if options.verbose and count % 1000 == 0:
+        sys.stderr.write("At SNP %d\n" % count)
+
+    x = snp[keep].reshape((n,1))
+    #x[[1,50,100,200,3000],:] = np.nan
+    v = np.isnan(x).reshape((-1,))
+    # Check SNPs for missing values
+    if v.sum():
+        keeps = True - v
+        xs = x[keeps,:]
+        if keeps.sum() <= 1 or xs.var() <= 1e-6:
+            PS.append(np.nan)
+            TS.append(np.nan)
+            outputResult(id,np.nan,np.nan,np.nan,np.nan)
+            continue
+
+        # Its ok to center the genotype -  I used options.normalizeGenotype to
+        # force the removal of missing genotypes as opposed to replacing them with MAF.
+        if not options.normalizeGenotype: xs = (xs - xs.mean()) / np.sqrt(xs.var())
+        Ys = Y[keeps]
+        X0s = X0[keeps,:]
+        Ks = K[keeps,:][:,keeps]
+        Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose)
+        if options.refit: Ls.fit(X=xs,REML=options.REML)
+        else:
+            #try:
+            Ls.fit(REML=options.REML)
+            #except: pdb.set_trace()
+        ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True)
+    else:
+        if x.var() == 0:
+            PS.append(np.nan)
+            TS.append(np.nan)
+            outputResult(id,np.nan,np.nan,np.nan,np.nan)
+            continue
+
+        if options.refit: L.fit(X=x,REML=options.REML)
+        ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True)
+
+    outputResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)
+    PS.append(ps)
+    TS.append(ts)
diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/pylmmKinship.py
new file mode 100644
index 00000000..403e739d
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pylmmKinship.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python
+
+# pylmm is a python-based linear mixed-model solver with applications to GWAS
+
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from optparse import OptionParser,OptionGroup
+usage = """usage: %prog [options] --[t | b | p]file plinkFileBase outfile
+
+NOTE: The current running version only supports binary PED files (PLINK).  It is simple to convert between ped or tped and bed using PLINK.  Sorry for the inconvinience.
+"""
+
+parser = OptionParser(usage=usage)
+
+basicGroup = OptionGroup(parser, "Basic Options")
+#advancedGroup = OptionGroup(parser, "Advanced Options")
+
+basicGroup.add_option("--pfile",
+                    dest="pfile",
+                    help="The base for a PLINK ped file")
+basicGroup.add_option("--tfile",
+                    dest="tfile",
+                    help="The base for a PLINK tped file")
+basicGroup.add_option("--bfile",
+                    dest="bfile",
+                    help="The base for a PLINK binary ped file")
+
+basicGroup.add_option("-e",
+                    "--efile",
+                    dest="saveEig",
+                    help="Save eigendecomposition to this file.")
+basicGroup.add_option("-n",
+                    default=1000,
+                    dest="computeSize",
+                    type="int",
+                    help="""The maximum number of SNPs to read into memory at once (default 1000).
+                    This is important when there is a large number of SNPs, because memory could
+                    be an issue.""")
+
+basicGroup.add_option("-v", "--verbose",
+                    action="store_true", dest="verbose", default=False,
+                    help="Print extra info")
+
+parser.add_option_group(basicGroup)
+#parser.add_option_group(advancedGroup)
+
+(options, args) = parser.parse_args()
+if len(args) != 1:
+    parser.error("Incorrect number of arguments")
+outFile = args[0]
+
+import sys
+import os
+import numpy as np
+from scipy import linalg
+from pyLMM.lmm import calculate_kinship
+from pyLMM import input
+
+if not options.pfile and not options.tfile and not options.bfile: 
+   parser.error("You must provide at least one PLINK input file base")
+
+if options.verbose:
+    sys.stderr.write("Reading PLINK input...\n")
+if options.bfile:
+    IN = input.plink(options.bfile,type='b')
+elif options.tfile:
+    IN = input.plink(options.tfile,type='t')
+elif options.pfile:
+    IN = input.plink(options.pfile,type='p')
+else:
+    parser.error("You must provide at least one PLINK input file base")
+
+n = len(IN.indivs)
+m = options.computeSize
+W = np.ones((n,m)) * np.nan
+
+IN.getSNPIterator()
+i = 0
+K = None
+while i < IN.numSNPs:
+    j = 0
+    while j < options.computeSize and i < IN.numSNPs:
+        snp,id = IN.next()
+        if snp.var() == 0:
+            i += 1
+            continue
+        W[:,j] = snp
+  
+        i += 1
+        j += 1
+    if j < options.computeSize:
+        W = W[:,range(0,j)] 
+
+    if options.verbose:
+        sys.stderr.write("Processing first %d SNPs\n" % i)
+    if K == None:
+        K = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculate_kinship(W) * j
+    #if K == None:
+    #   K = np.dot(W,W.T) # calculate_kinship(W) * j
+    else:
+        K_j = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculate_kinship(W) * j
+        K = K + K_j
+
+K = K / float(IN.numSNPs)
+if options.verbose:
+    sys.stderr.write("Saving Kinship file to %s\n" % outFile)
+np.savetxt(outFile,K)
+
+if options.saveEig:
+    if options.verbose:
+        sys.stderr.write("Obtaining Eigendecomposition\n")
+    Kva,Kve = linalg.eigh(K)
+    if options.verbose:
+        sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
+    np.savetxt(outFile+".kva",Kva)
+    np.savetxt(outFile+".kve",Kve)
\ No newline at end of file
-- 
cgit v1.2.3