aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/chunks.py2
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/genotype.py50
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/kinship.py158
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/lmm.py45
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py2
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/phenotype.py40
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/process_plink.py127
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/runlmm.py143
8 files changed, 376 insertions, 191 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
index abeffee7..9565fb96 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
@@ -93,4 +93,4 @@ if __name__ == '__main__':
_main()
print("\nConfirming doctests...")
import doctest
- doctest.testmod() \ No newline at end of file
+ doctest.testmod()
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
new file mode 100644
index 00000000..315fd824
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -0,0 +1,50 @@
+# Genotype routines
+
+# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+from collections import Counter
+import operator
+
+def replace_missing_with_MAF(snp_g):
+ """
+ Replace the missing genotype with the minor allele frequency (MAF)
+ in the snp row. It is rather slow!
+ """
+ cnt = Counter(snp_g)
+ tuples = sorted(cnt.items(), key=operator.itemgetter(1))
+ l2 = [t for t in tuples if not np.isnan(t[0])]
+ maf = l2[0][0]
+ res = np.array([maf if np.isnan(snp) else snp for snp in snp_g])
+ return res
+
+def normalize(ind_g):
+ """
+ Run for every SNP list (for one individual) and return
+ normalized SNP genotype values with missing data filled in
+ """
+ g1 = np.copy(ind_g) # avoid side effects
+ x = True - np.isnan(ind_g) # Matrix of True/False
+ m = ind_g[x].mean() # Global mean value
+ s = np.sqrt(ind_g[x].var()) # Global stddev
+ g1[np.isnan(ind_g)] = m # Plug-in mean values for missing data
+ if s == 0:
+ g1 = g1 - m # Subtract the mean
+ else:
+ g1 = (g1 - m) / s # Normalize the deviation
+ return g1
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
new file mode 100644
index 00000000..9ab48510
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -0,0 +1,158 @@
+# pylmm kinship calculation
+#
+# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# env PYTHONPATH=$pylmm_lib_path:./lib python $pylmm_lib_path/runlmm.py --pheno test.pheno --geno test9000.geno kinship --test
+
+import sys
+import os
+import numpy as np
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue
+import time
+
+from optmatrix import matrix_initialize, matrixMultT
+
+def kinship_full(G):
+ """
+ Calculate the Kinship matrix using a full dot multiplication
+ """
+ print G.shape
+ m = G.shape[0] # snps
+ n = G.shape[1] # inds
+ sys.stderr.write(str(m)+" SNPs\n")
+ assert m>n, "n should be larger than m (snps>inds)"
+ m = np.dot(G.T,G)
+ m = m/G.shape[0]
+ return m
+
+def compute_W(job,G,n,snps,compute_size):
+ """
+ Read 1000 SNPs at a time into matrix and return the result
+ """
+ m = compute_size
+ W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs)
+ for j in range(0,compute_size):
+ pos = job*m + j # real position
+ if pos >= snps:
+ W = W[:,range(0,j)]
+ break
+ snp = G[job*compute_size+j]
+ if snp.var() == 0:
+ continue
+ W[:,j] = snp # set row to list of SNPs
+ return W
+
+def compute_matrixMult(job,W,q = None):
+ """
+ Compute Kinship(W)*j
+
+ For every set of SNPs matrixMult is used to multiply matrices T(W)*W
+ """
+ res = matrixMultT(W)
+ if not q: q=compute_matrixMult.q
+ q.put([job,res])
+ return job
+
+def f_init(q):
+ compute_matrixMult.q = q
+
+# Calculate the kinship matrix from G (SNPs as rows!), returns K
+#
+def kinship(G,options):
+ numThreads = None
+ if options.numThreads:
+ numThreads = int(options.numThreads)
+ options.computeSize = 1000
+ matrix_initialize(options.useBLAS)
+
+ sys.stderr.write(str(G.shape)+"\n")
+ n = G.shape[1] # inds
+ inds = n
+ m = G.shape[0] # snps
+ snps = m
+ sys.stderr.write(str(m)+" SNPs\n")
+ assert m>n, "n should be larger than m (snps>inds)"
+
+ q = mp.Queue()
+ p = mp.Pool(numThreads, f_init, [q])
+ cpu_num = mp.cpu_count()
+ print "CPU cores:",cpu_num
+ iterations = snps/options.computeSize+1
+ if options.testing:
+ iterations = 8
+ # jobs = range(0,8) # range(0,iterations)
+
+ results = []
+
+ K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals
+
+ completed = 0
+ for job in range(iterations):
+ if options.verbose:
+ sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
+ W = compute_W(job,G,n,snps,options.computeSize)
+ if numThreads == 1:
+ # Single-core
+ compute_matrixMult(job,W,q)
+ j,x = q.get()
+ if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ K_j = x
+ # print j,K_j[:,0]
+ K = K + K_j
+ else:
+ # Multi-core
+ results.append(p.apply_async(compute_matrixMult, (job,W)))
+ # Do we have a result?
+ while (len(results)-completed>cpu_num*2):
+ time.sleep(0.1)
+ try:
+ j,x = q.get_nowait()
+ if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ K_j = x
+ # print j,K_j[:,0]
+ K = K + K_j
+ completed += 1
+ except Queue.Empty:
+ pass
+
+ if numThreads == None or numThreads > 1:
+ # results contains the growing result set
+ for job in range(len(results)-completed):
+ j,x = q.get(True,15)
+ if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ K_j = x
+ # print j,K_j[:,0]
+ K = K + K_j
+ completed += 1
+
+ K = K / float(snps)
+
+ # outFile = 'runtest.kin'
+ # if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
+ # np.savetxt(outFile,K)
+
+ # if options.saveKvaKve:
+ # if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
+ # Kva,Kve = linalg.eigh(K)
+ # if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
+ # np.savetxt(outFile+".kva",Kva)
+ # np.savetxt(outFile+".kve",Kve)
+ return K
+
+
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 16073d2c..99a5a940 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -143,6 +143,7 @@ def run_human(pheno_vector,
timestamp = datetime.datetime.utcnow().isoformat()
+ # Pickle chunks of input SNPs (from Plink interator) and compress them
#print("Starting adding loop")
for part, result in enumerate(results):
#data = pickle.dumps(result, pickle.HIGHEST_PROTOCOL)
@@ -254,8 +255,8 @@ def run_other(pheno_vector,
genotype_matrix,
restricted_max_likelihood=True,
refit=False,
- tempdata=None, # <---- can not be None
- is_testing=False):
+ tempdata=None # <---- can not be None
+ ):
"""Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
@@ -267,9 +268,9 @@ def run_other(pheno_vector,
"""
print("In run_other")
- print("REML=",restricted_max_likelihood," REFIT=",refit, "TESTING=",is_testing)
+ print("REML=",restricted_max_likelihood," REFIT=",refit)
with Bench("Calculate Kinship"):
- kinship_matrix = calculate_kinship(genotype_matrix, tempdata, is_testing)
+ kinship_matrix = calculate_kinship(genotype_matrix, tempdata)
print("kinship_matrix: ", pf(kinship_matrix))
print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
@@ -325,7 +326,7 @@ def matrixMult(A,B):
return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
-def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
+def calculate_kinship(genotype_matrix, temp_data=None):
"""
genotype_matrix is an n x m matrix encoding SNP minor alleles.
@@ -337,10 +338,9 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
m = genotype_matrix.shape[1]
print("genotype 2D matrix n (inds) is:", n)
print("genotype 2D matrix m (snps) is:", m)
+ assert m>n, "n should be larger than m (snps>inds)"
keep = []
for counter in range(m):
- if is_testing and counter>8:
- break
#print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter]))
#Checks if any values in column are not numbers
not_number = np.isnan(genotype_matrix[:,counter])
@@ -360,7 +360,8 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
genotype_matrix[:,counter] = (genotype_matrix[:,counter] - values_mean) / np.sqrt(vr)
percent_complete = int(round((counter/m)*45))
- temp_data.store("percent_complete", percent_complete)
+ if temp_data != None:
+ temp_data.store("percent_complete", percent_complete)
genotype_matrix = genotype_matrix[:,keep]
print("genotype_matrix: ", pf(genotype_matrix))
@@ -406,6 +407,8 @@ def GWAS(pheno_vector,
v = np.isnan(pheno_vector)
if v.sum():
keep = True - v
+ print(pheno_vector.shape,pheno_vector)
+ print(keep.shape,keep)
pheno_vector = pheno_vector[keep]
#genotype_matrix = genotype_matrix[keep,:]
#covariate_matrix = covariate_matrix[keep,:]
@@ -437,6 +440,8 @@ def GWAS(pheno_vector,
p_values.append(0)
t_statistics.append(np.nan)
continue
+
+ print(genotype_matrix.shape,pheno_vector.shape,keep.shape)
pheno_vector = pheno_vector[keep]
covariate_matrix = covariate_matrix[keep,:]
@@ -484,7 +489,7 @@ class LMM:
is not done consistently.
"""
- def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True,is_testing=False):
+ def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True):
"""
The constructor takes a phenotype vector or array of size n.
@@ -494,7 +499,6 @@ class LMM:
When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
"""
- self.is_testing = True
if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1)
self.verbose = verbose
@@ -513,7 +517,7 @@ class LMM:
Kve = []
self.nonmissing = x
- print("this K is:", pf(K))
+ print("this K is:", K.shape, pf(K))
if len(Kva) == 0 or len(Kve) == 0:
if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
@@ -525,8 +529,8 @@ class LMM:
self.K = K
self.Kva = Kva
self.Kve = Kve
- print("self.Kva is: ", pf(self.Kva))
- print("self.Kve is: ", pf(self.Kve))
+ print("self.Kva is: ", self.Kva.shape, pf(self.Kva))
+ print("self.Kve is: ", self.Kve.shape, pf(self.Kve))
self.Y = Y
self.X0 = X0
self.N = self.K.shape[0]
@@ -722,7 +726,7 @@ class LMM:
pl.title(title)
-def gn2_redis(key,species,is_testing=False):
+def gn2_redis(key,species):
json_params = Redis.get(key)
params = json.loads(json_params)
@@ -749,8 +753,7 @@ def gn2_redis(key,species,is_testing=False):
genotype_matrix = geno,
restricted_max_likelihood = params['restricted_max_likelihood'],
refit = params['refit'],
- tempdata = tempdata,
- is_testing=is_testing)
+ tempdata = tempdata)
results_key = "pylmm:results:" + params['temp_uuid']
@@ -775,11 +778,15 @@ def gn2_main():
gn2_redis(key,species)
-def gn2_load_redis(key,species,kinship,pheno,geno,is_testing):
+def gn2_load_redis(key,species,kinship,pheno,geno):
print("Loading Redis from parsed data")
+ if kinship == None:
+ k = None
+ else:
+ k = kinship.tolist()
params = dict(pheno_vector = pheno.tolist(),
genotype_matrix = geno.tolist(),
- kinship_matrix= kinship.tolist(),
+ kinship_matrix= k,
restricted_max_likelihood = True,
refit = False,
temp_uuid = "testrun_temp_uuid",
@@ -792,7 +799,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno,is_testing):
Redis.set(key, json_params)
Redis.expire(key, 60*60)
- return gn2_redis(key,species,is_testing)
+ return gn2_redis(key,species)
if __name__ == '__main__':
print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
index abb72081..5c71db6a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
@@ -9,7 +9,7 @@ from scipy import stats
useNumpy = None
hasBLAS = None
-def matrix_initialize(useBLAS):
+def matrix_initialize(useBLAS=True):
global useNumpy # module based variable
if useBLAS and useNumpy == None:
print get_info('blas_opt')
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
new file mode 100644
index 00000000..bb620052
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -0,0 +1,40 @@
+# Phenotype routines
+
+# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import numpy as np
+
+def remove_missing(y,g,verbose=False):
+ """
+ Remove missing data from matrices, make sure the genotype data has
+ individuals as rows
+ """
+ assert(y!=None)
+ assert(y.shape[0] == g.shape[0])
+
+ y1 = y
+ g1 = g
+ v = np.isnan(y)
+ keep = True - v
+ if v.sum():
+ if verbose:
+ sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
+ y1 = y[keep]
+ g1 = g[keep,:]
+ return y1,g1
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
deleted file mode 100644
index e47c18e1..00000000
--- a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from __future__ import absolute_import, print_function, division
-
-import sys
-sys.path.append("../../..")
-
-print("sys.path: ", sys.path)
-
-import numpy as np
-
-import zlib
-import cPickle as pickle
-import redis
-Redis = redis.Redis()
-
-import lmm
-
-class ProcessLmmChunk(object):
-
- def __init__(self):
- self.get_snp_data()
- self.get_lmm_vars()
-
- keep = self.trim_matrices()
-
- self.do_association(keep)
-
- print("p_values is: ", self.p_values)
-
- def get_snp_data(self):
- plink_pickled = zlib.decompress(Redis.lpop("plink_inputs"))
- plink_data = pickle.loads(plink_pickled)
-
- self.snps = np.array(plink_data['result'])
- self.identifier = plink_data['identifier']
-
- def get_lmm_vars(self):
- lmm_vars_pickled = Redis.hget(self.identifier, "lmm_vars")
- lmm_vars = pickle.loads(lmm_vars_pickled)
-
- self.pheno_vector = np.array(lmm_vars['pheno_vector'])
- self.covariate_matrix = np.array(lmm_vars['covariate_matrix'])
- self.kinship_matrix = np.array(lmm_vars['kinship_matrix'])
-
- def trim_matrices(self):
- v = np.isnan(self.pheno_vector)
- keep = True - v
- keep = keep.reshape((len(keep),))
-
- if v.sum():
- self.pheno_vector = self.pheno_vector[keep]
- self.covariate_matrix = self.covariate_matrix[keep,:]
- self.kinship_matrix = self.kinship_matrix[keep,:][:,keep]
-
- return keep
-
- def do_association(self, keep):
- n = self.kinship_matrix.shape[0]
- lmm_ob = lmm.LMM(self.pheno_vector,
- self.kinship_matrix,
- self.covariate_matrix)
- lmm_ob.fit()
-
- self.p_values = []
-
- for snp in self.snps:
- snp = snp[0]
- p_value, t_stat = lmm.human_association(snp,
- n,
- keep,
- lmm_ob,
- self.pheno_vector,
- self.covariate_matrix,
- self.kinship_matrix,
- False)
-
- self.p_values.append(p_value)
-
-
-#plink_pickled = zlib.decompress(Redis.lpop("plink_inputs"))
-#
-#plink_data = pickle.loads(plink_pickled)
-#result = np.array(plink_data['result'])
-#print("snp size is: ", result.shape)
-#identifier = plink_data['identifier']
-#
-#lmm_vars_pickled = Redis.hget(identifier, "lmm_vars")
-#lmm_vars = pickle.loads(lmm_vars_pickled)
-#
-#pheno_vector = np.array(lmm_vars['pheno_vector'])
-#covariate_matrix = np.array(lmm_vars['covariate_matrix'])
-#kinship_matrix = np.array(lmm_vars['kinship_matrix'])
-#
-#v = np.isnan(pheno_vector)
-#keep = True - v
-#keep = keep.reshape((len(keep),))
-#print("keep is: ", keep)
-#
-#if v.sum():
-# pheno_vector = pheno_vector[keep]
-# covariate_matrix = covariate_matrix[keep,:]
-# kinship_matrix = kinship_matrix[keep,:][:,keep]
-#
-#n = kinship_matrix.shape[0]
-#print("n is: ", n)
-#lmm_ob = lmm.LMM(pheno_vector,
-# kinship_matrix,
-# covariate_matrix)
-#lmm_ob.fit()
-#
-#p_values = []
-#
-#for snp in result:
-# snp = snp[0]
-# p_value, t_stat = lmm.human_association(snp,
-# n,
-# keep,
-# lmm_ob,
-# pheno_vector,
-# covariate_matrix,
-# kinship_matrix,
-# False)
-#
-# p_values.append(p_value)
-
-
-
-
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 738268be..469ba6c9 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -21,18 +21,22 @@ from optparse import OptionParser
import sys
import tsvreader
import numpy as np
-from lmm import gn2_load_redis
+from lmm import gn2_load_redis, calculate_kinship
+from kinship import kinship, kinship_full
+import genotype
+import phenotype
usage = """
python runlmm.py [options] command
runlmm.py processing multiplexer reads standardised input formats
- and calls the different routines
+ and calls the different routines (writes to stdout)
Current commands are:
parse : only parse input files
redis : use Redis to call into GN2
+ kinship : calculate (new) kinship matrix
try --help for more information
"""
@@ -47,12 +51,28 @@ parser.add_option("--pheno",dest="pheno",
help="Phenotype file format 1.0")
parser.add_option("--geno",dest="geno",
help="Genotype file format 1.0")
+parser.add_option("--maf-normalization",
+ action="store_true", dest="maf_normalization", default=False,
+ help="Apply MAF genotype normalization")
+parser.add_option("--skip-genotype-normalization",
+ action="store_true", dest="skip_genotype_normalization", default=False,
+ help="Skip genotype normalization")
parser.add_option("-q", "--quiet",
action="store_false", dest="verbose", default=True,
help="don't print status messages to stdout")
+parser.add_option("--blas", action="store_true", default=False, dest="useBLAS", help="Use BLAS instead of numpy matrix multiplication")
+parser.add_option("-t", "--threads",
+ type="int", dest="numThreads",
+ help="Threads to use")
+parser.add_option("--saveKvaKve",
+ action="store_true", dest="saveKvaKve", default=False,
+ help="Testing mode")
parser.add_option("--test",
action="store_true", dest="testing", default=False,
help="Testing mode")
+parser.add_option("--test-kinship",
+ action="store_true", dest="test_kinship", default=False,
+ help="Testing mode for Kinship calculation")
(options, args) = parser.parse_args()
@@ -63,6 +83,10 @@ if len(args) != 1:
cmd = args[0]
print "Command: ",cmd
+k = None
+y = None
+g = None
+
if options.kinship:
k = tsvreader.kinship(options.kinship)
print k.shape
@@ -75,46 +99,79 @@ if options.geno:
g = tsvreader.geno(options.geno)
print g.shape
-def normalizeGenotype(G):
- # Run for every SNP list (num individuals)
- x = True - np.isnan(G) # Matrix of True/False
- m = G[x].mean() # Global mean value
- # print m
- s = np.sqrt(G[x].var()) # Global stddev
- # print s
- G[np.isnan(G)] = m # Plug-in mean values for missing
- if s == 0:
- G = G - m # Subtract the mean
- else:
- G = (G - m) / s # Normalize the deviation
- return G
-
-# g = g.reshape((1, -1))[0]
-print("Before",g)
-gn = []
-for snp in g:
- gn.append( normalizeGenotype(snp) )
-
-gn = g
-gn = np.array(gn)
-print("After1",gn)
-gnT = gn.T
-print("After",gnT)
-# G = gnT
-G = gnT
-print "G shape",G.shape
-# sys.exit(1)
-# assert(G[0,0]==-2.25726341)
-
-# Remove individuals with missing phenotypes
-v = np.isnan(y)
-keep = True - v
-if v.sum():
- if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
- y = y[keep]
- G = G[keep,:]
- k = k[keep,:][:,keep]
-
if cmd == 'redis':
- ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
+ # Emulating the redis setup of GN2
+ G = g
+ print "Original G",G.shape, "\n", G
+ if y != None:
+ gnt = np.array(g).T
+ Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+ G = g.T
+ print "Removed missing phenotypes",G.shape, "\n", G
+ if options.maf_normalization:
+ G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+ print "MAF replacements: \n",G
+ if not options.skip_genotype_normalization:
+ G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+ g = None
+ gnt = None
+
+ ps, ts = gn2_load_redis('testrun','other',k,Y,G.T)
print np.array(ps)
+ # Test results
+ p1 = round(ps[0],4)
+ p2 = round(ps[-1],4)
+ sys.stderr.write(options.geno+"\n")
+ if options.geno == 'data/small.geno':
+ assert p1==0.0708, "p1=%f" % p1
+ assert p2==0.1417, "p2=%f" % p2
+ if options.geno == 'data/small_na.geno':
+ assert p1==0.0958, "p1=%f" % p1
+ assert p2==0.0435, "p2=%f" % p2
+ if options.geno == 'data/test8000.geno':
+ assert p1==0.8984, "p1=%f" % p1
+ assert p2==0.9623, "p2=%f" % p2
+elif cmd == 'kinship':
+ G = g
+ print "Original G",G.shape, "\n", G
+ if y != None:
+ gnt = np.array(g).T
+ Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+ G = g.T
+ print "Removed missing phenotypes",G.shape, "\n", G
+ if options.maf_normalization:
+ G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+ print "MAF replacements: \n",G
+ if not options.skip_genotype_normalization:
+ G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+ g = None
+ gnt = None
+
+ if options.test_kinship:
+ K = kinship_full(G)
+ print "Genotype",G.shape, "\n", G
+ print "first Kinship method",K.shape,"\n",K
+ k1 = round(K[0][0],4)
+ K2 = calculate_kinship(np.copy(G.T),temp_data=None)
+ print "Genotype",G.shape, "\n", G
+ print "GN2 Kinship method",K2.shape,"\n",K2
+ k2 = round(K2[0][0],4)
+
+ print "Genotype",G.shape, "\n", G
+ K3 = kinship(np.copy(G),options)
+ print "third Kinship method",K3.shape,"\n",K3
+ sys.stderr.write(options.geno+"\n")
+ k3 = round(K3[0][0],4)
+ if options.geno == 'data/small.geno':
+ assert k1==0.7939, "k1=%f" % k1
+ assert k2==0.7939, "k2=%f" % k2
+ assert k3==0.7939, "k3=%f" % k3
+ if options.geno == 'data/small_na.geno':
+ assert k1==0.7172, "k1=%f" % k1
+ assert k2==0.7172, "k2=%f" % k2
+ assert k3==0.7172, "k3=%f" % k3
+ if options.geno == 'data/test8000.geno':
+ assert k3==1.4352, "k3=%f" % k3
+
+else:
+ print "Doing nothing"