diff options
author | zsloan | 2015-03-17 19:17:58 +0000 |
---|---|---|
committer | zsloan | 2015-03-17 19:17:58 +0000 |
commit | 667a8441c6f549dd9820838ea9fd8d77368dd767 (patch) | |
tree | e1bed7b65ff4ac1eeefb3a28d1ddf0453d1efbcf | |
parent | 29217946be40bcf8635e65bd381126fd30b00cc4 (diff) | |
parent | 41ad9e7ae1807398be80a65f9427f3f799c1f200 (diff) | |
download | genenetwork2-667a8441c6f549dd9820838ea9fd8d77368dd767.tar.gz |
Merge branch 'master' of github.com:pjotrp/genenetwork2 into pjotr_test
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/chunks.py | 2 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 50 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 158 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 45 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py | 2 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 40 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/process_plink.py | 127 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 143 |
8 files changed, 376 insertions, 191 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py index abeffee7..9565fb96 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py @@ -93,4 +93,4 @@ if __name__ == '__main__': _main() print("\nConfirming doctests...") import doctest - doctest.testmod()
\ No newline at end of file + doctest.testmod() diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py new file mode 100644 index 00000000..315fd824 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -0,0 +1,50 @@ +# Genotype routines + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import numpy as np +from collections import Counter +import operator + +def replace_missing_with_MAF(snp_g): + """ + Replace the missing genotype with the minor allele frequency (MAF) + in the snp row. It is rather slow! + """ + cnt = Counter(snp_g) + tuples = sorted(cnt.items(), key=operator.itemgetter(1)) + l2 = [t for t in tuples if not np.isnan(t[0])] + maf = l2[0][0] + res = np.array([maf if np.isnan(snp) else snp for snp in snp_g]) + return res + +def normalize(ind_g): + """ + Run for every SNP list (for one individual) and return + normalized SNP genotype values with missing data filled in + """ + g1 = np.copy(ind_g) # avoid side effects + x = True - np.isnan(ind_g) # Matrix of True/False + m = ind_g[x].mean() # Global mean value + s = np.sqrt(ind_g[x].var()) # Global stddev + g1[np.isnan(ind_g)] = m # Plug-in mean values for missing data + if s == 0: + g1 = g1 - m # Subtract the mean + else: + g1 = (g1 - m) / s # Normalize the deviation + return g1 + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py new file mode 100644 index 00000000..9ab48510 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -0,0 +1,158 @@ +# pylmm kinship calculation +# +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# env PYTHONPATH=$pylmm_lib_path:./lib python $pylmm_lib_path/runlmm.py --pheno test.pheno --geno test9000.geno kinship --test + +import sys +import os +import numpy as np +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue +import time + +from optmatrix import matrix_initialize, matrixMultT + +def kinship_full(G): + """ + Calculate the Kinship matrix using a full dot multiplication + """ + print G.shape + m = G.shape[0] # snps + n = G.shape[1] # inds + sys.stderr.write(str(m)+" SNPs\n") + assert m>n, "n should be larger than m (snps>inds)" + m = np.dot(G.T,G) + m = m/G.shape[0] + return m + +def compute_W(job,G,n,snps,compute_size): + """ + Read 1000 SNPs at a time into matrix and return the result + """ + m = compute_size + W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) + for j in range(0,compute_size): + pos = job*m + j # real position + if pos >= snps: + W = W[:,range(0,j)] + break + snp = G[job*compute_size+j] + if snp.var() == 0: + continue + W[:,j] = snp # set row to list of SNPs + return W + +def compute_matrixMult(job,W,q = None): + """ + Compute Kinship(W)*j + + For every set of SNPs matrixMult is used to multiply matrices T(W)*W + """ + res = matrixMultT(W) + if not q: q=compute_matrixMult.q + q.put([job,res]) + return job + +def f_init(q): + compute_matrixMult.q = q + +# Calculate the kinship matrix from G (SNPs as rows!), returns K +# +def kinship(G,options): + numThreads = None + if options.numThreads: + numThreads = int(options.numThreads) + options.computeSize = 1000 + matrix_initialize(options.useBLAS) + + sys.stderr.write(str(G.shape)+"\n") + n = G.shape[1] # inds + inds = n + m = G.shape[0] # snps + snps = m + sys.stderr.write(str(m)+" SNPs\n") + assert m>n, "n should be larger than m (snps>inds)" + + q = mp.Queue() + p = mp.Pool(numThreads, f_init, [q]) + cpu_num = mp.cpu_count() + print "CPU cores:",cpu_num + iterations = snps/options.computeSize+1 + if options.testing: + iterations = 8 + # jobs = range(0,8) # range(0,iterations) + + results = [] + + K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals + + completed = 0 + for job in range(iterations): + if options.verbose: + sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) + W = compute_W(job,G,n,snps,options.computeSize) + if numThreads == 1: + # Single-core + compute_matrixMult(job,W,q) + j,x = q.get() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + else: + # Multi-core + results.append(p.apply_async(compute_matrixMult, (job,W))) + # Do we have a result? + while (len(results)-completed>cpu_num*2): + time.sleep(0.1) + try: + j,x = q.get_nowait() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + except Queue.Empty: + pass + + if numThreads == None or numThreads > 1: + # results contains the growing result set + for job in range(len(results)-completed): + j,x = q.get(True,15) + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + + K = K / float(snps) + + # outFile = 'runtest.kin' + # if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) + # np.savetxt(outFile,K) + + # if options.saveKvaKve: + # if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") + # Kva,Kve = linalg.eigh(K) + # if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + # np.savetxt(outFile+".kva",Kva) + # np.savetxt(outFile+".kve",Kve) + return K + + + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 16073d2c..99a5a940 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -143,6 +143,7 @@ def run_human(pheno_vector, timestamp = datetime.datetime.utcnow().isoformat() + # Pickle chunks of input SNPs (from Plink interator) and compress them #print("Starting adding loop") for part, result in enumerate(results): #data = pickle.dumps(result, pickle.HIGHEST_PROTOCOL) @@ -254,8 +255,8 @@ def run_other(pheno_vector, genotype_matrix, restricted_max_likelihood=True, refit=False, - tempdata=None, # <---- can not be None - is_testing=False): + tempdata=None # <---- can not be None + ): """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics @@ -267,9 +268,9 @@ def run_other(pheno_vector, """ print("In run_other") - print("REML=",restricted_max_likelihood," REFIT=",refit, "TESTING=",is_testing) + print("REML=",restricted_max_likelihood," REFIT=",refit) with Bench("Calculate Kinship"): - kinship_matrix = calculate_kinship(genotype_matrix, tempdata, is_testing) + kinship_matrix = calculate_kinship(genotype_matrix, tempdata) print("kinship_matrix: ", pf(kinship_matrix)) print("kinship_matrix.shape: ", pf(kinship_matrix.shape)) @@ -325,7 +326,7 @@ def matrixMult(A,B): return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) -def calculate_kinship(genotype_matrix, temp_data, is_testing=False): +def calculate_kinship(genotype_matrix, temp_data=None): """ genotype_matrix is an n x m matrix encoding SNP minor alleles. @@ -337,10 +338,9 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False): m = genotype_matrix.shape[1] print("genotype 2D matrix n (inds) is:", n) print("genotype 2D matrix m (snps) is:", m) + assert m>n, "n should be larger than m (snps>inds)" keep = [] for counter in range(m): - if is_testing and counter>8: - break #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter])) #Checks if any values in column are not numbers not_number = np.isnan(genotype_matrix[:,counter]) @@ -360,7 +360,8 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False): genotype_matrix[:,counter] = (genotype_matrix[:,counter] - values_mean) / np.sqrt(vr) percent_complete = int(round((counter/m)*45)) - temp_data.store("percent_complete", percent_complete) + if temp_data != None: + temp_data.store("percent_complete", percent_complete) genotype_matrix = genotype_matrix[:,keep] print("genotype_matrix: ", pf(genotype_matrix)) @@ -406,6 +407,8 @@ def GWAS(pheno_vector, v = np.isnan(pheno_vector) if v.sum(): keep = True - v + print(pheno_vector.shape,pheno_vector) + print(keep.shape,keep) pheno_vector = pheno_vector[keep] #genotype_matrix = genotype_matrix[keep,:] #covariate_matrix = covariate_matrix[keep,:] @@ -437,6 +440,8 @@ def GWAS(pheno_vector, p_values.append(0) t_statistics.append(np.nan) continue + + print(genotype_matrix.shape,pheno_vector.shape,keep.shape) pheno_vector = pheno_vector[keep] covariate_matrix = covariate_matrix[keep,:] @@ -484,7 +489,7 @@ class LMM: is not done consistently. """ - def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True,is_testing=False): + def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True): """ The constructor takes a phenotype vector or array of size n. @@ -494,7 +499,6 @@ class LMM: When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. """ - self.is_testing = True if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1) self.verbose = verbose @@ -513,7 +517,7 @@ class LMM: Kve = [] self.nonmissing = x - print("this K is:", pf(K)) + print("this K is:", K.shape, pf(K)) if len(Kva) == 0 or len(Kve) == 0: if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) @@ -525,8 +529,8 @@ class LMM: self.K = K self.Kva = Kva self.Kve = Kve - print("self.Kva is: ", pf(self.Kva)) - print("self.Kve is: ", pf(self.Kve)) + print("self.Kva is: ", self.Kva.shape, pf(self.Kva)) + print("self.Kve is: ", self.Kve.shape, pf(self.Kve)) self.Y = Y self.X0 = X0 self.N = self.K.shape[0] @@ -722,7 +726,7 @@ class LMM: pl.title(title) -def gn2_redis(key,species,is_testing=False): +def gn2_redis(key,species): json_params = Redis.get(key) params = json.loads(json_params) @@ -749,8 +753,7 @@ def gn2_redis(key,species,is_testing=False): genotype_matrix = geno, restricted_max_likelihood = params['restricted_max_likelihood'], refit = params['refit'], - tempdata = tempdata, - is_testing=is_testing) + tempdata = tempdata) results_key = "pylmm:results:" + params['temp_uuid'] @@ -775,11 +778,15 @@ def gn2_main(): gn2_redis(key,species) -def gn2_load_redis(key,species,kinship,pheno,geno,is_testing): +def gn2_load_redis(key,species,kinship,pheno,geno): print("Loading Redis from parsed data") + if kinship == None: + k = None + else: + k = kinship.tolist() params = dict(pheno_vector = pheno.tolist(), genotype_matrix = geno.tolist(), - kinship_matrix= kinship.tolist(), + kinship_matrix= k, restricted_max_likelihood = True, refit = False, temp_uuid = "testrun_temp_uuid", @@ -792,7 +799,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno,is_testing): Redis.set(key, json_params) Redis.expire(key, 60*60) - return gn2_redis(key,species,is_testing) + return gn2_redis(key,species) if __name__ == '__main__': print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py index abb72081..5c71db6a 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py @@ -9,7 +9,7 @@ from scipy import stats useNumpy = None hasBLAS = None -def matrix_initialize(useBLAS): +def matrix_initialize(useBLAS=True): global useNumpy # module based variable if useBLAS and useNumpy == None: print get_info('blas_opt') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py new file mode 100644 index 00000000..bb620052 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -0,0 +1,40 @@ +# Phenotype routines + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import sys +import numpy as np + +def remove_missing(y,g,verbose=False): + """ + Remove missing data from matrices, make sure the genotype data has + individuals as rows + """ + assert(y!=None) + assert(y.shape[0] == g.shape[0]) + + y1 = y + g1 = g + v = np.isnan(y) + keep = True - v + if v.sum(): + if verbose: + sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) + y1 = y[keep] + g1 = g[keep,:] + return y1,g1 + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py deleted file mode 100644 index e47c18e1..00000000 --- a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import absolute_import, print_function, division - -import sys -sys.path.append("../../..") - -print("sys.path: ", sys.path) - -import numpy as np - -import zlib -import cPickle as pickle -import redis -Redis = redis.Redis() - -import lmm - -class ProcessLmmChunk(object): - - def __init__(self): - self.get_snp_data() - self.get_lmm_vars() - - keep = self.trim_matrices() - - self.do_association(keep) - - print("p_values is: ", self.p_values) - - def get_snp_data(self): - plink_pickled = zlib.decompress(Redis.lpop("plink_inputs")) - plink_data = pickle.loads(plink_pickled) - - self.snps = np.array(plink_data['result']) - self.identifier = plink_data['identifier'] - - def get_lmm_vars(self): - lmm_vars_pickled = Redis.hget(self.identifier, "lmm_vars") - lmm_vars = pickle.loads(lmm_vars_pickled) - - self.pheno_vector = np.array(lmm_vars['pheno_vector']) - self.covariate_matrix = np.array(lmm_vars['covariate_matrix']) - self.kinship_matrix = np.array(lmm_vars['kinship_matrix']) - - def trim_matrices(self): - v = np.isnan(self.pheno_vector) - keep = True - v - keep = keep.reshape((len(keep),)) - - if v.sum(): - self.pheno_vector = self.pheno_vector[keep] - self.covariate_matrix = self.covariate_matrix[keep,:] - self.kinship_matrix = self.kinship_matrix[keep,:][:,keep] - - return keep - - def do_association(self, keep): - n = self.kinship_matrix.shape[0] - lmm_ob = lmm.LMM(self.pheno_vector, - self.kinship_matrix, - self.covariate_matrix) - lmm_ob.fit() - - self.p_values = [] - - for snp in self.snps: - snp = snp[0] - p_value, t_stat = lmm.human_association(snp, - n, - keep, - lmm_ob, - self.pheno_vector, - self.covariate_matrix, - self.kinship_matrix, - False) - - self.p_values.append(p_value) - - -#plink_pickled = zlib.decompress(Redis.lpop("plink_inputs")) -# -#plink_data = pickle.loads(plink_pickled) -#result = np.array(plink_data['result']) -#print("snp size is: ", result.shape) -#identifier = plink_data['identifier'] -# -#lmm_vars_pickled = Redis.hget(identifier, "lmm_vars") -#lmm_vars = pickle.loads(lmm_vars_pickled) -# -#pheno_vector = np.array(lmm_vars['pheno_vector']) -#covariate_matrix = np.array(lmm_vars['covariate_matrix']) -#kinship_matrix = np.array(lmm_vars['kinship_matrix']) -# -#v = np.isnan(pheno_vector) -#keep = True - v -#keep = keep.reshape((len(keep),)) -#print("keep is: ", keep) -# -#if v.sum(): -# pheno_vector = pheno_vector[keep] -# covariate_matrix = covariate_matrix[keep,:] -# kinship_matrix = kinship_matrix[keep,:][:,keep] -# -#n = kinship_matrix.shape[0] -#print("n is: ", n) -#lmm_ob = lmm.LMM(pheno_vector, -# kinship_matrix, -# covariate_matrix) -#lmm_ob.fit() -# -#p_values = [] -# -#for snp in result: -# snp = snp[0] -# p_value, t_stat = lmm.human_association(snp, -# n, -# keep, -# lmm_ob, -# pheno_vector, -# covariate_matrix, -# kinship_matrix, -# False) -# -# p_values.append(p_value) - - - - diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 738268be..469ba6c9 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -21,18 +21,22 @@ from optparse import OptionParser import sys import tsvreader import numpy as np -from lmm import gn2_load_redis +from lmm import gn2_load_redis, calculate_kinship +from kinship import kinship, kinship_full +import genotype +import phenotype usage = """ python runlmm.py [options] command runlmm.py processing multiplexer reads standardised input formats - and calls the different routines + and calls the different routines (writes to stdout) Current commands are: parse : only parse input files redis : use Redis to call into GN2 + kinship : calculate (new) kinship matrix try --help for more information """ @@ -47,12 +51,28 @@ parser.add_option("--pheno",dest="pheno", help="Phenotype file format 1.0") parser.add_option("--geno",dest="geno", help="Genotype file format 1.0") +parser.add_option("--maf-normalization", + action="store_true", dest="maf_normalization", default=False, + help="Apply MAF genotype normalization") +parser.add_option("--skip-genotype-normalization", + action="store_true", dest="skip_genotype_normalization", default=False, + help="Skip genotype normalization") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") +parser.add_option("--blas", action="store_true", default=False, dest="useBLAS", help="Use BLAS instead of numpy matrix multiplication") +parser.add_option("-t", "--threads", + type="int", dest="numThreads", + help="Threads to use") +parser.add_option("--saveKvaKve", + action="store_true", dest="saveKvaKve", default=False, + help="Testing mode") parser.add_option("--test", action="store_true", dest="testing", default=False, help="Testing mode") +parser.add_option("--test-kinship", + action="store_true", dest="test_kinship", default=False, + help="Testing mode for Kinship calculation") (options, args) = parser.parse_args() @@ -63,6 +83,10 @@ if len(args) != 1: cmd = args[0] print "Command: ",cmd +k = None +y = None +g = None + if options.kinship: k = tsvreader.kinship(options.kinship) print k.shape @@ -75,46 +99,79 @@ if options.geno: g = tsvreader.geno(options.geno) print g.shape -def normalizeGenotype(G): - # Run for every SNP list (num individuals) - x = True - np.isnan(G) # Matrix of True/False - m = G[x].mean() # Global mean value - # print m - s = np.sqrt(G[x].var()) # Global stddev - # print s - G[np.isnan(G)] = m # Plug-in mean values for missing - if s == 0: - G = G - m # Subtract the mean - else: - G = (G - m) / s # Normalize the deviation - return G - -# g = g.reshape((1, -1))[0] -print("Before",g) -gn = [] -for snp in g: - gn.append( normalizeGenotype(snp) ) - -gn = g -gn = np.array(gn) -print("After1",gn) -gnT = gn.T -print("After",gnT) -# G = gnT -G = gnT -print "G shape",G.shape -# sys.exit(1) -# assert(G[0,0]==-2.25726341) - -# Remove individuals with missing phenotypes -v = np.isnan(y) -keep = True - v -if v.sum(): - if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) - y = y[keep] - G = G[keep,:] - k = k[keep,:][:,keep] - if cmd == 'redis': - ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) + # Emulating the redis setup of GN2 + G = g + print "Original G",G.shape, "\n", G + if y != None: + gnt = np.array(g).T + Y,g = phenotype.remove_missing(y,g.T,options.verbose) + G = g.T + print "Removed missing phenotypes",G.shape, "\n", G + if options.maf_normalization: + G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) + print "MAF replacements: \n",G + if not options.skip_genotype_normalization: + G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) + g = None + gnt = None + + ps, ts = gn2_load_redis('testrun','other',k,Y,G.T) print np.array(ps) + # Test results + p1 = round(ps[0],4) + p2 = round(ps[-1],4) + sys.stderr.write(options.geno+"\n") + if options.geno == 'data/small.geno': + assert p1==0.0708, "p1=%f" % p1 + assert p2==0.1417, "p2=%f" % p2 + if options.geno == 'data/small_na.geno': + assert p1==0.0958, "p1=%f" % p1 + assert p2==0.0435, "p2=%f" % p2 + if options.geno == 'data/test8000.geno': + assert p1==0.8984, "p1=%f" % p1 + assert p2==0.9623, "p2=%f" % p2 +elif cmd == 'kinship': + G = g + print "Original G",G.shape, "\n", G + if y != None: + gnt = np.array(g).T + Y,g = phenotype.remove_missing(y,g.T,options.verbose) + G = g.T + print "Removed missing phenotypes",G.shape, "\n", G + if options.maf_normalization: + G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) + print "MAF replacements: \n",G + if not options.skip_genotype_normalization: + G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) + g = None + gnt = None + + if options.test_kinship: + K = kinship_full(G) + print "Genotype",G.shape, "\n", G + print "first Kinship method",K.shape,"\n",K + k1 = round(K[0][0],4) + K2 = calculate_kinship(np.copy(G.T),temp_data=None) + print "Genotype",G.shape, "\n", G + print "GN2 Kinship method",K2.shape,"\n",K2 + k2 = round(K2[0][0],4) + + print "Genotype",G.shape, "\n", G + K3 = kinship(np.copy(G),options) + print "third Kinship method",K3.shape,"\n",K3 + sys.stderr.write(options.geno+"\n") + k3 = round(K3[0][0],4) + if options.geno == 'data/small.geno': + assert k1==0.7939, "k1=%f" % k1 + assert k2==0.7939, "k2=%f" % k2 + assert k3==0.7939, "k3=%f" % k3 + if options.geno == 'data/small_na.geno': + assert k1==0.7172, "k1=%f" % k1 + assert k2==0.7172, "k2=%f" % k2 + assert k3==0.7172, "k3=%f" % k3 + if options.geno == 'data/test8000.geno': + assert k3==1.4352, "k3=%f" % k3 + +else: + print "Doing nothing" |