aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--[-rwxr-xr-x]wqflask/wqflask/my_pylmm/pyLMM/__init__.py0
-rw-r--r--[-rwxr-xr-x]wqflask/wqflask/my_pylmm/pyLMM/chunks.py0
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py178
-rw-r--r--[-rwxr-xr-x]wqflask/wqflask/my_pylmm/pyLMM/input.py6
-rw-r--r--[-rwxr-xr-x]wqflask/wqflask/my_pylmm/pyLMM/lmm.py105
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py55
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/plink.py107
-rw-r--r--[-rwxr-xr-x]wqflask/wqflask/my_pylmm/pyLMM/process_plink.py0
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/runlmm.py92
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py76
-rwxr-xr-xwqflask/wqflask/my_pylmm/scripts/example.py (renamed from wqflask/wqflask/my_pylmm/example.py)0
-rwxr-xr-xwqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py (renamed from wqflask/wqflask/my_pylmm/pylmmGWAS.py)0
-rwxr-xr-xwqflask/wqflask/my_pylmm/scripts/pylmmKinship.py (renamed from wqflask/wqflask/my_pylmm/pylmmKinship.py)0
-rwxr-xr-xwqflask/wqflask/my_pylmm/scripts/run_pylmm.py (renamed from wqflask/wqflask/my_pylmm/run_pylmm.py)0
m---------wqflask/wqflask/pylmm0
15 files changed, 564 insertions, 55 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
index e69de29b..e69de29b 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
index abeffee7..abeffee7 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py
new file mode 100644
index 00000000..3b6b5d70
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py
@@ -0,0 +1,178 @@
+# This is a converter for common LMM formats, so as to keep complexity
+# outside the main routines.
+
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import print_function
+from optparse import OptionParser
+import sys
+import os
+import numpy as np
+# from lmm import LMM, run_other
+# import input
+import plink
+
+usage = """
+python convertlmm.py [--plink] [--prefix out_basename] [--kinship kfile] [--pheno pname] [--geno gname]
+
+ Convert files for runlmm.py processing. Writes to stdout by default.
+
+ try --help for more information
+"""
+
+# if len(args) == 0:
+# print usage
+# sys.exit(1)
+
+option_parser = OptionParser(usage=usage)
+option_parser.add_option("--kinship", dest="kinship",
+ help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program")
+option_parser.add_option("--pheno", dest="pheno",
+ help="Parse a phenotype file (use with --plink only)")
+option_parser.add_option("--geno", dest="geno",
+ help="Parse a genotype file (use with --plink only)")
+option_parser.add_option("--plink", dest="plink", action="store_true", default=False,
+ help="Parse PLINK style")
+# option_parser.add_option("--kinship",action="store_false", dest="kinship", default=True,
+# help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.")
+option_parser.add_option("--prefix", dest="prefix",
+ help="Output prefix for output file(s)")
+option_parser.add_option("-q", "--quiet",
+ action="store_false", dest="verbose", default=True,
+ help="don't print status messages to stdout")
+option_parser.add_option("-v", "--verbose",
+ action="store_true", dest="verbose", default=False,
+ help="Print extra info")
+
+(options, args) = option_parser.parse_args()
+
+writer = None
+num_inds = None
+snp_names = []
+ind_names = []
+
+def msg(s):
+ sys.stderr.write("INFO: ")
+ sys.stderr.write(s)
+ sys.stderr.write("\n")
+
+def wr(s):
+ if writer:
+ writer.write(s)
+ else:
+ sys.stdout.write(s)
+
+def wrln(s):
+ wr(s)
+ wr("\n")
+
+
+if options.pheno:
+ if not options.plink:
+ raise Exception("Use --plink switch")
+ # Because plink does not track size we need to read the whole thing first
+ msg("Converting pheno "+options.pheno)
+ phenos = []
+ count = 0
+ count_pheno = None
+ for line in open(options.pheno,'r'):
+ count += 1
+ list = line.split()
+ pcount = len(list)-2
+ assert(pcount > 0)
+ if count_pheno == None:
+ count_pheno = pcount
+ assert(count_pheno == pcount)
+ row = [list[0]]+list[2:]
+ phenos.append(row)
+
+ writer = None
+ if options.prefix:
+ writer = open(options.prefix+".pheno","w")
+ wrln("# Phenotype format version 1.0")
+ wrln("# Individuals = "+str(count))
+ wrln("# Phenotypes = "+str(count_pheno))
+ for i in range(count_pheno):
+ wr("\t"+str(i+1))
+ wr("\n")
+ for i in range(count):
+ wr("\t".join(phenos[i]))
+ wr("\n")
+ num_inds = count
+ msg(str(count)+" pheno lines written")
+
+if options.kinship:
+ is_header = True
+ count = 0
+ msg("Converting kinship "+options.kinship)
+ writer = None
+ if options.prefix:
+ writer = open(options.prefix+".kin","w")
+ for line in open(options.kinship,'r'):
+ count += 1
+ if is_header:
+ size = len(line.split())
+ wrln("# Kinship format version 1.0")
+ wrln("# Size="+str(size))
+ for i in range(size):
+ wr("\t"+str(i+1))
+ wr("\n")
+ is_header = False
+ wr(str(count))
+ wr("\t")
+ wr("\t".join(line.split()))
+ wr("\n")
+ num_inds = count
+ msg(str(count)+" kinship lines written")
+
+if options.geno:
+ msg("Converting geno "+options.geno+'.bed')
+ if not options.plink:
+ raise Exception("Use --plink switch")
+ if not num_inds:
+ raise Exception("Can not figure out the number of individuals, use --pheno or --kinship")
+ bim_snps = plink.readbim(options.geno+'.bim')
+ num_snps = len(bim_snps)
+ writer = None
+ if options.prefix:
+ writer = open(options.prefix+".geno","w")
+ wrln("# Genotype format version 1.0")
+ wrln("# Individuals = "+str(num_inds))
+ wrln("# SNPs = "+str(num_snps))
+ wrln("# Encoding = HAB")
+ for i in range(num_inds):
+ wr("\t"+str(i+1))
+ wr("\n")
+
+ m = []
+ def out(i,x):
+ # wr(str(i)+"\t")
+ # wr("\t".join(x))
+ # wr("\n")
+ m.append(x)
+
+ snps = plink.readbed(options.geno+'.bed',num_inds, ('A','H','B','-'), out)
+
+ msg("Write transposed genotype matrix")
+ for g in range(num_snps):
+ wr(bim_snps[g][1]+"\t")
+ for i in range(num_inds):
+ wr(m[g][i])
+ wr("\n")
+
+ msg(str(count)+" geno lines written (with "+str(snps)+" snps)")
+
+msg("Converting done")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py
index dfe28a4e..f7b556a5 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/input.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py
@@ -133,13 +133,15 @@ class plink:
return G
def normalizeGenotype(self,G):
+ # print "Before",G
+ # print G.shape
x = True - np.isnan(G)
m = G[x].mean()
s = np.sqrt(G[x].var())
G[np.isnan(G)] = m
if s == 0: G = G - m
else: G = (G - m) / s
-
+ # print "After",G
return G
def getPhenos(self,phenoFile=None):
@@ -260,4 +262,4 @@ class plink:
L.append(D[self.indivs[i]])
P = P[L,:]
- return P \ No newline at end of file
+ return P
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 369fcbcc..58d7593d 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -45,11 +45,17 @@ import sys
sys.path.append("/home/zas1024/gene/wqflask/")
print("sys.path2:", sys.path)
+has_gn2=True
+
from utility.benchmark import Bench
from utility import temp_data
-from wqflask.my_pylmm.pyLMM import chunks
-
+try:
+ from wqflask.my_pylmm.pyLMM import chunks
+except ImportError:
+ print("WARNING: Standalone version missing the Genenetwork2 environment\n")
+ has_gn2=False
+ pass
#np.seterr('raise')
@@ -248,20 +254,22 @@ def run_other(pheno_vector,
genotype_matrix,
restricted_max_likelihood=True,
refit=False,
- tempdata=None):
+ tempdata=None, # <---- can not be None
+ is_testing=False):
+
"""Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
restricted_max_likelihood -- whether to use restricted max likelihood; True or False
refit -- whether to refit the variance component for each marker
temp_data -- TempData object that stores the progress for each major step of the
- calculations ("calculate_kinship" and "GWAS" take the majority of time)
+ calculations ("calculate_kinship" and "GWAS" take the majority of time)
"""
-
print("In run_other")
+ print("REML=",restricted_max_likelihood," REFIT=",refit, "TESTING=",is_testing)
with Bench("Calculate Kinship"):
- kinship_matrix = calculate_kinship(genotype_matrix, tempdata)
+ kinship_matrix = calculate_kinship(genotype_matrix, tempdata, is_testing)
print("kinship_matrix: ", pf(kinship_matrix))
print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
@@ -273,6 +281,7 @@ def run_other(pheno_vector,
lmm_ob.fit()
print("genotype_matrix: ", genotype_matrix.shape)
+ print(genotype_matrix)
with Bench("Doing GWAS"):
t_stats, p_values = GWAS(pheno_vector,
@@ -288,6 +297,7 @@ def run_other(pheno_vector,
def matrixMult(A,B):
# If there is no fblas then we will revert to np.dot()
+
try:
linalg.fblas
except AttributeError:
@@ -315,7 +325,7 @@ def matrixMult(A,B):
return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
-def calculate_kinship(genotype_matrix, temp_data):
+def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
"""
genotype_matrix is an n x m matrix encoding SNP minor alleles.
@@ -325,10 +335,12 @@ def calculate_kinship(genotype_matrix, temp_data):
"""
n = genotype_matrix.shape[0]
m = genotype_matrix.shape[1]
- print("n is:", n)
- print("m is:", m)
+ print("genotype 2D matrix n (inds) is:", n)
+ print("genotype 2D matrix m (snps) is:", m)
keep = []
for counter in range(m):
+ if is_testing and counter>8:
+ break
#print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter]))
#Checks if any values in column are not numbers
not_number = np.isnan(genotype_matrix[:,counter])
@@ -472,7 +484,7 @@ class LMM:
is not done consistently.
"""
- def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True):
+ def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True,is_testing=False):
"""
The constructor takes a phenotype vector or array of size n.
@@ -481,7 +493,8 @@ class LMM:
X0 is an optional covariate matrix of size n x q, where there are q covariates.
When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
"""
-
+
+ self.is_testing = True
if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1)
self.verbose = verbose
@@ -501,14 +514,14 @@ class LMM:
self.nonmissing = x
print("this K is:", pf(K))
-
+
if len(Kva) == 0 or len(Kve) == 0:
if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
begin = time.time()
Kva,Kve = linalg.eigh(K)
end = time.time()
if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
-
+
self.K = K
self.Kva = Kva
self.Kve = Kve
@@ -708,24 +721,20 @@ class LMM:
pl.ylabel("Probability of data")
pl.title(title)
-def main():
- parser = argparse.ArgumentParser(description='Run pyLMM')
- parser.add_argument('-k', '--key')
- parser.add_argument('-s', '--species')
-
- opts = parser.parse_args()
-
- key = opts.key
- species = opts.species
-
+
+def gn2_redis(key,species,is_testing=False):
json_params = Redis.get(key)
params = json.loads(json_params)
- #print("params:", params)
-
- #print("kinship_matrix:", params['kinship_matrix'])
tempdata = temp_data.TempData(params['temp_uuid'])
+
+ print('kinship', np.array(params['kinship_matrix']))
+ print('pheno', np.array(params['pheno_vector']))
+ geno = np.array(params['genotype_matrix'])
+ print('geno', geno.shape, geno)
+ # sys.exit(1)
+
if species == "human" :
ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']),
covariate_matrix = np.array(params['covariate_matrix']),
@@ -735,10 +744,11 @@ def main():
tempdata = tempdata)
else:
ps, ts = run_other(pheno_vector = np.array(params['pheno_vector']),
- genotype_matrix = np.array(params['genotype_matrix']),
+ genotype_matrix = geno,
restricted_max_likelihood = params['restricted_max_likelihood'],
refit = params['refit'],
- tempdata = tempdata)
+ tempdata = tempdata,
+ is_testing=is_testing)
results_key = "pylmm:results:" + params['temp_uuid']
@@ -748,9 +758,46 @@ def main():
#Pushing json_results into a list where it is the only item because blpop needs a list
Redis.rpush(results_key, json_results)
Redis.expire(results_key, 60*60)
+ return ps, ts
+
+# This is the main function used by Genenetwork2 (with environment)
+def gn2_main():
+ parser = argparse.ArgumentParser(description='Run pyLMM')
+ parser.add_argument('-k', '--key')
+ parser.add_argument('-s', '--species')
+
+ opts = parser.parse_args()
+
+ key = opts.key
+ species = opts.species
+
+ gn2_redis(key,species)
+
+def gn2_load_redis(key,species,kinship,pheno,geno,is_testing):
+ print("Loading Redis from parsed data")
+ params = dict(pheno_vector = pheno.tolist(),
+ genotype_matrix = geno.tolist(),
+ kinship_matrix= kinship.tolist(),
+ restricted_max_likelihood = True,
+ refit = False,
+ temp_uuid = "testrun_temp_uuid",
+
+ # meta data
+ timestamp = datetime.datetime.now().isoformat(),
+ )
+
+ json_params = json.dumps(params)
+ Redis.set(key, json_params)
+ Redis.expire(key, 60*60)
+ return gn2_redis(key,species,is_testing)
+
if __name__ == '__main__':
- main()
+ print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!")
+ if has_gn2:
+ gn2_main()
+ else:
+ print("Run from runlmm.py instead")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
new file mode 100644
index 00000000..abb72081
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
@@ -0,0 +1,55 @@
+import sys
+import time
+import numpy as np
+from numpy.distutils.system_info import get_info;
+from scipy import linalg
+from scipy import optimize
+from scipy import stats
+
+useNumpy = None
+hasBLAS = None
+
+def matrix_initialize(useBLAS):
+ global useNumpy # module based variable
+ if useBLAS and useNumpy == None:
+ print get_info('blas_opt')
+ try:
+ linalg.fblas
+ sys.stderr.write("INFO: using linalg.fblas\n")
+ useNumpy = False
+ hasBLAS = True
+ except AttributeError:
+ sys.stderr.write("WARNING: linalg.fblas not found, using numpy.dot instead!\n")
+ useNumpy = True
+ else:
+ sys.stderr.write("INFO: using numpy.dot\n")
+ useNumpy=True
+
+def matrixMult(A,B):
+ global useNumpy # module based variable
+
+ if useNumpy:
+ return np.dot(A,B)
+
+ # If the matrices are in Fortran order then the computations will be faster
+ # when using dgemm. Otherwise, the function will copy the matrix and that takes time.
+ if not A.flags['F_CONTIGUOUS']:
+ AA = A.T
+ transA = True
+ else:
+ AA = A
+ transA = False
+
+ if not B.flags['F_CONTIGUOUS']:
+ BB = B.T
+ transB = True
+ else:
+ BB = B
+ transB = False
+
+ return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
+
+def matrixMultT(M):
+ # res = np.dot(W,W.T)
+ # return linalg.fblas.dgemm(alpha=1.,a=M.T,b=M.T,trans_a=True,trans_b=False)
+ return matrixMult(M,M.T)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py
new file mode 100644
index 00000000..7bd2df91
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py
@@ -0,0 +1,107 @@
+# Plink module
+#
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+# Some of the BED file parsing came from pylmm:
+# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# According to the PLINK information
+
+# Parse a textual BIM file and return the contents as a list of tuples
+#
+# Extended variant information file accompanying a .bed binary genotype table.
+#
+# A text file with no header line, and one line per variant with the following six fields:
+#
+# Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name
+# Variant identifier
+# Position in morgans or centimorgans (safe to use dummy value of '0')
+# Base-pair coordinate (normally 1-based, but 0 ok; limited to 231-2)
+# Allele 1 (corresponding to clear bits in .bed; usually minor)
+# Allele 2 (corresponding to set bits in .bed; usually major)
+#
+# Allele codes can contain more than one character. Variants with negative bp coordinates are ignored by PLINK. Example
+#
+# 1 mm37-1-3125499 0 3125499 1 2
+# 1 mm37-1-3125701 0 3125701 1 2
+# 1 mm37-1-3187481 0 3187481 1 2
+
+import struct
+# import numpy as np
+
+def readbim(fn):
+ res = []
+ for line in open(fn):
+ list = line.split()
+ if len([True for e in list if e == 'nan']) == 0:
+ res.append( (list[0],list[1],int(list[2]),int(list[3]),int(list[4]),int(list[5])) )
+ else:
+ res.append( (list[0],list[1],list[2],float('nan'),float('nan'),float('nan')) )
+ return res
+
+# .bed (PLINK binary biallelic genotype table)
+#
+# Primary representation of genotype calls at biallelic variants. Must
+# be accompanied by .bim and .fam files. Basically contains num SNP
+# blocks containing IND (compressed 4 IND into a byte)
+#
+# Since it is a biallelic format it supports for every individual
+# whether the first allele is homozygous (b00), the second allele is
+# homozygous (b11), it is heterozygous (b10) or that it is missing
+# (b01).
+
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bed
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim
+
+def readbed(fn,inds,encoding,func=None):
+
+ # For every SNP block fetch the individual genotypes using values
+ # 0.0 and 1.0 for homozygous and 0.5 for heterozygous alleles
+ def fetchGenotypes(X):
+ # D = { \
+ # '00': 0.0, \
+ # '10': 0.5, \
+ # '11': 1.0, \
+ # '01': float('nan') \
+ # }
+
+ Didx = { '00': 0, '10': 1, '11': 2, '01': 3 }
+ G = []
+ for x in X:
+ if not len(x) == 10:
+ xx = x[2:]
+ x = '0b' + '0'*(8 - len(xx)) + xx
+ a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4])
+ L = [encoding[Didx[y]] for y in [a,b,c,d]]
+ G += L
+ G = G[:inds]
+ # G = np.array(G)
+ return G
+
+ bytes = inds / 4 + (inds % 4 and 1 or 0)
+ format = 'c'*bytes
+ count = 0
+ with open(fn,'rb') as f:
+ magic = f.read(3)
+ assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01")
+ while True:
+ count += 1
+ X = f.read(bytes)
+ if not X:
+ return(count-1)
+ XX = [bin(ord(x)) for x in struct.unpack(format,X)]
+ xs = fetchGenotypes(XX)
+ func(count,xs)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
index e47c18e1..e47c18e1 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index ff53b4ea..738268be 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -19,32 +19,40 @@
from optparse import OptionParser
import sys
-import os
+import tsvreader
import numpy as np
-# from lmm import LMM, run_other
-import csv
-
+from lmm import gn2_load_redis
usage = """
python runlmm.py [options] command
- runlmm.py processing multiplexer reads standard input types and calls the routines
+ runlmm.py processing multiplexer reads standardised input formats
+ and calls the different routines
Current commands are:
parse : only parse input files
+ redis : use Redis to call into GN2
try --help for more information
"""
+
parser = OptionParser(usage=usage)
# parser.add_option("-f", "--file", dest="input file",
# help="In", metavar="FILE")
parser.add_option("--kinship",dest="kinship",
- help="Kinship file format")
+ help="Kinship file format 1.0")
+parser.add_option("--pheno",dest="pheno",
+ help="Phenotype file format 1.0")
+parser.add_option("--geno",dest="geno",
+ help="Genotype file format 1.0")
parser.add_option("-q", "--quiet",
action="store_false", dest="verbose", default=True,
help="don't print status messages to stdout")
+parser.add_option("--test",
+ action="store_true", dest="testing", default=False,
+ help="Testing mode")
(options, args) = parser.parse_args()
@@ -55,22 +63,58 @@ if len(args) != 1:
cmd = args[0]
print "Command: ",cmd
-
if options.kinship:
- K1 = []
- print options.kinship
- with open(options.kinship,'r') as tsvin:
- if tsvin.readline().strip() != "# Kinship format version 1.0":
- raise Exception("Expecting Kinship format version 1.0")
- tsvin.readline()
- tsvin.readline()
- tsv = csv.reader(tsvin, delimiter='\t')
- for row in tsv:
- ns = np.genfromtxt(row[1:])
- K1.append(ns) # <--- slow
- K = np.array(K1)
-
-if cmd == 'parse':
- pass
-else:
- raise Exception("Unknown command")
+ k = tsvreader.kinship(options.kinship)
+ print k.shape
+
+if options.pheno:
+ y = tsvreader.pheno(options.pheno)
+ print y.shape
+
+if options.geno:
+ g = tsvreader.geno(options.geno)
+ print g.shape
+
+def normalizeGenotype(G):
+ # Run for every SNP list (num individuals)
+ x = True - np.isnan(G) # Matrix of True/False
+ m = G[x].mean() # Global mean value
+ # print m
+ s = np.sqrt(G[x].var()) # Global stddev
+ # print s
+ G[np.isnan(G)] = m # Plug-in mean values for missing
+ if s == 0:
+ G = G - m # Subtract the mean
+ else:
+ G = (G - m) / s # Normalize the deviation
+ return G
+
+# g = g.reshape((1, -1))[0]
+print("Before",g)
+gn = []
+for snp in g:
+ gn.append( normalizeGenotype(snp) )
+
+gn = g
+gn = np.array(gn)
+print("After1",gn)
+gnT = gn.T
+print("After",gnT)
+# G = gnT
+G = gnT
+print "G shape",G.shape
+# sys.exit(1)
+# assert(G[0,0]==-2.25726341)
+
+# Remove individuals with missing phenotypes
+v = np.isnan(y)
+keep = True - v
+if v.sum():
+ if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
+ y = y[keep]
+ G = G[keep,:]
+ k = k[keep,:][:,keep]
+
+if cmd == 'redis':
+ ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
+ print np.array(ps)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py
new file mode 100644
index 00000000..b4027fa3
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py
@@ -0,0 +1,76 @@
+# Standard file readers
+#
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import os
+import numpy as np
+import csv
+
+def kinship(fn):
+ K1 = []
+ print fn
+ with open(fn,'r') as tsvin:
+ assert(tsvin.readline().strip() == "# Kinship format version 1.0")
+ tsvin.readline()
+ tsvin.readline()
+ tsv = csv.reader(tsvin, delimiter='\t')
+ for row in tsv:
+ ns = np.genfromtxt(row[1:])
+ K1.append(ns) # <--- slow
+ K = np.array(K1)
+ return K
+
+def pheno(fn):
+ Y1 = []
+ print fn
+ with open(fn,'r') as tsvin:
+ assert(tsvin.readline().strip() == "# Phenotype format version 1.0")
+ tsvin.readline()
+ tsvin.readline()
+ tsvin.readline()
+ tsv = csv.reader(tsvin, delimiter='\t')
+ for row in tsv:
+ ns = np.genfromtxt(row[1:])
+ Y1.append(ns) # <--- slow
+ Y = np.array(Y1)
+ return Y
+
+def geno(fn):
+ G1 = []
+ hab_mapper = {'A':0,'H':1,'B':2,'-':3}
+ pylmm_mapper = [ 0.0, 0.5, 1.0, float('nan') ]
+
+ print fn
+ with open(fn,'r') as tsvin:
+ assert(tsvin.readline().strip() == "# Genotype format version 1.0")
+ tsvin.readline()
+ tsvin.readline()
+ tsvin.readline()
+ tsvin.readline()
+ tsv = csv.reader(tsvin, delimiter='\t')
+ for row in tsv:
+ # print(row)
+ id = row[0]
+ gs = list(row[1])
+ # print id,gs
+ gs2 = [pylmm_mapper[hab_mapper[g]] for g in gs]
+ # print id,gs2
+ # ns = np.genfromtxt(row[1:])
+ G1.append(gs2) # <--- slow
+ G = np.array(G1)
+ return G
+
diff --git a/wqflask/wqflask/my_pylmm/example.py b/wqflask/wqflask/my_pylmm/scripts/example.py
index 8b30debd..8b30debd 100755
--- a/wqflask/wqflask/my_pylmm/example.py
+++ b/wqflask/wqflask/my_pylmm/scripts/example.py
diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py
index 74b58dd6..74b58dd6 100755
--- a/wqflask/wqflask/my_pylmm/pylmmGWAS.py
+++ b/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py
diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py
index c9a73ece..c9a73ece 100755
--- a/wqflask/wqflask/my_pylmm/pylmmKinship.py
+++ b/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py
diff --git a/wqflask/wqflask/my_pylmm/run_pylmm.py b/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py
index 0c96d986..0c96d986 100755
--- a/wqflask/wqflask/my_pylmm/run_pylmm.py
+++ b/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py
diff --git a/wqflask/wqflask/pylmm b/wqflask/wqflask/pylmm
deleted file mode 160000
-Subproject cede848b7ce648366c1bdd7bc5df43c633eeb0d