Merged lmm branch

author: Pjotr Prins 2015-03-11 12:32:59 +0300
committer: Pjotr Prins 2015-03-11 12:32:59 +0300
commit: 3d144d2610acf122dc7b00fbeba15ee4b21348af (patch)
tree: dfaead7181059f45a63334cd928523489209c01d
parent: 3a7d9fd232200a5cd95563caf2f067328035dfd3 (diff)
parent: 084c8fe8c1c3255d59bd1da59b7bebb5c8811ba0 (diff)
download: genenetwork2-3d144d2610acf122dc7b00fbeba15ee4b21348af.tar.gz
15 files changed, 564 insertions, 55 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
index e69de29b..e69de29b 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
index abeffee7..abeffee7 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py
new file mode 100644
index 00000000..3b6b5d70
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py
@@ -0,0 +1,178 @@
+# This is a converter for common LMM formats, so as to keep complexity
+# outside the main routines. 
+
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import print_function
+from optparse import OptionParser
+import sys
+import os
+import numpy as np
+# from lmm import LMM, run_other
+# import input
+import plink
+
+usage = """
+python convertlmm.py [--plink] [--prefix out_basename] [--kinship kfile] [--pheno pname] [--geno gname]
+
+  Convert files for runlmm.py processing. Writes to stdout by default.
+
+  try --help for more information
+"""
+
+# if len(args) == 0:
+#     print usage
+#     sys.exit(1)
+
+option_parser = OptionParser(usage=usage)
+option_parser.add_option("--kinship", dest="kinship",
+                  help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program")
+option_parser.add_option("--pheno", dest="pheno",
+                         help="Parse a phenotype file (use with --plink only)")
+option_parser.add_option("--geno", dest="geno",
+                         help="Parse a genotype file (use with --plink only)")
+option_parser.add_option("--plink", dest="plink", action="store_true", default=False,
+                  help="Parse PLINK style")
+# option_parser.add_option("--kinship",action="store_false", dest="kinship", default=True,
+#                   help="Parse a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.")
+option_parser.add_option("--prefix", dest="prefix",
+                  help="Output prefix for output file(s)")
+option_parser.add_option("-q", "--quiet",
+                  action="store_false", dest="verbose", default=True,
+                  help="don't print status messages to stdout")
+option_parser.add_option("-v", "--verbose",
+                  action="store_true", dest="verbose", default=False,
+                  help="Print extra info")
+
+(options, args) = option_parser.parse_args()
+
+writer = None
+num_inds = None
+snp_names = []
+ind_names = []
+
+def msg(s):
+    sys.stderr.write("INFO: ")
+    sys.stderr.write(s)
+    sys.stderr.write("\n")
+    
+def wr(s):
+    if writer:
+        writer.write(s)
+    else:
+        sys.stdout.write(s)
+
+def wrln(s):
+    wr(s)
+    wr("\n")
+            
+
+if options.pheno:
+    if not options.plink:
+        raise Exception("Use --plink switch")
+    # Because plink does not track size we need to read the whole thing first
+    msg("Converting pheno "+options.pheno)
+    phenos = []
+    count = 0
+    count_pheno = None
+    for line in open(options.pheno,'r'):
+        count += 1
+        list = line.split()
+        pcount = len(list)-2
+        assert(pcount > 0)
+        if count_pheno == None:
+            count_pheno = pcount
+        assert(count_pheno == pcount)
+        row = [list[0]]+list[2:]
+        phenos.append(row)
+
+    writer = None
+    if options.prefix:
+        writer = open(options.prefix+".pheno","w")
+    wrln("# Phenotype format version 1.0")
+    wrln("# Individuals = "+str(count))
+    wrln("# Phenotypes = "+str(count_pheno))
+    for i in range(count_pheno):
+        wr("\t"+str(i+1))
+        wr("\n")
+    for i in range(count):
+        wr("\t".join(phenos[i]))
+        wr("\n")
+    num_inds = count
+    msg(str(count)+" pheno lines written")
+
+if options.kinship:
+    is_header = True
+    count = 0
+    msg("Converting kinship "+options.kinship)
+    writer = None
+    if options.prefix:
+        writer = open(options.prefix+".kin","w")
+    for line in open(options.kinship,'r'):
+        count += 1
+        if is_header:
+            size = len(line.split())
+            wrln("# Kinship format version 1.0")
+            wrln("# Size="+str(size))
+            for i in range(size):
+                wr("\t"+str(i+1))
+            wr("\n")
+            is_header = False
+        wr(str(count))
+        wr("\t")
+        wr("\t".join(line.split()))
+        wr("\n")
+    num_inds = count
+    msg(str(count)+" kinship lines written")
+    
+if options.geno:
+    msg("Converting geno "+options.geno+'.bed')
+    if not options.plink:
+        raise Exception("Use --plink switch")
+    if not num_inds:
+        raise Exception("Can not figure out the number of individuals, use --pheno or --kinship")    
+    bim_snps = plink.readbim(options.geno+'.bim')
+    num_snps = len(bim_snps)
+    writer = None
+    if options.prefix:
+        writer = open(options.prefix+".geno","w")
+    wrln("# Genotype format version 1.0")
+    wrln("# Individuals = "+str(num_inds))
+    wrln("# SNPs = "+str(num_snps))
+    wrln("# Encoding = HAB")
+    for i in range(num_inds):
+        wr("\t"+str(i+1))
+    wr("\n")
+
+    m = []
+    def out(i,x):
+        # wr(str(i)+"\t")
+        # wr("\t".join(x))
+        # wr("\n")
+        m.append(x)
+        
+    snps = plink.readbed(options.geno+'.bed',num_inds, ('A','H','B','-'), out)
+
+    msg("Write transposed genotype matrix")
+    for g in range(num_snps):
+        wr(bim_snps[g][1]+"\t")
+        for i in range(num_inds):
+            wr(m[g][i])
+        wr("\n")
+            
+    msg(str(count)+" geno lines written (with "+str(snps)+" snps)")
+   
+msg("Converting done")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py
index dfe28a4e..f7b556a5 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/input.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py
@@ -133,13 +133,15 @@ class plink:
         return G
     
     def normalizeGenotype(self,G):
+        # print "Before",G
+        # print G.shape
         x = True - np.isnan(G)
         m = G[x].mean()
         s = np.sqrt(G[x].var())
         G[np.isnan(G)] = m
         if s == 0: G = G - m
         else: G = (G - m) / s
-     
+        # print "After",G
         return G
     
     def getPhenos(self,phenoFile=None):
@@ -260,4 +262,4 @@ class plink:
             L.append(D[self.indivs[i]])
         P = P[L,:]
      
-        return P
-\ No newline at end of file
+        return P
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 369fcbcc..58d7593d 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -45,11 +45,17 @@ import sys
 sys.path.append("/home/zas1024/gene/wqflask/")
 print("sys.path2:", sys.path)
 
+has_gn2=True
+
 from utility.benchmark import Bench
 from utility import temp_data
 
-from wqflask.my_pylmm.pyLMM import chunks
-
+try:
+    from wqflask.my_pylmm.pyLMM import chunks
+except ImportError:
+    print("WARNING: Standalone version missing the Genenetwork2 environment\n")
+    has_gn2=False
+    pass
 
 #np.seterr('raise')
 
@@ -248,20 +254,22 @@ def run_other(pheno_vector,
         genotype_matrix,
         restricted_max_likelihood=True,
         refit=False,
-        tempdata=None):
+        tempdata=None,      # <---- can not be None
+        is_testing=False):
+    
     """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
     
     restricted_max_likelihood -- whether to use restricted max likelihood; True or False
     refit -- whether to refit the variance component for each marker
     temp_data -- TempData object that stores the progress for each major step of the
-    calculations ("calculate_kinship" and "GWAS" take the majority of time)
+    calculations ("calculate_kinship" and "GWAS" take the majority of time) 
     
     """
     
-    
     print("In run_other")
+    print("REML=",restricted_max_likelihood," REFIT=",refit, "TESTING=",is_testing)
     with Bench("Calculate Kinship"):
-        kinship_matrix = calculate_kinship(genotype_matrix, tempdata)
+        kinship_matrix = calculate_kinship(genotype_matrix, tempdata, is_testing)
     
     print("kinship_matrix: ", pf(kinship_matrix))
     print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
@@ -273,6 +281,7 @@ def run_other(pheno_vector,
         lmm_ob.fit()
 
     print("genotype_matrix: ", genotype_matrix.shape)
+    print(genotype_matrix)
 
     with Bench("Doing GWAS"):
         t_stats, p_values = GWAS(pheno_vector,
@@ -288,6 +297,7 @@ def run_other(pheno_vector,
 def matrixMult(A,B):
 
     # If there is no fblas then we will revert to np.dot()
+
     try:
         linalg.fblas
     except AttributeError:
@@ -315,7 +325,7 @@ def matrixMult(A,B):
     return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
 
 
-def calculate_kinship(genotype_matrix, temp_data):
+def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
     """
     genotype_matrix is an n x m matrix encoding SNP minor alleles.
     
@@ -325,10 +335,12 @@ def calculate_kinship(genotype_matrix, temp_data):
     """
     n = genotype_matrix.shape[0]
     m = genotype_matrix.shape[1]
-    print("n is:", n)
-    print("m is:", m)
+    print("genotype 2D matrix n (inds) is:", n)
+    print("genotype 2D matrix m (snps) is:", m)
     keep = []
     for counter in range(m):
+        if is_testing and counter>8:
+            break
         #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter]))
         #Checks if any values in column are not numbers
         not_number = np.isnan(genotype_matrix[:,counter])
@@ -472,7 +484,7 @@ class LMM:
           is not done consistently.
  
     """
-    def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True):
+    def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True,is_testing=False):
  
        """
        The constructor takes a phenotype vector or array of size n.
@@ -481,7 +493,8 @@ class LMM:
        X0 is an optional covariate matrix of size n x q, where there are q covariates.
        When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
        """
- 
+
+       self.is_testing = True
        if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1)
        self.verbose = verbose
  
@@ -501,14 +514,14 @@ class LMM:
        self.nonmissing = x
  
        print("this K is:", pf(K))
- 
+       
        if len(Kva) == 0 or len(Kve) == 0:
           if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
           begin = time.time()
           Kva,Kve = linalg.eigh(K)
           end = time.time()
           if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
-       
+
        self.K = K
        self.Kva = Kva
        self.Kve = Kve
@@ -708,24 +721,20 @@ class LMM:
        pl.ylabel("Probability of data")
        pl.title(title)
 
-def main():
-    parser = argparse.ArgumentParser(description='Run pyLMM')
-    parser.add_argument('-k', '--key')
-    parser.add_argument('-s', '--species')
-    
-    opts = parser.parse_args()
-    
-    key = opts.key
-    species = opts.species
-    
+
+def gn2_redis(key,species,is_testing=False):
     json_params = Redis.get(key)
     
     params = json.loads(json_params)
-    #print("params:", params)
-    
-    #print("kinship_matrix:", params['kinship_matrix'])
     
     tempdata = temp_data.TempData(params['temp_uuid'])
+
+    print('kinship', np.array(params['kinship_matrix']))
+    print('pheno', np.array(params['pheno_vector']))
+    geno = np.array(params['genotype_matrix'])
+    print('geno', geno.shape, geno)
+    # sys.exit(1)
+    
     if species == "human" :
         ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']),
                   covariate_matrix = np.array(params['covariate_matrix']),
@@ -735,10 +744,11 @@ def main():
                   tempdata = tempdata)
     else:
         ps, ts = run_other(pheno_vector = np.array(params['pheno_vector']),
-                  genotype_matrix = np.array(params['genotype_matrix']),
+                  genotype_matrix = geno,
                   restricted_max_likelihood = params['restricted_max_likelihood'],
                   refit = params['refit'],
-                  tempdata = tempdata)
+                  tempdata = tempdata,
+                  is_testing=is_testing)
         
     results_key = "pylmm:results:" + params['temp_uuid']
 
@@ -748,9 +758,46 @@ def main():
     #Pushing json_results into a list where it is the only item because blpop needs a list
     Redis.rpush(results_key, json_results)
     Redis.expire(results_key, 60*60)
+    return ps, ts
+
+# This is the main function used by Genenetwork2 (with environment)
+def gn2_main():
+    parser = argparse.ArgumentParser(description='Run pyLMM')
+    parser.add_argument('-k', '--key')
+    parser.add_argument('-s', '--species')
+    
+    opts = parser.parse_args()
+    
+    key = opts.key
+    species = opts.species
+
+    gn2_redis(key,species)
+
+def gn2_load_redis(key,species,kinship,pheno,geno,is_testing):
+    print("Loading Redis from parsed data")
+    params = dict(pheno_vector = pheno.tolist(),
+                  genotype_matrix = geno.tolist(),
+                  kinship_matrix= kinship.tolist(),
+                  restricted_max_likelihood = True,
+                  refit = False,
+                  temp_uuid = "testrun_temp_uuid",
+                        
+                  # meta data
+                  timestamp = datetime.datetime.now().isoformat(),
+    )
+            
+    json_params = json.dumps(params)
+    Redis.set(key, json_params)
+    Redis.expire(key, 60*60)
 
+    return gn2_redis(key,species,is_testing)
+    
 if __name__ == '__main__':
-    main()
+    print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!")
+    if has_gn2:
+        gn2_main()
+    else:
+        print("Run from runlmm.py instead")
 
 
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
new file mode 100644
index 00000000..abb72081
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
@@ -0,0 +1,55 @@
+import sys
+import time
+import numpy as np
+from numpy.distutils.system_info import get_info;
+from scipy import linalg
+from scipy import optimize
+from scipy import stats
+
+useNumpy = None
+hasBLAS = None
+
+def matrix_initialize(useBLAS):
+    global useNumpy  # module based variable
+    if useBLAS and useNumpy == None:
+        print get_info('blas_opt')
+        try:
+            linalg.fblas
+            sys.stderr.write("INFO: using linalg.fblas\n")
+            useNumpy = False
+            hasBLAS  = True
+        except AttributeError:
+            sys.stderr.write("WARNING: linalg.fblas not found, using numpy.dot instead!\n")
+            useNumpy = True
+    else:
+        sys.stderr.write("INFO: using numpy.dot\n")
+        useNumpy=True
+        
+def matrixMult(A,B):
+   global useNumpy  # module based variable
+
+   if useNumpy:
+       return np.dot(A,B)
+
+   # If the matrices are in Fortran order then the computations will be faster
+   # when using dgemm.  Otherwise, the function will copy the matrix and that takes time.
+   if not A.flags['F_CONTIGUOUS']:
+      AA = A.T
+      transA = True
+   else:
+      AA = A
+      transA = False
+
+   if not B.flags['F_CONTIGUOUS']:
+      BB = B.T
+      transB = True
+   else:
+      BB = B
+      transB = False
+
+   return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
+
+def matrixMultT(M):
+    # res = np.dot(W,W.T)
+    # return linalg.fblas.dgemm(alpha=1.,a=M.T,b=M.T,trans_a=True,trans_b=False)
+    return matrixMult(M,M.T)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py
new file mode 100644
index 00000000..7bd2df91
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py
@@ -0,0 +1,107 @@
+# Plink module
+#
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+# Some of the BED file parsing came from pylmm:
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# According to the PLINK information
+
+# Parse a textual BIM file and return the contents as a list of tuples
+# 
+# Extended variant information file accompanying a .bed binary genotype table.
+# 
+# A text file with no header line, and one line per variant with the following six fields:
+# 
+#     Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name
+#     Variant identifier
+#     Position in morgans or centimorgans (safe to use dummy value of '0')
+#     Base-pair coordinate (normally 1-based, but 0 ok; limited to 231-2)
+#     Allele 1 (corresponding to clear bits in .bed; usually minor)
+#     Allele 2 (corresponding to set bits in .bed; usually major)
+# 
+# Allele codes can contain more than one character. Variants with negative bp coordinates are ignored by PLINK. Example
+#
+#     1 mm37-1-3125499  0 3125499 1 2
+#     1 mm37-1-3125701  0 3125701 1 2
+#     1 mm37-1-3187481  0 3187481 1 2
+
+import struct
+# import numpy as np
+
+def readbim(fn):
+    res = []
+    for line in open(fn):
+        list = line.split()
+        if len([True for e in list if e == 'nan']) == 0:
+          res.append( (list[0],list[1],int(list[2]),int(list[3]),int(list[4]),int(list[5])) )
+        else:
+          res.append( (list[0],list[1],list[2],float('nan'),float('nan'),float('nan')) )
+    return res
+
+# .bed (PLINK binary biallelic genotype table)
+# 
+# Primary representation of genotype calls at biallelic variants. Must
+# be accompanied by .bim and .fam files. Basically contains num SNP
+# blocks containing IND (compressed 4 IND into a byte)
+#
+# Since it is a biallelic format it supports for every individual
+# whether the first allele is homozygous (b00), the second allele is
+# homozygous (b11), it is heterozygous (b10) or that it is missing
+# (b01).
+
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bed
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim
+
+def readbed(fn,inds,encoding,func=None):
+
+    # For every SNP block fetch the individual genotypes using values
+    # 0.0 and 1.0 for homozygous and 0.5 for heterozygous alleles
+    def fetchGenotypes(X):
+        # D = { \
+        #      '00': 0.0, \
+        #      '10': 0.5, \
+        #      '11': 1.0, \
+        #      '01': float('nan') \
+        #   }
+
+        Didx = { '00': 0, '10': 1, '11': 2, '01': 3 }
+        G = []
+        for x in X:
+            if not len(x) == 10:
+                xx = x[2:]
+                x = '0b' + '0'*(8 - len(xx)) + xx
+            a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4]) 
+            L = [encoding[Didx[y]] for y in [a,b,c,d]]
+            G += L
+        G = G[:inds]
+        # G = np.array(G)
+        return G
+
+    bytes = inds / 4 + (inds % 4 and 1 or 0)
+    format = 'c'*bytes
+    count = 0
+    with open(fn,'rb') as f:
+        magic = f.read(3)
+        assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01")
+        while True:
+            count += 1
+            X = f.read(bytes)
+            if not X:
+                return(count-1)
+            XX = [bin(ord(x)) for x in struct.unpack(format,X)]
+            xs = fetchGenotypes(XX)
+            func(count,xs)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
index e47c18e1..e47c18e1 100755..100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index ff53b4ea..738268be 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -19,32 +19,40 @@
 
 from optparse import OptionParser
 import sys
-import os
+import tsvreader
 import numpy as np
-# from lmm import LMM, run_other
-import csv
-
+from lmm import gn2_load_redis
 
 usage = """
 python runlmm.py [options] command
 
-  runlmm.py processing multiplexer reads standard input types and calls the routines
+  runlmm.py processing multiplexer reads standardised input formats
+  and calls the different routines
 
   Current commands are:
 
     parse        : only parse input files
+    redis        : use Redis to call into GN2
 
   try --help for more information
 """
 
+
 parser = OptionParser(usage=usage)
 # parser.add_option("-f", "--file", dest="input file",
 #                   help="In", metavar="FILE")
 parser.add_option("--kinship",dest="kinship",
-                  help="Kinship file format")
+                  help="Kinship file format 1.0")
+parser.add_option("--pheno",dest="pheno",
+                  help="Phenotype file format 1.0")
+parser.add_option("--geno",dest="geno",
+                  help="Genotype file format 1.0")
 parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
                   help="don't print status messages to stdout")
+parser.add_option("--test",
+                  action="store_true", dest="testing", default=False,
+                  help="Testing mode")
 
 (options, args) = parser.parse_args()
 
@@ -55,22 +63,58 @@ if len(args) != 1:
 cmd = args[0]
 print "Command: ",cmd
 
-
 if options.kinship:
-    K1 = []
-    print options.kinship
-    with open(options.kinship,'r') as tsvin:
-        if tsvin.readline().strip() != "# Kinship format version 1.0":
-            raise Exception("Expecting Kinship format version 1.0")
-        tsvin.readline()
-        tsvin.readline()
-        tsv = csv.reader(tsvin, delimiter='\t')
-        for row in tsv:
-            ns = np.genfromtxt(row[1:])
-            K1.append(ns) # <--- slow
-    K = np.array(K1)
-
-if cmd == 'parse':
-  pass
-else:
-  raise Exception("Unknown command")
+    k = tsvreader.kinship(options.kinship)
+    print k.shape
+
+if options.pheno:
+    y = tsvreader.pheno(options.pheno)
+    print y.shape
+
+if options.geno:
+    g = tsvreader.geno(options.geno)
+    print g.shape
+
+def normalizeGenotype(G):
+    # Run for every SNP list (num individuals)
+    x = True - np.isnan(G)  # Matrix of True/False
+    m = G[x].mean()         # Global mean value
+    # print m
+    s = np.sqrt(G[x].var()) # Global stddev
+    # print s
+    G[np.isnan(G)] = m      # Plug-in mean values for missing
+    if s == 0:
+        G = G - m           # Subtract the mean
+    else:
+        G = (G - m) / s     # Normalize the deviation
+    return G
+
+# g = g.reshape((1, -1))[0]
+print("Before",g)
+gn = []
+for snp in g:
+    gn.append( normalizeGenotype(snp) )
+
+gn = g
+gn = np.array(gn)
+print("After1",gn)
+gnT = gn.T
+print("After",gnT)
+# G = gnT
+G = gnT
+print "G shape",G.shape
+# sys.exit(1)
+# assert(G[0,0]==-2.25726341)
+
+# Remove individuals with missing phenotypes
+v = np.isnan(y)
+keep = True - v
+if v.sum():
+   if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
+   y  = y[keep]
+   G = G[keep,:]
+   k = k[keep,:][:,keep]
+
+if cmd == 'redis':
+    ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
+    print np.array(ps)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py
new file mode 100644
index 00000000..b4027fa3
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py
@@ -0,0 +1,76 @@
+# Standard file readers
+#
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import os
+import numpy as np
+import csv
+
+def kinship(fn):
+    K1 = []
+    print fn
+    with open(fn,'r') as tsvin:
+        assert(tsvin.readline().strip() == "# Kinship format version 1.0")
+        tsvin.readline()
+        tsvin.readline()
+        tsv = csv.reader(tsvin, delimiter='\t')
+        for row in tsv:
+            ns = np.genfromtxt(row[1:])
+            K1.append(ns) # <--- slow
+    K = np.array(K1)
+    return K
+
+def pheno(fn):
+    Y1 = []
+    print fn
+    with open(fn,'r') as tsvin:
+        assert(tsvin.readline().strip() == "# Phenotype format version 1.0")
+        tsvin.readline()
+        tsvin.readline()
+        tsvin.readline()
+        tsv = csv.reader(tsvin, delimiter='\t')
+        for row in tsv:
+            ns = np.genfromtxt(row[1:])
+            Y1.append(ns) # <--- slow
+    Y = np.array(Y1)
+    return Y
+
+def geno(fn):
+    G1 = []
+    hab_mapper = {'A':0,'H':1,'B':2,'-':3}
+    pylmm_mapper = [ 0.0, 0.5, 1.0, float('nan') ]
+
+    print fn
+    with open(fn,'r') as tsvin:
+        assert(tsvin.readline().strip() == "# Genotype format version 1.0")
+        tsvin.readline()
+        tsvin.readline()
+        tsvin.readline()
+        tsvin.readline()
+        tsv = csv.reader(tsvin, delimiter='\t')
+        for row in tsv:
+            # print(row)
+            id = row[0]
+            gs = list(row[1])
+            # print id,gs
+            gs2 = [pylmm_mapper[hab_mapper[g]] for g in gs]
+            # print id,gs2
+            # ns = np.genfromtxt(row[1:])
+            G1.append(gs2) # <--- slow
+    G = np.array(G1)
+    return G
+
diff --git a/wqflask/wqflask/my_pylmm/example.py b/wqflask/wqflask/my_pylmm/scripts/example.py
index 8b30debd..8b30debd 100755
--- a/wqflask/wqflask/my_pylmm/example.py
+++ b/wqflask/wqflask/my_pylmm/scripts/example.py
diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py
index 74b58dd6..74b58dd6 100755
--- a/wqflask/wqflask/my_pylmm/pylmmGWAS.py
+++ b/wqflask/wqflask/my_pylmm/scripts/pylmmGWAS.py
diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py
index c9a73ece..c9a73ece 100755
--- a/wqflask/wqflask/my_pylmm/pylmmKinship.py
+++ b/wqflask/wqflask/my_pylmm/scripts/pylmmKinship.py
diff --git a/wqflask/wqflask/my_pylmm/run_pylmm.py b/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py
index 0c96d986..0c96d986 100755
--- a/wqflask/wqflask/my_pylmm/run_pylmm.py
+++ b/wqflask/wqflask/my_pylmm/scripts/run_pylmm.py
diff --git a/wqflask/wqflask/pylmm b/wqflask/wqflask/pylmm
deleted file mode 160000
-Subproject cede848b7ce648366c1bdd7bc5df43c633eeb0d
author	Pjotr Prins	2015-03-11 12:32:59 +0300
committer	Pjotr Prins	2015-03-11 12:32:59 +0300
commit	3d144d2610acf122dc7b00fbeba15ee4b21348af (patch)
tree	dfaead7181059f45a63334cd928523489209c01d
parent	3a7d9fd232200a5cd95563caf2f067328035dfd3 (diff)
parent	084c8fe8c1c3255d59bd1da59b7bebb5c8811ba0 (diff)
download	genenetwork2-3d144d2610acf122dc7b00fbeba15ee4b21348af.tar.gz