about summary refs log tree commit diff
path: root/wqflask
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask')
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py17
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/plink.py56
2 files changed, 41 insertions, 32 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py
index 25011b18..d374407e 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py
@@ -59,6 +59,7 @@ option_parser.add_option("-v", "--verbose",
 (options, args) = option_parser.parse_args()
 
 writer = None
+num_inds = None
 
 def msg(s):
     sys.stderr.write("INFO: ")
@@ -96,6 +97,7 @@ if options.kinship:
         wr("\t")
         wr("\t".join(line.split()))
         wr("\n")
+    num_inds = count
     msg(str(count)+" kinship lines written")
 
 if options.pheno:
@@ -129,14 +131,23 @@ if options.pheno:
     for i in range(count):
         wr("\t".join(phenos[i]))
         wr("\n")
+    num_inds = count
     msg(str(count)+" pheno lines written")
 
 if options.geno:
     if not options.plink:
         raise Exception("Use --plink switch")
+    if not num_inds:
+        raise Exception("Can not figure out the number of individuals, use --pheno or --kinship")    
     # msg("Converting geno "+options.geno)
-    # plink.readbim(options.geno+'.bim')
+
+    snps = plink.readbim(options.geno+'.bim')
     msg("Converting geno "+options.geno+'.bed')
-    plink.readbed(options.geno+'.bed',1000,8)
-    
+
+    def out(i,x):
+        print i,x
+        
+    snps = plink.readbed(options.geno+'.bed',num_inds, out)
+    msg(str(count)+" geno lines written (with "+str(snps)+" snps)")
+   
 msg("Converting done")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py
index 53151b21..3e850c9b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/plink.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py
@@ -51,19 +51,24 @@ def readbim(fn):
 
 # .bed (PLINK binary biallelic genotype table)
 # 
-# Primary representation of genotype calls at biallelic variants. Must be accompanied by .bim and .fam files.
+# Primary representation of genotype calls at biallelic variants. Must
+# be accompanied by .bim and .fam files. Basically contains num SNP
+# blocks containing IND (compressed 4 IND into a byte)
+#
+# Since it is a biallelic format it supports for every individual
+# whether the first allele is homozygous (b00), the second allele is
+# homozygous (b11), it is heterozygous (b10) or that it is missing
+# (b01).
+
 # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bed
 # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam
 # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim
 
-def readbed(fn,inds,snps):
-    bytes = inds / 4 + (inds % 4 and 1 or 0)
-    format = 'c'*bytes
-    f = open(fn,'rb')
-    magic = f.read(3)
-    assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01")
+def readbed(fn,inds,func=None):
 
-    def formatBinaryGenotypes(X):
+    # For every SNP block fetch the individual genotypes using values
+    # 0.0 and 1.0 for homozygous and 0.5 for heterozygous alleles
+    def fetchGenotypes(X):
         D = { \
               '00': 0.0, \
               '10': 0.5, \
@@ -71,15 +76,6 @@ def readbed(fn,inds,snps):
               '01': np.nan \
            }
   
-        D_tped = { \
-              '00': '1 1', \
-              '10': '1 2', \
-              '11': '2 2', \
-              '01': '0 0' \
-           }
-  
-        #D = D_tped
-              
         G = []
         for x in X:
             if not len(x) == 10:
@@ -88,19 +84,21 @@ def readbed(fn,inds,snps):
             a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4]) 
             L = [D[y] for y in [a,b,c,d]]
             G += L
-        # only take the leading values because whatever is left should be null
         G = G[:inds]
         G = np.array(G)
-        # if normalize:
-        #     G = self.normalizeGenotype(G)
         return G
 
-    def next():
-        X = f.read(bytes)
-        XX = [bin(ord(x)) for x in struct.unpack(format,X)]
-        # snp = 0
-        # snp = self.snpFileHandle.readline().strip().split()[1]
-        return formatBinaryGenotypes(XX)
-
-    for x in range(snps):
-        print next()
+    bytes = inds / 4 + (inds % 4 and 1 or 0)
+    format = 'c'*bytes
+    count = 0
+    with open(fn,'rb') as f:
+        magic = f.read(3)
+        assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01")
+        while True:
+            count += 1
+            X = f.read(bytes)
+            if not X:
+                return(count-1)
+            XX = [bin(ord(x)) for x in struct.unpack(format,X)]
+            xs = fetchGenotypes(XX)
+            func(count,xs)