aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/plink.py106
1 files changed, 106 insertions, 0 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py
new file mode 100644
index 00000000..53151b21
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py
@@ -0,0 +1,106 @@
+# Plink module
+#
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+# Some of the BED file parsing came from pylmm:
+# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# According to the PLINK information
+
+# Parse a textual BIM file and return the contents as a list of tuples
+#
+# Extended variant information file accompanying a .bed binary genotype table.
+#
+# A text file with no header line, and one line per variant with the following six fields:
+#
+# Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name
+# Variant identifier
+# Position in morgans or centimorgans (safe to use dummy value of '0')
+# Base-pair coordinate (normally 1-based, but 0 ok; limited to 231-2)
+# Allele 1 (corresponding to clear bits in .bed; usually minor)
+# Allele 2 (corresponding to set bits in .bed; usually major)
+#
+# Allele codes can contain more than one character. Variants with negative bp coordinates are ignored by PLINK. Example
+#
+# 1 mm37-1-3125499 0 3125499 1 2
+# 1 mm37-1-3125701 0 3125701 1 2
+# 1 mm37-1-3187481 0 3187481 1 2
+
+import struct
+import numpy as np
+
+def readbim(fn):
+ res = []
+ for line in open(fn):
+ list = line.split()
+ if len([True for e in list if e == 'nan']) == 0:
+ res.append( (list[0],list[1],int(list[2]),int(list[3]),int(list[4]),int(list[5])) )
+ return res
+
+# .bed (PLINK binary biallelic genotype table)
+#
+# Primary representation of genotype calls at biallelic variants. Must be accompanied by .bim and .fam files.
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bed
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam
+# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim
+
+def readbed(fn,inds,snps):
+ bytes = inds / 4 + (inds % 4 and 1 or 0)
+ format = 'c'*bytes
+ f = open(fn,'rb')
+ magic = f.read(3)
+ assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01")
+
+ def formatBinaryGenotypes(X):
+ D = { \
+ '00': 0.0, \
+ '10': 0.5, \
+ '11': 1.0, \
+ '01': np.nan \
+ }
+
+ D_tped = { \
+ '00': '1 1', \
+ '10': '1 2', \
+ '11': '2 2', \
+ '01': '0 0' \
+ }
+
+ #D = D_tped
+
+ G = []
+ for x in X:
+ if not len(x) == 10:
+ xx = x[2:]
+ x = '0b' + '0'*(8 - len(xx)) + xx
+ a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4])
+ L = [D[y] for y in [a,b,c,d]]
+ G += L
+ # only take the leading values because whatever is left should be null
+ G = G[:inds]
+ G = np.array(G)
+ # if normalize:
+ # G = self.normalizeGenotype(G)
+ return G
+
+ def next():
+ X = f.read(bytes)
+ XX = [bin(ord(x)) for x in struct.unpack(format,X)]
+ # snp = 0
+ # snp = self.snpFileHandle.readline().strip().split()[1]
+ return formatBinaryGenotypes(XX)
+
+ for x in range(snps):
+ print next()