From eca1af83ecbca6c0ac018514f322cc5b49cfa405 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 12:43:54 +0300 Subject: plink support --- wqflask/wqflask/my_pylmm/pyLMM/plink.py | 106 ++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/plink.py (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py new file mode 100644 index 00000000..53151b21 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py @@ -0,0 +1,106 @@ +# Plink module +# +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# Some of the BED file parsing came from pylmm: +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +# According to the PLINK information + +# Parse a textual BIM file and return the contents as a list of tuples +# +# Extended variant information file accompanying a .bed binary genotype table. +# +# A text file with no header line, and one line per variant with the following six fields: +# +# Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0' indicates unknown) or name +# Variant identifier +# Position in morgans or centimorgans (safe to use dummy value of '0') +# Base-pair coordinate (normally 1-based, but 0 ok; limited to 231-2) +# Allele 1 (corresponding to clear bits in .bed; usually minor) +# Allele 2 (corresponding to set bits in .bed; usually major) +# +# Allele codes can contain more than one character. Variants with negative bp coordinates are ignored by PLINK. Example +# +# 1 mm37-1-3125499 0 3125499 1 2 +# 1 mm37-1-3125701 0 3125701 1 2 +# 1 mm37-1-3187481 0 3187481 1 2 + +import struct +import numpy as np + +def readbim(fn): + res = [] + for line in open(fn): + list = line.split() + if len([True for e in list if e == 'nan']) == 0: + res.append( (list[0],list[1],int(list[2]),int(list[3]),int(list[4]),int(list[5])) ) + return res + +# .bed (PLINK binary biallelic genotype table) +# +# Primary representation of genotype calls at biallelic variants. Must be accompanied by .bim and .fam files. +# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bed +# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam +# http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim + +def readbed(fn,inds,snps): + bytes = inds / 4 + (inds % 4 and 1 or 0) + format = 'c'*bytes + f = open(fn,'rb') + magic = f.read(3) + assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01") + + def formatBinaryGenotypes(X): + D = { \ + '00': 0.0, \ + '10': 0.5, \ + '11': 1.0, \ + '01': np.nan \ + } + + D_tped = { \ + '00': '1 1', \ + '10': '1 2', \ + '11': '2 2', \ + '01': '0 0' \ + } + + #D = D_tped + + G = [] + for x in X: + if not len(x) == 10: + xx = x[2:] + x = '0b' + '0'*(8 - len(xx)) + xx + a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4]) + L = [D[y] for y in [a,b,c,d]] + G += L + # only take the leading values because whatever is left should be null + G = G[:inds] + G = np.array(G) + # if normalize: + # G = self.normalizeGenotype(G) + return G + + def next(): + X = f.read(bytes) + XX = [bin(ord(x)) for x in struct.unpack(format,X)] + # snp = 0 + # snp = self.snpFileHandle.readline().strip().split()[1] + return formatBinaryGenotypes(XX) + + for x in range(snps): + print next() -- cgit v1.2.3