From aaf0f336b69fb1fb77c9dd3782ba4db0d732106c Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 9 Mar 2015 13:38:03 +0300 Subject: plink: no need to know the number of SNPs, but you need INDs --- wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py | 17 +++++++-- wqflask/wqflask/my_pylmm/pyLMM/plink.py | 56 ++++++++++++++-------------- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py index 25011b18..d374407e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/convertlmm.py @@ -59,6 +59,7 @@ option_parser.add_option("-v", "--verbose", (options, args) = option_parser.parse_args() writer = None +num_inds = None def msg(s): sys.stderr.write("INFO: ") @@ -96,6 +97,7 @@ if options.kinship: wr("\t") wr("\t".join(line.split())) wr("\n") + num_inds = count msg(str(count)+" kinship lines written") if options.pheno: @@ -129,14 +131,23 @@ if options.pheno: for i in range(count): wr("\t".join(phenos[i])) wr("\n") + num_inds = count msg(str(count)+" pheno lines written") if options.geno: if not options.plink: raise Exception("Use --plink switch") + if not num_inds: + raise Exception("Can not figure out the number of individuals, use --pheno or --kinship") # msg("Converting geno "+options.geno) - # plink.readbim(options.geno+'.bim') + + snps = plink.readbim(options.geno+'.bim') msg("Converting geno "+options.geno+'.bed') - plink.readbed(options.geno+'.bed',1000,8) - + + def out(i,x): + print i,x + + snps = plink.readbed(options.geno+'.bed',num_inds, out) + msg(str(count)+" geno lines written (with "+str(snps)+" snps)") + msg("Converting done") diff --git a/wqflask/wqflask/my_pylmm/pyLMM/plink.py b/wqflask/wqflask/my_pylmm/pyLMM/plink.py index 53151b21..3e850c9b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/plink.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/plink.py @@ -51,19 +51,24 @@ def readbim(fn): # .bed (PLINK binary biallelic genotype table) # -# Primary representation of genotype calls at biallelic variants. Must be accompanied by .bim and .fam files. +# Primary representation of genotype calls at biallelic variants. Must +# be accompanied by .bim and .fam files. Basically contains num SNP +# blocks containing IND (compressed 4 IND into a byte) +# +# Since it is a biallelic format it supports for every individual +# whether the first allele is homozygous (b00), the second allele is +# homozygous (b11), it is heterozygous (b10) or that it is missing +# (b01). + # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bed # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#fam # http://pngu.mgh.harvard.edu/~purcell/plink2/formats.html#bim -def readbed(fn,inds,snps): - bytes = inds / 4 + (inds % 4 and 1 or 0) - format = 'c'*bytes - f = open(fn,'rb') - magic = f.read(3) - assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01") +def readbed(fn,inds,func=None): - def formatBinaryGenotypes(X): + # For every SNP block fetch the individual genotypes using values + # 0.0 and 1.0 for homozygous and 0.5 for heterozygous alleles + def fetchGenotypes(X): D = { \ '00': 0.0, \ '10': 0.5, \ @@ -71,15 +76,6 @@ def readbed(fn,inds,snps): '01': np.nan \ } - D_tped = { \ - '00': '1 1', \ - '10': '1 2', \ - '11': '2 2', \ - '01': '0 0' \ - } - - #D = D_tped - G = [] for x in X: if not len(x) == 10: @@ -88,19 +84,21 @@ def readbed(fn,inds,snps): a,b,c,d = (x[8:],x[6:8],x[4:6],x[2:4]) L = [D[y] for y in [a,b,c,d]] G += L - # only take the leading values because whatever is left should be null G = G[:inds] G = np.array(G) - # if normalize: - # G = self.normalizeGenotype(G) return G - def next(): - X = f.read(bytes) - XX = [bin(ord(x)) for x in struct.unpack(format,X)] - # snp = 0 - # snp = self.snpFileHandle.readline().strip().split()[1] - return formatBinaryGenotypes(XX) - - for x in range(snps): - print next() + bytes = inds / 4 + (inds % 4 and 1 or 0) + format = 'c'*bytes + count = 0 + with open(fn,'rb') as f: + magic = f.read(3) + assert( ":".join("{:02x}".format(ord(c)) for c in magic) == "6c:1b:01") + while True: + count += 1 + X = f.read(bytes) + if not X: + return(count-1) + XX = [bin(ord(x)) for x in struct.unpack(format,X)] + xs = fetchGenotypes(XX) + func(count,xs) -- cgit v1.2.3