Browse Source

Merge pull request #1 from biod/master

Fixes and refactoring
bio2
Pjotr Prins 5 years ago
committed by GitHub
parent
commit
d0c58c43c5
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 30
      README.md
  2. 290
      bio/bam/cigar.d
  3. 57
      bio/bam/md/reconstruct.d
  4. 3
      bio/bam/pileup.d
  5. 230
      bio/bam/read.d
  6. 2
      bio/bam/utils/samheadermerger.d
  7. 2
      bio/core/bgzf/compress.d
  8. 9
      bio/core/bgzf/outputstream.d
  9. 151
      bio/core/fastq.d
  10. 288
      bio/core/region.d
  11. 4
      bio/core/utils/outbuffer.d
  12. 8
      bio/core/utils/roundbuf.d
  13. 101
      bio/sam/header.d
  14. 465
      bio/sam/utils/fastrecordparser.d
  15. 63
      bio/sam/utils/recordparser.d
  16. 7
      src_ragel/Makefile
  17. 55
      src_ragel/sam_alignment.rl
  18. 47
      test/unittests.d

30
README.md

@ -1,25 +1,33 @@
### BioD [![Build Status](https://travis-ci.org/biod/BioD.svg?branch=master)](https://travis-ci.org/biod/BioD)
# BioD [![Build Status](https://travis-ci.org/biod/BioD.svg?branch=master)](https://travis-ci.org/biod/BioD) [![DUB Package](https://img.shields.io/badge/dub-v0.1.0-red.svg)](https://code.dlang.org/packages/biod)
[BioD](https://github.com/biod/BioD) is a fast and memory efficient bioinformatics library written in the [D programming language](http://dlang.org).
BioD aims to:
* Provide a platform for writing high-performance bioinformatics applications in D. BioD achieves this by:
- automatic parallelization of tasks where possible for example reading and writing BAM files.
- reducing the GC overhead by avoiding unnecessary memory allocations
* Offer support for manipulating common biological data formats.
* Write clear documented and maintainable codebase.
- automatic parallelization of tasks where possible for example reading and writing BAM files
- reducing the GC overhead by avoiding unnecessary memory allocations
* Offer support for manipulating common biological data formats
* Write clear documented and maintainable codebase
# Usage
BioD can be installed via the [D package manager](https://code.dlang.org/packages/biod).
See the [examples directory](https://github.com/biod/BioD/tree/master/examples)
for examples and usage.
BioD is also a crucial part of the [sambamba](https://github.com/biod/sambamba) tool.
### Usage
See the [examples directory](https://github.com/biod/BioD/tree/master/examples) for examples and usage.
# BioD contributors and support
See [contributors](https://github.com/biod/BioD/graphs/contributors). For support
contact
#### BioD core
* [Artem Tarasov](https://github.com/lomereiter)
* [Pjotr Prins](https://github.com/pjotrp)
##### License
This library is licensed under the MIT license. Please see the license file for more details
# License
##### Citation
BioD is licensed under the liberal MIT (expat) [license](./LICENSE).

290
bio/bam/cigar.d

@ -0,0 +1,290 @@
/*
This file is part of BioD.
Copyright (C) 2012-2016 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.cigar;
import std.algorithm;
import std.range;
import std.conv;
import std.format;
import std.exception;
import std.system;
import std.traits;
import std.array;
import std.bitmanip;
import core.stdc.stdlib;
import bio.core.base;
import bio.core.utils.format;
import bio.bam.abstractreader;
import bio.bam.cigar;
import bio.bam.writer;
import bio.bam.tagvalue;
import bio.bam.bai.bin;
import bio.bam.md.core;
import bio.bam.utils.array;
import bio.bam.utils.value;
import bio.core.utils.switchendianness;
import bio.bam.thirdparty.msgpack : Packer, unpack;
/**
Represents single CIGAR operation
*/
struct CigarOperation {
static assert(CigarOperation.sizeof == uint.sizeof);
/*
WARNING!
It is very essential that the size of
this struct is EXACTLY equal to uint.sizeof!
The reason is to avoid copying of arrays during alignment parsing.
Namely, when some_pointer points to raw cigar data,
we can just do a cast. This allows to access those data
directly, not doing any memory allocations.
*/
private uint raw; // raw data from BAM
private static ubyte char2op(char c) {
switch(c) {
case 'M': return 0;
case 'I': return 1;
case 'D': return 2;
case 'N': return 3;
case 'S': return 4;
case 'H': return 5;
case 'P': return 6;
case '=': return 7;
case 'X': return 8;
default: return 15; // 15 is used as invalid value
}
}
/// Length must be strictly less than 2^28.
/// $(BR)
/// Operation type must be one of M, I, D, N, S, H, P, =, X.
this(uint length, char operation_type) {
enforce(length < (1<<28), "Too big length of CIGAR operation");
raw = (length << 4) | char2op(operation_type);
}
this(uint _raw) {
raw = _raw;
}
/// Operation length
uint length() @property const nothrow @nogc {
return raw >> 4;
}
/// CIGAR operation as one of MIDNSHP=X.
/// Absent or invalid operation is represented by '?'
char type() @property const nothrow @nogc {
return "MIDNSHP=X????????"[raw & 0xF];
}
// Each pair of bits has first bit set iff the operation is query consuming,
// and second bit set iff it is reference consuming.
// X = P H S N D I M
private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
/// True iff operation is one of M, =, X, I, S
bool is_query_consuming() @property const nothrow @nogc {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
}
/// True iff operation is one of M, =, X, D, N
bool is_reference_consuming() @property const nothrow @nogc {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
}
/// True iff operation is one of M, =, X
bool is_match_or_mismatch() @property const nothrow @nogc {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
}
/// True iff operation is one of 'S', 'H'
bool is_clipping() @property const nothrow @nogc {
return ((raw & 0xF) >> 1) == 2; // 4 or 5
}
private void toSam(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
sink.write(length);
sink.write(type);
}
void toString(scope void delegate(const(char)[]) sink) const {
toSam(sink);
}
}
alias CigarOperation[] CigarOperations;
bool is_unavailable(CigarOperations cigar) @property nothrow @nogc {
return (cigar.length == 1 && cigar[0].raw == '*');
}
/// Forward range of extended CIGAR operations, with =/X instead of M
/// Useful for, e.g., detecting positions of mismatches.
struct ExtendedCigarRange(CigarOpRange, MdOpRange) {
static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation));
static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation));
private {
CigarOpRange _cigar;
MdOpRange _md_ops;
CigarOperation _front_cigar_op;
MdOperation _front_md_op;
uint _n_mismatches;
bool _empty;
}
///
this(CigarOpRange cigar, MdOpRange md_ops) {
_cigar = cigar;
_md_ops = md_ops;
fetchNextCigarOp();
fetchNextMdOp();
}
/// Forward range primitives
bool empty() @property const {
return _empty;
}
/// ditto
CigarOperation front() @property {
debug {
import std.stdio;
writeln(_front_cigar_op, " - ", _front_md_op);
}
if (_front_cigar_op.type != 'M')
return _front_cigar_op;
if (_n_mismatches == 0) {
assert(_front_md_op.is_match);
uint len = min(_front_md_op.match, _front_cigar_op.length);
return CigarOperation(len, '=');
}
assert(_front_md_op.is_mismatch);
return CigarOperation(min(_n_mismatches, _front_cigar_op.length), 'X');
}
/// ditto
ExtendedCigarRange save() @property {
typeof(return) r = void;
r._cigar = _cigar.save;
r._md_ops = _md_ops.save;
r._front_cigar_op = _front_cigar_op;
r._front_md_op = _front_md_op;
r._n_mismatches = _n_mismatches;
r._empty = _empty;
return r;
}
/// ditto
void popFront() {
if (!_front_cigar_op.is_match_or_mismatch) {
if (_front_cigar_op.is_reference_consuming)
fetchNextMdOp();
fetchNextCigarOp();
return;
}
auto len = _front_cigar_op.length;
if (_n_mismatches > 0) {
enforce(_front_md_op.is_mismatch);
if (len > _n_mismatches) {
_front_cigar_op = CigarOperation(len - _n_mismatches, 'M');
_n_mismatches = 0;
fetchNextMdOp();
} else if (len < _n_mismatches) {
_n_mismatches -= len;
fetchNextCigarOp();
} else {
fetchNextCigarOp();
fetchNextMdOp();
}
} else {
enforce(_front_md_op.is_match);
auto n_matches = _front_md_op.match;
if (len > n_matches) {
_front_cigar_op = CigarOperation(len - n_matches, 'M');
fetchNextMdOp();
} else if (len < n_matches) {
_front_md_op.match -= len;
fetchNextCigarOp();
} else {
fetchNextCigarOp();
fetchNextMdOp();
}
}
}
private {
void fetchNextCigarOp() {
if (_cigar.empty) {
_empty = true;
return;
}
_front_cigar_op = _cigar.front;
_cigar.popFront();
}
void fetchNextMdOp() {
if (_md_ops.empty)
return;
_n_mismatches = 0;
_front_md_op = _md_ops.front;
_md_ops.popFront();
if (_front_md_op.is_mismatch) {
_n_mismatches = 1;
while (!_md_ops.empty && _md_ops.front.is_mismatch) {
_md_ops.popFront();
_n_mismatches += 1;
}
}
}
}
}
auto makeExtendedCigar(CigarOpRange, MdOpRange)(CigarOpRange cigar, MdOpRange md_ops) {
return ExtendedCigarRange!(CigarOpRange, MdOpRange)(cigar, md_ops);
}

57
bio/bam/md/reconstruct.d

@ -8,10 +8,10 @@
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@ -23,6 +23,7 @@
*/
module bio.bam.md.reconstruct;
import bio.bam.cigar;
import bio.bam.read;
import bio.bam.md.core;
@ -34,7 +35,7 @@ import std.range;
/// Reconstruct read DNA.
/// Returns lazy sequence.
auto dna(T)(T read)
auto dna(T)(T read)
if(isBamRead!(Unqual!T))
{
@ -56,14 +57,14 @@ auto dna(T)(T read)
_seq = seq;
_ops = ops;
}
auto front() @property {
auto op = _ops.front;
return QueryChunk!S(_seq[0 .. op.length], op);
}
bool empty() @property {
return _ops.empty;
return _ops.empty;
}
void popFront() {
@ -81,7 +82,7 @@ auto dna(T)(T read)
// Get read sequence chunks corresponding to query-consuming operations in read.sequence
static auto queryChunks(ref T read) {
return getQueryChunksResult(read.sequence, filter!"a.is_query_consuming"(read.cigar));
}
@ -95,13 +96,13 @@ auto dna(T)(T read)
debug {
_initial_qseq = to!string(query_sequence);
}
_qseq = query_sequence;
_qseq = query_sequence;
_md = md_operations;
_fetchNextMdOperation();
}
bool empty() @property {
return _empty;
return _empty;
}
/*
@ -177,23 +178,23 @@ auto dna(T)(T read)
}
R _qseq;
M _md;
bool _empty;
MdOperation _cur_md_op;
}
}
auto md = _read["MD"];
string md_str;
if (!md.is_nothing) {
md_str = cast(string)_read["MD"];
}
static auto getResult(R, M)(ref T read, R query, M md_ops) {
return Result!(R, M)(read, query, md_ops);
}
auto result = getResult(_read,
auto result = getResult(_read,
joiner(map!"a.sequence"(filter!"a.operation.is_reference_consuming"(query_chunks))),
mdOperations(md_str));
@ -215,7 +216,7 @@ auto dna(T)(T read)
unittest {
import std.stdio;
writeln("Testing reconstruction of reference from MD tags and CIGAR");
// stderr.writeln("Testing reconstruction of reference from MD tags and CIGAR");
// Test reference reconstruction from MD and CIGAR.
// (Tests are taken from http://davetang.org/muse/2011/01/28/perl-and-sam/)
@ -314,19 +315,19 @@ auto dna(R)(R reads)
/*
* If current chunk is empty, get the next one.
*
* Here's the reference:
*
* Here's the reference:
* .........................*.......................................
* _reference_pos (we are here)
* Last chunk ended just now:
* [..........]
* Go through subsequent reads while their leftmost position is
* less or equal to _reference_pos, select the one which covers
* more bases to the right of _reference_pos.
* [...............]
* [....]
* [..........]
* [.........] <- this one is the best
* _reference_pos (we are here)
* Last chunk ended just now:
* [..........]
* Go through subsequent reads while their leftmost position is
* less or equal to _reference_pos, select the one which covers
* more bases to the right of _reference_pos.
* [...............]
* [....]
* [..........]
* [.........] <- this one is the best
*/
if (_chunk.empty) {
if (_reads.empty) {
@ -377,7 +378,7 @@ auto dna(R)(R reads)
debug {
/*
import std.stdio;
writeln("_reference_pos = ", _reference_pos,
writeln("_reference_pos = ", _reference_pos,
"; best_read.position = ", best_read.position,
"; _chunk length = ", best_read.basesCovered());
*/
@ -400,7 +401,7 @@ auto dna(R)(R reads)
unittest {
// reads are taken from HG00110.chrom20.ILLUMINA.bwa.GBR.exome.20111114.bam
// reads are taken from HG00110.chrom20.ILLUMINA.bwa.GBR.exome.20111114.bam
auto r1 = BamRead("r1",
"AGGTTTTGTGAGTGGGACAGTTGCAGCAAAACACAACCATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTA",
@ -415,7 +416,7 @@ unittest {
CigarOperation(7, 'S')]);
r2.position = 60252;
r2["MD"] = "82T0";
auto r3 = BamRead("r3",
"CATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTAGCCCTTGCTGACTGCTGGGCAGGGAGAGAGCAGAGCT",
[CigarOperation(90, 'M')]);

3
bio/bam/pileup.d

@ -69,6 +69,7 @@
/// ---------------------------------------------------------
module bio.bam.pileup;
import bio.bam.cigar;
import bio.bam.read;
import bio.bam.md.reconstruct;
import bio.bam.splitter;
@ -760,7 +761,7 @@ unittest {
auto reference = to!string(dna(reads));
import std.stdio;
writeln("Testing pileup (low-level aspects)...");
// stderr.writeln("Testing pileup (low-level aspects)...");
auto pileup = makePileup(reads, true, 796, 849, false);
auto pileup2 = makePileup(reads, true, 0, ulong.max, false);

230
bio/bam/read.d

@ -51,6 +51,7 @@ import bio.core.base;
import bio.core.utils.format;
import bio.bam.abstractreader;
import bio.bam.cigar;
import bio.bam.writer;
import bio.bam.tagvalue;
import bio.bam.bai.bin;
@ -74,233 +75,6 @@ import std.array;
import std.bitmanip;
import core.stdc.stdlib;
/**
Represents single CIGAR operation
*/
struct CigarOperation {
static assert(CigarOperation.sizeof == uint.sizeof);
/*
WARNING!
It is very essential that the size of
this struct is EXACTLY equal to uint.sizeof!
The reason is to avoid copying of arrays during alignment parsing.
Namely, when some_pointer points to raw cigar data,
we can just do a cast. This allows to access those data
directly, not doing any memory allocations.
*/
private uint raw; // raw data from BAM
private static ubyte char2op(char c) {
switch(c) {
case 'M': return 0;
case 'I': return 1;
case 'D': return 2;
case 'N': return 3;
case 'S': return 4;
case 'H': return 5;
case 'P': return 6;
case '=': return 7;
case 'X': return 8;
default: return 15; // 15 is used as invalid value
}
}
/// Length must be strictly less than 2^28.
/// $(BR)
/// Operation type must be one of M, I, D, N, S, H, P, =, X.
this(uint length, char operation_type) {
enforce(length < (1<<28), "Too big length of CIGAR operation");
raw = (length << 4) | char2op(operation_type);
}
/// Operation length
uint length() @property const nothrow {
return raw >> 4;
}
/// CIGAR operation as one of MIDNSHP=X.
/// Absent or invalid operation is represented by '?'
char type() @property const nothrow {
return "MIDNSHP=X????????"[raw & 0xF];
}
// Each pair of bits has first bit set iff the operation is query consuming,
// and second bit set iff it is reference consuming.
// X = P H S N D I M
private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
/// True iff operation is one of M, =, X, I, S
bool is_query_consuming() @property const {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
}
/// True iff operation is one of M, =, X, D, N
bool is_reference_consuming() @property const {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
}
/// True iff operation is one of M, =, X
bool is_match_or_mismatch() @property const {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
}
/// True iff operation is one of 'S', 'H'
bool is_clipping() @property const {
return ((raw & 0xF) >> 1) == 2; // 4 or 5
}
private void toSam(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
sink.write(length);
sink.write(type);
}
void toString(scope void delegate(const(char)[]) sink) const {
toSam(sink);
}
}
/// Forward range of extended CIGAR operations, with =/X instead of M
/// Useful for, e.g., detecting positions of mismatches.
struct ExtendedCigarRange(CigarOpRange, MdOpRange) {
static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation));
static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation));
private {
CigarOpRange _cigar;
MdOpRange _md_ops;
CigarOperation _front_cigar_op;
MdOperation _front_md_op;
uint _n_mismatches;
bool _empty;
}
///
this(CigarOpRange cigar, MdOpRange md_ops) {
_cigar = cigar;
_md_ops = md_ops;
fetchNextCigarOp();
fetchNextMdOp();
}
/// Forward range primitives
bool empty() @property const {
return _empty;
}
/// ditto
CigarOperation front() @property {
debug {
import std.stdio;
writeln(_front_cigar_op, " - ", _front_md_op);
}
if (_front_cigar_op.type != 'M')
return _front_cigar_op;
if (_n_mismatches == 0) {
assert(_front_md_op.is_match);
uint len = min(_front_md_op.match, _front_cigar_op.length);
return CigarOperation(len, '=');
}
assert(_front_md_op.is_mismatch);
return CigarOperation(min(_n_mismatches, _front_cigar_op.length), 'X');
}
/// ditto
ExtendedCigarRange save() @property {
typeof(return) r = void;
r._cigar = _cigar.save;
r._md_ops = _md_ops.save;
r._front_cigar_op = _front_cigar_op;
r._front_md_op = _front_md_op;
r._n_mismatches = _n_mismatches;
r._empty = _empty;
return r;
}
/// ditto
void popFront() {
if (!_front_cigar_op.is_match_or_mismatch) {
if (_front_cigar_op.is_reference_consuming)
fetchNextMdOp();
fetchNextCigarOp();
return;
}
auto len = _front_cigar_op.length;
if (_n_mismatches > 0) {
enforce(_front_md_op.is_mismatch);
if (len > _n_mismatches) {
_front_cigar_op = CigarOperation(len - _n_mismatches, 'M');
_n_mismatches = 0;
fetchNextMdOp();
} else if (len < _n_mismatches) {
_n_mismatches -= len;
fetchNextCigarOp();
} else {
fetchNextCigarOp();
fetchNextMdOp();
}
} else {
enforce(_front_md_op.is_match);
auto n_matches = _front_md_op.match;
if (len > n_matches) {
_front_cigar_op = CigarOperation(len - n_matches, 'M');
fetchNextMdOp();
} else if (len < n_matches) {
_front_md_op.match -= len;
fetchNextCigarOp();
} else {
fetchNextCigarOp();
fetchNextMdOp();
}
}
}
private {
void fetchNextCigarOp() {
if (_cigar.empty) {
_empty = true;
return;
}
_front_cigar_op = _cigar.front;
_cigar.popFront();
}
void fetchNextMdOp() {
if (_md_ops.empty)
return;
_n_mismatches = 0;
_front_md_op = _md_ops.front;
_md_ops.popFront();
if (_front_md_op.is_mismatch) {
_n_mismatches = 1;
while (!_md_ops.empty && _md_ops.front.is_mismatch) {
_md_ops.popFront();
_n_mismatches += 1;
}
}
}
}
}
auto makeExtendedCigar(CigarOpRange, MdOpRange)(CigarOpRange cigar, MdOpRange md_ops) {
return ExtendedCigarRange!(CigarOpRange, MdOpRange)(cigar, md_ops);
}
/**
BAM record representation.
*/
@ -1508,7 +1282,7 @@ unittest {
import std.stdio;
import std.math;
writeln("Testing BamRead behaviour...");
// stderr.writeln("Testing BamRead behaviour...");
auto read = BamRead("readname",
"AGCTGACTACGTAATAGCCCTA",
[CigarOperation(22, 'M')]);

2
bio/bam/utils/samheadermerger.d

@ -304,7 +304,7 @@ unittest {
import std.stdio;
import std.algorithm;
writeln("Testing SAM header merging...");
// stderr.writeln("Testing SAM header merging...");
auto h1 = new SamHeader();
auto h2 = new SamHeader();
auto h3 = new SamHeader();

2
bio/core/bgzf/compress.d

@ -35,7 +35,7 @@ import std.bitmanip: nativeToLittleEndian;
///
/// Params:
/// chunk = chunk of memory to be compressed
/// level = compression level, see zlib documentation
/// level = compression level, see zlib documentation (https://github.com/madler/zlib/blob/master/zlib.h)
/// buffer = optional buffer which will be used for storing
/// decompressed data
///

9
bio/core/bgzf/outputstream.d

@ -190,11 +190,14 @@ class BgzfOutputStream : Stream {
/// Flush all remaining BGZF blocks and underlying stream.
override void flush() {
flushCurrentBlock();
foreach (task; _compression_tasks) {
while (!_compression_tasks.empty) {
auto task = _compression_tasks.front;
auto block = task.yieldForce();
writeResult(block);
writeResult(block);
_compression_tasks.popFront();
}
_stream.flush();
_current_size = 0;
}

151
bio/core/fastq.d

@ -0,0 +1,151 @@
/*
This file is part of BioD.
Copyright (C) 2016 George Githinji <biorelated@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
/*
The bio.core.fastq module is based from a question that I posted on forums.dlang.org.
I have bundled the answer as a module to support parsing fastq files with D.
Credit should go to Rikki Cattermole.
*/
module bio.core.fastq;
struct FastqRecord {
const(char)[] id;
const(char)[] seq;
const(char)[] qual;
static auto read(const(char)[] from) {
struct Result {
private {
const(char)[] source;
FastqRecord value;
bool isEmpty;
}
this(const(char)[] source) {
this.source = source;
popFront;
}
@property {
FastqRecord front() {
return value;
}
bool empty() {
return isEmpty;
}
}
void popFront() {
import std.string : indexOf;
if (source is null) {
isEmpty = true;
return;
}
void tidyInput() {
foreach(i, c; source) {
switch(c) {
case 0: .. case ' ':
break;
default:
source = source[i .. $];
return;
}
}
source = null;
}
tidyInput();
if (source is null)
return;
// id
assert(source[0] == '@');
ptrdiff_t len = source.indexOf("\n");
assert(len > 0);
value.id = source[1 .. len];
if (value.id[$-1] == "\r"[0])
value.id = value.id[0 .. $-1];
source = source[len + 1 .. $];
// seq
len = source.indexOf("\n");
assert(len > 0);
value.seq = source[0 .. len];
if (value.seq[$-1] == "\r"[0])
value.seq = value.seq[0 .. $-1];
source = source[len + 1 .. $];
// +id
len = source.indexOf("\n");
assert(len > 0);
source = source[len + 1 .. $];
// qual
len = source.indexOf("\n");
assert(len > 0);
value.qual = source[0 .. len];
if (value.qual[$-1] == "\r"[0])
value.qual = value.qual[0 .. $-1];
if (source.length > len + 1) {
source = source[len + 1 .. $];
tidyInput();
} else
source = null;
}
}
return Result(from);
}
}
unittest {
string input = """
@seq1
TTATTTTAAT
+
?+BBB/DHH@
@seq2
GACCCTTTGCA
+
?+BHB/DIH@
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
"""[1 .. $];
foreach(record; FastqRecord.read(input)) {
import std.stdio;
// stderr.writeln(record); -> should be an assert statement
}
}

288
bio/core/region.d

@ -26,15 +26,64 @@
module bio.core.region;
#line 26 "region.d"
static const int region_parser_start = 1;
static const int region_parser_first_final = 3;
static const int region_parser_error = 0;
#line 30 "region.d"
static byte[] _region_parser_actions = [
0, 1, 1, 1, 2, 1, 3, 1,
4, 2, 0, 1
];
static const int region_parser_en_region = 1;
static byte[] _region_parser_key_offsets = [
0, 0, 6, 9, 12, 19, 21, 25
];
static char[] _region_parser_trans_keys = [
33u, 41u, 43u, 60u, 62u, 126u, 44u, 48u,
57u, 58u, 33u, 126u, 44u, 33u, 47u, 48u,
57u, 58u, 126u, 33u, 126u, 44u, 45u, 48u,
57u, 44u, 48u, 57u, 0
];
#line 40 "region.rl"
static byte[] _region_parser_single_lengths = [
0, 0, 1, 1, 1, 0, 2, 1
];
static byte[] _region_parser_range_lengths = [
0, 3, 1, 1, 3, 1, 1, 1
];
static byte[] _region_parser_index_offsets = [
0, 0, 4, 7, 10, 15, 17, 21
];
static byte[] _region_parser_indicies = [
0, 0, 0, 1, 2, 2, 1, 3,
0, 1, 5, 4, 5, 4, 1, 4,
1, 6, 7, 6, 1, 8, 8, 1,
0
];
static byte[] _region_parser_trans_targs = [
3, 0, 7, 4, 5, 6, 6, 2,
7
];
static byte[] _region_parser_trans_actions = [
0, 0, 9, 3, 0, 9, 1, 5,
1
];
static byte[] _region_parser_eof_actions = [
0, 0, 0, 3, 3, 3, 5, 7
];
static int region_parser_start = 1;
static int region_parser_first_final = 3;
static int region_parser_error = 0;
static int region_parser_en_region = 1;
#line 44 "region.rl"
import std.conv;
@ -57,161 +106,142 @@ Region parseRegion(string str) {
region.end = uint.max;
#line 57 "region.d"
#line 110 "region.d"
{
cs = region_parser_start;
}
#line 62 "region.rl"
#line 66 "region.rl"
#line 64 "region.d"
#line 117 "region.d"
{
int _klen;
uint _trans;
byte* _acts;
uint _nacts;
char* _keys;
if ( p == pe )
goto _test_eof;
switch ( cs )
if ( cs == 0 )
goto _out;
_resume:
_keys = &_region_parser_trans_keys[_region_parser_key_offsets[cs]];
_trans = _region_parser_index_offsets[cs];
_klen = _region_parser_single_lengths[cs];
if ( _klen > 0 ) {
char* _lower = _keys;
char* _mid;
char* _upper = _keys + _klen - 1;
while (1) {
if ( _upper < _lower )
break;
_mid = _lower + ((_upper-_lower) >> 1);
if ( (*p) < *_mid )
_upper = _mid - 1;
else if ( (*p) > *_mid )
_lower = _mid + 1;
else {
_trans += cast(uint)(_mid - _keys);
goto _match;
}
}
_keys += _klen;
_trans += _klen;
}
_klen = _region_parser_range_lengths[cs];
if ( _klen > 0 ) {
char* _lower = _keys;
char* _mid;
char* _upper = _keys + (_klen<<1) - 2;
while (1) {
if ( _upper < _lower )
break;
_mid = _lower + (((_upper-_lower) >> 1) & ~1);
if ( (*p) < _mid[0] )
_upper = _mid - 2;
else if ( (*p) > _mid[1] )
_lower = _mid + 2;
else {
_trans += cast(uint)((_mid - _keys)>>1);
goto _match;
}
}
_trans += _klen;
}
_match:
_trans = _region_parser_indicies[_trans];
cs = _region_parser_trans_targs[_trans];
if ( _region_parser_trans_actions[_trans] == 0 )
goto _again;
_acts = &_region_parser_actions[_region_parser_trans_actions[_trans]];
_nacts = cast(uint) *_acts++;
while ( _nacts-- > 0 )
{
goto case; case 1:
if ( (*p) < 43u ) {
if ( 33u <= (*p) && (*p) <= 41u )
goto st3;
} else if ( (*p) > 60u ) {
if ( 62u <= (*p) && (*p) <= 126u )
goto st3;
} else
goto st3;
goto st0;
st0:
cs = 0;
goto _out;
st3:
if ( ++p == pe )
goto _test_eof3;
goto case; case 3:
if ( (*p) == 58u )
goto tr3;
if ( 33u <= (*p) && (*p) <= 126u )
goto st3;
goto st0;
tr3:
switch ( *_acts++ )
{
case 0:
#line 29 "region.rl"
{ region.reference = str[0 .. p - str.ptr]; }
goto st4;
st4:
if ( ++p == pe )
goto _test_eof4;
goto case; case 4:
#line 100 "region.d"
if ( (*p) == 44u )
goto tr5;
if ( (*p) < 48u ) {
if ( 33u <= (*p) && (*p) <= 47u )
goto st5;
} else if ( (*p) > 57u ) {
if ( 58u <= (*p) && (*p) <= 126u )
goto st5;
} else
goto tr5;
goto st0;
st5:
if ( ++p == pe )
goto _test_eof5;
goto case; case 5:
if ( 33u <= (*p) && (*p) <= 126u )
goto st5;
goto st0;
tr5:
#line 25 "region.rl"
{ uint_value = 0; }
#line 26 "region.rl"
{ if ((*p) != ',') uint_value *= 10, uint_value += (*p) - '0'; }
goto st6;
tr6:
#line 26 "region.rl"
{ if ((*p) != ',') uint_value *= 10, uint_value += (*p) - '0'; }
goto st6;
st6:
if ( ++p == pe )
goto _test_eof6;
goto case; case 6:
#line 133 "region.d"
switch( (*p) ) {
case 44u: goto tr6;
case 45u: goto tr7;
default: break;
}
if ( 48u <= (*p) && (*p) <= 57u )
goto tr6;
goto st0;
tr7:
break;
case 1:
#line 30 "region.rl"
{ region.beg = to!uint(uint_value - 1); }
goto st2;
st2:
if ( ++p == pe )
goto _test_eof2;
goto case; case 2:
#line 150 "region.d"
if ( (*p) == 44u )
goto tr2;
if ( 48u <= (*p) && (*p) <= 57u )
goto tr2;
goto st0;
tr2:
#line 25 "region.rl"
{ uint_value = 0; }
#line 26 "region.rl"
{ if ((*p) != ',') uint_value *= 10, uint_value += (*p) - '0'; }
goto st7;
tr8:
#line 26 "region.rl"
{ if ((*p) != ',') uint_value *= 10, uint_value += (*p) - '0'; }
goto st7;
st7:
if ( ++p == pe )
goto _test_eof7;
goto case; case 7:
#line 170 "region.d"
if ( (*p) == 44u )
goto tr8;
if ( 48u <= (*p) && (*p) <= 57u )
goto tr8;
goto st0;
break;
case 2:
#line 33 "region.rl"
{ region.reference = str[0 .. p - str.ptr]; }
break;
case 3:
#line 34 "region.rl"
{ region.beg = to!uint(uint_value - 1); }
break;
#line 207 "region.d"
default: break;
}
}
_test_eof3: cs = 3; goto _test_eof;
_test_eof4: cs = 4; goto _test_eof;
_test_eof5: cs = 5; goto _test_eof;
_test_eof6: cs = 6; goto _test_eof;
_test_eof2: cs = 2; goto _test_eof;
_test_eof7: cs = 7; goto _test_eof;
_again:
if ( cs == 0 )
goto _out;
if ( ++p != pe )
goto _resume;
_test_eof: {}
if ( p == eof )
{
switch ( cs ) {
case 3:
case 4:
case 5:
#line 29 "region.rl"
byte* __acts = &_region_parser_actions[_region_parser_eof_actions[cs]];
uint __nacts = cast(uint) *__acts++;
while ( __nacts-- > 0 ) {
switch ( *__acts++ ) {
case 2:
#line 33 "region.rl"
{ region.reference = str[0 .. p - str.ptr]; }
break;
case 6:
#line 30 "region.rl"
case 3:
#line 34 "region.rl"
{ region.beg = to!uint(uint_value - 1); }
break;
case 7:
#line 31 "region.rl"
case 4:
#line 35 "region.rl"
{ region.end = to!uint(uint_value); }
break;
#line 203 "region.d"
#line 236 "region.d"
default: break;
}
}
}
_out: {}
}
#line 63 "region.rl"
#line 67 "region.rl"
return region;
}

4
bio/core/utils/outbuffer.d

@ -1,6 +1,6 @@
/*
This file is part of BioD.
Copyright (C) 2013 Artem Tarasov <lomereiter@gmail.com>
Copyright (C) 2013-2017 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
@ -79,7 +79,7 @@ class OutBuffer {
do {
_heap_capacity = _heap_capacity * 3 / 2;
} while (_heap_capacity < needed);
capacity = _heap_capacity;
_heap.length = _heap_capacity;
}
_heap_ptr[_heap_used .. _heap_used + bytes.length] = bytes[];
_heap_used = needed;

8
bio/core/utils/roundbuf.d

@ -46,24 +46,26 @@ struct RoundBuf(T) {
/// ditto
auto ref front() @property {
enforce(!empty, "buffer is empty");
enforce(!empty, "roundbuffer is empty");
return _items[_taken % $];
}
/// ditto
void popFront() {
enforce(!empty, "roundbuffer is empty");
++_taken;
}
///
auto ref back() @property {
enforce(!empty, "buffer is empty");
enforce(!empty, "roundbuffer is empty");
return _items[(_put - 1) % $];
}
/// Output range primitive
void put(T item) {
enforce(!full, "buffer is full");
enforce(!full, "roundbuffer is full");
enforce(_put < _put.max, "ringbuffer overflow");
_items[_put % $] = item;
++_put;
}

101
bio/sam/header.d

@ -251,12 +251,12 @@ unittest {
import std.algorithm;
import std.stdio;
writeln("Testing @HD line parsing...");
// stderr.writeln("Testing @HD line parsing...");
auto hd_line = HdLine.parse("@HD\tVN:1.0\tSO:coordinate");
assert(hd_line.format_version == "1.0");
assert(hd_line.sorting_order == "coordinate");
writeln("Testing @SQ line parsing...");
// stderr.writeln("Testing @SQ line parsing...");
auto sq_line = SqLine.parse("@SQ\tSN:NC_007605\tLN:171823\tM5:6743bd63b3ff2b5b8985d8933c53290a\tUR:ftp://.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz\tAS:NCBI37\tSP:HUMAN");
assert(sq_line.name == "NC_007605");
assert(sq_line.length == 171823);
@ -265,7 +265,7 @@ unittest {
assert(sq_line.assembly == "NCBI37");
assert(sq_line.species == "HUMAN");
writeln("Testing @RG line parsing...");
// stderr.writeln("Testing @RG line parsing...");
auto rg_line = RgLine.parse("@RG\tID:ERR016155\tLB:HUMgdtRAGDIAAPE\tSM:HG00125\tPI:488\tCN:BGI\tPL:ILLUMINA\tDS:SRP001294");
assert(rg_line.identifier == "ERR016155");
assert(rg_line.library == "HUMgdtRAGDIAAPE");
@ -275,7 +275,7 @@ unittest {
assert(rg_line.platform == "ILLUMINA");
assert(rg_line.description == "SRP001294");
writeln("Testing @PG line parsing...");
// stderr.writeln("Testing @PG line parsing...");
auto pg_line = PgLine.parse("@PG\tID:bam_calculate_bq\tPN:samtools\tPP:bam_recalibrate_quality_scores\tVN:0.1.17 (r973:277)\tCL:samtools calmd -Erb $bam_file $reference_fasta > $bq_bam_file");
assert(pg_line.identifier == "bam_calculate_bq");
assert(pg_line.name == "samtools");
@ -651,66 +651,65 @@ class SamHeader {
void toJson(Sink)(auto ref Sink sink) if (isSomeSink!Sink) {
JSONValue[string] result;
result["format_version"].str = format_version;
result["format_version"] = format_version;
if (sorting_order != SortingOrder.unknown) {
result["sorting_order"].str = sorting_order.to!string;
result["sorting_order"] = sorting_order.to!string;
}
auto tmp = new JSONValue[sequences.length];
for (auto i = 0; i < sequences.length; i++) {
auto line = getSequence(i);
JSONValue[string] sq;
sq["sequence_name"].str = line.name;
sq["sequence_length"].uinteger = line.length;
sq["assembly"].str = line.assembly;
sq["md5"].str = line.md5;
sq["species"].str = line.species;
sq["uri"].str = line.uri;
tmp[i].object = sq;
auto line = getSequence(i);
JSONValue[string] sq;
sq["sequence_name"] = line.name;
sq["sequence_length"] = line.length;
sq["assembly"] = line.assembly;
sq["md5"] = line.md5;
sq["species"] = line.species;
sq["uri"] = line.uri;
tmp[i].object = sq;
}
result["sq_lines"].array = tmp.dup;
tmp.length = read_groups.length;
size_t i = 0;
foreach (line; read_groups) {
JSONValue[string] sq;
sq["identifier"].str = line.identifier;
sq["sequencing_center"].str = line.sequencing_center;
sq["description"].str = line.description;
sq["date"].str = line.date;
sq["flow_order"].str = line.flow_order;
sq["key_sequence"].str = line.key_sequence;
sq["library"].str = line.library;
sq["programs"].str = line.programs;
sq["predicted_insert_size"].integer = line.predicted_insert_size;
sq["platform"].str = line.platform;
sq["platform_unit"].str = line.platform_unit;
sq["sample"].str = line.sample;
tmp[i++].object = sq;
result["sq_lines"] = tmp.dup;
tmp = null;
auto tmp2 = new JSONValue[read_groups.length];
foreach (i, line; read_groups) {
JSONValue[string] sq;
sq["identifier"] = line.identifier;
sq["sequencing_center"] = line.sequencing_center;
sq["description"] = line.description;
sq["date"] = line.date;
sq["flow_order"] = line.flow_order;
sq["key_sequence"] = line.key_sequence;
sq["library"] = line.library;
sq["programs"] = line.programs;
sq["predicted_insert_size"] = line.predicted_insert_size;
sq["platform"] = line.platform;
sq["platform_unit"] = line.platform_unit;
sq["sample"] = line.sample;
tmp2[i].object = sq;
}
result["rg_lines"].array = tmp.dup;
tmp.length = programs.length;
i = 0;
foreach (line; programs) {
JSONValue[string] sq;
sq["identifier"].str = line.identifier;
sq["program_name"].str = line.name;
sq["command_line"].str = line.command_line;
sq["previous_program"].str = line.previous_program;
sq["program_version"].str = line.program_version;
tmp.array[i++].object = sq;
result["rg_lines"] = tmp2;
tmp2 = null;
auto tmp3 = new JSONValue[programs.length];
foreach (i, line; programs) {
JSONValue[string] sq;
sq["identifier"] = line.identifier;
sq["program_name"] = line.name;
sq["command_line"] = line.command_line;
sq["previous_program"] = line.previous_program;
sq["program_version"] = line.program_version;
tmp3[i].object = sq;
}
result["pg_lines"].array = tmp;
result["pg_lines"] = tmp3;
JSONValue json;
json.object = result;
static if (__VERSION__ < 2072)