@@ -5,4 +5,5 @@ __dummy.html | |||
*.obj | |||
dub.selections.json | |||
bin/ | |||
old_biod/ | |||
build/ |
@@ -21,7 +21,7 @@ | |||
DEALINGS IN THE SOFTWARE. | |||
*/ | |||
module bio.core.fasta; | |||
module bio.std.file.fasta; | |||
import std.file; | |||
import std.exception; | |||
@@ -26,7 +26,7 @@ | |||
Credit should go to Rikki Cattermole. | |||
*/ | |||
module bio.core.fastq; | |||
module bio.std.file.fastq; | |||
struct FastqRecord { | |||
const(char)[] id; | |||
@@ -1,3 +0,0 @@ | |||
bio2 modules are the next generation BAM reading tools. Some of the | |||
functionality aims to replace that of bio modules in this same | |||
repository. |
@@ -1,101 +0,0 @@ | |||
/* | |||
New style BAM reader. This file is part of Sambamba. | |||
Copyright (C) 2017,2018 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published | |||
by the Free Software Foundation; either version 2 of the License, | |||
or (at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, but | |||
WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |||
02111-1307 USA | |||
*/ | |||
// This is a complete rewrite of Artem Tarasov's original reader. | |||
module bio2.bam.header; | |||
/* | |||
import std.conv; | |||
import core.stdc.stdio: fopen, fread, fclose; | |||
import std.typecons; | |||
import std.bitmanip; | |||
import bio.bam.cigar; | |||
*/ | |||
import std.exception; | |||
import std.file; | |||
import std.stdio; | |||
import std.string; | |||
import bio.bam.constants; | |||
import bio2.bgzf; | |||
import bio2.bgzf_writer; | |||
import bio2.constants; | |||
struct RefSequence { | |||
size_d length; | |||
string name; | |||
} | |||
struct BamHeader { | |||
string id; | |||
string text; | |||
RefSequence[] refs; | |||
@disable this(this); // disable copy semantics; | |||
} | |||
void fetch_bam_header(ref BamHeader header, ref BgzfStream stream) { | |||
// stderr.writeln("Fetching BAM header"); | |||
ubyte[4] ubyte4; | |||
stream.read(ubyte4); | |||
enforce(ubyte4 == BAM_MAGIC,"Invalid file format: expected BAM magic number"); | |||
immutable text_size = stream.read!int(); | |||
// stderr.writeln("Text size ",text_size.sizeof," ",text_size); | |||
immutable text = stream.read!string(text_size); | |||
header = BamHeader(BAM_MAGIC,text); | |||
immutable n_refs = stream.read!int(); | |||
// stderr.writeln("Fetching ",n_refs," references"); | |||
foreach(int n_ref; 0..n_refs) { | |||
immutable l_name = stream.read!int(); | |||
// stderr.writeln("!!",l_name); | |||
auto ref_name = stream.read!string(l_name); | |||
immutable l_ref = stream.read!int(); // length of reference sequence (bps) | |||
// stderr.writeln(l_name," ",ref_name," ",l_ref); | |||
header.refs ~= RefSequence(l_ref,ref_name[0..l_name-1]); // drop zero terminator | |||
} | |||
} | |||
void write_bam_header(ref BgzfWriter bw, ref BamHeader header) { | |||
// stderr.writeln("Writing BAM header"); | |||
ubyte[4] magic = cast(ubyte[])BAM_MAGIC; | |||
bw.write(magic); | |||
// stderr.writeln("Text size ",int.sizeof," ",header.text.length); | |||
bw.write!int(cast(int)header.text.length); | |||
bw.write(header.text); | |||
auto n_refs = cast(int)header.refs.length; | |||
bw.write!int(cast(int)header.refs.length); | |||
// stderr.writeln("Writing ",n_refs," references"); | |||
foreach(int n_ref; 0..n_refs) { | |||
immutable refseq = header.refs[n_ref]; | |||
bw.write!int(cast(int)(refseq.name.length+1)); // incl. zero terminator | |||
// stderr.writeln("!!",refseq.name.length+1); | |||
bw.write(refseq.name); | |||
bw.write!ubyte(cast(ubyte)'\0'); | |||
bw.write!int(cast(int)refseq.length); | |||
// stderr.writeln(refseq.name.length+1," ",refseq.name," ",refseq.length); | |||
} | |||
// stderr.writeln("!!"); | |||
bw.flush_block(); | |||
} |
@@ -1,548 +0,0 @@ | |||
/* | |||
New style BAM reader. This file is part of Sambamba. | |||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published | |||
by the Free Software Foundation; either version 2 of the License, | |||
or (at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, but | |||
WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |||
02111-1307 USA | |||
*/ | |||
// This is a complete rewrite of Artem Tarasov's original reader. | |||
module bio2.bam.reader; | |||
import std.conv; | |||
import core.stdc.stdio: fopen, fread, fclose; | |||
import std.exception; | |||
import std.file; | |||
import std.format : format; | |||
import std.stdio; | |||
import std.string; | |||
import std.typecons; | |||
import std.bitmanip; | |||
import bio.bam.cigar; | |||
import bio.bam.constants; | |||
import bio.core.utils.exception; | |||
import bio2.bgzf; | |||
import bio2.constants; | |||
import bio2.bam.header; | |||
template ReadFlags(alias flag) { | |||
@property bool is_paired() nothrow { return cast(bool)(flag & 0x1); } | |||
/// Each segment properly aligned according to the aligner | |||
@property bool is_proper_pair() nothrow { return cast(bool)(flag & 0x2); } | |||
@property bool is_unmapped_raw() nothrow { return cast(bool)(flag & 0x4); } | |||
@property bool is_mapped_raw() nothrow { return cast(bool)(!(flag & 0x4)); } | |||
@property bool mate_is_unmapped() nothrow { return cast(bool)(flag & 0x8); } | |||
@property bool is_reverse_strand() nothrow { return cast(bool)(flag & 0x10); } | |||
@property bool mate_is_reverse_strand() nothrow { return cast(bool)(flag & 0x20); } | |||
@property bool is_first_of_pair() nothrow { return cast(bool)(flag & 0x40); } | |||
@property bool is_second_of_pair() nothrow { return cast(bool)(flag & 0x80); } | |||
@property bool is_secondary_alignment() nothrow { return cast(bool)(flag & 0x100); } | |||
@property bool is_qc_fail() { | |||
assert(is_mapped_raw,to!string(this)); | |||
return cast(bool)(flag & 0x200); } | |||
alias is_qc_fail failed_quality_control; | |||
/// PCR or optical duplicate | |||
@property bool is_duplicate() nothrow { return cast(bool)(flag & 0x400); } | |||
/// Supplementary alignment | |||
@property bool is_supplementary() nothrow { return cast(bool)(flag & 0x800); } | |||
@property string show_flags() { | |||
string res = format("b%b-%d",flag,flag); | |||
if (is_paired) res ~= " pair"; | |||
if (is_proper_pair) res ~= " proper"; | |||
if (is_mapped_raw) res ~= " mapped"; | |||
if (is_unmapped_raw) res ~= " unmapped"; | |||
if (mate_is_unmapped) res ~= " mate_unmapped"; | |||
if (is_reverse_strand) res ~= " rev_strand"; | |||
if (mate_is_reverse_strand) res ~= " mate_rev_strand"; | |||
if (is_first_of_pair) res ~= " first_of_pair"; | |||
if (is_second_of_pair) res ~= " second_of_pair"; | |||
if (is_secondary_alignment) res ~= " secondary_aln"; | |||
if (is_mapped_raw && is_qc_fail) res ~= " qc_fail"; | |||
if (is_duplicate) res ~= " duplicate"; | |||
if (is_supplementary) res ~= " suppl"; | |||
return res; | |||
} | |||
} | |||
template CheckMapped(alias refid) { | |||
@property nothrow bool is_unmapped() { | |||
return is_unmapped_raw; | |||
} | |||
@property bool is_mapped() { | |||
debug { | |||
if (is_mapped_raw) { | |||
assert(refid != -1, "ref_id can not be -1 for mapped read"); // BAM spec | |||
} | |||
} | |||
return !is_unmapped_raw; | |||
} | |||
} | |||
enum Offset { | |||
bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24 | |||
}; | |||
/** | |||
Raw Read buffer containing unparsed data. It should be considered | |||
read-only. | |||
Current implementation is a cluct (class-struct hybrid). The _data | |||
pointer is shared when ReadBlob is assigned to another variable | |||
(i.e., there is a remote dependency). The advantage is that for | |||
each Read data gets allocated on the heap only once. | |||
All offsets are indexed on init (except for tags). When using | |||
fields beyond refid,pos use ProcessReadBlob instead because it | |||
caches values. | |||
*/ | |||
struct ReadBlob { | |||
RefId refid; // -1 is invalid (BAM Spec) | |||
GenomePos pos; // 0 coordinate based (BAM spec) | |||
private ubyte[] _data; | |||
uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max; | |||
mixin ReadFlags!(_flag); | |||
mixin CheckMapped!(refid); | |||
/* | |||
this(RefId ref_id, GenomePos read_pos, ubyte[] buf) { | |||
refid = ref_id; | |||
pos = read_pos; | |||
_data = buf; | |||
} | |||
*/ | |||
@property void cleanup() { | |||
destroy(_data); | |||
_data = null; | |||
} | |||
// Turn ReadBlob into class-struct hybrid or a cluct ;) | |||
// @disable this(this); // disable copy semantics; | |||
@property @trusted nothrow private const T fetch(T)(uint raw_offset) { | |||
ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof]; | |||
return cast(const(T))buf.read!(T,Endian.littleEndian)(); | |||
} | |||
@property @trusted nothrow private const | |||
uint _bin_mq_nl() { return fetch!uint(Offset.bin_mq_nl); } | |||
@property @trusted nothrow private const | |||
uint _flag_nc() { return fetch!uint(Offset.flag_nc); } | |||
@property @trusted nothrow private const | |||
int sequence_length() { return fetch!int(Offset.l_seq); } | |||
@property @trusted nothrow private const | |||
int _next_refID() { return fetch!int(Offset.next_refID); } | |||
@property @trusted nothrow private const | |||
int _next_pos() { return fetch!int(Offset.next_pos); } | |||
@property @trusted nothrow private const | |||
int _tlen() { return fetch!int(Offset.tlen); } // avoid using TLEN | |||
@property @trusted nothrow private const | |||
ushort _bin() { return _bin_mq_nl >> 16; } | |||
@property @trusted nothrow private const | |||
ubyte _mapq() { return (_bin_mq_nl >> 8) & 0xFF; } | |||
@property @trusted nothrow private const | |||
ubyte _l_read_name() { return _bin_mq_nl & 0xFF; } | |||
@property @trusted nothrow private const | |||
ushort _flag() { return fetch!ushort(Offset.flag); } | |||
@property @trusted nothrow private const | |||
ushort _n_cigar_op() { return _flag_nc & 0xFFFF; } | |||
@property @trusted nothrow private const | |||
uint _read_name_offset() { return Offset.read_name; } | |||
@property @trusted nothrow private | |||
uint _cigar_offset() { | |||
if (offset_cigar == int.max) | |||
offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof); | |||
return offset_cigar; | |||
} | |||
@property @trusted nothrow private | |||
uint _seq_offset() { | |||
if (offset_seq == int.max) | |||
offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof); | |||
return offset_seq; | |||
} | |||
@property @trusted nothrow private | |||
uint _qual_offset() { | |||
if (offset_qual == int.max) | |||
offset_qual = _seq_offset + (sequence_length + 1)/2; | |||
return offset_qual; | |||
} | |||
@property @trusted nothrow private | |||
uint _tags_offset() { return _qual_offset + sequence_length; } | |||
@property @trusted nothrow private | |||
ubyte[] read_name() { return _data[_read_name_offset.._cigar_offset]; } | |||
@property @trusted nothrow private | |||
ubyte[] raw_cigar() { return _data[_cigar_offset.._seq_offset]; } | |||
@property @trusted nothrow private | |||
ubyte[] raw_sequence() { return _data[_seq_offset.._qual_offset]; } | |||
alias sequence_length _l_seq; | |||
alias _mapq mapping_quality; // MAPQ | |||
string toString() { | |||
return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">"; | |||
} | |||
} | |||
/** | |||
ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use | |||
this when you need to access field/elements multiple times. Note | |||
that ProcessReadBlob becomes invalid when ReadBlob goes out of scope. | |||
*/ | |||
struct ProcessReadBlob { | |||
private Nullable!ReadBlob _read2; | |||
Nullable!int sequence_length2; | |||
private Nullable!string sequence2, read_name2; | |||
private Nullable!CigarOperations cigar2; | |||
private Nullable!GenomePos consumed_reference_bases2; | |||
mixin ReadFlags!(_flag); | |||
mixin CheckMapped!(ref_id); | |||
this(Nullable!ReadBlob _r) { | |||
_read2 = _r; | |||
} | |||
@property void cleanup() { | |||
_read2.cleanup; | |||
} | |||
@property nothrow bool isNull() { | |||
return _read2.isNull; | |||
} | |||
@property RefId ref_id() { | |||
enforce(_read2.is_mapped,"Trying to get ref_id an unmapped read " ~ to!string(_read2)); | |||
return _read2.refid; | |||
} | |||
@property RefId raw_ref_id() { | |||
return _read2.refid; | |||
} | |||
@property nothrow uint _flag_nc() { | |||
return _read2._flag_nc; | |||
} | |||
@property nothrow ushort _flag() { | |||
return _read2._flag; | |||
} | |||
alias ref_id refid; | |||
/// Get the start position on the reference sequence (better use | |||
/// start_loc), i.e., the first base that gets consumed in the | |||
/// CIGAR. | |||
@property GenomePos start_pos() { | |||
assert(_read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec | |||
asserte(_read2.pos < GenomePos.max); | |||
return cast(GenomePos)_read2.pos; | |||
} | |||
@property GenomePos raw_start_pos() { | |||
return cast(GenomePos)_read2.pos; | |||
} | |||
/// Get the end position on the reference sequence (better use end_loc) | |||
@property GenomePos end_pos() { | |||
assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence"); | |||
assert(!consumed_reference_bases.isNull); | |||
return start_pos + consumed_reference_bases; | |||
} | |||
@property GenomePos raw_end_pos() { | |||
return raw_start_pos + consumed_reference_bases; | |||
} | |||
@property GenomeLocation start_loc() { | |||
return GenomeLocation(ref_id,start_pos); | |||
} | |||
@property GenomeLocation end_loc() { | |||
return GenomeLocation(ref_id,end_pos); | |||
} | |||
@property @trusted MappingQuality mapping_quality() { // MAPQ | |||
assert(_read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec | |||
return MappingQuality(_read2.mapping_quality); | |||
} | |||
@property @trusted int tlen() { // do not use | |||
return _read2._tlen; | |||
} | |||
@property @trusted GenomePos sequence_length() { | |||
if (sequence_length2.isNull) | |||
sequence_length2 = _read2.sequence_length; | |||
return sequence_length2; | |||
} | |||
/// Count and caches consumed reference bases. Uses raw_cigar to | |||
/// avoid a heap allocation. | |||
@property @trusted Nullable!GenomePos consumed_reference_bases() { | |||
if (consumed_reference_bases2.isNull) { | |||
assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec | |||
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec | |||
auto raw = cast(uint[]) _read2.raw_cigar(); | |||
if (raw.length==1 && raw[0] == '*') | |||
return consumed_reference_bases2; // null | |||
else { | |||
GenomePos bases = 0; | |||
for (size_t i = 0; i < raw.length; i++) { | |||
auto cigarop = CigarOperation(raw[i]); | |||
if (cigarop.is_reference_consuming) | |||
bases += cigarop.length; | |||
} | |||
consumed_reference_bases2 = bases; | |||
} | |||
} | |||
return consumed_reference_bases2; | |||
} | |||
/// Count query consumed bases. Uses raw_cigar to avoid a heap | |||
/// allocation. | |||
@property @trusted GenomePos consumed_query_bases() { | |||
assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec | |||
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec | |||
auto raw = cast(uint[]) _read2.raw_cigar(); | |||
if (raw.length==1 && raw[0] == '*') | |||
return consumed_reference_bases2; // null | |||
else { | |||
GenomePos bases = 0; | |||
for (size_t i = 0; i < raw.length; i++) { | |||
auto cigarop = CigarOperation(raw[i]); | |||
if (cigarop.is_query_consuming) | |||
bases += cigarop.length; | |||
} | |||
return bases; | |||
} | |||
} | |||
/// Return read name as a string. If unavailable returns | |||
/// null. Caches name. | |||
@property Nullable!string read_name() { | |||
if (read_name2.isNull) { | |||
assert(_read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec | |||
auto raw = _read2.read_name; | |||
if (raw.length == 0 || (raw.length ==1 && raw[0] == '*')) | |||
return read_name2; // null | |||
assert(raw.length < 255); // BAM spec | |||
if (raw[raw.length-1] == 0) // strip trailing C zero | |||
raw.length -= 1; | |||
read_name2 = Nullable!string(cast(string)raw); | |||
} | |||
return read_name2; | |||
} | |||
/// Returns Cigar as an array of operations. Returns null if no | |||
/// operations. Caches Cigar when there are operations. | |||
@property Nullable!CigarOperations cigar() { | |||
if (cigar2.isNull) { | |||
assert(_read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec | |||
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec | |||
auto raw = cast(uint[]) _read2.raw_cigar(); | |||
if (raw.length==0 || (raw.length==1 && raw[0] == '*')) | |||
return cigar2; // null | |||
else { | |||
auto s = new CigarOperation[raw.length]; // Heap alloc | |||
s.length = 0; | |||
for (size_t i = 0; i < raw.length; i++) { | |||
s ~= CigarOperation(raw[i]); | |||
} | |||
cigar2 = s; | |||
} | |||
} | |||
return cigar2; | |||
} | |||
/// Return human readable sequence fragment - null if | |||
/// undefined. Caches sequence. | |||
@property Nullable!string sequence() { | |||
if (sequence2.isNull) { // is it cached in sequence2? | |||
auto raw = _read2.raw_sequence(); | |||
if (raw[0] == '*') { | |||
assert(raw.length == 1); | |||
return sequence2; // null | |||
} | |||
auto raw_length = (sequence_length + 1) / 2; | |||
char[16] convert = "=ACMGRSVTWYHKDBN"; | |||
string s; | |||
s.reserve(sequence_length); // Heap alloc | |||
for (size_t i = 0; i < sequence_length; i++) { | |||
auto is_odd = i % 2; | |||
auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4); | |||
s ~= convert[nuc]; | |||
} | |||
sequence2 = s; | |||
} | |||
return sequence2; | |||
} | |||
@property ubyte[] toBlob() { | |||
return _read2._data; | |||
} | |||
@property string posString() { | |||
return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped"); | |||
} | |||
string toString() { | |||
// return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(_read2.refid) ~ ":" ~ to!string(_read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">"; | |||
return _read2.get.toString(); | |||
} | |||
} | |||
/** | |||
BamReader2 is used for foreach loops | |||
*/ | |||
struct BamReadBlobs { | |||
BgzfStream stream; | |||
BamHeader header; | |||
this(string fn) { | |||
stream = BgzfStream(fn); | |||
} | |||
int opApply(scope int delegate(ref ReadBlob) dg) { | |||
fetch_bam_header(header, stream); | |||
// parse the reads | |||
while (!stream.eof()) { | |||
immutable block_size = stream.read!int(); | |||
immutable refid = stream.read!int(); | |||
immutable pos = stream.read!int(); | |||
ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME | |||
auto read = ReadBlob(refid,pos,stream.read(data)); | |||
dg(read); | |||
} | |||
return 0; | |||
} | |||
} | |||
/** | |||
Read streamer - use on single thread only | |||
*/ | |||
// import core.memory : pureMalloc; | |||
struct BamReadBlobStream { | |||
BgzfStream stream; | |||
BamHeader header; | |||
Nullable!ReadBlob readbuf; // points to current read | |||
ubyte[] data; // in sync with readbuf | |||
this(string fn) { | |||
stream = BgzfStream(fn); | |||
fetch_bam_header(header, stream); | |||
if (!empty) | |||
popFront(); // preload front | |||
} | |||
bool empty() @property { | |||
return stream.eof(); | |||
} | |||
// Returns first read available. If past eof returns null. | |||
Nullable!ReadBlob front() { | |||
return readbuf; | |||
} | |||
void popFront() { | |||
asserte(!empty); // should have been checked for | |||
immutable block_size = stream.read!int(); | |||
immutable refid = stream.read!int(); | |||
immutable pos = stream.read!int(); | |||
// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness | |||
data = new ubyte[block_size-2*int.sizeof]; | |||
readbuf = ReadBlob(refid,pos,stream.read(data)); | |||
assert(readbuf._data.ptr == data.ptr); | |||
} | |||
} | |||
/** | |||
Reader - use on single thread only | |||
This one provides peek support. Peek looks one read ahead in the read stream. | |||
*/ | |||
struct BamBlobReader { | |||
BgzfStream stream; | |||
BamHeader header; | |||
Nullable!ReadBlob peekbuf; // points to current read | |||
// ubyte[] data; // in sync with peekbuf | |||
this(string fn) { | |||
stream = BgzfStream(fn); | |||
fetch_bam_header(header, stream); | |||
} | |||
bool empty() @property { | |||
return peekbuf.isNull && stream.eof(); | |||
} | |||
Nullable!ReadBlob peek() { | |||
if (peekbuf.isNull && !empty) | |||
fetch(); | |||
return peekbuf; | |||
} | |||
/// Fetches the next read. If the peekbuf is not empty return that | |||
/// first and reset peekbuf. | |||
Nullable!ReadBlob fetch() { | |||
if (!peekbuf.isNull) { | |||
auto readbuf = peekbuf; | |||
peekbuf = Nullable!ReadBlob(); | |||
return readbuf; | |||
} | |||
asserte(!empty); // should have been checked for | |||
immutable block_size = stream.read!int(); | |||
immutable refid = stream.read!int(); | |||
immutable pos = stream.read!int(); | |||
// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness | |||
auto data = new ubyte[block_size-2*int.sizeof]; | |||
peekbuf = ReadBlob(refid,pos,stream.read(data)); | |||
return peekbuf; | |||
} | |||
/// Returns the next matching read. Otherwise null | |||
/// | |||
/// Example: | |||
/// | |||
/// auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped)); | |||
/* | |||
Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) { | |||
while(!empty()) { | |||
auto readbuf = read(); | |||
if (is_match(R(readbuf))) | |||
return readbuf; | |||
else | |||
return Nullable!ReadBlob(); | |||
} | |||
return Nullable!ReadBlob(); | |||
} | |||
*/ | |||
} |
@@ -1,86 +0,0 @@ | |||
/* | |||
New style BAM writer. This file is part of Sambamba. | |||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published | |||
by the Free Software Foundation; either version 2 of the License, | |||
or (at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, but | |||
WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |||
02111-1307 USA | |||
*/ | |||
module bio2.bam.writer; | |||
import std.conv; | |||
import core.stdc.stdio: fopen, fread, fclose; | |||
import std.exception; | |||
import std.file; | |||
import std.stdio; | |||
import std.string; | |||
import std.typecons; | |||
import std.bitmanip; | |||
import bio.bam.cigar; | |||
import bio.bam.constants; | |||
import bio2.bgzf; | |||
import bio2.bgzf_writer; | |||
import bio2.constants; | |||
import bio2.bam.header; | |||
import bio2.bam.reader : ProcessReadBlob, Offset; | |||
struct ModifyProcessReadBlob { // make this generic later | |||
ProcessReadBlob _read2; | |||
@property ubyte[] toBlob() { | |||
return _read2.toBlob(); | |||
} | |||
@property void set_qc_fail() { | |||
auto data = _read2.toBlob; | |||
// writeln(_read2._flag); | |||
// data[Offset.flag_nc] = data[Offset.flag_nc] & 0x200; | |||
// writeln(data[Offset.flag_nc]); | |||
// buf.write!(T,Endian.littleEndian)(value,0); | |||
// ushort _flag() { return fetch!ushort(Offset.flag); } | |||
ushort flag = _read2._flag | 0x200; | |||
// writeln("flag=",flag); | |||
data[Offset.flag..Offset.flag+4].write!(ushort,Endian.littleEndian)(flag,0); | |||
} | |||
} | |||
struct BamWriter { | |||
BgzfWriter bgzf_writer; | |||
this(string fn, ref BamHeader header, int compression_level = -1) { | |||
bgzf_writer = BgzfWriter(fn,compression_level); | |||
write_bam_header(bgzf_writer,header); | |||
} | |||
void push(ModifyProcessReadBlob read) { | |||
auto mod = read; | |||
auto blob = mod.toBlob; | |||
// another hack for now: | |||
bgzf_writer.write!int(cast(int)(blob.length+2*int.sizeof)); | |||
bgzf_writer.write!int(cast(int)mod._read2.raw_ref_id); | |||
bgzf_writer.write!int(cast(int)mod._read2.raw_start_pos); | |||
bgzf_writer.write(blob); | |||
} | |||
void push(ProcessReadBlob read) { | |||
push(ModifyProcessReadBlob(read)); | |||
} | |||
} |
@@ -1,451 +0,0 @@ | |||
/* | |||
This file is part of Sambamba. | |||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published | |||
by the Free Software Foundation; either version 2 of the License, | |||
or (at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, but | |||
WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |||
02111-1307 USA | |||
*/ | |||
module bio2.bgzf; | |||
/** | |||
This is a new version of sambamba bgzf (under development). Bgzf is | |||
a blocked version of the ubiquous gzip format. By making it blocked | |||
it allows for seeking in gz files. Note that without seeking it | |||
can take standard gz files too. | |||
The new version is a prototype for new sambamba architecture using | |||
canonical D language features, including immutable and improved | |||
laziness and a more functional programming style. It should provide | |||
improved performance and minimize RAM use, as well as better | |||
composability. | |||
Authors: Pjotr Prins | |||
*/ | |||
import core.stdc.string : memcpy; | |||
import std.bitmanip; | |||
import std.conv; | |||
import std.exception; | |||
import std.file; | |||
import std.stdio; | |||
import std.typecons; | |||
import std.zlib : calc_crc32 = crc32, ZlibException; | |||
import bio.bam.constants; | |||
import bio.core.bgzf.block; | |||
import bio.core.bgzf.constants; | |||
import bio.core.utils.zlib : inflateInit2, inflate, inflateEnd, Z_OK, Z_FINISH, Z_STREAM_END; | |||
import bio2.constants; | |||
class BgzfException : Exception { | |||
this(string msg) { super(msg); } | |||
} | |||
alias Nullable!ulong FilePos; | |||
alias immutable(uint) CRC32; | |||
alias BGZF_MAX_BLOCK_SIZE BLOCK_SIZE; | |||
alias ubyte[BLOCK_SIZE] BlockBuffer; | |||
@property ubyte read_ubyte(File f) { | |||
ubyte[1] ubyte1; // read buffer | |||
immutable ubyte[1] buf = f.rawRead(ubyte1); | |||
return buf[0]; | |||
} | |||
@property ushort read_ushort(File f) { | |||
ubyte[2] ubyte2; // read buffer | |||
immutable ubyte[2] buf = f.rawRead(ubyte2); | |||
return littleEndianToNative!ushort(buf); | |||
} | |||
@property uint read_uint(File f) { | |||
ubyte[4] ubyte4; // read buffer | |||
immutable ubyte[4] buf = f.rawRead(ubyte4); | |||
return littleEndianToNative!uint(buf); | |||
} | |||
/** | |||
Uncompress a zlib buffer (without header) | |||
*/ | |||
ubyte[] deflate(ubyte[] uncompressed_buf, const ubyte[] compressed_buf, size_t uncompressed_size, CRC32 crc32) { | |||
assert(uncompressed_buf.length == BLOCK_SIZE); | |||
bio.core.utils.zlib.z_stream zs; | |||
zs.next_in = cast(typeof(zs.next_in))compressed_buf; | |||
zs.avail_in = to!uint(compressed_buf.length); | |||
auto err = inflateInit2(&zs, /* winbits = */-15); | |||
if (err != Z_OK) throw new ZlibException(err); | |||
zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr; | |||
zs.avail_out = cast(int)uncompressed_buf.length; | |||
scope(exit) { inflateEnd(&zs); } | |||
err = inflate(&zs, Z_FINISH); | |||
if (err != Z_STREAM_END) throw new ZlibException(err); | |||
assert(zs.total_out == uncompressed_size); | |||
uncompressed_buf.length = uncompressed_size; | |||
assert(crc32 == calc_crc32(0, uncompressed_buf[])); | |||
return uncompressed_buf; | |||
} | |||
/** | |||
BgzfReader is designed to run on a single thread. All it does is | |||
fetch block headers and data, so the thread should easily keep up | |||
with IO. All data processing is happening lazily in other threads. | |||
*/ | |||
struct BgzfReader { | |||
File f; | |||
FilePos report_fpos; // for error handler - assumes one thread! | |||
this(string fn) { | |||
enforce(fn.isFile); | |||
f = File(fn,"r"); | |||
} | |||
@disable this(this); // BgzfReader does not have copy semantics; | |||
void throwBgzfException(string msg, string file = __FILE__, size_t line = __LINE__) { | |||
throw new BgzfException("Error reading BGZF block starting in "~f.name ~" @ " ~ | |||
to!string(report_fpos) ~ " (" ~ file ~ ":" ~ to!string(line) ~ "): " ~ msg); | |||
} | |||
void enforce1(bool check, lazy string msg, string file = __FILE__, int line = __LINE__) { | |||
if (!check) | |||
throwBgzfException(msg,file,line); | |||
} | |||
/** | |||
Reads the block header and returns the contained compressed data | |||
size with the file pointer positioned at the associated | |||
compressed data. | |||
*/ | |||
size_t read_block_header() { | |||
ubyte[4] ubyte4; | |||
auto magic = f.rawRead(ubyte4); | |||
enforce1(magic.length == 4, "Premature end of file"); | |||
enforce1(magic[0..4] == BGZF_MAGIC,"Invalid file format: expected bgzf magic number"); | |||
ubyte[uint.sizeof + 2 * ubyte.sizeof] skip; | |||
f.rawRead(skip); // skip gzip info | |||
ushort gzip_extra_length = f.read_ushort(); | |||
immutable fpos1 = f.tell; | |||
size_t bsize = 0; | |||
while (f.tell < fpos1 + gzip_extra_length) { | |||
immutable subfield_id1 = f.read_ubyte(); | |||
immutable subfield_id2 = f.read_ubyte(); | |||
immutable subfield_len = f.read_ushort(); | |||
if (subfield_id1 == BAM_SI1 && subfield_id2 == BAM_SI2) { | |||
// BC identifier | |||
enforce(gzip_extra_length == 6); | |||
// FIXME: always picks first BC block | |||
bsize = 1+f.read_ushort(); // BLOCK size | |||
enforce1(subfield_len == 2, "BC subfield len should be 2"); | |||
break; | |||
} | |||
else { | |||
f.seek(subfield_len,SEEK_CUR); | |||
} | |||
enforce1(bsize!=0,"block size not found"); | |||
f.seek(fpos1+gzip_extra_length); // skip any extra subfields - note we don't check for second BC | |||
} | |||
immutable compressed_size = bsize - 1 - gzip_extra_length - 19; | |||
enforce1(compressed_size <= BLOCK_SIZE, "compressed size larger than allowed"); | |||
// stderr.writeln("[compressed] size ", compressed_size, " bytes starting block @ ", report_fpos); | |||
return compressed_size; | |||
} | |||
/** | |||
Fetch the compressed data part of the block and return it with | |||
the uncompressed size and CRC32. The file pointer is assumed to | |||
be at the start of the compressed data and will be at the end of | |||
that section after. | |||
*/ | |||
Tuple!(ubyte[],immutable(uint),CRC32) read_compressed_data(ubyte[] buffer) { | |||
auto compressed_buf = f.rawRead(buffer); | |||
immutable CRC32 crc32 = f.read_uint(); | |||
immutable uncompressed_size = f.read_uint(); | |||
// stderr.writeln("[uncompressed] size ",uncompressed_size); | |||
return tuple(compressed_buf,uncompressed_size,crc32); | |||
} | |||
/** | |||
* Returns new tuple of the new file position, the compressed buffer and | |||
* the CRC32 o the uncompressed data. file pos is NULL when done | |||
*/ | |||
Tuple!(FilePos,ubyte[],size_t,CRC32) read_compressed_block(FilePos fpos, ubyte[] buffer) { | |||
immutable start_offset = fpos; | |||
try { | |||
if (fpos.isNull) throwBgzfException("Trying to read past eof"); | |||
report_fpos = fpos; | |||
f.seek(fpos); | |||
immutable compressed_size = read_block_header(); | |||
auto ret = read_compressed_data(buffer[0..compressed_size]); | |||
auto compressed_buf = ret[0]; | |||
immutable uncompressed_size = ret[1]; | |||
immutable crc32 = ret[2]; | |||
if (uncompressed_size == 0) { | |||
// check for eof marker, rereading block header | |||
auto lastpos = f.tell(); | |||
f.seek(start_offset); | |||
ubyte[BGZF_EOF.length] buf; | |||
f.rawRead(buf); | |||
f.seek(lastpos); | |||
if (buf == BGZF_EOF) | |||
return tuple(FilePos(),compressed_buf,cast(ulong)0,crc32); // sets fpos to null | |||
} | |||
return tuple(FilePos(f.tell()),compressed_buf,cast(size_t)uncompressed_size,crc32); | |||
} catch (Exception e) { throwBgzfException(e.msg,e.file,e.line); } | |||
assert(0); // never reached | |||
} | |||
} | |||
/** | |||
Simple block iterator | |||
*/ | |||
struct BgzfBlocks { | |||
BgzfReader bgzf; | |||
this(string fn) { | |||
bgzf = BgzfReader(fn); | |||
} | |||
@disable this(this); // disable copy semantics; | |||
int opApply(scope int delegate(ubyte[]) dg) { | |||
FilePos fpos = 0; | |||
try { | |||
while (!fpos.isNull) { | |||
BlockBuffer stack_buffer; | |||
auto res = bgzf.read_compressed_block(fpos,stack_buffer); | |||
fpos = res[0]; // point fpos to next block | |||
if (fpos.isNull) break; | |||
auto compressed_buf = res[1]; // same as stack_buffer | |||
auto uncompressed_size = res[2]; | |||
auto crc32 = res[3]; | |||
BlockBuffer uncompressed_buf; | |||
// call delegated function with new block | |||
dg(deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32)); | |||
} | |||
} catch (Exception e) { bgzf.throwBgzfException(e.msg,e.file,e.line); } | |||
return 0; | |||
} | |||
} | |||
Tuple!(size_t,FilePos) read_blockx(ref BgzfReader bgzf, FilePos fpos, ref ubyte[] uncompressed_buf) { | |||
BlockBuffer compressed_buf; | |||
auto res = bgzf.read_compressed_block(fpos,compressed_buf); | |||
fpos = res[0]; // point fpos to next block | |||
if (fpos.isNull) return tuple(cast(size_t)0,fpos); | |||
auto data = res[1]; | |||
assert(data.ptr == compressed_buf.ptr); | |||
size_t uncompressed_size = res[2]; | |||
auto crc32 = res[3]; | |||
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32); | |||
return tuple(uncompressed_size,fpos); | |||
} | |||
import std.parallelism; | |||
int kick_off_reading_block_ahead(ubyte[] uncompressed_buf, ubyte[] compressed_buf, size_t uncompressed_size, CRC32 crc32) { | |||
// writeln("HEY " ~ to!string(uncompressed_size)); | |||
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32); | |||
return -1; | |||
} | |||
/** | |||
*/ | |||
struct BlockReadAhead { | |||
bool task_running = false, we_have_a_task = false; | |||
Task!(kick_off_reading_block_ahead, ubyte[], ubyte[], size_t, CRC32)* t; | |||
FilePos fpos2; | |||
size_t uncompressed_size2 = 0; | |||
BlockBuffer compressed_buf2; | |||
BlockBuffer uncompressed_buf2; | |||
private void read_next_block() { | |||
} | |||
private void add_deflate_task() { | |||
} | |||
private void copy_deflated_buffer() { | |||
} | |||
void setup_block_reader() { | |||
read_next_block(); | |||
add_deflate_task(); | |||
throw new Exception("NYI"); | |||
} | |||
Tuple!(size_t,FilePos) read_block(ref BgzfReader bgzf, FilePos fpos, ref ubyte[] uncompressed_buf) { | |||
assert(we_have_a_task); | |||
copy_deflated_buffer(); | |||
read_next_block(); | |||
add_deflate_task(); | |||
// return | |||
if (task_running) { | |||
int res = t.yieldForce; | |||
// writeln(res); | |||
task_running = false; | |||
memcpy(uncompressed_buf.ptr,compressed_buf2.ptr,uncompressed_size2); | |||
return tuple(uncompressed_size2, fpos2); | |||
} | |||
else { | |||
BlockBuffer compressed_buf; | |||
auto res = bgzf.read_compressed_block(fpos,compressed_buf); | |||
fpos = res[0]; // point fpos to next block | |||
if (fpos.isNull) return tuple(cast(size_t)0,fpos); | |||
auto data = res[1]; | |||
assert(data.ptr == compressed_buf.ptr); | |||
size_t uncompressed_size = res[2]; | |||
auto crc32 = res[3]; | |||
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32); | |||
// now set up a new buffer | |||
auto res2 = bgzf.read_compressed_block(fpos,compressed_buf2); | |||
fpos2 = res[0]; // point fpos to next block | |||
if (!fpos2.isNull) { | |||
auto data2 = res2[1]; | |||
uncompressed_size2 = res2[2]; | |||
t = task!kick_off_reading_block_ahead(cast(ubyte[])uncompressed_buf2,cast(ubyte[])compressed_buf2,uncompressed_size2,res2[3]); | |||
t.executeInNewThread(); | |||
task_running = true; | |||
} | |||
return tuple(uncompressed_size,fpos); | |||
} | |||
} | |||
} | |||
/** | |||
*/ | |||
struct BlockReadUnbuffered { | |||
void setup_block_reader() { | |||
} | |||
Tuple!(ubyte[], size_t, FilePos) read_block(ref BgzfReader bgzf, in FilePos fpos, ubyte[] uncompressed_buf) { | |||
BlockBuffer compressed_buf; | |||
auto res = bgzf.read_compressed_block(fpos,compressed_buf); | |||
auto fpos2 = res[0]; // point fpos to next block | |||
if (fpos.isNull) return tuple(uncompressed_buf,cast(size_t)0,fpos2); | |||
auto data = res[1]; | |||
assert(data.ptr == compressed_buf.ptr); | |||
size_t uncompressed_size = res[2]; | |||
auto crc32 = res[3]; | |||
auto buf = deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32); | |||
assert(buf.ptr == uncompressed_buf.ptr); | |||
return tuple(uncompressed_buf,uncompressed_size,fpos2); | |||
} | |||
} | |||
/** | |||
Streams bgzf data and fetch items by unit or buffer. These can go beyond | |||
the size of individual blocks(!) | |||
*/ | |||
struct BgzfStream { | |||
BgzfReader bgzf; | |||
FilePos fpos; // track file position | |||
ubyte[] uncompressed_buf; // current data buffer | |||
size_t uncompressed_size; // current data buffer size | |||
Nullable!int block_pos; // position in block | |||
BlockReadUnbuffered blockread; | |||
this(string fn) { | |||
bgzf = BgzfReader(fn); | |||
uncompressed_buf = new ubyte[BLOCK_SIZE]; | |||
fpos = 0; | |||
} | |||
@disable this(this); // disable copy semantics; | |||
@property bool eof() { | |||
return fpos.isNull; | |||
} | |||
/** | |||
Fetch data into buffer. The size of the buffer can be larger than | |||
one or more multiple blocks | |||
*/ | |||
ubyte[] fetch(ubyte[] buffer) { | |||
if (block_pos.isNull) { | |||
blockread.setup_block_reader(); | |||
auto res = blockread.read_block(bgzf,fpos,uncompressed_buf); // read first block | |||
assert(res[0].ptr == uncompressed_buf.ptr); | |||
uncompressed_size = res[1]; | |||
fpos = res[2]; | |||
block_pos = 0; | |||
} | |||
immutable buffer_length = buffer.length; | |||
size_t buffer_pos = 0; | |||
size_t remaining = buffer_length; | |||
while (remaining > 0) { | |||
if (block_pos + remaining < uncompressed_size) { | |||
// full copy | |||
assert(buffer_pos + remaining == buffer_length); | |||
memcpy(buffer[buffer_pos..buffer_pos+remaining].ptr,uncompressed_buf[block_pos..block_pos+remaining].ptr,remaining); | |||
block_pos += remaining; | |||
remaining = 0; | |||
} | |||
else { | |||
// read tail of buffer | |||
immutable tail = uncompressed_size - block_pos; | |||
memcpy(buffer[buffer_pos..buffer_pos+tail].ptr,uncompressed_buf[block_pos..uncompressed_size].ptr,tail); | |||
buffer_pos += tail; | |||
remaining -= tail; | |||
auto res = blockread.read_block(bgzf,fpos,uncompressed_buf); | |||
assert(res[0].ptr == uncompressed_buf.ptr); | |||
uncompressed_size = res[1]; | |||
fpos = res[2]; | |||
block_pos = 0; | |||
} | |||
} | |||
return buffer; | |||
} | |||
int read(T)() { // for integers | |||
ubyte[T.sizeof] buf; | |||
auto b = fetch(buf); | |||
return b.read!(T,Endian.littleEndian)(); | |||
} | |||
string read(T)(size_t len) { | |||
ubyte[] buf = new ubyte[len]; // heap allocation | |||
fetch(buf); | |||
return cast(T)buf; | |||
} | |||
T[] read(T)(T[] buffer) { return cast(T[])fetch(cast(ubyte[])buffer); }; | |||
} |
@@ -1,243 +0,0 @@ | |||
/* | |||
New style BGZF writer. This file is part of Sambamba. | |||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published | |||
by the Free Software Foundation; either version 2 of the License, | |||
or (at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, but | |||
WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |||
02111-1307 USA | |||
*/ | |||
// Based on the original by Artem Tarasov. | |||
module bio2.bgzf_writer; | |||
// import core.stdc.stdlib : malloc, free; | |||
import core.memory: pureMalloc, pureFree; | |||
import core.stdc.stdio: fopen, fread, fclose; | |||
import std.bitmanip; | |||
import std.conv; | |||
import std.exception; | |||
import std.typecons; | |||
import std.parallelism; | |||
import std.array; | |||
import std.algorithm : max; | |||
import std.stdio; | |||
import std.typecons; | |||
import bio.core.bgzf.compress; | |||
import bio.core.utils.roundbuf; | |||
// import undead.stream; | |||
import bio.bam.constants: BGZF_MAX_BLOCK_SIZE, BGZF_BLOCK_SIZE, BGZF_EOF; | |||
import bio2.bgzf; | |||
import bio2.constants; | |||
alias void delegate(ubyte[], ubyte[]) BlockWriteHandler; | |||
/// Convenience function for Taskpool handler | |||
Tuple!(ubyte[], ubyte[], BlockWriteHandler) bgzfCompressFunc(ubyte[] input, | |||
int level, | |||
ubyte[] output_buffer, | |||
BlockWriteHandler handler) | |||
{ | |||
auto output = bgzfCompress(input, level, output_buffer); | |||
return tuple(input, output, handler); | |||
} | |||
/// BGZF compression - this is a port of the original that used the | |||
/// undead.stream library. | |||
struct BgzfWriter { | |||
private: | |||
File f; | |||
TaskPool task_pool; | |||
ubyte[] buffer; // a slice into compression_buf (uncompressed data) | |||
ubyte[] tmp; // a slice into compression_buf (compressed data) | |||
size_t current_size; | |||
int compression_level; | |||
alias Task!(bgzfCompressFunc, | |||
ubyte[], int, ubyte[], BlockWriteHandler) CompressionTask; | |||
RoundBuf!(CompressionTask*) _compression_tasks; | |||
ubyte[] compression_buf; | |||
public: | |||
/// Create new BGZF output stream with a multi-threaded writer | |||
this(string fn, int _compression_level=-1) { | |||
f = File(fn,"wb"); | |||
enforce1(-1 <= compression_level && compression_level <= 9, | |||
"BGZF compression level must be a number in interval [-1, 9]"); | |||
size_t max_block_size = BGZF_MAX_BLOCK_SIZE; | |||
size_t block_size = BGZF_BLOCK_SIZE; | |||
task_pool = taskPool(), | |||
compression_level = _compression_level; | |||
// create a ring buffer that is large enough | |||
size_t n_tasks = max(task_pool.size, 1) * 16; | |||
_compression_tasks = RoundBuf!(CompressionTask*)(n_tasks); | |||
// create extra block to which we can write while n_tasks are | |||
// executed | |||
auto comp_buf_size = (2 * n_tasks + 2) * max_block_size; | |||
auto p = cast(ubyte*)pureMalloc(comp_buf_size); | |||
compression_buf = p[0 .. comp_buf_size]; | |||
buffer = compression_buf[0 .. block_size]; | |||
tmp = compression_buf[max_block_size .. max_block_size * 2]; | |||
} | |||
~this() { | |||
close(); | |||
} | |||
@disable this(this); // BgzfWriter does not have copy semantics; | |||
void throwBgzfException(string msg, string file = __FILE__, size_t line = __LINE__) { | |||
throw new BgzfException("Error writing BGZF block starting in "~f.name ~ | |||
" (" ~ file ~ ":" ~ to!string(line) ~ "): " ~ msg); | |||
} | |||
void enforce1(bool check, lazy string msg, string file = __FILE__, int line = __LINE__) { | |||
if (!check) | |||
throwBgzfException(msg,file,line); | |||
} | |||
void write(const void* buf, size_t size) { | |||
// stderr.writeln("HEY1 writing bytes ",size); | |||
if (size + current_size >= buffer.length) { | |||
size_t room; | |||
ubyte[] data = (cast(ubyte*)buf)[0 .. size]; | |||
while (data.length + current_size >= buffer.length) { | |||
room = buffer.length - current_size; | |||
buffer[$ - room .. $] = data[0 .. room]; | |||
data = data[room .. $]; | |||
current_size = buffer.length; | |||
flush_block(); | |||
} | |||
buffer[0 .. data.length] = data[]; | |||
current_size = data.length; | |||
} else { | |||
buffer[current_size .. current_size + size] = (cast(ubyte*)buf)[0 .. size]; | |||
current_size += size; | |||
} | |||
// return size; | |||
} | |||
void write(ubyte[] buf) { | |||
write(buf.ptr, buf.length); | |||
} | |||
void write(string s) { | |||
write(cast(ubyte[])s); | |||
} | |||
void write(T)(T value) { // int values | |||
// ubyte[T.sizeof] buf; | |||
ubyte[] buf = [0,0,0,0,0,0,0,0,0,0]; | |||
assert(T.sizeof < buf.length); | |||
buf.write!(T,Endian.littleEndian)(value,0); | |||
// writeln("HEY T.sizeof: ",T.sizeof," value ",value," ",buf[0..T.sizeof]); | |||
write(buf[0..T.sizeof]); | |||
} | |||
/// Force flushing current block, even if it is not yet filled. | |||
/// Should also be used when it's not desired to have records | |||
/// crossing block borders. | |||
void flush_block() { | |||
if (current_size == 0) | |||
return; | |||
Tuple!(ubyte[], ubyte[], BlockWriteHandler) front_result; | |||
if (_compression_tasks.full) { | |||
front_result = _compression_tasks.front.yieldForce(); | |||
_compression_tasks.popFront(); | |||
} | |||
auto compression_task = task!bgzfCompressFunc(buffer[0 .. current_size], | |||
compression_level, tmp, | |||
_before_write); | |||
_compression_tasks.put(compression_task); | |||
task_pool.put(compression_task); | |||
size_t offset = buffer.ptr - compression_buf.ptr; | |||
immutable N = tmp.length; | |||
offset += 2 * N; | |||
if (offset == compression_buf.length) | |||
offset = 0; | |||
buffer = compression_buf[offset .. offset + buffer.length]; | |||
tmp = compression_buf[offset + N .. offset + 2 * N]; | |||
current_size = 0; | |||
if (front_result[0] !is null) | |||
writeResult(front_result); | |||
while (!_compression_tasks.empty) { | |||
auto task = _compression_tasks.front; | |||
if (!task.done()) | |||
break; | |||
auto result = task.yieldForce(); | |||
writeResult(result); | |||
_compression_tasks.popFront(); | |||
} | |||
} | |||
private void delegate(ubyte[], ubyte[]) _before_write; | |||
void setWriteHandler(void delegate(ubyte[], ubyte[]) handler) { | |||
_before_write = handler; | |||
} | |||
private void writeResult(Tuple!(ubyte[], ubyte[], BlockWriteHandler) block) { | |||
auto uncompressed = block[0]; | |||
auto compressed = block[1]; | |||
auto handler = block[2]; | |||
if (handler) {// write handler enabled | |||
handler(uncompressed, compressed); | |||
} | |||
// _stream.writeExact(compressed.ptr, compressed.length); | |||
f.rawWrite(compressed); | |||
} | |||
/// Flush all remaining BGZF blocks and underlying stream. | |||
void flush() { | |||
flush_block(); | |||
while (!_compression_tasks.empty) { | |||
auto task = _compression_tasks.front; | |||
auto block = task.yieldForce(); | |||
writeResult(block); | |||
_compression_tasks.popFront(); | |||
} | |||
f.flush(); | |||
current_size = 0; | |||
} | |||
/// Flush all remaining BGZF blocks and close source stream. | |||
/// Automatically adds empty block at the end, serving as indicator | |||
/// of end of stream. This function is automatically called on | |||
/// destruction. | |||
void close() { | |||
flush(); | |||
f.rawWrite(BGZF_EOF); | |||
f.close(); | |||
pureFree(compression_buf.ptr); | |||
} | |||
} |
@@ -1,11 +0,0 @@ | |||
module bio2.constants; | |||
alias ulong size_d; | |||
alias int RefId; // -1 is invalid FIXME | |||
alias uint GenomePos; // 32-bits, do check when reading! | |||
alias ubyte MappingQuality; // 255 is invalid FIXME | |||
struct GenomeLocation { | |||
RefId ref_id; | |||
GenomePos pos; | |||
} |
@@ -1,99 +0,0 @@ | |||
/* | |||
This file is part of Sambamba. | |||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published | |||
by the Free Software Foundation; either version 2 of the License, | |||
or (at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, but | |||
WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |||
02111-1307 USA | |||
*/ | |||
module bio2.hashing; | |||
import std.conv; | |||
import std.stdio; | |||
pragma(inline): | |||
pure nothrow @nogc ushort get16bits(char *p) | |||
{ | |||
ushort p0 = p[0]; | |||
ushort p1 = p[1]; | |||
// return p[1]*256+p[0]; | |||
return cast(ushort)(p1*256+p0); | |||
} | |||
/// Paul Hsieh's fast hash (LGPL license), see http://www.azillionmonkeys.com/qed/hash.html | |||
pure nothrow @nogc uint SuperFastHash(string str, uint hashinc = 0) { | |||
auto data = cast(char*)str.ptr; | |||
auto len = cast(uint)str.length; | |||
uint hash = (hashinc > 0 ? hashinc : 0); | |||
if (len == 0) return 0; | |||
int rem = len & 3; | |||
len >>= 2; | |||
/* Main loop */ | |||
for (;len > 0; len--) { | |||
hash += get16bits(data); | |||
uint tmp = (get16bits(data+2) << 11) ^ hash; | |||
hash = (hash << 16) ^ tmp; | |||
data += 2*ushort.sizeof; | |||
hash += hash >> 11; | |||
} | |||
/* Handle end cases */ | |||
switch (rem) { | |||
case 3: hash += get16bits(data); | |||
hash ^= hash << 16; | |||
hash ^= (cast(ushort)data[ushort.sizeof]) << 18; | |||
hash += hash >> 11; | |||
break; | |||
case 2: hash += get16bits(data); | |||
hash ^= hash << 11; | |||
hash += hash >> 17; | |||
break; | |||
case 1: hash += cast(ushort)*data; | |||
hash ^= hash << 10; | |||
hash += hash >> 1; | |||
break; | |||
default: | |||
break; | |||
} | |||
/* Force "avalanching" of final 127 bits */ | |||
hash ^= hash << 3; | |||
hash += hash >> 5; | |||
hash ^= hash << 4; | |||
hash += hash >> 17; | |||
hash ^= hash << 25; | |||
hash += hash >> 6; | |||
return hash; | |||
} | |||
unittest { | |||
// Make sure it behaves the same as the original | |||
auto test = get16bits(cast(char *)("xy".ptr)); | |||
assert(test == 31096, to!string(test)); | |||
auto test2 = get16bits(cast(char *)("xyz".ptr)); | |||
assert(test2 == 31096, to!string(test2)); | |||
assert(get16bits(cast(char *)"xyz".ptr) == 31096); | |||
assert(SuperFastHash("*")==1029965590,to!string(SuperFastHash("*"))); | |||
assert(SuperFastHash("hst")==1867544282,to!string(SuperFastHash("hst"))); | |||
assert(SuperFastHash("hstiaashccaht")==2173265029,to!string(SuperFastHash("hstiaashccaht"))); | |||
assert(SuperFastHash("Pjotr")==2808102289,to!string(SuperFastHash("Pjotr"))); | |||
} |
@@ -1 +0,0 @@ | |||
module bio2.logger; |
@@ -1,341 +0,0 @@ | |||
/* | |||
This file is part of Sambamba. | |||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published | |||
by the Free Software Foundation; either version 2 of the License, | |||
or (at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, but | |||
WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |||
02111-1307 USA | |||
*/ | |||
module bio2.pileup; | |||
import std.conv; | |||
import std.exception; | |||
import std.stdio; | |||
import std.traits; | |||
import std.typecons; | |||
import std.experimental.logger; | |||
import bio2.constants; | |||
import bio.core.utils.exception; | |||
immutable ulong DEFAULT_BUFFER_SIZE = 1_000_000; | |||
/** | |||
Cyclic buffer or ringbuffer based on Artem's original. Uses copy | |||
semantics to copy a read in a pre-allocated buffer. New items get | |||
added to the tail, and used items get popped from the head | |||
(FIFO). Basically it is empty when pointers align and full when | |||
head - tail equals length. Not that the pointers are of size_t | |||
which puts a theoretical limit on the number of items that can be | |||
pushed. | |||
*/ | |||
import core.stdc.string : memcpy; | |||
// alias ulong RingBufferIndex; | |||
struct RingBufferIndex { | |||
alias Representation = ulong; | |||
private ulong value = 0; | |||
this(ulong v) { | |||
value = v; | |||
} | |||
// @disable this(this); // disable copy semantics; | |||
auto get() inout { | |||
return value; | |||
} | |||
auto max() @property { | |||
return value.max; | |||
} | |||
void opAssign(U)(U rhs) if (is(typeof(Checked!(T, Hook)(rhs)))) { | |||
value = rhs; | |||
} | |||
bool opEquals(U, this _)(U rhs) { | |||
return value == rhs; | |||
} | |||
auto opCmp(U, this _)(const U rhs) { | |||
return value < rhs ? -1 : value > rhs; | |||
} | |||
ulong opUnary(string s)() if (s == "++") { | |||
return ++value; | |||
} | |||
} | |||
struct RingBuffer(T) { | |||
T[] _items; | |||
RingBufferIndex _head; | |||
RingBufferIndex _tail; | |||
size_t max_size = 0; | |||
/** initializes round buffer of size $(D n) */ | |||
this(size_t n) { | |||
_items = new T[n]; | |||
// _items.reserve(n); | |||
} | |||
@disable this(this); // disable copy semantics; | |||
/* | |||
Does not work because data is no longer available! | |||
~this() { | |||
// assert(is_empty); // make sure all items have been popped | |||
} | |||
*/ | |||
bool empty() @property @nogc nothrow const { | |||
return _tail == _head; | |||
} | |||
alias empty is_empty; | |||
auto ref front() @property { | |||
enforce(!is_empty, "ringbuffer is empty"); | |||
return _items[_head.get() % $]; | |||
} | |||
alias back last; | |||
auto ref back() @property { | |||
enforce(!is_empty, "ringbuffer is empty"); | |||
return _items[(_tail.get() - 1) % $]; | |||
} | |||
alias front first; | |||
bool is_tail(RingBufferIndex idx) { | |||
return idx == _tail.get()-1; | |||
} | |||
ref T get_at(RingBufferIndex idx) { | |||
enforce(!is_empty, "ringbuffer is empty"); | |||
enforce(idx >= _head, "ringbuffer range error (idx before front)"); | |||
enforce(idx != _tail, "ringbuffer range error (idx at end)"); | |||
enforce(idx < _tail, "ringbuffer range error (idx after end)"); | |||
return _items[idx.get() % $]; | |||
} | |||
bool is_valid(RingBufferIndex idx) { | |||
enforce(!is_empty, "ringbuffer is empty"); | |||
enforce(idx >= _head, "ringbuffer range error (idx before front)"); | |||
enforce(idx != _tail, "ringbuffer range error (idx at end)"); | |||
enforce(idx < _tail, "ringbuffer range error (idx after end)"); | |||
return true; | |||
} | |||
// This function is a hack. | |||
void update_at(RingBufferIndex idx, T item) { | |||
is_valid(idx); | |||
_items[idx.get() % $] = item; // uses copy semantics | |||
} | |||
RingBufferIndex popFront() { | |||
enforce(!is_empty, "ringbuffer is empty"); | |||
static if (__traits(compiles, _items[0].cleanupx)) { | |||
// write("x"); | |||
_items[_head.get() % $].cleanup(); | |||
} | |||
++_head.value; | |||
return _head; | |||
} | |||
/// Puts item on the stack and returns the index | |||
RingBufferIndex put(T item) { | |||
enforce(!is_full, "ringbuffer is full - you need to expand buffer"); | |||
enforce(_tail < _tail.max, "ringbuffer overflow"); | |||
max_size = length > max_size ? length : max_size; | |||
_items[_tail.get() % $] = item; // uses copy semantics | |||
auto prev = _tail; | |||
++_tail.value; | |||
return prev; | |||
} | |||
ulong length() @property const { | |||
// writeln(_tail.get(),":",_head.get(),"= len ",_tail.get()-_head.get()); | |||
return _tail.get() - _head.get(); | |||
} | |||
bool is_full() @property const { | |||
return _items.length == length(); | |||
} | |||
bool in_range(RingBufferIndex idx) @property const { | |||
return idx >= _head && idx < _tail; | |||
} | |||
ulong pushed() @property const { | |||
return _tail.value; | |||
} | |||
ulong popped() @property const { | |||
return _head.value; | |||
} | |||
@property void cleanup() { | |||
_head = RingBufferIndex(); | |||
_tail = RingBufferIndex(); | |||
} | |||
string toString() { | |||
string res = "ring "; | |||
for(RingBufferIndex i = _head; i<_tail; i++) | |||
res ~= to!string(get_at(i)); | |||
return res; | |||
} | |||
@property string stats() { | |||
return "Ringbuffer pushed " ~ to!string(pushed) ~ " popped " ~ to!string(popped) ~ " max-size " ~ | |||
to!string(max_size) // , "/", (pileup.ring.max_size+1)/pileup.ring.length); | |||
; | |||
} | |||
} | |||
unittest { | |||
auto buf = RingBuffer!int(4); | |||
assert(buf.is_empty); | |||
buf.put(1); | |||
buf.put(2); | |||
assert(buf.length == 2); | |||
assert(buf.front == 1); | |||
buf.popFront(); // 1 | |||
buf.popFront(); // 2 | |||
buf.put(2); | |||
buf.put(1); | |||
buf.put(0); | |||
buf.put(3); | |||
assert(buf.is_full); | |||
assert(buf.front == 2); | |||
buf.popFront(); | |||
assert(buf.front == 1); | |||
buf.put(4); | |||
buf.popFront(); | |||
assert(buf.front == 0); | |||
buf.popFront(); | |||
assert(buf.front == 3); | |||
buf.popFront(); | |||
assert(buf.front == 4); | |||
buf.popFront(); | |||
assert(buf.is_empty); | |||
} | |||
/** | |||
Represent a pileup of reads in a buffer. | |||
*/ | |||
/* | |||
Read RingBuffer with current pointer, so you have three states | |||
(first, current, last). | |||
*/ | |||
class PileUp(R) { | |||
RingBuffer!R ring; | |||
Nullable!RingBufferIndex current; | |||
this(ulong bufsize=DEFAULT_BUFFER_SIZE) { | |||
ring = RingBuffer!R(bufsize); | |||
set_current_to_head; | |||
} | |||
RingBufferIndex push(R r) { return ring.put(r); } | |||
bool empty() @property const { return ring.is_empty();} | |||
bool is_full() @property const { return ring.is_full();} | |||
RingBufferIndex popFront() { return ring.popFront(); } | |||
ref R front() { return ring.front(); } | |||
alias front leftmost; | |||
ref R rightmost() { return ring.back(); } | |||
ref R read(RingBufferIndex idx) { | |||
enforce(ring.in_range(idx), "idx should be set for PileUp.read"); | |||
return ring.get_at(idx); | |||
} | |||
ref R read_current() { | |||
enforce(!current.isNull, "current should be set for PileUp.read_current"); | |||
return read(current); | |||
} | |||
bool is_at_end(RingBufferIndex idx) { return ring.is_tail(idx); } | |||
@property void current_inc() { | |||
asserte(!empty); | |||
asserte(!ring.is_tail(current)); | |||
++current; | |||
} | |||
@property void set_current_to_head() { | |||
current = ring._head; // note pileup can be empty | |||
} | |||
void current_reset() { | |||
current = RingBufferIndex(); | |||
} | |||
@property bool current_is_tail() { | |||
return ring.is_tail(current); | |||
} | |||
void each(void delegate(R) dg) { | |||
auto idx = ring._head; | |||
while(!ring.is_tail(idx)) { | |||
auto r = read(idx); | |||
dg(r); | |||
idx++; | |||
} | |||
} | |||
void each_left_of_current(void delegate(RingBufferIndex, R) dg) { | |||
R cur = read_current; | |||
if (cur.is_unmapped) return; | |||
auto idx = ring._head; | |||
while(!ring.is_tail(idx)) { | |||
auto r = read(idx); | |||
if (r.is_mapped && r.end_pos >= cur.start_pos) | |||
return; | |||
dg(idx,r); | |||
idx++; | |||
} | |||
} | |||
void purge_while(bool delegate(R) dg) { | |||
while(!empty) { | |||
if (!dg(front)) | |||
return; // skip the rest | |||
popFront(); | |||
} | |||
} | |||
void purge(void delegate(R) dg) { | |||
while(!empty) { | |||
dg(front); | |||
popFront(); | |||
} | |||
set_current_to_head(); | |||
} | |||
@property string stats() { | |||
return ring.stats(); | |||
} | |||
override string toString() { | |||
return stats; | |||
} | |||
} |
@@ -1,46 +0,0 @@ | |||
/* | |||
This file is part of Sambamba. | |||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published | |||
by the Free Software Foundation; either version 2 of the License, | |||
or (at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, but | |||
WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |||
02111-1307 USA | |||
*/ | |||
module bio2.reads; | |||
import bio2.constants; | |||
import bio.core.utils.exception; | |||
import std.stdio; | |||
bool read_overlaps(R)(GenomeLocation loc, R r) { | |||
assert(r.is_mapped); | |||
return r.ref_id == loc.ref_id && loc.pos >= r.start_pos && loc.pos <= r.end_pos; | |||
} | |||
bool reads_overlap(R)(R r1, R r2) { | |||
asserte(r1.is_mapped); | |||
asserte(r2.is_mapped); | |||
asserte(r2.ref_id == r1.ref_id); | |||
// r1 rrrrrrrrrrr | |||
// r2 ---------??????????? | |||
if (r2.start_pos < r1.start_pos && r2.end_pos >= r1.start_pos) { | |||
return true; | |||
} | |||
// r1 rrrrrrrrrrr | |||
// r2 ----????? | |||
return r2.start_pos >= r1.start_pos && r2.start_pos <= r1.end_pos; | |||
} |
@@ -1,41 +0,0 @@ | |||
/* | |||
This file is part of Sambamba. | |||
Copyright © 2012-2017 Pjotr Prins <pjotr.prins@thebird.nl> | |||
Sambamba is free software; you can redistribute it and/or modify | |||
it under the terms of the GNU General Public License as published by | |||
the Free Software Foundation; either version 2 of the License, or | |||
(at your option) any later version. | |||
Sambamba is distributed in the hope that it will be useful, | |||
but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
GNU General Public License for more details. | |||
You should have received a copy of the GNU General Public License | |||
along with this program; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |||
*/ | |||
/** | |||
Module for streaming unpacked (BAM) blocks | |||
*/ | |||
module bio2.unpack; | |||
import std.stdio; | |||
import bio2.bgzf; | |||
int unpack_bams(string[] infns, File outfile) { | |||
foreach (string fn; infns) { | |||
foreach (ubyte[] read; BgzfBlocks(fn)) { | |||
outfile.rawWrite(read); // raw unpacking | |||
} | |||
} | |||
return 0; | |||
} |
@@ -1,16 +0,0 @@ | |||
{ | |||
"name": "biod", | |||
"authors": [ | |||
"Artem Tarasov", | |||
"Pjotr Prins" | |||
], | |||
"description": "Bioinformatics library in D (utils for working with SAM, BAM, SFF formats)", | |||
"copyright": "Copyright © 2016, BioD developers", | |||
"license": "MIT", | |||
"sourcePaths": ["bio"], | |||
"importPaths": ["bio"], | |||
"dependencies": { | |||
"undead": "~>1.0.6" | |||
}, | |||
"buildRequirements": ["allowWarnings"] | |||
} |
@@ -1,51 +0,0 @@ | |||
// BioD depends on stream.d which is no longer included with phobos. | |||
// To run this example from this directory, | |||
// Clone the undead repository with | |||
// git clone https://github.com:dlang/undeaD.git at the appropriate location and ensure | |||
// it is available on your path | |||
// Run example: rdmd -I.. -I../location_of_undead/src calculate_gc_content_from_reference.d | |||
import bio.bam.reader; | |||
import bio.bam.md.reconstruct : dna; | |||
import std.stdio; | |||
import std.datetime.stopwatch: benchmark, StopWatch; | |||
import std.range; | |||
import std.array; | |||
void main() { | |||
auto bam = new BamReader("../test/data/b7_295_chunk.bam"); | |||
// the sequence starts at first mapped base of first read | |||
auto reference = dna(bam.reads); | |||
int n_bases = 0, gc = 0; | |||
foreach (base; reference) { | |||
if (base == 'N') continue; // happens if coverage is zero | |||
if (base == 'G' || base == 'C') gc += 1; | |||
n_bases += 1; | |||
} | |||
writeln("total bases: ", n_bases); | |||
writeln(" GC%: ", cast(float)gc / n_bases); | |||
writeln(" sequence: ", reference); | |||
writeln(" #reads: ", walkLength(bam.reads)); | |||
auto reads = array(bam.reads); // no I/O during measurements | |||
StopWatch sw; // for a range of reads, minimum number of MD tags is parsed | |||
sw.start(); | |||
walkLength(dna(reads)); | |||
sw.stop(); | |||
writeln("extracting reference from range of reads: ", sw.peek.total!"usecs", "μs"); | |||
sw.reset(); | |||
sw.start(); | |||
foreach (read; reads) walkLength(dna(read)); | |||
sw.stop(); | |||