Browse Source

Introducing my recent D-style rewrite of bamreader. This version is still

single threaded (there is room for pre-fetching blocks), but proves very fast
and lazy.

The original working commits can be found on this branch

  https://github.com/pjotrp/sambamba/tree/markdup2
bio2
Pjotr Prins 4 years ago
parent
commit
f4a6c1c55c
  1. 23
      README.md
  2. 3
      bio2/README.md
  3. 101
      bio2/bam/header.d
  4. 548
      bio2/bam/reader.d
  5. 86
      bio2/bam/writer.d
  6. 450
      bio2/bgzf.d
  7. 243
      bio2/bgzf_writer.d
  8. 11
      bio2/constants.d
  9. 99
      bio2/hashing.d
  10. 1
      bio2/logger.d
  11. 341
      bio2/pileup.d
  12. 46
      bio2/reads.d
  13. 41
      bio2/unpack.d

23
README.md

@ -8,25 +8,34 @@ BioD aims to:
- automatic parallelization of tasks where possible for example reading and writing BAM files
- reducing the GC overhead by avoiding unnecessary memory allocations
* Offer support for manipulating common biological data formats
* Write clear documented and maintainable codebase
# Usage
# Install
The current default is to provide the path to the checked out repo to the D-compiler. For example
in sambamba we use
BioD can be installed via the [D package manager](https://code.dlang.org/packages/biod).
DFLAGS = -wi -I. -IBioD -g
# Usage
See the [examples directory](https://github.com/biod/BioD/tree/master/examples)
for examples and usage.
BioD is also a crucial part of the [sambamba](https://github.com/biod/sambamba) tool.
# Contributing
Simply clone the repository on github and put in a pull request.
# BioD contributors and support
See [contributors](https://github.com/biod/BioD/graphs/contributors). For support
contact
See
[contributors](https://github.com/biod/BioD/graphs/contributors). For
support use the [issue tracker](https://github.com/biod/BioD/issues) or contact
* [Artem Tarasov](https://github.com/lomereiter)
* [Pjotr Prins](https://github.com/pjotrp)
* [Artem Tarasov](https://github.com/lomereiter)
* [George Githinji](https://github.com/biorelated)
# License

3
bio2/README.md

@ -0,0 +1,3 @@
bio2 modules are the next generation BAM reading tools. Some of the
functionality aims to replace that of bio modules in this same
repository.

101
bio2/bam/header.d

@ -0,0 +1,101 @@
/*
New style BAM reader. This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA
*/
// This is a complete rewrite of Artem Tarasov's original reader.
module bio2.bam.header;
/*
import std.conv;
import core.stdc.stdio: fopen, fread, fclose;
import std.typecons;
import std.bitmanip;
import bio.bam.cigar;
*/
import std.exception;
import std.file;
import std.stdio;
import std.string;
import bio.bam.constants;
import bio2.bgzf;
import bio2.bgzf_writer;
import bio2.constants;
struct RefSequence {
size_d length;
string name;
}
struct BamHeader {
string id;
string text;
RefSequence[] refs;
@disable this(this); // disable copy semantics;
}
void fetch_bam_header(ref BamHeader header, ref BgzfStream stream) {
// stderr.writeln("Fetching BAM header");
ubyte[4] ubyte4;
stream.read(ubyte4);
enforce(ubyte4 == BAM_MAGIC,"Invalid file format: expected BAM magic number");
immutable text_size = stream.read!int();
// stderr.writeln("Text size ",text_size.sizeof," ",text_size);
immutable text = stream.read!string(text_size);
header = BamHeader(BAM_MAGIC,text);
immutable n_refs = stream.read!int();
// stderr.writeln("Fetching ",n_refs," references");
foreach(int n_ref; 0..n_refs) {
immutable l_name = stream.read!int();
// stderr.writeln("!!",l_name);
auto ref_name = stream.read!string(l_name);
immutable l_ref = stream.read!int(); // length of reference sequence (bps)
// stderr.writeln(l_name," ",ref_name," ",l_ref);
header.refs ~= RefSequence(l_ref,ref_name[0..l_name-1]); // drop zero terminator
}
}
void write_bam_header(ref BgzfWriter bw, ref BamHeader header) {
// stderr.writeln("Writing BAM header");
ubyte[4] magic = cast(ubyte[])BAM_MAGIC;
bw.write(magic);
// stderr.writeln("Text size ",int.sizeof," ",header.text.length);
bw.write!int(cast(int)header.text.length);
bw.write(header.text);
auto n_refs = cast(int)header.refs.length;
bw.write!int(cast(int)header.refs.length);
// stderr.writeln("Writing ",n_refs," references");
foreach(int n_ref; 0..n_refs) {
immutable refseq = header.refs[n_ref];
bw.write!int(cast(int)(refseq.name.length+1)); // incl. zero terminator
// stderr.writeln("!!",refseq.name.length+1);
bw.write(refseq.name);
bw.write!ubyte(cast(ubyte)'\0');
bw.write!int(cast(int)refseq.length);
// stderr.writeln(refseq.name.length+1," ",refseq.name," ",refseq.length);
}
// stderr.writeln("!!");
bw.flush_block();
}

548
bio2/bam/reader.d

@ -0,0 +1,548 @@
/*
New style BAM reader. This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA
*/
// This is a complete rewrite of Artem Tarasov's original reader.
module bio2.bam.reader;
import std.conv;
import core.stdc.stdio: fopen, fread, fclose;
import std.exception;
import std.file;
import std.format : format;
import std.stdio;
import std.string;
import std.typecons;
import std.bitmanip;
import bio.bam.cigar;
import bio.bam.constants;
import bio.core.utils.exception;
import bio2.bgzf;
import bio2.constants;
import bio2.bam.header;
template ReadFlags(alias flag) {
@property bool is_paired() nothrow { return cast(bool)(flag & 0x1); }
/// Each segment properly aligned according to the aligner
@property bool is_proper_pair() nothrow { return cast(bool)(flag & 0x2); }
@property bool is_unmapped_raw() nothrow { return cast(bool)(flag & 0x4); }
@property bool is_mapped_raw() nothrow { return cast(bool)(!(flag & 0x4)); }
@property bool mate_is_unmapped() nothrow { return cast(bool)(flag & 0x8); }
@property bool is_reverse_strand() nothrow { return cast(bool)(flag & 0x10); }
@property bool mate_is_reverse_strand() nothrow { return cast(bool)(flag & 0x20); }
@property bool is_first_of_pair() nothrow { return cast(bool)(flag & 0x40); }
@property bool is_second_of_pair() nothrow { return cast(bool)(flag & 0x80); }
@property bool is_secondary_alignment() nothrow { return cast(bool)(flag & 0x100); }
@property bool is_qc_fail() {
assert(is_mapped_raw,to!string(this));
return cast(bool)(flag & 0x200); }
alias is_qc_fail failed_quality_control;
/// PCR or optical duplicate
@property bool is_duplicate() nothrow { return cast(bool)(flag & 0x400); }
/// Supplementary alignment
@property bool is_supplementary() nothrow { return cast(bool)(flag & 0x800); }
@property string show_flags() {
string res = format("b%b-%d",flag,flag);
if (is_paired) res ~= " pair";
if (is_proper_pair) res ~= " proper";
if (is_mapped_raw) res ~= " mapped";
if (is_unmapped_raw) res ~= " unmapped";
if (mate_is_unmapped) res ~= " mate_unmapped";
if (is_reverse_strand) res ~= " rev_strand";
if (mate_is_reverse_strand) res ~= " mate_rev_strand";
if (is_first_of_pair) res ~= " first_of_pair";
if (is_second_of_pair) res ~= " second_of_pair";
if (is_secondary_alignment) res ~= " secondary_aln";
if (is_mapped_raw && is_qc_fail) res ~= " qc_fail";
if (is_duplicate) res ~= " duplicate";
if (is_supplementary) res ~= " suppl";
return res;
}
}
template CheckMapped(alias refid) {
@property nothrow bool is_unmapped() {
return is_unmapped_raw;
}
@property bool is_mapped() {
debug {
if (is_mapped_raw) {
assert(refid != -1, "ref_id can not be -1 for mapped read"); // BAM spec
}
}
return !is_unmapped_raw;
}
}
enum Offset {
bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24
};
/**
Raw Read buffer containing unparsed data. It should be considered
read-only.
Current implementation is a cluct (class-struct hybrid). The _data
pointer is shared when ReadBlob is assigned to another variable
(i.e., there is a remote dependency). The advantage is that for
each Read data gets allocated on the heap only once.
All offsets are indexed on init (except for tags). When using
fields beyond refid,pos use ProcessReadBlob instead because it
caches values.
*/
struct ReadBlob {
RefId refid; // -1 is invalid (BAM Spec)
GenomePos pos; // 0 coordinate based (BAM spec)
private ubyte[] _data;
uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max;
mixin ReadFlags!(_flag);
mixin CheckMapped!(refid);
/*
this(RefId ref_id, GenomePos read_pos, ubyte[] buf) {
refid = ref_id;
pos = read_pos;
_data = buf;
}
*/
@property void cleanup() {
delete _data;
_data = null;
}
// Turn ReadBlob into class-struct hybrid or a cluct ;)
// @disable this(this); // disable copy semantics;
@property @trusted nothrow private const T fetch(T)(uint raw_offset) {
ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof];
return cast(const(T))buf.read!(T,Endian.littleEndian)();
}
@property @trusted nothrow private const
uint _bin_mq_nl() { return fetch!uint(Offset.bin_mq_nl); }
@property @trusted nothrow private const
uint _flag_nc() { return fetch!uint(Offset.flag_nc); }
@property @trusted nothrow private const
int sequence_length() { return fetch!int(Offset.l_seq); }
@property @trusted nothrow private const
int _next_refID() { return fetch!int(Offset.next_refID); }
@property @trusted nothrow private const
int _next_pos() { return fetch!int(Offset.next_pos); }
@property @trusted nothrow private const
int _tlen() { return fetch!int(Offset.tlen); } // avoid using TLEN
@property @trusted nothrow private const
ushort _bin() { return _bin_mq_nl >> 16; }
@property @trusted nothrow private const
ubyte _mapq() { return (_bin_mq_nl >> 8) & 0xFF; }
@property @trusted nothrow private const
ubyte _l_read_name() { return _bin_mq_nl & 0xFF; }
@property @trusted nothrow private const
ushort _flag() { return fetch!ushort(Offset.flag); }
@property @trusted nothrow private const
ushort _n_cigar_op() { return _flag_nc & 0xFFFF; }
@property @trusted nothrow private const
uint _read_name_offset() { return Offset.read_name; }
@property @trusted nothrow private
uint _cigar_offset() {
if (offset_cigar == int.max)
offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof);
return offset_cigar;
}
@property @trusted nothrow private
uint _seq_offset() {
if (offset_seq == int.max)
offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof);
return offset_seq;
}
@property @trusted nothrow private
uint _qual_offset() {
if (offset_qual == int.max)
offset_qual = _seq_offset + (sequence_length + 1)/2;
return offset_qual;
}
@property @trusted nothrow private
uint _tags_offset() { return _qual_offset + sequence_length; }
@property @trusted nothrow private
ubyte[] read_name() { return _data[_read_name_offset.._cigar_offset]; }
@property @trusted nothrow private
ubyte[] raw_cigar() { return _data[_cigar_offset.._seq_offset]; }
@property @trusted nothrow private
ubyte[] raw_sequence() { return _data[_seq_offset.._qual_offset]; }
alias sequence_length _l_seq;
alias _mapq mapping_quality; // MAPQ
string toString() {
return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">";
}
}
/**
ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use
this when you need to access field/elements multiple times. Note
that ProcessReadBlob becomes invalid when ReadBlob goes out of scope.
*/
struct ProcessReadBlob {
private Nullable!ReadBlob _read2;
Nullable!int sequence_length2;
private Nullable!string sequence2, read_name2;
private Nullable!CigarOperations cigar2;
private Nullable!GenomePos consumed_reference_bases2;
mixin ReadFlags!(_flag);
mixin CheckMapped!(ref_id);
this(Nullable!ReadBlob _r) {
_read2 = _r;
}
@property void cleanup() {
_read2.cleanup;
}
@property nothrow bool isNull() {
return _read2.isNull;
}
@property RefId ref_id() {
enforce(_read2.is_mapped,"Trying to get ref_id an unmapped read " ~ to!string(_read2));
return _read2.refid;
}
@property RefId raw_ref_id() {
return _read2.refid;
}
@property nothrow uint _flag_nc() {
return _read2._flag_nc;
}
@property nothrow ushort _flag() {
return _read2._flag;
}
alias ref_id refid;
/// Get the start position on the reference sequence (better use
/// start_loc), i.e., the first base that gets consumed in the
/// CIGAR.
@property GenomePos start_pos() {
assert(_read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec
asserte(_read2.pos < GenomePos.max);
return cast(GenomePos)_read2.pos;
}
@property GenomePos raw_start_pos() {
return cast(GenomePos)_read2.pos;
}
/// Get the end position on the reference sequence (better use end_loc)
@property GenomePos end_pos() {
assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence");
assert(!consumed_reference_bases.isNull);
return start_pos + consumed_reference_bases;
}
@property GenomePos raw_end_pos() {
return raw_start_pos + consumed_reference_bases;
}
@property GenomeLocation start_loc() {
return GenomeLocation(ref_id,start_pos);
}
@property GenomeLocation end_loc() {
return GenomeLocation(ref_id,end_pos);
}
@property @trusted MappingQuality mapping_quality() { // MAPQ
assert(_read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec
return MappingQuality(_read2.mapping_quality);
}
@property @trusted int tlen() { // do not use
return _read2._tlen;
}
@property @trusted GenomePos sequence_length() {
if (sequence_length2.isNull)
sequence_length2 = _read2.sequence_length;
return sequence_length2;
}
/// Count and caches consumed reference bases. Uses raw_cigar to
/// avoid a heap allocation.
@property @trusted Nullable!GenomePos consumed_reference_bases() {
if (consumed_reference_bases2.isNull) {
assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) _read2.raw_cigar();
if (raw.length==1 && raw[0] == '*')
return consumed_reference_bases2; // null
else {
GenomePos bases = 0;
for (size_t i = 0; i < raw.length; i++) {
auto cigarop = CigarOperation(raw[i]);
if (cigarop.is_reference_consuming)
bases += cigarop.length;
}
consumed_reference_bases2 = bases;
}
}
return consumed_reference_bases2;
}
/// Count query consumed bases. Uses raw_cigar to avoid a heap
/// allocation.
@property @trusted GenomePos consumed_query_bases() {
assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) _read2.raw_cigar();
if (raw.length==1 && raw[0] == '*')
return consumed_reference_bases2; // null
else {
GenomePos bases = 0;
for (size_t i = 0; i < raw.length; i++) {
auto cigarop = CigarOperation(raw[i]);
if (cigarop.is_query_consuming)
bases += cigarop.length;
}
return bases;
}
}
/// Return read name as a string. If unavailable returns
/// null. Caches name.
@property Nullable!string read_name() {
if (read_name2.isNull) {
assert(_read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec
auto raw = _read2.read_name;
if (raw.length == 0 || (raw.length ==1 && raw[0] == '*'))
return read_name2; // null
assert(raw.length < 255); // BAM spec
if (raw[raw.length-1] == 0) // strip trailing C zero
raw.length -= 1;
read_name2 = Nullable!string(cast(string)raw);
}
return read_name2;
}
/// Returns Cigar as an array of operations. Returns null if no
/// operations. Caches Cigar when there are operations.
@property Nullable!CigarOperations cigar() {
if (cigar2.isNull) {
assert(_read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) _read2.raw_cigar();
if (raw.length==0 || (raw.length==1 && raw[0] == '*'))
return cigar2; // null
else {
auto s = new CigarOperation[raw.length]; // Heap alloc
s.length = 0;
for (size_t i = 0; i < raw.length; i++) {
s ~= CigarOperation(raw[i]);
}
cigar2 = s;
}
}
return cigar2;
}
/// Return human readable sequence fragment - null if
/// undefined. Caches sequence.
@property Nullable!string sequence() {
if (sequence2.isNull) { // is it cached in sequence2?
auto raw = _read2.raw_sequence();
if (raw[0] == '*') {
assert(raw.length == 1);
return sequence2; // null
}
auto raw_length = (sequence_length + 1) / 2;
char[16] convert = "=ACMGRSVTWYHKDBN";
string s;
s.reserve(sequence_length); // Heap alloc
for (size_t i = 0; i < sequence_length; i++) {
auto is_odd = i % 2;
auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4);
s ~= convert[nuc];
}
sequence2 = s;
}
return sequence2;
}
@property ubyte[] toBlob() {
return _read2._data;
}
@property string posString() {
return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped");
}
string toString() {
// return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(_read2.refid) ~ ":" ~ to!string(_read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">";
return _read2.get.toString();
}
}
/**
BamReader2 is used for foreach loops
*/
struct BamReadBlobs {
BgzfStream stream;
BamHeader header;
this(string fn) {
stream = BgzfStream(fn);
}
int opApply(scope int delegate(ref ReadBlob) dg) {
fetch_bam_header(header, stream);
// parse the reads
while (!stream.eof()) {
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();
ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME
auto read = ReadBlob(refid,pos,stream.read(data));
dg(read);
}
return 0;
}
}
/**
Read streamer - use on single thread only
*/
// import core.memory : pureMalloc;
struct BamReadBlobStream {
BgzfStream stream;
BamHeader header;
Nullable!ReadBlob readbuf; // points to current read
ubyte[] data; // in sync with readbuf
this(string fn) {
stream = BgzfStream(fn);
fetch_bam_header(header, stream);
if (!empty)
popFront(); // preload front
}
bool empty() @property {
return stream.eof();
}
// Returns first read available. If past eof returns null.
Nullable!ReadBlob front() {
return readbuf;
}
void popFront() {
asserte(!empty); // should have been checked for
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();
// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
data = new ubyte[block_size-2*int.sizeof];
readbuf = ReadBlob(refid,pos,stream.read(data));
assert(readbuf._data.ptr == data.ptr);
}
}
/**
Reader - use on single thread only
This one provides peek support. Peek looks one read ahead in the read stream.
*/
struct BamBlobReader {
BgzfStream stream;
BamHeader header;
Nullable!ReadBlob peekbuf; // points to current read
// ubyte[] data; // in sync with peekbuf
this(string fn) {
stream = BgzfStream(fn);
fetch_bam_header(header, stream);
}
bool empty() @property {
return peekbuf.isNull && stream.eof();
}
Nullable!ReadBlob peek() {
if (peekbuf.isNull && !empty)
fetch();
return peekbuf;
}
/// Fetches the next read. If the peekbuf is not empty return that
/// first and reset peekbuf.
Nullable!ReadBlob fetch() {
if (!peekbuf.isNull) {
auto readbuf = peekbuf;
peekbuf = Nullable!ReadBlob();
return readbuf;
}
asserte(!empty); // should have been checked for
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();
// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
auto data = new ubyte[block_size-2*int.sizeof];
peekbuf = ReadBlob(refid,pos,stream.read(data));
return peekbuf;
}
/// Returns the next matching read. Otherwise null
///
/// Example:
///
/// auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped));
/*
Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) {
while(!empty()) {
auto readbuf = read();
if (is_match(R(readbuf)))
return readbuf;
else
return Nullable!ReadBlob();
}
return Nullable!ReadBlob();
}
*/
}

86
bio2/bam/writer.d

@ -0,0 +1,86 @@
/*
New style BAM writer. This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA
*/
module bio2.bam.writer;
import std.conv;
import core.stdc.stdio: fopen, fread, fclose;
import std.exception;
import std.file;
import std.stdio;
import std.string;
import std.typecons;
import std.bitmanip;
import bio.bam.cigar;
import bio.bam.constants;
import bio2.bgzf;
import bio2.bgzf_writer;
import bio2.constants;
import bio2.bam.header;
import bio2.bam.reader : ProcessReadBlob, Offset;
struct ModifyProcessReadBlob { // make this generic later
ProcessReadBlob _read2;
@property ubyte[] toBlob() {
return _read2.toBlob();
}
@property void set_qc_fail() {
auto data = _read2.toBlob;
// writeln(_read2._flag);
// data[Offset.flag_nc] = data[Offset.flag_nc] & 0x200;
// writeln(data[Offset.flag_nc]);
// buf.write!(T,Endian.littleEndian)(value,0);
// ushort _flag() { return fetch!ushort(Offset.flag); }
ushort flag = _read2._flag | 0x200;
// writeln("flag=",flag);
data[Offset.flag..Offset.flag+4].write!(ushort,Endian.littleEndian)(flag,0);
}
}
struct BamWriter {
BgzfWriter bgzf_writer;
this(string fn, ref BamHeader header, int compression_level = -1) {
bgzf_writer = BgzfWriter(fn,compression_level);
write_bam_header(bgzf_writer,header);
}
void push(ModifyProcessReadBlob read) {
auto mod = read;
auto blob = mod.toBlob;
// another hack for now:
bgzf_writer.write!int(cast(int)(blob.length+2*int.sizeof));
bgzf_writer.write!int(cast(int)mod._read2.raw_ref_id);
bgzf_writer.write!int(cast(int)mod._read2.raw_start_pos);
bgzf_writer.write(blob);
}
void push(ProcessReadBlob read) {
push(ModifyProcessReadBlob(read));
}
}

450
bio2/bgzf.d

@ -0,0 +1,450 @@
/*
This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA
*/
module bio2.bgzf;
/**
This is a new version of sambamba bgzf (under development).
The new version is a prototype for new sambamba architecture using
canonical D language features, including immutable and improved
laziness and a more functional programming style. It should provide
improved performance and minimize RAM use, as well as better
composability.
Authors: Pjotr Prins
*/
import core.stdc.string : memcpy;
import std.bitmanip;
import std.conv;
import std.exception;
import std.file;
import std.stdio;
import std.typecons;
import std.zlib : calc_crc32 = crc32, ZlibException;
import bio.bam.constants;
import bio.core.bgzf.block;
import bio.core.bgzf.constants;
import bio.core.utils.zlib : inflateInit2, inflate, inflateEnd, Z_OK, Z_FINISH, Z_STREAM_END;
import bio2.constants;
class BgzfException : Exception {
this(string msg) { super(msg); }
}
alias Nullable!ulong FilePos;
alias immutable(uint) CRC32;
alias BGZF_MAX_BLOCK_SIZE BLOCK_SIZE;
alias ubyte[BLOCK_SIZE] BlockBuffer;
@property ubyte read_ubyte(File f) {
ubyte[1] ubyte1; // read buffer
immutable ubyte[1] buf = f.rawRead(ubyte1);
return buf[0];
}
@property ushort read_ushort(File f) {
ubyte[2] ubyte2; // read buffer
immutable ubyte[2] buf = f.rawRead(ubyte2);
return littleEndianToNative!ushort(buf);
}
@property uint read_uint(File f) {
ubyte[4] ubyte4; // read buffer
immutable ubyte[4] buf = f.rawRead(ubyte4);
return littleEndianToNative!uint(buf);
}
/**
Uncompress a zlib buffer (without header)
*/
ubyte[] deflate(ubyte[] uncompressed_buf, const ubyte[] compressed_buf, size_t uncompressed_size, CRC32 crc32) {
assert(uncompressed_buf.length == BLOCK_SIZE);
bio.core.utils.zlib.z_stream zs;
zs.next_in = cast(typeof(zs.next_in))compressed_buf;
zs.avail_in = to!uint(compressed_buf.length);
auto err = inflateInit2(&zs, /* winbits = */-15);
if (err != Z_OK) throw new ZlibException(err);
zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr;
zs.avail_out = cast(int)uncompressed_buf.length;
scope(exit) { inflateEnd(&zs); }
err = inflate(&zs, Z_FINISH);
if (err != Z_STREAM_END) throw new ZlibException(err);
assert(zs.total_out == uncompressed_size);
uncompressed_buf.length = uncompressed_size;
assert(crc32 == calc_crc32(0, uncompressed_buf[]));
return uncompressed_buf;
}
/**
BgzfReader is designed to run on a single thread. All it does is
fetch block headers and data, so the thread should easily keep up
with IO. All data processing is happening lazily in other threads.
*/
struct BgzfReader {
File f;
FilePos report_fpos; // for error handler - assumes one thread!
this(string fn) {
enforce(fn.isFile);
f = File(fn,"r");
}
@disable this(this); // BgzfReader does not have copy semantics;
void throwBgzfException(string msg, string file = __FILE__, size_t line = __LINE__) {
throw new BgzfException("Error reading BGZF block starting in "~f.name ~" @ " ~
to!string(report_fpos) ~ " (" ~ file ~ ":" ~ to!string(line) ~ "): " ~ msg);
}
void enforce1(bool check, lazy string msg, string file = __FILE__, int line = __LINE__) {
if (!check)
throwBgzfException(msg,file,line);
}
/**
Reads the block header and returns the contained compressed data
size with the file pointer positioned at the associated
compressed data.
*/
size_t read_block_header() {
ubyte[4] ubyte4;
auto magic = f.rawRead(ubyte4);
enforce1(magic.length == 4, "Premature end of file");
enforce1(magic[0..4] == BGZF_MAGIC,"Invalid file format: expected bgzf magic number");
ubyte[uint.sizeof + 2 * ubyte.sizeof] skip;
f.rawRead(skip); // skip gzip info
ushort gzip_extra_length = f.read_ushort();
immutable fpos1 = f.tell;
size_t bsize = 0;
while (f.tell < fpos1 + gzip_extra_length) {
immutable subfield_id1 = f.read_ubyte();
immutable subfield_id2 = f.read_ubyte();
immutable subfield_len = f.read_ushort();
if (subfield_id1 == BAM_SI1 && subfield_id2 == BAM_SI2) {
// BC identifier
enforce(gzip_extra_length == 6);
// FIXME: always picks first BC block
bsize = 1+f.read_ushort(); // BLOCK size
enforce1(subfield_len == 2, "BC subfield len should be 2");
break;
}
else {
f.seek(subfield_len,SEEK_CUR);
}
enforce1(bsize!=0,"block size not found");
f.seek(fpos1+gzip_extra_length); // skip any extra subfields - note we don't check for second BC
}
immutable compressed_size = bsize - 1 - gzip_extra_length - 19;
enforce1(compressed_size <= BLOCK_SIZE, "compressed size larger than allowed");
// stderr.writeln("[compressed] size ", compressed_size, " bytes starting block @ ", report_fpos);
return compressed_size;
}
/**
Fetch the compressed data part of the block and return it with
the uncompressed size and CRC32. The file pointer is assumed to
be at the start of the compressed data and will be at the end of
that section after.
*/
Tuple!(ubyte[],immutable(uint),CRC32) read_compressed_data(ubyte[] buffer) {
auto compressed_buf = f.rawRead(buffer);
immutable CRC32 crc32 = f.read_uint();
immutable uncompressed_size = f.read_uint();
// stderr.writeln("[uncompressed] size ",uncompressed_size);
return tuple(compressed_buf,uncompressed_size,crc32);
}
/**
* Returns new tuple of the new file position, the compressed buffer and
* the CRC32 o the uncompressed data. file pos is NULL when done
*/
Tuple!(FilePos,ubyte[],size_t,CRC32) read_compressed_block(FilePos fpos, ubyte[] buffer) {
immutable start_offset = fpos;
try {
if (fpos.isNull) throwBgzfException("Trying to read past eof");
report_fpos = fpos;
f.seek(fpos);
immutable compressed_size = read_block_header();
auto ret = read_compressed_data(buffer[0..compressed_size]);
auto compressed_buf = ret[0];
immutable uncompressed_size = ret[1];
immutable crc32 = ret[2];
if (uncompressed_size == 0) {
// check for eof marker, rereading block header
auto lastpos = f.tell();
f.seek(start_offset);
ubyte[BGZF_EOF.length] buf;
f.rawRead(buf);
f.seek(lastpos);
if (buf == BGZF_EOF)
return tuple(FilePos(),compressed_buf,cast(ulong)0,crc32); // sets fpos to null
}
return tuple(FilePos(f.tell()),compressed_buf,cast(size_t)uncompressed_size,crc32);
} catch (Exception e) { throwBgzfException(e.msg,e.file,e.line); }
assert(0); // never reached
}
}
/**
Simple block iterator
*/
struct BgzfBlocks {
BgzfReader bgzf;
this(string fn) {
bgzf = BgzfReader(fn);
}
@disable this(this); // disable copy semantics;
int opApply(scope int delegate(ubyte[]) dg) {
FilePos fpos = 0;
try {
while (!fpos.isNull) {
BlockBuffer stack_buffer;
auto res = bgzf.read_compressed_block(fpos,stack_buffer);
fpos = res[0]; // point fpos to next block
if (fpos.isNull) break;
auto compressed_buf = res[1]; // same as stack_buffer
auto uncompressed_size = res[2];
auto crc32 = res[3];
BlockBuffer uncompressed_buf;
// call delegated function with new block
dg(deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32));
}
} catch (Exception e) { bgzf.throwBgzfException(e.msg,e.file,e.line); }
return 0;
}
}
Tuple!(size_t,FilePos) read_blockx(ref BgzfReader bgzf, FilePos fpos, ref ubyte[] uncompressed_buf) {
BlockBuffer compressed_buf;
auto res = bgzf.read_compressed_block(fpos,compressed_buf);
fpos = res[0]; // point fpos to next block
if (fpos.isNull) return tuple(cast(size_t)0,fpos);
auto data = res[1];
assert(data.ptr == compressed_buf.ptr);
size_t uncompressed_size = res[2];
auto crc32 = res[3];
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32);
return tuple(uncompressed_size,fpos);
}
import std.parallelism;
int kick_off_reading_block_ahead(ubyte[] uncompressed_buf, ubyte[] compressed_buf, size_t uncompressed_size, CRC32 crc32) {
// writeln("HEY " ~ to!string(uncompressed_size));
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32);
return -1;
}
/**
*/
struct BlockReadAhead {
bool task_running = false, we_have_a_task = false;
Task!(kick_off_reading_block_ahead, ubyte[], ubyte[], size_t, CRC32)* t;
FilePos fpos2;
size_t uncompressed_size2 = 0;
BlockBuffer compressed_buf2;
BlockBuffer uncompressed_buf2;
private void read_next_block() {
}
private void add_deflate_task() {
}
private void copy_deflated_buffer() {
}
void setup_block_reader() {
read_next_block();
add_deflate_task();
throw new Exception("NYI");
}
Tuple!(size_t,FilePos) read_block(ref BgzfReader bgzf, FilePos fpos, ref ubyte[] uncompressed_buf) {
assert(we_have_a_task);
copy_deflated_buffer();
read_next_block();
add_deflate_task();
// return
if (task_running) {
int res = t.yieldForce;
// writeln(res);
task_running = false;
memcpy(uncompressed_buf.ptr,compressed_buf2.ptr,uncompressed_size2);
return tuple(uncompressed_size2, fpos2);
}
else {
BlockBuffer compressed_buf;
auto res = bgzf.read_compressed_block(fpos,compressed_buf);
fpos = res[0]; // point fpos to next block
if (fpos.isNull) return tuple(cast(size_t)0,fpos);
auto data = res[1];
assert(data.ptr == compressed_buf.ptr);
size_t uncompressed_size = res[2];
auto crc32 = res[3];
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32);
// now set up a new buffer
auto res2 = bgzf.read_compressed_block(fpos,compressed_buf2);
fpos2 = res[0]; // point fpos to next block
if (!fpos2.isNull) {
auto data2 = res2[1];
uncompressed_size2 = res2[2];
t = task!kick_off_reading_block_ahead(cast(ubyte[])uncompressed_buf2,cast(ubyte[])compressed_buf2,uncompressed_size2,res2[3]);
t.executeInNewThread();
task_running = true;
}
return tuple(uncompressed_size,fpos);
}
}
}
/**
*/
struct BlockReadUnbuffered {
void setup_block_reader() {
}
Tuple!(ubyte[], size_t, FilePos) read_block(ref BgzfReader bgzf, in FilePos fpos, ubyte[] uncompressed_buf) {
BlockBuffer compressed_buf;
auto res = bgzf.read_compressed_block(fpos,compressed_buf);
auto fpos2 = res[0]; // point fpos to next block
if (fpos.isNull) return tuple(uncompressed_buf,cast(size_t)0,fpos2);
auto data = res[1];
assert(data.ptr == compressed_buf.ptr);
size_t uncompressed_size = res[2];
auto crc32 = res[3];
auto buf = deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32);
assert(buf.ptr == uncompressed_buf.ptr);
return tuple(uncompressed_buf,uncompressed_size,fpos2);
}
}
/**
Streams bgzf data and fetch items by unit or buffer. These can go beyond
the size of individual blocks(!)
*/
struct BgzfStream {
BgzfReader bgzf;
FilePos fpos; // track file position
ubyte[] uncompressed_buf; // current data buffer
size_t uncompressed_size; // current data buffer size
Nullable!int block_pos; // position in block
BlockReadUnbuffered blockread;
this(string fn) {
bgzf = BgzfReader(fn);
uncompressed_buf = new ubyte[BLOCK_SIZE];
fpos = 0;
}
@disable this(this); // disable copy semantics;
bool eof() {
return fpos.isNull;
}
/**
Fetch data into buffer. The size of the buffer can be larger than
one or more multiple blocks
*/
ubyte[] fetch(ubyte[] buffer) {
if (block_pos.isNull) {
blockread.setup_block_reader();
auto res = blockread.read_block(bgzf,fpos,uncompressed_buf); // read first block
assert(res[0].ptr == uncompressed_buf.ptr);
uncompressed_size = res[1];
fpos = res[2];
block_pos = 0;
}
immutable buffer_length = buffer.length;
size_t buffer_pos = 0;
size_t remaining = buffer_length;
while (remaining > 0) {
if (block_pos + remaining < uncompressed_size) {
// full copy
assert(buffer_pos + remaining == buffer_length);
memcpy(buffer[buffer_pos..buffer_pos+remaining].ptr,uncompressed_buf[block_pos..block_pos+remaining].ptr,remaining);
block_pos += remaining;
remaining = 0;
}
else {
// read tail of buffer
immutable tail = uncompressed_size - block_pos;
memcpy(buffer[buffer_pos..buffer_pos+tail].ptr,uncompressed_buf[block_pos..uncompressed_size].ptr,tail);
buffer_pos += tail;
remaining -= tail;
auto res = blockread.read_block(bgzf,fpos,uncompressed_buf);
assert(res[0].ptr == uncompressed_buf.ptr);
uncompressed_size = res[1];
fpos = res[2];
block_pos = 0;
}
}
return buffer;
}
int read(T)() { // for integers
ubyte[T.sizeof] buf;
auto b = fetch(buf);
return b.read!(T,Endian.littleEndian)();
}
string read(T)(size_t len) {
ubyte[] buf = new ubyte[len]; // heap allocation
fetch(buf);
return cast(T)buf;
}
T[] read(T)(T[] buffer) { return cast(T[])fetch(cast(ubyte[])buffer); };
}

243
bio2/bgzf_writer.d

@ -0,0 +1,243 @@
/*
New style BGZF writer. This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA
*/
// Based on the original by Artem Tarasov.
module bio2.bgzf_writer;
// import core.stdc.stdlib : malloc, free;
import core.memory: pureMalloc, pureFree;
import core.stdc.stdio: fopen, fread, fclose;
import std.bitmanip;
import std.conv;
import std.exception;
import std.typecons;
import std.parallelism;
import std.array;
import std.algorithm : max;
import std.stdio;
import std.typecons;
import bio.core.bgzf.compress;
import bio.core.utils.roundbuf;
// import undead.stream;
import bio.bam.constants: BGZF_MAX_BLOCK_SIZE, BGZF_BLOCK_SIZE, BGZF_EOF;
import bio2.bgzf;
import bio2.constants;
alias void delegate(ubyte[], ubyte[]) BlockWriteHandler;
/// Convenience function for Taskpool handler
Tuple!(ubyte[], ubyte[], BlockWriteHandler) bgzfCompressFunc(ubyte[] input,
int level,
ubyte[] output_buffer,
BlockWriteHandler handler)
{
auto output = bgzfCompress(input, level, output_buffer);
return tuple(input, output, handler);
}
/// BGZF compression - this is a port of the original that used the
/// undead.stream library.
struct BgzfWriter {
private:
File f;
TaskPool task_pool;
ubyte[] buffer; // a slice into compression_buf (uncompressed data)
ubyte[] tmp; // a slice into compression_buf (compressed data)
size_t current_size;
int compression_level;
alias Task!(bgzfCompressFunc,
ubyte[], int, ubyte[], BlockWriteHandler) CompressionTask;
RoundBuf!(CompressionTask*) _compression_tasks;
ubyte[] compression_buf;
public:
/// Create new BGZF output stream with a multi-threaded writer
this(string fn, int _compression_level=-1) {
f = File(fn,"wb");
enforce1(-1 <= compression_level && compression_level <= 9,
"BGZF compression level must be a number in interval [-1, 9]");
size_t max_block_size = BGZF_MAX_BLOCK_SIZE;
size_t block_size = BGZF_BLOCK_SIZE;
task_pool = taskPool(),
compression_level = _compression_level;
// create a ring buffer that is large enough
size_t n_tasks = max(task_pool.size, 1) * 16;
_compression_tasks = RoundBuf!(CompressionTask*)(n_tasks);
// create extra block to which we can write while n_tasks are
// executed
auto comp_buf_size = (2 * n_tasks + 2) * max_block_size;
auto p = cast(ubyte*)pureMalloc(comp_buf_size);
compression_buf = p[0 .. comp_buf_size];
buffer = compression_buf[0 .. block_size];
tmp = compression_buf[max_block_size .. max_block_size * 2];
}
~this() {
close();
}
@disable this(this); // BgzfWriter does not have copy semantics;
void throwBgzfException(string msg, string file = __FILE__, size_t line = __LINE__) {
throw new BgzfException("Error writing BGZF block starting in "~f.name ~
" (" ~ file ~ ":" ~ to!string(line) ~ "): " ~ msg);
}
void enforce1(bool check, lazy string msg, string file = __FILE__, int line = __LINE__) {
if (!check)
throwBgzfException(msg,file,line);
}
void write(const void* buf, size_t size) {
// stderr.writeln("HEY1 writing bytes ",size);
if (size + current_size >= buffer.length) {
size_t room;
ubyte[] data = (cast(ubyte*)buf)[0 .. size];
while (data.length + current_size >= buffer.length) {
room = buffer.length - current_size;
buffer[$ - room .. $] = data[0 .. room];
data = data[room .. $];
current_size = buffer.length;
flush_block();
}
buffer[0 .. data.length] = data[];
current_size = data.length;
} else {
buffer[current_size .. current_size + size] = (cast(ubyte*)buf)[0 .. size];
current_size += size;
}
// return size;
}
void write(ubyte[] buf) {
write(buf.ptr, buf.length);
}
void write(string s) {
write(cast(ubyte[])s);
}
void write(T)(T value) { // int values
// ubyte[T.sizeof] buf;
ubyte[] buf = [0,0,0,0,0,0,0,0,0,0];
assert(T.sizeof < buf.length);
buf.write!(T,Endian.littleEndian)(value,0);
// writeln("HEY T.sizeof: ",T.sizeof," value ",value," ",buf[0..T.sizeof]);
write(buf[0..T.sizeof]);
}
/// Force flushing current block, even if it is not yet filled.
/// Should also be used when it's not desired to have records
/// crossing block borders.
void flush_block() {
if (current_size == 0)
return;
Tuple!(ubyte[], ubyte[], BlockWriteHandler) front_result;
if (_compression_tasks.full) {
front_result = _compression_tasks.front.yieldForce();
_compression_tasks.popFront();
}
auto compression_task = task!bgzfCompressFunc(buffer[0 .. current_size],
compression_level, tmp,
_before_write);
_compression_tasks.put(compression_task);
task_pool.put(compression_task);
size_t offset = buffer.ptr - compression_buf.ptr;
immutable N = tmp.length;
offset += 2 * N;
if (offset == compression_buf.length)
offset = 0;
buffer = compression_buf[offset .. offset + buffer.length];
tmp = compression_buf[offset + N .. offset + 2 * N];
current_size = 0;
if (front_result[0] !is null)
writeResult(front_result);
while (!_compression_tasks.empty) {
auto task = _compression_tasks.front;
if (!task.done())
break;
auto result = task.yieldForce();
writeResult(result);
_compression_tasks.popFront();
}
}
private void delegate(ubyte[], ubyte[]) _before_write;
void setWriteHandler(void delegate(ubyte[], ubyte[]) handler) {
_before_write = handler;
}
private void writeResult(Tuple!(ubyte[], ubyte[], BlockWriteHandler) block) {
auto uncompressed = block[0];
auto compressed = block[1];
auto handler = block[2];
if (handler) {// write handler enabled
handler(uncompressed, compressed);
}
// _stream.writeExact(compressed.ptr, compressed.length);
f.rawWrite(compressed);
}
/// Flush all remaining BGZF blocks and underlying stream.
void flush() {
flush_block();
while (!_compression_tasks.empty) {
auto task = _compression_tasks.front;
auto block = task.yieldForce();
writeResult(block);
_compression_tasks.popFront();
}
f.flush();
current_size = 0;
}
/// Flush all remaining BGZF blocks and close source stream.
/// Automatically adds empty block at the end, serving as indicator
/// of end of stream. This function is automatically called on
/// destruction.
void close() {
flush();
f.rawWrite(BGZF_EOF);
f.close();
pureFree(compression_buf.ptr);
}
}

11
bio2/constants.d

@ -0,0 +1,11 @@
module bio2.constants;
alias ulong size_d;
alias int RefId; // -1 is invalid FIXME
alias uint GenomePos; // 32-bits, do check when reading!
alias ubyte MappingQuality; // 255 is invalid FIXME
struct GenomeLocation {
RefId ref_id;
GenomePos pos;
}

99
bio2/hashing.d

@ -0,0 +1,99 @@
/*
This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.
Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA
*/
module bio2.hashing;
import std.conv;
import std.stdio;
pragma(inline):
pure nothrow @nogc ushort get16bits(char *p)
{
ushort p0 = p[0];
ushort p1 = p[1];
// return p[1]*256+p[0];
return cast(ushort)(p1*256+p0);
}
/// Paul Hsieh's fast hash (LGPL license), see http://www.azillionmonkeys.com/qed/hash.html
pure nothrow @nogc uint SuperFastHash(string str, uint hashinc = 0) {
auto data = cast(char*)str.ptr;
auto len = cast(uint)str.length;
uint hash = (hashinc > 0 ? hashinc : 0);
if (len == 0) return 0;
int rem = len & 3;
len >>= 2;
/* Main loop */
for (;len > 0; len--) {
hash += get16bits(data);
uint tmp = (get16bits(data+2) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2*ushort.sizeof;
hash += hash >> 11;
}
/* Handle end cases */
switch (rem) {
case 3: hash += get16bits(data);
hash ^= hash << 16;
hash ^= (cast(ushort)data[ushort.sizeof]) << 18;
hash += hash >> 11;
break;
case 2: hash += get16bits(data);
hash ^= hash << 11;
hash += hash >> 17;
break;
case 1: hash += cast(ushort)*data;
hash ^= hash << 10;
hash += hash >> 1;
break;
default: