backup the old directory and module structure

Branch: remotes/georgeg/master
Author: George Githinji, 1 year ago
Commit: 7deab8abd4

50 changed files with 3 additions and 11007 deletions
 1. .gitignore (+1, -0)
 2. bio/std/file/fasta.d (+1, -1)
 3. bio/std/file/fastq.d (+1, -1)
 4. bio2/README.md (+0, -3)
 5. bio2/bam/header.d (+0, -101)
 6. bio2/bam/reader.d (+0, -548)
 7. bio2/bam/writer.d (+0, -86)
 8. bio2/bgzf.d (+0, -451)
 9. bio2/bgzf_writer.d (+0, -243)
10. bio2/constants.d (+0, -11)
11. bio2/hashing.d (+0, -99)
12. bio2/logger.d (+0, -1)
13. bio2/pileup.d (+0, -341)
14. bio2/reads.d (+0, -46)
15. bio2/unpack.d (+0, -41)
16. dub.json (+0, -16)
17. examples/calculate_gc_content_from_reference.d (+0, -51)
18. examples/create_bam_from_scratch.d (+0, -48)
19. examples/iterate_tags.d (+0, -58)
20. examples/make_pileup.d (+0, -37)
21. examples/print_base_info.d (+0, -41)
22. examples/read_bam_file.d (+0, -42)
23. examples/transverse_multiple_bam_files.d (+0, -37)
24. meson.build (+0, -147)
25. src_ragel/Makefile (+0, -31)
26. src_ragel/maf_block.rl (+0, -153)
27. src_ragel/region.rl (+0, -86)
28. src_ragel/sam_alignment.rl (+0, -525)
29. src_ragel/workarounds/fix_static_const.sh (+0, -6)
30. src_ragel/workarounds/fix_switch_case_fallthrough.sh (+0, -6)
31. test/data/BXD_geno.txt.gz (BIN)
32. test/data/b.sam (+0, -3993)
33. test/data/b7_295_chunk.bam (BIN)
34. test/data/bins.bam (BIN)
35. test/data/bins.bam.bai (BIN)
36. test/data/corrupted_zlib_archive.bam (BIN)
37. test/data/duplicated_block_size.bam (BIN)
38. test/data/ex1_header.bam (BIN)
39. test/data/ex1_header.bam.bai (BIN)
40. test/data/ex1_header.sam (+0, -3273)
41. test/data/illu_20_chunk.bam (BIN)
42. test/data/ion_20_chunk.bam (BIN)
43. test/data/long_header.bam (BIN)
44. test/data/mg1655_chunk.bam (BIN)
45. test/data/no_block_size.bam (BIN)
46. test/data/tags.bam (BIN)
47. test/data/tags.bam.bai (BIN)
48. test/data/wrong_bc_subfield_length.bam (BIN)
49. test/data/wrong_extra_gzip_length.bam (BIN)
50. test/unittests.d (+0, -484)

.gitignore (+1, -0)

@@ -5,4 +5,5 @@ __dummy.html
*.obj
dub.selections.json
bin/
+old_biod/
build/

bio/std/file/fasta.d (+1, -1)

@@ -21,7 +21,7 @@
DEALINGS IN THE SOFTWARE.

*/
-module bio.core.fasta;
+module bio.std.file.fasta;

import std.file;
import std.exception;

bio/std/file/fastq.d (+1, -1)

@@ -26,7 +26,7 @@
Credit should go to Rikki Cattermole.
*/

-module bio.core.fastq;
+module bio.std.file.fastq;

struct FastqRecord {
const(char)[] id;

bio2/README.md (+0, -3)

@@ -1,3 +0,0 @@
bio2 modules are the next generation BAM reading tools. Some of the
functionality aims to replace that of bio modules in this same
repository.

bio2/bam/header.d (+0, -101)

@@ -1,101 +0,0 @@
/*
New style BAM reader. This file is part of Sambamba.
Copyright (C) 2017,2018 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA

*/

// This is a complete rewrite of Artem Tarasov's original reader.

module bio2.bam.header;

/*
import std.conv;
import core.stdc.stdio: fopen, fread, fclose;
import std.typecons;
import std.bitmanip;

import bio.bam.cigar;
*/

import std.exception;
import std.file;
import std.stdio;
import std.string;

import bio.bam.constants;

import bio2.bgzf;
import bio2.bgzf_writer;
import bio2.constants;

struct RefSequence {
size_d length;
string name;
}

struct BamHeader {
string id;
string text;
RefSequence[] refs;

@disable this(this); // disable copy semantics;
}

void fetch_bam_header(ref BamHeader header, ref BgzfStream stream) {
// stderr.writeln("Fetching BAM header");
ubyte[4] ubyte4;
stream.read(ubyte4);
enforce(ubyte4 == BAM_MAGIC,"Invalid file format: expected BAM magic number");
immutable text_size = stream.read!int();
// stderr.writeln("Text size ",text_size.sizeof," ",text_size);
immutable text = stream.read!string(text_size);
header = BamHeader(BAM_MAGIC,text);
immutable n_refs = stream.read!int();
// stderr.writeln("Fetching ",n_refs," references");
foreach(int n_ref; 0..n_refs) {
immutable l_name = stream.read!int();
// stderr.writeln("!!",l_name);
auto ref_name = stream.read!string(l_name);
immutable l_ref = stream.read!int(); // length of reference sequence (bps)
// stderr.writeln(l_name," ",ref_name," ",l_ref);
header.refs ~= RefSequence(l_ref,ref_name[0..l_name-1]); // drop zero terminator
}
}

void write_bam_header(ref BgzfWriter bw, ref BamHeader header) {
// stderr.writeln("Writing BAM header");
ubyte[4] magic = cast(ubyte[])BAM_MAGIC;
bw.write(magic);
// stderr.writeln("Text size ",int.sizeof," ",header.text.length);
bw.write!int(cast(int)header.text.length);
bw.write(header.text);
auto n_refs = cast(int)header.refs.length;
bw.write!int(cast(int)header.refs.length);
// stderr.writeln("Writing ",n_refs," references");
foreach(int n_ref; 0..n_refs) {
immutable refseq = header.refs[n_ref];
bw.write!int(cast(int)(refseq.name.length+1)); // incl. zero terminator
// stderr.writeln("!!",refseq.name.length+1);
bw.write(refseq.name);
bw.write!ubyte(cast(ubyte)'\0');
bw.write!int(cast(int)refseq.length);
// stderr.writeln(refseq.name.length+1," ",refseq.name," ",refseq.length);
}
// stderr.writeln("!!");
bw.flush_block();
}
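
For context, a minimal sketch of how the removed header API fits together with the other bio2 modules deleted in this commit. This is not part of the commit itself; the input and output file names are placeholders, and the calls are taken from the code shown above:

// Round-trip a BAM header with the removed bio2 API (illustrative sketch only).
import std.stdio;
import bio2.bgzf;          // BgzfStream
import bio2.bgzf_writer;   // BgzfWriter
import bio2.bam.header;    // BamHeader, fetch_bam_header, write_bam_header

void main() {
    auto stream = BgzfStream("test/data/ex1_header.bam");   // placeholder input
    BamHeader header;
    fetch_bam_header(header, stream);        // reads BAM magic, header text and reference list
    foreach (r; header.refs)
        writeln(r.name, "\t", r.length);     // one line per reference sequence

    auto bw = BgzfWriter("header_copy.bam"); // placeholder output
    write_bam_header(bw, header);            // writes magic, text and refs, then flushes the block
}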

bio2/bam/reader.d (+0, -548)

@@ -1,548 +0,0 @@
/*
New style BAM reader. This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA

*/

// This is a complete rewrite of Artem Tarasov's original reader.

module bio2.bam.reader;

import std.conv;
import core.stdc.stdio: fopen, fread, fclose;
import std.exception;
import std.file;
import std.format : format;
import std.stdio;
import std.string;
import std.typecons;
import std.bitmanip;

import bio.bam.cigar;
import bio.bam.constants;
import bio.core.utils.exception;

import bio2.bgzf;
import bio2.constants;

import bio2.bam.header;

template ReadFlags(alias flag) {
@property bool is_paired() nothrow { return cast(bool)(flag & 0x1); }
/// Each segment properly aligned according to the aligner
@property bool is_proper_pair() nothrow { return cast(bool)(flag & 0x2); }
@property bool is_unmapped_raw() nothrow { return cast(bool)(flag & 0x4); }
@property bool is_mapped_raw() nothrow { return cast(bool)(!(flag & 0x4)); }
@property bool mate_is_unmapped() nothrow { return cast(bool)(flag & 0x8); }
@property bool is_reverse_strand() nothrow { return cast(bool)(flag & 0x10); }
@property bool mate_is_reverse_strand() nothrow { return cast(bool)(flag & 0x20); }
@property bool is_first_of_pair() nothrow { return cast(bool)(flag & 0x40); }
@property bool is_second_of_pair() nothrow { return cast(bool)(flag & 0x80); }
@property bool is_secondary_alignment() nothrow { return cast(bool)(flag & 0x100); }
@property bool is_qc_fail() {
assert(is_mapped_raw,to!string(this));
return cast(bool)(flag & 0x200); }
alias is_qc_fail failed_quality_control;
/// PCR or optical duplicate
@property bool is_duplicate() nothrow { return cast(bool)(flag & 0x400); }
/// Supplementary alignment
@property bool is_supplementary() nothrow { return cast(bool)(flag & 0x800); }
@property string show_flags() {
string res = format("b%b-%d",flag,flag);
if (is_paired) res ~= " pair";
if (is_proper_pair) res ~= " proper";
if (is_mapped_raw) res ~= " mapped";
if (is_unmapped_raw) res ~= " unmapped";
if (mate_is_unmapped) res ~= " mate_unmapped";
if (is_reverse_strand) res ~= " rev_strand";
if (mate_is_reverse_strand) res ~= " mate_rev_strand";
if (is_first_of_pair) res ~= " first_of_pair";
if (is_second_of_pair) res ~= " second_of_pair";
if (is_secondary_alignment) res ~= " secondary_aln";
if (is_mapped_raw && is_qc_fail) res ~= " qc_fail";
if (is_duplicate) res ~= " duplicate";
if (is_supplementary) res ~= " suppl";
return res;
}
}

template CheckMapped(alias refid) {
@property nothrow bool is_unmapped() {
return is_unmapped_raw;
}
@property bool is_mapped() {
debug {
if (is_mapped_raw) {
assert(refid != -1, "ref_id can not be -1 for mapped read"); // BAM spec
}
}
return !is_unmapped_raw;
}
}

enum Offset {
bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24
};

/**
Raw Read buffer containing unparsed data. It should be considered
read-only.

Current implementation is a cluct (class-struct hybrid). The _data
pointer is shared when ReadBlob is assigned to another variable
(i.e., there is a remote dependency). The advantage is that for
each Read data gets allocated on the heap only once.

All offsets are indexed on init (except for tags). When using
fields beyond refid,pos use ProcessReadBlob instead because it
caches values.
*/

struct ReadBlob {
RefId refid; // -1 is invalid (BAM Spec)
GenomePos pos; // 0 coordinate based (BAM spec)
private ubyte[] _data;
uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max;

mixin ReadFlags!(_flag);
mixin CheckMapped!(refid);

/*
this(RefId ref_id, GenomePos read_pos, ubyte[] buf) {
refid = ref_id;
pos = read_pos;
_data = buf;
}
*/

@property void cleanup() {
destroy(_data);
_data = null;
}

// Turn ReadBlob into class-struct hybrid or a cluct ;)
// @disable this(this); // disable copy semantics;

@property @trusted nothrow private const T fetch(T)(uint raw_offset) {
ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof];
return cast(const(T))buf.read!(T,Endian.littleEndian)();
}

@property @trusted nothrow private const
uint _bin_mq_nl() { return fetch!uint(Offset.bin_mq_nl); }
@property @trusted nothrow private const
uint _flag_nc() { return fetch!uint(Offset.flag_nc); }
@property @trusted nothrow private const
int sequence_length() { return fetch!int(Offset.l_seq); }
@property @trusted nothrow private const
int _next_refID() { return fetch!int(Offset.next_refID); }
@property @trusted nothrow private const
int _next_pos() { return fetch!int(Offset.next_pos); }
@property @trusted nothrow private const
int _tlen() { return fetch!int(Offset.tlen); } // avoid using TLEN
@property @trusted nothrow private const
ushort _bin() { return _bin_mq_nl >> 16; }
@property @trusted nothrow private const
ubyte _mapq() { return (_bin_mq_nl >> 8) & 0xFF; }
@property @trusted nothrow private const
ubyte _l_read_name() { return _bin_mq_nl & 0xFF; }
@property @trusted nothrow private const
ushort _flag() { return fetch!ushort(Offset.flag); }
@property @trusted nothrow private const
ushort _n_cigar_op() { return _flag_nc & 0xFFFF; }
@property @trusted nothrow private const
uint _read_name_offset() { return Offset.read_name; }
@property @trusted nothrow private
uint _cigar_offset() {
if (offset_cigar == int.max)
offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof);
return offset_cigar;
}
@property @trusted nothrow private
uint _seq_offset() {
if (offset_seq == int.max)
offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof);
return offset_seq;
}
@property @trusted nothrow private
uint _qual_offset() {
if (offset_qual == int.max)
offset_qual = _seq_offset + (sequence_length + 1)/2;
return offset_qual;
}
@property @trusted nothrow private
uint _tags_offset() { return _qual_offset + sequence_length; }
@property @trusted nothrow private
ubyte[] read_name() { return _data[_read_name_offset.._cigar_offset]; }
@property @trusted nothrow private
ubyte[] raw_cigar() { return _data[_cigar_offset.._seq_offset]; }
@property @trusted nothrow private
ubyte[] raw_sequence() { return _data[_seq_offset.._qual_offset]; }

alias sequence_length _l_seq;
alias _mapq mapping_quality; // MAPQ

string toString() {
return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">";
}

}

/**
ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use
this when you need to access field/elements multiple times. Note
that ProcessReadBlob becomes invalid when ReadBlob goes out of scope.
*/
struct ProcessReadBlob {
private Nullable!ReadBlob _read2;
Nullable!int sequence_length2;
private Nullable!string sequence2, read_name2;
private Nullable!CigarOperations cigar2;
private Nullable!GenomePos consumed_reference_bases2;

mixin ReadFlags!(_flag);
mixin CheckMapped!(ref_id);

this(Nullable!ReadBlob _r) {
_read2 = _r;
}

@property void cleanup() {
_read2.cleanup;
}

@property nothrow bool isNull() {
return _read2.isNull;
}

@property RefId ref_id() {
enforce(_read2.is_mapped,"Trying to get ref_id on an unmapped read " ~ to!string(_read2));
return _read2.refid;
}

@property RefId raw_ref_id() {
return _read2.refid;
}

@property nothrow uint _flag_nc() {
return _read2._flag_nc;
}

@property nothrow ushort _flag() {
return _read2._flag;
}

alias ref_id refid;

/// Get the start position on the reference sequence (better use
/// start_loc), i.e., the first base that gets consumed in the
/// CIGAR.
@property GenomePos start_pos() {
assert(_read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec
asserte(_read2.pos < GenomePos.max);
return cast(GenomePos)_read2.pos;
}

@property GenomePos raw_start_pos() {
return cast(GenomePos)_read2.pos;
}

/// Get the end position on the reference sequence (better use end_loc)
@property GenomePos end_pos() {
assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence");
assert(!consumed_reference_bases.isNull);
return start_pos + consumed_reference_bases;
}

@property GenomePos raw_end_pos() {
return raw_start_pos + consumed_reference_bases;
}

@property GenomeLocation start_loc() {
return GenomeLocation(ref_id,start_pos);
}

@property GenomeLocation end_loc() {
return GenomeLocation(ref_id,end_pos);
}

@property @trusted MappingQuality mapping_quality() { // MAPQ
assert(_read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec
return MappingQuality(_read2.mapping_quality);
}

@property @trusted int tlen() { // do not use
return _read2._tlen;
}

@property @trusted GenomePos sequence_length() {
if (sequence_length2.isNull)
sequence_length2 = _read2.sequence_length;
return sequence_length2;
}

/// Count and caches consumed reference bases. Uses raw_cigar to
/// avoid a heap allocation.
@property @trusted Nullable!GenomePos consumed_reference_bases() {
if (consumed_reference_bases2.isNull) {
assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR when RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) _read2.raw_cigar();
if (raw.length==1 && raw[0] == '*')
return consumed_reference_bases2; // null
else {
GenomePos bases = 0;
for (size_t i = 0; i < raw.length; i++) {
auto cigarop = CigarOperation(raw[i]);
if (cigarop.is_reference_consuming)
bases += cigarop.length;
}
consumed_reference_bases2 = bases;
}
}
return consumed_reference_bases2;
}

/// Count query consumed bases. Uses raw_cigar to avoid a heap
/// allocation.
@property @trusted GenomePos consumed_query_bases() {
assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR when RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) _read2.raw_cigar();
if (raw.length==1 && raw[0] == '*')
return consumed_reference_bases2; // null
else {
GenomePos bases = 0;
for (size_t i = 0; i < raw.length; i++) {
auto cigarop = CigarOperation(raw[i]);
if (cigarop.is_query_consuming)
bases += cigarop.length;
}
return bases;
}
}

/// Return read name as a string. If unavailable returns
/// null. Caches name.
@property Nullable!string read_name() {
if (read_name2.isNull) {
assert(_read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec
auto raw = _read2.read_name;
if (raw.length == 0 || (raw.length ==1 && raw[0] == '*'))
return read_name2; // null
assert(raw.length < 255); // BAM spec
if (raw[raw.length-1] == 0) // strip trailing C zero
raw.length -= 1;
read_name2 = Nullable!string(cast(string)raw);
}
return read_name2;
}

/// Returns Cigar as an array of operations. Returns null if no
/// operations. Caches Cigar when there are operations.
@property Nullable!CigarOperations cigar() {
if (cigar2.isNull) {
assert(_read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec
assert(!read_name.isNull,"Trying to get CIGAR when RNAME is '*'"); // BAM spec
auto raw = cast(uint[]) _read2.raw_cigar();
if (raw.length==0 || (raw.length==1 && raw[0] == '*'))
return cigar2; // null
else {
auto s = new CigarOperation[raw.length]; // Heap alloc
s.length = 0;
for (size_t i = 0; i < raw.length; i++) {
s ~= CigarOperation(raw[i]);
}
cigar2 = s;
}
}
return cigar2;
}

/// Return human readable sequence fragment - null if
/// undefined. Caches sequence.
@property Nullable!string sequence() {
if (sequence2.isNull) { // is it cached in sequence2?
auto raw = _read2.raw_sequence();
if (raw[0] == '*') {
assert(raw.length == 1);
return sequence2; // null
}
auto raw_length = (sequence_length + 1) / 2;
char[16] convert = "=ACMGRSVTWYHKDBN";
string s;
s.reserve(sequence_length); // Heap alloc
for (size_t i = 0; i < sequence_length; i++) {
auto is_odd = i % 2;
auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4);
s ~= convert[nuc];
}
sequence2 = s;
}
return sequence2;
}

@property ubyte[] toBlob() {
return _read2._data;
}

@property string posString() {
return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped");
}

string toString() {
// return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(_read2.refid) ~ ":" ~ to!string(_read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">";
return _read2.get.toString();
}

}

/**
BamReader2 is used for foreach loops
*/

struct BamReadBlobs {
BgzfStream stream;
BamHeader header;

this(string fn) {
stream = BgzfStream(fn);
}

int opApply(scope int delegate(ref ReadBlob) dg) {
fetch_bam_header(header, stream);
// parse the reads
while (!stream.eof()) {
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();

ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME
auto read = ReadBlob(refid,pos,stream.read(data));
dg(read);
}
return 0;
}
}

/**
Read streamer - use on single thread only
*/

// import core.memory : pureMalloc;

struct BamReadBlobStream {
BgzfStream stream;
BamHeader header;
Nullable!ReadBlob readbuf; // points to current read
ubyte[] data; // in sync with readbuf

this(string fn) {
stream = BgzfStream(fn);
fetch_bam_header(header, stream);
if (!empty)
popFront(); // preload front
}

bool empty() @property {
return stream.eof();
}

// Returns first read available. If past eof returns null.
Nullable!ReadBlob front() {
return readbuf;
}

void popFront() {
asserte(!empty); // should have been checked for
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();

// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
data = new ubyte[block_size-2*int.sizeof];
readbuf = ReadBlob(refid,pos,stream.read(data));
assert(readbuf._data.ptr == data.ptr);
}

}

/**
Reader - use on single thread only

This one provides peek support. Peek looks one read ahead in the read stream.
*/

struct BamBlobReader {
BgzfStream stream;
BamHeader header;
Nullable!ReadBlob peekbuf; // points to current read
// ubyte[] data; // in sync with peekbuf

this(string fn) {
stream = BgzfStream(fn);
fetch_bam_header(header, stream);
}

bool empty() @property {
return peekbuf.isNull && stream.eof();
}

Nullable!ReadBlob peek() {
if (peekbuf.isNull && !empty)
fetch();
return peekbuf;
}

/// Fetches the next read. If the peekbuf is not empty return that
/// first and reset peekbuf.
Nullable!ReadBlob fetch() {
if (!peekbuf.isNull) {
auto readbuf = peekbuf;
peekbuf = Nullable!ReadBlob();
return readbuf;
}
asserte(!empty); // should have been checked for
immutable block_size = stream.read!int();
immutable refid = stream.read!int();
immutable pos = stream.read!int();

// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
auto data = new ubyte[block_size-2*int.sizeof];
peekbuf = ReadBlob(refid,pos,stream.read(data));
return peekbuf;
}

/// Returns the next matching read. Otherwise null
///
/// Example:
///
/// auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped));
/*
Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) {
while(!empty()) {
auto readbuf = read();
if (is_match(R(readbuf)))
return readbuf;
else
return Nullable!ReadBlob();
}
return Nullable!ReadBlob();
}
*/
}
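
A short sketch of how the removed blob readers were iterated, based on the structs above. Illustrative only and not part of the commit; the BAM path is a placeholder:

// Iterate all reads in a BAM file with the removed bio2 reader (illustrative sketch only).
import std.stdio;
import std.typecons : Nullable;
import bio2.bam.reader;   // BamReadBlobs, ReadBlob, ProcessReadBlob

void main() {
    auto reader = BamReadBlobs("test/data/ex1_header.bam");    // placeholder input
    foreach (ref ReadBlob blob; reader) {
        auto read = ProcessReadBlob(Nullable!ReadBlob(blob));  // caches parsed fields
        if (read.is_mapped && !read.read_name.isNull)
            writeln(read.read_name.get, "\t", read.posString, "\t", read.sequence_length);
    }
}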

bio2/bam/writer.d (+0, -86)

@@ -1,86 +0,0 @@
/*
New style BAM writer. This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA

*/

module bio2.bam.writer;

import std.conv;
import core.stdc.stdio: fopen, fread, fclose;
import std.exception;
import std.file;
import std.stdio;
import std.string;
import std.typecons;
import std.bitmanip;

import bio.bam.cigar;
import bio.bam.constants;

import bio2.bgzf;
import bio2.bgzf_writer;
import bio2.constants;

import bio2.bam.header;
import bio2.bam.reader : ProcessReadBlob, Offset;

struct ModifyProcessReadBlob { // make this generic later
ProcessReadBlob _read2;

@property ubyte[] toBlob() {
return _read2.toBlob();
}

@property void set_qc_fail() {
auto data = _read2.toBlob;
// writeln(_read2._flag);
// data[Offset.flag_nc] = data[Offset.flag_nc] & 0x200;
// writeln(data[Offset.flag_nc]);
// buf.write!(T,Endian.littleEndian)(value,0);
// ushort _flag() { return fetch!ushort(Offset.flag); }

ushort flag = _read2._flag | 0x200;
// writeln("flag=",flag);
data[Offset.flag..Offset.flag+4].write!(ushort,Endian.littleEndian)(flag,0);
}
}

struct BamWriter {
BgzfWriter bgzf_writer;

this(string fn, ref BamHeader header, int compression_level = -1) {
bgzf_writer = BgzfWriter(fn,compression_level);
write_bam_header(bgzf_writer,header);
}

void push(ModifyProcessReadBlob read) {
auto mod = read;
auto blob = mod.toBlob;
// another hack for now:
bgzf_writer.write!int(cast(int)(blob.length+2*int.sizeof));
bgzf_writer.write!int(cast(int)mod._read2.raw_ref_id);
bgzf_writer.write!int(cast(int)mod._read2.raw_start_pos);
bgzf_writer.write(blob);
}

void push(ProcessReadBlob read) {
push(ModifyProcessReadBlob(read));
}

}
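
And a sketch of the read-modify-write loop this writer was paired with the reader above for. Illustrative only; the file names and the MAPQ threshold are arbitrary and not part of the commit:

// Copy a BAM file, flagging low-MAPQ reads as QC fail (illustrative sketch only).
import bio2.bam.reader;   // BamBlobReader, ProcessReadBlob
import bio2.bam.writer;   // BamWriter, ModifyProcessReadBlob

void main() {
    auto input  = BamBlobReader("test/data/ex1_header.bam");   // header is fetched here
    auto output = BamWriter("flagged.bam", input.header);      // placeholder output name

    while (!input.empty) {
        auto read = ProcessReadBlob(input.fetch());
        auto mod  = ModifyProcessReadBlob(read);
        if (read.is_mapped && read.mapping_quality < 10)       // arbitrary threshold
            mod.set_qc_fail;                                   // sets the 0x200 flag in the raw blob
        output.push(mod);
    }
}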

bio2/bgzf.d (+0, -451)

@@ -1,451 +0,0 @@
/*
This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA

*/

module bio2.bgzf;

/**

This is a new version of sambamba bgzf (under development). Bgzf is
a blocked version of the ubiquitous gzip format. By making it blocked
it allows for seeking in gz files. Note that without seeking it
can take standard gz files too.

The new version is a prototype for new sambamba architecture using
canonical D language features, including immutable and improved
laziness and a more functional programming style. It should provide
improved performance and minimize RAM use, as well as better
composability.

Authors: Pjotr Prins

*/

import core.stdc.string : memcpy;

import std.bitmanip;
import std.conv;
import std.exception;
import std.file;
import std.stdio;
import std.typecons;
import std.zlib : calc_crc32 = crc32, ZlibException;

import bio.bam.constants;
import bio.core.bgzf.block;
import bio.core.bgzf.constants;
import bio.core.utils.zlib : inflateInit2, inflate, inflateEnd, Z_OK, Z_FINISH, Z_STREAM_END;

import bio2.constants;

class BgzfException : Exception {
this(string msg) { super(msg); }
}

alias Nullable!ulong FilePos;
alias immutable(uint) CRC32;

alias BGZF_MAX_BLOCK_SIZE BLOCK_SIZE;

alias ubyte[BLOCK_SIZE] BlockBuffer;

@property ubyte read_ubyte(File f) {
ubyte[1] ubyte1; // read buffer
immutable ubyte[1] buf = f.rawRead(ubyte1);
return buf[0];
}

@property ushort read_ushort(File f) {
ubyte[2] ubyte2; // read buffer
immutable ubyte[2] buf = f.rawRead(ubyte2);
return littleEndianToNative!ushort(buf);
}
@property uint read_uint(File f) {
ubyte[4] ubyte4; // read buffer
immutable ubyte[4] buf = f.rawRead(ubyte4);
return littleEndianToNative!uint(buf);
}

/**
Uncompress a zlib buffer (without header)
*/
ubyte[] deflate(ubyte[] uncompressed_buf, const ubyte[] compressed_buf, size_t uncompressed_size, CRC32 crc32) {
assert(uncompressed_buf.length == BLOCK_SIZE);
bio.core.utils.zlib.z_stream zs;
zs.next_in = cast(typeof(zs.next_in))compressed_buf;
zs.avail_in = to!uint(compressed_buf.length);

auto err = inflateInit2(&zs, /* winbits = */-15);
if (err != Z_OK) throw new ZlibException(err);

zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr;
zs.avail_out = cast(int)uncompressed_buf.length;

scope(exit) { inflateEnd(&zs); }
err = inflate(&zs, Z_FINISH);
if (err != Z_STREAM_END) throw new ZlibException(err);

assert(zs.total_out == uncompressed_size);
uncompressed_buf.length = uncompressed_size;
assert(crc32 == calc_crc32(0, uncompressed_buf[]));

return uncompressed_buf;
}

/**
BgzfReader is designed to run on a single thread. All it does is
fetch block headers and data, so the thread should easily keep up
with IO. All data processing is happening lazily in other threads.
*/
struct BgzfReader {
File f;
FilePos report_fpos; // for error handler - assumes one thread!

this(string fn) {
enforce(fn.isFile);
f = File(fn,"r");
}

@disable this(this); // BgzfReader does not have copy semantics;

void throwBgzfException(string msg, string file = __FILE__, size_t line = __LINE__) {
throw new BgzfException("Error reading BGZF block starting in "~f.name ~" @ " ~
to!string(report_fpos) ~ " (" ~ file ~ ":" ~ to!string(line) ~ "): " ~ msg);
}

void enforce1(bool check, lazy string msg, string file = __FILE__, int line = __LINE__) {
if (!check)
throwBgzfException(msg,file,line);
}

/**
Reads the block header and returns the contained compressed data
size with the file pointer positioned at the associated
compressed data.
*/
size_t read_block_header() {
ubyte[4] ubyte4;
auto magic = f.rawRead(ubyte4);
enforce1(magic.length == 4, "Premature end of file");
enforce1(magic[0..4] == BGZF_MAGIC,"Invalid file format: expected bgzf magic number");
ubyte[uint.sizeof + 2 * ubyte.sizeof] skip;
f.rawRead(skip); // skip gzip info
ushort gzip_extra_length = f.read_ushort();
immutable fpos1 = f.tell;
size_t bsize = 0;
while (f.tell < fpos1 + gzip_extra_length) {
immutable subfield_id1 = f.read_ubyte();
immutable subfield_id2 = f.read_ubyte();
immutable subfield_len = f.read_ushort();
if (subfield_id1 == BAM_SI1 && subfield_id2 == BAM_SI2) {
// BC identifier
enforce(gzip_extra_length == 6);
// FIXME: always picks first BC block
bsize = 1+f.read_ushort(); // BLOCK size
enforce1(subfield_len == 2, "BC subfield len should be 2");
break;
}
else {
f.seek(subfield_len,SEEK_CUR);
}
enforce1(bsize!=0,"block size not found");
f.seek(fpos1+gzip_extra_length); // skip any extra subfields - note we don't check for second BC
}
immutable compressed_size = bsize - 1 - gzip_extra_length - 19;
enforce1(compressed_size <= BLOCK_SIZE, "compressed size larger than allowed");

// stderr.writeln("[compressed] size ", compressed_size, " bytes starting block @ ", report_fpos);
return compressed_size;
}

/**
Fetch the compressed data part of the block and return it with
the uncompressed size and CRC32. The file pointer is assumed to
be at the start of the compressed data and will be at the end of
that section after.
*/
Tuple!(ubyte[],immutable(uint),CRC32) read_compressed_data(ubyte[] buffer) {
auto compressed_buf = f.rawRead(buffer);

immutable CRC32 crc32 = f.read_uint();
immutable uncompressed_size = f.read_uint();
// stderr.writeln("[uncompressed] size ",uncompressed_size);
return tuple(compressed_buf,uncompressed_size,crc32);
}

/**
* Returns a tuple of the new file position, the compressed buffer and
* the CRC32 of the uncompressed data. The file position is null when done.
*/
Tuple!(FilePos,ubyte[],size_t,CRC32) read_compressed_block(FilePos fpos, ubyte[] buffer) {
immutable start_offset = fpos;
try {
if (fpos.isNull) throwBgzfException("Trying to read past eof");
report_fpos = fpos;
f.seek(fpos);
immutable compressed_size = read_block_header();
auto ret = read_compressed_data(buffer[0..compressed_size]);
auto compressed_buf = ret[0];
immutable uncompressed_size = ret[1];
immutable crc32 = ret[2];

if (uncompressed_size == 0) {
// check for eof marker, rereading block header
auto lastpos = f.tell();
f.seek(start_offset);
ubyte[BGZF_EOF.length] buf;
f.rawRead(buf);
f.seek(lastpos);
if (buf == BGZF_EOF)
return tuple(FilePos(),compressed_buf,cast(ulong)0,crc32); // sets fpos to null
}
return tuple(FilePos(f.tell()),compressed_buf,cast(size_t)uncompressed_size,crc32);
} catch (Exception e) { throwBgzfException(e.msg,e.file,e.line); }
assert(0); // never reached
}
}

/**
Simple block iterator
*/
struct BgzfBlocks {
BgzfReader bgzf;

this(string fn) {
bgzf = BgzfReader(fn);
}

@disable this(this); // disable copy semantics;

int opApply(scope int delegate(ubyte[]) dg) {
FilePos fpos = 0;

try {
while (!fpos.isNull) {
BlockBuffer stack_buffer;
auto res = bgzf.read_compressed_block(fpos,stack_buffer);
fpos = res[0]; // point fpos to next block
if (fpos.isNull) break;

auto compressed_buf = res[1]; // same as stack_buffer
auto uncompressed_size = res[2];
auto crc32 = res[3];
BlockBuffer uncompressed_buf;
// call delegated function with new block
dg(deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32));
}
} catch (Exception e) { bgzf.throwBgzfException(e.msg,e.file,e.line); }
return 0;
}
}


Tuple!(size_t,FilePos) read_blockx(ref BgzfReader bgzf, FilePos fpos, ref ubyte[] uncompressed_buf) {
BlockBuffer compressed_buf;
auto res = bgzf.read_compressed_block(fpos,compressed_buf);
fpos = res[0]; // point fpos to next block
if (fpos.isNull) return tuple(cast(size_t)0,fpos);
auto data = res[1];

assert(data.ptr == compressed_buf.ptr);
size_t uncompressed_size = res[2];
auto crc32 = res[3];
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32);
return tuple(uncompressed_size,fpos);
}

import std.parallelism;

int kick_off_reading_block_ahead(ubyte[] uncompressed_buf, ubyte[] compressed_buf, size_t uncompressed_size, CRC32 crc32) {
// writeln("HEY " ~ to!string(uncompressed_size));
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32);
return -1;
}

/**
*/
struct BlockReadAhead {
bool task_running = false, we_have_a_task = false;
Task!(kick_off_reading_block_ahead, ubyte[], ubyte[], size_t, CRC32)* t;
FilePos fpos2;
size_t uncompressed_size2 = 0;
BlockBuffer compressed_buf2;
BlockBuffer uncompressed_buf2;

private void read_next_block() {
}

private void add_deflate_task() {
}

private void copy_deflated_buffer() {
}

void setup_block_reader() {
read_next_block();
add_deflate_task();
throw new Exception("NYI");
}

Tuple!(size_t,FilePos) read_block(ref BgzfReader bgzf, FilePos fpos, ref ubyte[] uncompressed_buf) {
assert(we_have_a_task);
copy_deflated_buffer();
read_next_block();
add_deflate_task();
// return

if (task_running) {
int res = t.yieldForce;
// writeln(res);
task_running = false;
memcpy(uncompressed_buf.ptr,compressed_buf2.ptr,uncompressed_size2);
return tuple(uncompressed_size2, fpos2);
}
else {
BlockBuffer compressed_buf;
auto res = bgzf.read_compressed_block(fpos,compressed_buf);
fpos = res[0]; // point fpos to next block
if (fpos.isNull) return tuple(cast(size_t)0,fpos);
auto data = res[1];
assert(data.ptr == compressed_buf.ptr);
size_t uncompressed_size = res[2];
auto crc32 = res[3];

deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32);

// now set up a new buffer
auto res2 = bgzf.read_compressed_block(fpos,compressed_buf2);
fpos2 = res[0]; // point fpos to next block
if (!fpos2.isNull) {
auto data2 = res2[1];
uncompressed_size2 = res2[2];
t = task!kick_off_reading_block_ahead(cast(ubyte[])uncompressed_buf2,cast(ubyte[])compressed_buf2,uncompressed_size2,res2[3]);
t.executeInNewThread();
task_running = true;
}
return tuple(uncompressed_size,fpos);
}
}
}

/**
*/
struct BlockReadUnbuffered {

void setup_block_reader() {
}

Tuple!(ubyte[], size_t, FilePos) read_block(ref BgzfReader bgzf, in FilePos fpos, ubyte[] uncompressed_buf) {
BlockBuffer compressed_buf;
auto res = bgzf.read_compressed_block(fpos,compressed_buf);
auto fpos2 = res[0]; // point fpos to next block
if (fpos.isNull) return tuple(uncompressed_buf,cast(size_t)0,fpos2);
auto data = res[1];
assert(data.ptr == compressed_buf.ptr);
size_t uncompressed_size = res[2];
auto crc32 = res[3];

auto buf = deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32);
assert(buf.ptr == uncompressed_buf.ptr);
return tuple(uncompressed_buf,uncompressed_size,fpos2);
}
}

/**
Streams bgzf data and fetches items by unit or buffer. These can go beyond
the size of individual blocks(!)
*/

struct BgzfStream {
BgzfReader bgzf;
FilePos fpos; // track file position
ubyte[] uncompressed_buf; // current data buffer
size_t uncompressed_size; // current data buffer size
Nullable!int block_pos; // position in block
BlockReadUnbuffered blockread;

this(string fn) {
bgzf = BgzfReader(fn);
uncompressed_buf = new ubyte[BLOCK_SIZE];
fpos = 0;
}

@disable this(this); // disable copy semantics;

@property bool eof() {
return fpos.isNull;
}

/**
Fetch data into buffer. The buffer may be larger than a single block
and can span multiple blocks
*/
ubyte[] fetch(ubyte[] buffer) {
if (block_pos.isNull) {
blockread.setup_block_reader();
auto res = blockread.read_block(bgzf,fpos,uncompressed_buf); // read first block
assert(res[0].ptr == uncompressed_buf.ptr);
uncompressed_size = res[1];
fpos = res[2];
block_pos = 0;
}

immutable buffer_length = buffer.length;
size_t buffer_pos = 0;
size_t remaining = buffer_length;

while (remaining > 0) {
if (block_pos + remaining < uncompressed_size) {
// full copy
assert(buffer_pos + remaining == buffer_length);
memcpy(buffer[buffer_pos..buffer_pos+remaining].ptr,uncompressed_buf[block_pos..block_pos+remaining].ptr,remaining);
block_pos += remaining;
remaining = 0;
}
else {
// read tail of buffer
immutable tail = uncompressed_size - block_pos;
memcpy(buffer[buffer_pos..buffer_pos+tail].ptr,uncompressed_buf[block_pos..uncompressed_size].ptr,tail);
buffer_pos += tail;
remaining -= tail;
auto res = blockread.read_block(bgzf,fpos,uncompressed_buf);
assert(res[0].ptr == uncompressed_buf.ptr);
uncompressed_size = res[1];
fpos = res[2];
block_pos = 0;
}
}
return buffer;
}

int read(T)() { // for integers
ubyte[T.sizeof] buf;
auto b = fetch(buf);
return b.read!(T,Endian.littleEndian)();
}

string read(T)(size_t len) {
ubyte[] buf = new ubyte[len]; // heap allocation
fetch(buf);
return cast(T)buf;
}

T[] read(T)(T[] buffer) { return cast(T[])fetch(cast(ubyte[])buffer); };
}
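
A sketch of the simplest consumer of this module, mirroring what bio2/unpack.d (also removed below) does with BgzfBlocks. Illustrative only; the input path is a placeholder:

// Sum the uncompressed payload of a BGZF file block by block (illustrative sketch only).
import std.stdio;
import bio2.bgzf;   // BgzfBlocks

void main() {
    size_t total;
    auto blocks = BgzfBlocks("test/data/ex1_header.bam");   // placeholder input
    foreach (ubyte[] block; blocks) {
        total += block.length;   // each block holds at most BGZF_MAX_BLOCK_SIZE bytes
    }
    writeln("uncompressed payload: ", total, " bytes");
}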

bio2/bgzf_writer.d (+0, -243)

@@ -1,243 +0,0 @@
/*
New style BGZF writer. This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA

*/

// Based on the original by Artem Tarasov.

module bio2.bgzf_writer;

// import core.stdc.stdlib : malloc, free;
import core.memory: pureMalloc, pureFree;
import core.stdc.stdio: fopen, fread, fclose;
import std.bitmanip;
import std.conv;
import std.exception;
import std.typecons;
import std.parallelism;
import std.array;
import std.algorithm : max;
import std.stdio;
import std.typecons;

import bio.core.bgzf.compress;
import bio.core.utils.roundbuf;

// import undead.stream;

import bio.bam.constants: BGZF_MAX_BLOCK_SIZE, BGZF_BLOCK_SIZE, BGZF_EOF;
import bio2.bgzf;
import bio2.constants;

alias void delegate(ubyte[], ubyte[]) BlockWriteHandler;

/// Convenience function for Taskpool handler
Tuple!(ubyte[], ubyte[], BlockWriteHandler) bgzfCompressFunc(ubyte[] input,
int level,
ubyte[] output_buffer,
BlockWriteHandler handler)
{
auto output = bgzfCompress(input, level, output_buffer);
return tuple(input, output, handler);
}

/// BGZF compression - this is a port of the original that used the
/// undead.stream library.
struct BgzfWriter {

private:
File f;
TaskPool task_pool;

ubyte[] buffer; // a slice into compression_buf (uncompressed data)
ubyte[] tmp; // a slice into compression_buf (compressed data)
size_t current_size;
int compression_level;

alias Task!(bgzfCompressFunc,
ubyte[], int, ubyte[], BlockWriteHandler) CompressionTask;
RoundBuf!(CompressionTask*) _compression_tasks;
ubyte[] compression_buf;

public:

/// Create new BGZF output stream with a multi-threaded writer
this(string fn, int _compression_level=-1) {
f = File(fn,"wb");
enforce1(-1 <= compression_level && compression_level <= 9,
"BGZF compression level must be a number in interval [-1, 9]");
size_t max_block_size = BGZF_MAX_BLOCK_SIZE;
size_t block_size = BGZF_BLOCK_SIZE;
task_pool = taskPool(),
compression_level = _compression_level;

// create a ring buffer that is large enough
size_t n_tasks = max(task_pool.size, 1) * 16;
_compression_tasks = RoundBuf!(CompressionTask*)(n_tasks);

// create extra block to which we can write while n_tasks are
// executed
auto comp_buf_size = (2 * n_tasks + 2) * max_block_size;
auto p = cast(ubyte*)pureMalloc(comp_buf_size);
compression_buf = p[0 .. comp_buf_size];
buffer = compression_buf[0 .. block_size];
tmp = compression_buf[max_block_size .. max_block_size * 2];
}

~this() {
close();
}

@disable this(this); // BgzfWriter does not have copy semantics;

void throwBgzfException(string msg, string file = __FILE__, size_t line = __LINE__) {
throw new BgzfException("Error writing BGZF block starting in "~f.name ~
" (" ~ file ~ ":" ~ to!string(line) ~ "): " ~ msg);
}

void enforce1(bool check, lazy string msg, string file = __FILE__, int line = __LINE__) {
if (!check)
throwBgzfException(msg,file,line);
}

void write(const void* buf, size_t size) {
// stderr.writeln("HEY1 writing bytes ",size);
if (size + current_size >= buffer.length) {
size_t room;
ubyte[] data = (cast(ubyte*)buf)[0 .. size];

while (data.length + current_size >= buffer.length) {
room = buffer.length - current_size;
buffer[$ - room .. $] = data[0 .. room];
data = data[room .. $];

current_size = buffer.length;

flush_block();
}

buffer[0 .. data.length] = data[];
current_size = data.length;
} else {
buffer[current_size .. current_size + size] = (cast(ubyte*)buf)[0 .. size];
current_size += size;
}
// return size;
}

void write(ubyte[] buf) {
write(buf.ptr, buf.length);
}

void write(string s) {
write(cast(ubyte[])s);
}

void write(T)(T value) { // int values
// ubyte[T.sizeof] buf;
ubyte[] buf = [0,0,0,0,0,0,0,0,0,0];
assert(T.sizeof < buf.length);
buf.write!(T,Endian.littleEndian)(value,0);
// writeln("HEY T.sizeof: ",T.sizeof," value ",value," ",buf[0..T.sizeof]);
write(buf[0..T.sizeof]);
}

/// Force flushing current block, even if it is not yet filled.
/// Should also be used when it's not desired to have records
/// crossing block borders.
void flush_block() {
if (current_size == 0)
return;

Tuple!(ubyte[], ubyte[], BlockWriteHandler) front_result;
if (_compression_tasks.full) {
front_result = _compression_tasks.front.yieldForce();
_compression_tasks.popFront();
}

auto compression_task = task!bgzfCompressFunc(buffer[0 .. current_size],
compression_level, tmp,
_before_write);
_compression_tasks.put(compression_task);
task_pool.put(compression_task);

size_t offset = buffer.ptr - compression_buf.ptr;
immutable N = tmp.length;
offset += 2 * N;
if (offset == compression_buf.length)
offset = 0;
buffer = compression_buf[offset .. offset + buffer.length];
tmp = compression_buf[offset + N .. offset + 2 * N];
current_size = 0;

if (front_result[0] !is null)
writeResult(front_result);

while (!_compression_tasks.empty) {
auto task = _compression_tasks.front;
if (!task.done())
break;
auto result = task.yieldForce();
writeResult(result);
_compression_tasks.popFront();
}
}

private void delegate(ubyte[], ubyte[]) _before_write;
void setWriteHandler(void delegate(ubyte[], ubyte[]) handler) {
_before_write = handler;
}

private void writeResult(Tuple!(ubyte[], ubyte[], BlockWriteHandler) block) {
auto uncompressed = block[0];
auto compressed = block[1];
auto handler = block[2];
if (handler) {// write handler enabled
handler(uncompressed, compressed);
}
// _stream.writeExact(compressed.ptr, compressed.length);
f.rawWrite(compressed);
}

/// Flush all remaining BGZF blocks and underlying stream.
void flush() {
flush_block();

while (!_compression_tasks.empty) {
auto task = _compression_tasks.front;
auto block = task.yieldForce();
writeResult(block);
_compression_tasks.popFront();
}

f.flush();
current_size = 0;
}

/// Flush all remaining BGZF blocks and close source stream.
/// Automatically adds empty block at the end, serving as indicator
/// of end of stream. This function is automatically called on
/// destruction.
void close() {
flush();
f.rawWrite(BGZF_EOF);
f.close();
pureFree(compression_buf.ptr);
}
}
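
A minimal sketch of driving the removed BgzfWriter directly. Illustrative only; the output name and compression level are arbitrary and not part of the commit:

// Write a small BGZF file with the removed multi-threaded writer (illustrative sketch only).
import bio2.bgzf_writer;   // BgzfWriter

void main() {
    auto bw = BgzfWriter("hello.gz", 6);   // placeholder name, compression level 6
    bw.write("hello, bgzf\n");             // buffered into the current block
    bw.write!int(42);                      // little-endian scalar write
    bw.flush_block();                      // force a block boundary here
}   // the destructor flushes remaining blocks and appends the BGZF EOF marker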

bio2/constants.d (+0, -11)

@@ -1,11 +0,0 @@
module bio2.constants;

alias ulong size_d;
alias int RefId; // -1 is invalid FIXME
alias uint GenomePos; // 32-bits, do check when reading!
alias ubyte MappingQuality; // 255 is invalid FIXME

struct GenomeLocation {
RefId ref_id;
GenomePos pos;
}

bio2/hashing.d (+0, -99)

@@ -1,99 +0,0 @@
/*
This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA

*/

module bio2.hashing;

import std.conv;
import std.stdio;

pragma(inline):
pure nothrow @nogc ushort get16bits(char *p)
{
ushort p0 = p[0];
ushort p1 = p[1];
// return p[1]*256+p[0];
return cast(ushort)(p1*256+p0);
}

/// Paul Hsieh's fast hash (LGPL license), see http://www.azillionmonkeys.com/qed/hash.html
pure nothrow @nogc uint SuperFastHash(string str, uint hashinc = 0) {
auto data = cast(char*)str.ptr;
auto len = cast(uint)str.length;

uint hash = (hashinc > 0 ? hashinc : 0);

if (len == 0) return 0;

int rem = len & 3;
len >>= 2;

/* Main loop */
for (;len > 0; len--) {
hash += get16bits(data);
uint tmp = (get16bits(data+2) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2*ushort.sizeof;
hash += hash >> 11;
}

/* Handle end cases */
switch (rem) {
case 3: hash += get16bits(data);
hash ^= hash << 16;
hash ^= (cast(ushort)data[ushort.sizeof]) << 18;
hash += hash >> 11;
break;
case 2: hash += get16bits(data);
hash ^= hash << 11;
hash += hash >> 17;
break;
case 1: hash += cast(ushort)*data;
hash ^= hash << 10;
hash += hash >> 1;
break;
default:
break;
}

/* Force "avalanching" of final 127 bits */
hash ^= hash << 3;
hash += hash >> 5;
hash ^= hash << 4;
hash += hash >> 17;
hash ^= hash << 25;
hash += hash >> 6;

return hash;
}


unittest {
// Make sure it behaves the same as the original
auto test = get16bits(cast(char *)("xy".ptr));
assert(test == 31096, to!string(test));
auto test2 = get16bits(cast(char *)("xyz".ptr));
assert(test2 == 31096, to!string(test2));
assert(get16bits(cast(char *)"xyz".ptr) == 31096);
assert(SuperFastHash("*")==1029965590,to!string(SuperFastHash("*")));
assert(SuperFastHash("hst")==1867544282,to!string(SuperFastHash("hst")));
assert(SuperFastHash("hstiaashccaht")==2173265029,to!string(SuperFastHash("hstiaashccaht")));
assert(SuperFastHash("Pjotr")==2808102289,to!string(SuperFastHash("Pjotr")));
}

bio2/logger.d (+0, -1)

@@ -1 +0,0 @@
module bio2.logger;

bio2/pileup.d (+0, -341)

@@ -1,341 +0,0 @@
/*
This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA

*/
module bio2.pileup;

import std.conv;
import std.exception;
import std.stdio;
import std.traits;
import std.typecons;

import std.experimental.logger;

import bio2.constants;
import bio.core.utils.exception;

immutable ulong DEFAULT_BUFFER_SIZE = 1_000_000;

/**
Cyclic buffer or ringbuffer based on Artem's original. Uses copy
semantics to copy a read in a pre-allocated buffer. New items get
added to the tail, and used items get popped from the head
(FIFO). Basically it is empty when pointers align and full when
head - tail equals length. Note that the pointers are of size_t
which puts a theoretical limit on the number of items that can be
pushed.
*/

import core.stdc.string : memcpy;

// alias ulong RingBufferIndex;

struct RingBufferIndex {
alias Representation = ulong;
private ulong value = 0;

this(ulong v) {
value = v;
}

// @disable this(this); // disable copy semantics;

auto get() inout {
return value;
}

auto max() @property {
return value.max;
}

void opAssign(U)(U rhs) if (is(typeof(Checked!(T, Hook)(rhs)))) {
value = rhs;
}

bool opEquals(U, this _)(U rhs) {
return value == rhs;
}

auto opCmp(U, this _)(const U rhs) {
return value < rhs ? -1 : value > rhs;
}

ulong opUnary(string s)() if (s == "++") {
return ++value;
}

}


struct RingBuffer(T) {

T[] _items;
RingBufferIndex _head;
RingBufferIndex _tail;
size_t max_size = 0;

/** initializes round buffer of size $(D n) */
this(size_t n) {
_items = new T[n];
// _items.reserve(n);
}

@disable this(this); // disable copy semantics;

/*
Does not work because data is no longer available!
~this() {
// assert(is_empty); // make sure all items have been popped
}
*/

bool empty() @property @nogc nothrow const {
return _tail == _head;
}

alias empty is_empty;

auto ref front() @property {
enforce(!is_empty, "ringbuffer is empty");
return _items[_head.get() % $];
}
alias back last;

auto ref back() @property {
enforce(!is_empty, "ringbuffer is empty");
return _items[(_tail.get() - 1) % $];
}

alias front first;

bool is_tail(RingBufferIndex idx) {
return idx == _tail.get()-1;
}

ref T get_at(RingBufferIndex idx) {
enforce(!is_empty, "ringbuffer is empty");
enforce(idx >= _head, "ringbuffer range error (idx before front)");
enforce(idx != _tail, "ringbuffer range error (idx at end)");
enforce(idx < _tail, "ringbuffer range error (idx after end)");
return _items[idx.get() % $];
}

bool is_valid(RingBufferIndex idx) {
enforce(!is_empty, "ringbuffer is empty");
enforce(idx >= _head, "ringbuffer range error (idx before front)");
enforce(idx != _tail, "ringbuffer range error (idx at end)");
enforce(idx < _tail, "ringbuffer range error (idx after end)");
return true;
}

// This function is a hack.
void update_at(RingBufferIndex idx, T item) {
is_valid(idx);
_items[idx.get() % $] = item; // uses copy semantics
}

RingBufferIndex popFront() {
enforce(!is_empty, "ringbuffer is empty");
static if (__traits(compiles, _items[0].cleanupx)) {
// write("x");
_items[_head.get() % $].cleanup();
}
++_head.value;
return _head;
}

/// Puts item on the stack and returns the index
RingBufferIndex put(T item) {
enforce(!is_full, "ringbuffer is full - you need to expand buffer");
enforce(_tail < _tail.max, "ringbuffer overflow");
max_size = length > max_size ? length : max_size;
_items[_tail.get() % $] = item; // uses copy semantics
auto prev = _tail;
++_tail.value;
return prev;
}

ulong length() @property const {
// writeln(_tail.get(),":",_head.get(),"= len ",_tail.get()-_head.get());
return _tail.get() - _head.get();
}

bool is_full() @property const {
return _items.length == length();
}

bool in_range(RingBufferIndex idx) @property const {
return idx >= _head && idx < _tail;
}

ulong pushed() @property const {
return _tail.value;
}
ulong popped() @property const {
return _head.value;
}

@property void cleanup() {
_head = RingBufferIndex();
_tail = RingBufferIndex();
}

string toString() {
string res = "ring ";
for(RingBufferIndex i = _head; i<_tail; i++)
res ~= to!string(get_at(i));
return res;
}

@property string stats() {
return "Ringbuffer pushed " ~ to!string(pushed) ~ " popped " ~ to!string(popped) ~ " max-size " ~
to!string(max_size) // , "/", (pileup.ring.max_size+1)/pileup.ring.length);
;
}
}

unittest {
auto buf = RingBuffer!int(4);
assert(buf.is_empty);

buf.put(1);
buf.put(2);
assert(buf.length == 2);
assert(buf.front == 1);
buf.popFront(); // 1
buf.popFront(); // 2
buf.put(2);
buf.put(1);
buf.put(0);
buf.put(3);
assert(buf.is_full);
assert(buf.front == 2);
buf.popFront();
assert(buf.front == 1);
buf.put(4);
buf.popFront();
assert(buf.front == 0);
buf.popFront();
assert(buf.front == 3);
buf.popFront();
assert(buf.front == 4);
buf.popFront();
assert(buf.is_empty);
}

/**
Represent a pileup of reads in a buffer.
*/

/*
Read RingBuffer with current pointer, so you have three states
(first, current, last).
*/

class PileUp(R) {
RingBuffer!R ring;
Nullable!RingBufferIndex current;

this(ulong bufsize=DEFAULT_BUFFER_SIZE) {
ring = RingBuffer!R(bufsize);
set_current_to_head;
}

RingBufferIndex push(R r) { return ring.put(r); }
bool empty() @property const { return ring.is_empty();}
bool is_full() @property const { return ring.is_full();}
RingBufferIndex popFront() { return ring.popFront(); }
ref R front() { return ring.front(); }
alias front leftmost;
ref R rightmost() { return ring.back(); }
ref R read(RingBufferIndex idx) {
enforce(ring.in_range(idx), "idx should be set for PileUp.read");
return ring.get_at(idx);
}
ref R read_current() {
enforce(!current.isNull, "current should be set for PileUp.read_current");
return read(current);
}
bool is_at_end(RingBufferIndex idx) { return ring.is_tail(idx); }

@property void current_inc() {
asserte(!empty);
asserte(!ring.is_tail(current));
++current;
}

@property void set_current_to_head() {
current = ring._head; // note pileup can be empty
}

void current_reset() {
current = RingBufferIndex();
}

@property bool current_is_tail() {
return ring.is_tail(current);
}

void each(void delegate(R) dg) {
auto idx = ring._head;
while(!ring.is_tail(idx)) {
auto r = read(idx);
dg(r);
idx++;
}
}

void each_left_of_current(void delegate(RingBufferIndex, R) dg) {
R cur = read_current;
if (cur.is_unmapped) return;
auto idx = ring._head;
while(!ring.is_tail(idx)) {
auto r = read(idx);
if (r.is_mapped && r.end_pos >= cur.start_pos)
return;
dg(idx,r);
idx++;
}
}

void purge_while(bool delegate(R) dg) {
while(!empty) {
if (!dg(front))
return; // skip the rest
popFront();
}
}

void purge(void delegate(R) dg) {
while(!empty) {
dg(front);
popFront();
}
set_current_to_head();
}

@property string stats() {
return ring.stats();
}

override string toString() {
return stats;
}
}
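
A toy sketch of the sliding-window pattern PileUp was built for, using a stand-in read type since ProcessReadBlob lives in the (also removed) reader module. Illustrative only; the type and coordinates are made up for the sketch:

// Push reads into a PileUp and purge those left of the current window (illustrative sketch only).
import std.stdio;
import bio2.pileup;   // PileUp, RingBuffer

struct ToyRead {                 // minimal stand-in for ProcessReadBlob
    int start_pos, end_pos;
    @property bool is_mapped()   { return true; }
    @property bool is_unmapped() { return false; }
}

void main() {
    auto pileup = new PileUp!ToyRead(16);   // ring buffer of 16 slots
    pileup.push(ToyRead(100, 150));
    pileup.push(ToyRead(120, 180));
    pileup.push(ToyRead(400, 450));

    // Drop every read that ends before the window start at position 200.
    immutable cutoff = 200;
    pileup.purge_while((ToyRead r) => r.end_pos < cutoff);
    writeln(pileup.stats);   // reports pushed/popped counters of the underlying ring buffer
}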

bio2/reads.d (+0, -46)

@@ -1,46 +0,0 @@
/*
This file is part of Sambamba.
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 2 of the License,
or (at your option) any later version.

Sambamba is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA

*/

module bio2.reads;

import bio2.constants;
import bio.core.utils.exception;

import std.stdio;

bool read_overlaps(R)(GenomeLocation loc, R r) {
assert(r.is_mapped);
return r.ref_id == loc.ref_id && loc.pos >= r.start_pos && loc.pos <= r.end_pos;
}

bool reads_overlap(R)(R r1, R r2) {
asserte(r1.is_mapped);
asserte(r2.is_mapped);
asserte(r2.ref_id == r1.ref_id);
// r1 rrrrrrrrrrr
// r2 ---------???????????
if (r2.start_pos < r1.start_pos && r2.end_pos >= r1.start_pos) {
return true;
}
// r1 rrrrrrrrrrr
// r2 ----?????
return r2.start_pos >= r1.start_pos && r2.start_pos <= r1.end_pos;
}
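
A worked toy example of the two overlap predicates above. Illustrative only; ToyRead and the coordinates are invented for the sketch and are not part of the commit:

// Check the removed overlap helpers against hand-computed cases (illustrative sketch only).
import std.stdio;
import bio2.constants;   // RefId, GenomePos, GenomeLocation
import bio2.reads;       // read_overlaps, reads_overlap

struct ToyRead {                 // minimal stand-in for a mapped read
    RefId ref_id;
    GenomePos start_pos, end_pos;
    @property bool is_mapped() { return true; }
}

void main() {
    auto r1 = ToyRead(0, 100, 150);
    auto r2 = ToyRead(0, 140, 190);   // starts inside r1
    auto r3 = ToyRead(0, 200, 240);   // starts after r1 ends

    writeln(reads_overlap(r1, r2));                       // true:  100 <= 140 <= 150
    writeln(reads_overlap(r1, r3));                       // false: 200 > 150
    writeln(read_overlaps(GenomeLocation(0, 145), r1));   // true:  100 <= 145 <= 150
}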

bio2/unpack.d (+0, -41)

@@ -1,41 +0,0 @@
/*
This file is part of Sambamba.

Copyright © 2012-2017 Pjotr Prins <pjotr.prins@thebird.nl>

Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

Sambamba is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

*/

/**
Module for streaming unpacked (BAM) blocks
*/
module bio2.unpack;

import std.stdio;

import bio2.bgzf;

int unpack_bams(string[] infns, File outfile) {

foreach (string fn; infns) {
foreach (ubyte[] read; BgzfBlocks(fn)) {
outfile.rawWrite(read); // raw unpacking
}
}

return 0;

}

dub.json (+0, -16)

@@ -1,16 +0,0 @@
{
"name": "biod",
"authors": [
"Artem Tarasov",
"Pjotr Prins"
],
"description": "Bioinformatics library in D (utils for working with SAM, BAM, SFF formats)",
"copyright": "Copyright © 2016, BioD developers",
"license": "MIT",
"sourcePaths": ["bio"],
"importPaths": ["bio"],
"dependencies": {
"undead": "~>1.0.6"
},
"buildRequirements": ["allowWarnings"]
}

examples/calculate_gc_content_from_reference.d (+0, -51)

@@ -1,51 +0,0 @@
// BioD depends on stream.d which is no longer included with phobos.
// To run this example from this directory,
// Clone the undead repository with
git clone https://github.com/dlang/undeaD.git at the appropriate location and ensure
// it is available on your path
// Run example: rdmd -I.. -I../location_of_undead/src calculate_gc_content_from_reference.d

import bio.bam.reader;
import bio.bam.md.reconstruct : dna;

import std.stdio;
import std.datetime.stopwatch: benchmark, StopWatch;
import std.range;
import std.array;

void main() {
auto bam = new BamReader("../test/data/b7_295_chunk.bam");

// the sequence starts at first mapped base of first read
auto reference = dna(bam.reads);

int n_bases = 0, gc = 0;

foreach (base; reference) {
if (base == 'N') continue; // happens if coverage is zero
if (base == 'G' || base == 'C') gc += 1;
n_bases += 1;
}

writeln("total bases: ", n_bases);
writeln(" GC%: ", cast(float)gc / n_bases);
writeln(" sequence: ", reference);
writeln(" #reads: ", walkLength(bam.reads));

auto reads = array(bam.reads); // no I/O during measurements

StopWatch sw; // for a range of reads, minimum number of MD tags is parsed

sw.start();
walkLength(dna(reads));
sw.stop();

writeln("extracting reference from range of reads: ", sw.peek.total!"usecs", "μs");

sw.reset();
sw.start();
foreach (read; reads) walkLength(dna(read));
sw.stop();

writeln("extracting reference from each read: ", sw.peek.total!"usecs", "μs");
}

examples/create_bam_from_scratch.d (+0, -48)

@@ -1,48 +0,0 @@
// BioD depends on stream.d which is no longer included with phobos.
// To run this example from this directory,
// Clone the undead repository with
git clone https://github.com/dlang/undeaD.git at the appropriate location and ensure
// it is available on your path
// Run example: rdmd -I.. -I../location_of_undead/src create_bam_from_scratch.d

// this example shows how to create BAM files from scratch
import bio.bam.read;
import bio.bam.referenceinfo;