Browse Source

initial commit

remotes/georgeg/bam_output_redesign
lomereiter 9 years ago
commit
a1dcb7075e
  1. 86
      bio/bam/bai/bin.d
  2. 36
      bio/bam/bai/chunk.d
  3. 279
      bio/bam/bai/indexing.d
  4. 87
      bio/bam/bai/utils/algo.d
  5. 161
      bio/bam/baifile.d
  6. 420
      bio/bam/bamfile.d
  7. 359
      bio/bam/baseinfo.d
  8. 120
      bio/bam/bgzf/block.d
  9. 287
      bio/bam/bgzf/blockrange.d
  10. 111
      bio/bam/bgzf/compress.d
  11. 324
      bio/bam/chunkinputstream.d
  12. 64
      bio/bam/constants.d
  13. 231
      bio/bam/fz/flowcall.d
  14. 95
      bio/bam/fz/flowindex.d
  15. 4
      bio/bam/md/core.d
  16. 68
      bio/bam/md/operation.d
  17. 101
      bio/bam/md/parse.d
  18. 433
      bio/bam/md/reconstruct.d
  19. 301
      bio/bam/output.d
  20. 807
      bio/bam/pileuprange.d
  21. 487
      bio/bam/randomaccessmanager.d
  22. 1129
      bio/bam/read.d
  23. 148
      bio/bam/readrange.d
  24. 210
      bio/bam/reference.d
  25. 157
      bio/bam/samfile.d
  26. 642
      bio/bam/samheader.d
  27. 299
      bio/bam/serialization/json.d
  28. 233
      bio/bam/serialization/sam.d
  29. 541
      bio/bam/snpcallers/maq.d
  30. 47
      bio/bam/snpcallers/simple.d
  31. 102
      bio/bam/splitter.d
  32. 416
      bio/bam/tagvalue.d
  33. 4782
      bio/bam/thirdparty/msgpack.d
  34. 109
      bio/bam/utils/array.d
  35. 4123
      bio/bam/utils/fastsamrecordparser.d
  36. 88
      bio/bam/utils/graph.d
  37. 361
      bio/bam/utils/samheadermerger.d
  38. 1120
      bio/bam/utils/samrecordparser.d
  39. 52
      bio/bam/utils/tagstoragebuilder.d
  40. 117
      bio/bam/utils/value.d
  41. 558
      bio/bam/validation/alignment.d
  42. 442
      bio/bam/validation/samheader.d
  43. 90
      bio/bam/virtualoffset.d
  44. 233
      bio/core/base.d
  45. 102
      bio/core/call.d
  46. 48
      bio/core/fasta.d
  47. 89
      bio/core/genotype.d
  48. 230
      bio/core/region.d
  49. 289
      bio/core/tinymap.d
  50. 85
      bio/core/utils/algo.d
  51. 247
      bio/core/utils/format.d
  52. 259
      bio/core/utils/memoize.d
  53. 245
      bio/core/utils/range.d
  54. 40
      bio/core/utils/stream.d
  55. 50
      bio/core/utils/switchendianness.d
  56. 20
      bio/core/utils/tmpfile.d
  57. 28
      src_ragel/Makefile
  58. 82
      src_ragel/region.rl
  59. 385
      src_ragel/sam_alignment.rl
  60. 6
      src_ragel/workarounds/fix_static_const.sh
  61. 6
      src_ragel/workarounds/fix_switch_case_fallthrough.sh
  62. BIN
      test/data/bins.bam
  63. BIN
      test/data/bins.bam.bai
  64. BIN
      test/data/corrupted_zlib_archive.bam
  65. BIN
      test/data/duplicated_block_size.bam
  66. BIN
      test/data/ex1_header.bam
  67. BIN
      test/data/ex1_header.bam.bai
  68. 3273
      test/data/ex1_header.sam
  69. BIN
      test/data/mg1655_chunk.bam
  70. BIN
      test/data/no_block_size.bam
  71. BIN
      test/data/tags.bam
  72. BIN
      test/data/tags.bam.bai
  73. BIN
      test/data/wrong_bc_subfield_length.bam
  74. BIN
      test/data/wrong_extra_gzip_length.bam
  75. 375
      test/unittests.d

86
bio/bam/bai/bin.d

@ -0,0 +1,86 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bai.bin;
import bio.bam.bai.chunk;
import bio.bam.constants;
/// Distinct bin
struct Bin {
/// Construct a bin with an id
this(ushort id) nothrow {
this.id = id;
}
uint id; /// bin number
Chunk[] chunks;
/// How deep the bin is in the tree
int level() @property const nothrow {
if (id == 0) return 0;
if (id < 9) return 1;
if (id < 73) return 2;
if (id < 585) return 3;
if (id < 4681) return 4;
return 5;
}
/// Returns whether the bin is a leaf in the B-tree
bool is_leaf() @property const nothrow {
return id > BAI_MAX_NONLEAF_BIN_ID;
}
/// Check if bin can overlap with a region
bool canOverlapWith(int begin, int end) const nothrow {
if (id == 0) return true;
if (id > BAI_MAX_BIN_ID) return false;
/// The following code is based on reg2bins() function
if (begin < 0) begin = 0;
auto magic_number = 4681;
auto b = begin >> 14;
auto e = end >> 14;
while (true) {
auto delta = id - magic_number;
if (b <= delta && delta <= e) return true;
magic_number >>= 3;
if (magic_number == 0) return false;
b >>= 3;
e >>= 3;
}
}
}
/// Returns bin number for [beg, end) interval (zero-based).
/// Taken from SAM/BAM specification.
ushort reg2bin(int beg, int end) {
--end;
if (beg>>14 == end>>14) return cast(ushort)(((1<<15)-1)/7 + (beg>>14));
if (beg>>17 == end>>17) return cast(ushort)(((1<<12)-1)/7 + (beg>>17));
if (beg>>20 == end>>20) return cast(ushort)(((1<<9)-1)/7 + (beg>>20));
if (beg>>23 == end>>23) return cast(ushort)(((1<<6)-1)/7 + (beg>>23));
if (beg>>26 == end>>26) return cast(ushort)(((1<<3)-1)/7 + (beg>>26));
return 0;
}

36
bio/bam/bai/chunk.d

@ -0,0 +1,36 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bai.chunk;
import bio.bam.virtualoffset;
struct Chunk {
VirtualOffset beg; /// virtual file offset of the start of the chunk
VirtualOffset end; /// virtual file offset of the end of the chunk
/// Compare beginnings
int opCmp(Chunk other) const nothrow {
if (beg < other.beg) return -1;
if (beg > other.beg) return 1;
if (end < other.end) return -1;
if (end > other.end) return 1;
return 0;
}
}

279
bio/bam/bai/indexing.d

@ -0,0 +1,279 @@
/*
This file is part of Sambamba.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Sambamba is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bai.indexing;
import bamfile;
import constants;
import bai.bin;
import bai.chunk;
import std.stream;
import std.array;
import std.algorithm;
import std.system;
import std.exception;
// Suppose we have an alignment which covers bases on a reference,
// starting from one position and ending at another position.
// In order to build linear index, we need to find to which windows
// the two positions correspond.
//
//
// (K = 16384)
//
// [0, K)[K, 2K)[2K, 3K)... <- windows
// [.......) <- alignment
//
private size_t toLinearIndexOffset(int position) {
return position < 0 ? 0 : position / BAI_LINEAR_INDEX_WINDOW_SIZE;
}
void defaultProgressBarFunc(lazy float dummy) {}
/// Writes BAM index to the $(D stream)
///
/// Accepts optional $(D progressBarFunc)
void createIndex(ref BamFile bam, ref Stream stream, void delegate(lazy float p) progressBarFunc=null) {
auto endian_stream = new EndianStream(stream, Endian.littleEndian);
auto refs = bam.reference_sequences;
auto nrefs = refs.length;
endian_stream.writeString(BAI_MAGIC); // write BAI magic string
endian_stream.write(cast(int)nrefs); // and number of references
void writeEmptyReference() {
endian_stream.write(cast(int)0); // n_bins
endian_stream.write(cast(int)0); // n_intv
}
// BAM file contains no alignments at all or all reads are unmapped
if (bam.alignments!withOffsets.empty ||
bam.alignments!withOffsets.front.alignment.ref_id < 0) {
foreach (i; 0 .. nrefs) {
writeEmptyReference();
}
return;
}
// OK, now let's deal with non-degenerate case
auto alignment_blocks = bam.alignmentsWithProgress!withOffsets(progressBarFunc);
auto prev_block = alignment_blocks.front;
alignment_blocks.popFront();
// this is the main character hereafter
auto prev_read = prev_block.alignment;
// array of linear offsets for the current reference entry
ulong[BAI_MAX_BIN_ID - BAI_MAX_NONLEAF_BIN_ID + 1] linear_index;
// (maximum index in linear_index where data was written) + 1
size_t linear_index_write_length;
// map: bin ID -> array of chunks
Chunk[][uint] chunks;
auto first_ref_id = prev_block.alignment.ref_id;
auto current_chunk_beg = prev_block.start_virtual_offset;
assert(first_ref_id >= 0);
foreach (i; 0 .. first_ref_id) {
writeEmptyReference();
}
void updateLinearIndex() {
assert(prev_read.ref_id >= 0);
size_t beg, end;
if (prev_read.is_unmapped) {
end = beg = toLinearIndexOffset(prev_read.position);
} else {
beg = toLinearIndexOffset(prev_read.position);
end = toLinearIndexOffset(prev_read.position + prev_read.basesCovered() - 1);
}
debug {
import std.stdio;
if (end >= linear_index.length) {
writeln("beg: ", beg);
writeln("end: ", end);
writeln("pos: ", prev_read.position);
writeln("bases: ", prev_read.basesCovered());
}
}
foreach (i; beg .. end + 1) {
if (linear_index[i] == 0UL) {
linear_index[i] = cast(ulong)prev_block.start_virtual_offset;
}
}
if (end + 1 > linear_index_write_length) {
linear_index_write_length = end + 1;
}
}
void dumpCurrentLinearIndex() {
endian_stream.write(cast(int)linear_index_write_length);
//
// There might be untouched places in linear index
// with virtual offset equal to zero.
// However, it's not a good idea to leave those zeros,
// since we can start lookup from the last non-zero virtual offset
// encountered before the untouched window.
//
ulong last_voffset = 0;
foreach (voffset; linear_index[0 .. linear_index_write_length])
{
if (voffset == 0) {
voffset = last_voffset;
} else {
last_voffset = voffset;
}
endian_stream.write(voffset);
}
}
void dumpCurrentReference() {
endian_stream.write(cast(int)chunks.length);
foreach (bin_id, bin_chunks; chunks) {
if (bin_chunks.length > 0) {
endian_stream.write(bin_id);
endian_stream.write(cast(int)bin_chunks.length);
foreach (chunk; bin_chunks) {
endian_stream.write(cast(ulong)chunk.beg);
endian_stream.write(cast(ulong)chunk.end);
}
}
}
dumpCurrentLinearIndex();
// reset data
linear_index[] = 0;
linear_index_write_length = 0;
chunks = null;
current_chunk_beg = prev_block.end_virtual_offset;
}
// adds chunk to the current bin (which is determined from prev_read)
void updateChunks() {
auto current_chunk_end = prev_block.end_virtual_offset;
auto bin_id = prev_read.bin.id;
if (bin_id !in chunks) {
chunks[bin_id] = [];
}
auto cs = chunks[bin_id];
bool canMergeWithPreviousChunk() {
assert(cs.length > 0);
auto last_chunk = cs[$ - 1];
if (last_chunk.end.coffset == current_chunk_beg.coffset)
return true;
return false;
}
if (cs.length == 0 || !canMergeWithPreviousChunk()) {
chunks[prev_read.bin.id] ~= Chunk(current_chunk_beg, current_chunk_end);
} else {
chunks[prev_read.bin.id][$ - 1].end = current_chunk_end;
}
current_chunk_beg = current_chunk_end;
}
foreach (block; alignment_blocks) {
auto read = block.alignment;
// new reference, so write data for previous one(s)
if (read.ref_id != prev_read.ref_id) {
updateLinearIndex();
updateChunks();
dumpCurrentReference();
foreach (i; prev_read.ref_id + 1 .. read.ref_id)
writeEmptyReference();
}
// this and all the following reads are unmapped
if (read.ref_id < 0) {
break;
}
// start position is unavailable, skip
if (read.position < 0) {
prev_block = block;
prev_read = read;
continue;
}
// check if the BAM file is indeed sorted
if ((read.ref_id == prev_read.ref_id &&
read.position < prev_read.position) ||
(read.ref_id < prev_read.ref_id))
{
throw new Exception("BAM file is not properly sorted: " ~
"read '" ~ read.read_name ~ "'" ~
" must be before read '" ~
prev_read.read_name ~
"' (at virtual offset " ~
to!string(prev_block.start_virtual_offset)~
")");
}
// ---------------------------------------------------------------------
if (read.ref_id == prev_read.ref_id) {
updateLinearIndex();
if (read.bin.id != prev_read.bin.id) {
updateChunks();
}
}
// ---------------------------------------------------------------------
prev_block = block;
prev_read = read;
}
// after the loop, prev_read is the last read with ref_id >= 0
assert(prev_read.ref_id >= 0);
updateLinearIndex();
updateChunks();
dumpCurrentReference();
// write the rest
foreach (i; prev_read.ref_id + 1 .. nrefs) {
writeEmptyReference();
}
}

87
bio/bam/bai/utils/algo.d

@ -0,0 +1,87 @@
/*
This file is part of Sambamba.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
Sambamba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Sambamba is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bai.utils.algo;
import bio.bam.bai.chunk;
import std.range;
import std.algorithm;
struct NonOverlappingChunks(R) {
this(R r) {
_range = r;
popFront();
}
bool empty() @property {
return _empty;
}
auto front() @property {
return _front;
}
void popFront() {
if (!_range.empty()) {
_front = _range.front;
tryToJoinWithNextChunks();
} else {
_empty = true;
}
}
private:
R _range;
void tryToJoinWithNextChunks() {
_range.popFront();
while (!_range.empty()) {
/// Get next element
auto next = _range.front;
/// It's presumed that chunks are sorted
assert(next >= _front);
/// Check if the chunk can be joined with the previous one
if (_front.end >= next.beg) {
/// update end of _front
_front.end = max(_front.end, next.end);
_range.popFront(); /// next is consumed
} else {
/// can't join
break;
}
}
}
Chunk _front;
bool _empty = false;
}
/// Params:
/// r - range of chunks, sorted by leftmost coordinate
/// Returns:
/// range of non-overlapping chunks, covering the same subset
/// as original chunks
auto nonOverlappingChunks(R)(R r)
if (is(ElementType!R == Chunk))
{
return NonOverlappingChunks!R(r);
}

161
bio/bam/baifile.d

@ -0,0 +1,161 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.baifile;
public import bio.bam.bai.chunk;
public import bio.bam.bai.bin;
import bio.bam.virtualoffset;
import bio.bam.constants;
import std.stream;
import std.system;
import std.exception;
import std.algorithm;
import std.conv;
import std.range;
import std.file;
import std.path;
/// Represents index for a single reference
struct Index {
/// Information about bins
Bin[] bins;
/// Virtual file offsets of first alignments overlapping 16384-byte windows
/// on the reference sequence. This linear index is used to reduce amount
/// of file seeks for region queries, since with its help one can reduce the
/// number of chunks to be investigated based on their end position.
///
///
/// Suppose you have a region [beg, end) and want to do a query.
///
/// Here's the reference:
/// [....................!..............!.................................]
/// beg end
///
/// Here's the same reference with 16384-byte long windows:
/// [%...........%.......!....%.........!..%...........%...........%......]
/// beg end
/// [ 1st window][ 2nd window][...
///
/// With linear index, we can take the second window, find out what is
/// the minimum virtual offset among alignments overlapping this window,
/// and skip all chunks which end position is less or equal to this offset:
///
/// [........@...........!..............!.................................]
/// . ..min. offset beg end
/// [ ). . <- this chunk is skipped
/// [ ) <- this one is not
///
VirtualOffset[] ioffsets;
/// Get (approximate) virtual offset of the first alignment overlapping $(D position)
///
/// Returned virtual offset is less or equal to real offset.
VirtualOffset getMinimumOffset(int position) {
int pos = max(0, position);
int _i = min(pos / BAI_LINEAR_INDEX_WINDOW_SIZE, cast(int)ioffsets.length - 1);
auto min_offset = (_i == -1) ? VirtualOffset(0) : ioffsets[_i];
return min_offset;
}
}
struct BaiFile {
Index[] indices;
/// Initialize from stream which contains BAI data
this(ref Stream stream) {
_stream = stream;
parse();
}
/// Open BAI file given either filename of BAM file or that of BAI file.
this(string filename) {
Stream fstream;
if (!endsWith(filename, ".bai")) {
/// Unfortunately, std.path.addExt is going to be deprecated
auto first_filename = filename ~ ".bai";
auto second_filename = to!string(retro(find(retro(filename), '.'))) ~ "bai";
if (std.file.exists(first_filename)) {
fstream = new BufferedFile(absolutePath(first_filename));
} else {
if (std.file.exists(second_filename)) {
fstream = new BufferedFile(absolutePath(second_filename));
} else {
throw new Exception("searched for " ~ first_filename ~ " or " ~
second_filename ~ ", found neither");
}
}
} else {
fstream = new BufferedFile(filename);
}
Stream estream = new EndianStream(fstream, Endian.littleEndian);
this(estream);
}
private:
Stream _stream;
/// according to section 4.2 of SAM/BAM specification
void parse() {
auto magic = _stream.readString(4);
enforce(magic == "BAI\1", "Invalid file format: expected BAI\\1");
int n_ref;
_stream.read(n_ref);
indices.length = n_ref;
foreach (i; 0 .. n_ref) {
int n_bin;
_stream.read(n_bin);
indices[i].bins.length = n_bin;
foreach (j; 0 .. n_bin) {
_stream.read(indices[i].bins[j].id);
int n_chunk;
_stream.read(n_chunk);
indices[i].bins[j].chunks.length = n_chunk;
foreach (k; 0 .. n_chunk) {
ulong tmp;
_stream.read(tmp);
indices[i].bins[j].chunks[k].beg = VirtualOffset(tmp);
_stream.read(tmp);
indices[i].bins[j].chunks[k].end = VirtualOffset(tmp);
}
}
int n_intv;
_stream.read(n_intv);
indices[i].ioffsets.length = n_intv;
foreach (j; 0 .. n_intv) {
ulong tmp;
_stream.read(tmp);
indices[i].ioffsets[j] = VirtualOffset(tmp);
}
}
}
}

420
bio/bam/bamfile.d

@ -0,0 +1,420 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bamfile;
public import bio.bam.samheader;
public import bio.bam.reference;
public import bio.bam.read;
public import bio.bam.virtualoffset;
public import bio.bam.tagvalue;
public import bio.bam.readrange;
import bio.bam.bgzf.blockrange;
import bio.bam.chunkinputstream;
import bio.bam.randomaccessmanager;
import bio.bam.baifile;
import bio.core.utils.range;
import bio.core.utils.stream;
import std.system;
import std.stdio;
import std.algorithm : map, min;
import std.range : zip;
import std.conv : to;
import std.exception : enforce;
import std.parallelism;
import std.array : uninitializedArray;
import core.stdc.stdio;
import std.string;
/**
Represents BAM file
*/
struct BamFile {
this(Stream stream, TaskPool task_pool = taskPool) {
_source_stream = new EndianStream(stream, Endian.littleEndian);
_task_pool = task_pool;
if (stream.seekable) {
_stream_is_seekable = true;
}
initializeStreams();
auto magic = _bam.readString(4);
enforce(magic == "BAM\1", "Invalid file format: expected BAM\\1");
readSamHeader();
readReferenceSequencesInfo();
// right after construction, we are at the beginning
// of the list of reads
if (_stream_is_seekable) {
_reads_start_voffset = _decompressed_stream.virtualTell();
}
}
/**
Constructor taking filename of BAM file to open,
and optionally, task pool to use.
Currently, opens the file read-only since library
has no support for writing yet.
*/
this(string filename, TaskPool task_pool = taskPool) {
_filename = filename;
_source_stream = getNativeEndianSourceStream();
this(_source_stream, task_pool);
}
/// True if associated BAI file was found
bool has_index() @property {
return _random_access_manager.found_index_file;
}
/// If file ends with EOF block, returns virtual offset of the start of EOF block.
/// Otherwise, returns virtual offset of the physical end of file.
VirtualOffset eofVirtualOffset() {
return _random_access_manager.eofVirtualOffset();
}
/// Get BGZF block at a given file offset.
BgzfBlock getBgzfBlockAt(ulong offset) {
return _random_access_manager.getBgzfBlockAt(offset);
}
/*
Get SAM header of file.
*/
SamHeader header() @property {
if (_header is null) {
synchronized {
if (_header is null) {
_header = new SamHeader(_headertext);
_headertext = null;
}
}
}
return _header;
}
/**
Returns: information about reference sequences
*/
ReferenceSequenceInfo[] reference_sequences() @property {
return _reference_sequences;
}
/**
Returns: range of all reads in the file.
However, using several ranges is not recommended since it can hurt
disk access performance.
*/
auto reads(alias IteratePolicy=withoutOffsets)() @property {
auto _decompressed_stream = getDecompressedBamReadStream();
return bamReadRange!IteratePolicy(_decompressed_stream);
}
/**
Returns: range of all reads in the file, calling $(D progressBarFunc)
for each read.
$(D progressBarFunc) will be called
each time next alignment is read, with the argument being a number from [0.0, 1.0],
which is estimated progress percentage.
*/
auto readsWithProgress(alias IteratePolicy=withoutOffsets)
(void delegate(lazy float p) progressBarFunc)
{
auto _decompressed_stream = getDecompressedBamReadStream();
auto reads_with_offsets = bamReadRange!withOffsets(_decompressed_stream);
static struct Result(alias IteratePolicy, R, S) {
this(R range, S stream, void delegate(lazy float p) progressBarFunc) {
_range = range;
_stream = stream;
_progress_bar_func = progressBarFunc;
}
static if (__traits(identifier, IteratePolicy) == "withOffsets") {
auto front() @property {
return _range.front;
}
} else static if (__traits(identifier, IteratePolicy) == "withoutOffsets") {
auto front() @property {
return _range.front.read;
}
} else static assert(0, __traits(identifier, IteratePolicy));
bool empty() @property {
return _range.empty;
}
void popFront() {
_bytes_read += _range.front.read.size_in_bytes;
_range.popFront();
if (_progress_bar_func !is null) {
_progress_bar_func(min(1.0,
cast(float)_bytes_read / (_stream.compressed_file_size *
_stream.average_compression_ratio)));
}
}
private R _range;
private S _stream;
private size_t _bytes_read;
private void delegate(lazy float p) _progress_bar_func;
}
return Result!(IteratePolicy,
typeof(reads_with_offsets),
typeof(_decompressed_stream))(reads_with_offsets,
_decompressed_stream,
progressBarFunc);
}
/**
Get the read at a given virtual offset.
*/
BamRead getReadAt(VirtualOffset offset) {
enforce(_random_access_manager !is null);
return _random_access_manager.getReadAt(offset);
}
/**
Get all reads between two virtual offsets.
First offset must point to the start of an alignment record,
and be strictly less than the second one.
For decompression, uses task pool specified at BamFile construction.
*/
auto getReadsBetween(VirtualOffset from, VirtualOffset to) {
enforce(from < to, "First offset must be strictly less than second");
enforce(_stream_is_seekable, "Stream is not seekable");
return _random_access_manager.getReadsBetween(from, to, _task_pool);
}
/**
Get BAI chunks containing all reads overlapping specified region.
*/
Chunk[] getChunks(int ref_id, int beg, int end) {
enforce(_random_access_manager !is null);
enforce(beg < end);
return _random_access_manager.getChunks(ref_id, beg, end);
}
/**
Returns reference sequence with id $(D ref_id).
*/
ReferenceSequence reference(int ref_id) {
enforce(ref_id < _reference_sequences.length, "Invalid reference index");
return ReferenceSequence(_random_access_manager,
ref_id,
_reference_sequences[ref_id]);
}
/**
Returns reference sequence named $(D ref_name).
*/
ReferenceSequence opIndex(string ref_name) {
enforce(hasReference(ref_name), "Reference with name " ~ ref_name ~ " does not exist");
auto ref_id = _reference_sequence_dict[ref_name];
return reference(ref_id);
}
/**
Check if reference named $(D ref_name) is presented in BAM header.
*/
bool hasReference(string ref_name) {
return null != (ref_name in _reference_sequence_dict);
}
/**
Set buffer size for I/O operations.
*/
void setBufferSize(size_t buffer_size) {
this.buffer_size = buffer_size;
}
private:
string _filename; // filename (if available)
Stream _source_stream; // compressed
IChunkInputStream _decompressed_stream; // decompressed
Stream _bam; // decompressed + endian conversion
bool _stream_is_seekable;
// Virtual offset at which alignment records start.
VirtualOffset _reads_start_voffset;
BaiFile _dont_access_me_directly_use_bai_file_for_that;
enum BaiStatus {
notInitialized,
initialized,
fileNotFound
}
BaiStatus _bai_status = BaiStatus.notInitialized;
// provides access to index file
@property ref BaiFile _bai_file() { // initialized lazily
if (_bai_status == BaiStatus.notInitialized) {
synchronized {
try {
_dont_access_me_directly_use_bai_file_for_that = BaiFile(_filename);
_bai_status = BaiStatus.initialized;
} catch (Exception e) {
_bai_status = BaiStatus.fileNotFound;
}
}
}
return _dont_access_me_directly_use_bai_file_for_that;
};
RandomAccessManager _rndaccssmgr; // unreadable for a purpose
@property RandomAccessManager _random_access_manager() {
if (_rndaccssmgr is null) {
synchronized {
auto bai = _bai_file;
// remember that it's lazily initialized,
// so we need to do that to get the right BAI status
if (_bai_status == BaiStatus.initialized) {
_rndaccssmgr = new RandomAccessManager(_filename, bai);
} else {
_rndaccssmgr = new RandomAccessManager(_filename);
}
}
}
return _rndaccssmgr;
}
SamHeader _header;
string _headertext; // for lazy SAM header parsing
ReferenceSequenceInfo[] _reference_sequences;
int[string] _reference_sequence_dict; /// name -> index mapping
TaskPool _task_pool;
size_t buffer_size = 8192; // buffer size to be used for I/O
Stream getNativeEndianSourceStream() {
assert(_filename !is null);
return new bio.core.utils.stream.File(_filename);
}
Stream getSeekableCompressedStream() {
if (_stream_is_seekable) {
if (_filename !is null) {
auto file = getNativeEndianSourceStream();
version(development)
{
std.stdio.stderr.writeln("[info] file size: ", file.size);
}
return new EndianStream(file, Endian.littleEndian);
} else {
_source_stream.seekSet(0);
return _source_stream;
}
} else {
return null;
}
}
// get decompressed stream out of compressed BAM file
IChunkInputStream getDecompressedStream() {
auto compressed_stream = getSeekableCompressedStream();
auto bgzf_range = (compressed_stream is null) ? BgzfRange(_source_stream) :
BgzfRange(compressed_stream);
version(serial) {
auto chunk_range = map!decompressBgzfBlock(bgzf_range);
} else {
auto chunk_range = _task_pool.map!decompressBgzfBlock(bgzf_range, 24);
}
if (compressed_stream !is null) {
return makeChunkInputStream(chunk_range, cast(size_t)compressed_stream.size);
} else {
return makeChunkInputStream(chunk_range);
}
}
// get decompressed stream starting from the first alignment record
IChunkInputStream getDecompressedBamReadStream() {
auto compressed_stream = getSeekableCompressedStream();
if (compressed_stream !is null) {
enforce(_reads_start_voffset != 0UL);
compressed_stream.seekCur(_reads_start_voffset.coffset);
auto bgzf_range = BgzfRange(compressed_stream);
version(serial) {
auto chunk_range = map!decompressBgzfBlock(bgzf_range);
} else {
auto chunk_range = _task_pool.map!decompressBgzfBlock(bgzf_range, 24);
}
auto sz = compressed_stream.size;
auto stream = makeChunkInputStream(chunk_range, cast(size_t)sz);
stream.readString(_reads_start_voffset.uoffset);
return stream;
} else {
// must be initialized in initializeStreams()
return _decompressed_stream;
}
}
// sets up the streams and ranges
void initializeStreams() {
_decompressed_stream = getDecompressedStream();
_bam = new EndianStream(_decompressed_stream, Endian.littleEndian);
}
// initializes _header
void readSamHeader() {
int l_text;
_bam.read(l_text);
_headertext = to!string(_bam.readString(l_text));
}
// initialize _reference_sequences
void readReferenceSequencesInfo() {
int n_ref;
_bam.read(n_ref);
_reference_sequences = new ReferenceSequenceInfo[n_ref];
foreach (i; 0..n_ref) {
_reference_sequences[i] = ReferenceSequenceInfo(_bam);
// provide mapping Name -> Index
_reference_sequence_dict[_reference_sequences[i].name] = i;
}
}
}

359
bio/bam/baseinfo.d

@ -0,0 +1,359 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.baseinfo;
import bio.core.base;
import bio.bam.read;
import bio.bam.fz.flowcall;
import std.range;
import std.conv;
import std.traits;
import std.typetuple;
///
struct MixinArg(T, string Tag) {
T value;
alias value this;
alias Tag TagName;
}
/// Wrapper for arguments to $(D basesWith) function (see below).
/// Required to distinguish to which tag each parameter refers.
MixinArg!(T, Tag) arg(string Tag, T)(T value) {
return MixinArg!(T, Tag)(value);
}
struct PerBaseInfo(R, Tags...) {
private alias TypeTuple!("CIGAR", Tags) Extensions;
// /////////////////////////////////////////////////////////////////////////
//
// Each 'extension' is a template with name TAGbaseInfo, containing
// a couple of mixin templates:
//
// * resultProperties
// These are additional properties provided by the template
//
// * rangeMethods
// These describe how to proceed to the next base.
// The following methods must be implemented:
//
// - void setup(Args...)(const ref R read, Args args);
// Gets called during range construction. All constructor
// arguments are forwarded, and it's this function which
// is responsible for getting required parameters for this
// particular template.
//
// - void populate(Result)(ref Result r);
// Populates fields of the result declared in resultProperties.
// Should run in O(1), just copying a few variables.
//
// - void update(const ref R read);
// Encapsulates logic of moving to the next base and updating
// mixin variables correspondingly.
//
// - void copy(Range)(const ref Range source, ref Range target);
// Gets called during $(D source.save). Therefore, any ranges
// used in mixin templates must be saved as well at that time.
//
// /////////////////////////////////////////////////////////////////////////
private static string getResultProperties(Exts...)() {
char[] result;
foreach (ext; Exts)
result ~= "mixin " ~ ext ~ "baseInfo!R.resultProperties;".dup;
return cast(string)result;
}
static struct Result {
Base base;
alias base this;
string opCast(T)() if (is(T == string))
{
return to!string(base);
}
mixin(getResultProperties!Extensions());
}
private static string getRangeMethods(Exts...)() {
char[] result;
foreach (ext; Exts)
result ~= "mixin " ~ ext ~ "baseInfo!R.rangeMethods " ~ ext ~ ";".dup;
return cast(string)result;
}
mixin(getRangeMethods!Extensions());
private void setup(string tag, Args...)(R read, Args args) {
mixin(tag ~ ".setup(read, args);");
}
private void populate(string tag)(ref Result r) {
mixin(tag ~ ".populate(r);");
}
private void update(string tag)() {
mixin(tag ~ ".update(_read);");
}
private void copy(string tag)(ref typeof(this) other) {
mixin(tag ~ ".copy(this, other);");
}
this(Args...)(R read, Args args) {
_read = read;
_seq = read.sequence;
_rev = read.is_reverse_strand;
foreach (t; Extensions) {
setup!t(read, args);
}
}
bool empty() @property const {
return _seq.empty;
}
Result front() @property {
Result r = void;
r.base = _rev ? _seq.back : _seq.front;
foreach (t; Extensions)
populate!t(r);
return r;
}
void popFront() {
moveToNextBase();
}
PerBaseInfo!(R, Tags) save() @property {
PerBaseInfo!(R, Tags) r = void;
r._read = _read.dup;
r._seq = r._read.sequence;
r._rev = _rev;
foreach (t; Extensions)
copy!t(r);
return r;
}
private void moveToNextBase() {
foreach (t; Extensions) {
update!t();
}
if (_rev)
_seq.popBack();
else
_seq.popFront();
}
private {
R _read;
typeof(_read.sequence) _seq;
bool _rev;
}
}
///
/// Collect per-base information from available tags.
/// Use $(D arg!TagName) to pass a parameter related to a particular tag.
///
/// Example:
///
/// basesWith!"FZ"(arg!"FZ"(flow_order));
///
template basesWith(Tags...) {
auto basesWith(R, Args...)(R read, Args args) {
return PerBaseInfo!(R, Tags)(read, args);
}
}
/// Provides additional property $(D flow_call).
template FZbaseInfo(R) {
mixin template resultProperties() {
/// Current flow call
ReadFlowCall flow_call() @property {
return _flow_call;
}
private {
ReadFlowCall _flow_call;
}
}
mixin template rangeMethods() {
private {
ForwardRange!ReadFlowCall _flow_calls;
ushort _at;
debug {
string _read_name;
}
}
void setup(Args...)(const ref R read, Args args)
{
string flow_order;
debug {
_read_name = read.read_name.idup;
}
enum argExists = staticIndexOf!(MixinArg!(string, "FZ"), Args);
static assert(argExists != -1, `Flow order must be provided via arg!"FZ"`);
foreach (arg; args) {
static if(is(typeof(arg) == MixinArg!(string, "FZ")))
flow_order = arg;
}
_flow_calls = readFlowCalls(read, flow_order);
_at = 0;
}
void populate(Result)(ref Result result) {
result._flow_call = _flow_calls.front;
debug {
if ((_rev && result.base != result._flow_call.base.complement)
|| (!_rev && result.base != result._flow_call.base)) {
import std.stdio;
stderr.writeln("invalid flow call at ", _read_name, ": ", result.position);
}
}
}
void update(const ref R read)
{
++_at;
if (_at == _flow_calls.front.length) {
_flow_calls.popFront();
_at = 0;
}
}
void copy(Range)(ref Range source, ref Range target) {
target.FZ._flow_calls = source._flow_calls.save();
target.FZ._at = source.FZ._at;
debug {
target._read_name = _read_name;
}
}
}
}
/// Provides additional properties
/// * position
/// * cigar_operation
template CIGARbaseInfo(R) {
mixin template resultProperties() {
/// Current CIGAR operation
CigarOperation cigar_operation() @property {
return _cigar_operation;
}
/// Position of the corresponding base on the reference.
/// If current CIGAR operation is not one of 'M', '=', 'X',
/// returns the position of the previous valid base.
ulong position() @property {
return _reference_position;
}
private {
CigarOperation _cigar_operation;
ulong _reference_position;
}
}
mixin template rangeMethods() {
private {
const(CigarOperation)[] _cigar;
long _index;
ulong _at;
ulong _ref_pos;
}
void setup(Args...)(const ref R read, Args)
{
_cigar = read.cigar;
_index = read.is_reverse_strand ? _cigar.length : -1;
_ref_pos = read.is_reverse_strand ? (read.position + read.basesCovered() - 1)
: read.position;
_moveToNextCigarOperator(read.is_reverse_strand);
}
void populate(Result)(ref Result result) {
result._cigar_operation = _cigar[_index];
result._reference_position = _ref_pos;
}
void update(const ref R read)
{
++_at;
if (_cigar[_index].is_reference_consuming) {
_ref_pos += read.is_reverse_strand ? -1 : 1;
}
if (_at == _cigar[_index].length) {
_moveToNextCigarOperator(read.is_reverse_strand);
}
}
void copy(Range)(const ref Range source, ref Range target) {
target.CIGAR._cigar = source.CIGAR._cigar;
target.CIGAR._index = source.CIGAR._index;
target.CIGAR._at = source.CIGAR._at;
target.CIGAR._ref_pos = source.CIGAR._ref_pos;
}
private void _moveToNextCigarOperator(bool reverse) {
_at = 0;
if (!reverse) {
for (++_index; _index < _cigar.length; ++_index)
{
if (_cigar[_index].is_query_consuming)
break;
if (_cigar[_index].is_reference_consuming)
_ref_pos += _cigar[_index].length;
}
} else {
for (--_index; _index >= 0; --_index) {
if (_cigar[_index].is_query_consuming)
break;
if (_cigar[_index].is_reference_consuming)
_ref_pos -= _cigar[_index].length;
}
}
}
}
}

120
bio/bam/bgzf/block.d

@ -0,0 +1,120 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bgzf.block;
import std.array : uninitializedArray;
import std.conv;
import std.zlib : crc32, ZlibException;
import etc.c.zlib;
/**
Structure representing BGZF block.
*/
struct BgzfBlock {
// field types are as in the SAM/BAM specification
// ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t
public ulong start_offset; /// start offset in the file, in bytes
/// end offset in the file, in bytes
public ulong end_offset() @property const {
return start_offset + bsize + 1;
}
public ushort bsize; /// total Block SIZE minus one
public ubyte[] compressed_data = void;
public uint crc32;
public uint input_size; /// size of uncompressed data
hash_t toHash() const pure @safe nothrow {
return crc32;
}
bool opEquals(const ref BgzfBlock other) pure @safe nothrow {
return crc32 == other.crc32;
}
int opCmp(const ref BgzfBlock other) const pure @safe nothrow {
return crc32 < other.crc32 ? -1 :
crc32 > other.crc32 ? 1 : 0;
}
}
/**
Struct representing decompressed BgzfBlock
Start offset is needed to be able to tell current virtual offset,
and yet be able to decompress blocks in parallel.
*/
struct DecompressedBgzfBlock {
ulong start_offset;
ulong end_offset;
ubyte[] decompressed_data;
}
/// Function for BGZF block decompression.
DecompressedBgzfBlock decompressBgzfBlock(BgzfBlock block) {
if (block.input_size == 0) {
return DecompressedBgzfBlock(block.start_offset,
block.start_offset + block.bsize + 1,
cast(ubyte[])[]); // EOF marker
// TODO: add check for correctness of EOF marker
}
int err;
ubyte[] uncompressed = uninitializedArray!(ubyte[])(block.input_size);
// set input data
etc.c.zlib.z_stream zs;
zs.next_in = cast(typeof(zs.next_in))block.compressed_data;
zs.avail_in = to!uint(block.compressed_data.length);
err = etc.c.zlib.inflateInit2(&zs, /* winbits = */-15);
if (err)
{
throw new ZlibException(err);
}
zs.next_out = cast(typeof(zs.next_out))uncompressed.ptr;
zs.avail_out = block.input_size;
err = etc.c.zlib.inflate(&zs, Z_FINISH);
switch (err)
{
case Z_STREAM_END:
assert(zs.total_out == block.input_size);
err = etc.c.zlib.inflateEnd(&zs);
if (err != Z_OK) {
throw new ZlibException(err);
}
break;
default:
etc.c.zlib.inflateEnd(&zs);
throw new ZlibException(err);
}
assert(block.crc32 == crc32(0, uncompressed));
return DecompressedBgzfBlock(block.start_offset,
block.start_offset + block.bsize + 1,
cast(ubyte[])uncompressed);
}

287
bio/bam/bgzf/blockrange.d

@ -0,0 +1,287 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bgzf.blockrange;