
moved BGZF-related code to bio.core.bgzf

remotes/georgeg/bam_output_redesign
lomereiter, 10 years ago
commit 9218e12a38
14 changed files:

  1. bio/bam/bai/chunk.d (2)
  2. bio/bam/baifile.d (2)
  3. bio/bam/bgzf/block.d (159)
  4. bio/bam/bgzf/blockrange.d (305)
  5. bio/bam/bgzf/compress.d (111)
  6. bio/bam/chunkinputstream.d (324)
  7. bio/bam/constants.d (24)
  8. bio/bam/output.d (2)
  9. bio/bam/randomaccessmanager.d (8)
  10. bio/bam/reader.d (6)
  11. bio/bam/readrange.d (4)
  12. bio/bam/reference.d (2)
  13. bio/bam/virtualoffset.d (90)
  14. test/unittests.d (3)

bio/bam/bai/chunk.d (2)

@@ -19,7 +19,7 @@
*/
module bio.bam.bai.chunk;
-import bio.bam.virtualoffset;
+import bio.core.bgzf.virtualoffset;
struct Chunk {
VirtualOffset beg; /// virtual file offset of the start of the chunk

bio/bam/baifile.d (2)

@@ -21,7 +21,7 @@ module bio.bam.baifile;
public import bio.bam.bai.chunk;
public import bio.bam.bai.bin;
-import bio.bam.virtualoffset;
+import bio.core.bgzf.virtualoffset;
import bio.bam.constants;
import std.stream;

bio/bam/bgzf/block.d (159)

@@ -1,159 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bgzf.block;
import bio.bam.constants;
import std.array : uninitializedArray;
import std.conv;
import std.zlib : crc32, ZlibException;
import etc.c.zlib;
import std.exception;
/**
Structure representing BGZF block.
In general, users shouldn't use it, as it is EXTREMELY low-level.
*/
struct BgzfBlock {
// field types are as in the SAM/BAM specification
// ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t
public ulong start_offset; /// start offset in the file, in bytes
/// end offset in the file, in bytes
public ulong end_offset() @property const {
return start_offset + bsize + 1;
}
public ushort bsize; /// total Block SIZE minus one
public ushort cdata_size; /// compressed data size
/// A buffer is used to reduce the number of allocations.
///
/// Its size is max(bsize + 1, input_size).
/// Initially it contains compressed data, but it is overwritten
/// during decompressBgzfBlock -- indeed, who cares about
/// compressed data after it has been uncompressed?
public ubyte[] _buffer = void;
/// If the block has already been decompressed, the result is undefined.
public ubyte[] compressed_data() @property {
return _buffer[0 .. cast(size_t)cdata_size];
}
public uint crc32;
public uint input_size; /// size of uncompressed data
hash_t toHash() const pure @safe nothrow {
// since the block can be either compressed or decompressed,
// returning CRC sum is the easiest and safest thing to do
return crc32;
}
bool opEquals(const ref BgzfBlock other) pure @safe nothrow {
return crc32 == other.crc32;
}
int opCmp(const ref BgzfBlock other) const pure @safe nothrow {
return crc32 < other.crc32 ? -1 :
crc32 > other.crc32 ? 1 : 0;
}
}
/**
Struct representing a decompressed BgzfBlock.

The start offset is kept so that the current virtual offset can be
computed, while still allowing blocks to be decompressed in parallel.
*/
struct DecompressedBgzfBlock {
ulong start_offset;
ulong end_offset;
ubyte[] decompressed_data;
}
/// Function for BGZF block decompression.
/// Reuses the buffer allocated for storing compressed data,
/// i.e. after execution the buffer of the passed $(D block)
/// is overwritten with uncompressed data.
DecompressedBgzfBlock decompressBgzfBlock(BgzfBlock block) {
if (block.input_size == 0) {
return DecompressedBgzfBlock(block.start_offset,
block.start_offset + block.bsize + 1,
cast(ubyte[])[]); // EOF marker
// TODO: add check for correctness of EOF marker
}
int err = void;
// allocate buffer on the stack
ubyte[BGZF_MAX_BLOCK_SIZE] uncompressed_buf = void;
// check that block follows BAM specification
enforce(block.input_size <= BGZF_MAX_BLOCK_SIZE,
"Uncompressed block size must be within " ~
to!string(BGZF_MAX_BLOCK_SIZE) ~ " bytes");
// for convenience, provide a slice
auto uncompressed = uncompressed_buf[0 .. block.input_size];
// set input data
etc.c.zlib.z_stream zs;
zs.next_in = cast(typeof(zs.next_in))block.compressed_data;
zs.avail_in = to!uint(block.compressed_data.length);
err = etc.c.zlib.inflateInit2(&zs, /* winbits = */-15);
if (err)
{
throw new ZlibException(err);
}
// uncompress it into a buffer on the stack
zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr;
zs.avail_out = block.input_size;
err = etc.c.zlib.inflate(&zs, Z_FINISH);
switch (err)
{
case Z_STREAM_END:
assert(zs.total_out == block.input_size);
err = etc.c.zlib.inflateEnd(&zs);
if (err != Z_OK) {
throw new ZlibException(err);
}
break;
default:
etc.c.zlib.inflateEnd(&zs);
throw new ZlibException(err);
}
assert(block.crc32 == crc32(0, uncompressed[]));
// Now copy back to block._buffer, overwriting existing data.
// It should have enough bytes already allocated.
assert(block._buffer.length >= block.input_size);
block._buffer[0 .. block.input_size] = uncompressed[];
return DecompressedBgzfBlock(block.start_offset,
block.start_offset + block.bsize + 1,
block._buffer[0 .. block.input_size]);
}
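For orientation, here is a minimal sketch of driving this API end to end: read raw blocks with BgzfRange and hand each one to decompressBgzfBlock. The module paths are the bio.core.bgzf.* ones this commit introduces, the file name is hypothetical, and error handling is omitted.

    import bio.core.bgzf.block;
    import bio.core.bgzf.blockrange;
    import std.stdio : writeln;
    import std.stream : BufferedFile;

    void main() {
        auto stream = new BufferedFile("example.bam"); // hypothetical input
        foreach (block; BgzfRange(stream)) {
            // each element carries its own start offset, which is what
            // later allows decompressing blocks out of order / in parallel
            auto d = decompressBgzfBlock(block);
            writeln(d.start_offset, "..", d.end_offset, ": ",
                    d.decompressed_data.length, " bytes");
        }
    }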

bio/bam/bgzf/blockrange.d (305)

@@ -1,305 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bgzf.blockrange;
public import bio.bam.bgzf.block;
import bio.bam.constants;
import std.stream;
import std.array : uninitializedArray;
import std.conv;
import std.algorithm : max;
/// Exception type, thrown in case of encountering corrupt BGZF blocks
class BgzfException : Exception {
this(string msg) { super(msg); }
}
/**
Range for iterating over BGZF blocks coming from any Stream
*/
struct BgzfRange {
// /////////////////////////////////////////////////////////////////////////
//
// | Here is the general picture of what happens.
// |
// | First of all, BgzfRange reads bytes from the stream and determines
// | boundaries of BGZF blocks. Elements of the range are blocks of
// | compressed data, together with their start offsets in the file.
// |
// | After that, blocks are decompressed, and another range comes into play.
// | Start offsets are still provided together with blocks, because they are
// | needed for random access, to calculate virtual offsets of alignments.
//
// - Virtual offset is a pair of numbers which uniquely identifies
// - location of an individual alignment record in the file. The first is
// - the start offset of the bgzf block in which the alignment record begins,
// - and the second is offset in decompressed data of that block.
// - Blocks are required to contain no more than 65536 bytes of uncompressed
// - data, and virtual offsets are stored as uint64_t numbers as follows:
// - [ {block offset (48 bits)} {offset in decompressed data (16 bits)} ]
// -
// - Relatively small size of BGZF blocks makes for fast random access,
// - still allowing good compression (about 3x).
//
// | Now that we have range of decompressed blocks, those blocks have to be
// | made into a proper input stream of bytes. BGZF specification does not
// | deal with alignment records, it deals with blocks of arbitrary data.
// | Therefore it's possible that some alignments will get split
// | across blocks, though nowadays most software which produces
// | BAM files avoids that.
// |
// | ChunkInputStream joins decompressed blocks into a stream, providing
// | virtualTell() method, which returns current virtual offset.
// |
// | Then, in case of BAM file, SAM header is read first, then comes some
// | basic information about reference sequences (names and lengths),
// | just so as not to duplicate it in alignment records and use integer
// | indices instead.
// |
// | After that, alignment records come. They are typically quite small,
// | about 100-1000 bytes depending on sequence length and amount of tags.
// |
// | In order to avoid copying memory, and, more importantly, allocating it
// | so frequently (GC is not well suited for 10000s of allocations/sec),
// | the input stream provides yet another method, readSlice, which tries
// | to return a slice of the underlying decompressed block, and allocates
// | memory only when that's impossible, which is rare.
//
// - There are also two possible policies for iterating alignments:
// - either packed together with their virtual offsets, or without them.
// - The first is used for random access, the second for serial iteration.
//
// -----------------------------------------------------------------------
// Picture summarizing the above description:
//
// Stream BgzfRange range of input range of
// decompressed stream alignment
// each block BGZF blocks, records
// is ~20kB each ~65kB
//
// ------- ---------. --------- ---------
// | r | | 1st | \ | f|d | | SAM |
// | a | -> | bgzf |\ -----> | i|e | | header|
// | w | | block | \ | r|c | |-------|
// | | |_______|\ \ | s|o | -> | r|s |
// | b | | 2nd | \ \ | t|m | | e|e |
// | y | -> | bgzf | \ ---> | p | | f|q|i |
// | t | | block | \ | r | | e|u|n |
// | e | |_______| \ | e|b | | r|e|f |
// | s | | 3rd | \ | s|l | -> | e|n|o |
// | . | -> | bgzf | \ | s|o | | n|c |
// | . | | block | ->| e|c | | c|e |
// | . | |-------| | d|k | | e|s |
// | . | -> | ... | | | |-------| --------
// | . | | | |-------| -> |records| -> |------|
// | . | | ... | | ... | | ... | -> |------|
//
// /////////////////////////////////////////////////////////////////////////
/**
Constructs range from stream
*/
this(Stream stream) {
_stream = stream;
_seekable = stream.seekable;
loadNextBlock();
}
/**
Returns: offset of the start of the current BGZF block
in underlying stream. If the stream is non-seekable,
the result is always 0.
*/
@property ulong start_offset() { return _start_offset; }
bool empty() @property {
return _empty;
}
void popFront() {
loadNextBlock();
}
BgzfBlock front() @property {
return _current_block;
}
private:
Stream _stream;
ulong _start_offset;
bool _empty = false;
bool _seekable = false;
BgzfBlock _current_block;
void throwBgzfException(string msg) {
throw new BgzfException("Error reading BGZF block starting from offset " ~
to!string(_start_offset) ~ ": " ~ msg);
}
void loadNextBlock() {
if (_seekable) {
_start_offset = _stream.position;
}
if (_stream.eof()) {
_empty = true; // indicate that range is now empty
version(development) {
import std.stdio;
stderr.writeln("[info][BGZF range] EOF, current offset is ", _stream.position);
}
return;
}
try {
uint bgzf_magic = void;
// TODO: fix byte order if needed
auto bytes_read = _stream.read((cast(ubyte*)&bgzf_magic)[0 .. 4]);
if (bytes_read == 0) {
_empty = true;
version(development) {
import std.stdio;
stderr.writeln("[info][BGZF range] end of stream, current offset is ", _stream.position);
}
return;
// TODO: check if last BGZF block was empty, and if not throw a warning
}
if (bgzf_magic != BGZF_MAGIC) {
throwBgzfException("wrong BGZF magic");
}
ushort gzip_extra_length = void;
if (_seekable) {
_stream.seekCur(uint.sizeof + 2 * ubyte.sizeof);
} else {
uint gzip_mod_time = void;
ubyte gzip_extra_flags = void;
ubyte gzip_os = void;
_stream.read(gzip_mod_time);
_stream.read(gzip_extra_flags);
_stream.read(gzip_os);
}
_stream.read(gzip_extra_length);
ushort bsize = void; // total Block SIZE minus 1
bool found_block_size = false;
// read extra subfields
size_t len = 0;
while (len < gzip_extra_length) {
ubyte si1 = void; // Subfield Identifier1
ubyte si2 = void; // Subfield Identifier2
ushort slen = void; // Subfield LENgth
_stream.read(si1);
_stream.read(si2);
_stream.read(slen);
if (si1 == BAM_SI1 && si2 == BAM_SI2) {
// found 'BC' as subfield identifier
if (slen != 2) {
throwBgzfException("wrong BC subfield length: " ~
to!string(slen) ~ "; expected 2");
}
if (found_block_size) {
throwBgzfException("duplicate field with block size");
}
// read block size
_stream.read(bsize);
found_block_size = true;
// skip the rest
if (_seekable) {
_stream.seekCur(slen - bsize.sizeof);
} else {
_stream.readString(slen - bsize.sizeof);
}
} else {
// this subfield has nothing to do with block size,
// just skip
if (_seekable) {
_stream.seekCur(slen);
} else {
_stream.readString(slen);
}
}
len += si1.sizeof + si2.sizeof + slen.sizeof + slen;
}
if (len != gzip_extra_length) {
throwBgzfException("total length of subfields in bytes (" ~
to!string(len) ~
") is not equal to gzip_extra_length (" ~
to!string(gzip_extra_length) ~ ")");
}
if (!found_block_size) {
throwBgzfException("block size was not found in any subfield");
}
// read compressed data
auto cdata_size = bsize - gzip_extra_length - 19;
if (cdata_size > BGZF_MAX_BLOCK_SIZE) {
throwBgzfException("compressed data size is more than " ~
to!string(BGZF_MAX_BLOCK_SIZE) ~
" bytes, which is not allowed by " ~
"current BAM specification");
}
_current_block.bsize = bsize;
_current_block.cdata_size = cast(ushort)cdata_size;
ubyte[BGZF_MAX_BLOCK_SIZE] _buffer = void;
_stream.readExact(_buffer.ptr, cdata_size);
_stream.read(_current_block.crc32);
_stream.read(_current_block.input_size);
// now, this is a feature: allocate max(input_size, cdata_size).
// this way, only 1 allocation is done per block instead of 2.
// (see comments in bio.bam.bgzf.block about reusing this memory)
auto _buf_size = max(_current_block.input_size, cdata_size);
_current_block._buffer = uninitializedArray!(ubyte[])(_buf_size);
// copy compressed data at the start of the block
_current_block._buffer[0 .. cdata_size] = _buffer[0 .. cdata_size];
_current_block.start_offset = start_offset;
return;
} catch (ReadException e) {
throwBgzfException("stream error: " ~ e.msg);
}
assert(0);
}
}
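Since each BgzfRange element is a self-contained value (offsets plus compressed bytes), decompression parallelizes naturally; the randomaccessmanager.d hunk further down does exactly this with task_pool.map. A hedged sketch using std.parallelism, again with a hypothetical file name:

    import bio.core.bgzf.blockrange; // assumed to publicly import the block module
    import std.parallelism : taskPool;
    import std.stream : BufferedFile;

    void main() {
        auto file = new BufferedFile("example.bam");
        // taskPool.map is lazy, decompresses several blocks concurrently,
        // and still yields them in their original order
        foreach (d; taskPool.map!decompressBgzfBlock(BgzfRange(file))) {
            // d.decompressed_data holds at most 65536 bytes
        }
    }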

bio/bam/bgzf/compress.d (111)

@@ -1,111 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.bgzf.compress;
import bio.bam.constants;
import etc.c.zlib;
import std.zlib : crc32, ZlibException;
import std.array;
import std.system;
import core.bitop;
/// Returns a BGZF block containing the compressed $(D chunk).
/// If $(D buffer) is provided, it will be used for storing the block.
///
/// Params:
/// chunk = chunk of memory to be compressed
/// level = compression level, see zlib documentation
/// buffer = optional buffer which will be used for storing
/// the compressed block
///
/// For uncompressed BAM output, use level = 0.
ubyte[] bgzfCompress(ubyte[] chunk, int level, ubyte[] buffer=null)
in
{
assert(-1 <= level && level <= 9);
}
body
{
assert(etc.c.zlib.compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
if (buffer is null) {
buffer = uninitializedArray!(ubyte[])(BGZF_MAX_BLOCK_SIZE);
} else {
buffer.length = BGZF_MAX_BLOCK_SIZE;
}
// write header
buffer[0 .. BLOCK_HEADER_LENGTH - ushort.sizeof] = BLOCK_HEADER_START[];
etc.c.zlib.z_stream zs;
zs.zalloc = null;
zs.zfree = null;
zs.next_in = cast(ubyte*)chunk.ptr;
zs.avail_in = cast(uint)chunk.length;
zs.next_out = buffer.ptr + BLOCK_HEADER_LENGTH;
zs.avail_out = cast(int)(buffer.length - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH);
auto err = etc.c.zlib.deflateInit2(&zs, /* compression level */ level,
/* deflated compression method */ Z_DEFLATED,
/* winbits (no header) */ -15,
/* memory usage level (default) */ 8,
/* default compression strategy */ Z_DEFAULT_STRATEGY);
if (err != Z_OK) {
throw new ZlibException(err);
}
err = etc.c.zlib.deflate(&zs, Z_FINISH);
if (err != Z_STREAM_END) {
throw new ZlibException(err);
}
err = etc.c.zlib.deflateEnd(&zs);
if (err != Z_OK) {
throw new ZlibException(err);
}
// almost done, update buffer length
buffer.length = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
// Write (block length - 1) in BC subfield.
// Why -1? To fit the value into 2 bytes (it's assumed to be in range 1-65536).
ushort len = cast(ushort)(buffer.length - 1);
buffer[BLOCK_HEADER_LENGTH - 2] = len & 0xFF; // little endian
buffer[BLOCK_HEADER_LENGTH - 1] = len >> 8;
// Write the footer
*(cast(uint*)(buffer.ptr + buffer.length - 8)) = crc32(0, chunk);
*(cast(uint*)(buffer.ptr + buffer.length - 4)) = cast(uint)chunk.length;
uint* ptr;
if (std.system.endian != Endian.littleEndian) {
ptr = cast(uint*)(buffer.ptr + buffer.length - 8);
*ptr = bswap(*ptr);
ptr = cast(uint*)(buffer.ptr + buffer.length - 4);
*ptr = bswap(*ptr);
}
return buffer;
}
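A minimal usage sketch, assuming the post-move import path; writeBgzfBlock is an illustrative helper rather than library API, and chunk is assumed to be no longer than BGZF_BLOCK_SIZE (the limit behind the compressBound assertion above):

    import bio.core.bgzf.compress;
    import std.stream : Stream;

    void writeBgzfBlock(Stream output, ubyte[] chunk) {
        // level -1 selects zlib's default; level 0 stores the data uncompressed
        ubyte[] block = bgzfCompress(chunk, -1);
        output.writeExact(block.ptr, block.length);
    }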

bio/bam/chunkinputstream.d (324)

@@ -1,324 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.chunkinputstream;
import bio.bam.bgzf.block;
import bio.bam.virtualoffset;
import std.stream;
import std.range;
import std.array;
import std.traits;
import std.algorithm;
/// Interface on top of Stream, which glues decompressed blocks
/// together and provides virtualTell() method for determining
/// virtual offset in the stream.
///
/// NOTE: the class has no virtualSeek method, because an
/// implementing class should have no direct access to any of
/// the underlying streams (which provide chunks and their
/// start virtual offsets): the single responsibility principle.
///
/// If you need to do "virtualSeek", seek to coffset
/// in the stream providing compressed blocks, create a new
/// stream for decompressing, and skip uoffset bytes in the
/// decompressed stream (see the sketch after this class).
/// In general, you will probably want a different decompression
/// function for each use case: no caching for serial iteration,
/// caching for random access, and maybe several types of caching.
///
abstract class IChunkInputStream : Stream {
/// Returns: current virtual offset
///
/// (For details about what virtual offset is,
/// see bgzfrange module documentation.)
///
abstract VirtualOffset virtualTell();
/// Read a slice from chunk stream.
abstract ubyte[] readSlice(size_t n);
/// Average compression so far
abstract float average_compression_ratio() @property const;
/// Size of underlying BAM file (0 if not available).
abstract size_t compressed_file_size() @property const;
}
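/*
   A hedged sketch of the "virtualSeek" recipe from the NOTE above,
   assuming the bio.core.bgzf.* layout this commit introduces and plain
   serial decompression without caching; seekToVirtualOffset is an
   illustrative name, not part of the library.
*/
IChunkInputStream seekToVirtualOffset(Stream compressed, VirtualOffset vo) {
    import bio.core.bgzf.blockrange;            // assumed home of BgzfRange
    import std.algorithm : map;
    compressed.seekSet(cast(long)vo.coffset);   // 1. jump to the BGZF block
    auto blocks = map!decompressBgzfBlock(BgzfRange(compressed));
    auto stream = makeChunkInputStream(blocks); // 2. fresh decompressing stream
    stream.readString(vo.uoffset);              // 3. skip uoffset bytes
    return stream;                              // now positioned exactly at vo
}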
/// Extends DecompressedBgzfBlock with skip_start and skip_end members.
struct AugmentedDecompressedBgzfBlock {
DecompressedBgzfBlock block;
alias block this;
ushort skip_start; /// how many bytes to skip from start
ushort skip_end; /// how many bytes to skip from end
}
/// Convenience function for making decompressed blocks compatible with ChunkInputStream.
AugmentedDecompressedBgzfBlock makeAugmentedBlock(DecompressedBgzfBlock block) {
return AugmentedDecompressedBgzfBlock(block, 0, 0);
}
/**
Class for turning a range of AugmentedDecompressedBgzfBlock objects
into a proper InputStream.

NOTE: it is assumed that calling empty() on the range is cheap;
front(), however, is called only once per element, which fits
the philosophy of std.algorithm.map.
*/
final class ChunkInputStream(ChunkRange) : IChunkInputStream
{
static assert(is(ElementType!ChunkRange == AugmentedDecompressedBgzfBlock));
/// Construct class instance from range of chunks.
this(ChunkRange range, size_t compressed_file_size=0)
{
_range = range;
readable = true;
writeable = false;
seekable = false;
if (_range.empty) {
setEOF();
}
_its_time_to_get_next_chunk = true; // defer getting first chunk
_compressed_file_size = compressed_file_size;
}
/// Read a slice from chunk stream.
///
/// Behaviour: when current chunk contains >= n bytes,
/// a chunk slice is returned.
/// Otherwise, memory copying occurs.
override ubyte[] readSlice(size_t n) {
if (_range.empty()) {
_start_offset = _end_offset;
_cur = 0;
setEOF();
return null;
}
if (_its_time_to_get_next_chunk) {
setupStream();
}
if (_cur + n < _len) {
// can return a slice,
// remaining in the current chunk
auto slice = _buf[_cur .. _cur + n];
_cur += n;
return slice;
} else if (_cur + n == _len) {
// end of current chunk
auto slice = _buf[_cur .. _len];
_range.popFront();
_its_time_to_get_next_chunk = true;
return slice;
} else {
// this branch will be executed rarely
// (wish the compiler could understand that...)
auto slice = uninitializedArray!(ubyte[])(n);
readExact(slice.ptr, n);
return slice;
}
}
override size_t readBlock(void* buffer, size_t size) {
if (_range.empty()) {
// no more chunks
_start_offset = _end_offset;
_cur = 0;
setEOF();
return 0;
}
if (_its_time_to_get_next_chunk) {
setupStream();
}
size_t read = readBlockHelper(buffer, size);
if (read < size) {
_range.popFront(); // get next chunk
setupStream();
}
if (read == 0) { // try with next chunk
read = readBlockHelper(buffer, size);
}
if (_cur == _len) { // end of current chunk
_range.popFront();
_its_time_to_get_next_chunk = true;
}
return read;
}
override size_t writeBlock(const void* buffer, size_t size) {
throw new WriteException("Stream is not writeable");
}
override ulong seek(long offset, SeekPos whence) {
throw new SeekException("Stream is not seekable");
}
/// Returns: current virtual offset
override VirtualOffset virtualTell() {
assert(_cur < (1<<16));
if (_its_time_to_get_next_chunk) {
setupStream();
}
return VirtualOffset(_start_offset, cast(ushort)_cur);
}
override float average_compression_ratio() @property const {
if (_total_compressed == 0) return 0.0;
return cast(float)_total_uncompressed/_total_compressed;
}
override size_t compressed_file_size() @property const {
return _compressed_file_size;
}
private:
ChunkRange _range;
typeof(_range.front.start_offset) _start_offset;
typeof(_range.front.end_offset) _end_offset;
typeof(_range.front.decompressed_data) _buf;
size_t _compressed_file_size;
size_t _len; // current data length
size_t _cur; // current position
size_t _total_compressed; // compressed bytes read so far
size_t _total_uncompressed; // uncompressed size of blocks read so far
bool _its_time_to_get_next_chunk;
// Effect:
//
// _range is untouched (only empty() and front() are used)
//
// _buf is filled with the contents of _range.front.decompressed_data
//
// _cur, _start_offset, _end_offset, _total_uncompressed, and
// _total_compressed variables are updated appropriately
//
// If _range.front.decompressed_data.length == 0, readEOF is set
// (the reason is that this means we've just encountered a special
// BGZF block, indicating end-of-file).
//
// _its_time_to_get_next_chunk is set back to false
void setupStream() {
_its_time_to_get_next_chunk = false;
_cur = 0;
// we don't rely on the caller to set _start_offset and
// _end_offset appropriately
if (_range.empty) {
_start_offset = _end_offset;
setEOF();
return;
}
auto tmp = _range.front; // _range might be lazy, so extra
                         // front() calls can cost a lot
debug {
/*
import std.stdio;
if (tmp.skip_start != 0 || tmp.skip_end != 0) {
writeln("reading partial decompressed block: ");
writeln(" skip_start = ", tmp.skip_start);
writeln(" skip_end = ", tmp.skip_end);
writeln(" start_offset = ", tmp.start_offset);
writeln(" end_offset = ", tmp.end_offset);
}
*/
}
_cur = tmp.skip_start;
_start_offset = tmp.start_offset;
_end_offset = tmp.end_offset;
_total_compressed += _end_offset - _start_offset - tmp.skip_start - tmp.skip_end;
_buf = tmp.decompressed_data[0 .. $ - tmp.skip_end];
_len = _buf.length;
_total_uncompressed += _len;
if (_len == 0) {
setEOF();
}
return;
}
version(development)
{
final void setEOF() {
import std.stdio;
std.stdio.stderr.writeln("[info][chunkinputstream] len == 0, start offset = ", _start_offset,
", end offset = ", _end_offset);
readEOF = true;
}
} else {
final void setEOF() { readEOF = true; }
}
size_t readBlockHelper(void* buffer, size_t size) {
ubyte* cbuf = cast(ubyte*) buffer;
if (size + _cur > _len)
size = cast(size_t)(_len - _cur);
ubyte[] ubuf = cast(ubyte[])_buf[cast(size_t)_cur .. cast(size_t)(_cur + size)];
cbuf[0 .. size] = ubuf[];
_cur += size;
return size;
}
}
/**
Returns: input stream wrapping given range of memory chunks
*/
auto makeChunkInputStream(R)(R range, size_t compressed_file_size=0)
if (is(Unqual!(ElementType!R) == AugmentedDecompressedBgzfBlock))
{
return new ChunkInputStream!R(range, compressed_file_size);
}
/// ditto
auto makeChunkInputStream(R)(R range, size_t compressed_file_size=0)
if (is(Unqual!(ElementType!R) == DecompressedBgzfBlock))
{
return makeChunkInputStream(map!makeAugmentedBlock(range), compressed_file_size);
}
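Putting the pieces together: a hedged sketch that glues decompressed blocks into a stream, reads the BAM magic, and queries the virtual offset. Module paths are the post-move ones; the file name is hypothetical.

    import bio.core.bgzf.block;
    import bio.core.bgzf.blockrange;
    import bio.core.bgzf.chunkinputstream;
    import std.algorithm : map;
    import std.stream : BufferedFile;

    void main() {
        auto file = new BufferedFile("example.bam");
        auto stream = makeChunkInputStream(map!decompressBgzfBlock(BgzfRange(file)));
        ubyte[4] magic;
        stream.readExact(magic.ptr, 4); // "BAM\1" for a valid BAM file
        auto vo = stream.virtualTell(); // virtual offset just past the magic
    }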

bio/bam/constants.d (24)

@@ -19,33 +19,13 @@
*/
module bio.bam.constants;
+public import bio.core.bgzf.constants;
immutable BAM_MAGIC = "BAM\1";
immutable BAI_MAGIC = "BAI\1";
immutable ubyte BAM_SI1 = 66;
immutable ubyte BAM_SI2 = 67;
-immutable BGZF_MAGIC = 0x04_08_8B_1F; // little endian
-immutable ubyte[16] BLOCK_HEADER_START =
-[ 31, 139, 8, 4, // BGZF magic
-0, 0, 0, 0, // GZIP modification time
-0, // GZIP extra flags
-255, // GZIP OS identifier
-6, 0, // GZIP extra length == 6 (LE)
-66, 67, // Subfield 'BC'
-2, 0]; // Subfield length (holds 1 ushort)
-// BGZF block header length in bytes.
-// Block header holds BLOCK_HEADER_START + block size (ushort)
-immutable BLOCK_HEADER_LENGTH = BLOCK_HEADER_START.length + ushort.sizeof;
-// BGZF footer holds CRC32 and size of decompressed block.
-immutable BLOCK_FOOTER_LENGTH = uint.sizeof + uint.sizeof;
-immutable BGZF_MAX_BLOCK_SIZE = 65536;
-immutable BGZF_BLOCK_SIZE = 0xFF00;
immutable ubyte[28] BAM_EOF =
[31, 139, 8, 4,
0, 0, 0, 0,

bio/bam/output.d (2)

@@ -23,8 +23,8 @@ import bio.sam.header;
import bio.bam.reference;
import bio.bam.read;
import bio.bam.readrange;
-import bio.bam.bgzf.compress;
import bio.bam.constants;
+import bio.core.bgzf.compress;
import std.stream;
import std.system;

bio/bam/randomaccessmanager.d (8)

@@ -22,15 +22,15 @@
*/
module bio.bam.randomaccessmanager;
-import bio.bam.bgzf.blockrange;
-import bio.bam.virtualoffset;
import bio.bam.constants;
import bio.bam.read;
import bio.bam.readrange;
-import bio.bam.chunkinputstream;
import bio.bam.baifile;
import bio.bam.bai.utils.algo;
+import bio.core.bgzf.blockrange;
+import bio.core.bgzf.virtualoffset;
+import bio.core.bgzf.chunkinputstream;
import bio.core.utils.memoize;
import bio.core.utils.range;
import bio.core.utils.stream;
@@ -140,7 +140,7 @@ class RandomAccessManager {
if (task_pool is null) {
return helper(map!decompressSerial(bgzf_range), offset);
} else {
-return helper(task_pool.map!(bio.bam.bgzf.blockrange.decompressBgzfBlock)(bgzf_range), offset);
+return helper(task_pool.map!(bio.core.bgzf.blockrange.decompressBgzfBlock)(bgzf_range), offset);
}
}

bio/bam/reader.d (6)

@@ -22,15 +22,15 @@ module bio.bam.reader;
public import bio.sam.header;
public import bio.bam.reference;
public import bio.bam.read;
-public import bio.bam.virtualoffset;
public import bio.bam.tagvalue;
public import bio.bam.readrange;
-import bio.bam.bgzf.blockrange;
-import bio.bam.chunkinputstream;
import bio.bam.randomaccessmanager;
import bio.bam.baifile;
import bio.core.utils.range;
import bio.core.utils.stream;
+import bio.core.bgzf.blockrange;
+import bio.core.bgzf.chunkinputstream;
+public import bio.core.bgzf.virtualoffset;
import std.system;
import std.stdio;

bio/bam/readrange.d (4)

@@ -19,9 +19,9 @@
*/
module bio.bam.readrange;
-import bio.bam.virtualoffset;
-import bio.bam.chunkinputstream;
import bio.bam.read;
+import bio.core.bgzf.chunkinputstream;
+import bio.core.bgzf.virtualoffset;
import bio.core.utils.switchendianness;
import std.stream;

bio/bam/reference.d (2)

@@ -21,7 +21,7 @@ module bio.bam.reference;
import bio.bam.randomaccessmanager;
import bio.bam.readrange;
-import bio.bam.virtualoffset;
+import bio.core.bgzf.virtualoffset;
import std.stream;
import std.exception;

bio/bam/virtualoffset.d (90)

@@ -1,90 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.virtualoffset;
import std.conv;
/// Structure representing virtual offset in BAM file.
struct VirtualOffset {
/// Params:
///
/// coffset = unsigned byte offset into the BGZF file
/// to the beginning of a BGZF block.
/// Must be strictly less than 2^48.
///
/// uoffset = unsigned byte offset into the uncompressed
/// data stream represented by that BGZF block
this(ulong coffset, ushort uoffset) nothrow
in {
assert(coffset < (1UL<<48));
}
body {
voffset = (coffset << 16) | uoffset;
}
/// Set both coffset and uoffset packed as (coffset<<16)|uoffset
this(ulong voffset) nothrow {
this.voffset = voffset;
}
/// Byte offset of the BGZF block (the upper 48 bits)
ulong coffset() @property const nothrow {
return voffset >> 16;
}
/// Offset within the uncompressed block (the lower 16 bits)
ushort uoffset() @property const nothrow {
return voffset & 0xFFFF;
}
int opCmp(const ref VirtualOffset other) const nothrow {
if (this.voffset > other.voffset) { return 1; }
if (this.voffset < other.voffset) { return -1; }
return 0;
}
bool opEquals(const ref VirtualOffset other) const nothrow {
return this.voffset == other.voffset;
}
bool opEquals(ulong voffset) const nothrow {
return opEquals(VirtualOffset(voffset));
}
ulong opCast() const nothrow {
return voffset;
}
/// String representation in format "<coffset>/<uoffset>"
string toString() {
return to!string(coffset) ~ "/" ~ to!string(uoffset);
}
private:
ulong voffset;
}
unittest {
auto voffset = VirtualOffset(100500, 42);
assert(voffset.coffset == 100500);
assert(voffset.uoffset == 42);
assert(voffset == (100500UL << 16) + 42UL);
assert(cast(ulong)voffset == (100500UL << 16) + 42UL);
}

test/unittests.d (3)

@@ -20,7 +20,7 @@
import bio.sam.reader;
import bio.sam.header;
-import bio.bam.bgzf.blockrange;
+import bio.core.bgzf.blockrange;
import bio.bam.reader;
import bio.bam.output;
import bio.bam.md.reconstruct;
@@ -34,7 +34,6 @@ import bio.bam.serialization.sam;
import bio.core.utils.tmpfile;
import std.path;
import std.range;
import std.stdio;
