You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

1129 lines
39 KiB

/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.read;
import bio.core.base;
import bio.bam.tagvalue;
import bio.bam.bai.bin;
import bio.bam.utils.array;
import bio.bam.utils.value;
import bio.core.utils.switchendianness;
import bio.bam.thirdparty.msgpack : Packer, unpack;
version(unittest) {
import bio.bam.utils.tagstoragebuilder;
import std.stdio;
}
import std.algorithm;
import std.range;
import std.conv;
import std.exception;
import std.stream;
import std.system;
import std.traits;
import std.array;
/**
Represents single CIGAR operation
*/
struct CigarOperation {
static assert(CigarOperation.sizeof == uint.sizeof);
/*
WARNING!
It is very essential that the size of
this struct is EXACTLY equal to uint.sizeof!
The reason is to avoid copying of arrays during alignment parsing.
Namely, when some_pointer points to raw cigar data,
we can just do a cast. This allows to access those data
directly, not doing any memory allocations.
*/
private uint raw; /// raw data from BAM
private static ubyte char2op(char c) {
switch(c) {
case 'M': return 0;
case 'I': return 1;
case 'D': return 2;
case 'N': return 3;
case 'S': return 4;
case 'H': return 5;
case 'P': return 6;
case '=': return 7;
case 'X': return 8;
default: return 8; // FIXME
}
}
this(uint length, char operation) {
enforce(length < (1<<28), "Too big length of CIGAR operation");
raw = (length << 4) | char2op(operation);
}
/// operation length
uint length() @property const nothrow {
return raw >> 4;
}
/// CIGAR operation as one of MIDNSHP=X
char operation() @property const nothrow {
if ((raw & 0xF) > 8) {
/* FIXME: what to do in this case? */
return '\0';
}
return "MIDNSHP=X"[raw & 0xF];
}
///
alias operation this;
// Each pair of bits has first bit set iff the operation is query consuming,
// and second bit set iff it is reference consuming.
// X = P H S N D I M
private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
/// True iff operation is one of M, =, X, I, S
bool is_query_consuming() @property const {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
}
/// True iff operation is one of M, =, X, D, N
bool is_reference_consuming() @property const {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
}
}
/**
Represents single read
*/
struct BamRead {
mixin TagStorage;
// TODO: better names for properties
@property int ref_id() const nothrow { return _refID; }
/// 0-based leftmost coordinate of the first matching base
@property int position() const nothrow { return _pos; }
/// Indexing bin which this read belongs to.
@property Bin bin() const nothrow { return Bin(_bin); }
/// Mapping quality. Equals to 255 if not available, otherwise
/// equals to rounded -10 * log10(P {mapping position is wrong}).
@property ubyte mapping_quality() const nothrow { return _mapq; }
@property ushort flag() const nothrow { return _flag; }
@property int sequence_length() const nothrow { return _l_seq; }
@property int next_ref_id() const nothrow { return _next_refID; }
@property int next_pos() const nothrow { return _next_pos; }
@property int template_length() const nothrow { return _tlen; }
/// Set reference id.
@property void ref_id(int n) { _dup(); _refID = n; }
/// Sets 0-based leftmost coordinate. Bin is automatically recalculated.
@property void position(int n) { _dup(); _pos = n; _recalculate_bin(); }
/// Set mapping quality
@property void mapping_quality(ubyte n) { _dup(); _mapq = n; }
/// Set flag
@property void flag(ushort n) { _dup(); _flag = n; }
/// Set mate reference id.
@property void next_ref_id(int n) { _dup(); _next_refID = n; }
/// Set mate position
@property void next_pos(int n) { _dup(); _next_pos = n; }
/// Set template length
@property void template_length(int n) { _dup(); _tlen = n; }
/// Template having multiple segments in sequencing
@property bool is_paired() const nothrow { return cast(bool)(flag & 0x1); }
/// ditto
@property void is_paired(bool b) { _setFlag( 0, b); }
/// Each segment properly aligned according to the aligner
@property bool proper_pair() const nothrow { return cast(bool)(flag & 0x2); }
/// ditto
@property void proper_pair(bool b) { _setFlag( 1, b); }
/// Segment unmapped
@property bool is_unmapped() const nothrow { return cast(bool)(flag & 0x4); }
/// ditto
@property void is_unmapped(bool b) { _setFlag( 2, b); }
/// Next segment in the template unmapped
@property bool mate_is_unmapped() const nothrow { return cast(bool)(flag & 0x8); }
/// ditto
@property void mate_is_unmapped(bool b) { _setFlag( 3, b); }
/// Sequence being reverse complemented
@property bool is_reverse_strand() const nothrow { return cast(bool)(flag & 0x10); }
/// ditto
@property void is_reverse_strand(bool b) { _setFlag( 4, b); }
/// Sequence of the next segment in the template being reversed
@property bool mate_is_reverse_strand() const nothrow { return cast(bool)(flag & 0x20); }
/// ditto
@property void mate_is_reverse_strand(bool b) { _setFlag( 5, b); }
/// The first segment in the template
@property bool is_first_of_pair() const nothrow { return cast(bool)(flag & 0x40); }
/// ditto
@property void is_first_of_pair(bool b) { _setFlag( 6, b); }
/// The last segment in the template
@property bool is_second_of_pair() const nothrow { return cast(bool)(flag & 0x80); }
/// ditto
@property void is_second_of_pair(bool b) { _setFlag( 7, b); }
/// Secondary alignment
@property bool is_secondary_alignment() const nothrow { return cast(bool)(flag & 0x100); }
/// ditto
@property void is_secondary_alignment(bool b) { _setFlag( 8, b); }
/// Not passing quality controls
@property bool failed_quality_control() const nothrow { return cast(bool)(flag & 0x200); }
/// ditto
@property void failed_quality_control(bool b) { _setFlag( 9, b); }
/// PCR or optical duplicate
@property bool is_duplicate() const nothrow { return cast(bool)(flag & 0x400); }
/// ditto
@property void is_duplicate(bool b) { _setFlag(10, b); }
/// Convenience function, returns '+' or '-' indicating the strand.
@property char strand() const nothrow {
return is_reverse_strand ? '-' : '+';
}
/// Read name
@property string read_name() const nothrow {
// notice -1: the string is zero-terminated, so we should strip that '\0'
return cast(string)(_chunk[_read_name_offset .. _read_name_offset + _l_read_name - 1]);
}
/// ditto
@property void read_name(string name) {
enforce(name.length >= 1 && name.length <= 255, "name length must be in 1-255 range");
_dup();
bio.bam.utils.array.replaceSlice(_chunk,
_chunk[_read_name_offset .. _read_name_offset + _l_read_name - 1],
cast(ubyte[])name);
_l_read_name = cast(ubyte)(name.length + 1);
}
/// List of CIGAR operations
@property const(CigarOperation)[] cigar() const nothrow {
return cast(const(CigarOperation)[])(_chunk[_cigar_offset .. _cigar_offset +
_n_cigar_op * CigarOperation.sizeof]);
}
/// ditto
@property void cigar(const(CigarOperation)[] c) {
_dup();
bio.bam.utils.array.replaceSlice(_chunk,
_chunk[_cigar_offset .. _cigar_offset + _n_cigar_op * CigarOperation.sizeof],
cast(ubyte[])c);
_n_cigar_op = cast(ushort)(c.length);
_recalculate_bin();
}
/// The number of reference bases covered
int basesCovered() const {
if (this.is_unmapped) {
return 0; // actually, valid alignments should have empty cigar string
}
return reduce!"a + b.length"(0, filter!"a.is_reference_consuming"(cigar));
}
/// Human-readable representation of CIGAR string
string cigarString() {
char[] str;
// guess size of resulting string
str.reserve(_n_cigar_op * 3);
foreach (cigar_op; cigar) {
str ~= to!string(cigar_op.length);
str ~= cigar_op.operation;
}
return cast(string)str;
}
/// Sequence data
@property const(ubyte)[] raw_sequence_data() const nothrow {
return _chunk[_seq_offset .. _seq_offset + (_l_seq + 1) / 2];
}
static struct SequenceResult {
private ubyte[] _data;
private size_t _len;
private size_t _index;
private bool _use_first_4_bits;
this(const(ubyte[]) data, size_t len, bool use_first_4_bits=true) {
_data = cast(ubyte[])data;
_len = len;
_use_first_4_bits = use_first_4_bits;
}
@property bool empty() const {
return _index >= _len;
}
@property Base front() const {
return opIndex(0);
}
@property Base back() const {
return opIndex(_len - 1);
}
private auto _getActualPosition(size_t index) const
{
if (_use_first_4_bits) {
// [0 1] [2 3] [4 5] [6 7] ...
// |
// V
// 0 1 2 3
return index >> 1;
} else {
// [. 0] [1 2] [3 4] [5 6] ...
// |
// V
// 0 1 2 3
return (index >> 1) + (index & 1);
}
}
private auto _useFirst4Bits(size_t index) const
{
auto res = index % 2 == 0;
if (!_use_first_4_bits) {
res = !res;
}
return res;
}
@property SequenceResult save() const {
return SequenceResult(_data[_getActualPosition(_index) .. $],
_len - _index,
_useFirst4Bits(_index));
}
SequenceResult opSlice(size_t i, size_t j) const {
return SequenceResult(_data[_getActualPosition(_index + i) .. $],
j - i,
_useFirst4Bits(_index + i));
}
@property Base opIndex(size_t i) const {
auto pos = _index + i;
ubyte raw = _data[_getActualPosition(pos)];
if (_use_first_4_bits) {
if (pos & 1) {
raw &= 0xF;
} else {
raw >>= 4;
}
} else {
if (pos & 1) {
raw >>= 4;
} else {
raw &= 0xF;
}
}
return Base.fromInternalCode(raw);
}
void popFront() {
++_index;
}
void popBack() {
--_len;
}
@property size_t length() const {
return _len - _index;
}
}
/// Random-access range of characters
@property auto sequence() const {
return SequenceResult(raw_sequence_data, sequence_length);
}
static assert(isRandomAccessRange!(ReturnType!sequence));
/// Set sequence. Must be of the same length as current sequence.
@property void sequence(string seq) {
enforce(seq.length == _l_seq, "Sequence must have the same length as current");
_dup();
auto _raw_length = (seq.length + 1) / 2;
// set sequence
ubyte[] _seq = _chunk[_seq_offset .. _seq_offset + _raw_length];
for (size_t i = 0; i < _raw_length; ++i) {
_seq[i] = cast(ubyte)(Base(seq[2 * i]).internal_code << 4);
if (seq.length > 2 * i + 1)
_seq[i] |= cast(ubyte)(Base(seq[2 * i + 1]).internal_code);
}
}
/// Quality data
@property const(ubyte)[] phred_base_quality() const nothrow {
return _chunk[_qual_offset .. _qual_offset + _l_seq * char.sizeof];
}
/// Set quality data - array length must be of the same length as the sequence.
@property void phred_base_quality(const(ubyte)[] quality) {
enforce(quality.length == _l_seq, "Quality data must be of the same length as sequence");
_dup();
_chunk[_qual_offset .. _qual_offset + _l_seq] = quality;
}
/**
Constructs the struct from memory chunk
*/
this(ubyte[] chunk) {
// Switching endianness lazily is not a good idea:
//
// 1) switching byte order is pretty fast
// 2) lazy switching for arrays can kill the performance,
// it has to be done once
// 3) the code will be too complicated, whereas there're
// not so many users of big-endian systems
//
// In summa, BAM is little-endian format, so big-endian
// users will suffer anyway, it's unavoidable.
this._is_slice = true;
_chunk = chunk;
if (std.system.endian != Endian.littleEndian) {
// First 8 fields are 32-bit integers:
//
// 0) refID int
// 1) pos int
// 2) bin_mq_nl uint
// 3) flag_nc uint
// 4) l_seq int
// 5) next_refID int
// 6) next_pos int
// 7) tlen int
// ----------------------------------------------------
// (after them read_name follows which is string)
//
switchEndianness(_chunk.ptr, 8 * uint.sizeof);
// Then we need to switch endianness of CIGAR data:
switchEndianness(_chunk.ptr + _cigar_offset,
_n_cigar_op * uint.sizeof);
// Dealing with tags is the responsibility of TagStorage.
}
// create from chunk of little-endian memory
if (std.system.endian != Endian.littleEndian) {
fixTagStorageByteOrder();
}
}
private size_t calculateChunkSize(string read_name,
string sequence,
in CigarOperation[] cigar)
{
return 8 * int.sizeof
+ (read_name.length + 1) // tailing '\0'
+ uint.sizeof * cigar.length
+ ubyte.sizeof * ((sequence.length + 1) / 2)
+ ubyte.sizeof * sequence.length;
}
/// Construct alignment from basic information about it.
///
/// Other fields can be set afterwards.
this(string read_name, // info for developers:
string sequence, // these 3 fields are needed
in CigarOperation[] cigar) // to calculate size of _chunk
{
enforce(read_name.length < 256, "Too long read name, length must be <= 255");
this._is_slice = false;
if (this._chunk is null) {
this._chunk = new ubyte[calculateChunkSize(read_name, sequence, cigar)];
}
this._refID = -1; // set default values
this._pos = -1; // according to SAM/BAM
this._mapq = 255; // specification
this._next_refID = -1;
this._next_pos = -1;
this._tlen = 0;
this._l_read_name = cast(ubyte)(read_name.length + 1); // tailing '\0'
this._n_cigar_op = cast(ushort)(cigar.length);
this._l_seq = cast(int)(sequence.length);
// now all offsets can be calculated through corresponding properties
// set default quality
_chunk[_qual_offset .. _qual_offset + sequence.length] = 0xFF;
// set CIGAR data
auto _len = cigar.length * CigarOperation.sizeof;
_chunk[_cigar_offset .. _cigar_offset + _len] = cast(ubyte[])(cigar);
// set read_name
auto _offset = _read_name_offset;
_chunk[_offset .. _offset + read_name.length] = cast(ubyte[])read_name;
_chunk[_offset + read_name.length] = cast(ubyte)'\0';
this.sequence = sequence;
}
/// Low-level constructor for setting tag data on construction.
/// This allows to use less reallocations when creating an alignment
/// from scratch, by reusing memory for collecting tags.
/// Typically, you would use this constructor in conjunction with
/// utils.tagstoragebuilder module.
this(string read_name,
string sequence,
in CigarOperation[] cigar,
in ubyte[] tag_data)
{
_chunk = new ubyte[calculateChunkSize(read_name, sequence, cigar)
+ tag_data.length];
this(read_name, sequence, cigar);
_chunk[_tags_offset .. $] = tag_data;
}
/// Deep copy of an alignment.
BamRead dup() @property const {
BamRead result;
result._chunk = this._chunk.dup;
result._is_slice = false;
return result;
}
/// Compare two alignments, including tags
/// (they must be in the same order for equality).
bool opEquals(const ref BamRead other) const pure nothrow {
// in D, array comparison compares elements, not pointers.
return this._chunk == other._chunk;
}
/// ditto
bool opEquals(BamRead other) const pure nothrow {
return this._chunk == other._chunk;
}
/// Size of alignment when output to stream in BAM format.
/// Includes block_size as well (see SAM/BAM specification)
@property auto size_in_bytes() const {
return int.sizeof + _chunk.length;
}
/// Write alignment to EndianStream, together with block_size
/// and auxiliary data.
void write(EndianStream stream) {
stream.write(cast(int)(_chunk.length));
stream.writeExact(_chunk.ptr, _tags_offset);
writeTags(stream);
}
////////////////////////////////////////////////////////////////////////////
///
/// Packs message in the following format:
/// MsgPack array with elements
/// 0) read name - string
/// 1) flag - ushort
/// 2) reference sequence id - int
/// 3) leftmost mapping position (1-based) - int
/// 4) mapping quality - ubyte
/// 5,6) CIGAR:
/// array of lengths (int[])
/// array of operations (ubyte[])
/// 7) mate reference sequence id - int
/// 8) mate position (1-based) - int
/// 9) template length - int
/// 10) segment sequence - string
/// 11) phred-base quality - array of ubyte
/// 12) tags - map: string -> value
///
////////////////////////////////////////////////////////////////////////////
void toMsgpack(Packer)(ref Packer packer) const {
packer.beginArray(13);
packer.pack(cast(ubyte[])read_name);
packer.pack(flag);
packer.pack(ref_id);
packer.pack(position + 1);
packer.pack(mapping_quality);
packer.pack(array(map!"a.length"(cigar)));
packer.pack(array(map!"a.operation"(cigar)));
packer.pack(next_ref_id);
packer.pack(next_pos);
packer.pack(template_length);
packer.pack(to!string(sequence));
packer.pack(phred_base_quality);
packer.beginMap(tagCount());
foreach (key, value; this) {
packer.pack(key);
packer.pack(value);
}
}
ubyte[] _chunk; /// holds all the data,
/// the access is organized via properties
/// (see below)
private:
bool _is_slice; /// indicates whether _chunk is a slice or an allocated array.
/// Official field names from SAM/BAM specification.
/// For internal use only
@property int _refID() const nothrow {
return *(cast( int*)(_chunk.ptr + int.sizeof * 0));
}
@property int _pos() const nothrow {
return *(cast( int*)(_chunk.ptr + int.sizeof * 1));
}
@property uint _bin_mq_nl() const nothrow {
return *(cast(uint*)(_chunk.ptr + int.sizeof * 2));
}
@property uint _flag_nc() const nothrow {
return *(cast(uint*)(_chunk.ptr + int.sizeof * 3));
}
@property int _l_seq() const nothrow {
return *(cast( int*)(_chunk.ptr + int.sizeof * 4));
}
@property int _next_refID() const nothrow {
return *(cast( int*)(_chunk.ptr + int.sizeof * 5));
}
@property int _next_pos() const nothrow {
return *(cast( int*)(_chunk.ptr + int.sizeof * 6));
}
@property int _tlen() const nothrow {
return *(cast( int*)(_chunk.ptr + int.sizeof * 7));
}
/// Setters, also only for internal use
@property void _refID(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 0)) = n; }
@property void _pos(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 1)) = n; }
@property void _bin_mq_nl(uint n) { *(cast(uint*)(_chunk.ptr + int.sizeof * 2)) = n; }
@property void _flag_nc(uint n) { *(cast(uint*)(_chunk.ptr + int.sizeof * 3)) = n; }
@property void _l_seq(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 4)) = n; }
@property void _next_refID(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 5)) = n; }
@property void _next_pos(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 6)) = n; }
@property void _tlen(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 7)) = n; }
/// Additional useful properties, also from SAM/BAM specification
///
/// The layout of bin_mq_nl and flag_nc is as follows
/// (upper bits -------> lower bits):
///
/// bin_mq_nl [ { bin (16b) } { mapping quality (8b) } { read name length (8b) } ]
///
/// flag_nc [ { flag (16b) } { n_cigar_op (16b) } ]
///
@property ushort _bin() const nothrow {
return _bin_mq_nl >> 16;
}
@property ubyte _mapq() const nothrow {
return (_bin_mq_nl >> 8) & 0xFF;
}
@property ubyte _l_read_name() const nothrow {
return _bin_mq_nl & 0xFF;
}
@property ushort _flag() const nothrow {
return _flag_nc >> 16;
}
@property ushort _n_cigar_op() const nothrow {
return _flag_nc & 0xFFFF;
}
/// Setters for those properties
@property void _bin(ushort n) { _bin_mq_nl = (_bin_mq_nl & 0xFFFF) | (n << 16); }
@property void _mapq(ubyte n) { _bin_mq_nl = (_bin_mq_nl & ~0xFF00) | (n << 8); }
@property void _l_read_name(ubyte n) { _bin_mq_nl = (_bin_mq_nl & ~0xFF ) | n; }
@property void _flag(ushort n) { _flag_nc = (_flag_nc & 0xFFFF) | (n << 16); }
@property void _n_cigar_op(ushort n) { _flag_nc = (_flag_nc & ~0xFFFF) | n; }
/// Offsets of various arrays in bytes.
/// Currently, are computed each time, so if speed will be an issue,
/// they can be made fields instead of properties.
@property size_t _read_name_offset() const nothrow {
return 8 * int.sizeof;
}
@property size_t _cigar_offset() const nothrow {
return _read_name_offset + _l_read_name * char.sizeof;
}
@property size_t _seq_offset() const nothrow {
return _cigar_offset + _n_cigar_op * uint.sizeof;
}
@property size_t _qual_offset() const nothrow {
return _seq_offset + (_l_seq + 1) / 2 * ubyte.sizeof;
}
/// Offset of auxiliary data
@property size_t _tags_offset() const nothrow {
return _qual_offset + _l_seq * char.sizeof;
}
/// Sets n-th flag bit to boolean value b.
void _setFlag(int n, bool b) {
assert(n < 16);
// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching
ushort mask = cast(ushort)(1 << n);
_flag = (_flag & ~mask) | ((-cast(int)b) & mask);
}
/// If _chunk is still a slice, not an array, duplicate it.
/// Used when some part of alignment record is modified by user.
///
/// Basically, it's sort of copy-on-write: a lot of read-only alignments
/// may point to the same location, but every modified one allocates its
/// own chunk of memory.
void _dup() {
if (_is_slice) {
_is_slice = false;
_chunk = _chunk.dup;
}
}
/// Calculates bin number.
void _recalculate_bin() {
_bin = reg2bin(position, position + basesCovered());
}
}
///////////////////////////////////////////////////////////////////////////////
///
/// Lazy tag storage.
///
/// Provides hash-like access (currently read-only) and opportunity to iterate
/// storage like an associative array.
///
///////////////////////////////////////////////////////////////////////////////
mixin template TagStorage() {
////////////////////////////////////////////////////////////////////////////
///
/// Provides access to chunk of memory which contains tags
/// This way, every time _tags_offset gets updated
/// (due to update of cigar string of read name and memory move),
/// the change is reflected automatically in tag storage.
///
////////////////////////////////////////////////////////////////////////////
private @property const(ubyte)[] _tags_chunk() const {
return _chunk[_tags_offset .. $];
}
////////////////////////////////////////////////////////////////////////////
///
/// Hash-like access to tags. Time complexity is O(#tags).
///
////////////////////////////////////////////////////////////////////////////
Value opIndex(string key) const {
enforce(key.length == 2, "Key length must be 2");
auto __tags_chunk = _tags_chunk; // _tags_chunk is evaluated lazily
if (__tags_chunk.length < 4)
return Value(null);
size_t offset = 0;
while (offset + 1 < __tags_chunk.length) {
if (__tags_chunk[offset .. offset + 2] == key) {
offset += 2;
return readValue(offset, __tags_chunk);
} else {
offset += 2;
skipValue(offset, __tags_chunk);
}
}
return Value(null);
}
/// ditto
void opIndexAssign(T)(T value, string key)
if (is(T == Value) || __traits(compiles, GetTypeId!T))
{
static if(is(T == Value)) {
enforce(key.length == 2, "Key length must be 2");
auto __tags_chunk = _tags_chunk;
_dup();
size_t offset = 0;
while (offset + 1 < __tags_chunk.length) {
if (__tags_chunk[offset .. offset + 2] == key) {
replaceValueAt(offset + 2, value);
return;
} else {
offset += 2;
skipValue(offset, __tags_chunk);
}
}
appendTag(key, value);
} else {
opIndexAssign(Value(value), key);
}
}
/// Append new tag to the end, skipping check if it already exists. O(1)
void appendTag(string key, Value value) {
auto oldlen = _chunk.length;
_chunk.length = _chunk.length + sizeInBytes(value) + 2 * char.sizeof;
_chunk[oldlen .. oldlen + 2] = cast(ubyte[])key;
emplaceValue(_chunk.ptr + oldlen + 2, value);
}
/// Remove all tags
void clearAllTags() {
_chunk.length = _tags_offset;
}
/// Number of tags
size_t tagCount() {
size_t result = 0;
size_t offset = 0;
auto __tags_chunk = _tags_chunk;
while (offset + 1 < __tags_chunk.length) {
offset += 2;
skipValue(offset, __tags_chunk);
result += 1;
}
return result;
}
/// replace existing tag
private void replaceValueAt(size_t offset, Value value) {
// offset points to the beginning of the value
auto begin = offset;
auto __tags_chunk = _tags_chunk;
skipValue(offset, __tags_chunk); // now offset is updated and points to the end
auto end = offset;
prepareSlice(_chunk, __tags_chunk[begin .. end], sizeInBytes(value));
emplaceValue(_chunk.ptr + _tags_offset + begin, value);
}
/////////////////////////////////////////////////////////////////////////////
/// Provides opportunity to iterate over tags.
/////////////////////////////////////////////////////////////////////////////
int opApply(scope int delegate(const ref string k, const ref Value v) dg) const {
size_t offset = 0;
auto __tags_chunk = _tags_chunk;
while (offset + 1 < __tags_chunk.length) {
auto key = cast(string)__tags_chunk[offset .. offset + 2];
offset += 2;
auto val = readValue(offset, __tags_chunk);
auto res = dg(key, val);
if (res != 0) {
return res;
}
}
return 0;
}
////////////////////////////////////////////////////////////////////////////
/// Returns the number of tags. Time complexity is O(#tags)
////////////////////////////////////////////////////////////////////////////
size_t tagCount() const {
size_t res = 0;
size_t offset = 0;
auto __tags_chunk = _tags_chunk;
while (offset + 1 < __tags_chunk.length) {
offset += 2;
skipValue(offset, __tags_chunk);
res += 1;
}
return res;
}
/// Writes auxiliary data to output stream
private void writeTags(Stream stream) {
if (std.system.endian == Endian.littleEndian) {
stream.writeExact(_tags_chunk.ptr, _tags_chunk.length);
} else {
fixTagStorageByteOrder(); // FIXME: should modify on-the-fly
stream.writeExact(_tags_chunk.ptr, _tags_chunk.length); // during writing to the stream
fixTagStorageByteOrder();
}
}
////////////////////////////////////////////////////////////////////////////
/// Reads value which starts from (_tags_chunk.ptr + offset) address,
/// and updates offset to the end of value. O(1)
////////////////////////////////////////////////////////////////////////////
private Value readValue(ref size_t offset, const(ubyte)[] tags_chunk) const {
string readValueArrayTypeHelper() {
char[] cases;
foreach (c2t; ArrayElementTagValueTypes) {
cases ~=
"case '"~c2t.ch~"':".dup~
" auto begin = offset;"~
" auto end = offset + length * "~c2t.ValueType.stringof~".sizeof;"~
" offset = end;"~
" return Value(cast("~c2t.ValueType.stringof~"[])(tags_chunk[begin .. end]));";
}
return to!string("switch (elem_type) {" ~ cases ~
" default: throw new UnknownTagTypeException(to!string(elem_type));"~
"}");
}
string readValuePrimitiveTypeHelper() {
char[] cases;
foreach (c2t; PrimitiveTagValueTypes) {
cases ~= "case '"~c2t.ch~"':"~
" auto p = tags_chunk.ptr + offset;"~
" auto value = *(cast("~c2t.ValueType.stringof~"*)p);"~
" offset += value.sizeof;"~
" return Value(value);".dup;
}
return to!string("switch (type) {" ~ cases ~
" default: throw new UnknownTagTypeException(to!string(type));"~
"}");
}
char type = cast(char)tags_chunk[offset++];
if (type == 'Z' || type == 'H') {
auto begin = offset;
while (tags_chunk[offset++] != 0) {}
// return string with stripped '\0'
auto v = Value(cast(string)tags_chunk[begin .. offset - 1]);
if (type == 'H') {
v.setHexadecimalFlag();
}
return v;
} else if (type == 'B') {
char elem_type = cast(char)tags_chunk[offset++];
uint length = *(cast(uint*)(tags_chunk.ptr + offset));
offset += uint.sizeof;
mixin(readValueArrayTypeHelper());
} else {
mixin(readValuePrimitiveTypeHelper());
}
}
/**
Increases offset so that it points to the next value. O(1).
*/
private void skipValue(ref size_t offset, const(ubyte)[] tags_chunk) const {
char type = cast(char)tags_chunk[offset++];
if (type == 'Z' || type == 'H') {
while (tags_chunk[offset++] != 0) {}
} else if (type == 'B') {
char elem_type = cast(char)tags_chunk[offset++];
auto length = *(cast(uint*)(tags_chunk.ptr + offset));
offset += uint.sizeof + charToSizeof(elem_type) * length;
} else {
offset += charToSizeof(type);
}
}
/**
Intended to be used in constructor for initial endianness fixing
in case the library is used on big-endian system.
NOT TESTED AT ALL!!!
*/
private void fixTagStorageByteOrder() {
/* TODO: TEST ON BIG-ENDIAN SYSTEM!!! */
const(ubyte)* p = _tags_chunk.ptr;
const(ubyte)* end = p + _chunk.length;
while (p < end) {
p += 2; // skip tag name
char type = *(cast(char*)p);
++p; // skip type
if (type == 'Z' || type == 'H') {
while (*p != 0) { // zero-terminated
++p; // string
}
++p; // skip '\0'
} else if (type == 'B') { // array
char elem_type = *(cast(char*)p);
uint size = charToSizeof(elem_type);
switchEndianness(p, uint.sizeof);
uint length = *(cast(uint*)p);
p += uint.sizeof; // skip length
if (size != 1) {
for (auto j = 0; j < length; j++) {
switchEndianness(p, size);
p += size;
}
} else {
// skip
p += length;
}
} else {
uint size = charToSizeof(type);
if (size != 1) {
switchEndianness(p, size);
p += size;
} else {
++p;
}
}
}
}
}
unittest {
import std.algorithm;
import std.stdio;
import std.math;
auto read = BamRead("readname",
"AGCTGACTACGTAATAGCCCTA",
[CigarOperation(22, 'M')]);
assert(read.sequence_length == 22);
assert(read.cigar.length == 1);
assert(read.cigarString() == "22M");
assert(read.read_name == "readname");
assert(equal(read.sequence(), "AGCTGACTACGTAATAGCCCTA"));
read.read_name = "anothername";
assert(read.read_name == "anothername");
assert(read.cigarString() == "22M");
read.phred_base_quality = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22];
assert(reduce!"a+b"(0, read.phred_base_quality) == 253);
read.cigar = [CigarOperation(20, 'M'), CigarOperation(2, 'X')];
assert(read.cigarString() == "20M2X");
read.sequence = "AGCTGGCTACGTAATAGCCCTA";
assert(equal(read.sequence(), "AGCTGGCTACGTAATAGCCCTA"));
assert(retro(read.sequence)[3] == 'C');
assert(retro(read.sequence)[1] == 'T');
assert(read.sequence[4] == 'G');
assert(read.sequence[0] == 'A');
assert(equal(read.sequence[0..8], "AGCTGGCT"));
assert(equal(read.sequence[3..5], "TG"));
assert(equal(read.sequence[3..9][1..4], "GGC"));
read["RG"] = 15;
assert(read["RG"] == 15);
read["X1"] = [1, 2, 3, 4, 5];
assert(read["X1"] == [1, 2, 3, 4, 5]);
read["RG"] = cast(float)5.6;
assert(approxEqual(to!float(read["RG"]), 5.6));
read["X1"] = 42;
assert(read["X1"] == 42);
assert(read.tagCount() == 2);
// Test tagstoragebuilder
auto builder = new TagStorageBuilder();
builder.put("X0", Value(24));
builder.put("X1", Value("abcd"));
builder.put("X2", Value([1,2,3]));
read = BamRead("readname",
"AGCTGACTACGTAATAGCCCTA",
[CigarOperation(22, 'M')],
builder.data);
assert(read["X0"] == 24);
assert(read["X1"] == "abcd");
assert(read["X2"] == [1,2,3]);
assert(read.tagCount() == 3);
// Test MsgPack serialization/deserialization
{
import std.typecons;
auto packer = bio.bam.thirdparty.msgpack.packer(Appender!(ubyte[])());
read.toMsgpack(packer);
auto data = packer.stream.data;
auto rec = unpack(data).via.array;
assert(rec[0] == "readname");
assert(rec[5].as!(int[]) == [22]);
assert(rec[6].as!(ubyte[]) == ['M']);
assert(rec[10].as!(ubyte[]) == to!string(read.sequence));
}
read.clearAllTags();
assert(read.tagCount() == 0);
}
/// BamRead wrapper which precomputes $(D end_position) = $(D position) + $(D basesCovered()).
///
/// Computation of basesCovered() takes quite a few cycles. Therefore in places where this
/// property is frequently accessed, it makes sense to precompute it for later use.
///
/// The idea is that this should be a drop-in replacement for BamRead in algorithms,
/// as the struct uses 'alias this' construction for the wrapped read.
struct EagerBamRead {
this(BamRead read) {
this.read = read;
this.end_position = read.position + read.basesCovered();
}
///
BamRead read;
///
alias read this;
/// End position on the reference, computed as position + basesCovered().
int end_position;
EagerBamRead dup() @property const {
return EagerBamRead(read.dup);
}
}