Browse Source

implemented idiomatic toString for BamRead

Other improvements:

* _is_slice is now hackishly contained in the chunk
  (name is zero terminated, so one byte is available)

* Each BamRead now contains a pointer to the reader

* SamReader and BamReader now share common interface
  bio.bam.abstractreader.IBamSamReader
remotes/georgeg/no_streams
lomereiter 9 years ago
parent
commit
bf12b493f4
  1. 42
      bio/bam/abstractreader.d
  2. 23
      bio/bam/randomaccessmanager.d
  3. 116
      bio/bam/read.d
  4. 24
      bio/bam/reader.d
  5. 12
      bio/bam/readrange.d
  6. 6
      bio/bam/reference.d
  7. 6
      bio/bam/serialization/sam.d
  8. 26
      bio/bam/tagvalue.d
  9. 4
      bio/bam/writer.d
  10. 32
      bio/sam/reader.d
  11. 11
      test/unittests.d

42
bio/bam/abstractreader.d

@ -0,0 +1,42 @@
/*
This file is part of BioD.
Copyright (C) 2013 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.abstractreader;
import bio.sam.header;
import bio.bam.read;
import bio.bam.referenceinfo;
public import std.range;
/// Common interface for $(DPREF2 bam, reader, BamReader)
/// and $(DPREF2 sam, reader, SamReader).
interface IBamSamReader {
/// SAM header
bio.sam.header.SamHeader header() @property;
/// Information about reference sequences
const(bio.bam.referenceinfo.ReferenceSequenceInfo)[] reference_sequences() @property const;
/// All reads in the file
std.range.InputRange!(bio.bam.read.BamRead) allReads() @property;
/// Filename
string filename() @property const;
}

23
bio/bam/randomaccessmanager.d

@ -23,6 +23,7 @@
module bio.bam.randomaccessmanager;
import bio.bam.constants;
import bio.bam.reader;
import bio.bam.read;
import bio.bam.readrange;
import bio.bam.baifile;
@ -75,6 +76,12 @@ class RandomAccessManager {
_filename = filename;
}
/// ditto
this(BamReader reader) {
_reader = reader;
_filename = reader.filename;
}
/// Constructs new manager with given index file.
/// This allows to do random-access interval queries.
///
@ -82,12 +89,19 @@ class RandomAccessManager {
/// filename = location of BAM file
/// bai = index file
this(string filename, ref BaiFile bai) {
_filename = filename;
_bai = bai;
_found_index_file = true;
}
/// ditto
this(BamReader reader, ref BaiFile bai) {
_reader = reader;
_filename = reader.filename;
_bai = bai;
_found_index_file = true;
}
/// If file ends with EOF block, returns virtual offset of the start of EOF block.
/// Otherwise, returns virtual offset of the physical end of file.
VirtualOffset eofVirtualOffset() const {
@ -148,7 +162,7 @@ class RandomAccessManager {
/// Every time new stream is used.
BamRead getReadAt(VirtualOffset offset) {
auto stream = createStreamStartingFrom(offset);
return bamReadRange(stream).front.dup;
return bamReadRange(stream, _reader).front.dup;
}
/// Get BGZF block at a given offset.
@ -169,7 +183,7 @@ class RandomAccessManager {
return record.end_virtual_offset > vo;
}
return until!offsetTooBig(bamReadRange!withOffsets(stream), to);
return until!offsetTooBig(bamReadRange!withOffsets(stream, _reader), to);
}
bool found_index_file() @property {
@ -256,7 +270,7 @@ class RandomAccessManager {
auto decompressed_blocks = getUnpackedBlocks(bgzf_range); // (3)
auto augmented_blocks = getAugmentedBlocks(decompressed_blocks, chunks); // (4)
IChunkInputStream stream = makeChunkInputStream(augmented_blocks); // (5)
auto reads = bamReadRange!IteratePolicy(stream);
auto reads = bamReadRange!IteratePolicy(stream, _reader);
return filterBamReads(reads, ref_id, beg, end); // (6)
}
@ -264,6 +278,7 @@ private:
string _filename;
BaiFile _bai;
BamReader _reader;
public:

116
bio/bam/read.d

@ -45,6 +45,7 @@ module bio.bam.read;
import bio.core.base;
import bio.bam.abstractreader;
import bio.bam.writer;
import bio.bam.tagvalue;
import bio.bam.bai.bin;
@ -63,6 +64,7 @@ version(unittest) {
import std.algorithm;
import std.range;
import std.conv;
import std.format;
import std.exception;
import std.system;
import std.traits;
@ -143,8 +145,8 @@ struct CigarOperation {
}
///
string toString() const {
return to!string(length) ~ type;
void toString(scope void delegate(const(char)[]) sink) const {
sink.formattedWrite("%s%s", length, type);
}
}
@ -512,9 +514,9 @@ struct BamRead {
// In summa, BAM is little-endian format, so big-endian
// users will suffer anyway, it's unavoidable.
_chunk = chunk;
this._is_slice = true;
_chunk = chunk;
if (std.system.endian != Endian.littleEndian) {
switchChunkEndianness();
@ -566,8 +568,6 @@ struct BamRead {
{
enforce(read_name.length < 256, "Too long read name, length must be <= 255");
this._is_slice = false;
if (this._chunk is null) {
this._chunk = new ubyte[calculateChunkSize(read_name, sequence, cigar)];
}
@ -597,6 +597,8 @@ struct BamRead {
_chunk[_offset .. _offset + read_name.length] = cast(ubyte[])read_name;
_chunk[_offset + read_name.length] = cast(ubyte)'\0';
this._is_slice = false;
this.sequence = sequence;
}
@ -621,19 +623,24 @@ struct BamRead {
BamRead result;
result._chunk = this._chunk.dup;
result._is_slice = false;
result._reader = cast()_reader;
return result;
}
/// Compare two alignments, including tags
/// (the tags must follow in the same order for equality).
bool opEquals(const ref BamRead other) const pure nothrow {
// in D, array comparison compares elements, not pointers.
return this._chunk == other._chunk;
// don't forget about _is_slice trick
auto m = _cigar_offset;
return _chunk[0 .. m - 1] == other._chunk[0 .. m - 1] &&
_chunk[m .. $] == other._chunk[m .. $];
}
/// ditto
bool opEquals(BamRead other) const pure nothrow {
return this._chunk == other._chunk;
auto m = _cigar_offset;
return _chunk[0 .. m - 1] == other._chunk[0 .. m - 1] &&
_chunk[m .. $] == other._chunk[m .. $];
}
/// Size of the alignment record when output to stream in BAM format.
@ -645,6 +652,9 @@ struct BamRead {
package void write(ref BamWriter writer) {
writer.writeInteger(cast(int)(_chunk.length));
ubyte old_byte = _chunk[_cigar_offset - 1];
_chunk[_cigar_offset - 1] = 0;
if (std.system.endian != Endian.littleEndian) {
switchChunkEndianness();
writer.writeByteArray(_chunk[0 .. _tags_offset]);
@ -653,6 +663,8 @@ struct BamRead {
writer.writeByteArray(_chunk[0 .. _tags_offset]);
}
_chunk[_cigar_offset - 1] = old_byte;
writeTags(writer);
}
@ -694,6 +706,69 @@ struct BamRead {
packer.pack(value);
}
}
///
void toString(scope void delegate(const(char)[]) sink) const {
sink(name);
sink("\t");
sink.formattedWrite("%d\t", flag);
if (ref_id == -1 || _reader is null)
sink("*");
else
sink(_reader.reference_sequences[ref_id].name);
sink.formattedWrite("\t%d\t%d\t", position + 1, mapping_quality);
if (cigar.length == 0)
sink("*\t");
else
sink.formattedWrite("%(%s%)\t", cigar);
if (mate_ref_id == ref_id) {
if (mate_ref_id == -1)
sink("*\t");
else
sink("=\t");
} else {
if (mate_ref_id == -1 || _reader is null) {
sink("*\t");
} else {
auto mate_name = _reader.reference_sequences[mate_ref_id].name;
sink(mate_name);
sink("\t");
}
}
sink.formattedWrite("%d\t%d\t", mate_position + 1, template_length);
if (sequence_length == 0)
sink("*");
else
foreach (char c; sequence)
sink.formattedWrite("%c", c);
sink("\t");
if (base_qualities.length == 0 || base_qualities[0] == 0xFF)
sink("*");
else
foreach (qual; base_qualities)
sink.formattedWrite("%s", cast(char)(qual + 33));
foreach (k, v; this) {
sink("\t");
sink(k);
sink(":");
v.formatSam(sink);
}
}
/// Associates read with BAM reader. This is done automatically
/// if this read is obtained through BamReader/Reference methods.
void associateWithReader(bio.bam.abstractreader.IBamSamReader reader) {
_reader = reader;
}
/// Associated BAM/SAM reader.
bio.bam.abstractreader.IBamSamReader reader() @property {
return _reader;
}
package ubyte[] _chunk; // holds all the data,
// the access is organized via properties
@ -701,7 +776,20 @@ struct BamRead {
private:
bool _is_slice; // indicates whether _chunk is a slice or an allocated array.
// by specs, name ends with '\0'
// let's use this byte for something useful!
//
// (Of course this places some restrictions on usage,
// but allows to reduce size of record.)
bool _is_slice() @property const {
return cast(bool)_chunk[_cigar_offset - 1];
}
void _is_slice(bool is_slice) @property {
_chunk[_cigar_offset - 1] = is_slice ? 1 : 0;
}
IBamSamReader _reader;
// Official field names from SAM/BAM specification.
// For internal use only
@ -713,7 +801,7 @@ private:
return *(cast( int*)(_chunk.ptr + int.sizeof * 1));
}
@property uint _bin_mq_nl() const nothrow {
@property uint _bin_mq_nl() const nothrow pure @system {
return *(cast(uint*)(_chunk.ptr + int.sizeof * 2));
}
@ -762,7 +850,7 @@ private:
@property ubyte _mapq() const nothrow {
return (_bin_mq_nl >> 8) & 0xFF;
}
@property ubyte _l_read_name() const nothrow {
@property ubyte _l_read_name() const nothrow pure {
return _bin_mq_nl & 0xFF;
}
@property ushort _flag() const nothrow {
@ -782,11 +870,11 @@ private:
// Offsets of various arrays in bytes.
// Currently, are computed each time, so if speed will be an issue,
// they can be made fields instead of properties.
@property size_t _read_name_offset() const nothrow {
@property size_t _read_name_offset() const nothrow pure {
return 8 * int.sizeof;
}
@property size_t _cigar_offset() const nothrow {
@property size_t _cigar_offset() const nothrow pure {
return _read_name_offset + _l_read_name * char.sizeof;
}
@ -819,8 +907,8 @@ private:
// own chunk of memory.
void _dup() {
if (_is_slice) {
_is_slice = false;
_chunk = _chunk.dup;
_is_slice = false;
}
}

24
bio/bam/reader.d

@ -41,6 +41,7 @@
/// ------------------------
module bio.bam.reader;
import bio.bam.abstractreader;
public import bio.sam.header;
public import bio.bam.reference;
public import bio.bam.read;
@ -50,13 +51,13 @@ import bio.bam.randomaccessmanager;
import bio.bam.baifile;
import bio.core.utils.range;
import bio.core.utils.stream;
import bio.core.bgzf.blockrange;
import bio.core.bgzf.inputstream;
public import bio.core.bgzf.blockrange;
public import bio.core.bgzf.inputstream;
public import bio.core.bgzf.virtualoffset;
import std.system;
import std.stdio;
import std.algorithm : map, min;
import std.algorithm;
import std.range : zip;
import std.conv : to;
import std.exception : enforce;
@ -68,7 +69,7 @@ import std.string;
/**
BAM file reader, featuring parallel decompression of BGZF blocks.
*/
class BamReader {
class BamReader : IBamSamReader {
/**
Creates reader associated with file or stream.
@ -173,7 +174,7 @@ class BamReader {
/**
Returns: information about reference sequences
*/
bio.bam.referenceinfo.ReferenceSequenceInfo[] reference_sequences() @property {
const(bio.bam.referenceinfo.ReferenceSequenceInfo)[] reference_sequences() @property const {
return _reference_sequences;
}
@ -199,7 +200,7 @@ class BamReader {
*/
auto reads(alias IteratePolicy=bio.bam.readrange.withoutOffsets)() @property {
auto _decompressed_stream = getDecompressedBamReadStream();
return bamReadRange!IteratePolicy(_decompressed_stream);
return bamReadRange!IteratePolicy(_decompressed_stream, this);
}
/**
@ -231,7 +232,7 @@ class BamReader {
(void delegate(lazy float p) progressBarFunc)
{
auto _decompressed_stream = getDecompressedBamReadStream();
auto reads_with_offsets = bamReadRange!withOffsets(_decompressed_stream);
auto reads_with_offsets = bamReadRange!withOffsets(_decompressed_stream, this);
static struct Result(alias IteratePolicy, R, S) {
this(R range, S stream, void delegate(lazy float p) progressBarFunc) {
@ -277,6 +278,11 @@ class BamReader {
progressBarFunc);
}
/// Part of IBamSamReader interface
std.range.InputRange!(bio.bam.read.BamRead) allReads() @property {
return inputRangeObject(reads!withoutOffsets());
}
/**
Returns: the read which starts at a given virtual offset.
*/
@ -399,9 +405,9 @@ private:
// so we need to do that to get the right BAI status
if (_bai_status == BaiStatus.initialized) {
_rndaccssmgr = new RandomAccessManager(_filename, bai);
_rndaccssmgr = new RandomAccessManager(this, bai);
} else {
_rndaccssmgr = new RandomAccessManager(_filename);
_rndaccssmgr = new RandomAccessManager(this);
}
}
}

12
bio/bam/readrange.d

@ -20,6 +20,8 @@
module bio.bam.readrange;
import bio.bam.read;
import bio.bam.abstractreader;
import bio.bam.reader;
import bio.core.bgzf.inputstream;
import bio.core.bgzf.virtualoffset;
import bio.core.utils.switchendianness;
@ -77,8 +79,9 @@ class BamReadRange(alias IteratePolicy)
{
/// Create new range from IChunkInputStream.
this(ref IChunkInputStream stream) {
this(ref IChunkInputStream stream, BamReader reader=null) {
_stream = stream;
_reader = reader;
_endian_stream = new EndianStream(_stream, Endian.littleEndian);
readNext();
}
@ -99,6 +102,8 @@ private:
IChunkInputStream _stream;
EndianStream _endian_stream;
BamReader _reader;
BamRead _current_record;
bool _empty = false;
@ -143,10 +148,11 @@ private:
}
_current_record = BamRead(_stream.readSlice(block_size));
_current_record.associateWithReader(cast(IBamSamReader)_reader);
}
}
/// Returns: lazy range of BamRead/BamReadBlock structs constructed from a given stream.
auto bamReadRange(alias IteratePolicy=withoutOffsets)(ref IChunkInputStream stream) {
return new BamReadRange!IteratePolicy(stream);
auto bamReadRange(alias IteratePolicy=withoutOffsets)(ref IChunkInputStream stream, BamReader reader) {
return new BamReadRange!IteratePolicy(stream, reader);
}

6
bio/bam/reference.d

@ -38,8 +38,8 @@ module bio.bam.reference;
public import bio.bam.referenceinfo;
import bio.bam.randomaccessmanager;
import bio.bam.readrange;
import bio.bam.randomaccessmanager;
import bio.core.bgzf.virtualoffset;
import std.stream;
@ -121,7 +121,7 @@ struct ReferenceSequence {
auto last_offset = ioffsets[$ - 1];
auto stream = _manager.createStreamStartingFrom(last_offset);
auto last_few_reads = bamReadRange!withOffsets(stream);
auto last_few_reads = bamReadRange!withOffsets(stream, null);
VirtualOffset result;
assert(!last_few_reads.empty);
@ -163,7 +163,7 @@ struct ReferenceSequence {
auto offset = ioffsets[cast(size_t)index];
auto stream = _manager.createStreamStartingFrom(offset);
auto reads = bamReadRange(stream);
auto reads = bamReadRange(stream, null);
int last_position = int.min;

6
bio/bam/serialization/sam.d

@ -20,7 +20,7 @@
module bio.bam.serialization.sam;
import bio.bam.read;
import bio.bam.reference;
import bio.bam.referenceinfo;
import bio.bam.tagvalue;
import bio.core.utils.format;
@ -115,7 +115,7 @@ void serialize(S)(const ref Value v, ref S stream) {
/// -------------
/// toSam(alignment, bam.reference_sequences);
/// -------------
string toSam(R)(auto ref R alignment, ReferenceSequenceInfo[] info) {
string toSam(R)(auto ref R alignment, const(ReferenceSequenceInfo)[] info) {
char[] buf;
buf.reserve(512);
serialize(alignment, info, buf);
@ -124,7 +124,7 @@ string toSam(R)(auto ref R alignment, ReferenceSequenceInfo[] info) {
/// Serialize $(D alignment) to FILE* or append it to char[]/char*
/// (in char* case it's your responsibility to allocate enough memory)
void serialize(S, R)(auto ref R alignment, ReferenceSequenceInfo[] info, auto ref S stream)
void serialize(S, R)(auto ref R alignment, const(ReferenceSequenceInfo)[] info, auto ref S stream)
if (is(Unqual!S == FILE*) || is(Unqual!S == char*) || is(Unqual!S == char[]))
{

26
bio/bam/tagvalue.d

@ -47,6 +47,7 @@ module bio.bam.tagvalue;
public import std.conv;
import std.typetuple;
import std.exception;
import std.format;
import bio.bam.thirdparty.msgpack;
@ -449,4 +450,29 @@ struct Value {
default: break;
}
}
void formatSam(scope void delegate(const(char)[]) sink) const {
switch (_tag) {
case GetTypeId!byte: sink.formattedWrite("i:%s", *cast(byte*)(&u)); break;
case GetTypeId!ubyte: sink.formattedWrite("i:%s", *cast(ubyte*)(&u)); break;
case GetTypeId!short: sink.formattedWrite("i:%s", *cast(short*)(&u)); break;
case GetTypeId!ushort: sink.formattedWrite("i:%s", *cast(ushort*)(&u)); break;
case GetTypeId!int: sink.formattedWrite("i:%s", *cast(int*)(&u)); break;
case GetTypeId!uint: sink.formattedWrite("i:%s", *cast(uint*)(&u)); break;
case GetTypeId!float: sink.formattedWrite("f:%s", *cast(float*)(&u)); break;
case GetTypeId!string: sink("Z:"); sink(*cast(const(char)[]*)(&u)); break;
case hexStringTag: sink("H:"); sink(*cast(const(char)[]*)(&u)); break;
case GetTypeId!char: sink.formattedWrite("A:%c", *cast(char*)(&u)); break;
case GetTypeId!(byte[]): sink.formattedWrite("B:b:%s(%s,%)", *cast(byte[]*)(&u)); break;
case GetTypeId!(ubyte[]): sink.formattedWrite("B:B:%s(%s,%)", *cast(ubyte[]*)(&u)); break;
case GetTypeId!(short[]): sink.formattedWrite("B:s:%s(%s,%)", *cast(short[]*)(&u)); break;
case GetTypeId!(ushort[]): sink.formattedWrite("B:S:%s(%s,%)", *cast(ushort[]*)(&u)); break;
case GetTypeId!(int[]): sink.formattedWrite("B:i:%s(%s,%)", *cast(int[]*)(&u)); break;
case GetTypeId!(uint[]): sink.formattedWrite("B:I:%s(%s,%)", *cast(uint[]*)(&u)); break;
case GetTypeId!(float[]): sink.formattedWrite("B:f:%s(%s,%)", *cast(float[]*)(&u)); break;
default: break;
}
}
}

4
bio/bam/writer.d

@ -110,7 +110,7 @@ final class BamWriter {
/// resolving read reference IDs to names.
///
/// Flushes current BGZF block.
void writeReferenceSequenceInfo(bio.bam.referenceinfo.ReferenceSequenceInfo[] reference_sequences)
void writeReferenceSequenceInfo(const(bio.bam.referenceinfo.ReferenceSequenceInfo)[] reference_sequences)
{
_reference_sequences = reference_sequences;
@ -153,7 +153,7 @@ final class BamWriter {
private {
BgzfOutputStream _stream;
ReferenceSequenceInfo[] _reference_sequences;
const(ReferenceSequenceInfo)[] _reference_sequences;
size_t _current_size; // number of bytes written to the current BGZF block
}
}

32
bio/sam/reader.d

@ -19,9 +19,10 @@
*/
module bio.sam.reader;
import bio.bam.abstractreader;
import bio.sam.header;
import bio.bam.read;
import bio.bam.reference;
import bio.bam.referenceinfo;
version(DigitalMars) {
import bio.sam.utils.recordparser;
@ -37,8 +38,10 @@ private {
extern(C) size_t lseek(int, size_t, int);
}
class SamReader {
///
class SamReader : IBamSamReader {
///
this(string filename) {
_file = File(filename);
_filename = filename;
@ -46,19 +49,22 @@ class SamReader {
_initializeStream();
}
SamHeader header() @property {
///
bio.sam.header.SamHeader header() @property {
return _header;
}
ReferenceSequenceInfo[] reference_sequences() @property {
///
const(bio.bam.referenceinfo.ReferenceSequenceInfo)[] reference_sequences() @property const {
return _reference_sequences;
}
private alias File.ByLine!(char, char) LineRange;
static struct SamRecordRange {
this(LineRange lines, ref SamHeader header) {
this(LineRange lines, ref SamHeader header, SamReader reader) {
_header = header;
_reader = reader;
_line_range = lines;
_build_storage = new AlignmentBuildStorage();
@ -86,6 +92,7 @@ class SamReader {
_current_alignment = parseAlignmentLine(cast(string)_line_range.front.dup,
_header,
_build_storage);
_current_alignment.associateWithReader(cast(IBamSamReader)_reader);
}
}
@ -93,11 +100,12 @@ class SamReader {
BamRead _current_alignment;
bool _empty;
SamHeader _header;
SamReader _reader;
AlignmentBuildStorage _build_storage;
}
}
/// Reads in SAM file. Can be iterated only once.
/// Reads in SAM file.
auto reads() @property {
LineRange lines = _lines;
@ -114,7 +122,17 @@ class SamReader {
lines.popFront();
}
return SamRecordRange(lines, _header);
return SamRecordRange(lines, _header, this);
}
///
std.range.InputRange!(bio.bam.read.BamRead) allReads() @property {
return inputRangeObject(reads);
}
/// Filename
string filename() @property const {
return _filename;
}
private:

11
test/unittests.d

@ -18,11 +18,10 @@
*/
import bio.sam.reader;
import bio.sam.header;
import bio.core.bgzf.blockrange;
import bio.bam.reader;
import bio.bam.writer;
import bio.sam.reader;
import bio.sam.header;
import bio.bam.md.reconstruct;
import bio.bam.pileup;
import bio.bam.baseinfo;
@ -89,6 +88,9 @@ unittest {
reads.popFront();
assert(reads.front.cigarString() == "35M");
assert(toSam(reads.front, bf.reference_sequences) == "EAS51_64:3:190:727:308 99 chr1 103 99 35M = 263 195 GGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<::<<<844 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0");
foreach (record; bf.reads) {
assert(toSam(record, bf.reference_sequences) == to!string(record));
}
assert(bf.header.getSequenceIndex("chr1") == read.ref_id);
}
@ -286,6 +288,9 @@ unittest {
auto sf = new SamReader(buildPath(dirName(__FILE__), "data", "ex1_header.sam"));
assert(sf.reads.front.ref_id == 0);
assert(equal(sf.reads, bf.reads!withoutOffsets));
foreach (samread; sf.reads) {
assert(to!string(samread) == toSam(samread, sf.reference_sequences));
}
}
writeln("Testing pileup (high-level aspects)...");

Loading…
Cancel
Save