Browse Source

fixes

remotes/georgeg/bam_output_redesign
lomereiter 10 years ago
parent
commit
dd5b4a7183
  1. 301
      bio/bam/output.d
  2. 2
      bio/bam/pileuprange.d
  3. 4
      bio/bam/validation/alignment.d
  4. 4
      bio/sam/reader.d
  5. 8
      bio/sam/utils/fastrecordparser.d
  6. 8
      bio/sam/utils/recordparser.d
  7. 0
      examples/example1.d
  8. 8
      src_ragel/sam_alignment.rl

301
bio/bam/output.d

@ -1,301 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.output;
import bio.sam.header;
import bio.bam.reference;
import bio.bam.read;
import bio.bam.readrange;
import bio.bam.constants;
import bio.core.bgzf.compress;
import std.stream;
import std.system;
import std.range;
import std.traits;
import std.algorithm;
import std.parallelism;
import std.exception;
debug {
import std.stdio;
}
/// Writes BAM magic string at the beginning of the file.
void writeMagic(EndianStream stream) {
stream.writeString(BAM_MAGIC);
}
/// Writes header length (int32_t) and header text to a stream.
void writeSamHeader(EndianStream stream, string header)
in
{
assert(stream.endian == Endian.littleEndian);
}
body
{
stream.write(cast(int)(header.length));
stream.writeString(header);
}
/// Writes reference sequence information to a stream.
void writeReferenceSequences(EndianStream stream, ReferenceSequenceInfo[] info)
in
{
assert(stream.endian == Endian.littleEndian);
}
body
{
stream.write(cast(int)info.length);
foreach (refseq; info) {
stream.write(cast(int)(refseq.name.length + 1));
stream.writeString(refseq.name);
stream.write(cast(ubyte)'\0');
stream.write(cast(int)(refseq.length));
}
}
/// Writes alignment to a stream.
void writeAlignment(EndianStream stream, BamRead alignment) {
alignment.write(stream);
}
// TODO: refactor!
/// Writes BAM file to a stream.
///
/// Params:
///
/// stream = the stream to write to
///
/// header = SAM header as raw text
///
/// info = reference sequences info
///
/// alignments = range of alignments
///
/// compression_level = level of compression, use 0 for uncompressed BAM.
///
void writeBAM(R)(Stream stream,
string header,
ReferenceSequenceInfo[] info,
R alignments,
int compression_level=-1,
TaskPool task_pool=taskPool,
size_t parmapbufsz=32,
size_t parmapwusz=1,
bool writeEOF=true)
if (is(Unqual!(ElementType!R) == BamRead)
|| is(Unqual!(ElementType!R) == BamReadBlock))
{
// First, pack header and reference sequences.
auto header_memory_stream = new MemoryStream();
auto header_endian_stream = new EndianStream(header_memory_stream, Endian.littleEndian);
writeMagic(header_endian_stream);
writeSamHeader(header_endian_stream, header);
writeReferenceSequences(header_endian_stream, info);
foreach (block; std.range.chunks(header_memory_stream.data, BGZF_BLOCK_SIZE)) {
auto bgzf_block = bgzfCompress(block, compression_level);
stream.writeExact(bgzf_block.ptr, bgzf_block.length);
}
// Range of blocks, each is <= MAX_BLOCK_SIZE in size,
// except cases where single alignment takes more than
// one block. In this particular case, the alignment occupies
// the whole block.
static struct BlockRange(R) {
this(R alignments) {
_alignments = alignments;
_memory_stream = new MemoryStream();
_endian_stream = new EndianStream(_memory_stream);
if (!_alignments.empty) {
_current_alignment = _alignments.front;
}
loadNextBlock();
}
R _alignments;
bool _empty = false;
MemoryStream _memory_stream;
EndianStream _endian_stream;
BamRead _current_alignment;
bool empty() @property {
return _empty;
}
ubyte[] front() @property {
return _memory_stream.data[0 .. cast(size_t)_endian_stream.position];
}
void popFront() {
_endian_stream.seekSet(0);
loadNextBlock();
}
void loadNextBlock() {
if (_alignments.empty) {
_empty = true;
return;
}
while (true) {
auto pos = _endian_stream.position;
if (pos == 0 || (_current_alignment.size_in_bytes + pos <= BGZF_BLOCK_SIZE))
{
writeAlignment(_endian_stream, _current_alignment);
_alignments.popFront();
if (!_alignments.empty) {
_current_alignment = _alignments.front;
} else {
break;
}
} else {
break;
}
}
}
}
static auto blockRange(R)(R alignments) {
return BlockRange!R(alignments);
}
auto blocks = blockRange(alignments);
auto chunked_blocks = chunkedBlockRange(blocks, BGZF_BLOCK_SIZE);
// ugly workaround
// (issue 5710, cannot use delegates as parameters to non-global template)
static ubyte[] makeBgzfCompressor(int n)(ubyte[] buf) {
return bgzfCompress(buf, n);
}
// helper function
static void writeAlignmentBlocks(int n, R)(R chunked_blocks, ref Stream stream,
TaskPool task_pool,
size_t parmapbufsz,
size_t parmapwusz) {
version(serial) {
auto bgzf_blocks = map!(makeBgzfCompressor!n)(chunked_blocks);
} else {
auto bgzf_blocks = task_pool.map!(makeBgzfCompressor!n)(chunked_blocks,
parmapbufsz,
parmapwusz);
}
foreach (bgzf_block; bgzf_blocks) {
stream.writeExact(bgzf_block.ptr, bgzf_block.length);
}
}
switch (compression_level) {
case -1: writeAlignmentBlocks!(-1)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 0: writeAlignmentBlocks!(0)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 1: writeAlignmentBlocks!(1)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 2: writeAlignmentBlocks!(2)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 3: writeAlignmentBlocks!(3)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 4: writeAlignmentBlocks!(4)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 5: writeAlignmentBlocks!(5)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 6: writeAlignmentBlocks!(6)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 7: writeAlignmentBlocks!(7)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 8: writeAlignmentBlocks!(8)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
case 9: writeAlignmentBlocks!(9)(chunked_blocks, stream, task_pool, parmapbufsz, parmapwusz); break;
default: throw new Exception("compression level must be a number from -1 to 9");
}
if (writeEOF) {
// write EOF block
stream.writeExact(BAM_EOF.ptr, BAM_EOF.length);
}
}
private {
// Takes range of blocks as input and returns
// another range where too big blocks, with length
// more than $(D max_size) were cut into several smaller blocks
struct ChunkedBlockRange(R) {
R _blocks;
ElementType!R _current_block;
size_t _max_size;
bool _empty = false;
this(R blocks, size_t max_size) {
_blocks = blocks;
_max_size = max_size;
_loadNextBlock();
}
bool empty() @property {
return _empty;
}
ubyte[] front() @property {
if (_current_block.length <= _max_size) {
return _current_block;
} else {
version(serial) {
return _current_block[0 .. _max_size];
} else {
return _current_block[0 .. _max_size].dup;
}
}
}
void popFront() {
if (_current_block.length <= _max_size) {
_loadNextBlock();
} else {
_current_block = _current_block[_max_size .. $];
}
}
void _loadNextBlock() {
if (!_blocks.empty) {
// dup is required because map.front returns a reference
_current_block = _blocks.front.dup;
_blocks.popFront();
} else {
_empty = true;
}
}
}
auto chunkedBlockRange(R)(R blocks, size_t max_size) {
return ChunkedBlockRange!R(blocks, max_size);
}
}
unittest {
import std.stdio;
writeln("Testing chunkedBlockRange...");
ubyte[][] blocks = [[1, 2, 3], [4, 5, 6, 7], [8, 9, 10]];
assert(equal(chunkedBlockRange(blocks, 2),
[[1, 2], [3], [4, 5], [6, 7], [8, 9], [10]]));
assert(equal(chunkedBlockRange(blocks, 3),
[[1, 2, 3], [4, 5, 6], [7], [8, 9, 10]]));
assert(equal(chunkedBlockRange(blocks, 4), blocks));
assert(equal(chunkedBlockRange(blocks, 1), map!"[a]"(iota(1, 11))));
}

2
bio/bam/pileuprange.d

@ -78,7 +78,7 @@ struct PileupRead(Read=EagerBamRead) {
ubyte current_base_quality() @property const {
assert(_query_offset <= _read.sequence_length);
if (_cur_op.is_query_consuming && _cur_op.is_reference_consuming) {
return _read.phred_base_quality[_query_offset];
return _read.base_qualities[_query_offset];
} else {
return 255;
}

4
bio/bam/validation/alignment.d

@ -197,8 +197,8 @@ private:
bool invalidQualityData(ref BamRead al) {
/// Check quality data
if (!all!"a == 0xFF"(al.phred_base_quality) &&
!all!"0 <= a && a <= 93"(al.phred_base_quality))
if (!all!"a == 0xFF"(al.base_qualities) &&
!all!"0 <= a && a <= 93"(al.base_qualities))
{
if (!onError(al, AlignmentError.QualityDataContainsInvalidElements)) {
return true;

4
bio/sam/reader.d

@ -148,9 +148,7 @@ private:
_reference_sequences = new ReferenceSequenceInfo[_header.sequences.length];
foreach (sq; _header.sequences) {
ReferenceSequenceInfo seq;
seq.name = sq.name;
seq.length = sq.length;
auto seq = ReferenceSequenceInfo(sq.name, sq.length);
_reference_sequences[_header.getSequenceIndex(seq.name)] = seq;
}
}

8
bio/sam/utils/fastrecordparser.d

@ -4063,7 +4063,7 @@ goto case; case 134:
builder.data);
if (qual_ptr !is null && qual_index == sequence.length) {
read.phred_base_quality = qual_ptr[0 .. sequence.length];
read.base_qualities = qual_ptr[0 .. sequence.length];
}
read.flag = flag;
@ -4105,9 +4105,7 @@ unittest {
import bio.bam.serialization.sam;
import bio.bam.reference;
ReferenceSequenceInfo info;
info.name = "20";
info.length = 1234567;
auto info = ReferenceSequenceInfo("20", 1234567);
auto invalid_cigar_string = "1\t100\t20\t50000\t30\tMZABC\t=\t50000\t0\tACGT\t####";
alignment = parseAlignmentLine(invalid_cigar_string, header);
@ -4115,7 +4113,7 @@ unittest {
auto invalid_tag_and_qual = "2\t100\t20\t5\t40\t27M30X5D\t=\t3\t10\tACT\t !\n\tX1:i:7\tX3:i:zzz\tX4:i:5";
alignment = parseAlignmentLine(invalid_tag_and_qual, header);
assert(alignment.phred_base_quality == [255, 255, 255]); // i.e. invalid
assert(alignment.base_qualities == [255, 255, 255]); // i.e. invalid
assert(to!ubyte(alignment["X1"]) == 7);
assert(alignment["X3"].is_nothing);
assert(to!ubyte(alignment["X4"]) == 5);

8
bio/sam/utils/recordparser.d

@ -1060,7 +1060,7 @@ _again:
builder.data);
if (qual_ptr !is null && qual_index == sequence.length) {
read.phred_base_quality = qual_ptr[0 .. sequence.length];
read.base_qualities = qual_ptr[0 .. sequence.length];
}
read.flag = flag;
@ -1102,9 +1102,7 @@ unittest {
import bio.bam.serialization.sam;
import bio.bam.reference;
ReferenceSequenceInfo info;
info.name = "20";
info.length = 1234567;
auto info = ReferenceSequenceInfo("20", 1234567);
auto invalid_cigar_string = "1\t100\t20\t50000\t30\tMZABC\t=\t50000\t0\tACGT\t####";
alignment = parseAlignmentLine(invalid_cigar_string, header);
@ -1112,7 +1110,7 @@ unittest {
auto invalid_tag_and_qual = "2\t100\t20\t5\t40\t27M30X5D\t=\t3\t10\tACT\t !\n\tX1:i:7\tX3:i:zzz\tX4:i:5";
alignment = parseAlignmentLine(invalid_tag_and_qual, header);
assert(alignment.phred_base_quality == [255, 255, 255]); // i.e. invalid
assert(alignment.base_qualities == [255, 255, 255]); // i.e. invalid
assert(to!ubyte(alignment["X1"]) == 7);
assert(alignment["X3"].is_nothing);
assert(to!ubyte(alignment["X4"]) == 5);

0
examples/example1.d

8
src_ragel/sam_alignment.rl

@ -325,7 +325,7 @@ BamRead parseAlignmentLine(string line, SamHeader header,
builder.data);
if (qual_ptr !is null && qual_index == sequence.length) {
read.phred_base_quality = qual_ptr[0 .. sequence.length];
read.base_qualities = qual_ptr[0 .. sequence.length];
}
read.flag = flag;
@ -367,9 +367,7 @@ unittest {
import bio.bam.serialization.sam;
import bio.bam.reference;
ReferenceSequenceInfo info;
info.name = "20";
info.length = 1234567;
auto info = ReferenceSequenceInfo("20", 1234567);
auto invalid_cigar_string = "1\t100\t20\t50000\t30\tMZABC\t=\t50000\t0\tACGT\t####";
alignment = parseAlignmentLine(invalid_cigar_string, header);
@ -377,7 +375,7 @@ unittest {
auto invalid_tag_and_qual = "2\t100\t20\t5\t40\t27M30X5D\t=\t3\t10\tACT\t !\n\tX1:i:7\tX3:i:zzz\tX4:i:5";
alignment = parseAlignmentLine(invalid_tag_and_qual, header);
assert(alignment.phred_base_quality == [255, 255, 255]); // i.e. invalid
assert(alignment.base_qualities == [255, 255, 255]); // i.e. invalid
assert(to!ubyte(alignment["X1"]) == 7);
assert(alignment["X3"].is_nothing);
assert(to!ubyte(alignment["X4"]) == 5);

Loading…
Cancel
Save