Browse Source

better SAM parser

bio2
lomereiter 9 years ago
parent
commit
52d0ae3ce7
  1. 3
      bio/bam/bai/utils/algo.d
  2. 42
      bio/bam/read.d
  3. 65
      bio/bam/utils/tagstoragebuilder.d
  4. 103
      bio/core/utils/outbuffer.d
  5. 10
      bio/sam/reader.d
  6. 6066
      bio/sam/utils/fastrecordparser.d
  7. 1648
      bio/sam/utils/recordparser.d
  8. 418
      src_ragel/sam_alignment.rl
  9. 6
      test/unittests.d

3
bio/bam/bai/utils/algo.d

@ -27,6 +27,7 @@ import bio.bam.bai.chunk;
import std.range;
import std.algorithm;
import std.array;
struct NonOverlappingChunks(R) {
@ -84,7 +85,7 @@ private:
/// Returns:
/// range of non-overlapping chunks, covering the same subset
/// as original chunks
auto nonOverlappingChunks(R)(R r)
NonOverlappingChunks!R nonOverlappingChunks(R)(R r)
if (is(ElementType!R == Chunk))
{
return NonOverlappingChunks!R(r);

42
bio/bam/read.d

@ -763,22 +763,6 @@ struct BamRead {
this.sequence = sequence;
}
// Low-level constructor for setting tag data on construction.
// This allows to use less reallocations when creating an alignment
// from scratch, by reusing memory for collecting tags.
// Typically, you would use this constructor in conjunction with
// bio.bam.utils.tagstoragebuilder module.
this(string read_name,
string sequence,
in CigarOperation[] cigar,
in ubyte[] tag_data)
{
_chunk = new ubyte[calculateChunkSize(read_name, sequence, cigar)
+ tag_data.length];
this(read_name, sequence, cigar);
_chunk[_tags_offset .. $] = tag_data;
}
/// Deep copy of the record.
BamRead dup() @property const {
BamRead result;
@ -1057,6 +1041,11 @@ struct BamRead {
inout(ubyte)[] raw_data() @property inout {
return _chunk;
}
/// ditto
void raw_data(ubyte[] data) @property {
_chunk = data;
}
package ubyte[] _chunk; // holds all the data,
// the access is organized via properties
@ -1511,7 +1500,6 @@ mixin template TagStorage() {
unittest {
import bio.bam.utils.tagstoragebuilder;
import std.algorithm;
import std.stdio;
import std.math;
@ -1575,22 +1563,6 @@ unittest {
read["RG"] = null;
assert(read.tagCount() == 0);
// Test tagstoragebuilder
auto builder = new TagStorageBuilder();
builder.put("X0", Value(24));
builder.put("X1", Value("abcd"));
builder.put("X2", Value([1,2,3]));
read = BamRead("readname",
"AGCTGACTACGTAATAGCCCTA",
[CigarOperation(22, 'M')],
builder.data);
assert(read["X0"] == 24);
assert(read["X1"] == "abcd");
assert(read["X2"] == [1,2,3]);
assert(read.tagCount() == 3);
// Test MsgPack serialization/deserialization
{
@ -1599,8 +1571,8 @@ unittest {
read.toMsgpack(packer);
auto data = packer.stream.data;
auto rec = unpack(data).via.array;
assert(rec[0] == "readname");
assert(rec[5].as!(int[]) == [22]);
assert(rec[0].as!(ubyte[]) == "anothername");
assert(rec[5].as!(int[]) == [21]);
assert(rec[6].as!(ubyte[]) == ['M']);
assert(rec[10].as!(ubyte[]) == to!string(read.sequence));
}

65
bio/bam/utils/tagstoragebuilder.d

@ -1,65 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.utils.tagstoragebuilder;
import bio.bam.tagvalue;
import bio.bam.utils.value;
import std.array;
/// Struct for building tag storage, effectively managing memory.
struct TagStorageBuilder {
private Appender!(ubyte[]) _storage;
/// Return tag data (little-endian, in BAM format)
ubyte[] data() @property {
return _storage.data();
}
static TagStorageBuilder create() {
TagStorageBuilder builder;
builder._storage = appender!(ubyte[])();
return builder;
}
/// Clear underlying storage
void clear() {
_storage.clear();
}
private void putByRef(string tag, ref Value v) {
_storage.put(cast(ubyte[])tag);
emplaceValue(_storage, v);
}
/// Append tag value to the storage
void put(string tag, ref Value value) {
putByRef(tag, value);
}
/// ditto
void put(string tag, Value value) {
putByRef(tag, value);
}
}

103
bio/core/utils/outbuffer.d

@ -0,0 +1,103 @@
/*
This file is part of BioD.
Copyright (C) 2013 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.core.utils.outbuffer;
import std.array;
import std.exception;
///
class OutBuffer {
private {
ubyte[] _heap;
ubyte* _heap_ptr() @property { return _heap.ptr; }
size_t _heap_capacity;
size_t _heap_used;
}
///
this(size_t initial_capacity) {
_heap = uninitializedArray!(ubyte[])(initial_capacity);
_heap_capacity = initial_capacity;
}
///
ubyte[] data() @property {
return _heap_ptr[0 .. _heap_used];
}
/// Remove last elements such that new size is equal to $(D size).
void shrink(size_t size) {
enforce(size <= data.length);
_heap_used = size;
}
///
size_t capacity() @property const {
return _heap_capacity;
}
/// ditto
void capacity(size_t new_capacity) @property {
if (new_capacity <= _heap_capacity)
return;
_heap.length = new_capacity;
_heap_capacity = new_capacity;
}
///
void put(T)(T bytes) if (is(T == ubyte[])) {
size_t needed = bytes.length + _heap_used;
if (needed > _heap_capacity) {
do {
_heap_capacity = _heap_capacity * 3 / 2;
} while (_heap_capacity < needed);
capacity = _heap_capacity;
}
_heap_ptr[_heap_used .. _heap_used + bytes.length] = bytes[];
_heap_used = needed;
}
/// Dumps raw bytes into the buffer. No endianness conversion or whatever.
void put(T)(auto ref T value) if (!is(T == ubyte[])) {
put((cast(ubyte*)(&value))[0 .. T.sizeof]);
}
/// Responsibility that there's enough capacity is on the user
void putUnsafe(T)(T bytes) if (is(T == ubyte[])) {
_heap_ptr[_heap_used .. _heap_used + bytes.length] = bytes[];
_heap_used += bytes.length;;
}
/// ditto
void putUnsafe(T)(auto ref T value) if (!is(T == ubyte[])) {
putUnsafe((cast(ubyte*)(&value))[0 .. T.sizeof]);
}
///
void clear() {
_heap_used = 0;
}
}

10
bio/sam/reader.d

@ -27,6 +27,7 @@ import bio.bam.abstractreader;
import bio.sam.header;
import bio.bam.read;
import bio.bam.referenceinfo;
import bio.core.utils.outbuffer;
version(DigitalMars) {
import bio.sam.utils.recordparser;
@ -72,9 +73,9 @@ class SamReader : IBamSamReader {
this(LineRange lines, ref SamHeader header, SamReader reader) {
_header = header;
_reader = reader;
_buffer = new OutBuffer(8192); // should be plenty even for PacBio reads
_line_range = lines;
_build_storage = new AlignmentBuildStorage();
_parseNextLine();
}
@ -96,19 +97,18 @@ class SamReader : IBamSamReader {
if (_line_range.empty) {
_empty = true;
} else {
_current_alignment = parseAlignmentLine(cast(string)_line_range.front.dup,
_header,
_build_storage);
_current_alignment = parseAlignmentLine(cast(string)_line_range.front,
_header, _buffer);
_current_alignment.associateWithReader(cast(IBamSamReader)_reader);
}
}
LineRange _line_range;
BamRead _current_alignment;
OutBuffer _buffer;
bool _empty;
SamHeader _header;
SamReader _reader;
AlignmentBuildStorage _build_storage;
}
}

6066
bio/sam/utils/fastrecordparser.d

File diff suppressed because it is too large

1648
bio/sam/utils/recordparser.d

File diff suppressed because it is too large

418
src_ragel/sam_alignment.rl

@ -44,108 +44,257 @@
invalid_field = [^\t]* ; # TODO: make class with onError methods and pass it to parseAlignmentLine
### 1. STORE READ NAME ###
action qname_start { read_name_beg = p - line.ptr; }
action qname_end { read_name_end = p - line.ptr; }
action qname_end { read_name = line[read_name_beg .. p - line.ptr]; }
action handle_invalid_qname { fhold; fgoto recover_from_invalid_qname; }
recover_from_invalid_qname := invalid_field '\t' @{ fhold; fgoto flag_parsing; } ;
qname = '*' | (([!-?A-~]{1,255})** > qname_start % qname_end) ;
### 2. STORE FLAG ###
action set_flag { flag = to!ushort(int_value); }
action set_pos { pos = to!uint(int_value); }
action set_mapping_quality { mapping_quality = to!ubyte(int_value); }
flag = uint % set_flag;
action handle_invalid_flag { fhold; fgoto recover_from_invalid_flag; }
recover_from_invalid_flag := invalid_field '\t' @{ fhold; fgoto rname_parsing; } ;
### 3. STORE RNAME ###
action rname_start { rname_beg = p - line.ptr; }
action rname_end {
ref_id = header.getSequenceIndex(line[rname_beg .. p - line.ptr]);
}
action handle_invalid_rname { fhold; fgoto recover_from_invalid_rname; }
recover_from_invalid_rname := invalid_field '\t' @{ fhold; fgoto pos_parsing; } ;
rname = '*' | (([!-()+-<>-~] [!-~]*) > rname_start % rname_end);
### 4. STORE POS ###
action set_pos { end_pos = pos = to!uint(int_value); }
pos = uint % set_pos;
action handle_invalid_pos { fhold; fgoto recover_from_invalid_pos; }
recover_from_invalid_pos := invalid_field '\t' @{ fhold; fgoto mapq_parsing; } ;
### 5. STORE MAPPING QUALITY ###
action set_mapping_quality { mapping_quality = to!ubyte(int_value); }
mapq = uint % set_mapping_quality;
action handle_invalid_mapq { fhold; fgoto recover_from_invalid_mapq; }
recover_from_invalid_mapq := invalid_field '\t' @{ fhold; fgoto cigar_parsing; } ;
### 6. INITIALIZE OUTPUT BUFFER ###
action init_output_buffer {
buffer.capacity = 32 + read_name.length + 1;
buffer.putUnsafe!int(ref_id);
buffer.putUnsafe!int(pos - 1);
enforce(read_name.length + 1 <= 255, "Read name " ~ read_name ~ " is too long!");
// bin will be set later
auto bin_mq_nl = ((cast(uint)mapping_quality) << 8) | (read_name.length + 1);
buffer.putUnsafe(cast(uint)bin_mq_nl);
// number of CIGAR operations will be set later
buffer.putUnsafe!uint(flag << 16);
buffer.putUnsafe!int(0);
buffer.putUnsafe!int(-1); // mate ref. id
buffer.putUnsafe!int(-1); // mate pos
buffer.putUnsafe!int(0); // tlen
buffer.putUnsafe(cast(ubyte[])read_name);
buffer.putUnsafe!ubyte(0);
rollback_size = buffer.data.length;
}
### 7. STORE CIGAR OPERATIONS ###
action cigar_set_op_length { cigar_op_len = to!uint(int_value); }
action cigar_set_op_chr { cigar_op_chr = fc; }
action cigar_put_operation {
cigar.put(CigarOperation(cigar_op_len, cigar_op_chr));
auto op = CigarOperation(cigar_op_len, cigar_op_chr);
if (op.is_reference_consuming)
end_pos += op.length;
buffer.put!CigarOperation(op);
{
auto ptr = cast(uint*)(buffer.data.ptr + 3 * uint.sizeof);
*ptr = (*ptr) + 1;
}
}
action handle_invalid_cigar {
auto ptr = cast(uint*)(buffer.data.ptr + 3 * uint.sizeof);
*ptr = (*ptr) & 0xFFFF0000;
buffer.shrink(rollback_size);
end_pos = pos + 1;
fhold; fgoto recover_from_invalid_cigar;
}
recover_from_invalid_cigar := invalid_field '\t' @{ fhold; fgoto rnext_parsing; } ;
cigar = '*' | (uint % cigar_set_op_length
[MIDNSHPX=] > cigar_set_op_chr % cigar_put_operation)+ ;
### 8. SET BIN ###
action set_bin {
if (end_pos == pos)
++end_pos;
{
auto bin = reg2bin(pos, end_pos);
auto ptr = cast(uint*)(buffer.data.ptr + 2 * uint.sizeof);
*ptr = (*ptr) | ((cast(uint)bin) << 16);
}
}
### 9. SET MATE REF. ID ###
action set_same_mate_ref_id {
mate_ref_id = ref_id;
{
auto ptr = cast(int*)(buffer.data.ptr + 5 * int.sizeof);
*ptr = ref_id;
}
}
action rnext_start { rnext_beg = p - line.ptr; }
action rnext_end {
mate_ref_id = header.getSequenceIndex(line[rnext_beg .. p - line.ptr]);
{
auto ptr = cast(int*)(buffer.data.ptr + 5 * int.sizeof);
*ptr = header.getSequenceIndex(line[rnext_beg .. p - line.ptr]);
}
}
action handle_invalid_rnext { fhold; fgoto recover_from_invalid_rnext; }
recover_from_invalid_rnext := invalid_field '\t' @{ fhold; fgoto pnext_parsing; } ;
rnext = '*' | ('=' % set_same_mate_ref_id) |
(([!-()+-<>-~][!-~]*) > rnext_start % rnext_end) ;
action set_mate_pos { mate_pos = to!uint(int_value); }
action set_template_length { template_length = to!int(int_value); }
### 10. SET MATE POSITION ###
action set_mate_pos {
{
auto ptr = cast(int*)(buffer.data.ptr + 6 * int.sizeof);
*ptr = to!int(int_value) - 1;
}
}
action handle_invalid_pnext { fhold; fgoto recover_from_invalid_pnext; }
recover_from_invalid_pnext := invalid_field '\t' @{ fhold; fgoto tlen_parsing; } ;
pnext = uint % set_mate_pos;
### 11. SET TEMPLATE LENGTH ###
action set_template_length {
{
auto ptr = cast(int*)(buffer.data.ptr + 7 * int.sizeof);
*ptr = to!int(int_value);
}
}
action handle_invalid_tlen { fhold; fgoto recover_from_invalid_tlen; }
recover_from_invalid_tlen := invalid_field '\t' @{ fhold; fgoto seq_parsing; } ;
tlen = int % set_template_length;
### 12. SET SEQUENCE ###
action sequence_start { sequence_beg = p - line.ptr; }
action sequence_end { sequence = line[sequence_beg .. p - line.ptr]; }
seq = '*' | ([A-Za-z=.]+ > sequence_start % sequence_end) ;
qual = [!-~]+ ;
action sequence_end {
auto data = cast(ubyte[])line[sequence_beg .. p - line.ptr];
l_seq = cast(int)data.length;
auto raw_len = (l_seq + 1) / 2;
// reserve space for base qualities, too
buffer.capacity = buffer.data.length + raw_len + l_seq;
for (size_t i = 0; i < raw_len; ++i) {
auto b = cast(ubyte)(Base(data[2 * i]).internal_code << 4);
if (2 * i + 1 < l_seq)
b |= cast(ubyte)(Base(data[2 * i + 1]).internal_code);
buffer.putUnsafe!ubyte(b);
}
action allocate_quality_array {
if (sequence.length > 1024) {
qual_ptr = (new ubyte[sequence.length]).ptr;
} else {
qual_ptr = cast(ubyte*)alloca(sequence.length);
if (!qual_ptr) {
qual_ptr = (new ubyte[sequence.length]).ptr;
}
// set l_seq
{
auto ptr = cast(int*)(buffer.data.ptr + 4 * int.sizeof);
*ptr = l_seq;
}
qual_index = 0;
rollback_size = buffer.data.length;
}
action handle_invalid_seq {
rollback_size = buffer.data.length;
fhold; fgoto recover_from_invalid_seq;
}
recover_from_invalid_seq := invalid_field '\t' @{ fhold; fgoto qual_parsing; } ;
seq = '*' | ([A-Za-z=.]+ > sequence_start % sequence_end) ;
### 13. SET BASE QUALITIES ###
action convert_next_character_to_prob {
qual_ptr[qual_index++] = cast(ubyte)(fc - 33);
buffer.putUnsafe!ubyte(cast(ubyte)(fc - 33));
}
action handle_invalid_qual {
buffer.shrink(rollback_size);
for (size_t i = 0; i < l_seq; ++i)
buffer.putUnsafe!ubyte(0xFF);
rollback_size = buffer.data.length;
fhold; fgoto recover_from_invalid_qual;
}
# FIXME
recover_from_invalid_qual := invalid_field '\t' @{ fhold; fgoto tag_parsing; } ;
action check_qual_length {
if (buffer.data.length - rollback_size != l_seq) {
buffer.shrink(rollback_size);
for (size_t i = 0; i < l_seq; ++i)
buffer.putUnsafe!ubyte(0xFF);
}
rollback_size = buffer.data.length;
}
qual = [!-~]+ $ convert_next_character_to_prob ;
###### PARSE MANDATORY FIELDS #######
mandatoryfields = qname_parsing: (qname $!handle_invalid_qname)
flag_parsing: '\t' (flag $!handle_invalid_flag)
rname_parsing: '\t' (rname $!handle_invalid_rname)
pos_parsing: '\t' (pos $!handle_invalid_pos)
mapq_parsing: '\t' (mapq $!handle_invalid_mapq)
cigar_parsing: '\t' > init_output_buffer (cigar $!handle_invalid_cigar)
rnext_parsing: '\t' > set_bin (rnext $!handle_invalid_rnext)
pnext_parsing: '\t' (pnext $!handle_invalid_pnext)
tlen_parsing: '\t' (tlen $!handle_invalid_tlen)
seq_parsing: '\t' (seq $!handle_invalid_seq)
qual_parsing: '\t' (qual % check_qual_length $!handle_invalid_qual) ;
############ TAG PARSING ######
action set_charvalue {
buffer.capacity = buffer.data.length + 4;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('A');
buffer.putUnsafe!char(fc);
}
mandatoryfields = (qname | invalid_field) '\t'
(flag | invalid_field) '\t'
(rname | invalid_field) '\t'
(pos | invalid_field) '\t'
(mapq | invalid_field) '\t'
(cigar | invalid_field) '\t'
(rnext | invalid_field) '\t'
(pnext | invalid_field) '\t'
(tlen | invalid_field) '\t'
(seq | invalid_field) '\t'
((qual > allocate_quality_array
$ convert_next_character_to_prob) | invalid_field) ;
action set_charvalue { current_tagvalue = Value(fc); }
action set_integervalue {
buffer.capacity = buffer.data.length + 7;
buffer.putUnsafe(tag_key);
if (int_value < 0) {
if (int_value >= byte.min) {
current_tagvalue = Value(to!byte(int_value));
buffer.putUnsafe!char('c');
buffer.putUnsafe(cast(byte)int_value);
} else if (int_value >= short.min) {
current_tagvalue = Value(to!short(int_value));
buffer.putUnsafe!char('s');
buffer.putUnsafe(cast(short)int_value);
} else if (int_value >= int.min) {
current_tagvalue = Value(to!int(int_value));
buffer.putUnsafe!char('i');
buffer.putUnsafe(cast(int)int_value);
} else {
throw new Exception("integer out of range");
}
} else {
if (int_value <= ubyte.max) {
current_tagvalue = Value(to!ubyte(int_value));
buffer.putUnsafe!char('C');
buffer.putUnsafe(cast(ubyte)int_value);
} else if (int_value <= ushort.max) {
current_tagvalue = Value(to!ushort(int_value));
buffer.putUnsafe!char('S');
buffer.putUnsafe(cast(ushort)int_value);
} else if (int_value <= uint.max) {
current_tagvalue = Value(to!uint(int_value));
buffer.putUnsafe!char('I');
buffer.putUnsafe(cast(uint)int_value);
} else {
throw new Exception("integer out of range");
}
@ -155,16 +304,32 @@
action start_tagvalue { tagvalue_beg = p - line.ptr; }
action set_floatvalue {
current_tagvalue = Value(float_value);
buffer.capacity = buffer.data.length + 7;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('f');
buffer.putUnsafe!float(float_value);
}
action set_stringvalue {
current_tagvalue = Value(line[tagvalue_beg .. p - line.ptr]);
{
auto data = cast(ubyte[])(line[tagvalue_beg .. p - line.ptr]);
buffer.capacity = buffer.data.length + 4 + data.length;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('Z');
buffer.putUnsafe(data);
buffer.putUnsafe!ubyte(0);
}
}
action set_hexstringvalue {
current_tagvalue = Value(line[tagvalue_beg .. p - line.ptr]);
current_tagvalue.setHexadecimalFlag();
{
auto data = cast(ubyte[])(line[tagvalue_beg .. p - line.ptr]);
buffer.capacity = buffer.data.length + 4 + data.length;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('H');
buffer.putUnsafe(data);
buffer.putUnsafe!ubyte(0);
}
}
charvalue = [!-~] > set_charvalue ;
@ -172,49 +337,44 @@
floatvalue = float % set_floatvalue ;
action start_arrayvalue {
// it might be not the best idea to use outbuffer;
// the better idea might be two-pass approach
// when first pass is for counting commas, and
// the second is for filling allocated array
outbuffer.data.length = 0;
outbuffer.offset = 0;
arraytype = fc;
buffer.capacity = buffer.data.length + 8;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('B');
buffer.putUnsafe!char(arraytype);
buffer.putUnsafe!uint(0);
tag_array_length_offset = buffer.data.length - uint.sizeof;
}
action put_integer_to_array {
// here, we assume that compiler is smart enough to move switch out of loop.
switch (arraytype) {
case 'c': outbuffer.write(to!byte(int_value)); break;
case 'C': outbuffer.write(to!ubyte(int_value)); break;
case 's': outbuffer.write(to!short(int_value)); break;
case 'S': outbuffer.write(to!ushort(int_value)); break;
case 'i': outbuffer.write(to!int(int_value)); break;
case 'I': outbuffer.write(to!uint(int_value)); break;
case 'c': buffer.put(to!byte(int_value)); break;
case 'C': buffer.put(to!ubyte(int_value)); break;
case 's': buffer.put(to!short(int_value)); break;
case 'S': buffer.put(to!ushort(int_value)); break;
case 'i': buffer.put(to!int(int_value)); break;
case 'I': buffer.put(to!uint(int_value)); break;
default: assert(0);
}
{
auto ptr = cast(uint*)(buffer.data.ptr + tag_array_length_offset);
++*ptr;
}
}
action put_float_to_array {
outbuffer.write(float_value);
}
action set_arrayvalue {
switch (arraytype) {
case 'c': current_tagvalue = Value(cast(byte[])(outbuffer.toBytes())); break;
case 'C': current_tagvalue = Value(cast(ubyte[])(outbuffer.toBytes())); break;
case 's': current_tagvalue = Value(cast(short[])(outbuffer.toBytes())); break;
case 'S': current_tagvalue = Value(cast(ushort[])(outbuffer.toBytes())); break;
case 'i': current_tagvalue = Value(cast(int[])(outbuffer.toBytes())); break;
case 'I': current_tagvalue = Value(cast(uint[])(outbuffer.toBytes())); break;
case 'f': current_tagvalue = Value(cast(float[])(outbuffer.toBytes())); break;
default: assert(0);
buffer.put!float(float_value);
{
auto ptr = cast(uint*)(buffer.data.ptr + tag_array_length_offset);
++*ptr;
}
}
stringvalue = [ !-~]+ > start_tagvalue % set_stringvalue ;
hexstringvalue = xdigit+ > start_tagvalue % set_hexstringvalue ;
integerarrayvalue = [cCsSiI] > start_arrayvalue (',' int % put_integer_to_array)+ % set_arrayvalue;
floatarrayvalue = [f] > start_arrayvalue (',' float % put_float_to_array)+ % set_arrayvalue;
integerarrayvalue = [cCsSiI] > start_arrayvalue (',' int % put_integer_to_array)+ ;
floatarrayvalue = [f] > start_arrayvalue (',' float % put_float_to_array)+ ;
arrayvalue = integerarrayvalue | floatarrayvalue ;
tagvalue = ("A:" charvalue) |
@ -225,121 +385,87 @@
("B:" arrayvalue) ;
action tag_key_start { tag_key_beg = p - line.ptr; }
action tag_key_end { current_tag = line[tag_key_beg .. p - line.ptr]; }
action append_tag_value { builder.put(current_tag, current_tagvalue); }
action tag_key_end { tag_key = cast(ubyte[])(line[tag_key_beg .. p - line.ptr]); }
action handle_invalid_tag {
buffer.shrink(rollback_size);
fhold; fgoto recover_from_invalid_tag;
}
# FIXME: what if the tag is last?
recover_from_invalid_tag := invalid_field '\t' @{ fhold; fgoto tag_parsing; } ;
action update_rollback_size { rollback_size = buffer.data.length; }
tag = (alpha alnum) > tag_key_start % tag_key_end ;
optionalfield = (tag ':' tagvalue % append_tag_value) | invalid_field ;
optionalfield = tag ':' tagvalue % update_rollback_size $!handle_invalid_tag ;
optionalfields = optionalfield ('\t' optionalfield)* ;
alignment := mandatoryfields ('\t' optionalfields)? ;
alignment := field_parsing: mandatoryfields
tag_parsing: ('\t' optionalfields)? ;
write data;
}%%
import bio.sam.header;
import bio.bam.read;
import bio.bam.tagvalue;
import bio.bam.utils.tagstoragebuilder;
import std.array;
import bio.bam.bai.bin;
import bio.core.utils.outbuffer;
import bio.core.base;
import std.conv;
import std.typecons;
import std.outbuffer;
import std.c.stdlib;
class AlignmentBuildStorage {
Appender!(CigarOperation[]) cigar_appender;
OutBuffer outbuffer;
TagStorageBuilder tag_storage_builder;
this() {
cigar_appender = appender!(CigarOperation[])();
outbuffer = new OutBuffer();
tag_storage_builder = TagStorageBuilder.create();
}
void clear() {
cigar_appender.clear();
tag_storage_builder.clear();
outbuffer.data.length = 0;
outbuffer.offset = 0;
}
}
import std.array;
import std.exception;
BamRead parseAlignmentLine(string line, SamHeader header,
AlignmentBuildStorage b=null) {
BamRead parseAlignmentLine(string line, SamHeader header, OutBuffer buffer=null) {
char* p = cast(char*)line.ptr;
char* pe = p + line.length;
char* eof = pe;
int cs;
if (b is null) {
b = new AlignmentBuildStorage();
} else {
b.clear();
}
if (buffer is null)
buffer = new OutBuffer(8192);
else
buffer.clear();
size_t rollback_size; // needed in case of invalid data
byte current_sign = 1;
size_t read_name_beg; // position of beginning of QNAME
size_t read_name_end; // position past the end of QNAME
size_t sequence_beg; // position of SEQ start
string sequence; // SEQ
int l_seq; // sequence length
uint cigar_op_len; // length of CIGAR operation
char cigar_op_chr; // CIGAR operation
size_t cigar_op_len_start; // position of start of CIGAR operation
auto cigar = b.cigar_appender;
long int_value; // for storing temporary integers
float float_value; // for storing temporary floats
size_t float_beg; // position of start of current float
auto outbuffer = b.outbuffer; // used to build tag values which hold arrays
char arraytype; // type of last array tag value
size_t tag_array_length_offset; // where the length is stored in the buffer
string read_name;
ushort flag;
uint pos;
uint mate_pos;
ubyte mapping_quality;
int template_length;
ubyte* qual_ptr = null;
size_t qual_index;
string current_tag;
Value current_tagvalue;
int pos = -1;
int end_pos; // for bin calculation
int mate_pos = -1;
ubyte mapping_quality = 255;
int template_length = 0;
size_t tag_key_beg, tagvalue_beg;
ubyte[] tag_key;
size_t rname_beg, rnext_beg;
int ref_id = -1;
int mate_ref_id = -1;
auto builder = b.tag_storage_builder;
%%write init;
%%write exec;
auto read = BamRead(line[read_name_beg .. read_name_end],
sequence,
cigar.data,
builder.data);
if (qual_ptr !is null && qual_index == sequence.length) {
read.base_qualities = qual_ptr[0 .. sequence.length];
}
read.flag = flag;
read.mapping_quality = mapping_quality;
read.position = pos - 1; // we use 0-based coordinates, not 1-based
read.template_length = template_length;
read.mate_position = mate_pos - 1; // also 0-based
read.ref_id = ref_id;
read.mate_ref_id = mate_ref_id;
BamRead read;
auto gc_managed_chunk = uninitializedArray!(ubyte[])(buffer.data.length);
gc_managed_chunk[] = buffer.data[];
read.raw_data = gc_managed_chunk;
return read;
}
@ -367,7 +493,6 @@ unittest {
assert(equal!approxEqual(to!(float[])(alignment["Y1"]), [13.263, -3.1415, 52.63461]));
assert(to!char(alignment["XT"]) == 'U');
import std.stdio;
import bio.bam.reference;
auto info = ReferenceSequenceInfo("20", 1234567);
@ -382,5 +507,4 @@ unittest {
assert(to!ubyte(alignment["X1"]) == 7);
assert(alignment["X3"].is_nothing);
assert(to!ubyte(alignment["X4"]) == 5);
}

6
test/unittests.d

@ -246,8 +246,12 @@ unittest {
foreach (read; bf.reads) {
auto line = read.to!string();
auto read2 = parseAlignmentLine(line, bf.header);
read2.associateWithReader(bf);
if (read != read2) {
writeln(read.name);
writeln(read);
writeln(read2);
writeln(read.raw_data);
writeln(read2.raw_data);
}
assert(read == read2);
}

Loading…
Cancel
Save