Browse Source

Moved Cigar functionality into its own module. Reran ragel.

bio2
Pjotr Prins 5 years ago
parent
commit
7994406592
  1. 280
      bio/bam/cigar.d
  2. 55
      bio/bam/md/reconstruct.d
  3. 1
      bio/bam/pileup.d
  4. 228
      bio/bam/read.d
  5. 288
      bio/core/region.d
  6. 465
      bio/sam/utils/fastrecordparser.d
  7. 63
      bio/sam/utils/recordparser.d
  8. 7
      src_ragel/Makefile
  9. 55
      src_ragel/sam_alignment.rl

280
bio/bam/cigar.d

@ -0,0 +1,280 @@
/*
This file is part of BioD.
Copyright (C) 2012-2016 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.cigar;
import std.algorithm;
import std.range;
import std.conv;
import std.format;
import std.exception;
import std.system;
import std.traits;
import std.array;
import std.bitmanip;
import core.stdc.stdlib;
import bio.core.base;
import bio.core.utils.format;
import bio.bam.abstractreader;
import bio.bam.cigar;
import bio.bam.writer;
import bio.bam.tagvalue;
import bio.bam.bai.bin;
import bio.bam.md.core;
import bio.bam.utils.array;
import bio.bam.utils.value;
import bio.core.utils.switchendianness;
import bio.bam.thirdparty.msgpack : Packer, unpack;
/**
  One CIGAR operation in packed form: the operation code occupies
  the low 4 bits of a uint, the operation length the upper 28 bits.
*/
struct CigarOperation {
    static assert(CigarOperation.sizeof == uint.sizeof);
    /*
        WARNING!

        The size of this struct must stay EXACTLY equal to uint.sizeof.

        Alignment parsing relies on that to avoid copying arrays:
        when some pointer refers to raw CIGAR data, it is simply cast,
        so the data are accessed in place without any memory allocation.
    */
    private uint raw; // raw data from BAM

    // Maps an operation character to its numeric code, i.e. its
    // position in "MIDNSHP=X"; unknown characters map to 15.
    private static ubyte char2op(char c) {
        foreach (code, known; "MIDNSHP=X") {
            if (known == c)
                return cast(ubyte) code;
        }
        return 15; // 15 is used as invalid value
    }

    /// Length must be strictly less than 2^28.
    /// $(BR)
    /// Operation type must be one of M, I, D, N, S, H, P, =, X.
    /// Any other character is stored as the invalid code (shown as '?').
    this(uint length, char operation_type) {
        enforce(length < (1<<28), "Too big length of CIGAR operation");
        immutable uint opcode = char2op(operation_type);
        raw = (length << 4) | opcode;
    }

    /// Operation length
    uint length() @property const nothrow {
        return raw >> 4;
    }

    /// CIGAR operation as one of MIDNSHP=X.
    /// Absent or invalid operation is represented by '?'
    char type() @property const nothrow {
        immutable opcode = raw & 0xF;
        return "MIDNSHP=X????????"[opcode];
    }

    // Two bits per operation code, indexed by the code itself:
    // the low bit of a pair is set iff the operation consumes query bases,
    // the high bit is set iff it consumes reference bases.
    //                                            X  =  P  H  S  N  D  I  M
    private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;

    /// True iff operation is one of M, =, X, I, S
    bool is_query_consuming() @property const {
        immutable shift = (raw & 0xF) * 2;
        return ((CIGAR_TYPE >> shift) & 1) != 0;
    }

    /// True iff operation is one of M, =, X, D, N
    bool is_reference_consuming() @property const {
        immutable shift = (raw & 0xF) * 2;
        return ((CIGAR_TYPE >> shift) & 2) != 0;
    }

    /// True iff operation is one of M, =, X
    bool is_match_or_mismatch() @property const {
        immutable shift = (raw & 0xF) * 2;
        return ((CIGAR_TYPE >> shift) & 3) == 3;
    }

    /// True iff operation is one of 'S', 'H'
    bool is_clipping() @property const {
        immutable opcode = raw & 0xF;
        return opcode == 4 || opcode == 5; // S or H
    }

    // Writes the operation in SAM text form: the length followed by the type character.
    private void toSam(Sink)(auto ref Sink sink) const
        if (isSomeSink!Sink)
    {
        sink.write(length);
        sink.write(type);
    }

    /// Sink-based string conversion; emits the same text as toSam.
    void toString(scope void delegate(const(char)[]) sink) const {
        toSam(sink);
    }
}
/// Forward range of extended CIGAR operations, with =/X instead of M
/// Useful for, e.g., detecting positions of mismatches.
///
/// The range walks a CIGAR range and an MD-operation range in parallel.
/// Every 'M' operation is split into '=' and 'X' pieces according to the
/// MD match lengths and mismatch runs; all other operations are passed
/// through unchanged.
struct ExtendedCigarRange(CigarOpRange, MdOpRange) {
    static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation));
    static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation));

    private {
        CigarOpRange _cigar;            // CIGAR operations not yet fetched
        MdOpRange _md_ops;              // MD operations not yet fetched

        CigarOperation _front_cigar_op; // current (possibly shortened) CIGAR operation
        MdOperation _front_md_op;       // current (possibly shortened) MD operation
        uint _n_mismatches;             // remaining length of the current mismatch run, 0 if none
        bool _empty;                    // set once the CIGAR range is exhausted
    }

    /// Takes the CIGAR operations and the MD operations of the same read
    /// and immediately loads the first operation of each.
    this(CigarOpRange cigar, MdOpRange md_ops) {
        _cigar = cigar;
        _md_ops = md_ops;
        fetchNextCigarOp();
        fetchNextMdOp();
    }

    /// Forward range primitives
    bool empty() @property const {
        return _empty;
    }

    /// ditto
    CigarOperation front() @property {
        // NOTE: this accessor intentionally has no side effects; an earlier
        // debug-only writeln here printed to stdout on every call.
        if (_front_cigar_op.type != 'M')
            return _front_cigar_op;

        if (_n_mismatches == 0) {
            // No pending mismatches: emit the part of M covered by the MD match.
            assert(_front_md_op.is_match);
            uint len = min(_front_md_op.match, _front_cigar_op.length);
            return CigarOperation(len, '=');
        }

        // Pending mismatch run: emit as much of it as fits into the current M.
        assert(_front_md_op.is_mismatch);
        return CigarOperation(min(_n_mismatches, _front_cigar_op.length), 'X');
    }

    /// ditto
    ExtendedCigarRange save() @property {
        typeof(return) r = void;
        r._cigar = _cigar.save;
        r._md_ops = _md_ops.save;
        r._front_cigar_op = _front_cigar_op;
        r._front_md_op = _front_md_op;
        r._n_mismatches = _n_mismatches;
        r._empty = _empty;
        return r;
    }

    /// ditto
    void popFront() {
        if (!_front_cigar_op.is_match_or_mismatch) {
            // Among the non-match operations only a deletion has a
            // counterpart in the MD tag. A skipped region (N) consumes
            // reference bases as well but is not recorded in MD, so the
            // current MD operation must be kept for the next M operation.
            if (_front_cigar_op.type == 'D')
                fetchNextMdOp();
            fetchNextCigarOp();
            return;
        }

        auto len = _front_cigar_op.length;
        if (_n_mismatches > 0) {
            enforce(_front_md_op.is_mismatch);

            if (len > _n_mismatches) {
                // Mismatch run ends inside this operation: keep the remainder as M.
                _front_cigar_op = CigarOperation(len - _n_mismatches, 'M');
                _n_mismatches = 0;
                fetchNextMdOp();
            } else if (len < _n_mismatches) {
                // Mismatch run continues into the next CIGAR operation.
                _n_mismatches -= len;
                fetchNextCigarOp();
            } else {
                // Both end at the same position.
                fetchNextCigarOp();
                fetchNextMdOp();
            }
        } else {
            enforce(_front_md_op.is_match);
            auto n_matches = _front_md_op.match;

            if (len > n_matches) {
                // MD match ends inside this operation: keep the remainder as M.
                _front_cigar_op = CigarOperation(len - n_matches, 'M');
                fetchNextMdOp();
            } else if (len < n_matches) {
                // MD match continues into the next CIGAR operation.
                _front_md_op.match -= len;
                fetchNextCigarOp();
            } else {
                // Both end at the same position.
                fetchNextCigarOp();
                fetchNextMdOp();
            }
        }
    }

    private {
        // Loads the next CIGAR operation, or marks the range as empty
        // when there is none left.
        void fetchNextCigarOp() {
            if (_cigar.empty) {
                _empty = true;
                return;
            }

            _front_cigar_op = _cigar.front;
            _cigar.popFront();
        }

        // Loads the next MD operation. Consecutive mismatch operations are
        // collapsed: the first one stays in _front_md_op and the length of
        // the whole run is stored in _n_mismatches. When no MD operations
        // are left, the current state is kept as is.
        void fetchNextMdOp() {
            if (_md_ops.empty)
                return;

            _n_mismatches = 0;

            _front_md_op = _md_ops.front;
            _md_ops.popFront();

            if (_front_md_op.is_mismatch) {
                _n_mismatches = 1;
                while (!_md_ops.empty && _md_ops.front.is_mismatch) {
                    _md_ops.popFront();
                    _n_mismatches += 1;
                }
            }
        }
    }
}
/// Builds an ExtendedCigarRange over the given CIGAR and MD operation
/// ranges, deducing the range types from the arguments.
auto makeExtendedCigar(CigarOpRange, MdOpRange)(CigarOpRange cigar, MdOpRange md_ops) {
    alias Extended = ExtendedCigarRange!(CigarOpRange, MdOpRange);
    return Extended(cigar, md_ops);
}

55
bio/bam/md/reconstruct.d

@ -8,10 +8,10 @@
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@ -23,6 +23,7 @@
*/
module bio.bam.md.reconstruct;
import bio.bam.cigar;
import bio.bam.read;
import bio.bam.md.core;
@ -34,7 +35,7 @@ import std.range;
/// Reconstruct read DNA.
/// Returns lazy sequence.
auto dna(T)(T read)
auto dna(T)(T read)
if(isBamRead!(Unqual!T))
{
@ -56,14 +57,14 @@ auto dna(T)(T read)
_seq = seq;
_ops = ops;
}
auto front() @property {
auto op = _ops.front;
return QueryChunk!S(_seq[0 .. op.length], op);
}
bool empty() @property {
return _ops.empty;
return _ops.empty;
}
void popFront() {
@ -81,7 +82,7 @@ auto dna(T)(T read)
// Get read sequence chunks corresponding to query-consuming operations in read.sequence
static auto queryChunks(ref T read) {
return getQueryChunksResult(read.sequence, filter!"a.is_query_consuming"(read.cigar));
}
@ -95,13 +96,13 @@ auto dna(T)(T read)
debug {
_initial_qseq = to!string(query_sequence);
}
_qseq = query_sequence;
_qseq = query_sequence;
_md = md_operations;
_fetchNextMdOperation();
}
bool empty() @property {
return _empty;
return _empty;
}
/*
@ -177,23 +178,23 @@ auto dna(T)(T read)
}
R _qseq;
M _md;
bool _empty;
MdOperation _cur_md_op;
}
}
auto md = _read["MD"];
string md_str;
if (!md.is_nothing) {
md_str = cast(string)_read["MD"];
}
static auto getResult(R, M)(ref T read, R query, M md_ops) {
return Result!(R, M)(read, query, md_ops);
}
auto result = getResult(_read,
auto result = getResult(_read,
joiner(map!"a.sequence"(filter!"a.operation.is_reference_consuming"(query_chunks))),
mdOperations(md_str));
@ -314,19 +315,19 @@ auto dna(R)(R reads)
/*
* If current chunk is empty, get the next one.
*
* Here's the reference:
*
* Here's the reference:
* .........................*.......................................
* _reference_pos (we are here)
* Last chunk ended just now:
* [..........]
* Go through subsequent reads while their leftmost position is
* less or equal to _reference_pos, select the one which covers
* more bases to the right of _reference_pos.
* [...............]
* [....]
* [..........]
* [.........] <- this one is the best
* _reference_pos (we are here)
* Last chunk ended just now:
* [..........]
* Go through subsequent reads while their leftmost position is
* less or equal to _reference_pos, select the one which covers
* more bases to the right of _reference_pos.
* [...............]
* [....]
* [..........]
* [.........] <- this one is the best
*/
if (_chunk.empty) {
if (_reads.empty) {
@ -377,7 +378,7 @@ auto dna(R)(R reads)
debug {
/*
import std.stdio;
writeln("_reference_pos = ", _reference_pos,
writeln("_reference_pos = ", _reference_pos,
"; best_read.position = ", best_read.position,
"; _chunk length = ", best_read.basesCovered());
*/
@ -400,7 +401,7 @@ auto dna(R)(R reads)
unittest {
// reads are taken from HG00110.chrom20.ILLUMINA.bwa.GBR.exome.20111114.bam
// reads are taken from HG00110.chrom20.ILLUMINA.bwa.GBR.exome.20111114.bam
auto r1 = BamRead("r1",
"AGGTTTTGTGAGTGGGACAGTTGCAGCAAAACACAACCATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTA",
@ -415,7 +416,7 @@ unittest {
CigarOperation(7, 'S')]);
r2.position = 60252;
r2["MD"] = "82T0";
auto r3 = BamRead("r3",
"CATAGGTGCCCATCCACCAAGGCAGGCTCTCCATCTTGCTCAGAGTGGCTCTAGCCCTTGCTGACTGCTGGGCAGGGAGAGAGCAGAGCT",
[CigarOperation(90, 'M')]);

1
bio/bam/pileup.d

@ -69,6 +69,7 @@
/// ---------------------------------------------------------
module bio.bam.pileup;
import bio.bam.cigar;
import bio.bam.read;
import bio.bam.md.reconstruct;
import bio.bam.splitter;

228
bio/bam/read.d

@ -51,6 +51,7 @@ import bio.core.base;
import bio.core.utils.format;
import bio.bam.abstractreader;
import bio.bam.cigar;
import bio.bam.writer;
import bio.bam.tagvalue;
import bio.bam.bai.bin;
@ -74,233 +75,6 @@ import std.array;
import std.bitmanip;
import core.stdc.stdlib;
/**
  One CIGAR operation in packed form: the operation code occupies
  the low 4 bits of a uint, the operation length the upper 28 bits.
*/
struct CigarOperation {
    static assert(CigarOperation.sizeof == uint.sizeof);
    /*
        WARNING!

        The size of this struct must stay EXACTLY equal to uint.sizeof.

        Alignment parsing relies on that to avoid copying arrays:
        when some pointer refers to raw CIGAR data, it is simply cast,
        so the data are accessed in place without any memory allocation.
    */
    private uint raw; // raw data from BAM

    // Maps an operation character to its numeric code, i.e. its
    // position in "MIDNSHP=X"; unknown characters map to 15.
    private static ubyte char2op(char c) {
        foreach (code, known; "MIDNSHP=X") {
            if (known == c)
                return cast(ubyte) code;
        }
        return 15; // 15 is used as invalid value
    }

    /// Length must be strictly less than 2^28.
    /// $(BR)
    /// Operation type must be one of M, I, D, N, S, H, P, =, X.
    /// Any other character is stored as the invalid code (shown as '?').
    this(uint length, char operation_type) {
        enforce(length < (1<<28), "Too big length of CIGAR operation");
        immutable uint opcode = char2op(operation_type);
        raw = (length << 4) | opcode;
    }

    /// Operation length
    uint length() @property const nothrow {
        return raw >> 4;
    }

    /// CIGAR operation as one of MIDNSHP=X.
    /// Absent or invalid operation is represented by '?'
    char type() @property const nothrow {
        immutable opcode = raw & 0xF;
        return "MIDNSHP=X????????"[opcode];
    }

    // Two bits per operation code, indexed by the code itself:
    // the low bit of a pair is set iff the operation consumes query bases,
    // the high bit is set iff it consumes reference bases.
    //                                            X  =  P  H  S  N  D  I  M
    private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;

    /// True iff operation is one of M, =, X, I, S
    bool is_query_consuming() @property const {
        immutable shift = (raw & 0xF) * 2;
        return ((CIGAR_TYPE >> shift) & 1) != 0;
    }

    /// True iff operation is one of M, =, X, D, N
    bool is_reference_consuming() @property const {
        immutable shift = (raw & 0xF) * 2;
        return ((CIGAR_TYPE >> shift) & 2) != 0;
    }

    /// True iff operation is one of M, =, X
    bool is_match_or_mismatch() @property const {
        immutable shift = (raw & 0xF) * 2;
        return ((CIGAR_TYPE >> shift) & 3) == 3;
    }

    /// True iff operation is one of 'S', 'H'
    bool is_clipping() @property const {
        immutable opcode = raw & 0xF;
        return opcode == 4 || opcode == 5; // S or H
    }

    // Writes the operation in SAM text form: the length followed by the type character.
    private void toSam(Sink)(auto ref Sink sink) const
        if (isSomeSink!Sink)
    {
        sink.write(length);
        sink.write(type);
    }

    /// Sink-based string conversion; emits the same text as toSam.
    void toString(scope void delegate(const(char)[]) sink) const {
        toSam(sink);
    }
}
/// Forward range of extended CIGAR operations, with =/X instead of M
/// Useful for, e.g., detecting positions of mismatches.
///
/// The range walks a CIGAR range and an MD-operation range in parallel.
/// Every 'M' operation is split into '=' and 'X' pieces according to the
/// MD match lengths and mismatch runs; all other operations are passed
/// through unchanged.
struct ExtendedCigarRange(CigarOpRange, MdOpRange) {
    static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation));
    static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation));

    private {
        CigarOpRange _cigar;            // CIGAR operations not yet fetched
        MdOpRange _md_ops;              // MD operations not yet fetched

        CigarOperation _front_cigar_op; // current (possibly shortened) CIGAR operation
        MdOperation _front_md_op;       // current (possibly shortened) MD operation
        uint _n_mismatches;             // remaining length of the current mismatch run, 0 if none
        bool _empty;                    // set once the CIGAR range is exhausted
    }

    /// Takes the CIGAR operations and the MD operations of the same read
    /// and immediately loads the first operation of each.
    this(CigarOpRange cigar, MdOpRange md_ops) {
        _cigar = cigar;
        _md_ops = md_ops;
        fetchNextCigarOp();
        fetchNextMdOp();
    }

    /// Forward range primitives
    bool empty() @property const {
        return _empty;
    }

    /// ditto
    CigarOperation front() @property {
        // NOTE: this accessor intentionally has no side effects; an earlier
        // debug-only writeln here printed to stdout on every call.
        if (_front_cigar_op.type != 'M')
            return _front_cigar_op;

        if (_n_mismatches == 0) {
            // No pending mismatches: emit the part of M covered by the MD match.
            assert(_front_md_op.is_match);
            uint len = min(_front_md_op.match, _front_cigar_op.length);
            return CigarOperation(len, '=');
        }

        // Pending mismatch run: emit as much of it as fits into the current M.
        assert(_front_md_op.is_mismatch);
        return CigarOperation(min(_n_mismatches, _front_cigar_op.length), 'X');
    }

    /// ditto
    ExtendedCigarRange save() @property {
        typeof(return) r = void;
        r._cigar = _cigar.save;
        r._md_ops = _md_ops.save;
        r._front_cigar_op = _front_cigar_op;
        r._front_md_op = _front_md_op;
        r._n_mismatches = _n_mismatches;
        r._empty = _empty;
        return r;
    }

    /// ditto
    void popFront() {
        if (!_front_cigar_op.is_match_or_mismatch) {
            // Among the non-match operations only a deletion has a
            // counterpart in the MD tag. A skipped region (N) consumes
            // reference bases as well but is not recorded in MD, so the
            // current MD operation must be kept for the next M operation.
            if (_front_cigar_op.type == 'D')
                fetchNextMdOp();
            fetchNextCigarOp();
            return;
        }

        auto len = _front_cigar_op.length;
        if (_n_mismatches > 0) {
            enforce(_front_md_op.is_mismatch);

            if (len > _n_mismatches) {
                // Mismatch run ends inside this operation: keep the remainder as M.
                _front_cigar_op = CigarOperation(len - _n_mismatches, 'M');
                _n_mismatches = 0;
                fetchNextMdOp();
            } else if (len < _n_mismatches) {
                // Mismatch run continues into the next CIGAR operation.
                _n_mismatches -= len;
                fetchNextCigarOp();
            } else {
                // Both end at the same position.
                fetchNextCigarOp();
                fetchNextMdOp();
            }
        } else {
            enforce(_front_md_op.is_match);
            auto n_matches = _front_md_op.match;

            if (len > n_matches) {
                // MD match ends inside this operation: keep the remainder as M.
                _front_cigar_op = CigarOperation(len - n_matches, 'M');
                fetchNextMdOp();
            } else if (len < n_matches) {
                // MD match continues into the next CIGAR operation.
                _front_md_op.match -= len;
                fetchNextCigarOp();
            } else {
                // Both end at the same position.
                fetchNextCigarOp();
                fetchNextMdOp();
            }
        }
    }

    private {
        // Loads the next CIGAR operation, or marks the range as empty
        // when there is none left.
        void fetchNextCigarOp() {
            if (_cigar.empty) {
                _empty = true;
                return;
            }

            _front_cigar_op = _cigar.front;
            _cigar.popFront();
        }

        // Loads the next MD operation. Consecutive mismatch operations are
        // collapsed: the first one stays in _front_md_op and the length of
        // the whole run is stored in _n_mismatches. When no MD operations
        // are left, the current state is kept as is.
        void fetchNextMdOp() {
            if (_md_ops.empty)
                return;

            _n_mismatches = 0;

            _front_md_op = _md_ops.front;
            _md_ops.popFront();

            if (_front_md_op.is_mismatch) {
                _n_mismatches = 1;
                while (!_md_ops.empty && _md_ops.front.is_mismatch) {
                    _md_ops.popFront();
                    _n_mismatches += 1;
                }
            }
        }
    }
}
/// Builds an ExtendedCigarRange over the given CIGAR and MD operation
/// ranges, deducing the range types from the arguments.
auto makeExtendedCigar(CigarOpRange, MdOpRange)(CigarOpRange cigar, MdOpRange md_ops) {
    alias Extended = ExtendedCigarRange!(CigarOpRange, MdOpRange);
    return Extended(cigar, md_ops);
}
/**
BAM record representation.
*/

288
bio/core/region.d

@ -26,15 +26,64 @@
module bio.core.region;
#line 26 "region.d"
static const int region_parser_start = 1;
static const int region_parser_first_final = 3;
static const int region_parser_error = 0;
#line 30 "region.d"
static byte[] _region_parser_actions = [
0, 1, 1, 1, 2, 1, 3, 1,
4, 2, 0, 1
];
static const int region_parser_en_region = 1;
static byte[] _region_parser_key_offsets = [
0, 0, 6, 9, 12, 19, 21, 25
];
static char[] _region_parser_trans_keys = [
33u, 41u, 43u, 60u, 62u, 126u, 44u, 48u,
57u, 58u, 33u, 126u, 44u, 33u, 47u, 48u,
57u, 58u, 126u, 33u, 126u, 44u, 45u, 48u,
57u, 44u, 48u, 57u, 0
];
#line 40 "region.rl"
static byte[] _region_parser_single_lengths = [
0, 0, 1, 1, 1, 0, 2, 1
];
static byte[] _region_parser_range_lengths = [
0, 3, 1, 1, 3, 1, 1, 1
];
static byte[] _region_parser_index_offsets = [
0, 0, 4, 7, 10, 15, 17, 21
];
static byte[] _region_parser_indicies = [
0, 0, 0, 1, 2, 2, 1, 3,
0, 1, 5, 4, 5, 4, 1, 4,
1, 6, 7, 6, 1, 8, 8, 1,
0
];
static byte[] _region_parser_trans_targs = [
3, 0, 7, 4, 5, 6, 6, 2,
7
];
static byte[] _region_parser_trans_actions = [
0, 0, 9, 3, 0, 9, 1, 5,
1
];
static byte[] _region_parser_eof_actions = [
0, 0, 0, 3, 3, 3, 5, 7
];
static int region_parser_start = 1;
static int region_parser_first_final = 3;
static int region_parser_error = 0;
static int region_parser_en_region = 1;
#line 44 "region.rl"
import std.conv;
@ -57,161 +106,142 @@ Region parseRegion(string str) {
region.end = uint.max;
#line 57 "region.d"
#line 110 "region.d"
{
cs = region_parser_start;
}
#line 62 "region.rl"
#line 66 "region.rl"
#line 64 "region.d"
#line 117 "region.d"
{
int _klen;
uint _trans;
byte* _acts;
uint _nacts;
char* _keys;
if ( p == pe )
goto _test_eof;
switch ( cs )
if ( cs == 0 )
goto _out;
_resume:
_keys = &_region_parser_trans_keys[_region_parser_key_offsets[cs]];
_trans = _region_parser_index_offsets[cs];
_klen = _region_parser_single_lengths[cs];
if ( _klen > 0 ) {
char* _lower = _keys;
char* _mid;
char* _upper = _keys + _klen - 1;
while (1) {
if ( _upper < _lower )
break;
_mid = _lower + ((_upper-_lower) >> 1);
if ( (*p) < *_mid )
_upper = _mid - 1;
else if ( (*p) > *_mid )
_lower = _mid + 1;
else {
_trans += cast(uint)(_mid - _keys);
goto _match;
}
}
_keys += _klen;
_trans += _klen;
}
_klen = _region_parser_range_lengths[cs];
if ( _klen > 0 ) {
char* _lower = _keys;
char* _mid;
char* _upper = _keys + (_klen<<1) - 2;
while (1) {
if ( _upper < _lower )
break;
_mid = _lower + (((_upper-_lower) >> 1) & ~1);
if ( (*p) < _mid[0] )
_upper = _mid - 2;
else if ( (*p) > _mid[1] )
_lower = _mid + 2;
else {
_trans += cast(uint)((_mid - _keys)>>1);
goto _match;
}
}
_trans += _klen;
}
_match:
_trans = _region_parser_indicies[_trans];
cs = _region_parser_trans_targs[_trans];
if ( _region_parser_trans_actions[_trans] == 0 )
goto _again;
_acts = &_region_parser_actions[_region_parser_trans_actions[_trans]];
_nacts = cast(uint) *_acts++;
while ( _nacts-- > 0 )
{
goto case; case 1:
if ( (*p) < 43u ) {
if ( 33u <= (*p) && (*p) <= 41u )
goto st3;
} else if ( (*p) > 60u ) {
if ( 62u <= (*p) && (*p) <= 126u )
goto st3;
} else
goto st3;
goto st0;
st0:
cs = 0;
goto _out;
st3:
if ( ++p == pe )
goto _test_eof3;
goto case; case 3:
if ( (*p) == 58u )
goto tr3;
if ( 33u <= (*p) && (*p) <= 126u )
goto st3;
goto st0;
tr3:
switch ( *_acts++ )
{
case 0:
#line 29 "region.rl"
{ region.reference = str[0 .. p - str.ptr]; }
goto st4;
st4:
if ( ++p == pe )
goto _test_eof4;
goto case; case 4:
#line 100 "region.d"
if ( (*p) == 44u )
goto tr5;
if ( (*p) < 48u ) {
if ( 33u <= (*p) && (*p) <= 47u )
goto st5;
} else if ( (*p) > 57u ) {
if ( 58u <= (*p) && (*p) <= 126u )
goto st5;
} else
goto tr5;
goto st0;
st5:
if ( ++p == pe )
goto _test_eof5;
goto case; case 5:
if ( 33u <= (*p) && (*p) <= 126u )
goto st5;
goto st0;
tr5:
#line 25 "region.rl"
{ uint_value = 0; }
#line 26 "region.rl"
{ if ((*p) != ',') uint_value *= 10, uint_value += (*p) - '0'; }
goto st6;
tr6:
#line 26 "region.rl"
{ if ((*p) != ',') uint_value *= 10, uint_value += (*p) - '0'; }
goto st6;
st6:
if ( ++p == pe )
goto _test_eof6;
goto case; case 6:
#line 133 "region.d"
switch( (*p) ) {
case 44u: goto tr6;
case 45u: goto tr7;
default: break;
}
if ( 48u <= (*p) && (*p) <= 57u )
goto tr6;
goto st0;
tr7:
break;
case 1:
#line 30 "region.rl"
{ region.beg = to!uint(uint_value - 1); }
goto st2;
st2:
if ( ++p == pe )
goto _test_eof2;
goto case; case 2:
#line 150 "region.d"
if ( (*p) == 44u )
goto tr2;
if ( 48u <= (*p) && (*p) <= 57u )
goto tr2;
goto st0;
tr2:
#line 25 "region.rl"
{ uint_value = 0; }
#line 26 "region.rl"
{ if ((*p) != ',') uint_value *= 10, uint_value += (*p) - '0'; }
goto st7;
tr8:
#line 26 "region.rl"
{ if ((*p) != ',') uint_value *= 10, uint_value += (*p) - '0'; }
goto st7;
st7:
if ( ++p == pe )
goto _test_eof7;
goto case; case 7:
#line 170 "region.d"
if ( (*p) == 44u )
goto tr8;
if ( 48u <= (*p) && (*p) <= 57u )
goto tr8;
goto st0;
break;
case 2:
#line 33 "region.rl"
{ region.reference = str[0 .. p - str.ptr]; }
break;
case 3:
#line 34 "region.rl"
{ region.beg = to!uint(uint_value - 1); }
break;
#line 207 "region.d"
default: break;
}
}
_test_eof3: cs = 3; goto _test_eof;
_test_eof4: cs = 4; goto _test_eof;
_test_eof5: cs = 5; goto _test_eof;
_test_eof6: cs = 6; goto _test_eof;
_test_eof2: cs = 2; goto _test_eof;
_test_eof7: cs = 7; goto _test_eof;
_again:
if ( cs == 0 )
goto _out;
if ( ++p != pe )
goto _resume;
_test_eof: {}
if ( p == eof )
{
switch ( cs ) {
case 3:
case 4:
case 5:
#line 29 "region.rl"
byte* __acts = &_region_parser_actions[_region_parser_eof_actions[cs]];
uint __nacts = cast(uint) *__acts++;
while ( __nacts-- > 0 ) {
switch ( *__acts++ ) {
case 2:
#line 33 "region.rl"
{ region.reference = str[0 .. p - str.ptr]; }
break;
case 6:
#line 30 "region.rl"
case 3:
#line 34 "region.rl"
{ region.beg = to!uint(uint_value - 1); }
break;
case 7:
#line 31 "region.rl"
case 4:
#line 35 "region.rl"
{ region.end = to!uint(uint_value); }
break;
#line 203 "region.d"
#line 236 "region.d"
default: break;
}
}
}
_out: {}
}
#line 63 "region.rl"
#line 67 "region.rl"
return region;
}

465
bio/sam/utils/fastrecordparser.d

File diff suppressed because it is too large

63
bio/sam/utils/recordparser.d

@ -11,10 +11,10 @@ module bio.sam.utils.recordparser;
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@ -579,6 +579,7 @@ static int sam_alignment_en_alignment_tag_parsing = 251;
import bio.sam.header;
import bio.bam.cigar;
import bio.bam.read;
import bio.bam.bai.bin;
import bio.core.utils.outbuffer;
@ -599,7 +600,7 @@ BamRead parseAlignmentLine(string line, SamHeader header, OutBuffer buffer=null)
buffer.clear();
size_t rollback_size; // needed in case of invalid data
byte current_sign = 1;
size_t read_name_beg; // position of beginning of QNAME
@ -614,7 +615,7 @@ BamRead parseAlignmentLine(string line, SamHeader header, OutBuffer buffer=null)
char quals_last_char; // needed in order to handle '*' correctly
size_t cigar_op_len_start; // position of start of CIGAR operation
long int_value; // for storing temporary integers
float float_value; // for storing temporary floats
size_t float_beg; // position of start of current float
@ -636,14 +637,14 @@ BamRead parseAlignmentLine(string line, SamHeader header, OutBuffer buffer=null)
int ref_id = -1;
#line 639 "sam_alignment.d"
#line 640 "sam_alignment.d"
{
cs = sam_alignment_start;
}
#line 479 "sam_alignment.rl"
#line 480 "sam_alignment.rl"
#line 646 "sam_alignment.d"
#line 647 "sam_alignment.d"
{
int _klen;
uint _trans;
@ -738,7 +739,7 @@ _match:
break;
case 5:
#line 38 "sam_alignment.rl"
{
{
float_value = to!float(line[float_beg .. p - line.ptr]);
}
break;
@ -777,7 +778,7 @@ _match:
case 14:
#line 63 "sam_alignment.rl"
{
ref_id = header.getSequenceIndex(line[rname_beg .. p - line.ptr]);
ref_id = header.getSequenceIndex(line[rname_beg .. p - line.ptr]);
}
break;
case 15:
@ -849,11 +850,11 @@ _match:
break;
case 26:
#line 113 "sam_alignment.rl"
{
{
auto op = CigarOperation(cigar_op_len, cigar_op_chr);
if (op.is_reference_consuming)
end_pos += op.length;
buffer.put!CigarOperation(op);
buffer.put!CigarOperation(op);
{
auto ptr = cast(uint*)(buffer.data.ptr + 3 * uint.sizeof);
*ptr = (*ptr) + 1;
@ -902,7 +903,7 @@ _match:
case 32:
#line 156 "sam_alignment.rl"
{
{
{
auto ptr = cast(int*)(buffer.data.ptr + 5 * int.sizeof);
*ptr = header.getSequenceIndex(line[rnext_beg .. p - line.ptr]);
}
@ -918,7 +919,7 @@ _match:
break;
case 35:
#line 169 "sam_alignment.rl"
{
{
{
auto ptr = cast(int*)(buffer.data.ptr + 6 * int.sizeof);
*ptr = to!int(int_value) - 1;
@ -935,7 +936,7 @@ _match:
break;
case 38:
#line 181 "sam_alignment.rl"
{
{
{
auto ptr = cast(int*)(buffer.data.ptr + 7 * int.sizeof);
*ptr = to!int(int_value);
@ -956,7 +957,7 @@ _match:
break;
case 42:
#line 194 "sam_alignment.rl"
{
{
auto data = cast(ubyte[])line[sequence_beg .. p - line.ptr];
l_seq = cast(int)data.length;
auto raw_len = (l_seq + 1) / 2;
@ -1041,16 +1042,16 @@ _match:
break;
case 51:
#line 278 "sam_alignment.rl"
{
{
buffer.capacity = buffer.length + 4;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('A');
buffer.putUnsafe!char((*p));
buffer.putUnsafe!char((*p));
}
break;
case 52:
#line 285 "sam_alignment.rl"
{
{
buffer.capacity = buffer.length + 7;
buffer.putUnsafe(tag_key);
if (int_value < 0) {
@ -1088,7 +1089,7 @@ _match:
break;
case 54:
#line 319 "sam_alignment.rl"
{
{
buffer.capacity = buffer.length + 7;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('f');
@ -1097,7 +1098,7 @@ _match:
break;
case 55:
#line 326 "sam_alignment.rl"
{
{
{
auto data = cast(ubyte[])(line[tagvalue_beg .. p - line.ptr]);
buffer.capacity = buffer.length + 4 + data.length;
@ -1154,7 +1155,7 @@ _match:
break;
case 59:
#line 379 "sam_alignment.rl"
{
{
buffer.put!float(float_value);
{
auto ptr = cast(uint*)(buffer.data.ptr + tag_array_length_offset);
@ -1173,7 +1174,7 @@ _match:
case 62:
#line 403 "sam_alignment.rl"
{
buffer.shrink(rollback_size);
buffer.shrink(rollback_size);
p--; {cs = 180; if (true) goto _again;}
}
break;
@ -1185,7 +1186,7 @@ _match:
#line 410 "sam_alignment.rl"
{ rollback_size = buffer.length; }
break;
#line 1188 "sam_alignment.d"
#line 1189 "sam_alignment.d"
default: break;
}
}
@ -1208,7 +1209,7 @@ _again:
break;
case 5:
#line 38 "sam_alignment.rl"
{
{
float_value = to!float(line[float_beg .. p - line.ptr]);
}
break;
@ -1293,7 +1294,7 @@ _again:
break;
case 52:
#line 285 "sam_alignment.rl"
{
{
buffer.capacity = buffer.length + 7;
buffer.putUnsafe(tag_key);
if (int_value < 0) {
@ -1327,7 +1328,7 @@ _again:
break;
case 54:
#line 319 "sam_alignment.rl"
{
{
buffer.capacity = buffer.length + 7;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('f');
@ -1336,7 +1337,7 @@ _again:
break;
case 55:
#line 326 "sam_alignment.rl"
{
{
{
auto data = cast(ubyte[])(line[tagvalue_beg .. p - line.ptr]);
buffer.capacity = buffer.length + 4 + data.length;
@ -1381,7 +1382,7 @@ _again:
break;
case 59:
#line 379 "sam_alignment.rl"
{
{
buffer.put!float(float_value);
{
auto ptr = cast(uint*)(buffer.data.ptr + tag_array_length_offset);
@ -1392,7 +1393,7 @@ _again:
case 62:
#line 403 "sam_alignment.rl"
{
buffer.shrink(rollback_size);
buffer.shrink(rollback_size);
p--; {cs = 180; if (true) goto _again;}
}
break;
@ -1400,7 +1401,7 @@ _again:
#line 410 "sam_alignment.rl"
{ rollback_size = buffer.length; }
break;
#line 1403 "sam_alignment.d"
#line 1404 "sam_alignment.d"
default: break;
}
}
@ -1409,7 +1410,7 @@ _again:
_out: {}
}
#line 480 "sam_alignment.rl"
#line 481 "sam_alignment.rl"
BamRead read;
read.raw_data = buffer.data[];

7
src_ragel/Makefile

@ -6,23 +6,26 @@ all: fastrecordparser recordparser regionparser
.PHONY : regionparser
fastrecordparser:
fastrecordparser:
ragel sam_alignment.rl -D -G2
./workarounds/fix_switch_case_fallthrough.sh sam_alignment.d
echo 'module bio.sam.utils.fastrecordparser;' | cat - sam_alignment.d > .sam_alignment.d.tmp
rm sam_alignment.d
mv .sam_alignment.d.tmp fastrecordparser.d
mv fastrecordparser.d ../bio/sam/utils/fastrecordparser.d
recordparser:
recordparser:
ragel sam_alignment.rl -D
./workarounds/fix_static_const.sh sam_alignment.d
echo 'module bio.sam.utils.recordparser;' | cat - sam_alignment.d > .sam_alignment.d.tmp
rm sam_alignment.d
mv .sam_alignment.d.tmp recordparser.d
mv recordparser.d ../bio/sam/utils/recordparser.d
regionparser:
ragel region.rl -D
./workarounds/fix_static_const.sh region.d
mv region.d ../bio/core/region.d
clean:
rm -f *parser.d region.d

55
src_ragel/sam_alignment.rl

@ -8,10 +8,10 @@
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@ -35,7 +35,7 @@
int = (sign >update_sign)? uint % take_sign_into_account ;
action mark_float_start { float_beg = p - line.ptr; }
action update_float_value {
action update_float_value {
float_value = to!float(line[float_beg .. p - line.ptr]);
}
@ -61,7 +61,7 @@
### 3. STORE RNAME ###
action rname_start { rname_beg = p - line.ptr; }
action rname_end {
ref_id = header.getSequenceIndex(line[rname_beg .. p - line.ptr]);
ref_id = header.getSequenceIndex(line[rname_beg .. p - line.ptr]);
}
action handle_invalid_rname { fhold; fgoto recover_from_invalid_rname; }
@ -110,11 +110,11 @@
### 7. STORE CIGAR OPERATIONS ###
action cigar_set_op_length { cigar_op_len = to!uint(int_value); }
action cigar_set_op_chr { cigar_op_chr = fc; }
action cigar_put_operation {
action cigar_put_operation {
auto op = CigarOperation(cigar_op_len, cigar_op_chr);
if (op.is_reference_consuming)
end_pos += op.length;
buffer.put!CigarOperation(op);
buffer.put!CigarOperation(op);
{
auto ptr = cast(uint*)(buffer.data.ptr + 3 * uint.sizeof);
*ptr = (*ptr) + 1;
@ -151,10 +151,10 @@
*ptr = ref_id;
}
}
action rnext_start { rnext_beg = p - line.ptr; }
action rnext_end {
{
{
auto ptr = cast(int*)(buffer.data.ptr + 5 * int.sizeof);
*ptr = header.getSequenceIndex(line[rnext_beg .. p - line.ptr]);
}
@ -162,11 +162,11 @@
action handle_invalid_rnext { fhold; fgoto recover_from_invalid_rnext; }
recover_from_invalid_rnext := invalid_field '\t' @{ fhold; fgoto pnext_parsing; } ;
rnext = '*' | ('=' % set_same_mate_ref_id) |
rnext = '*' | ('=' % set_same_mate_ref_id) |
(([!-()+-<>-~][!-~]*) > rnext_start % rnext_end) ;
### 10. SET MATE POSITION ###
action set_mate_pos {
action set_mate_pos {
{
auto ptr = cast(int*)(buffer.data.ptr + 6 * int.sizeof);
*ptr = to!int(int_value) - 1;
@ -178,7 +178,7 @@
pnext = uint % set_mate_pos;
### 11. SET TEMPLATE LENGTH ###
action set_template_length {
action set_template_length {
{
auto ptr = cast(int*)(buffer.data.ptr + 7 * int.sizeof);
*ptr = to!int(int_value);
@ -191,7 +191,7 @@
### 12. SET SEQUENCE ###
action sequence_start { sequence_beg = p - line.ptr; }
action sequence_end {
action sequence_end {
auto data = cast(ubyte[])line[sequence_beg .. p - line.ptr];
l_seq = cast(int)data.length;
auto raw_len = (l_seq + 1) / 2;
@ -275,14 +275,14 @@
############ TAG PARSING ######
action set_charvalue {
action set_charvalue {
buffer.capacity = buffer.length + 4;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('A');
buffer.putUnsafe!char(fc);
buffer.putUnsafe!char(fc);
}
action set_integervalue {
action set_integervalue {
buffer.capacity = buffer.length + 7;
buffer.putUnsafe(tag_key);
if (int_value < 0) {
@ -316,14 +316,14 @@
action start_tagvalue { tagvalue_beg = p - line.ptr; }
action set_floatvalue {
action set_floatvalue {
buffer.capacity = buffer.length + 7;
buffer.putUnsafe(tag_key);
buffer.putUnsafe!char('f');
buffer.putUnsafe!float(float_value);
}
action set_stringvalue {
action set_stringvalue {
{
auto data = cast(ubyte[])(line[tagvalue_beg .. p - line.ptr]);
buffer.capacity = buffer.length + 4 + data.length;
@ -376,7 +376,7 @@
}
}
action put_float_to_array {
action put_float_to_array {
buffer.put!float(float_value);
{
auto ptr = cast(uint*)(buffer.data.ptr + tag_array_length_offset);
@ -390,10 +390,10 @@
floatarrayvalue = [f] > start_arrayvalue (',' float % put_float_to_array)+ ;
arrayvalue = integerarrayvalue | floatarrayvalue ;
tagvalue = ("A:" charvalue) |
("i:" integervalue) |
("f:" floatvalue) |
("Z:" stringvalue) |
tagvalue = ("A:" charvalue) |
("i:" integervalue) |
("f:" floatvalue) |
("Z:" stringvalue) |
("H:" hexstringvalue) |
("B:" arrayvalue) ;
@ -401,7 +401,7 @@
action tag_key_end { tag_key = cast(ubyte[])(line[tag_key_beg .. p - line.ptr]); }
action handle_invalid_tag {
buffer.shrink(rollback_size);
buffer.shrink(rollback_size);
fhold; fgoto recover_from_invalid_tag;
}
# FIXME: what if the tag is last?
@ -412,13 +412,14 @@
optionalfield = tag ':' tagvalue % update_rollback_size $!handle_invalid_tag ;
optionalfields = optionalfield ('\t' optionalfield)* ;
alignment := field_parsing: mandatoryfields
alignment := field_parsing: mandatoryfields
tag_parsing: ('\t' optionalfields)? ;
write data;
write data;
}%%