Browse Source

Started optimizing tokenizer/splitter

georgeg
Pjotr Prins 4 years ago
parent
commit
da9b0b958a
  1. 14
      bio/std/decompress.d
  2. 49
      bio/std/range/splitter.d

14
bio/std/decompress.d

@ -11,6 +11,20 @@ module bio.std.decompress;
current edition (still) uses the garbage collector. It may help to
switch it off or to use the BioD decompressor used by bgzf.
For a comparison with gzip: decompressing a 2GB file with the current edition took
real 0m53.701s
user 0m53.820s
sys 0m0.572s
while gzip took
real 0m11.528s
user 0m10.288s
sys 0m0.936s
So, that is something to aim for.
Conversion can happen between different encodings, provided the
line terminator is ubyte = '\n'. GzipbyLine logic is modeled on
ByLineImpl and readln function from std.stdio.

49
bio/std/range/splitter.d

@ -70,3 +70,52 @@ unittest {
assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"hello, 1 2 \n\t3 4 \n")) == ["hello","1","2","3","4"]);
assert(array(SimpleSplitConv!(ubyte[])(cast(ubyte[])"chr1:55365,55365,1")) == ["chr1:55365","55365","1"]);
}
/**
 * Tokenizer that splits `list` on any element contained in `split_on`
 * and feeds each token to a `foreach` body through `opApply`.
 *
 * Tokens are slices of `list`; nothing is copied. A run of consecutive
 * split elements acts as a single separator. A separator at the very
 * start produces one empty leading token, trailing separators produce
 * no trailing token, and an empty `list` produces a single empty token.
 *
 * Note: the template constraint only requires an input range, but the
 * implementation slices `list` and iterates it with an index, so `R`
 * has to support both (e.g. an array type such as `ubyte[]`).
 */
struct FastSplitConv(R)
  if (isInputRange!R)
{
  R list, split_on;

  /**
   * Params:
   *   range     = the data to tokenize
   *   splits_on = the elements that separate tokens (defaults to SPLIT_ON)
   */
  this(R range, R splits_on = cast(R)SPLIT_ON) {
    list = range;
    split_on = splits_on;
  }

  /**
   * Calls `dg` once per token, in order.
   *
   * Returns: 0 when every token was visited, otherwise the first
   * non-zero value returned by `dg`. A non-zero value is how the
   * compiler signals `break`/`return`/`goto` out of the foreach body,
   * so iteration must stop and the value must be passed back unchanged.
   */
  int opApply(scope int delegate(R) dg) {
    size_t start = 0;
    bool in_whitespace = false;
    foreach (size_t pos, c; list) {
      if (canFind(split_on, c)) { // hit split char
        if (!in_whitespace) {     // first separator after a token: emit it
          auto token = list[start .. pos];
          if (auto result = dg(token))
            return result;        // foreach body asked to stop
        }
        start = pos + 1;          // next token starts after this separator
        in_whitespace = true;
      } else {
        in_whitespace = false;
      }
    }
    if (!in_whitespace) {         // input did not end on a separator: emit final token
      auto token = list[start .. $];
      return dg(token);
    }
    return 0;
  }
}
// Checks FastSplitConv on a whitespace-separated line. The assertion runs
// once by default so that `-unittest` builds stay fast; the original
// 4_000_000-iteration timing loop is kept but only compiled in when the
// build passes -version=FastSplitConvBenchmark.
unittest {
  auto s = cast(ubyte[])"hello 1 2 \t3 4 \n";
  assert(array(FastSplitConv!(ubyte[])(s)) == ["hello","1","2","3","4"]);
  // Further cases, not enabled yet:
  // assert(array(FastSplitConv!(ubyte[])(cast(ubyte[])" hello, 1 2 \t3 4 \n")) == ["","hello","1","2","3","4"]);
  // assert(array(FastSplitConv!(ubyte[])(cast(ubyte[])"hello, 1 2 \n\t3 4 \n")) == ["hello","1","2","3","4"]);
  // assert(array(FastSplitConv!(ubyte[])(cast(ubyte[])"chr1:55365,55365,1")) == ["chr1:55365","55365","1"]);

  version (FastSplitConvBenchmark) {
    for (int x = 0; x < 4_000_000; x++) {
      assert(array(FastSplitConv!(ubyte[])(s)) == ["hello","1","2","3","4"]);
    }
    /*
      Timing noted next to the 4_000_000-iteration loop:
      real 0m3.733s
      user 0m3.736s
      sys 0m0.000s
    */
  }
}

Loading…
Cancel
Save