Browse Source

new awesome serialization to sam/json

remotes/georgeg/no_streams
lomereiter 9 years ago
parent
commit
73da46c93e
  1. 191
      bio/bam/read.d
  2. 300
      bio/bam/serialization/json.d
  3. 235
      bio/bam/serialization/sam.d
  4. 88
      bio/bam/tagvalue.d
  5. 2
      bio/bam/writer.d
  6. 448
      bio/core/utils/format.d
  7. 117
      bio/sam/header.d
  8. 1
      bio/sam/utils/fastrecordparser.d
  9. 1
      bio/sam/utils/recordparser.d
  10. 1
      src_ragel/sam_alignment.rl
  11. 42
      test/unittests.d

191
bio/bam/read.d

@ -57,11 +57,6 @@ import bio.core.utils.switchendianness;
import bio.bam.thirdparty.msgpack : Packer, unpack;
version(unittest) {
import bio.bam.utils.tagstoragebuilder;
import std.stdio;
}
import std.algorithm;
import std.range;
import std.conv;
@ -70,6 +65,7 @@ import std.exception;
import std.system;
import std.traits;
import std.array;
import std.c.stdlib;
/**
Represents single CIGAR operation
@ -145,10 +141,15 @@ struct CigarOperation {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
}
///
/// Writes this operation to $(D sink): its length immediately followed
/// by its type character.
///
/// $(D Sink) must satisfy $(D isSomeSink).
private void toSam(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
sink.write(length);
sink.write(type);
}
void toString(scope void delegate(const(char)[]) sink) const {
sink.putInteger(length);
sink.putChar(type);
toSam(sink);
}
}
@ -466,8 +467,6 @@ struct BamRead {
/// Sets query sequence. Sets all base qualities to 255 (i.e. unknown).
@property void sequence(string seq)
{
enforce(seq.length >= 1 && seq.length <= 255, "Sequence length must be in range 1-255");
_dup();
auto raw_length = (seq.length + 1) / 2;
@ -709,72 +708,174 @@ struct BamRead {
}
}
///
void toString(scope void delegate(const(char)[]) sink) const {
sink(name);
sink.putChar('\t');
sink.putInteger(flag);
sink.putChar('\t');
/// String representation.
/// $(BR)
/// Possible formats are SAM ("%s") and JSON ("%j").
///
/// Small records are rendered into a stack buffer first and handed to
/// $(D sink) in one call; everything else is streamed to $(D sink) piecewise.
///
/// Throws: $(D FormatException) if the format specifier is neither 's' nor 'j'.
void toString(scope void delegate(const(char)[]) sink, FormatSpec!char fmt) const {
    if (fmt.spec != 's' && fmt.spec != 'j')
        throw new FormatException("unknown format specifier");

    // Reference names come from the associated reader, not from the raw
    // record, so size_in_bytes does not account for them. They must be
    // budgeted separately, otherwise the unchecked char* writers could
    // run past the end of the stack buffer.
    size_t names_length = 0;
    if (_reader !is null) {
        if (ref_id != -1)
            names_length += _reader.reference_sequences[ref_id].name.length;
        // when both ids are equal, "=" is printed instead of the mate name
        if (mate_ref_id != -1 && mate_ref_id != ref_id)
            names_length += _reader.reference_sequences[mate_ref_id].name.length;
    }

    // Keep the total stack allocation bounded even for very long names.
    enum max_names_on_stack = 4096;
    immutable names_fit = names_length < max_names_on_stack;

    if (fmt.spec == 's') {
        if (size_in_bytes < 10000 && names_fit) {
            auto p = cast(char*)alloca(size_in_bytes * 5 + names_length);
            if (p !is null) {
                char* end = p; // advanced by toSam as it writes
                toSam(end);
                sink(p[0 .. end - p]);
                return;
            }
        }
        toSam(sink);
    } else {
        if (size_in_bytes < 5000 && names_fit) {
            // JSON escaping can turn one character of a name into two
            auto p = cast(char*)alloca(size_in_bytes * 10 + 1000 + names_length * 2);
            if (p !is null) {
                char* end = p; // advanced by toJson as it writes
                toJson(end);
                sink(p[0 .. end - p]);
                return;
            }
        }
        toJson(sink);
    }
}
/// ditto
void toSam(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
sink.write(name);
sink.write('\t');
sink.write(flag);
sink.write('\t');
if (ref_id == -1 || _reader is null)
sink.putChar('*');
sink.write('*');
else
sink(_reader.reference_sequences[ref_id].name);
sink.write(_reader.reference_sequences[ref_id].name);
sink.putChar('\t');
sink.putInteger(position + 1);
sink.putChar('\t');
sink.putInteger(mapping_quality);
sink.putChar('\t');
sink.write('\t');
sink.write(position + 1);
sink.write('\t');
sink.write(mapping_quality);
sink.write('\t');
if (cigar.length == 0)
sink.putChar('*');
sink.write('*');
else
foreach (op; cigar)
op.toString(sink);
op.toSam(sink);
sink.putChar('\t');
sink.write('\t');
if (mate_ref_id == ref_id) {
if (mate_ref_id == -1)
sink("*\t");
sink.write("*\t");
else
sink("=\t");
sink.write("=\t");
} else {
if (mate_ref_id == -1 || _reader is null) {
sink("*\t");
sink.write("*\t");
} else {
auto mate_name = _reader.reference_sequences[mate_ref_id].name;
sink(mate_name);
sink("\t");
sink.write(mate_name);
sink.write("\t");
}
}
sink.putInteger(mate_position + 1);
sink.putChar('\t');
sink.putInteger(template_length);
sink.putChar('\t');
sink.write(mate_position + 1);
sink.write('\t');
sink.write(template_length);
sink.write('\t');
if (sequence_length == 0)
sink.putChar('*');
sink.write('*');
else
foreach (char c; sequence)
sink.putChar(c);
sink.putChar('\t');
sink.write(c);
sink.write('\t');
if (base_qualities.length == 0 || base_qualities[0] == 0xFF)
sink.putChar('*');
sink.write('*');
else
foreach (qual; base_qualities)
sink.putChar(cast(char)(qual + 33));
sink.write(cast(char)(qual + 33));
foreach (k, v; this) {
sink.putChar('\t');
sink(k);
sink.putChar(':');
v.formatSam(sink);
sink.write('\t');
sink.write(k);
sink.write(':');
v.toSam(sink);
}
}
/// Returns the string obtained by converting this read with $(D to!string).
string toSam()() const {
return to!string(this);
}
/// JSON representation
///
/// Writes a single JSON object with the keys "qname", "flag", "rname",
/// "pos", "mapq", "cigar", "rnext", "pnext", "tlen", "seq", "qual" and
/// "tags" (in this order) to $(D sink).
void toJson(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
sink.write(`{"qname":`); sink.writeJson(name);
sink.write(`,"flag":`); sink.write(flag);
sink.write(`,"rname":`);
// without a reference id or an associated reader the name is unavailable
if (ref_id == -1 || _reader is null)
sink.write(`"*"`);
else
sink.writeJson(_reader.reference_sequences[ref_id].name);
// positions are written incremented by one
sink.write(`,"pos":`); sink.write(position + 1);
sink.write(`,"mapq":`); sink.write(mapping_quality);
// CIGAR is written as one quoted string, "*" when there are no operations
sink.write(`,"cigar":"`);
if (cigar.empty)
sink.write('*');
else
foreach (op; cigar)
op.toSam(sink);
sink.write('"');
sink.write(`,"rnext":`);
// "=" when the mate has the same reference id, "*" when it is unknown
if (mate_ref_id == ref_id) {
if (mate_ref_id == -1)
sink.write(`"*"`);
else
sink.write(`"="`);
} else if (mate_ref_id == -1 || _reader is null) {
sink.write(`"*"`);
} else {
sink.writeJson(_reader.reference_sequences[mate_ref_id].name);
}
sink.write(`,"pnext":`); sink.write(mate_position + 1);
sink.write(`,"tlen":`); sink.write(template_length);
// sequence is written as one quoted string, "*" when it is empty
sink.write(`,"seq":"`);
if (sequence_length == 0)
sink.write('*');
else
foreach (char c; sequence)
sink.write(c);
sink.write('"');
sink.write(`,"qual":`);
sink.writeJson(base_qualities);
// tags form a nested object; not_first controls the separating commas
sink.write(`,"tags":{`);
bool not_first = false;
foreach (k, v; this) {
if (not_first)
sink.write(',');
sink.writeJson(k);
sink.write(':');
v.toJson(sink);
not_first = true;
}
sink.write("}}");
}
/// Returns the JSON representation as a newly built string.
string toJson()() const {
    auto output = appender!(char[])();
    // every chunk produced by the sink-based overload is collected in output
    toJson((const(char)[] chunk) { output.put(chunk); });
    return cast(string)output.data;
}
/// Associates read with BAM reader. This is done automatically
/// if this read is obtained through BamReader/Reference methods.
void associateWithReader(bio.bam.abstractreader.IBamSamReader reader) {
@ -1227,6 +1328,8 @@ mixin template TagStorage() {
}
unittest {
import bio.bam.utils.tagstoragebuilder;
import std.algorithm;
import std.stdio;
import std.math;

300
bio/bam/serialization/json.d

@ -1,300 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.serialization.json;
import bio.bam.read;
import bio.bam.reference;
import bio.bam.tagvalue;
import bio.core.utils.format;
import std.conv;
import std.algorithm;
import std.typecons;
import std.stdio;
import std.traits;
import std.range;
import std.c.stdlib;
import std.math;
import std.array;
/** Representation of tag value in JSON format
Example:
----------
Value v = 2.7;
assert(toJson(v) == "2.7");
v = [1, 2, 3];
assert(toJson(v) == "[1,2,3]");
----------
*/
string toJson(Value v) {
// the text is accumulated in a growable char[] via jsonSerialize
char[] buf;
buf.reserve(16);
jsonSerialize(v, buf);
return cast(string)buf;
}
/// Writes $(D f) to $(D stream) as a JSON value.
///
/// JSON has no literals for NaN and +/- infinity, so infinity is
/// written as 1.0e+1024 (with the matching sign) and NaN as null.
/// Finite values are formatted with "%g".
void jsonSerializeFloat(S)(ref S stream, float f) {
    if (isFinite(f)) {
        append(stream, "%g", f);
        return;
    }

    // non-finite values only from here on
    if (f == float.infinity)
        putstring(stream, "1.0e+1024");
    else if (f == -float.infinity)
        putstring(stream, "-1.0e+1024");
    else if (isNaN(f))
        putstring(stream, "null");
    else
        assert(0);
}
// Escape lookup table indexed by character code: a non-zero entry is the
// character that must follow a backslash when the indexed character is
// written inside a JSON string; zero means "no short escape".
//
// Entries: 8 -> \b, 9 -> \t, 10 -> \n, 12 -> \f, 13 -> \r,
//          34 ('"') -> \", 47 ('/') -> \/, 92 ('\') -> \\.
// The '/' entry must sit at index 47; placing it at index 63 would make
// '?' come out as "\/", which decodes to '/'.
private static char[256] specialCharacterTable = [
/*   0-15  */ 0,0,  0,0,0,0,0,0, 'b','t','n',0, 'f','r',0,  0,
/*  16-31  */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/*  32-47  */ 0,0,'"',0,0,0,0,0,   0,  0,  0,0,   0,  0,0,'/',
/*  48-63  */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/*  64-79  */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/*  80-95  */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,'\\',  0,0,  0,
/*  96-111 */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/* 112-127 */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0
];
/// Prints $(D chars) to $(D stream) surrounded by double quotes.
/// Characters with a non-zero entry in $(D specialCharacterTable) are
/// written as a backslash followed by that entry.
void jsonSerializeCharacterRange(S, R)(ref S stream, R chars)
    if (is(ElementType!R == char) || is(R == string))
{
    putcharacter(stream, '"');
    foreach (char ch; chars) {
        char escape = specialCharacterTable[cast(ubyte)ch];
        if (escape != 0) {
            putcharacter(stream, '\\');
            putcharacter(stream, escape);
        } else {
            putcharacter(stream, ch);
        }
    }
    putcharacter(stream, '"');
}
/// Print JSON representation to FILE* or append it to char[]/char*
/// (in char* case it's your responsibility to allocate enough memory)
///
/// Numeric arrays are written as [a,b,...], integers and floats as bare
/// numbers, and 'Z', 'H' and 'A' values as quoted, escaped strings.
void jsonSerialize(S)(const(Value) v, ref S stream) {
if (v.is_numeric_array) {
// Builds, at compile time, the source text of a switch over
// v.bam_typeid with one case per array element type; each case
// prints the bracketed, comma-separated elements and returns.
string toSamNumericArrayHelper() {
char[] cases;
foreach (t; ArrayElementTagValueTypes) {
char[] printexpr = "putinteger(stream, elem);".dup;
// float elements go through jsonSerializeFloat instead of putinteger
if (t.ch == 'f') {
printexpr = "jsonSerializeFloat(stream, elem);".dup;
}
cases ~= `case '`~t.ch~`':` ~
` putcharacter(stream, '[');`~
` auto arr = cast(`~t.ValueType.stringof~`[])v;`~
` if (arr.length != 0) { { auto elem = arr[0];`~printexpr~`}`~
` foreach (elem; arr[1..$]) { putcharacter(stream, ',');`~printexpr~`}`~
` }putcharacter(stream, ']');`~
` return;`.dup;
}
return "switch (v.bam_typeid) { " ~ cases.idup ~ "default: assert(0); }";
}
mixin(toSamNumericArrayHelper());
}
if (v.is_integer) {
// convert to the exact stored integer type before printing
switch (v.bam_typeid) {
case 'c': putinteger(stream, to!byte(v)); return;
case 'C': putinteger(stream, to!ubyte(v)); return;
case 's': putinteger(stream, to!short(v)); return;
case 'S': putinteger(stream, to!ushort(v)); return;
case 'i': putinteger(stream, to!int(v)); return;
case 'I': putinteger(stream, to!uint(v)); return;
default: assert(0);
}
}
if (v.is_float) {
jsonSerializeFloat(stream, to!float(v));
return;
}
switch (v.bam_typeid) {
case 'Z', 'H':
jsonSerializeCharacterRange(stream, cast(string)v);
return;
case 'A':
// a single character is written as a one-character string
auto c = to!char(v);
jsonSerializeCharacterRange(stream, to!string(c));
return;
default: assert(0);
}
}
/// Builds the JSON representation of an alignment as a string.
///
/// Reference sequence information has to be supplied by the caller,
/// since the alignment struct itself holds only integer ids, not names.
///
/// Example:
/// -------------
/// toJson(alignment, bam.reference_sequences);
/// -------------
string toJson(BamRead alignment, const(ReferenceSequenceInfo)[] info) {
    char[] json;
    json.reserve(512);
    jsonSerialize(alignment, info, json);
    return cast(string)json;
}
/// Serialize $(D alignment) to FILE* or append it to char[]/char*
/// (in char* case it's your responsibility to allocate enough memory)
///
/// The output is a single JSON object with the keys "qname", "flag",
/// "rname", "pos", "mapq", "cigar", "rnext", "pnext", "tlen", "seq",
/// "qual" and "tags", in this order. Reference names are looked up
/// in $(D info) by the ids stored in the alignment.
void jsonSerialize(S)(BamRead alignment, const(ReferenceSequenceInfo)[] info, ref S stream)
if (is(Unqual!S == FILE*) || is(Unqual!S == char*) || is(Unqual!S == char[]))
{
// Notice: it is extremely important to exclude pointers,
// otherwise you'll get recursion and stack overflow.
static if (__traits(compiles, alloca(0)) && !is(Unqual!S == char*)) {
// Fast path: small records are serialized into a stack buffer through
// the char* instantiation and then written to the stream in one call.
immutable ALLOCA_THRESHOLD = 5000;
if (alignment.size_in_bytes < ALLOCA_THRESHOLD) {
char* buffer = cast(char*)alloca(alignment.size_in_bytes * 10 + 1000);
if (buffer != null) {
char* p = buffer; // this pointer will be modified
jsonSerialize(alignment, info, p);
putstring(stream, buffer[0 .. p - buffer]);
return;
} else {
debug {
import std.stdio;
writeln("WARNING: pointer allocated with alloca was null");
}
}
}
}
// Generic path: write the fields one by one directly to the stream.
putstring(stream, `{"qname":`);
jsonSerializeCharacterRange(stream, alignment.name);
putstring(stream, `,"flag":`);
putinteger(stream, alignment.flag);
putstring(stream, `,"rname":`);
if (alignment.ref_id == -1) {
putstring(stream, `"*","pos":`);
} else {
jsonSerializeCharacterRange(stream, info[alignment.ref_id].name);
putstring(stream, `,"pos":`);
}
// positions are written incremented by one
putinteger(stream, alignment.position + 1);
putstring(stream, `,"mapq":`);
putinteger(stream, alignment.mapping_quality);
putstring(stream, `,"cigar":"`);
if (alignment.cigar.length == 0) {
putstring(stream, `*","rnext":`);
} else {
foreach (cigar_op; alignment.cigar) {
putinteger(stream, cigar_op.length);
putcharacter(stream, cigar_op.type);
}
putstring(stream, `","rnext":`);
}
// "=" when the mate has the same reference id, "*" when it is unknown
if (alignment.mate_ref_id == alignment.ref_id) {
if (alignment.mate_ref_id == -1) {
putstring(stream, `"*","pnext":`);
} else {
putstring(stream, `"=","pnext":`);
}
} else {
if (alignment.mate_ref_id == -1 ||
info[alignment.mate_ref_id].name.length == 0)
{
putstring(stream, `"*","pnext":`);
} else {
jsonSerializeCharacterRange(stream, info[alignment.mate_ref_id].name);
putstring(stream, `,"pnext":`);
}
}
putinteger(stream, alignment.mate_position + 1);
putstring(stream, `,"tlen":`);
putinteger(stream, alignment.template_length);
putstring(stream, `,"seq":"`);
if (alignment.sequence_length == 0) {
putstring(stream, `*","qual":`);
} else {
foreach(char c; alignment.sequence) {
putcharacter(stream, c);
}
putstring(stream, `","qual":`);
}
// base qualities are written as an array of integers
putcharacter(stream, '[');
bool first = true;
foreach(ubyte c; alignment.base_qualities) {
if (!first) {
putcharacter(stream, ',');
} else {
first = false;
}
putinteger(stream, c);
}
// tags form a nested object; not_first controls the separating commas
putstring(stream, `],"tags":{`);
bool not_first = false;
foreach (k, v; alignment) {
assert(k.length == 2);
if (not_first) {
putcharacter(stream, ',');
}
jsonSerializeCharacterRange(stream, k);
putcharacter(stream, ':');
not_first = true;
jsonSerialize(v, stream);
}
putstring(stream, `}}`);
return;
}

235
bio/bam/serialization/sam.d

@ -1,235 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.bam.serialization.sam;
import bio.bam.read;
import bio.bam.referenceinfo;
import bio.bam.tagvalue;
import bio.core.utils.format;
import std.conv;
import std.algorithm;
import std.typecons;
import std.stdio;
import std.traits;
import std.c.stdlib;
import std.array;
/** Builds the SAM representation of a tag value as a string.
Example:
----------
Value v = 2.7;
assert(toSam(v) == "f:2.7");
v = [1, 2, 3];
assert(toSam(v) == "B:i,1,2,3");
----------
*/
string toSam(V)(auto ref V v)
    if (is(V == Value))
{
    char[] text;
    text.reserve(16);
    serialize(v, text);
    return cast(string)text;
}
/// Print SAM representation to FILE* or append it to char[]/char*
/// (in char* case it's your responsibility to allocate enough memory)
///
/// The value is written with its type prefix: "B:<type>" followed by
/// ",elem" for every element of a numeric array, "i:" for integers,
/// "f:" for floats, "Z:"/"H:" for strings and "A:" for a character.
void serialize(S)(const ref Value v, ref S stream) {
if (v.is_numeric_array) {
// Builds, at compile time, the source text of a switch over
// v.bam_typeid with one case per array element type; each case
// prints the prefix and the comma-preceded elements, then returns.
string toSamNumericArrayHelper() {
char[] cases;
foreach (t; ArrayElementTagValueTypes) {
char[] loopbody = "putcharacter(stream, ',');" ~
"putinteger(stream, elem);".dup;
// float elements are formatted with "%g" instead of putinteger
if (t.ch == 'f') {
loopbody = "append(stream, \",%g\", elem);".dup;
}
cases ~= `case '`~t.ch~`':` ~
` putstring(stream, "B:`~t.ch~`");`~
` auto arr = cast(`~t.ValueType.stringof~`[])v;`~
` foreach (elem; arr) {`~loopbody~`}`~
` return;`.dup;
}
return "switch (v.bam_typeid) { " ~ cases.idup ~ "default: assert(0); }";
}
mixin(toSamNumericArrayHelper());
}
if (v.is_integer) {
// every integer width is printed with the same "i:" prefix
putstring(stream, "i:");
switch (v.bam_typeid) {
case 'c': putinteger(stream, to!byte(v)); return;
case 'C': putinteger(stream, to!ubyte(v)); return;
case 's': putinteger(stream, to!short(v)); return;
case 'S': putinteger(stream, to!ushort(v)); return;
case 'i': putinteger(stream, to!int(v)); return;
case 'I': putinteger(stream, to!uint(v)); return;
default: assert(0);
}
}
if (v.is_float) {
append(stream, "f:%g", to!float(v));
return;
}
switch (v.bam_typeid) {
case 'Z', 'H':
// the type id itself serves as the prefix character
putcharacter(stream, v.bam_typeid);
putcharacter(stream, ':');
putstring(stream, cast(string)v);
return;
case 'A':
putstring(stream, "A:");
putcharacter(stream, to!char(v));
return;
default: assert(0);
}
}
/// Builds the SAM representation of an alignment as a string.
///
/// Reference sequence information has to be supplied by the caller,
/// since the alignment struct itself holds only integer ids, not names.
///
/// Example:
/// -------------
/// toSam(alignment, bam.reference_sequences);
/// -------------
string toSam(R)(auto ref R alignment, const(ReferenceSequenceInfo)[] info) {
    char[] line;
    line.reserve(512);
    serialize(alignment, info, line);
    return cast(string)line;
}
/// Serialize $(D alignment) to FILE* or append it to char[]/char*
/// (in char* case it's your responsibility to allocate enough memory)
///
/// Fields are written tab-separated in this order: name, flag,
/// reference name, position, mapping quality, CIGAR, mate reference
/// name, mate position, template length, sequence, base qualities,
/// followed by one tab-preceded "key:value" entry per tag.
/// Reference names are looked up in $(D info) by the stored ids.
void serialize(S, R)(auto ref R alignment, const(ReferenceSequenceInfo)[] info, auto ref S stream)
if (is(Unqual!S == FILE*) || is(Unqual!S == char*) || is(Unqual!S == char[]))
{
// Notice: it is extremely important to exclude pointers,
// otherwise you'll get recursion and stack overflow.
static if (__traits(compiles, alloca(0)) && !is(Unqual!S == char*)) {
// Fast path: small records are serialized into a stack buffer through
// the char* instantiation and then written to the stream in one call.
immutable ALLOCA_THRESHOLD = 10000;
if (alignment.size_in_bytes < ALLOCA_THRESHOLD) {
// surely we can allocate 50 kilobytes on the stack,
// we're not targeting embedded systems :)
char* buffer = cast(char*)alloca(alignment.size_in_bytes * 5);
if (buffer != null) {
char* p = buffer; // this pointer will be modified
serialize(alignment, info, p);
putstring(stream, buffer[0 .. p - buffer]);
return;
} else {
debug {
import std.stdio;
writeln("WARNING: pointer allocated with alloca was null");
}
}
}
}
// Generic path: write the fields one by one directly to the stream.
putstring(stream, alignment.name);
putcharacter(stream, '\t');
putinteger(stream, alignment.flag);
putcharacter(stream, '\t');
if (alignment.ref_id == -1) {
putstring(stream, "*\t");
} else {
putstring(stream, info[alignment.ref_id].name);
putcharacter(stream, '\t');
}
// positions are written incremented by one
putinteger(stream, alignment.position + 1);
putcharacter(stream, '\t');
putinteger(stream, alignment.mapping_quality);
putcharacter(stream, '\t');
if (alignment.cigar.length == 0) {
putstring(stream, "*\t");
} else {
foreach (cigar_op; alignment.cigar) {
putinteger(stream, cigar_op.length);
putcharacter(stream, cigar_op.type);
}
putcharacter(stream, '\t');
}
// "=" when the mate has the same reference id, "*" when it is unknown
if (alignment.mate_ref_id == alignment.ref_id) {
if (alignment.mate_ref_id == -1) {
putstring(stream, "*\t");
} else {
putstring(stream, "=\t");
}
} else {
if (alignment.mate_ref_id == -1 ||
info[alignment.mate_ref_id].name.length == 0)
{
putstring(stream, "*\t");
} else {
putstring(stream, info[alignment.mate_ref_id].name);
putcharacter(stream, '\t');
}
}
putinteger(stream, alignment.mate_position + 1);
putcharacter(stream, '\t');
putinteger(stream, alignment.template_length);
putcharacter(stream, '\t');
if (alignment.sequence_length == 0) {
putstring(stream, "*\t");
} else {
foreach(char c; alignment.sequence) {
putcharacter(stream, c);
}
putcharacter(stream, '\t');
}
// "*" when there are no qualities or the first one is 0xFF;
// otherwise each quality is written as the character (quality + 33)
if (alignment.base_qualities.length == 0 ||
alignment.base_qualities[0] == '\xFF')
{
putcharacter(stream, '*');
} else {
foreach(char c; alignment.base_qualities) {
putcharacter(stream, cast(char)(c + 33));
}
}
foreach (k, v; alignment) {
assert(k.length == 2);
putcharacter(stream, '\t');
putstring(stream, k);
putcharacter(stream, ':');
serialize(v, stream);
}
return;
}

88
bio/bam/tagvalue.d

@ -48,6 +48,7 @@ public import std.conv;
import std.typetuple;
import std.exception;
import std.format;
import std.array;
import bio.core.utils.format;
import bio.bam.thirdparty.msgpack;
@ -452,40 +453,83 @@ struct Value {
}
}
void formatSam(scope void delegate(const(char)[]) sink) const {
/// Returns the SAM representation as a newly built string.
string toSam()() const {
    auto output = appender!(char[])();
    // every chunk produced by the sink-based overload is collected in output
    toSam((const(char)[] chunk) { output.put(chunk); });
    return cast(string)output.data;
}
/// ditto
void toSam(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
if (is_integer) {
sink("i:");
sink.write("i:");
switch (_tag) {
case GetTypeId!byte: sink.putInteger(*cast(byte*)(&u)); break;
case GetTypeId!ubyte: sink.putInteger(*cast(ubyte*)(&u)); break;
case GetTypeId!short: sink.putInteger(*cast(short*)(&u)); break;
case GetTypeId!ushort: sink.putInteger(*cast(ushort*)(&u)); break;
case GetTypeId!int: sink.putInteger(*cast(int*)(&u)); break;
case GetTypeId!uint: sink.putInteger(*cast(uint*)(&u)); break;
case GetTypeId!byte: sink.write(*cast(byte*)(&u)); break;
case GetTypeId!ubyte: sink.write(*cast(ubyte*)(&u)); break;
case GetTypeId!short: sink.write(*cast(short*)(&u)); break;
case GetTypeId!ushort: sink.write(*cast(ushort*)(&u)); break;
case GetTypeId!int: sink.write(*cast(int*)(&u)); break;
case GetTypeId!uint: sink.write(*cast(uint*)(&u)); break;
default: break;
}
} else if (is_numeric_array) {
sink("B:");
sink.putChar(bam_typeid);
sink.putChar(',');
sink.write("B:");
sink.write(bam_typeid);
sink.write(',');
switch (_tag) {
case GetTypeId!(byte[]): sink.putArray(*cast(byte[]*)(&u), ','); break;
case GetTypeId!(ubyte[]): sink.putArray(*cast(ubyte[]*)(&u), ','); break;
case GetTypeId!(short[]): sink.putArray(*cast(short[]*)(&u), ','); break;
case GetTypeId!(ushort[]): sink.putArray(*cast(ushort[]*)(&u), ','); break;
case GetTypeId!(int[]): sink.putArray(*cast(int[]*)(&u), ','); break;
case GetTypeId!(uint[]): sink.putArray(*cast(uint[]*)(&u), ','); break;
case GetTypeId!(float[]): sink.putArray(*cast(float[]*)(&u), ','); break;
case GetTypeId!(byte[]): sink.writeArray(*cast(byte[]*)(&u), ','); break;
case GetTypeId!(ubyte[]): sink.writeArray(*cast(ubyte[]*)(&u), ','); break;
case GetTypeId!(short[]): sink.writeArray(*cast(short[]*)(&u), ','); break;
case GetTypeId!(ushort[]): sink.writeArray(*cast(ushort[]*)(&u), ','); break;
case GetTypeId!(int[]): sink.writeArray(*cast(int[]*)(&u), ','); break;
case GetTypeId!(uint[]): sink.writeArray(*cast(uint[]*)(&u), ','); break;
case GetTypeId!(float[]): sink.writeArray(*cast(float[]*)(&u), ','); break;
default: break;
}
} else {
switch (_tag) {
case GetTypeId!float: sink("f:"); sink.putFloat(*cast(float*)(&u)); break;
case GetTypeId!string: sink("Z:"); sink(*cast(const(char)[]*)(&u)); break;
case hexStringTag: sink("H:"); sink(*cast(const(char)[]*)(&u)); break;
case GetTypeId!char: sink("A:"); sink.putChar(*cast(char*)(&u)); break;
case GetTypeId!float: sink.write("f:"); sink.write(*cast(float*)(&u)); break;
case GetTypeId!string: sink.write("Z:"); sink.write(*cast(const(char)[]*)(&u)); break;
case hexStringTag: sink.write("H:"); sink.write(*cast(const(char)[]*)(&u)); break;
case GetTypeId!char: sink.write("A:"); sink.write(*cast(char*)(&u)); break;
default: break;
}
}
}
/// Returns the JSON representation as a newly built string.
string toJson()() const {
    auto output = appender!(char[])();
    // every chunk produced by the sink-based overload is collected in output
    toJson((const(char)[] chunk) { output.put(chunk); });
    return cast(string)output.data;
}
/// Writes the JSON representation of the stored value to $(D sink).
///
/// Dispatches on $(D _tag): the storage $(D u) is reinterpreted as the
/// type matching the tag and passed to $(D sink.writeJson).
/// Nothing is written for a tag that matches none of the cases.
void toJson(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
switch (_tag) {
// integers
case GetTypeId!byte: sink.writeJson(*cast(byte*)(&u)); break;
case GetTypeId!ubyte: sink.writeJson(*cast(ubyte*)(&u)); break;
case GetTypeId!short: sink.writeJson(*cast(short*)(&u)); break;
case GetTypeId!ushort: sink.writeJson(*cast(ushort*)(&u)); break;
case GetTypeId!int: sink.writeJson(*cast(int*)(&u)); break;
case GetTypeId!uint: sink.writeJson(*cast(uint*)(&u)); break;
// numeric arrays
case GetTypeId!(byte[]): sink.writeJson(*cast(byte[]*)(&u)); break;
case GetTypeId!(ubyte[]): sink.writeJson(*cast(ubyte[]*)(&u)); break;
case GetTypeId!(short[]): sink.writeJson(*cast(short[]*)(&u)); break;
case GetTypeId!(ushort[]): sink.writeJson(*cast(ushort[]*)(&u)); break;
case GetTypeId!(int[]): sink.writeJson(*cast(int[]*)(&u)); break;
case GetTypeId!(uint[]): sink.writeJson(*cast(uint[]*)(&u)); break;
case GetTypeId!(float[]): sink.writeJson(*cast(float[]*)(&u)); break;
// float, strings (plain and hex strings are both read as string), char
case GetTypeId!float: sink.writeJson(*cast(float*)(&u)); break;
case GetTypeId!string: sink.writeJson(*cast(string*)(&u)); break;
case hexStringTag: sink.writeJson(*cast(string*)(&u)); break;
case GetTypeId!char: sink.writeJson(*cast(char*)(&u)); break;
default: break;
}
}
}

2
bio/bam/writer.d

@ -100,7 +100,7 @@ final class BamWriter {
/// Writes SAM header. Should be called after construction.
void writeSamHeader(bio.sam.header.SamHeader header) {
auto text = toSam(header);
auto text = header.text;
writeInteger(cast(int)text.length);
writeString(text);
}

448
bio/core/utils/format.d

@ -18,21 +18,15 @@
*/
/**
D standard library formatting functions turned out to be
too slow for big data processing, while standard C functions
are very fast. This module contains set of functions for
building strings in memory and outputting them into a file.
For outputting to file, FILE* pointers are supported.
For building strings in memory, provide char[] array or char*
pointer when you're sure that amount of preallocated memory
is enough to store string representation.
In case char* is used, it is passed by reference, and the pointer
is updated during string building.
Use pointer version when it allows you to get better performance,
but remember that it's quite dangerous.
$(P This module provides fast formatting functions.)
$(P Each function has two overloads:
$(UL
$(LI $(D ref char*) - in this case, function starts
writing at the location, and updates the pointer.
No checks are done, it's user's responsibility that this is safe.)
$(LI $(D scope void delegate(const(char)[])) - formatted data
is passed to the delegate for further processing.)))
*/
module bio.core.utils.format;
@ -41,105 +35,10 @@ import std.c.stdlib;
import std.string;
import std.traits;
import std.array;
/// Used for building a string in a buffer or for outputting to stream.
size_t append(Args...)(FILE* stream, string format, Args args)
{
auto _format = toStringz(format);
return fprintf(stream, _format, args);
}
/// ditto
size_t append(Args...)(ref char* stream, string format, Args args) {
auto _format = toStringz(format);
auto sz = sprintf(stream, _format, args);
stream += sz;
return sz;
}
/// ditto
size_t append(Args...)(ref char[] stream, string format, Args args)
{
char[1024] buffer;
int count;
auto f = toStringz(format);
auto p = buffer.ptr;
auto psize = buffer.length;
for (;;)
{
version(Win32)
{
count = _snprintf(p,psize,f,args);
if (count != -1)
break;
psize *= 2;
p = cast(char *) alloca(psize);
}
version(Posix)
{
count = snprintf(p,psize,f,args);
if (count == -1)
psize *= 2;
else if (count >= psize)
psize = count + 1;
else
break;
p = cast(char *) alloca(psize);
}
}
stream ~= p[0 .. count];
return count;
}
/// Put char into a stream
size_t putcharacter(FILE* stream, char c)
{
fputc(c, stream);
return 1;
}
/// Append char to a buffer
size_t putcharacter(ref char[] stream, char c)
{
stream ~= c;
return 1;
}
/// ditto
size_t putcharacter(ref char* stream, char c) {
*stream++ = c;
return 1;
}
/// Append string to output stream.
size_t putstring(T)(FILE* stream, T[] s)
if (is(Unqual!T == char))
{
fwrite(s.ptr, s.length, char.sizeof, stream);
return s.length;
}
/// Append string to a buffer
size_t putstring(T)(ref char[] stream, T[] s)
if (is(Unqual!T == char))
{
stream ~= s;
return s.length;
}
/// ditto
size_t putstring(T)(ref char* stream, T[] s)
if (is(Unqual!T == char))
{
stream[0 .. s.length] = s;
stream += s.length;
return s.length;
}
import std.math;
private {
/// Reverses closed interval [begin .. end]
// Reverses closed interval [begin .. end]
void strreverse(char* begin, char* end)
{
char aux;
@ -147,9 +46,9 @@ private {
aux = *end, *end-- = *begin, *begin++ = aux;
}
/// Prints $(D value) at the address where $(D str) points to.
/// Returns number of characters written.
auto itoa(T)(T value, char* str)
// Prints $(D value) at the address where $(D str) points to.
// Returns number of characters written.
size_t itoa(T)(T value, char* str)
{
char* wstr=str;
@ -173,150 +72,237 @@ private {
}
}
/// Put integer number into a stream
size_t putinteger(T)(FILE* stream, T number)
if (isIntegral!T)
{
char[64] buf;
auto len = itoa(number, buf.ptr);
fwrite(buf.ptr, len, char.sizeof, stream);
return len;
}
/// Add string representation of an integer to a buffer
size_t putinteger(T)(ref char[] stream, T number)
if (isIntegral!T)
{
char[64] buf;
auto len = itoa(number, buf.ptr);
stream ~= buf[0 .. len];
return len;
}
/// ditto
size_t putinteger(T)(ref char* stream, T number) {
auto len = itoa(number, stream);
stream += len;
return len;
/// Evaluates to $(D true) if $(D T) is one of the two supported sink types:
/// $(D void delegate(const(char)[])) or $(D char*); $(D false) otherwise.
template isSomeSink(T) {
    enum isSomeSink = is(T == void delegate(const(char)[]))
                   || is(T == char*);
}
unittest {
char[] buf;
append(buf, "%d%g", 1, 2.4);
assert(cast(string)(buf) == "12.4");
private {
// Prints $(D number) in "%g" format at the location $(D sink) points to
// and advances $(D sink) past the characters written (the terminating
// zero written by sprintf is not counted).
// No bounds checks are done: the caller must provide enough room.
void writeFloat(T)(ref char* sink, T number)
    if (isFloatingPoint!T)
{
    // "%g" expects a double; the explicit cast keeps the call well-defined
    // for every floating point T (including real) and matches what the
    // delegate-based overload does.
    sink += sprintf(sink, "%g", cast(double)number);
}
append(buf, "%s", toStringz("k"));
assert(cast(string)(buf) == "12.4k");
void writeFloat(T)(scope void delegate(const(char)[]) sink, T number)
if (isFloatingPoint!T)
{
char[1024] buffer = void;
int count;
auto str = "m";
append(buf, "%.*s", str.length, str.ptr);
assert(cast(string)(buf) == "12.4km");
auto p = buffer.ptr;
auto psize = buffer.length;
for (;;)
{
version(Win32)
{
count = _snprintf(p,psize,"%g", cast(double)number);
if (count != -1)
break;
psize *= 2;
p = cast(char *) alloca(psize);
}
version(Posix)
{
count = snprintf(p,psize,"%g", cast(double)number);
if (count == -1)
psize *= 2;
else if (count >= psize)
psize = count + 1;
else
break;
p = cast(char *) alloca(psize);
}
}
append(buf, "%c%c", '/', 'h');
assert(cast(string)(buf) == "12.4km/h");
sink(p[0 .. count]);
}
ushort k = 5;
append(buf, "%d", k);
assert(cast(char)(buf[$-1]) == '5');
// Prints $(D integer) at the location $(D sink) points to and advances
// $(D sink) by the number of characters written.
// No bounds checks are done: the caller must provide enough room.
void writeInteger(T)(ref char* sink, T integer)
    if (isIntegral!T)
{
    auto written = itoa(integer, sink);
    sink += written;
}
buf.length = 0;
putstring(buf, "tes");
putcharacter(buf, 't');
// Formats $(D integer) into a local buffer and passes the resulting
// characters to $(D sink) in a single call.
void writeInteger(T)(scope void delegate(const(char)[]) sink, T integer)
    if (isIntegral!T)
{
    char[32] digits = void; // filled by itoa before it is read
    auto count = itoa(integer, digits.ptr);
    sink(digits[0 .. count]);
}
uint z = 345;
append(buf, "%d", z);
// Stores $(D c) at the location $(D sink) points to and advances
// $(D sink) by one. No bounds checks are done.
void writeChar(T)(ref char* sink, T c)
    if (isSomeChar!T)
{
    *sink = c;
    ++sink;
}
assert(cast(string)(buf) == "test345");
// Passes the single character $(D c) to $(D sink) as a one-element slice.
void writeChar(T)(scope void delegate(const(char)[]) sink, T c)
    if (isSomeChar!T)
{
    auto location = &c;
    sink(location[0 .. 1]);
}
buf.length = 0;
putinteger(buf, 25);
assert(cast(string)(buf) == "25");
putinteger(buf, -31);
assert(cast(string)(buf) == "25-31");
// Copies the bytes of $(D s) to the location $(D sink) points to and
// advances $(D sink) past them. No bounds checks are done.
void writeString(T)(ref char* sink, T s)
    if (isSomeString!T)
{
    auto bytes = cast(const(char)[])s;
    immutable count = bytes.length;
    memcpy(sink, bytes.ptr, count);
    sink += count;
}
char* s = cast(char*)malloc(100);
scope(exit) free(s);
// Passes $(D s), viewed as a slice of const chars, to $(D sink).
void writeString(T)(scope void delegate(const(char)[]) sink, T s)
    if (isSomeString!T)
{
    auto chars = cast(const(char)[])s;
    sink(chars);
}
char* p = s;
putstring(p, "123");
putinteger(p, 456);
putcharacter(p, '7');
append(p, "%g", 8.9);
assert(s[0 .. p - s] == "12345678.9");
}
// Forwards $(D value) to the writer matching its type category:
// integers, floating point numbers, characters or strings.
// Any other type is rejected at compile time.
void writeImpl(Sink, T)(auto ref Sink sink, T value)
    if (isSomeSink!Sink)
{
    static if (isIntegral!T) {
        writeInteger(sink, value);
    } else static if (isFloatingPoint!T) {
        writeFloat(sink, value);
    } else static if (isSomeChar!T) {
        writeChar(sink, value);
    } else static if (isSomeString!T) {
        writeString(sink, value);
    } else {
        static assert(false,
                      "only integers, floats, chars and strings are supported");
    }
}
void putFloat(T)(scope void delegate(const(char)[]) sink, T number)
if (isFloatingPoint!T)
{
char[1024] buffer = void;
int count;
// -------------------- JSON output utils ----------------------------------
auto p = buffer.ptr;
auto psize = buffer.length;
for (;;)
// JSON doesn't support NaN and +/- infinity.
// Therefore the approach taken here is to represent
// infinity as 1.0e+1024, and NaN as null.
void writeFloatJson(Sink, T)(auto ref Sink sink, T value)
if (isFloatingPoint!T)
{
version(Win32)
{
count = _snprintf(p,psize,"%g", cast(double)number);
if (count != -1)
break;
psize *= 2;
p = cast(char *) alloca(psize);
if (isFinite(value)) {
sink.write(value);
} else {
if (value == float.infinity) {
sink.write("1.0e+1024");
} else if (value == -float.infinity) {
sink.write("-1.0e+1024");
} else if (isNaN(value)) {
sink.write("null");
} else {
assert(0);
}
}
version(Posix)
{
count = snprintf(p,psize,"%g", cast(double)number);
if (count == -1)
psize *= 2;
else if (count >= psize)
psize = count + 1;
else
break;
p = cast(char *) alloca(psize);
}
// Lookup table used when escaping JSON strings. For a byte value b,
// specialCharacterTable[b] is the character written after the backslash
// in the two-character escape of b, or 0 when b has no such escape.
//
// Entries: 8 -> 'b', 9 -> 't', 10 -> 'n', 12 -> 'f', 13 -> 'r',
//          34 ('"') -> '"', 47 ('/') -> '/', 92 ('\') -> '\'.
//
// The '/' entry belongs at index 47. It used to sit at index 63, which is
// '?', so every question mark was emitted as "\/" and decoded as a slash.
immutable char[256] specialCharacterTable = [
/*   0-15  */ 0,0,  0,0,0,0,0,0, 'b','t','n',0, 'f','r',0,  0,
/*  16-31  */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/*  32-47  */ 0,0,'"',0,0,0,0,0,   0,  0,  0,0,   0,  0,0,'/',
/*  48-63  */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/*  64-79  */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/*  80-95  */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,'\\',  0,0,  0,
/*  96-111 */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/* 112-127 */ 0,0,  0,0,0,0,0,0,   0,  0,  0,0,   0,  0,0,  0,
/* 128-143 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
/* 144-159 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
/* 160-175 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
/* 176-191 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
/* 192-207 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
/* 208-223 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
/* 224-239 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
/* 240-255 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0
];
/// Writes `s` to `sink` as a double-quoted JSON string.
///
/// Each UTF-8 code unit of `s` is handled as follows:
/// $(UL
///   $(LI if specialCharacterTable has a non-zero entry for it, a backslash
///        followed by that entry is written;)
///   $(LI otherwise, if it is a control character below 0x20, it is written
///        as a `\u00XX` escape, because JSON forbids raw control characters
///        inside strings;)
///   $(LI otherwise it is written unchanged.)
/// )
void writeStringJson(Sink, T)(auto ref Sink sink, T s)
    if (isSomeString!T)
{
    static immutable hexDigits = "0123456789abcdef";

    sink.write('"');
    foreach (char c; s) {
        auto sc = specialCharacterTable[cast(ubyte)c];
        if (sc != 0) {
            // Short two-character escape such as \n or \".
            sink.write('\\');
            sink.write(sc);
        } else if (c < 0x20) {
            // Control character with no short escape: use the \u00XX form.
            sink.write("\\u00");
            sink.write(hexDigits[c >> 4]);
            sink.write(hexDigits[c & 0xF]);
        } else {
            sink.write(c);
        }
    }
    sink.write('"');
}
sink(p[0 .. count]);
}
/// Writes the character `c` to `sink` as a JSON string by handing a
/// one-element slice over `c` to writeStringJson.
void writeCharJson(Sink, T)(auto ref Sink sink, T c)
    if (isSomeChar!T)
{
    auto single = (&c)[0 .. 1];
    writeStringJson(sink, single);
}
/// Formats `integer` with itoa into a local 64-character scratch buffer
/// and passes the filled part of that buffer to the `sink` delegate.
void putInteger(T)(scope void delegate(const(char)[]) sink, T integer)
    if (isIntegral!T)
{
    // Left uninitialized on purpose: only the part itoa fills is read.
    char[64] digits = void;
    auto count = itoa(integer, digits.ptr);
    sink(digits[0 .. count]);
}
void writeArrayJson(Sink, T)(auto ref Sink sink, T array)
if (isArray!T && __traits(compiles, sink.writeJson(array[0])))
{
if (array.length == 0) {
sink.write("[]");
return;
}
/// Copies the character `c` into a one-element local buffer and passes
/// that buffer to the `sink` delegate.
void putChar(T)(scope void delegate(const(char)[]) sink, T c)
    if (isSomeChar!T)
{
    char[1] cell = c;
    sink(cell[]);
}
sink.write('[');
foreach (elem; array[0 .. $ - 1]) {
sink.writeJson(elem);
sink.write(',');
}
sink.writeJson(array[$ - 1]);
sink.write(']');
}
void put(T)(scope void delegate(const(char)[]) sink, T value) {
static if (isIntegral!T)
putInteger(sink, value);
else static if (isFloatingPoint!T)
putFloat(sink, value);
else static if (isSomeChar!T)
putChar(sink, value);
else static if (isSomeString!T)
sink(cast(const(char)[])value);
else static assert(false);
/// Selects, at compile time, the JSON writer matching the type of `value`:
/// writeInteger, writeFloatJson, writeCharJson, writeStringJson, or
/// writeArrayJson for arrays whose elements can themselves be written.
/// Strings are tested before arrays, so they take the string branch.
/// Any other type is rejected with a static assertion.
void writeJsonImpl(Sink, T)(auto ref Sink sink, T value)
    if (isSomeSink!Sink)
{
    static if (isIntegral!T) {
        writeInteger(sink, value);
    } else static if (isFloatingPoint!T) {
        writeFloatJson(sink, value);
    } else static if (isSomeChar!T) {
        writeCharJson(sink, value);
    } else static if (isSomeString!T) {
        writeStringJson(sink, value);
    } else static if (isArray!T && __traits(compiles, sink.writeJsonImpl(value[0]))) {
        writeArrayJson(sink, value);
    } else {
        static assert(false,
                      "only numbers, chars, strings and arrays are supported");
    }
}
}
void putArray(T, U)(scope void delegate(const(char)[]) sink, T array, U delimiter)
if (isArray!T && (isSomeChar!U || isSomeString!U) && __traits(compiles, put(sink, array[0])))
/// Writes `value` through the character pointer `sink` by forwarding
/// to writeImpl.
void write(T)(ref char* sink, T value)
{
    writeImpl(sink, value);
}
/// Writes `value` to the `sink` delegate by forwarding to writeImpl.
void write(T)(scope void delegate(const(char)[]) sink, T value)
{
    writeImpl(sink, value);
}
///
void writeArray(Sink, T, U)(auto ref Sink sink, T array, U delimiter)
if (isSomeSink!Sink && isArray!T && (isSomeChar!U || isSomeString!U) &&
__traits(compiles, sink.write(array[0])))
{
if (array.length == 0)
return;
foreach (elem; array[0 .. $ - 1]) {
sink.put(elem);
sink.put(delimiter);
sink.write(elem);
sink.write(delimiter);
}
sink.put(array[$ - 1]);
sink.write(array[$ - 1]);
}
/// Writes `value` as JSON by forwarding to writeJsonImpl.
/// Supports numbers, strings, and arrays. No dictionary - because D doesn't have a good one.
void writeJson(T)(ref char* sink, T value)
{
    writeJsonImpl(sink, value);
}
/// ditto
void writeJson(T)(scope void delegate(const(char)[]) sink, T value) { writeJsonImpl(sink,