Browse Source

sff reader

remotes/georgeg/bam_output_redesign
lomereiter 10 years ago
parent
commit
b307d0f7be
  1. 115
      bio/sff/file.d
  2. 6
      bio/sff/index.d
  3. 23
      bio/sff/read.d
  4. 127
      bio/sff/readrange.d

115
bio/sff/file.d

@ -0,0 +1,115 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
module bio.sff.file;
import bio.sff.index;
import bio.sff.readrange;
import bio.core.utils.stream;
import std.stream;
import std.system;
import std.range;
import std.exception;
/// SFF file reader
struct SffFile {
/// Open file by filename
this(string filename) {
_filename = filename;
_readHeader();
}
/// Reads
auto reads() @property {
auto stream = new bio.core.utils.stream.File(filename);
Stream sff = new EndianStream(stream, Endian.bigEndian);
sff.seekSet(_header_length);
auto sff_reads = SffReadRange(sff, cast(ushort)_flow_chars.length, _index_location);
return takeExactly(sff_reads, _n_reads);
}
/// File name
string filename() @property const {
return _filename;
}
/// Location of the index (if included).
IndexLocation index_location() @property const {
return _index_location;
}
/// Nucleotides used for each flow of each read
string flow_order() @property const {
return _flow_chars;
}
/// Nucleotide bases of the key sequence used for each read
string key_sequence() @property const {
return _key_sequence;
}
private {
string _filename;
uint _magic_number;
char[4] _version;
uint _n_reads;
ushort _header_length;
string _flow_chars;
string _key_sequence;
IndexLocation _index_location;
void _readHeader() {
auto stream = new bio.core.utils.stream.File(_filename);
auto sff = new EndianStream(stream, Endian.bigEndian);
sff.read(_magic_number);
enforce(_magic_number == 0x2E736666, "Wrong magic number, expected 0x2E736666");
sff.readExact(_version.ptr, 4);
enforce(_version == [0, 0, 0, 1], "Unsupported version, expected 1");
sff.read(_index_location.offset);
sff.read(_index_location.length);
sff.read(_n_reads);
sff.read(_header_length);
ushort _key_length;
ushort _number_of_flows;
ubyte _flowgram_format_code;
sff.read(_key_length);
sff.read(_number_of_flows);
sff.read(_flowgram_format_code);
enforce(_flowgram_format_code == 1,
"Flowgram format codes other than 1 are not supported");
_flow_chars = cast(string)sff.readString(_number_of_flows);
_key_sequence = cast(string)sff.readString(_key_length);
}
}
}

6
bio/sff/index.d

@ -0,0 +1,6 @@
module bio.sff.index;
struct IndexLocation {
ulong offset;
uint length;
}

23
bio/sff/read.d

@ -0,0 +1,23 @@
module bio.sff.read;
/// SFF record
struct SffRead {
/// Read identifier
string name;
/// Homopolymer stretch estimates for each flow of the read
ushort[] flowgram_values;
ubyte[] flow_index_per_base;
/// Basecalled nucleotide sequence
char[] bases;
/// Phred-scaled quality scores
ubyte[] quality_scores;
ushort clip_qual_left;
ushort clip_qual_right;
ushort clip_adapter_left;
ushort clip_adapter_right;
}

127
bio/sff/readrange.d

@ -0,0 +1,127 @@
module bio.sff.readrange;
import bio.sff.read;
import bio.sff.index;
import bio.core.utils.switchendianness;
import std.algorithm;
import std.stream;
import std.system;
import std.array;
private {
// GC used in D is quite bad at allocating lots of objects in a tight loop.
// The following is a simple way to reduce the number of allocations.
ubyte[] current_chunk;
size_t used;
size_t chunk_size = 65_536;
static this() {
current_chunk = uninitializedArray!(ubyte[])(chunk_size);
used = 0;
}
T[] allocateArray(T : T[])(size_t size) {
size_t new_used = used + size * T.sizeof;
if (new_used > chunk_size) {
new_used = size * T.sizeof;
if (new_used > chunk_size)
chunk_size = new_used;
current_chunk = uninitializedArray!(ubyte[])(chunk_size);
used = new_used;
return cast(T[])current_chunk[0 .. used];
} else {
auto old_used = used;
used = new_used;
return cast(T[])current_chunk[old_used .. used];
}
}
}
struct SffReadRange {
this(Stream stream,
ushort number_of_flows_per_read,
IndexLocation index_location)
{
_stream = stream;
_n_flows = number_of_flows_per_read;
_index_loc = index_location;
_fetchNextRead();
}
private {
Stream _stream;
ushort _n_flows;
IndexLocation _index_loc;
bool _empty;
SffRead _read;
void _fetchNextRead() {
if (_stream.eof) {
_empty = true;
} else {
if (_stream.position == _index_loc.offset)
_stream.seekCur(_index_loc.length);
// determine how many bytes to read
ushort read_header_length = void;
ushort name_length = void;
uint number_of_bases = void;
_stream.read(read_header_length);
_stream.read(name_length);
_stream.read(number_of_bases);
_stream.read(_read.clip_qual_left);
_stream.read(_read.clip_qual_right);
_stream.read(_read.clip_adapter_left);
_stream.read(_read.clip_adapter_right);
char[] name = allocateArray!(char[])(name_length);
_stream.readExact(name.ptr, name_length);
_stream.seekCur(read_header_length - 16 - name_length);
_read.name = cast(string)name;
size_t _data_length = _n_flows * ushort.sizeof + 3 * number_of_bases;
_read.flowgram_values = allocateArray!(ushort[])(_n_flows);
_stream.readExact(_read.flowgram_values.ptr, _n_flows * ushort.sizeof);
if (std.system.endian != Endian.bigEndian) {
for (size_t i = 0; i < _n_flows; ++i) {
switchEndianness(_read.flowgram_values.ptr + i, ushort.sizeof);
}
}
_read.flow_index_per_base = allocateArray!(ubyte[])(number_of_bases);
_stream.readExact(_read.flow_index_per_base.ptr, number_of_bases);
_read.bases = allocateArray!(char[])(number_of_bases);
_stream.readExact(_read.bases.ptr, number_of_bases);
_read.quality_scores = allocateArray!(ubyte[])(number_of_bases);
_stream.readExact(_read.quality_scores.ptr, number_of_bases);
if (_data_length % 8 > 0)
_stream.seekCur(8 - (_data_length % 8));
}
}
}
bool empty() @property const {
return _empty;
}
SffRead front() @property {
return _read;
}
void popFront() {
_fetchNextRead();
}
}
Loading…
Cancel
Save