
4 changed files with 271 additions and 0 deletions
@ -0,0 +1,115 @@ |
|||
/* |
|||
This file is part of BioD. |
|||
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> |
|||
|
|||
BioD is free software; you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License as published by |
|||
the Free Software Foundation; either version 2 of the License, or |
|||
(at your option) any later version. |
|||
|
|||
BioD is distributed in the hope that it will be useful, |
|||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
GNU General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with this program; if not, write to the Free Software |
|||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|||
|
|||
*/ |
|||
module bio.sff.file; |
|||
|
|||
import bio.sff.index; |
|||
import bio.sff.readrange; |
|||
|
|||
import bio.core.utils.stream; |
|||
import std.stream; |
|||
import std.system; |
|||
import std.range; |
|||
import std.exception; |
|||
|
|||
/// SFF file reader
|
|||
struct SffFile { |
|||
|
|||
/// Open file by filename
|
|||
this(string filename) { |
|||
_filename = filename; |
|||
|
|||
_readHeader(); |
|||
} |
|||
|
|||
/// Reads
|
|||
auto reads() @property { |
|||
auto stream = new bio.core.utils.stream.File(filename); |
|||
Stream sff = new EndianStream(stream, Endian.bigEndian); |
|||
|
|||
sff.seekSet(_header_length); |
|||
auto sff_reads = SffReadRange(sff, cast(ushort)_flow_chars.length, _index_location); |
|||
return takeExactly(sff_reads, _n_reads); |
|||
} |
|||
|
|||
/// File name
|
|||
string filename() @property const { |
|||
return _filename; |
|||
} |
|||
|
|||
/// Location of the index (if included).
|
|||
IndexLocation index_location() @property const { |
|||
return _index_location; |
|||
} |
|||
|
|||
/// Nucleotides used for each flow of each read
|
|||
string flow_order() @property const { |
|||
return _flow_chars; |
|||
} |
|||
|
|||
/// Nucleotide bases of the key sequence used for each read
|
|||
string key_sequence() @property const { |
|||
return _key_sequence; |
|||
} |
|||
|
|||
private { |
|||
string _filename; |
|||
|
|||
uint _magic_number; |
|||
char[4] _version; |
|||
|
|||
uint _n_reads; |
|||
ushort _header_length; |
|||
|
|||
string _flow_chars; |
|||
string _key_sequence; |
|||
|
|||
IndexLocation _index_location; |
|||
|
|||
void _readHeader() { |
|||
auto stream = new bio.core.utils.stream.File(_filename); |
|||
auto sff = new EndianStream(stream, Endian.bigEndian); |
|||
|
|||
sff.read(_magic_number); |
|||
enforce(_magic_number == 0x2E736666, "Wrong magic number, expected 0x2E736666"); |
|||
|
|||
sff.readExact(_version.ptr, 4); |
|||
enforce(_version == [0, 0, 0, 1], "Unsupported version, expected 1"); |
|||
|
|||
sff.read(_index_location.offset); |
|||
sff.read(_index_location.length); |
|||
|
|||
sff.read(_n_reads); |
|||
sff.read(_header_length); |
|||
|
|||
ushort _key_length; |
|||
ushort _number_of_flows; |
|||
ubyte _flowgram_format_code; |
|||
|
|||
sff.read(_key_length); |
|||
sff.read(_number_of_flows); |
|||
sff.read(_flowgram_format_code); |
|||
enforce(_flowgram_format_code == 1, |
|||
"Flowgram format codes other than 1 are not supported"); |
|||
|
|||
_flow_chars = cast(string)sff.readString(_number_of_flows); |
|||
_key_sequence = cast(string)sff.readString(_key_length); |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,6 @@ |
|||
module bio.sff.index; |
|||
|
|||
struct IndexLocation { |
|||
ulong offset; |
|||
uint length; |
|||
} |
@ -0,0 +1,23 @@ |
|||
module bio.sff.read; |
|||
|
|||
/// SFF record
|
|||
struct SffRead { |
|||
/// Read identifier
|
|||
string name; |
|||
|
|||
/// Homopolymer stretch estimates for each flow of the read
|
|||
ushort[] flowgram_values; |
|||
|
|||
ubyte[] flow_index_per_base; |
|||
|
|||
/// Basecalled nucleotide sequence
|
|||
char[] bases; |
|||
|
|||
/// Phred-scaled quality scores
|
|||
ubyte[] quality_scores; |
|||
|
|||
ushort clip_qual_left; |
|||
ushort clip_qual_right; |
|||
ushort clip_adapter_left; |
|||
ushort clip_adapter_right; |
|||
} |
@ -0,0 +1,127 @@ |
|||
module bio.sff.readrange; |
|||
|
|||
import bio.sff.read; |
|||
import bio.sff.index; |
|||
import bio.core.utils.switchendianness; |
|||
|
|||
import std.algorithm; |
|||
import std.stream; |
|||
import std.system; |
|||
import std.array; |
|||
|
|||
private { |
|||
|
|||
// GC used in D is quite bad at allocating lots of objects in a tight loop.
|
|||
// The following is a simple way to reduce the number of allocations.
|
|||
|
|||
ubyte[] current_chunk; |
|||
size_t used; |
|||
|
|||
size_t chunk_size = 65_536; |
|||
|
|||
static this() { |
|||
current_chunk = uninitializedArray!(ubyte[])(chunk_size); |
|||
used = 0; |
|||
} |
|||
|
|||
T[] allocateArray(T : T[])(size_t size) { |
|||
size_t new_used = used + size * T.sizeof; |
|||
if (new_used > chunk_size) { |
|||
new_used = size * T.sizeof; |
|||
if (new_used > chunk_size) |
|||
chunk_size = new_used; |
|||
|
|||
current_chunk = uninitializedArray!(ubyte[])(chunk_size); |
|||
used = new_used; |
|||
return cast(T[])current_chunk[0 .. used]; |
|||
} else { |
|||
auto old_used = used; |
|||
used = new_used; |
|||
return cast(T[])current_chunk[old_used .. used]; |
|||
} |
|||
} |
|||
} |
|||
|
|||
struct SffReadRange { |
|||
this(Stream stream, |
|||
ushort number_of_flows_per_read, |
|||
IndexLocation index_location) |
|||
{ |
|||
_stream = stream; |
|||
_n_flows = number_of_flows_per_read; |
|||
_index_loc = index_location; |
|||
|
|||
_fetchNextRead(); |
|||
} |
|||
|
|||
private { |
|||
Stream _stream; |
|||
ushort _n_flows; |
|||
IndexLocation _index_loc; |
|||
|
|||
bool _empty; |
|||
SffRead _read; |
|||
|
|||
void _fetchNextRead() { |
|||
if (_stream.eof) { |
|||
_empty = true; |
|||
} else { |
|||
if (_stream.position == _index_loc.offset) |
|||
_stream.seekCur(_index_loc.length); |
|||
|
|||
// determine how many bytes to read
|
|||
ushort read_header_length = void; |
|||
ushort name_length = void; |
|||
uint number_of_bases = void; |
|||
|
|||
_stream.read(read_header_length); |
|||
_stream.read(name_length); |
|||
_stream.read(number_of_bases); |
|||
_stream.read(_read.clip_qual_left); |
|||
_stream.read(_read.clip_qual_right); |
|||
_stream.read(_read.clip_adapter_left); |
|||
_stream.read(_read.clip_adapter_right); |
|||
|
|||
char[] name = allocateArray!(char[])(name_length); |
|||
_stream.readExact(name.ptr, name_length); |
|||
_stream.seekCur(read_header_length - 16 - name_length); |
|||
_read.name = cast(string)name; |
|||
|
|||
size_t _data_length = _n_flows * ushort.sizeof + 3 * number_of_bases; |
|||
|
|||
_read.flowgram_values = allocateArray!(ushort[])(_n_flows); |
|||
_stream.readExact(_read.flowgram_values.ptr, _n_flows * ushort.sizeof); |
|||
|
|||
if (std.system.endian != Endian.bigEndian) { |
|||
for (size_t i = 0; i < _n_flows; ++i) { |
|||
switchEndianness(_read.flowgram_values.ptr + i, ushort.sizeof); |
|||
} |
|||
} |
|||
|
|||
_read.flow_index_per_base = allocateArray!(ubyte[])(number_of_bases); |
|||
_stream.readExact(_read.flow_index_per_base.ptr, number_of_bases); |
|||
|
|||
_read.bases = allocateArray!(char[])(number_of_bases); |
|||
_stream.readExact(_read.bases.ptr, number_of_bases); |
|||
|
|||
_read.quality_scores = allocateArray!(ubyte[])(number_of_bases); |
|||
_stream.readExact(_read.quality_scores.ptr, number_of_bases); |
|||
|
|||
if (_data_length % 8 > 0) |
|||
_stream.seekCur(8 - (_data_length % 8)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
bool empty() @property const { |
|||
return _empty; |
|||
} |
|||
|
|||
SffRead front() @property { |
|||
return _read; |
|||
} |
|||
|
|||
void popFront() { |
|||
_fetchNextRead(); |
|||
} |
|||
} |
Loading…
Reference in new issue