
50 changed files with 3 additions and 11007 deletions
-
1.gitignore
-
2bio/std/file/fasta.d
-
2bio/std/file/fastq.d
-
3bio2/README.md
-
101bio2/bam/header.d
-
548bio2/bam/reader.d
-
86bio2/bam/writer.d
-
451bio2/bgzf.d
-
243bio2/bgzf_writer.d
-
11bio2/constants.d
-
99bio2/hashing.d
-
1bio2/logger.d
-
341bio2/pileup.d
-
46bio2/reads.d
-
41bio2/unpack.d
-
16dub.json
-
51examples/calculate_gc_content_from_reference.d
-
48examples/create_bam_from_scratch.d
-
58examples/iterate_tags.d
-
37examples/make_pileup.d
-
41examples/print_base_info.d
-
42examples/read_bam_file.d
-
37examples/transverse_multiple_bam_files.d
-
147meson.build
-
31src_ragel/Makefile
-
153src_ragel/maf_block.rl
-
86src_ragel/region.rl
-
525src_ragel/sam_alignment.rl
-
6src_ragel/workarounds/fix_static_const.sh
-
6src_ragel/workarounds/fix_switch_case_fallthrough.sh
-
BINtest/data/BXD_geno.txt.gz
-
3993test/data/b.sam
-
BINtest/data/b7_295_chunk.bam
-
BINtest/data/bins.bam
-
BINtest/data/bins.bam.bai
-
BINtest/data/corrupted_zlib_archive.bam
-
BINtest/data/duplicated_block_size.bam
-
BINtest/data/ex1_header.bam
-
BINtest/data/ex1_header.bam.bai
-
3273test/data/ex1_header.sam
-
BINtest/data/illu_20_chunk.bam
-
BINtest/data/ion_20_chunk.bam
-
BINtest/data/long_header.bam
-
BINtest/data/mg1655_chunk.bam
-
BINtest/data/no_block_size.bam
-
BINtest/data/tags.bam
-
BINtest/data/tags.bam.bai
-
BINtest/data/wrong_bc_subfield_length.bam
-
BINtest/data/wrong_extra_gzip_length.bam
-
484test/unittests.d
@ -1,3 +0,0 @@ |
|||
bio2 modules are the next generation BAM reading tools. Some of the |
|||
functionality aims to replace that of bio modules in this same |
|||
repository. |
@ -1,101 +0,0 @@ |
|||
/* |
|||
New style BAM reader. This file is part of Sambamba. |
|||
Copyright (C) 2017,2018 Pjotr Prins <pjotr.prins@thebird.nl> |
|||
|
|||
Sambamba is free software; you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License as published |
|||
by the Free Software Foundation; either version 2 of the License, |
|||
or (at your option) any later version. |
|||
|
|||
Sambamba is distributed in the hope that it will be useful, but |
|||
WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with this program; if not, write to the Free Software |
|||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
|||
02111-1307 USA |
|||
|
|||
*/ |
|||
|
|||
// This is a complete rewrite of Artem Tarasov's original reader.
|
|||
|
|||
module bio2.bam.header; |
|||
|
|||
/* |
|||
import std.conv; |
|||
import core.stdc.stdio: fopen, fread, fclose; |
|||
import std.typecons; |
|||
import std.bitmanip; |
|||
|
|||
import bio.bam.cigar; |
|||
*/ |
|||
|
|||
import std.exception; |
|||
import std.file; |
|||
import std.stdio; |
|||
import std.string; |
|||
|
|||
import bio.bam.constants; |
|||
|
|||
import bio2.bgzf; |
|||
import bio2.bgzf_writer; |
|||
import bio2.constants; |
|||
|
|||
struct RefSequence { |
|||
size_d length; |
|||
string name; |
|||
} |
|||
|
|||
struct BamHeader { |
|||
string id; |
|||
string text; |
|||
RefSequence[] refs; |
|||
|
|||
@disable this(this); // disable copy semantics;
|
|||
} |
|||
|
|||
void fetch_bam_header(ref BamHeader header, ref BgzfStream stream) { |
|||
// stderr.writeln("Fetching BAM header");
|
|||
ubyte[4] ubyte4; |
|||
stream.read(ubyte4); |
|||
enforce(ubyte4 == BAM_MAGIC,"Invalid file format: expected BAM magic number"); |
|||
immutable text_size = stream.read!int(); |
|||
// stderr.writeln("Text size ",text_size.sizeof," ",text_size);
|
|||
immutable text = stream.read!string(text_size); |
|||
header = BamHeader(BAM_MAGIC,text); |
|||
immutable n_refs = stream.read!int(); |
|||
// stderr.writeln("Fetching ",n_refs," references");
|
|||
foreach(int n_ref; 0..n_refs) { |
|||
immutable l_name = stream.read!int(); |
|||
// stderr.writeln("!!",l_name);
|
|||
auto ref_name = stream.read!string(l_name); |
|||
immutable l_ref = stream.read!int(); // length of reference sequence (bps)
|
|||
// stderr.writeln(l_name," ",ref_name," ",l_ref);
|
|||
header.refs ~= RefSequence(l_ref,ref_name[0..l_name-1]); // drop zero terminator
|
|||
} |
|||
} |
|||
|
|||
void write_bam_header(ref BgzfWriter bw, ref BamHeader header) { |
|||
// stderr.writeln("Writing BAM header");
|
|||
ubyte[4] magic = cast(ubyte[])BAM_MAGIC; |
|||
bw.write(magic); |
|||
// stderr.writeln("Text size ",int.sizeof," ",header.text.length);
|
|||
bw.write!int(cast(int)header.text.length); |
|||
bw.write(header.text); |
|||
auto n_refs = cast(int)header.refs.length; |
|||
bw.write!int(cast(int)header.refs.length); |
|||
// stderr.writeln("Writing ",n_refs," references");
|
|||
foreach(int n_ref; 0..n_refs) { |
|||
immutable refseq = header.refs[n_ref]; |
|||
bw.write!int(cast(int)(refseq.name.length+1)); // incl. zero terminator
|
|||
// stderr.writeln("!!",refseq.name.length+1);
|
|||
bw.write(refseq.name); |
|||
bw.write!ubyte(cast(ubyte)'\0'); |
|||
bw.write!int(cast(int)refseq.length); |
|||
// stderr.writeln(refseq.name.length+1," ",refseq.name," ",refseq.length);
|
|||
} |
|||
// stderr.writeln("!!");
|
|||
bw.flush_block(); |
|||
} |
@ -1,548 +0,0 @@ |
|||
/* |
|||
New style BAM reader. This file is part of Sambamba. |
|||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> |
|||
|
|||
Sambamba is free software; you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License as published |
|||
by the Free Software Foundation; either version 2 of the License, |
|||
or (at your option) any later version. |
|||
|
|||
Sambamba is distributed in the hope that it will be useful, but |
|||
WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with this program; if not, write to the Free Software |
|||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
|||
02111-1307 USA |
|||
|
|||
*/ |
|||
|
|||
// This is a complete rewrite of Artem Tarasov's original reader.
|
|||
|
|||
module bio2.bam.reader; |
|||
|
|||
import std.conv; |
|||
import core.stdc.stdio: fopen, fread, fclose; |
|||
import std.exception; |
|||
import std.file; |
|||
import std.format : format; |
|||
import std.stdio; |
|||
import std.string; |
|||
import std.typecons; |
|||
import std.bitmanip; |
|||
|
|||
import bio.bam.cigar; |
|||
import bio.bam.constants; |
|||
import bio.core.utils.exception; |
|||
|
|||
import bio2.bgzf; |
|||
import bio2.constants; |
|||
|
|||
import bio2.bam.header; |
|||
|
|||
template ReadFlags(alias flag) { |
|||
@property bool is_paired() nothrow { return cast(bool)(flag & 0x1); } |
|||
/// Each segment properly aligned according to the aligner
|
|||
@property bool is_proper_pair() nothrow { return cast(bool)(flag & 0x2); } |
|||
@property bool is_unmapped_raw() nothrow { return cast(bool)(flag & 0x4); } |
|||
@property bool is_mapped_raw() nothrow { return cast(bool)(!(flag & 0x4)); } |
|||
@property bool mate_is_unmapped() nothrow { return cast(bool)(flag & 0x8); } |
|||
@property bool is_reverse_strand() nothrow { return cast(bool)(flag & 0x10); } |
|||
@property bool mate_is_reverse_strand() nothrow { return cast(bool)(flag & 0x20); } |
|||
@property bool is_first_of_pair() nothrow { return cast(bool)(flag & 0x40); } |
|||
@property bool is_second_of_pair() nothrow { return cast(bool)(flag & 0x80); } |
|||
@property bool is_secondary_alignment() nothrow { return cast(bool)(flag & 0x100); } |
|||
@property bool is_qc_fail() { |
|||
assert(is_mapped_raw,to!string(this)); |
|||
return cast(bool)(flag & 0x200); } |
|||
alias is_qc_fail failed_quality_control; |
|||
/// PCR or optical duplicate
|
|||
@property bool is_duplicate() nothrow { return cast(bool)(flag & 0x400); } |
|||
/// Supplementary alignment
|
|||
@property bool is_supplementary() nothrow { return cast(bool)(flag & 0x800); } |
|||
@property string show_flags() { |
|||
string res = format("b%b-%d",flag,flag); |
|||
if (is_paired) res ~= " pair"; |
|||
if (is_proper_pair) res ~= " proper"; |
|||
if (is_mapped_raw) res ~= " mapped"; |
|||
if (is_unmapped_raw) res ~= " unmapped"; |
|||
if (mate_is_unmapped) res ~= " mate_unmapped"; |
|||
if (is_reverse_strand) res ~= " rev_strand"; |
|||
if (mate_is_reverse_strand) res ~= " mate_rev_strand"; |
|||
if (is_first_of_pair) res ~= " first_of_pair"; |
|||
if (is_second_of_pair) res ~= " second_of_pair"; |
|||
if (is_secondary_alignment) res ~= " secondary_aln"; |
|||
if (is_mapped_raw && is_qc_fail) res ~= " qc_fail"; |
|||
if (is_duplicate) res ~= " duplicate"; |
|||
if (is_supplementary) res ~= " suppl"; |
|||
return res; |
|||
} |
|||
} |
|||
|
|||
template CheckMapped(alias refid) { |
|||
@property nothrow bool is_unmapped() { |
|||
return is_unmapped_raw; |
|||
} |
|||
@property bool is_mapped() { |
|||
debug { |
|||
if (is_mapped_raw) { |
|||
assert(refid != -1, "ref_id can not be -1 for mapped read"); // BAM spec
|
|||
} |
|||
} |
|||
return !is_unmapped_raw; |
|||
} |
|||
} |
|||
|
|||
enum Offset { |
|||
bin_mq_nl=0, flag_nc=4, flag=6, l_seq=8, next_refID=12, next_pos=16, tlen=20, read_name=24 |
|||
}; |
|||
|
|||
/** |
|||
Raw Read buffer containing unparsed data. It should be considered |
|||
read-only. |
|||
|
|||
Current implementation is a cluct (class-struct hybrid). The _data |
|||
pointer is shared when ReadBlob is assigned to another variable |
|||
(i.e., there is a remote dependency). The advantage is that for |
|||
each Read data gets allocated on the heap only once. |
|||
|
|||
All offsets are indexed on init (except for tags). When using |
|||
fields beyond refid,pos use ProcessReadBlob instead because it |
|||
caches values. |
|||
*/ |
|||
|
|||
struct ReadBlob { |
|||
RefId refid; // -1 is invalid (BAM Spec)
|
|||
GenomePos pos; // 0 coordinate based (BAM spec)
|
|||
private ubyte[] _data; |
|||
uint offset_cigar=int.max, offset_seq=int.max, offset_qual=int.max; |
|||
|
|||
mixin ReadFlags!(_flag); |
|||
mixin CheckMapped!(refid); |
|||
|
|||
/* |
|||
this(RefId ref_id, GenomePos read_pos, ubyte[] buf) { |
|||
refid = ref_id; |
|||
pos = read_pos; |
|||
_data = buf; |
|||
} |
|||
*/ |
|||
|
|||
@property void cleanup() { |
|||
destroy(_data); |
|||
_data = null; |
|||
} |
|||
|
|||
// Turn ReadBlob into class-struct hybrid or a cluct ;)
|
|||
// @disable this(this); // disable copy semantics;
|
|||
|
|||
@property @trusted nothrow private const T fetch(T)(uint raw_offset) { |
|||
ubyte[] buf = cast(ubyte[])_data[raw_offset..raw_offset+T.sizeof]; |
|||
return cast(const(T))buf.read!(T,Endian.littleEndian)(); |
|||
} |
|||
|
|||
@property @trusted nothrow private const |
|||
uint _bin_mq_nl() { return fetch!uint(Offset.bin_mq_nl); } |
|||
@property @trusted nothrow private const |
|||
uint _flag_nc() { return fetch!uint(Offset.flag_nc); } |
|||
@property @trusted nothrow private const |
|||
int sequence_length() { return fetch!int(Offset.l_seq); } |
|||
@property @trusted nothrow private const |
|||
int _next_refID() { return fetch!int(Offset.next_refID); } |
|||
@property @trusted nothrow private const |
|||
int _next_pos() { return fetch!int(Offset.next_pos); } |
|||
@property @trusted nothrow private const |
|||
int _tlen() { return fetch!int(Offset.tlen); } // avoid using TLEN
|
|||
@property @trusted nothrow private const |
|||
ushort _bin() { return _bin_mq_nl >> 16; } |
|||
@property @trusted nothrow private const |
|||
ubyte _mapq() { return (_bin_mq_nl >> 8) & 0xFF; } |
|||
@property @trusted nothrow private const |
|||
ubyte _l_read_name() { return _bin_mq_nl & 0xFF; } |
|||
@property @trusted nothrow private const |
|||
ushort _flag() { return fetch!ushort(Offset.flag); } |
|||
@property @trusted nothrow private const |
|||
ushort _n_cigar_op() { return _flag_nc & 0xFFFF; } |
|||
@property @trusted nothrow private const |
|||
uint _read_name_offset() { return Offset.read_name; } |
|||
@property @trusted nothrow private |
|||
uint _cigar_offset() { |
|||
if (offset_cigar == int.max) |
|||
offset_cigar = Offset.read_name + cast(uint)(_l_read_name * char.sizeof); |
|||
return offset_cigar; |
|||
} |
|||
@property @trusted nothrow private |
|||
uint _seq_offset() { |
|||
if (offset_seq == int.max) |
|||
offset_seq = _cigar_offset + cast(uint)(_n_cigar_op * uint.sizeof); |
|||
return offset_seq; |
|||
} |
|||
@property @trusted nothrow private |
|||
uint _qual_offset() { |
|||
if (offset_qual == int.max) |
|||
offset_qual = _seq_offset + (sequence_length + 1)/2; |
|||
return offset_qual; |
|||
} |
|||
@property @trusted nothrow private |
|||
uint _tags_offset() { return _qual_offset + sequence_length; } |
|||
@property @trusted nothrow private |
|||
ubyte[] read_name() { return _data[_read_name_offset.._cigar_offset]; } |
|||
@property @trusted nothrow private |
|||
ubyte[] raw_cigar() { return _data[_cigar_offset.._seq_offset]; } |
|||
@property @trusted nothrow private |
|||
ubyte[] raw_sequence() { return _data[_seq_offset.._qual_offset]; } |
|||
|
|||
alias sequence_length _l_seq; |
|||
alias _mapq mapping_quality; // MAPQ
|
|||
|
|||
string toString() { |
|||
return "<** " ~ ReadBlob.stringof ~ " (data size " ~ to!string(_data.length) ~ ") " ~ to!string(refid) ~ ":" ~ to!string(pos) ~ " length " ~ to!string(sequence_length) ~ " flags " ~ show_flags() ~ ">"; |
|||
} |
|||
|
|||
} |
|||
|
|||
/** |
|||
ProcessReadBlob provides a caching mechanism for ReadBlob fields. Use |
|||
this when you need to access field/elements multiple times. Note |
|||
that ProcessReadBlob becomes invalid when ReadBlob goes out of scope. |
|||
*/ |
|||
struct ProcessReadBlob { |
|||
private Nullable!ReadBlob _read2; |
|||
Nullable!int sequence_length2; |
|||
private Nullable!string sequence2, read_name2; |
|||
private Nullable!CigarOperations cigar2; |
|||
private Nullable!GenomePos consumed_reference_bases2; |
|||
|
|||
mixin ReadFlags!(_flag); |
|||
mixin CheckMapped!(ref_id); |
|||
|
|||
this(Nullable!ReadBlob _r) { |
|||
_read2 = _r; |
|||
} |
|||
|
|||
@property void cleanup() { |
|||
_read2.cleanup; |
|||
} |
|||
|
|||
@property nothrow bool isNull() { |
|||
return _read2.isNull; |
|||
} |
|||
|
|||
@property RefId ref_id() { |
|||
enforce(_read2.is_mapped,"Trying to get ref_id an unmapped read " ~ to!string(_read2)); |
|||
return _read2.refid; |
|||
} |
|||
|
|||
@property RefId raw_ref_id() { |
|||
return _read2.refid; |
|||
} |
|||
|
|||
@property nothrow uint _flag_nc() { |
|||
return _read2._flag_nc; |
|||
} |
|||
|
|||
@property nothrow ushort _flag() { |
|||
return _read2._flag; |
|||
} |
|||
|
|||
alias ref_id refid; |
|||
|
|||
/// Get the start position on the reference sequence (better use
|
|||
/// start_loc), i.e., the first base that gets consumed in the
|
|||
/// CIGAR.
|
|||
@property GenomePos start_pos() { |
|||
assert(_read2.is_mapped,"Trying to get pos on an unmapped read"); // BAM spec
|
|||
asserte(_read2.pos < GenomePos.max); |
|||
return cast(GenomePos)_read2.pos; |
|||
} |
|||
|
|||
@property GenomePos raw_start_pos() { |
|||
return cast(GenomePos)_read2.pos; |
|||
} |
|||
|
|||
/// Get the end position on the reference sequence (better use end_loc)
|
|||
@property GenomePos end_pos() { |
|||
assert(sequence_length > 0, "Trying to get end_pos on an empty read sequence"); |
|||
assert(!consumed_reference_bases.isNull); |
|||
return start_pos + consumed_reference_bases; |
|||
} |
|||
|
|||
@property GenomePos raw_end_pos() { |
|||
return raw_start_pos + consumed_reference_bases; |
|||
} |
|||
|
|||
@property GenomeLocation start_loc() { |
|||
return GenomeLocation(ref_id,start_pos); |
|||
} |
|||
|
|||
@property GenomeLocation end_loc() { |
|||
return GenomeLocation(ref_id,end_pos); |
|||
} |
|||
|
|||
@property @trusted MappingQuality mapping_quality() { // MAPQ
|
|||
assert(_read2.is_mapped,"Trying to get MAPQ on an unmapped read"); // BAM spec
|
|||
return MappingQuality(_read2.mapping_quality); |
|||
} |
|||
|
|||
@property @trusted int tlen() { // do not use
|
|||
return _read2._tlen; |
|||
} |
|||
|
|||
@property @trusted GenomePos sequence_length() { |
|||
if (sequence_length2.isNull) |
|||
sequence_length2 = _read2.sequence_length; |
|||
return sequence_length2; |
|||
} |
|||
|
|||
/// Count and caches consumed reference bases. Uses raw_cigar to
|
|||
/// avoid a heap allocation.
|
|||
@property @trusted Nullable!GenomePos consumed_reference_bases() { |
|||
if (consumed_reference_bases2.isNull) { |
|||
assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
|
|||
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
|
|||
auto raw = cast(uint[]) _read2.raw_cigar(); |
|||
if (raw.length==1 && raw[0] == '*') |
|||
return consumed_reference_bases2; // null
|
|||
else { |
|||
GenomePos bases = 0; |
|||
for (size_t i = 0; i < raw.length; i++) { |
|||
auto cigarop = CigarOperation(raw[i]); |
|||
if (cigarop.is_reference_consuming) |
|||
bases += cigarop.length; |
|||
} |
|||
consumed_reference_bases2 = bases; |
|||
} |
|||
} |
|||
return consumed_reference_bases2; |
|||
} |
|||
|
|||
/// Count query consumed bases. Uses raw_cigar to avoid a heap
|
|||
/// allocation.
|
|||
@property @trusted GenomePos consumed_query_bases() { |
|||
assert(_read2.is_mapped,"Trying to get consumed bases on an unmapped read"); // BAM spec
|
|||
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
|
|||
auto raw = cast(uint[]) _read2.raw_cigar(); |
|||
if (raw.length==1 && raw[0] == '*') |
|||
return consumed_reference_bases2; // null
|
|||
else { |
|||
GenomePos bases = 0; |
|||
for (size_t i = 0; i < raw.length; i++) { |
|||
auto cigarop = CigarOperation(raw[i]); |
|||
if (cigarop.is_query_consuming) |
|||
bases += cigarop.length; |
|||
} |
|||
return bases; |
|||
} |
|||
} |
|||
|
|||
/// Return read name as a string. If unavailable returns
|
|||
/// null. Caches name.
|
|||
@property Nullable!string read_name() { |
|||
if (read_name2.isNull) { |
|||
assert(_read2.is_mapped,"Trying to get RNAME on an unmapped read"); // BAM spec
|
|||
auto raw = _read2.read_name; |
|||
if (raw.length == 0 || (raw.length ==1 && raw[0] == '*')) |
|||
return read_name2; // null
|
|||
assert(raw.length < 255); // BAM spec
|
|||
if (raw[raw.length-1] == 0) // strip trailing C zero
|
|||
raw.length -= 1; |
|||
read_name2 = Nullable!string(cast(string)raw); |
|||
} |
|||
return read_name2; |
|||
} |
|||
|
|||
/// Returns Cigar as an array of operations. Returns null if no
|
|||
/// operations. Caches Cigar when there are operations.
|
|||
@property Nullable!CigarOperations cigar() { |
|||
if (cigar2.isNull) { |
|||
assert(_read2.is_mapped,"Trying to get CIGAR on an unmapped read"); // BAM spec
|
|||
assert(!read_name.isNull,"Trying to get CIGAR on RNAME is '*'"); // BAM spec
|
|||
auto raw = cast(uint[]) _read2.raw_cigar(); |
|||
if (raw.length==0 || (raw.length==1 && raw[0] == '*')) |
|||
return cigar2; // null
|
|||
else { |
|||
auto s = new CigarOperation[raw.length]; // Heap alloc
|
|||
s.length = 0; |
|||
for (size_t i = 0; i < raw.length; i++) { |
|||
s ~= CigarOperation(raw[i]); |
|||
} |
|||
cigar2 = s; |
|||
} |
|||
} |
|||
return cigar2; |
|||
} |
|||
|
|||
/// Return human readable sequence fragment - null if
|
|||
/// undefined. Caches sequence.
|
|||
@property Nullable!string sequence() { |
|||
if (sequence2.isNull) { // is it cached in sequence2?
|
|||
auto raw = _read2.raw_sequence(); |
|||
if (raw[0] == '*') { |
|||
assert(raw.length == 1); |
|||
return sequence2; // null
|
|||
} |
|||
auto raw_length = (sequence_length + 1) / 2; |
|||
char[16] convert = "=ACMGRSVTWYHKDBN"; |
|||
string s; |
|||
s.reserve(sequence_length); // Heap alloc
|
|||
for (size_t i = 0; i < sequence_length; i++) { |
|||
auto is_odd = i % 2; |
|||
auto nuc = (is_odd ? raw[i/2] & 0b00001111 : (raw[i/2] & 0b11110000) >> 4); |
|||
s ~= convert[nuc]; |
|||
} |
|||
sequence2 = s; |
|||
} |
|||
return sequence2; |
|||
} |
|||
|
|||
@property ubyte[] toBlob() { |
|||
return _read2._data; |
|||
} |
|||
|
|||
@property string posString() { |
|||
return (is_mapped ? to!string(ref_id) ~ ":" ~ to!string(start_pos) : "unmapped"); |
|||
} |
|||
|
|||
string toString() { |
|||
// return "<** " ~ ProcessReadBlob.stringof ~ ") " ~ to!string(_read2.refid) ~ ":" ~ to!string(_read2.pos) ~ " length " ~ to!string(sequence_length) ~ ">";
|
|||
return _read2.get.toString(); |
|||
} |
|||
|
|||
} |
|||
|
|||
/** |
|||
BamReader2 is used for foreach loops |
|||
*/ |
|||
|
|||
struct BamReadBlobs { |
|||
BgzfStream stream; |
|||
BamHeader header; |
|||
|
|||
this(string fn) { |
|||
stream = BgzfStream(fn); |
|||
} |
|||
|
|||
int opApply(scope int delegate(ref ReadBlob) dg) { |
|||
fetch_bam_header(header, stream); |
|||
// parse the reads
|
|||
while (!stream.eof()) { |
|||
immutable block_size = stream.read!int(); |
|||
immutable refid = stream.read!int(); |
|||
immutable pos = stream.read!int(); |
|||
|
|||
ubyte[] data = new ubyte[block_size-2*int.sizeof]; // Heap alloc FIXME
|
|||
auto read = ReadBlob(refid,pos,stream.read(data)); |
|||
dg(read); |
|||
} |
|||
return 0; |
|||
} |
|||
} |
|||
|
|||
/** |
|||
Read streamer - use on single thread only |
|||
*/ |
|||
|
|||
// import core.memory : pureMalloc;
|
|||
|
|||
struct BamReadBlobStream { |
|||
BgzfStream stream; |
|||
BamHeader header; |
|||
Nullable!ReadBlob readbuf; // points to current read
|
|||
ubyte[] data; // in sync with readbuf
|
|||
|
|||
this(string fn) { |
|||
stream = BgzfStream(fn); |
|||
fetch_bam_header(header, stream); |
|||
if (!empty) |
|||
popFront(); // preload front
|
|||
} |
|||
|
|||
bool empty() @property { |
|||
return stream.eof(); |
|||
} |
|||
|
|||
// Returns first read available. If past eof returns null.
|
|||
Nullable!ReadBlob front() { |
|||
return readbuf; |
|||
} |
|||
|
|||
void popFront() { |
|||
asserte(!empty); // should have been checked for
|
|||
immutable block_size = stream.read!int(); |
|||
immutable refid = stream.read!int(); |
|||
immutable pos = stream.read!int(); |
|||
|
|||
// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
|
|||
data = new ubyte[block_size-2*int.sizeof]; |
|||
readbuf = ReadBlob(refid,pos,stream.read(data)); |
|||
assert(readbuf._data.ptr == data.ptr); |
|||
} |
|||
|
|||
} |
|||
|
|||
/** |
|||
Reader - use on single thread only |
|||
|
|||
This one provides peek support. Peek looks one read ahead in the read stream. |
|||
*/ |
|||
|
|||
struct BamBlobReader { |
|||
BgzfStream stream; |
|||
BamHeader header; |
|||
Nullable!ReadBlob peekbuf; // points to current read
|
|||
// ubyte[] data; // in sync with peekbuf
|
|||
|
|||
this(string fn) { |
|||
stream = BgzfStream(fn); |
|||
fetch_bam_header(header, stream); |
|||
} |
|||
|
|||
bool empty() @property { |
|||
return peekbuf.isNull && stream.eof(); |
|||
} |
|||
|
|||
Nullable!ReadBlob peek() { |
|||
if (peekbuf.isNull && !empty) |
|||
fetch(); |
|||
return peekbuf; |
|||
} |
|||
|
|||
/// Fetches the next read. If the peekbuf is not empty return that
|
|||
/// first and reset peekbuf.
|
|||
Nullable!ReadBlob fetch() { |
|||
if (!peekbuf.isNull) { |
|||
auto readbuf = peekbuf; |
|||
peekbuf = Nullable!ReadBlob(); |
|||
return readbuf; |
|||
} |
|||
asserte(!empty); // should have been checked for
|
|||
immutable block_size = stream.read!int(); |
|||
immutable refid = stream.read!int(); |
|||
immutable pos = stream.read!int(); |
|||
|
|||
// void *p = pureMalloc(block_size-2*int.sizeof); // test for GC effectiveness
|
|||
auto data = new ubyte[block_size-2*int.sizeof]; |
|||
peekbuf = ReadBlob(refid,pos,stream.read(data)); |
|||
return peekbuf; |
|||
} |
|||
|
|||
/// Returns the next matching read. Otherwise null
|
|||
///
|
|||
/// Example:
|
|||
///
|
|||
/// auto readbuf = ProcessReadBlob(stream.read_if!ProcessReadBlob((r) => !remove && r.is_mapped));
|
|||
/* |
|||
Nullable!ReadBlob read_if(R)(bool delegate(R r) is_match) { |
|||
while(!empty()) { |
|||
auto readbuf = read(); |
|||
if (is_match(R(readbuf))) |
|||
return readbuf; |
|||
else |
|||
return Nullable!ReadBlob(); |
|||
} |
|||
return Nullable!ReadBlob(); |
|||
} |
|||
*/ |
|||
} |
@ -1,86 +0,0 @@ |
|||
/* |
|||
New style BAM writer. This file is part of Sambamba. |
|||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> |
|||
|
|||
Sambamba is free software; you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License as published |
|||
by the Free Software Foundation; either version 2 of the License, |
|||
or (at your option) any later version. |
|||
|
|||
Sambamba is distributed in the hope that it will be useful, but |
|||
WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with this program; if not, write to the Free Software |
|||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
|||
02111-1307 USA |
|||
|
|||
*/ |
|||
|
|||
module bio2.bam.writer; |
|||
|
|||
import std.conv; |
|||
import core.stdc.stdio: fopen, fread, fclose; |
|||
import std.exception; |
|||
import std.file; |
|||
import std.stdio; |
|||
import std.string; |
|||
import std.typecons; |
|||
import std.bitmanip; |
|||
|
|||
import bio.bam.cigar; |
|||
import bio.bam.constants; |
|||
|
|||
import bio2.bgzf; |
|||
import bio2.bgzf_writer; |
|||
import bio2.constants; |
|||
|
|||
import bio2.bam.header; |
|||
import bio2.bam.reader : ProcessReadBlob, Offset; |
|||
|
|||
struct ModifyProcessReadBlob { // make this generic later
|
|||
ProcessReadBlob _read2; |
|||
|
|||
@property ubyte[] toBlob() { |
|||
return _read2.toBlob(); |
|||
} |
|||
|
|||
@property void set_qc_fail() { |
|||
auto data = _read2.toBlob; |
|||
// writeln(_read2._flag);
|
|||
// data[Offset.flag_nc] = data[Offset.flag_nc] & 0x200;
|
|||
// writeln(data[Offset.flag_nc]);
|
|||
// buf.write!(T,Endian.littleEndian)(value,0);
|
|||
// ushort _flag() { return fetch!ushort(Offset.flag); }
|
|||
|
|||
ushort flag = _read2._flag | 0x200; |
|||
// writeln("flag=",flag);
|
|||
data[Offset.flag..Offset.flag+4].write!(ushort,Endian.littleEndian)(flag,0); |
|||
} |
|||
} |
|||
|
|||
struct BamWriter { |
|||
BgzfWriter bgzf_writer; |
|||
|
|||
this(string fn, ref BamHeader header, int compression_level = -1) { |
|||
bgzf_writer = BgzfWriter(fn,compression_level); |
|||
write_bam_header(bgzf_writer,header); |
|||
} |
|||
|
|||
void push(ModifyProcessReadBlob read) { |
|||
auto mod = read; |
|||
auto blob = mod.toBlob; |
|||
// another hack for now:
|
|||
bgzf_writer.write!int(cast(int)(blob.length+2*int.sizeof)); |
|||
bgzf_writer.write!int(cast(int)mod._read2.raw_ref_id); |
|||
bgzf_writer.write!int(cast(int)mod._read2.raw_start_pos); |
|||
bgzf_writer.write(blob); |
|||
} |
|||
|
|||
void push(ProcessReadBlob read) { |
|||
push(ModifyProcessReadBlob(read)); |
|||
} |
|||
|
|||
} |
@ -1,451 +0,0 @@ |
|||
/* |
|||
This file is part of Sambamba. |
|||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> |
|||
|
|||
Sambamba is free software; you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License as published |
|||
by the Free Software Foundation; either version 2 of the License, |
|||
or (at your option) any later version. |
|||
|
|||
Sambamba is distributed in the hope that it will be useful, but |
|||
WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with this program; if not, write to the Free Software |
|||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
|||
02111-1307 USA |
|||
|
|||
*/ |
|||
|
|||
module bio2.bgzf; |
|||
|
|||
/** |
|||
|
|||
This is a new version of sambamba bgzf (under development). Bgzf is |
|||
a blocked version of the ubiquous gzip format. By making it blocked |
|||
it allows for seeking in gz files. Note that without seeking it |
|||
can take standard gz files too. |
|||
|
|||
The new version is a prototype for new sambamba architecture using |
|||
canonical D language features, including immutable and improved |
|||
laziness and a more functional programming style. It should provide |
|||
improved performance and minimize RAM use, as well as better |
|||
composability. |
|||
|
|||
Authors: Pjotr Prins |
|||
|
|||
*/ |
|||
|
|||
import core.stdc.string : memcpy; |
|||
|
|||
import std.bitmanip; |
|||
import std.conv; |
|||
import std.exception; |
|||
import std.file; |
|||
import std.stdio; |
|||
import std.typecons; |
|||
import std.zlib : calc_crc32 = crc32, ZlibException; |
|||
|
|||
import bio.bam.constants; |
|||
import bio.core.bgzf.block; |
|||
import bio.core.bgzf.constants; |
|||
import bio.core.utils.zlib : inflateInit2, inflate, inflateEnd, Z_OK, Z_FINISH, Z_STREAM_END; |
|||
|
|||
import bio2.constants; |
|||
|
|||
class BgzfException : Exception { |
|||
this(string msg) { super(msg); } |
|||
} |
|||
|
|||
alias Nullable!ulong FilePos; |
|||
alias immutable(uint) CRC32; |
|||
|
|||
alias BGZF_MAX_BLOCK_SIZE BLOCK_SIZE; |
|||
|
|||
alias ubyte[BLOCK_SIZE] BlockBuffer; |
|||
|
|||
@property ubyte read_ubyte(File f) { |
|||
ubyte[1] ubyte1; // read buffer
|
|||
immutable ubyte[1] buf = f.rawRead(ubyte1); |
|||
return buf[0]; |
|||
} |
|||
|
|||
@property ushort read_ushort(File f) { |
|||
ubyte[2] ubyte2; // read buffer
|
|||
immutable ubyte[2] buf = f.rawRead(ubyte2); |
|||
return littleEndianToNative!ushort(buf); |
|||
} |
|||
@property uint read_uint(File f) { |
|||
ubyte[4] ubyte4; // read buffer
|
|||
immutable ubyte[4] buf = f.rawRead(ubyte4); |
|||
return littleEndianToNative!uint(buf); |
|||
} |
|||
|
|||
/** |
|||
Uncompress a zlib buffer (without header) |
|||
*/ |
|||
ubyte[] deflate(ubyte[] uncompressed_buf, const ubyte[] compressed_buf, size_t uncompressed_size, CRC32 crc32) { |
|||
assert(uncompressed_buf.length == BLOCK_SIZE); |
|||
bio.core.utils.zlib.z_stream zs; |
|||
zs.next_in = cast(typeof(zs.next_in))compressed_buf; |
|||
zs.avail_in = to!uint(compressed_buf.length); |
|||
|
|||
auto err = inflateInit2(&zs, /* winbits = */-15); |
|||
if (err != Z_OK) throw new ZlibException(err); |
|||
|
|||
zs.next_out = cast(typeof(zs.next_out))uncompressed_buf.ptr; |
|||
zs.avail_out = cast(int)uncompressed_buf.length; |
|||
|
|||
scope(exit) { inflateEnd(&zs); } |
|||
err = inflate(&zs, Z_FINISH); |
|||
if (err != Z_STREAM_END) throw new ZlibException(err); |
|||
|
|||
assert(zs.total_out == uncompressed_size); |
|||
uncompressed_buf.length = uncompressed_size; |
|||
assert(crc32 == calc_crc32(0, uncompressed_buf[])); |
|||
|
|||
return uncompressed_buf; |
|||
} |
|||
|
|||
/** |
|||
BgzfReader is designed to run on a single thread. All it does is |
|||
fetch block headers and data, so the thread should easily keep up |
|||
with IO. All data processing is happening lazily in other threads. |
|||
*/ |
|||
struct BgzfReader { |
|||
File f; |
|||
FilePos report_fpos; // for error handler - assumes one thread!
|
|||
|
|||
this(string fn) { |
|||
enforce(fn.isFile); |
|||
f = File(fn,"r"); |
|||
} |
|||
|
|||
@disable this(this); // BgzfReader does not have copy semantics;
|
|||
|
|||
void throwBgzfException(string msg, string file = __FILE__, size_t line = __LINE__) { |
|||
throw new BgzfException("Error reading BGZF block starting in "~f.name ~" @ " ~ |
|||
to!string(report_fpos) ~ " (" ~ file ~ ":" ~ to!string(line) ~ "): " ~ msg); |
|||
} |
|||
|
|||
void enforce1(bool check, lazy string msg, string file = __FILE__, int line = __LINE__) { |
|||
if (!check) |
|||
throwBgzfException(msg,file,line); |
|||
} |
|||
|
|||
/** |
|||
Reads the block header and returns the contained compressed data |
|||
size with the file pointer positioned at the associated |
|||
compressed data. |
|||
*/ |
|||
size_t read_block_header() { |
|||
ubyte[4] ubyte4; |
|||
auto magic = f.rawRead(ubyte4); |
|||
enforce1(magic.length == 4, "Premature end of file"); |
|||
enforce1(magic[0..4] == BGZF_MAGIC,"Invalid file format: expected bgzf magic number"); |
|||
ubyte[uint.sizeof + 2 * ubyte.sizeof] skip; |
|||
f.rawRead(skip); // skip gzip info
|
|||
ushort gzip_extra_length = f.read_ushort(); |
|||
immutable fpos1 = f.tell; |
|||
size_t bsize = 0; |
|||
while (f.tell < fpos1 + gzip_extra_length) { |
|||
immutable subfield_id1 = f.read_ubyte(); |
|||
immutable subfield_id2 = f.read_ubyte(); |
|||
immutable subfield_len = f.read_ushort(); |
|||
if (subfield_id1 == BAM_SI1 && subfield_id2 == BAM_SI2) { |
|||
// BC identifier
|
|||
enforce(gzip_extra_length == 6); |
|||
// FIXME: always picks first BC block
|
|||
bsize = 1+f.read_ushort(); // BLOCK size
|
|||
enforce1(subfield_len == 2, "BC subfield len should be 2"); |
|||
break; |
|||
} |
|||
else { |
|||
f.seek(subfield_len,SEEK_CUR); |
|||
} |
|||
enforce1(bsize!=0,"block size not found"); |
|||
f.seek(fpos1+gzip_extra_length); // skip any extra subfields - note we don't check for second BC
|
|||
} |
|||
immutable compressed_size = bsize - 1 - gzip_extra_length - 19; |
|||
enforce1(compressed_size <= BLOCK_SIZE, "compressed size larger than allowed"); |
|||
|
|||
// stderr.writeln("[compressed] size ", compressed_size, " bytes starting block @ ", report_fpos);
|
|||
return compressed_size; |
|||
} |
|||
|
|||
/** |
|||
Fetch the compressed data part of the block and return it with |
|||
the uncompressed size and CRC32. The file pointer is assumed to |
|||
be at the start of the compressed data and will be at the end of |
|||
that section after. |
|||
*/ |
|||
Tuple!(ubyte[],immutable(uint),CRC32) read_compressed_data(ubyte[] buffer) { |
|||
auto compressed_buf = f.rawRead(buffer); |
|||
|
|||
immutable CRC32 crc32 = f.read_uint(); |
|||
immutable uncompressed_size = f.read_uint(); |
|||
// stderr.writeln("[uncompressed] size ",uncompressed_size);
|
|||
return tuple(compressed_buf,uncompressed_size,crc32); |
|||
} |
|||
|
|||
/** |
|||
* Returns new tuple of the new file position, the compressed buffer and |
|||
* the CRC32 o the uncompressed data. file pos is NULL when done |
|||
*/ |
|||
Tuple!(FilePos,ubyte[],size_t,CRC32) read_compressed_block(FilePos fpos, ubyte[] buffer) { |
|||
immutable start_offset = fpos; |
|||
try { |
|||
if (fpos.isNull) throwBgzfException("Trying to read past eof"); |
|||
report_fpos = fpos; |
|||
f.seek(fpos); |
|||
immutable compressed_size = read_block_header(); |
|||
auto ret = read_compressed_data(buffer[0..compressed_size]); |
|||
auto compressed_buf = ret[0]; |
|||
immutable uncompressed_size = ret[1]; |
|||
immutable crc32 = ret[2]; |
|||
|
|||
if (uncompressed_size == 0) { |
|||
// check for eof marker, rereading block header
|
|||
auto lastpos = f.tell(); |
|||
f.seek(start_offset); |
|||
ubyte[BGZF_EOF.length] buf; |
|||
f.rawRead(buf); |
|||
f.seek(lastpos); |
|||
if (buf == BGZF_EOF) |
|||
return tuple(FilePos(),compressed_buf,cast(ulong)0,crc32); // sets fpos to null
|
|||
} |
|||
return tuple(FilePos(f.tell()),compressed_buf,cast(size_t)uncompressed_size,crc32); |
|||
} catch (Exception e) { throwBgzfException(e.msg,e.file,e.line); } |
|||
assert(0); // never reached
|
|||
} |
|||
} |
|||
|
|||
/** |
|||
Simple block iterator |
|||
*/ |
|||
struct BgzfBlocks { |
|||
BgzfReader bgzf; |
|||
|
|||
this(string fn) { |
|||
bgzf = BgzfReader(fn); |
|||
} |
|||
|
|||
@disable this(this); // disable copy semantics;
|
|||
|
|||
int opApply(scope int delegate(ubyte[]) dg) { |
|||
FilePos fpos = 0; |
|||
|
|||
try { |
|||
while (!fpos.isNull) { |
|||
BlockBuffer stack_buffer; |
|||
auto res = bgzf.read_compressed_block(fpos,stack_buffer); |
|||
fpos = res[0]; // point fpos to next block
|
|||
if (fpos.isNull) break; |
|||
|
|||
auto compressed_buf = res[1]; // same as stack_buffer
|
|||
auto uncompressed_size = res[2]; |
|||
auto crc32 = res[3]; |
|||
BlockBuffer uncompressed_buf; |
|||
// call delegated function with new block
|
|||
dg(deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32)); |
|||
} |
|||
} catch (Exception e) { bgzf.throwBgzfException(e.msg,e.file,e.line); } |
|||
return 0; |
|||
} |
|||
} |
|||
|
|||
|
|||
Tuple!(size_t,FilePos) read_blockx(ref BgzfReader bgzf, FilePos fpos, ref ubyte[] uncompressed_buf) { |
|||
BlockBuffer compressed_buf; |
|||
auto res = bgzf.read_compressed_block(fpos,compressed_buf); |
|||
fpos = res[0]; // point fpos to next block
|
|||
if (fpos.isNull) return tuple(cast(size_t)0,fpos); |
|||
auto data = res[1]; |
|||
|
|||
assert(data.ptr == compressed_buf.ptr); |
|||
size_t uncompressed_size = res[2]; |
|||
auto crc32 = res[3]; |
|||
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32); |
|||
return tuple(uncompressed_size,fpos); |
|||
} |
|||
|
|||
import std.parallelism; |
|||
|
|||
int kick_off_reading_block_ahead(ubyte[] uncompressed_buf, ubyte[] compressed_buf, size_t uncompressed_size, CRC32 crc32) { |
|||
// writeln("HEY " ~ to!string(uncompressed_size));
|
|||
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32); |
|||
return -1; |
|||
} |
|||
|
|||
/** |
|||
*/ |
|||
struct BlockReadAhead { |
|||
bool task_running = false, we_have_a_task = false; |
|||
Task!(kick_off_reading_block_ahead, ubyte[], ubyte[], size_t, CRC32)* t; |
|||
FilePos fpos2; |
|||
size_t uncompressed_size2 = 0; |
|||
BlockBuffer compressed_buf2; |
|||
BlockBuffer uncompressed_buf2; |
|||
|
|||
private void read_next_block() { |
|||
} |
|||
|
|||
private void add_deflate_task() { |
|||
} |
|||
|
|||
private void copy_deflated_buffer() { |
|||
} |
|||
|
|||
void setup_block_reader() { |
|||
read_next_block(); |
|||
add_deflate_task(); |
|||
throw new Exception("NYI"); |
|||
} |
|||
|
|||
Tuple!(size_t,FilePos) read_block(ref BgzfReader bgzf, FilePos fpos, ref ubyte[] uncompressed_buf) { |
|||
assert(we_have_a_task); |
|||
copy_deflated_buffer(); |
|||
read_next_block(); |
|||
add_deflate_task(); |
|||
// return
|
|||
|
|||
if (task_running) { |
|||
int res = t.yieldForce; |
|||
// writeln(res);
|
|||
task_running = false; |
|||
memcpy(uncompressed_buf.ptr,compressed_buf2.ptr,uncompressed_size2); |
|||
return tuple(uncompressed_size2, fpos2); |
|||
} |
|||
else { |
|||
BlockBuffer compressed_buf; |
|||
auto res = bgzf.read_compressed_block(fpos,compressed_buf); |
|||
fpos = res[0]; // point fpos to next block
|
|||
if (fpos.isNull) return tuple(cast(size_t)0,fpos); |
|||
auto data = res[1]; |
|||
assert(data.ptr == compressed_buf.ptr); |
|||
size_t uncompressed_size = res[2]; |
|||
auto crc32 = res[3]; |
|||
|
|||
deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32); |
|||
|
|||
// now set up a new buffer
|
|||
auto res2 = bgzf.read_compressed_block(fpos,compressed_buf2); |
|||
fpos2 = res[0]; // point fpos to next block
|
|||
if (!fpos2.isNull) { |
|||
auto data2 = res2[1]; |
|||
uncompressed_size2 = res2[2]; |
|||
t = task!kick_off_reading_block_ahead(cast(ubyte[])uncompressed_buf2,cast(ubyte[])compressed_buf2,uncompressed_size2,res2[3]); |
|||
t.executeInNewThread(); |
|||
task_running = true; |
|||
} |
|||
return tuple(uncompressed_size,fpos); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/** |
|||
*/ |
|||
struct BlockReadUnbuffered { |
|||
|
|||
void setup_block_reader() { |
|||
} |
|||
|
|||
Tuple!(ubyte[], size_t, FilePos) read_block(ref BgzfReader bgzf, in FilePos fpos, ubyte[] uncompressed_buf) { |
|||
BlockBuffer compressed_buf; |
|||
auto res = bgzf.read_compressed_block(fpos,compressed_buf); |
|||
auto fpos2 = res[0]; // point fpos to next block
|
|||
if (fpos.isNull) return tuple(uncompressed_buf,cast(size_t)0,fpos2); |
|||
auto data = res[1]; |
|||
assert(data.ptr == compressed_buf.ptr); |
|||
size_t uncompressed_size = res[2]; |
|||
auto crc32 = res[3]; |
|||
|
|||
auto buf = deflate(uncompressed_buf,compressed_buf,uncompressed_size,crc32); |
|||
assert(buf.ptr == uncompressed_buf.ptr); |
|||
return tuple(uncompressed_buf,uncompressed_size,fpos2); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
Streams bgzf data and fetch items by unit or buffer. These can go beyond |
|||
the size of individual blocks(!) |
|||
*/ |
|||
|
|||
struct BgzfStream { |
|||
BgzfReader bgzf; |
|||
FilePos fpos; // track file position
|
|||
ubyte[] uncompressed_buf; // current data buffer
|
|||
size_t uncompressed_size; // current data buffer size
|
|||
Nullable!int block_pos; // position in block
|
|||
BlockReadUnbuffered blockread; |
|||
|
|||
this(string fn) { |
|||
bgzf = BgzfReader(fn); |
|||
uncompressed_buf = new ubyte[BLOCK_SIZE]; |
|||
fpos = 0; |
|||
} |
|||
|
|||
@disable this(this); // disable copy semantics;
|
|||
|
|||
@property bool eof() { |
|||
return fpos.isNull; |
|||
} |
|||
|
|||
/** |
|||
Fetch data into buffer. The size of the buffer can be larger than |
|||
one or more multiple blocks |
|||
*/ |
|||
ubyte[] fetch(ubyte[] buffer) { |
|||
if (block_pos.isNull) { |
|||
blockread.setup_block_reader(); |
|||
auto res = blockread.read_block(bgzf,fpos,uncompressed_buf); // read first block
|
|||
assert(res[0].ptr == uncompressed_buf.ptr); |
|||
uncompressed_size = res[1]; |
|||
fpos = res[2]; |
|||
block_pos = 0; |
|||
} |
|||
|
|||
immutable buffer_length = buffer.length; |
|||
size_t buffer_pos = 0; |
|||
size_t remaining = buffer_length; |
|||
|
|||
while (remaining > 0) { |
|||
if (block_pos + remaining < uncompressed_size) { |
|||
// full copy
|
|||
assert(buffer_pos + remaining == buffer_length); |
|||
memcpy(buffer[buffer_pos..buffer_pos+remaining].ptr,uncompressed_buf[block_pos..block_pos+remaining].ptr,remaining); |
|||
block_pos += remaining; |
|||
remaining = 0; |
|||
} |
|||
else { |
|||
// read tail of buffer
|
|||
immutable tail = uncompressed_size - block_pos; |
|||
memcpy(buffer[buffer_pos..buffer_pos+tail].ptr,uncompressed_buf[block_pos..uncompressed_size].ptr,tail); |
|||
buffer_pos += tail; |
|||
remaining -= tail; |
|||
auto res = blockread.read_block(bgzf,fpos,uncompressed_buf); |
|||
assert(res[0].ptr == uncompressed_buf.ptr); |
|||
uncompressed_size = res[1]; |
|||
fpos = res[2]; |
|||
block_pos = 0; |
|||
} |
|||
} |
|||
return buffer; |
|||
} |
|||
|
|||
int read(T)() { // for integers
|
|||
ubyte[T.sizeof] buf; |
|||
auto b = fetch(buf); |
|||
return b.read!(T,Endian.littleEndian)(); |
|||
} |
|||
|
|||
string read(T)(size_t len) { |
|||
ubyte[] buf = new ubyte[len]; // heap allocation
|
|||
fetch(buf); |
|||
return cast(T)buf; |
|||
} |
|||
|
|||
T[] read(T)(T[] buffer) { return cast(T[])fetch(cast(ubyte[])buffer); }; |
|||
} |
@ -1,243 +0,0 @@ |
|||
/* |
|||
New style BGZF writer. This file is part of Sambamba. |
|||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> |
|||
|
|||
Sambamba is free software; you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License as published |
|||
by the Free Software Foundation; either version 2 of the License, |
|||
or (at your option) any later version. |
|||
|
|||
Sambamba is distributed in the hope that it will be useful, but |
|||
WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with this program; if not, write to the Free Software |
|||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
|||
02111-1307 USA |
|||
|
|||
*/ |
|||
|
|||
// Based on the original by Artem Tarasov.
|
|||
|
|||
module bio2.bgzf_writer; |
|||
|
|||
// import core.stdc.stdlib : malloc, free;
|
|||
import core.memory: pureMalloc, pureFree; |
|||
import core.stdc.stdio: fopen, fread, fclose; |
|||
import std.bitmanip; |
|||
import std.conv; |
|||
import std.exception; |
|||
import std.typecons; |
|||
import std.parallelism; |
|||
import std.array; |
|||
import std.algorithm : max; |
|||
import std.stdio; |
|||
import std.typecons; |
|||
|
|||
import bio.core.bgzf.compress; |
|||
import bio.core.utils.roundbuf; |
|||
|
|||
// import undead.stream;
|
|||
|
|||
import bio.bam.constants: BGZF_MAX_BLOCK_SIZE, BGZF_BLOCK_SIZE, BGZF_EOF; |
|||
import bio2.bgzf; |
|||
import bio2.constants; |
|||
|
|||
alias void delegate(ubyte[], ubyte[]) BlockWriteHandler; |
|||
|
|||
/// Convenience function for Taskpool handler
|
|||
Tuple!(ubyte[], ubyte[], BlockWriteHandler) bgzfCompressFunc(ubyte[] input, |
|||
int level, |
|||
ubyte[] output_buffer, |
|||
BlockWriteHandler handler) |
|||
{ |
|||
auto output = bgzfCompress(input, level, output_buffer); |
|||
return tuple(input, output, handler); |
|||
} |
|||
|
|||
/// BGZF compression - this is a port of the original that used the
|
|||
/// undead.stream library.
|
|||
struct BgzfWriter { |
|||
|
|||
private: |
|||
File f; |
|||
TaskPool task_pool; |
|||
|
|||
ubyte[] buffer; // a slice into compression_buf (uncompressed data)
|
|||
ubyte[] tmp; // a slice into compression_buf (compressed data)
|
|||
size_t current_size; |
|||
int compression_level; |
|||
|
|||
alias Task!(bgzfCompressFunc, |
|||
ubyte[], int, ubyte[], BlockWriteHandler) CompressionTask; |
|||
RoundBuf!(CompressionTask*) _compression_tasks; |
|||
ubyte[] compression_buf; |
|||
|
|||
public: |
|||
|
|||
/// Create new BGZF output stream with a multi-threaded writer
|
|||
this(string fn, int _compression_level=-1) { |
|||
f = File(fn,"wb"); |
|||
enforce1(-1 <= compression_level && compression_level <= 9, |
|||
"BGZF compression level must be a number in interval [-1, 9]"); |
|||
size_t max_block_size = BGZF_MAX_BLOCK_SIZE; |
|||
size_t block_size = BGZF_BLOCK_SIZE; |
|||
task_pool = taskPool(), |
|||
compression_level = _compression_level; |
|||
|
|||
// create a ring buffer that is large enough
|
|||
size_t n_tasks = max(task_pool.size, 1) * 16; |
|||
_compression_tasks = RoundBuf!(CompressionTask*)(n_tasks); |
|||
|
|||
// create extra block to which we can write while n_tasks are
|
|||
// executed
|
|||
auto comp_buf_size = (2 * n_tasks + 2) * max_block_size; |
|||
auto p = cast(ubyte*)pureMalloc(comp_buf_size); |
|||
compression_buf = p[0 .. comp_buf_size]; |
|||
buffer = compression_buf[0 .. block_size]; |
|||
tmp = compression_buf[max_block_size .. max_block_size * 2]; |
|||
} |
|||
|
|||
~this() { |
|||
close(); |
|||
} |
|||
|
|||
@disable this(this); // BgzfWriter does not have copy semantics;
|
|||
|
|||
void throwBgzfException(string msg, string file = __FILE__, size_t line = __LINE__) { |
|||
throw new BgzfException("Error writing BGZF block starting in "~f.name ~ |
|||
" (" ~ file ~ ":" ~ to!string(line) ~ "): " ~ msg); |
|||
} |
|||
|
|||
void enforce1(bool check, lazy string msg, string file = __FILE__, int line = __LINE__) { |
|||
if (!check) |
|||
throwBgzfException(msg,file,line); |
|||
} |
|||
|
|||
void write(const void* buf, size_t size) { |
|||
// stderr.writeln("HEY1 writing bytes ",size);
|
|||
if (size + current_size >= buffer.length) { |
|||
size_t room; |
|||
ubyte[] data = (cast(ubyte*)buf)[0 .. size]; |
|||
|
|||
while (data.length + current_size >= buffer.length) { |
|||
room = buffer.length - current_size; |
|||
buffer[$ - room .. $] = data[0 .. room]; |
|||
data = data[room .. $]; |
|||
|
|||
current_size = buffer.length; |
|||
|
|||
flush_block(); |
|||
} |
|||
|
|||
buffer[0 .. data.length] = data[]; |
|||
current_size = data.length; |
|||
} else { |
|||
buffer[current_size .. current_size + size] = (cast(ubyte*)buf)[0 .. size]; |
|||
current_size += size; |
|||
} |
|||
// return size;
|
|||
} |
|||
|
|||
void write(ubyte[] buf) { |
|||
write(buf.ptr, buf.length); |
|||
} |
|||
|
|||
void write(string s) { |
|||
write(cast(ubyte[])s); |
|||
} |
|||
|
|||
void write(T)(T value) { // int values
|
|||
// ubyte[T.sizeof] buf;
|
|||
ubyte[] buf = [0,0,0,0,0,0,0,0,0,0]; |
|||
assert(T.sizeof < buf.length); |
|||
buf.write!(T,Endian.littleEndian)(value,0); |
|||
// writeln("HEY T.sizeof: ",T.sizeof," value ",value," ",buf[0..T.sizeof]);
|
|||
write(buf[0..T.sizeof]); |
|||
} |
|||
|
|||
/// Force flushing current block, even if it is not yet filled.
|
|||
/// Should also be used when it's not desired to have records
|
|||
/// crossing block borders.
|
|||
void flush_block() { |
|||
if (current_size == 0) |
|||
return; |
|||
|
|||
Tuple!(ubyte[], ubyte[], BlockWriteHandler) front_result; |
|||
if (_compression_tasks.full) { |
|||
front_result = _compression_tasks.front.yieldForce(); |
|||
_compression_tasks.popFront(); |
|||
} |
|||
|
|||
auto compression_task = task!bgzfCompressFunc(buffer[0 .. current_size], |
|||
compression_level, tmp, |
|||
_before_write); |
|||
_compression_tasks.put(compression_task); |
|||
task_pool.put(compression_task); |
|||
|
|||
size_t offset = buffer.ptr - compression_buf.ptr; |
|||
immutable N = tmp.length; |
|||
offset += 2 * N; |
|||
if (offset == compression_buf.length) |
|||
offset = 0; |
|||
buffer = compression_buf[offset .. offset + buffer.length]; |
|||
tmp = compression_buf[offset + N .. offset + 2 * N]; |
|||
current_size = 0; |
|||
|
|||
if (front_result[0] !is null) |
|||
writeResult(front_result); |
|||
|
|||
while (!_compression_tasks.empty) { |
|||
auto task = _compression_tasks.front; |
|||
if (!task.done()) |
|||
break; |
|||
auto result = task.yieldForce(); |
|||
writeResult(result); |
|||
_compression_tasks.popFront(); |
|||
} |
|||
} |
|||
|
|||
private void delegate(ubyte[], ubyte[]) _before_write; |
|||
void setWriteHandler(void delegate(ubyte[], ubyte[]) handler) { |
|||
_before_write = handler; |
|||
} |
|||
|
|||
private void writeResult(Tuple!(ubyte[], ubyte[], BlockWriteHandler) block) { |
|||
auto uncompressed = block[0]; |
|||
auto compressed = block[1]; |
|||
auto handler = block[2]; |
|||
if (handler) {// write handler enabled
|
|||
handler(uncompressed, compressed); |
|||
} |
|||
// _stream.writeExact(compressed.ptr, compressed.length);
|
|||
f.rawWrite(compressed); |
|||
} |
|||
|
|||
/// Flush all remaining BGZF blocks and underlying stream.
|
|||
void flush() { |
|||
flush_block(); |
|||
|
|||
while (!_compression_tasks.empty) { |
|||
auto task = _compression_tasks.front; |
|||
auto block = task.yieldForce(); |
|||
writeResult(block); |
|||
_compression_tasks.popFront(); |
|||
} |
|||
|
|||
f.flush(); |
|||
current_size = 0; |
|||
} |
|||
|
|||
/// Flush all remaining BGZF blocks and close source stream.
|
|||
/// Automatically adds empty block at the end, serving as indicator
|
|||
/// of end of stream. This function is automatically called on
|
|||
/// destruction.
|
|||
void close() { |
|||
flush(); |
|||
f.rawWrite(BGZF_EOF); |
|||
f.close(); |
|||
pureFree(compression_buf.ptr); |
|||
} |
|||
} |
@ -1,11 +0,0 @@ |
|||
module bio2.constants; |
|||
|
|||
alias ulong size_d; |
|||
alias int RefId; // -1 is invalid FIXME
|
|||
alias uint GenomePos; // 32-bits, do check when reading!
|
|||
alias ubyte MappingQuality; // 255 is invalid FIXME
|
|||
|
|||
struct GenomeLocation { |
|||
RefId ref_id; |
|||
GenomePos pos; |
|||
} |
@ -1,99 +0,0 @@ |
|||
/* |
|||
This file is part of Sambamba. |
|||
Copyright (C) 2017 Pjotr Prins <pjotr.prins@thebird.nl> |
|||
|
|||
Sambamba is free software; you can redistribute it and/or modify |
|||
it under the terms of the GNU General Public License as published |
|||
by the Free Software Foundation; either version 2 of the License, |
|||
or (at your option) any later version. |
|||
|
|||
Sambamba is distributed in the hope that it will be useful, but |
|||
WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU General Public License |
|||
along with this program; if not, write to the Free Software |
|||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
|||
02111-1307 USA |
|||
|
|||
*/ |
|||
|
|||
module bio2.hashing; |
|||
|
|||
import std.conv; |
|||
import std.stdio; |
|||