
38 changed files with 0 additions and 23300 deletions
@ -1,49 +0,0 @@ |
|||
/* |
|||
This file is part of BioD. |
|||
Copyright (C) 2013 Artem Tarasov <lomereiter@gmail.com> |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a |
|||
copy of this software and associated documentation files (the "Software"), |
|||
to deal in the Software without restriction, including without limitation |
|||
the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|||
and/or sell copies of the Software, and to permit persons to whom the |
|||
Software is furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in |
|||
all copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|||
DEALINGS IN THE SOFTWARE. |
|||
|
|||
*/ |
|||
module bio.bam.abstractreader; |
|||
|
|||
import bio.sam.header; |
|||
import bio.bam.read; |
|||
import bio.bam.referenceinfo; |
|||
|
|||
public import std.range; |
|||
|
|||
/// Common interface for $(DPREF2 bam, reader, BamReader)
|
|||
/// and $(DPREF2 sam, reader, SamReader).
|
|||
interface IBamSamReader { |
|||
/// SAM header
|
|||
bio.sam.header.SamHeader header() @property; |
|||
|
|||
/// Information about reference sequences
|
|||
const(bio.bam.referenceinfo.ReferenceSequenceInfo)[] reference_sequences() @property const nothrow; |
|||
|
|||
/// All reads in the file
|
|||
std.range.InputRange!(bio.bam.read.BamRead) allReads() @property; |
|||
|
|||
/// Filename
|
|||
string filename() @property const; |
|||
|
|||
///
|
|||
void assumeSequentialProcessing(); |
|||
} |
@ -1,92 +0,0 @@ |
|||
/* |
|||
This file is part of BioD. |
|||
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a |
|||
copy of this software and associated documentation files (the "Software"), |
|||
to deal in the Software without restriction, including without limitation |
|||
the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|||
and/or sell copies of the Software, and to permit persons to whom the |
|||
Software is furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in |
|||
all copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|||
DEALINGS IN THE SOFTWARE. |
|||
|
|||
*/ |
|||
module bio.bam.bai.bin; |
|||
|
|||
import bio.core.bgzf.chunk; |
|||
import bio.bam.constants; |
|||
|
|||
/// Distinct bin
|
|||
struct Bin { |
|||
|
|||
/// Construct a bin with an id
|
|||
this(uint id) nothrow { |
|||
this.id = id; |
|||
} |
|||
|
|||
uint id; /// bin number
|
|||
Chunk[] chunks; |
|||
|
|||
/// How deep the bin is in the tree
|
|||
int level() @property const nothrow { |
|||
if (id == 0) return 0; |
|||
if (id < 9) return 1; |
|||
if (id < 73) return 2; |
|||
if (id < 585) return 3; |
|||
if (id < 4681) return 4; |
|||
return 5; |
|||
} |
|||
|
|||
/// Returns whether the bin is a leaf in the B-tree
|
|||
bool is_leaf() @property const nothrow { |
|||
return id > BAI_MAX_NONLEAF_BIN_ID; |
|||
} |
|||
|
|||
/// Check if bin can overlap with a region
|
|||
bool canOverlapWith(int begin, int end) const nothrow { |
|||
if (id == 0) return true; |
|||
if (id > BAI_MAX_BIN_ID) return false; |
|||
|
|||
/// The following code is based on reg2bins() function
|
|||
if (begin < 0) begin = 0; |
|||
auto magic_number = 4681; |
|||
auto b = begin >> 14; |
|||
auto e = end >> 14; |
|||
|
|||
while (true) { |
|||
auto delta = id - magic_number; |
|||
if (b <= delta && delta <= e) return true; |
|||
|
|||
magic_number >>= 3; |
|||
|
|||
if (magic_number == 0) return false; |
|||
|
|||
b >>= 3; |
|||
e >>= 3; |
|||
} |
|||
} |
|||
} |
|||
|
|||
/// Returns bin number for [beg, end) interval (zero-based).
|
|||
/// Taken from SAM/BAM specification.
|
|||
ushort reg2bin(int beg, int end) { |
|||
if (end == beg) end = beg + 1; // edge case
|
|||
|
|||
--end; |
|||
if (beg>>14 == end>>14) return cast(ushort)(((1<<15)-1)/7 + (beg>>14)); |
|||
if (beg>>17 == end>>17) return cast(ushort)(((1<<12)-1)/7 + (beg>>17)); |
|||
if (beg>>20 == end>>20) return cast(ushort)(((1<<9)-1)/7 + (beg>>20)); |
|||
if (beg>>23 == end>>23) return cast(ushort)(((1<<6)-1)/7 + (beg>>23)); |
|||
if (beg>>26 == end>>26) return cast(ushort)(((1<<3)-1)/7 + (beg>>26)); |
|||
return 0; |
|||
} |
@ -1,366 +0,0 @@ |
|||
/* |
|||
This file is part of BioD. |
|||
Copyright (C) 2012-2017 Artem Tarasov <lomereiter@gmail.com> |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a |
|||
copy of this software and associated documentation files (the "Software"), |
|||
to deal in the Software without restriction, including without limitation |
|||
the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|||
and/or sell copies of the Software, and to permit persons to whom the |
|||
Software is furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in |
|||
all copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|||
DEALINGS IN THE SOFTWARE. |
|||
|
|||
*/ |
|||
module bio.bam.bai.indexing; |
|||
|
|||
import bio.bam.reader; |
|||
import bio.bam.readrange; |
|||
import bio.bam.constants; |
|||
|
|||
import bio.bam.bai.bin; |
|||
import bio.core.bgzf.chunk; |
|||
|
|||
import undead.stream; |
|||
import std.array; |
|||
import std.algorithm; |
|||
import std.system; |
|||
import std.exception; |
|||
import core.stdc.string; |
|||
|
|||
// Suppose we have an alignment which covers bases on a reference,
|
|||
// starting from one position and ending at another position.
|
|||
// In order to build linear index, we need to find to which windows
|
|||
// the two positions correspond.
|
|||
//
|
|||
//
|
|||
// (K = 16384)
|
|||
//
|
|||
// [0, K)[K, 2K)[2K, 3K)... <- windows
|
|||
// [.......) <- alignment
|
|||
//
|
|||
private size_t toLinearIndexOffset(int position) { |
|||
return position < 0 ? 0 : position / BAI_LINEAR_INDEX_WINDOW_SIZE; |
|||
} |
|||
|
|||
///
|
|||
struct IndexBuilder { |
|||
private { |
|||
// array of linear offsets for the current reference entry
|
|||
VirtualOffset[] _linear_index; |
|||
// (maximum index in _linear_index where data was written) + 1
|
|||
size_t _linear_index_write_length; |
|||
|
|||
static struct PreviousRead { |
|||
int ref_id = -1; |
|||
int position; |
|||
int end_position; |
|||
int basesCovered() { return end_position - position; } |
|||
Bin bin; |
|||
bool is_unmapped; |
|||
|
|||
private char[256] _name_buf; // by spec., name length is <= 255 chars
|
|||
string name; |
|||
|
|||
VirtualOffset start_virtual_offset; |
|||
VirtualOffset end_virtual_offset; |
|||
} |
|||
|
|||
PreviousRead _prev_read; |
|||
|
|||
Stream _stream; |
|||
int _n_refs; |
|||
|
|||
ulong _no_coord = 0; |
|||
// metadata for each reference
|
|||
ulong _beg_vo = -1UL; |
|||
ulong _end_vo = 0; |
|||
ulong _unmapped = 0; |
|||
ulong _mapped = 0; |
|||
|
|||
bool _first_read = true; |
|||
|
|||
// map: bin ID -> array of chunks
|
|||
Chunk[][uint] _chunks; |
|||
|
|||
VirtualOffset _current_chunk_beg; // start of current chunk
|
|||
|
|||
// no metadata for empty references
|
|||
void writeEmptyReference() { |
|||
_stream.write(cast(int)0); // n_bins
|
|||
_stream.write(cast(int)0); // n_intv
|
|||
} |
|||
|
|||
void updateLastReadInfo(ref BamReadBlock read) { |
|||
with (_prev_read) { |
|||
ref_id = read.ref_id; |
|||
position = read.position; |
|||
end_position = position + read.basesCovered(); |
|||
bin = read.bin; |
|||
is_unmapped = read.is_unmapped; |
|||
_name_buf[0 .. read.name.length] = read.name[]; |
|||
name = cast(string)_name_buf[0 .. read.name.length]; |
|||
start_virtual_offset = read.start_virtual_offset; |
|||
end_virtual_offset = read.end_virtual_offset; |
|||
} |
|||
} |
|||
|
|||
void updateMetadata(ref BamReadBlock read) { |
|||
if (read.ref_id == -1) { |
|||
++_no_coord; |
|||
} else { |
|||
if (read.is_unmapped) { |
|||
++_unmapped; |
|||
} else { |
|||
++_mapped; |
|||
} |
|||
|
|||
if (_beg_vo == -1UL) |
|||
_beg_vo = cast(ulong)read.start_virtual_offset; |
|||
_end_vo = cast(ulong)read.end_virtual_offset; |
|||
} |
|||
} |
|||
|
|||
void updateLinearIndex() { |
|||
assert(_prev_read.ref_id >= 0); |
|||
|
|||
size_t beg, end; |
|||
|
|||
if (_prev_read.is_unmapped) { |
|||
end = beg = toLinearIndexOffset(_prev_read.position); |
|||
} else { |
|||
beg = toLinearIndexOffset(_prev_read.position); |
|||
end = toLinearIndexOffset(_prev_read.position + _prev_read.basesCovered() - 1); |
|||
} |
|||
|
|||
debug { |
|||
import std.stdio; |
|||
if (end >= _linear_index.length) { |
|||
writeln("beg: ", beg); |
|||
writeln("end: ", end); |
|||
writeln("pos: ", _prev_read.position); |
|||
writeln("bases: ", _prev_read.basesCovered()); |
|||
} |
|||
} |
|||
|
|||
foreach (i; beg .. end + 1) |
|||
if (_linear_index[i] == 0UL) |
|||
_linear_index[i] = _prev_read.start_virtual_offset; |
|||
|
|||
if (end + 1 > _linear_index_write_length) |
|||
_linear_index_write_length = end + 1; |
|||
} |
|||
|
|||
void dumpCurrentLinearIndex() { |
|||
_stream.write(cast(int)_linear_index_write_length); |
|||
|
|||
//
|
|||
// There might be untouched places in linear index
|
|||
// with virtual offset equal to zero.
|
|||
// However, it's not a good idea to leave those zeros,
|
|||
// since we can start lookup from the last non-zero virtual offset
|
|||
// encountered before the untouched window.
|
|||
//
|
|||
VirtualOffset last_voffset = 0; |
|||
|
|||
foreach (voffset; _linear_index[0 .. _linear_index_write_length]) { |
|||
if (voffset == 0) |
|||
voffset = last_voffset; |
|||
else |
|||
last_voffset = voffset; |
|||
_stream.write(cast(ulong)voffset); |
|||
} |
|||
} |
|||
|
|||
void dumpCurrentReference() { |
|||
// +1 because we output dummy bin, too
|
|||
_stream.write(cast(int)(_chunks.length + 1)); |
|||
|
|||
foreach (bin_id, bin_chunks; _chunks) { |
|||
if (bin_chunks.length > 0) { |
|||
_stream.write(cast(uint)bin_id); |
|||
_stream.write(cast(int)bin_chunks.length); |
|||
foreach (chunk; bin_chunks) { |
|||
_stream.write(cast(ulong)chunk.beg); |
|||
_stream.write(cast(ulong)chunk.end); |
|||
} |
|||
} |
|||
} |
|||
_stream.write(cast(uint)37450); |
|||
_stream.write(cast(int)2); |
|||
_stream.write(cast(ulong)_beg_vo); |
|||
_stream.write(cast(ulong)_end_vo); |
|||
_stream.write(cast(ulong)_mapped); |
|||
_stream.write(cast(ulong)_unmapped); |
|||
|
|||
dumpCurrentLinearIndex(); |
|||
|
|||
// reset data
|
|||
memset(_linear_index.ptr, 0, _linear_index.length * ulong.sizeof); |
|||
_linear_index_write_length = 0; |
|||
_chunks = null; |
|||
_current_chunk_beg = _prev_read.end_virtual_offset; |
|||
|
|||
_beg_vo = _end_vo = cast(ulong)_current_chunk_beg; |
|||
_unmapped = 0; |
|||
_mapped = 0; |
|||
} |
|||
|
|||
// adds chunk to the current bin (which is determined from _prev_read)
|
|||
void updateChunks() { |
|||
auto current_chunk_end = _prev_read.end_virtual_offset; |
|||
|
|||
auto bin_id = _prev_read.bin.id; |
|||
|
|||
if (bin_id !in _chunks) |
|||
_chunks[bin_id] = []; |
|||
auto cs = _chunks[bin_id]; |
|||
|
|||
bool canMergeWithPreviousChunk() { |
|||
assert(cs.length > 0); |
|||
auto last_chunk = cs[$ - 1]; |
|||
|
|||
if (last_chunk.end.coffset == _current_chunk_beg.coffset) |
|||
return true; |
|||
|
|||
return false; |
|||
} |
|||
|
|||
if (cs.length == 0 || !canMergeWithPreviousChunk()) { |
|||
auto new_chunk = Chunk(_current_chunk_beg, current_chunk_end); |
|||
_chunks[_prev_read.bin.id] ~= new_chunk; |
|||
} else { |
|||
_chunks[_prev_read.bin.id][$ - 1].end = current_chunk_end; |
|||
} |
|||
|
|||
_current_chunk_beg = current_chunk_end; |
|||
} |
|||
|
|||
void checkThatBinIsCorrect(ref BamReadBlock read) { |
|||
if (!check_bins) |
|||
return; |
|||
auto expected = reg2bin(read.position, |
|||
read.position + read.basesCovered()); |
|||
enforce(read.bin.id == expected, |
|||
"Bin in read with name '" ~ read.name ~ |
|||
"' is set incorrectly (" ~ to!string(read.bin.id) ~ |
|||
" instead of expected " ~ to!string(expected) ~ ")"); |
|||
} |
|||
|
|||
void checkThatInputIsSorted(ref BamReadBlock read) { |
|||
if (_first_read) return; |
|||
if (read.ref_id == -1) return; // unmapped
|
|||
if (_prev_read.ref_id < read.ref_id) return; |
|||
|
|||
enforce(read.ref_id == _prev_read.ref_id && |
|||
read.position >= _prev_read.position, |
|||
"BAM file is not coordinate-sorted: " ~ |
|||
"read '" ~ read.name ~ "' (" ~ read.ref_id.to!string ~ ":" ~ read.position.to!string ~ ")" ~ |
|||
" must be after read '" ~ _prev_read.name ~ "' (" ~ _prev_read.ref_id.to!string ~ ":" ~ _prev_read.position.to!string ~ ")" ~ |
|||
"' (at virtual offsets " ~ |
|||
to!string(_prev_read.start_virtual_offset) ~ ", " ~ read.start_virtual_offset.to!string ~ ")"); |
|||
} |
|||
} |
|||
|
|||
///
|
|||
this(Stream output_stream, int number_of_references) { |
|||
_stream = new EndianStream(output_stream, Endian.littleEndian); |
|||
_n_refs = number_of_references; |
|||
|
|||
size_t size = BAI_MAX_BIN_ID - BAI_MAX_NONLEAF_BIN_ID + 1; |
|||
_linear_index = new VirtualOffset[](size); |
|||
|
|||
_stream.writeString(BAI_MAGIC); // write BAI magic string
|
|||
_stream.write(cast(int)_n_refs); // and number of references
|
|||
} |
|||
|
|||
/// Check that bins are correct.
|
|||
bool check_bins; |
|||
|
|||
/// Add a read. The reads must be put in coordinate-sorted order.
|
|||
void put(BamReadBlock read) { |
|||
checkThatInputIsSorted(read); |
|||
scope(exit) updateMetadata(read); |
|||
|
|||
if (read.ref_id < 0) |
|||
return; |
|||
|
|||
// start position is unavailable, skip
|
|||
if (read.position < 0) |
|||
return; |
|||
|
|||
if (_first_read) { |
|||
updateLastReadInfo(read); |
|||
_first_read = false; |
|||
_current_chunk_beg = read.start_virtual_offset; |
|||
|
|||
if (read.ref_id > 0) |
|||
foreach (i; 0 .. read.ref_id) |
|||
writeEmptyReference(); |
|||
|
|||
return; |
|||
} |
|||
|
|||
checkThatBinIsCorrect(read); |
|||
|
|||
// new reference, so write data for previous one(s)
|
|||
if (read.ref_id > _prev_read.ref_id) { |
|||
updateLinearIndex(); |
|||
updateChunks(); |
|||
dumpCurrentReference(); |
|||
|
|||
foreach (i; _prev_read.ref_id + 1 .. read.ref_id) |
|||
writeEmptyReference(); |
|||
} |
|||
|
|||
if (read.ref_id == _prev_read.ref_id) { |
|||
updateLinearIndex(); |
|||
|
|||
if (read.bin.id != _prev_read.bin.id) |
|||
updateChunks(); |
|||
} |
|||
|
|||
updateLastReadInfo(read); |
|||
} |
|||
|
|||
/// Closes the stream
|
|||
void finish() { |
|||
if (!_first_read) { // at least one was processed
|
|||
assert(_prev_read.ref_id >= 0); |
|||
updateLinearIndex(); |
|||
updateChunks(); |
|||
dumpCurrentReference(); |
|||
} |
|||
|
|||
// _prev_read.ref_id == -1 if all are unmapped
|
|||
foreach (i; _prev_read.ref_id + 1 .. _n_refs) |
|||
writeEmptyReference(); |
|||
|
|||
_stream.write(cast(ulong)_no_coord); |
|||
_stream.close(); |
|||
} |
|||
} |
|||
|
|||
/// Writes BAM index to the $(D stream)
|
|||
///
|
|||
/// Accepts optional $(D progressBarFunc)
|
|||
void createIndex(BamReader bam, Stream stream, bool check_bins=false, |
|||
void delegate(lazy float p) progressBarFunc=null) |
|||
{ |
|||
auto n_refs = cast(int)bam.reference_sequences.length; |
|||
auto index_builder = IndexBuilder(stream, n_refs); |
|||
index_builder.check_bins = check_bins; |
|||
auto reads = bam.readsWithProgress!withOffsets(progressBarFunc); |
|||
foreach (read; reads) |
|||
index_builder.put(read); |
|||
index_builder.finish(); |
|||
} |
@ -1,169 +0,0 @@ |
|||
/* |
|||
This file is part of BioD. |
|||
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a |
|||
copy of this software and associated documentation files (the "Software"), |
|||
to deal in the Software without restriction, including without limitation |
|||
the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|||
and/or sell copies of the Software, and to permit persons to whom the |
|||
Software is furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in |
|||
all copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|||
DEALINGS IN THE SOFTWARE. |
|||
|
|||
*/ |
|||
module bio.bam.baifile; |
|||
|
|||
public import bio.core.bgzf.chunk; |
|||
public import bio.bam.bai.bin; |
|||
import bio.core.bgzf.virtualoffset; |
|||
import bio.bam.constants; |
|||
|
|||
import undead.stream; |
|||
import std.system; |
|||
import std.exception; |
|||
import std.algorithm; |
|||
import std.conv; |
|||
import std.range; |
|||
import std.file; |
|||
import std.path; |
|||
|
|||
/// Represents index for a single reference
|
|||
struct Index { |
|||
/// Information about bins
|
|||
Bin[] bins; |
|||
|
|||
/// Virtual file offsets of first alignments overlapping 16384-byte windows
|
|||
/// on the reference sequence. This linear index is used to reduce amount
|
|||
/// of file seeks for region queries, since with its help one can reduce the
|
|||
/// number of chunks to be investigated based on their end position.
|
|||
///
|
|||
///
|
|||
/// Suppose you have a region [beg, end) and want to do a query.
|
|||
///
|
|||
/// Here's the reference:
|
|||
/// [....................!..............!.................................]
|
|||
/// beg end
|
|||
///
|
|||
/// Here's the same reference with 16384-byte long windows:
|
|||
/// [%...........%.......!....%.........!..%...........%...........%......]
|
|||
/// beg end
|
|||
/// [ 1st window][ 2nd window][...
|
|||
///
|
|||
/// With linear index, we can take the second window, find out what is
|
|||
/// the minimum virtual offset among alignments overlapping this window,
|
|||
/// and skip all chunks which end position is less or equal to this offset:
|
|||
///
|
|||
/// [........@...........!..............!.................................]
|
|||
/// . ..min. offset beg end
|
|||
/// [ ). . <- this chunk is skipped
|
|||
/// [ ) <- this one is not
|
|||
///
|
|||
VirtualOffset[] ioffsets; |
|||
|
|||
/// Get (approximate) virtual offset of the first alignment overlapping $(D position)
|
|||
///
|
|||
/// Returned virtual offset is less or equal to real offset.
|
|||
VirtualOffset getMinimumOffset(int position) { |
|||
int pos = max(0, position); |
|||
int _i = min(pos / BAI_LINEAR_INDEX_WINDOW_SIZE, cast(int)ioffsets.length - 1); |
|||
auto min_offset = (_i == -1) ? VirtualOffset(0) : ioffsets[_i]; |
|||
return min_offset; |
|||
} |
|||
} |
|||
|
|||
struct BaiFile { |
|||
Index[] indices; |
|||
|
|||
/// Initialize from stream which contains BAI data
|
|||
this(ref Stream stream) { |
|||
_stream = stream; |
|||
parse(); |
|||
} |
|||
|
|||
/// Open BAI file given either filename of BAM file or that of BAI file.
|
|||
this(string filename) { |
|||
Stream fstream; |
|||
|
|||
if (!endsWith(filename, ".bai")) { |
|||
/// Unfortunately, std.path.addExt is going to be deprecated
|
|||
|
|||
auto first_filename = filename ~ ".bai"; |
|||
auto second_filename = to!string(retro(find(retro(filename), '.'))) ~ "bai"; |
|||
|
|||
if (std.file.exists(first_filename)) { |
|||
fstream = new BufferedFile(absolutePath(first_filename)); |
|||
} else { |
|||
if (std.file.exists(second_filename)) { |
|||
fstream = new BufferedFile(absolutePath(second_filename)); |
|||
} else { |
|||
throw new Exception("searched for " ~ first_filename ~ " or " ~ |
|||
second_filename ~ ", found neither"); |
|||
} |
|||
} |
|||
} else { |
|||
fstream = new BufferedFile(filename); |
|||
} |
|||
|
|||
Stream estream = new EndianStream(fstream, Endian.littleEndian); |
|||
this(estream); |
|||
} |
|||
|
|||
private: |
|||
Stream _stream; |
|||
|
|||
/// according to section 4.2 of SAM/BAM specification
|
|||
void parse() { |
|||
auto magic = _stream.readString(4); |
|||
enforce(magic == "BAI\1", "Invalid file format: expected BAI\\1"); |
|||
|
|||
int n_ref; |
|||
_stream.read(n_ref); |
|||
indices = uninitializedArray!(Index[])(n_ref); |
|||
|
|||
foreach (i; 0 .. n_ref) { |
|||
int n_bin = void; |
|||
_stream.read(n_bin); |
|||
indices[i].bins = uninitializedArray!(Bin[])(n_bin); |
|||
|
|||
foreach (j; 0 .. n_bin) { |
|||
uint id = void; |
|||
_stream.read(id); |
|||
auto bin = Bin(id); |
|||
|
|||
int n_chunk = void; |
|||
_stream.read(n_chunk); |
|||
bin.chunks = uninitializedArray!(Chunk[])(n_chunk); |
|||
|
|||
foreach (k; 0 .. n_chunk) { |
|||
ulong tmp = void; |
|||
_stream.read(tmp); |
|||
bin.chunks[k].beg = VirtualOffset(tmp); |
|||
_stream.read(tmp); |
|||
bin.chunks[k].end = VirtualOffset(tmp); |
|||
} |
|||
|
|||
indices[i].bins[j] = bin; |
|||
} |
|||
|
|||
int n_intv = void; |
|||
_stream.read(n_intv); |
|||
indices[i].ioffsets = uninitializedArray!(VirtualOffset[])(n_intv); |
|||
|
|||
foreach (j; 0 .. n_intv) { |
|||
ulong tmp = void; |
|||
_stream.read(tmp); |
|||
indices[i].ioffsets[j] = VirtualOffset(tmp); |
|||
} |
|||
} |
|||
} |
|||
} |
@ -1,728 +0,0 @@ |
|||
/* |
|||
This file is part of BioD. |
|||
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com> |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a |
|||
copy of this software and associated documentation files (the "Software"), |
|||
to deal in the Software without restriction, including without limitation |
|||
the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|||
and/or sell copies of the Software, and to permit persons to whom the |
|||
Software is furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in |
|||
all copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|||
DEALINGS IN THE SOFTWARE. |
|||
|
|||
*/ |
|||
module bio.bam.baseinfo; |
|||
|
|||
import bio.core.base; |
|||
import bio.core.sequence; |
|||
|
|||
import bio.bam.read; |
|||
import bio.bam.tagvalue; |
|||
import bio.bam.iontorrent.flowcall; |
|||
import bio.bam.md.core; |
|||
import bio.bam.cigar; |
|||
|
|||
import std.range; |
|||
import std.conv; |
|||
import std.traits; |
|||
import std.typecons; |
|||
import std.typetuple; |
|||
|
|||
///
|
|||
enum Option |
|||
{ |
|||
/// adds 'cigar_before' and 'cigar_after' properties
|
|||
cigarExtra, |
|||
|
|||
/// adds 'md_operation', 'md_operation_offset' properties
|
|||
mdCurrentOp, |
|||
|
|||
/// adds 'previous_md_operation' property
|
|||
mdPreviousOp, |
|||
|
|||
/// adds 'next_md_operation' property
|
|||
mdNextOp |
|||
} |
|||
|
|||
///
|
|||
struct MixinArg(T, string Tag) { |
|||
T value; |
|||
alias value this; |
|||
alias Tag TagName; |
|||
} |
|||
|
|||
/// Wrapper for arguments to $(D basesWith) function (see below).
|
|||
/// Required to distinguish to which tag each parameter refers.
|
|||
MixinArg!(T, Tag) arg(string Tag, T)(T value) { |
|||
return MixinArg!(T, Tag)(value); |
|||
} |
|||
|
|||
template staticFilter(alias P, T...) |
|||
{ |
|||
static if (T.length == 0) |
|||
alias TypeTuple!() staticFilter; |
|||
else static if (P!(T[0])) |
|||
alias TypeTuple!(T[0], staticFilter!(P, T[1..$])) staticFilter; |
|||
else |
|||
alias staticFilter!(P, T[1..$]) staticFilter; |
|||
} |
|||
|
|||
template isTag(alias argument) |
|||
{ |
|||
enum isTag = is(typeof(argument) == string); |
|||
} |
|||
|
|||
template isOption(alias argument) |
|||
{ |
|||
enum isOption = is(typeof(argument) == Option); |
|||
} |
|||
|
|||
struct PerBaseInfo(R, TagsAndOptions...) { |
|||
|
|||
alias staticFilter!(isTag, TagsAndOptions) Tags; |
|||
alias staticFilter!(isOption, TagsAndOptions) Options; |
|||
|
|||
private alias TypeTuple!("CIGAR", Tags) Extensions; |
|||
|
|||
// /////////////////////////////////////////////////////////////////////////
|
|||
//
|
|||
// Each 'extension' is a template with name TAGbaseInfo, containing
|
|||
// a couple of mixin templates:
|
|||
//
|
|||
// * resultProperties
|
|||
// These are additional properties provided by the template
|
|||
//
|
|||
// * rangeMethods
|
|||
// These describe how to proceed to the next base.
|
|||
// The following methods must be implemented:
|
|||
//
|
|||
// - void setup(Args...)(const ref R read, Args args);
|
|||
// Gets called during range construction. All constructor
|
|||
// arguments are forwarded, and it's this function which
|
|||
// is responsible for getting required parameters for this
|
|||
// particular template.
|
|||
//
|
|||
// - void populate(Result)(ref Result r);
|
|||
// Populates fields of the result declared in resultProperties.
|
|||
// Should run in O(1), just copying a few variables.
|
|||
// Current base of the result is updated before the call.
|
|||
//
|
|||
// - void update(const ref R read);
|
|||
// Encapsulates logic of moving to the next base and updating
|
|||
// mixin variables correspondingly.
|
|||
//
|
|||
// - void copy(Range)(const ref Range source, ref Range target);
|
|||
// Gets called during $(D source.save). Therefore, any ranges
|
|||
// used in mixin templates must be saved as well at that time.
|
|||
//
|
|||
// /////////////////////////////////////////////////////////////////////////
|
|||
|
|||
private static string getResultProperties(Exts...)() { |
|||
char[] result; |
|||
foreach (ext; Exts) |
|||
result ~= "mixin " ~ ext ~ "baseInfo!(R, Options).resultProperties;".dup; |
|||
return cast(string)result; |
|||
} |
|||
|
|||
static struct Result { |
|||
/// Actual read base, with strand taken into account.
|
|||
Base base; |
|||
alias base this; |
|||
|
|||
string opCast(T)() if (is(T == string)) |
|||
{ |
|||
return to!string(base); |
|||
} |
|||
|
|||
bool opEquals(T)(T base) const |
|||
if (is(Unqual!T == Base)) |
|||
{ |
|||
return this.base == base; |
|||
} |
|||
|
|||
bool opEquals(T)(T result) const |
|||
if (is(Unqual!T == Result)) |
|||
{ |
|||
return this == result; |
|||
} |
|||
|
|||
bool opEquals(T)(T base) const |
|||
if (is(Unqual!T == char) || is(Unqual!T == dchar)) |
|||
{ |
|||
return this.base == base; |
|||
} |
|||
|
|||
mixin(getResultProperties!Extensions()); |
|||
} |
|||
|
|||
private static string getRangeMethods(Exts...)() { |
|||
char[] result; |
|||
foreach (ext; Exts) |
|||
result ~= "mixin " ~ ext ~ "baseInfo!(R, Options).rangeMethods " ~ ext ~ ";".dup; |
|||
return cast(string)result; |
|||
} |
|||
|
|||
mixin(getRangeMethods!Extensions()); |
|||
|
|||
private void setup(string tag, Args...)(R read, Args args) { |
|||
mixin(tag ~ ".setup(read, args);"); |
|||
} |
|||
|
|||
private void populate(string tag)(ref Result r) { |
|||
mixin(tag ~ ".populate(r);"); |
|||
} |
|||
|
|||
private void update(string tag)() { |
|||
mixin(tag ~ ".update(_read);"); |
|||
} |
|||
|
|||
private void copy(string tag)(ref typeof(this) other) { |
|||
mixin(tag ~ ".copy(this, other);"); |
|||
} |
|||
|
|||
this(Args...)(R read, Args args) { |
|||
_read = read; |
|||
_rev = read.is_reverse_strand; |
|||
_seq = reversableRange!complementBase(read.sequence, _rev); |
|||
|
|||
foreach (t; Extensions) { |
|||
setup!t(read, args); |
|||
} |
|||
} |
|||
|
|||
bool empty() @property { |
|||
return _seq.empty; |
|||
} |
|||
|
|||
/// Allows to construct front element in-place, avoiding a copy.
|
|||
void constructFront(Result* addr) |
|||
{ |
|||
addr.base = _seq.front; |
|||
foreach (t; Extensions) |
|||
populate!t(*addr); |
|||
} |
|||
|
|||
Result front() @property { |
|||
Result r = void; |
|||
r.base = _seq.front; |
|||
foreach (t; Extensions) |
|||
populate!t(r); |
|||
return r; |
|||
} |
|||
|
|||
void popFront() { |
|||
moveToNextBase(); |
|||
} |
|||
|
|||
PerBaseInfo save() @property { |
|||
PerBaseInfo r = void; |
|||
r._read = _read.dup; |
|||
r._seq = _seq.save; |
|||
r._rev = _rev; |
|||
foreach (t; Extensions) |
|||
copy!t(r); |
|||
return r; |
|||
} |
|||
|
|||
ref PerBaseInfo opAssign(PerBaseInfo other) { |
|||
_read = other._read; |
|||
_seq = other._seq.save; |
|||
_rev = other._rev; |
|||
foreach (t; Extensions) |
|||
other.copy!t(this); |
|||
return this; |
|||
} |
|||
|
|||
private void moveToNextBase() { |
|||
|
|||
foreach (t; Extensions) { |
|||
update!t(); |
|||
} |
|||
|
|||
_seq.popFront(); |
|||
} |
|||
|
|||
/// Returns true if the read is reverse strand,
|
|||
/// and false otherwise.
|
|||
bool reverse_strand() @property const { |
|||
return _rev; |
|||
} |
|||
|
|||
private { |
|||
bool _rev = void; |
|||
R _read = void; |
|||
ReversableRange!(complementBase, typeof(_read.sequence)) _seq = void; |
|||
} |
|||
} |
|||
|
|||
///
|
|||
/// Collect per-base information from available tags.
|
|||
/// Use $(D arg!TagName) to pass a parameter related to a particular tag.
|
|||
///
|
|||
/// Example:
|
|||
///
|
|||
/// basesWith!"FZ"(arg!"flowOrder"(flow_order), arg!"keySequence"(key_sequence));
|
|||
///
|
|||
template basesWith(TagsAndOptions...) { |
|||
auto basesWith(R, Args...)(R read, Args args) { |
|||
return PerBaseInfo!(R, TagsAndOptions)(read, args); |
|||
} |
|||
} |
|||
|
|||
/// Provides additional property $(D reference_base)
|
|||
template MDbaseInfo(R, Options...) { |
|||
|
|||
mixin template resultProperties() { |
|||
|
|||
enum MdCurrentOp = staticIndexOf!(Option.mdCurrentOp, Options) != -1; |
|||
enum MdPreviousOp = staticIndexOf!(Option.mdPreviousOp, Options) != -1; |
|||
enum MdNextOp = staticIndexOf!(Option.mdNextOp, Options) != -1; |
|||
|
|||
/// If current CIGAR operation is reference consuming,
|
|||
/// returns reference base at this position, otherwise
|
|||
/// returns '-'.
|
|||
///
|
|||
/// If read is on '-' strand, the result will be
|
|||
/// complementary base.
|
|||
char reference_base() @property const { |
|||
return _ref_base; |
|||
} |
|||
|
|||
private char _ref_base = void; |
|||
|
|||
static if (MdPreviousOp) |
|||
{ |
|||
private Nullable!MdOperation _previous_md_operation = void; |
|||
|
|||
/// Previous MD operation
|
|||
Nullable!MdOperation previous_md_operation() @property { |
|||
return _previous_md_operation; |
|||
} |
|||
} |
|||
|
|||
static if (MdCurrentOp) |
|||
{ |
|||
|
|||
private MdOperation _current_md_operation = void; |
|||
private uint _current_md_operation_offset = void; |
|||
|
|||
/// Current MD operation
|
|||
MdOperation md_operation() @property { |
|||
return _current_md_operation; |
|||
} |
|||
|
|||
/// If current MD operation is match, returns how many bases
|
|||
/// have matched before the current base. Otherwise returns 0.
|
|||
uint md_operation_offset() @property const { |
|||
return _current_md_operation_offset; |
|||
} |
|||
} |
|||
|
|||
static if (MdNextOp) |
|||
{ |
|||
private Nullable!MdOperation _next_md_operation = void; |
|||
/// Next MD operation
|
|||
Nullable!MdOperation next_md_operation() @property { |
|||
return _next_md_operation; |
|||
} |
|||
} |
|||
} |
|||
|
|||
mixin template rangeMethods() { |
|||
|
|||
enum MdCurrentOp = staticIndexOf!(Option.mdCurrentOp, Options) != -1; |
|||
enum MdPreviousOp = staticIndexOf!(Option.mdPreviousOp, Options) != -1; |
|||
enum MdNextOp = staticIndexOf!(Option.mdNextOp, Options) != -1; |
|||
|
|||
private { |
|||
ReversableRange!(reverseMdOp, MdOperationRange) _md_ops = void; |
|||
uint _match; // remaining length of current match operation
|
|||
MdOperation _md_front = void; |
|||
|
|||
static if (MdPreviousOp) |
|||
{ |
|||
Nullable!MdOperation _previous_md_op; |
|||
bool _md_front_is_initialized; |
|||
} |
|||
} |
|||
|
|||
private void updateMdFrontVariable() |
|||
{ |
|||
static if (MdPreviousOp) |
|||
{ |
|||
if (_md_front_is_initialized) |
|||
_previous_md_op = _md_front; |
|||
|
|||
_md_front_is_initialized = true; |
|||
} |
|||
|
|||
_md_front = _md_ops.front; |
|||
_md_ops.popFront(); |
|||
} |
|||
|
|||
void setup(Args...)(const ref R read, Args args) |
|||
{ |
|||
auto md = read["MD"]; |
|||
auto md_str = *(cast(string*)&md); |
|||
_md_ops = reversableRange!reverseMdOp(mdOperations(md_str), |
|||
read.is_reverse_strand); |
|||
|
|||
while (!_md_ops.empty) |
|||
{ |
|||
updateMdFrontVariable(); |
|||
if (!_md_front.is_deletion) { |
|||
if (_md_front.is_match) { |
|||
_match = _md_front.match; |
|||
} |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
|
|||
void populate(Result)(ref Result result) |
|||
{ |
|||
if (!current_cigar_operation.is_reference_consuming) |
|||
{ |
|||
result._ref_base = '-'; |
|||
return; |
|||
} |
|||
|
|||
MdOperation op = _md_front; |
|||
if (op.is_mismatch) |
|||
result._ref_base = op.mismatch.asCharacter; |
|||
else if (op.is_match) { |
|||
result._ref_base = result.base.asCharacter; |
|||
} |
|||
else assert(0); |
|||
|
|||
static if (MdPreviousOp) |
|||
{ |
|||
if (_previous_md_op.isNull) |
|||
result._previous_md_operation.nullify(); |
|||
else |
|||
result._previous_md_operation = _previous_md_op.get; |
|||
} |
|||
|
|||
static if (MdCurrentOp) |
|||
{ |
|||
|
|||
result._current_md_operation = op; |
|||
result._current_md_operation_offset = _md_front.match - _match; |
|||
} |
|||
|
|||
static if (MdNextOp) |
|||
{ |
|||
if (_md_ops.empty) |
|||
result._next_md_operation.nullify(); |
|||
else |
|||
result._next_md_operation = _md_ops.front; |
|||
} |
|||
} |
|||
|
|||
void update(const ref R read) |
|||
{ |
|||
if (!current_cigar_operation.is_reference_consuming) |
|||
return; |
|||
|
|||
if (_md_front.is_mismatch) |
|||
{ |
|||
if (_md_ops.empty) |
|||
return; |
|||
|
|||
updateMdFrontVariable(); |
|||
} |
|||
else if (_md_front.is_match) |
|||
{ |
|||
--_match; |
|||
if (_match == 0 && !_md_ops.empty) { |
|||
updateMdFrontVariable(); |
|||
} |
|||
} |
|||
else assert(0); |
|||
|
|||
while (_md_front.is_deletion) { |
|||
if (_md_ops.empty) |
|||
return; |
|||
|
|||
updateMdFrontVariable(); |
|||
} |
|||
|
|||
if (_match == 0 && _md_front.is_match) |
|||
_match = _md_front.match; |
|||
} |
|||
|
|||
void copy(Range)(ref Range source, ref Range target) |
|||
{ |
|||
target.MD._md_ops = source.MD._md_ops.save; |
|||
target.MD._md_front = source.MD._md_front; |
|||
|
|||
static if (MdPreviousOp) |
|||
{ |
|||
if (source.MD._previous_md_op.isNull) |
|||
target.MD._previous_md_op.nullify(); |
|||
else |
|||
target.MD._previous_md_op = source.MD._previous_md_op.get; |
|||
target.MD._md_front_is_initialized = source.MD._md_front_is_initialized; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
/// Provides additional property $(D flow_call).
|
|||
template FZbaseInfo(R, Options...) { |
|||
|
|||
mixin template resultProperties() { |
|||
/// Current flow call
|
|||
ReadFlowCall flow_call() @property const { |
|||
return _flow_call; |
|||
} |
|||
|
|||
private { |
|||
ReadFlowCall _flow_call; |
|||
} |
|||
} |
|||
|
|||
mixin template rangeMethods() { |
|||
|
|||
private { |
|||
ReadFlowCallRange!(BamRead.SequenceResult) _flow_calls = void; |
|||
ReadFlowCall _current_flow_call = void; |
|||
ushort _at = void; |
|||
|
|||
debug { |
|||
string _read_name; |
|||
} |
|||
} |
|||
|
|||
void setup(Args...)(const ref R read, Args args) |
|||
{ |
|||
string flow_order = void; |
|||
string key_sequence = void; |
|||
|
|||
debug { |
|||
_read_name = read.name.idup; |
|||
} |
|||
|
|||
enum flowOrderExists = staticIndexOf!(MixinArg!(string, "flowOrder"), Args); |
|||
enum keySequenceExists = staticIndexOf!(MixinArg!(string, "keySequence"), Args); |
|||
static assert(flowOrderExists != -1, `Flow order must be provided via arg!"flowOrder"`); |
|||
static assert(keySequenceExists != -1, `Flow order must be provided via arg!"keySequence"`); |
|||
|
|||
foreach (arg; args) { |
|||
static if(is(typeof(arg) == MixinArg!(string, "flowOrder"))) |
|||
flow_order = arg; |
|||
|
|||
static if(is(typeof(arg) == MixinArg!(string, "keySequence"))) |
|||
key_sequence = arg; |
|||
} |
|||
|
|||
_at = 0; |
|||
|
|||
_flow_calls = readFlowCalls(read, flow_order, key_sequence); |
|||
if (!_flow_calls.empty) { |
|||
_current_flow_call = _flow_calls.front; |
|||
} |
|||
} |
|||
|
|||
void populate(Result)(ref Result result) { |
|||
result._flow_call = _current_flow_call; |
|||
|
|||
debug { |
|||
if (result.base != result._flow_call.base) { |
|||
import std.stdio; |
|||
stderr.writeln("invalid flow call at ", _read_name, ": ", result.position); |
|||
} |
|||
} |
|||
} |
|||
|
|||
void update(const ref R read) |
|||
{ |
|||
++_at; |
|||
if (_at == _current_flow_call.length) { |
|||
_flow_calls.popFront(); |
|||
if (!_flow_calls.empty) { |
|||
_current_flow_call = _flow_calls.front; |
|||
_at = 0; |
|||
} |
|||
} |
|||
} |
|||
|
|||
void copy(Range)(ref Range source, ref Range target) { |
|||
target.FZ._flow_calls = source._flow_calls.save(); |
|||
target.FZ._at = source.FZ._at; |
|||
target.FZ._current_flow_call = source._current_flow_call; |
|||
|
|||
debug { |
|||
target._read_name = _read_name; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
/// Retrieving flow signal intensities from ZM tags is also available.
|
|||
alias FZbaseInfo ZMbaseInfo; |
|||
|
|||
/// Provides additional properties
|
|||
/// * position
|
|||
/// * cigar_operation
|
|||
/// * cigar_operation_offset
|
|||
template CIGARbaseInfo(R, Options...) { |
|||
|
|||
mixin template resultProperties() { |
|||
|
|||
enum CigarExtraProperties = staticIndexOf!(Option.cigarExtra, Options) != -1; |
|||
|
|||
static if (CigarExtraProperties) |
|||
{ |
|||
/// Current CIGAR operation
|
|||
CigarOperation cigar_operation() @property { |
|||
return _cigar[_operation_index]; |
|||
} |
|||
|
|||
/// CIGAR operations before current one
|
|||
auto cigar_before() @property { |
|||
return _cigar[0 .. _operation_index]; |
|||
} |
|||
|
|||
/// CIGAR operations after current one
|
|||
auto cigar_after() @property { |
|||
return _cigar[_operation_index + 1 .. _cigar.length]; |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
/// Current CIGAR operation
|
|||
CigarOperation cigar_operation() @property const { |
|||
return _current_cigar_op; |
|||
} |
|||
} |
|||
|
|||
/// Position of the corresponding base on the reference.
|
|||
/// If current CIGAR operation is not one of 'M', '=', 'X',
|
|||
/// returns the position of the previous mapped base.
|
|||
uint position() @property const { |
|||
return _reference_position; |
|||
} |
|||
|
|||
/// Offset in current CIGAR operation, starting from 0.
|
|||
uint cigar_operation_offset() @property const { |
|||
return _cigar_operation_offset; |
|||
} |
|||
|
|||
private { |
|||
int _operation_index = void; |
|||
uint _reference_position = void; |
|||
uint _cigar_operation_offset = void; |
|||
static if (CigarExtraProperties) |
|||
{ |
|||
ReversableRange!(identity, const(CigarOperation)[]) _cigar = void; |
|||
} |
|||
else |
|||
{ |
|||
CigarOperation _current_cigar_op; |
|||
} |
|||
} |
|||
} |
|||
|
|||
mixin template rangeMethods() { |
|||
|
|||
enum CigarExtraProperties = staticIndexOf!(Option.cigarExtra, Options) != -1; |
|||
|
|||
private { |
|||
CigarOperation _current_cigar_op = void; |
|||
|
|||
ulong _cur_cig_op_len = void; |
|||
bool _cur_cig_op_is_ref_cons = void; |
|||
|
|||
int _index = void; |
|||
uint _at = void; |
|||
uint _ref_pos = void; |
|||
ReversableRange!(identity, const(CigarOperation)[]) _cigar = void; |
|||
} |
|||
|
|||
/// Current CIGAR operation, available to all extensions
|
|||
const(CigarOperation) current_cigar_operation() @property const { |
|||
return _current_cigar_op; |
|||
} |
|||
|
|||
void setup(Args...)(const ref R read, Args) |
|||
{ |
|||
_cigar = reversableRange(read.cigar, read.is_reverse_strand); |
|||
|
|||
_index = -1; |
|||
_ref_pos = reverse_strand ? (read.position + read.basesCovered() - 1) |
|||
: read.position; |
|||
|
|||
_moveToNextCigarOperator(); |
|||
assert(_index >= 0); |
|||
} |
|||
|
|||
void populate(Result)(ref Result result) { |
|||
result._reference_position = _ref_pos; |
|||
result._cigar_operation_offset = _at; |
|||
static if (CigarExtraProperties) |
|||
{ |
|||
result._cigar = _cigar; |
|||
result._operation_index = _index; |
|||
} |
|||
else |
|||
{ |
|||
result._current_cigar_op = _current_cigar_op; |
|||
} |
|||
} |
|||
|
|||
void update(const ref R read) |
|||
{ |
|||
++_at; |
|||
|
|||
if (_cur_cig_op_is_ref_cons) { |
|||
_ref_pos += reverse_strand ? -1 : 1; |
|||
} |
|||
|
|||
if (_at == _cur_cig_op_len) { |
|||
_moveToNextCigarOperator(); |
|||
} |
|||
} |
|||
|
|||
void copy(Range)(const ref Range source, ref Range target) { |
|||
target.CIGAR._cigar = source.CIGAR._cigar; |
|||
target.CIGAR._index = source.CIGAR._index; |
|||
target.CIGAR._current_cigar_op = source.CIGAR._current_cigar_op; |
|||
target.CIGAR._cur_cig_op_len = source.CIGAR._cur_cig_op_len; |
|||
target.CIGAR._cur_cig_op_is_ref_cons = source.CIGAR._cur_cig_op_is_ref_cons; |
|||
target.CIGAR._at = source.CIGAR._at; |
|||
target.CIGAR._ref_pos = source.CIGAR._ref_pos; |
|||
} |
|||
|
|||
private void _moveToNextCigarOperator() { |
|||
_at = 0; |
|||
for (++_index; _index < _cigar.length; ++_index) |
|||
{ |
|||
_current_cigar_op = _cigar[_index]; |
|||
_cur_cig_op_is_ref_cons = _current_cigar_op.is_reference_consuming; |
|||
_cur_cig_op_len = _current_cigar_op.length; |
|||
|
|||
if (_current_cigar_op.is_query_consuming) |
|||
break; |
|||
|
|||
if (_cur_cig_op_is_ref_cons) |
|||
{ |
|||
if (reverse_strand) |
|||
_ref_pos -= _cur_cig_op_len; |
|||
else |
|||
_ref_pos += _cur_cig_op_len; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
} |
@ -1,290 +0,0 @@ |
|||
/* |
|||
This file is part of BioD. |
|||
Copyright (C) 2012-2016 Artem Tarasov <lomereiter@gmail.com> |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a |
|||
copy of this software and associated documentation files (the "Software"), |
|||
to deal in the Software without restriction, including without limitation |
|||
the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|||
and/or sell copies of the Software, and to permit persons to whom the |
|||
Software is furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in |
|||
all copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|||
DEALINGS IN THE SOFTWARE. |
|||
|
|||
*/ |
|||
|
|||
module bio.bam.cigar; |
|||
|
|||
import std.algorithm; |
|||
import std.range; |
|||
import std.conv; |
|||
import std.format; |
|||
import std.exception; |
|||
import std.system; |
|||
import std.traits; |
|||
import std.array; |
|||
import std.bitmanip; |
|||
import core.stdc.stdlib; |
|||
|
|||
import bio.core.base; |
|||
import bio.core.utils.format; |
|||
|
|||
import bio.bam.abstractreader; |
|||
//import bio.bam.cigar;
|
|||
import bio.bam.writer; |
|||
import bio.bam.tagvalue; |
|||
import bio.bam.bai.bin; |
|||
|
|||
import bio.bam.md.core; |
|||
|
|||
import bio.bam.utils.array; |
|||
import bio.bam.utils.value; |
|||
import bio.core.utils.switchendianness; |
|||
|
|||
import bio.bam.thirdparty.msgpack : Packer, unpack; |
|||
|
|||
/** |
|||
Represents single CIGAR operation |
|||
*/ |
|||
struct CigarOperation { |
|||
static assert(CigarOperation.sizeof == uint.sizeof); |
|||
/* |
|||
WARNING! |
|||
|
|||
It is very essential that the size of |
|||
this struct is EXACTLY equal to uint.sizeof! |
|||
|
|||
The reason is to avoid copying of arrays during alignment parsing. |
|||
|
|||
Namely, when some_pointer points to raw cigar data, |
|||
we can just do a cast. This allows to access those data |
|||
directly, not doing any memory allocations. |
|||
*/ |
|||
|
|||
private uint raw; // raw data from BAM
|
|||
|
|||
private static ubyte char2op(char c) { |
|||
switch(c) { |
|||
case 'M': return 0; |
|||
case 'I': return 1; |
|||
case 'D': return 2; |
|||
case 'N': return 3; |
|||
case 'S': return 4; |
|||
case 'H': return 5; |
|||
case 'P': return 6; |
|||
case '=': return 7; |
|||
case 'X': return 8; |
|||
default: return 15; // 15 is used as invalid value
|
|||
} |
|||
} |
|||
|
|||
/// Length must be strictly less than 2^28.
|
|||
/// $(BR)
|
|||
/// Operation type must be one of M, I, D, N, S, H, P, =, X.
|
|||
this(uint length, char operation_type) { |
|||
enforce(length < (1<<28), "Too big length of CIGAR operation"); |
|||
raw = (length << 4) | char2op(operation_type); |
|||
} |
|||
|
|||
this(uint _raw) { |
|||
raw = _raw; |
|||
} |
|||
|
|||
/// Operation length
|
|||
uint length() @property const nothrow @nogc { |
|||
return raw >> 4; |
|||
} |
|||
|
|||
/// CIGAR operation as one of MIDNSHP=X.
|
|||
/// Absent or invalid operation is represented by '?'
|
|||
char type() @property const nothrow @nogc { |
|||
return "MIDNSHP=X????????"[raw & 0xF]; |
|||
} |
|||
|
|||
// Each pair of bits has first bit set iff the operation is query consuming,
|
|||
// and second bit set iff it is reference consuming.
|
|||
// X = P H S N D I M
|
|||
private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11; |
|||
|
|||
/// True iff operation is one of M, =, X, I, S
|
|||
bool is_query_consuming() @property const nothrow @nogc { |
|||
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0; |
|||
} |
|||
|
|||
/// True iff operation is one of M, =, X, D, N
|
|||
bool is_reference_consuming() @property const nothrow @nogc { |
|||
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0; |
|||
} |
|||
|
|||
/// True iff operation is one of M, =, X
|
|||
bool is_match_or_mismatch() @property const nothrow @nogc { |
|||
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3; |
|||
} |
|||
|
|||
/// True iff operation is one of 'S', 'H'
|
|||
bool is_clipping() @property const nothrow @nogc { |
|||
return ((raw & 0xF) >> 1) == 2; // 4 or 5
|
|||
} |
|||
|
|||
void toSam(Sink)(auto ref Sink sink) const |
|||
if (isSomeSink!Sink) |
|||
{ |
|||
sink.write(length); |
|||
sink.write(type); |
|||
} |
|||
|
|||
void toString(scope void delegate(const(char)[]) sink) const { |
|||
toSam(sink); |
|||
} |
|||
} |
|||
|
|||
alias CigarOperation[] CigarOperations; |
|||
|
|||
bool is_unavailable(CigarOperations cigar) @property nothrow @nogc { |
|||
return (cigar.length == 1 && cigar[0].raw == '*'); |
|||
} |
|||
|
|||
/// Forward range of extended CIGAR operations, with =/X instead of M
|
|||
/// Useful for, e.g., detecting positions of mismatches.
|
|||
struct ExtendedCigarRange(CigarOpRange, MdOpRange) { |
|||
static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation)); |
|||
static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation)); |
|||
|
|||
private { |
|||