Browse Source

move files

remotes/georgeg/master
George Githinji 4 years ago
parent
commit
4febe23c36
  1. 49
      bio/std/bam/abstractreader.d
  2. 92
      bio/std/bam/bai/bin.d
  3. 366
      bio/std/bam/bai/indexing.d
  4. 169
      bio/std/bam/baifile.d
  5. 728
      bio/std/bam/baseinfo.d
  6. 290
      bio/std/bam/cigar.d
  7. 37
      bio/std/bam/constants.d
  8. 320
      bio/std/bam/iontorrent/flowcall.d
  9. 99
      bio/std/bam/iontorrent/flowindex.d
  10. 4
      bio/std/bam/md/core.d
  11. 147
      bio/std/bam/md/operation.d
  12. 182
      bio/std/bam/md/parse.d
  13. 440
      bio/std/bam/md/reconstruct.d
  14. 429
      bio/std/bam/multireader.d
  15. 1015
      bio/std/bam/pileup.d
  16. 469
      bio/std/bam/randomaccessmanager.d
  17. 1648
      bio/std/bam/read.d
  18. 594
      bio/std/bam/reader.d
  19. 193
      bio/std/bam/readrange.d
  20. 221
      bio/std/bam/reference.d
  21. 63
      bio/std/bam/referenceinfo.d
  22. 65
      bio/std/bam/region.d
  23. 541
      bio/std/bam/snpcallers/maq.d
  24. 47
      bio/std/bam/snpcallers/simple.d
  25. 106
      bio/std/bam/splitter.d
  26. 588
      bio/std/bam/tagvalue.d
  27. 5173
      bio/std/bam/thirdparty/msgpack.d
  28. 113
      bio/std/bam/utils/array.d
  29. 92
      bio/std/bam/utils/graph.d
  30. 404
      bio/std/bam/utils/samheadermerger.d
  31. 121
      bio/std/bam/utils/value.d
  32. 562
      bio/std/bam/validation/alignment.d
  33. 446
      bio/std/bam/validation/samheader.d
  34. 287
      bio/std/bam/writer.d
  35. 808
      bio/std/sam/header.d
  36. 222
      bio/std/sam/reader.d
  37. 4712
      bio/std/sam/utils/fastrecordparser.d
  38. 1458
      bio/std/sam/utils/recordparser.d

49
bio/std/bam/abstractreader.d

@ -1,49 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2013 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.abstractreader;
import bio.sam.header;
import bio.bam.read;
import bio.bam.referenceinfo;
public import std.range;
/// Common interface for $(DPREF2 bam, reader, BamReader)
/// and $(DPREF2 sam, reader, SamReader).
interface IBamSamReader {
/// SAM header
bio.sam.header.SamHeader header() @property;
/// Information about reference sequences
const(bio.bam.referenceinfo.ReferenceSequenceInfo)[] reference_sequences() @property const nothrow;
/// All reads in the file
std.range.InputRange!(bio.bam.read.BamRead) allReads() @property;
/// Filename
string filename() @property const;
///
void assumeSequentialProcessing();
}

92
bio/std/bam/bai/bin.d

@ -1,92 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.bai.bin;
import bio.core.bgzf.chunk;
import bio.bam.constants;
/// Distinct bin
struct Bin {
/// Construct a bin with an id
this(uint id) nothrow {
this.id = id;
}
uint id; /// bin number
Chunk[] chunks;
/// How deep the bin is in the tree
int level() @property const nothrow {
if (id == 0) return 0;
if (id < 9) return 1;
if (id < 73) return 2;
if (id < 585) return 3;
if (id < 4681) return 4;
return 5;
}
/// Returns whether the bin is a leaf in the B-tree
bool is_leaf() @property const nothrow {
return id > BAI_MAX_NONLEAF_BIN_ID;
}
/// Check if bin can overlap with a region
bool canOverlapWith(int begin, int end) const nothrow {
if (id == 0) return true;
if (id > BAI_MAX_BIN_ID) return false;
/// The following code is based on reg2bins() function
if (begin < 0) begin = 0;
auto magic_number = 4681;
auto b = begin >> 14;
auto e = end >> 14;
while (true) {
auto delta = id - magic_number;
if (b <= delta && delta <= e) return true;
magic_number >>= 3;
if (magic_number == 0) return false;
b >>= 3;
e >>= 3;
}
}
}
/// Returns bin number for [beg, end) interval (zero-based).
/// Taken from SAM/BAM specification.
ushort reg2bin(int beg, int end) {
if (end == beg) end = beg + 1; // edge case
--end;
if (beg>>14 == end>>14) return cast(ushort)(((1<<15)-1)/7 + (beg>>14));
if (beg>>17 == end>>17) return cast(ushort)(((1<<12)-1)/7 + (beg>>17));
if (beg>>20 == end>>20) return cast(ushort)(((1<<9)-1)/7 + (beg>>20));
if (beg>>23 == end>>23) return cast(ushort)(((1<<6)-1)/7 + (beg>>23));
if (beg>>26 == end>>26) return cast(ushort)(((1<<3)-1)/7 + (beg>>26));
return 0;
}

366
bio/std/bam/bai/indexing.d

@ -1,366 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012-2017 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.bai.indexing;
import bio.bam.reader;
import bio.bam.readrange;
import bio.bam.constants;
import bio.bam.bai.bin;
import bio.core.bgzf.chunk;
import undead.stream;
import std.array;
import std.algorithm;
import std.system;
import std.exception;
import core.stdc.string;
// Suppose we have an alignment which covers bases on a reference,
// starting from one position and ending at another position.
// In order to build linear index, we need to find to which windows
// the two positions correspond.
//
//
// (K = 16384)
//
// [0, K)[K, 2K)[2K, 3K)... <- windows
// [.......) <- alignment
//
private size_t toLinearIndexOffset(int position) {
return position < 0 ? 0 : position / BAI_LINEAR_INDEX_WINDOW_SIZE;
}
///
struct IndexBuilder {
private {
// array of linear offsets for the current reference entry
VirtualOffset[] _linear_index;
// (maximum index in _linear_index where data was written) + 1
size_t _linear_index_write_length;
static struct PreviousRead {
int ref_id = -1;
int position;
int end_position;
int basesCovered() { return end_position - position; }
Bin bin;
bool is_unmapped;
private char[256] _name_buf; // by spec., name length is <= 255 chars
string name;
VirtualOffset start_virtual_offset;
VirtualOffset end_virtual_offset;
}
PreviousRead _prev_read;
Stream _stream;
int _n_refs;
ulong _no_coord = 0;
// metadata for each reference
ulong _beg_vo = -1UL;
ulong _end_vo = 0;
ulong _unmapped = 0;
ulong _mapped = 0;
bool _first_read = true;
// map: bin ID -> array of chunks
Chunk[][uint] _chunks;
VirtualOffset _current_chunk_beg; // start of current chunk
// no metadata for empty references
void writeEmptyReference() {
_stream.write(cast(int)0); // n_bins
_stream.write(cast(int)0); // n_intv
}
void updateLastReadInfo(ref BamReadBlock read) {
with (_prev_read) {
ref_id = read.ref_id;
position = read.position;
end_position = position + read.basesCovered();
bin = read.bin;
is_unmapped = read.is_unmapped;
_name_buf[0 .. read.name.length] = read.name[];
name = cast(string)_name_buf[0 .. read.name.length];
start_virtual_offset = read.start_virtual_offset;
end_virtual_offset = read.end_virtual_offset;
}
}
void updateMetadata(ref BamReadBlock read) {
if (read.ref_id == -1) {
++_no_coord;
} else {
if (read.is_unmapped) {
++_unmapped;
} else {
++_mapped;
}
if (_beg_vo == -1UL)
_beg_vo = cast(ulong)read.start_virtual_offset;
_end_vo = cast(ulong)read.end_virtual_offset;
}
}
void updateLinearIndex() {
assert(_prev_read.ref_id >= 0);
size_t beg, end;
if (_prev_read.is_unmapped) {
end = beg = toLinearIndexOffset(_prev_read.position);
} else {
beg = toLinearIndexOffset(_prev_read.position);
end = toLinearIndexOffset(_prev_read.position + _prev_read.basesCovered() - 1);
}
debug {
import std.stdio;
if (end >= _linear_index.length) {
writeln("beg: ", beg);
writeln("end: ", end);
writeln("pos: ", _prev_read.position);
writeln("bases: ", _prev_read.basesCovered());
}
}
foreach (i; beg .. end + 1)
if (_linear_index[i] == 0UL)
_linear_index[i] = _prev_read.start_virtual_offset;
if (end + 1 > _linear_index_write_length)
_linear_index_write_length = end + 1;
}
void dumpCurrentLinearIndex() {
_stream.write(cast(int)_linear_index_write_length);
//
// There might be untouched places in linear index
// with virtual offset equal to zero.
// However, it's not a good idea to leave those zeros,
// since we can start lookup from the last non-zero virtual offset
// encountered before the untouched window.
//
VirtualOffset last_voffset = 0;
foreach (voffset; _linear_index[0 .. _linear_index_write_length]) {
if (voffset == 0)
voffset = last_voffset;
else
last_voffset = voffset;
_stream.write(cast(ulong)voffset);
}
}
void dumpCurrentReference() {
// +1 because we output dummy bin, too
_stream.write(cast(int)(_chunks.length + 1));
foreach (bin_id, bin_chunks; _chunks) {
if (bin_chunks.length > 0) {
_stream.write(cast(uint)bin_id);
_stream.write(cast(int)bin_chunks.length);
foreach (chunk; bin_chunks) {
_stream.write(cast(ulong)chunk.beg);
_stream.write(cast(ulong)chunk.end);
}
}
}
_stream.write(cast(uint)37450);
_stream.write(cast(int)2);
_stream.write(cast(ulong)_beg_vo);
_stream.write(cast(ulong)_end_vo);
_stream.write(cast(ulong)_mapped);
_stream.write(cast(ulong)_unmapped);
dumpCurrentLinearIndex();
// reset data
memset(_linear_index.ptr, 0, _linear_index.length * ulong.sizeof);
_linear_index_write_length = 0;
_chunks = null;
_current_chunk_beg = _prev_read.end_virtual_offset;
_beg_vo = _end_vo = cast(ulong)_current_chunk_beg;
_unmapped = 0;
_mapped = 0;
}
// adds chunk to the current bin (which is determined from _prev_read)
void updateChunks() {
auto current_chunk_end = _prev_read.end_virtual_offset;
auto bin_id = _prev_read.bin.id;
if (bin_id !in _chunks)
_chunks[bin_id] = [];
auto cs = _chunks[bin_id];
bool canMergeWithPreviousChunk() {
assert(cs.length > 0);
auto last_chunk = cs[$ - 1];
if (last_chunk.end.coffset == _current_chunk_beg.coffset)
return true;
return false;
}
if (cs.length == 0 || !canMergeWithPreviousChunk()) {
auto new_chunk = Chunk(_current_chunk_beg, current_chunk_end);
_chunks[_prev_read.bin.id] ~= new_chunk;
} else {
_chunks[_prev_read.bin.id][$ - 1].end = current_chunk_end;
}
_current_chunk_beg = current_chunk_end;
}
void checkThatBinIsCorrect(ref BamReadBlock read) {
if (!check_bins)
return;
auto expected = reg2bin(read.position,
read.position + read.basesCovered());
enforce(read.bin.id == expected,
"Bin in read with name '" ~ read.name ~
"' is set incorrectly (" ~ to!string(read.bin.id) ~
" instead of expected " ~ to!string(expected) ~ ")");
}
void checkThatInputIsSorted(ref BamReadBlock read) {
if (_first_read) return;
if (read.ref_id == -1) return; // unmapped
if (_prev_read.ref_id < read.ref_id) return;
enforce(read.ref_id == _prev_read.ref_id &&
read.position >= _prev_read.position,
"BAM file is not coordinate-sorted: " ~
"read '" ~ read.name ~ "' (" ~ read.ref_id.to!string ~ ":" ~ read.position.to!string ~ ")" ~
" must be after read '" ~ _prev_read.name ~ "' (" ~ _prev_read.ref_id.to!string ~ ":" ~ _prev_read.position.to!string ~ ")" ~
"' (at virtual offsets " ~
to!string(_prev_read.start_virtual_offset) ~ ", " ~ read.start_virtual_offset.to!string ~ ")");
}
}
///
this(Stream output_stream, int number_of_references) {
_stream = new EndianStream(output_stream, Endian.littleEndian);
_n_refs = number_of_references;
size_t size = BAI_MAX_BIN_ID - BAI_MAX_NONLEAF_BIN_ID + 1;
_linear_index = new VirtualOffset[](size);
_stream.writeString(BAI_MAGIC); // write BAI magic string
_stream.write(cast(int)_n_refs); // and number of references
}
/// Check that bins are correct.
bool check_bins;
/// Add a read. The reads must be put in coordinate-sorted order.
void put(BamReadBlock read) {
checkThatInputIsSorted(read);
scope(exit) updateMetadata(read);
if (read.ref_id < 0)
return;
// start position is unavailable, skip
if (read.position < 0)
return;
if (_first_read) {
updateLastReadInfo(read);
_first_read = false;
_current_chunk_beg = read.start_virtual_offset;
if (read.ref_id > 0)
foreach (i; 0 .. read.ref_id)
writeEmptyReference();
return;
}
checkThatBinIsCorrect(read);
// new reference, so write data for previous one(s)
if (read.ref_id > _prev_read.ref_id) {
updateLinearIndex();
updateChunks();
dumpCurrentReference();
foreach (i; _prev_read.ref_id + 1 .. read.ref_id)
writeEmptyReference();
}
if (read.ref_id == _prev_read.ref_id) {
updateLinearIndex();
if (read.bin.id != _prev_read.bin.id)
updateChunks();
}
updateLastReadInfo(read);
}
/// Closes the stream
void finish() {
if (!_first_read) { // at least one was processed
assert(_prev_read.ref_id >= 0);
updateLinearIndex();
updateChunks();
dumpCurrentReference();
}
// _prev_read.ref_id == -1 if all are unmapped
foreach (i; _prev_read.ref_id + 1 .. _n_refs)
writeEmptyReference();
_stream.write(cast(ulong)_no_coord);
_stream.close();
}
}
/// Writes BAM index to the $(D stream)
///
/// Accepts optional $(D progressBarFunc)
void createIndex(BamReader bam, Stream stream, bool check_bins=false,
void delegate(lazy float p) progressBarFunc=null)
{
auto n_refs = cast(int)bam.reference_sequences.length;
auto index_builder = IndexBuilder(stream, n_refs);
index_builder.check_bins = check_bins;
auto reads = bam.readsWithProgress!withOffsets(progressBarFunc);
foreach (read; reads)
index_builder.put(read);
index_builder.finish();
}

169
bio/std/bam/baifile.d

@ -1,169 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.baifile;
public import bio.core.bgzf.chunk;
public import bio.bam.bai.bin;
import bio.core.bgzf.virtualoffset;
import bio.bam.constants;
import undead.stream;
import std.system;
import std.exception;
import std.algorithm;
import std.conv;
import std.range;
import std.file;
import std.path;
/// Represents index for a single reference
struct Index {
/// Information about bins
Bin[] bins;
/// Virtual file offsets of first alignments overlapping 16384-byte windows
/// on the reference sequence. This linear index is used to reduce amount
/// of file seeks for region queries, since with its help one can reduce the
/// number of chunks to be investigated based on their end position.
///
///
/// Suppose you have a region [beg, end) and want to do a query.
///
/// Here's the reference:
/// [....................!..............!.................................]
/// beg end
///
/// Here's the same reference with 16384-byte long windows:
/// [%...........%.......!....%.........!..%...........%...........%......]
/// beg end
/// [ 1st window][ 2nd window][...
///
/// With linear index, we can take the second window, find out what is
/// the minimum virtual offset among alignments overlapping this window,
/// and skip all chunks which end position is less or equal to this offset:
///
/// [........@...........!..............!.................................]
/// . ..min. offset beg end
/// [ ). . <- this chunk is skipped
/// [ ) <- this one is not
///
VirtualOffset[] ioffsets;
/// Get (approximate) virtual offset of the first alignment overlapping $(D position)
///
/// Returned virtual offset is less or equal to real offset.
VirtualOffset getMinimumOffset(int position) {
int pos = max(0, position);
int _i = min(pos / BAI_LINEAR_INDEX_WINDOW_SIZE, cast(int)ioffsets.length - 1);
auto min_offset = (_i == -1) ? VirtualOffset(0) : ioffsets[_i];
return min_offset;
}
}
struct BaiFile {
Index[] indices;
/// Initialize from stream which contains BAI data
this(ref Stream stream) {
_stream = stream;
parse();
}
/// Open BAI file given either filename of BAM file or that of BAI file.
this(string filename) {
Stream fstream;
if (!endsWith(filename, ".bai")) {
/// Unfortunately, std.path.addExt is going to be deprecated
auto first_filename = filename ~ ".bai";
auto second_filename = to!string(retro(find(retro(filename), '.'))) ~ "bai";
if (std.file.exists(first_filename)) {
fstream = new BufferedFile(absolutePath(first_filename));
} else {
if (std.file.exists(second_filename)) {
fstream = new BufferedFile(absolutePath(second_filename));
} else {
throw new Exception("searched for " ~ first_filename ~ " or " ~
second_filename ~ ", found neither");
}
}
} else {
fstream = new BufferedFile(filename);
}
Stream estream = new EndianStream(fstream, Endian.littleEndian);
this(estream);
}
private:
Stream _stream;
/// according to section 4.2 of SAM/BAM specification
void parse() {
auto magic = _stream.readString(4);
enforce(magic == "BAI\1", "Invalid file format: expected BAI\\1");
int n_ref;
_stream.read(n_ref);
indices = uninitializedArray!(Index[])(n_ref);
foreach (i; 0 .. n_ref) {
int n_bin = void;
_stream.read(n_bin);
indices[i].bins = uninitializedArray!(Bin[])(n_bin);
foreach (j; 0 .. n_bin) {
uint id = void;
_stream.read(id);
auto bin = Bin(id);
int n_chunk = void;
_stream.read(n_chunk);
bin.chunks = uninitializedArray!(Chunk[])(n_chunk);
foreach (k; 0 .. n_chunk) {
ulong tmp = void;
_stream.read(tmp);
bin.chunks[k].beg = VirtualOffset(tmp);
_stream.read(tmp);
bin.chunks[k].end = VirtualOffset(tmp);
}
indices[i].bins[j] = bin;
}
int n_intv = void;
_stream.read(n_intv);
indices[i].ioffsets = uninitializedArray!(VirtualOffset[])(n_intv);
foreach (j; 0 .. n_intv) {
ulong tmp = void;
_stream.read(tmp);
indices[i].ioffsets[j] = VirtualOffset(tmp);
}
}
}
}

728
bio/std/bam/baseinfo.d

@ -1,728 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.baseinfo;
import bio.core.base;
import bio.core.sequence;
import bio.bam.read;
import bio.bam.tagvalue;
import bio.bam.iontorrent.flowcall;
import bio.bam.md.core;
import bio.bam.cigar;
import std.range;
import std.conv;
import std.traits;
import std.typecons;
import std.typetuple;
///
enum Option
{
/// adds 'cigar_before' and 'cigar_after' properties
cigarExtra,
/// adds 'md_operation', 'md_operation_offset' properties
mdCurrentOp,
/// adds 'previous_md_operation' property
mdPreviousOp,
/// adds 'next_md_operation' property
mdNextOp
}
///
struct MixinArg(T, string Tag) {
T value;
alias value this;
alias Tag TagName;
}
/// Wrapper for arguments to $(D basesWith) function (see below).
/// Required to distinguish to which tag each parameter refers.
MixinArg!(T, Tag) arg(string Tag, T)(T value) {
return MixinArg!(T, Tag)(value);
}
template staticFilter(alias P, T...)
{
static if (T.length == 0)
alias TypeTuple!() staticFilter;
else static if (P!(T[0]))
alias TypeTuple!(T[0], staticFilter!(P, T[1..$])) staticFilter;
else
alias staticFilter!(P, T[1..$]) staticFilter;
}
template isTag(alias argument)
{
enum isTag = is(typeof(argument) == string);
}
template isOption(alias argument)
{
enum isOption = is(typeof(argument) == Option);
}
struct PerBaseInfo(R, TagsAndOptions...) {
alias staticFilter!(isTag, TagsAndOptions) Tags;
alias staticFilter!(isOption, TagsAndOptions) Options;
private alias TypeTuple!("CIGAR", Tags) Extensions;
// /////////////////////////////////////////////////////////////////////////
//
// Each 'extension' is a template with name TAGbaseInfo, containing
// a couple of mixin templates:
//
// * resultProperties
// These are additional properties provided by the template
//
// * rangeMethods
// These describe how to proceed to the next base.
// The following methods must be implemented:
//
// - void setup(Args...)(const ref R read, Args args);
// Gets called during range construction. All constructor
// arguments are forwarded, and it's this function which
// is responsible for getting required parameters for this
// particular template.
//
// - void populate(Result)(ref Result r);
// Populates fields of the result declared in resultProperties.
// Should run in O(1), just copying a few variables.
// Current base of the result is updated before the call.
//
// - void update(const ref R read);
// Encapsulates logic of moving to the next base and updating
// mixin variables correspondingly.
//
// - void copy(Range)(const ref Range source, ref Range target);
// Gets called during $(D source.save). Therefore, any ranges
// used in mixin templates must be saved as well at that time.
//
// /////////////////////////////////////////////////////////////////////////
private static string getResultProperties(Exts...)() {
char[] result;
foreach (ext; Exts)
result ~= "mixin " ~ ext ~ "baseInfo!(R, Options).resultProperties;".dup;
return cast(string)result;
}
static struct Result {
/// Actual read base, with strand taken into account.
Base base;
alias base this;
string opCast(T)() if (is(T == string))
{
return to!string(base);
}
bool opEquals(T)(T base) const
if (is(Unqual!T == Base))
{
return this.base == base;
}
bool opEquals(T)(T result) const
if (is(Unqual!T == Result))
{
return this == result;
}
bool opEquals(T)(T base) const
if (is(Unqual!T == char) || is(Unqual!T == dchar))
{
return this.base == base;
}
mixin(getResultProperties!Extensions());
}
private static string getRangeMethods(Exts...)() {
char[] result;
foreach (ext; Exts)
result ~= "mixin " ~ ext ~ "baseInfo!(R, Options).rangeMethods " ~ ext ~ ";".dup;
return cast(string)result;
}
mixin(getRangeMethods!Extensions());
private void setup(string tag, Args...)(R read, Args args) {
mixin(tag ~ ".setup(read, args);");
}
private void populate(string tag)(ref Result r) {
mixin(tag ~ ".populate(r);");
}
private void update(string tag)() {
mixin(tag ~ ".update(_read);");
}
private void copy(string tag)(ref typeof(this) other) {
mixin(tag ~ ".copy(this, other);");
}
this(Args...)(R read, Args args) {
_read = read;
_rev = read.is_reverse_strand;
_seq = reversableRange!complementBase(read.sequence, _rev);
foreach (t; Extensions) {
setup!t(read, args);
}
}
bool empty() @property {
return _seq.empty;
}
/// Allows to construct front element in-place, avoiding a copy.
void constructFront(Result* addr)
{
addr.base = _seq.front;
foreach (t; Extensions)
populate!t(*addr);
}
Result front() @property {
Result r = void;
r.base = _seq.front;
foreach (t; Extensions)
populate!t(r);
return r;
}
void popFront() {
moveToNextBase();
}
PerBaseInfo save() @property {
PerBaseInfo r = void;
r._read = _read.dup;
r._seq = _seq.save;
r._rev = _rev;
foreach (t; Extensions)
copy!t(r);
return r;
}
ref PerBaseInfo opAssign(PerBaseInfo other) {
_read = other._read;
_seq = other._seq.save;
_rev = other._rev;
foreach (t; Extensions)
other.copy!t(this);
return this;
}
private void moveToNextBase() {
foreach (t; Extensions) {
update!t();
}
_seq.popFront();
}
/// Returns true if the read is reverse strand,
/// and false otherwise.
bool reverse_strand() @property const {
return _rev;
}
private {
bool _rev = void;
R _read = void;
ReversableRange!(complementBase, typeof(_read.sequence)) _seq = void;
}
}
///
/// Collect per-base information from available tags.
/// Use $(D arg!TagName) to pass a parameter related to a particular tag.
///
/// Example:
///
/// basesWith!"FZ"(arg!"flowOrder"(flow_order), arg!"keySequence"(key_sequence));
///
template basesWith(TagsAndOptions...) {
auto basesWith(R, Args...)(R read, Args args) {
return PerBaseInfo!(R, TagsAndOptions)(read, args);
}
}
/// Provides additional property $(D reference_base)
template MDbaseInfo(R, Options...) {
mixin template resultProperties() {
enum MdCurrentOp = staticIndexOf!(Option.mdCurrentOp, Options) != -1;
enum MdPreviousOp = staticIndexOf!(Option.mdPreviousOp, Options) != -1;
enum MdNextOp = staticIndexOf!(Option.mdNextOp, Options) != -1;
/// If current CIGAR operation is reference consuming,
/// returns reference base at this position, otherwise
/// returns '-'.
///
/// If read is on '-' strand, the result will be
/// complementary base.
char reference_base() @property const {
return _ref_base;
}
private char _ref_base = void;
static if (MdPreviousOp)
{
private Nullable!MdOperation _previous_md_operation = void;
/// Previous MD operation
Nullable!MdOperation previous_md_operation() @property {
return _previous_md_operation;
}
}
static if (MdCurrentOp)
{
private MdOperation _current_md_operation = void;
private uint _current_md_operation_offset = void;
/// Current MD operation
MdOperation md_operation() @property {
return _current_md_operation;
}
/// If current MD operation is match, returns how many bases
/// have matched before the current base. Otherwise returns 0.
uint md_operation_offset() @property const {
return _current_md_operation_offset;
}
}
static if (MdNextOp)
{
private Nullable!MdOperation _next_md_operation = void;
/// Next MD operation
Nullable!MdOperation next_md_operation() @property {
return _next_md_operation;
}
}
}
mixin template rangeMethods() {
enum MdCurrentOp = staticIndexOf!(Option.mdCurrentOp, Options) != -1;
enum MdPreviousOp = staticIndexOf!(Option.mdPreviousOp, Options) != -1;
enum MdNextOp = staticIndexOf!(Option.mdNextOp, Options) != -1;
private {
ReversableRange!(reverseMdOp, MdOperationRange) _md_ops = void;
uint _match; // remaining length of current match operation
MdOperation _md_front = void;
static if (MdPreviousOp)
{
Nullable!MdOperation _previous_md_op;
bool _md_front_is_initialized;
}
}
private void updateMdFrontVariable()
{
static if (MdPreviousOp)
{
if (_md_front_is_initialized)
_previous_md_op = _md_front;
_md_front_is_initialized = true;
}
_md_front = _md_ops.front;
_md_ops.popFront();
}
void setup(Args...)(const ref R read, Args args)
{
auto md = read["MD"];
auto md_str = *(cast(string*)&md);
_md_ops = reversableRange!reverseMdOp(mdOperations(md_str),
read.is_reverse_strand);
while (!_md_ops.empty)
{
updateMdFrontVariable();
if (!_md_front.is_deletion) {
if (_md_front.is_match) {
_match = _md_front.match;
}
break;
}
}
}
void populate(Result)(ref Result result)
{
if (!current_cigar_operation.is_reference_consuming)
{
result._ref_base = '-';
return;
}
MdOperation op = _md_front;
if (op.is_mismatch)
result._ref_base = op.mismatch.asCharacter;
else if (op.is_match) {
result._ref_base = result.base.asCharacter;
}
else assert(0);
static if (MdPreviousOp)
{
if (_previous_md_op.isNull)
result._previous_md_operation.nullify();
else
result._previous_md_operation = _previous_md_op.get;
}
static if (MdCurrentOp)
{
result._current_md_operation = op;
result._current_md_operation_offset = _md_front.match - _match;
}
static if (MdNextOp)
{
if (_md_ops.empty)
result._next_md_operation.nullify();
else
result._next_md_operation = _md_ops.front;
}
}
void update(const ref R read)
{
if (!current_cigar_operation.is_reference_consuming)
return;
if (_md_front.is_mismatch)
{
if (_md_ops.empty)
return;
updateMdFrontVariable();
}
else if (_md_front.is_match)
{
--_match;
if (_match == 0 && !_md_ops.empty) {
updateMdFrontVariable();
}
}
else assert(0);
while (_md_front.is_deletion) {
if (_md_ops.empty)
return;
updateMdFrontVariable();
}
if (_match == 0 && _md_front.is_match)
_match = _md_front.match;
}
void copy(Range)(ref Range source, ref Range target)
{
target.MD._md_ops = source.MD._md_ops.save;
target.MD._md_front = source.MD._md_front;
static if (MdPreviousOp)
{
if (source.MD._previous_md_op.isNull)
target.MD._previous_md_op.nullify();
else
target.MD._previous_md_op = source.MD._previous_md_op.get;
target.MD._md_front_is_initialized = source.MD._md_front_is_initialized;
}
}
}
}
/// Provides additional property $(D flow_call).
template FZbaseInfo(R, Options...) {
mixin template resultProperties() {
/// Current flow call
ReadFlowCall flow_call() @property const {
return _flow_call;
}
private {
ReadFlowCall _flow_call;
}
}
mixin template rangeMethods() {
private {
ReadFlowCallRange!(BamRead.SequenceResult) _flow_calls = void;
ReadFlowCall _current_flow_call = void;
ushort _at = void;
debug {
string _read_name;
}
}
void setup(Args...)(const ref R read, Args args)
{
string flow_order = void;
string key_sequence = void;
debug {
_read_name = read.name.idup;
}
enum flowOrderExists = staticIndexOf!(MixinArg!(string, "flowOrder"), Args);
enum keySequenceExists = staticIndexOf!(MixinArg!(string, "keySequence"), Args);
static assert(flowOrderExists != -1, `Flow order must be provided via arg!"flowOrder"`);
static assert(keySequenceExists != -1, `Flow order must be provided via arg!"keySequence"`);
foreach (arg; args) {
static if(is(typeof(arg) == MixinArg!(string, "flowOrder")))
flow_order = arg;
static if(is(typeof(arg) == MixinArg!(string, "keySequence")))
key_sequence = arg;
}
_at = 0;
_flow_calls = readFlowCalls(read, flow_order, key_sequence);
if (!_flow_calls.empty) {
_current_flow_call = _flow_calls.front;
}
}
void populate(Result)(ref Result result) {
result._flow_call = _current_flow_call;
debug {
if (result.base != result._flow_call.base) {
import std.stdio;
stderr.writeln("invalid flow call at ", _read_name, ": ", result.position);
}
}
}
void update(const ref R read)
{
++_at;
if (_at == _current_flow_call.length) {
_flow_calls.popFront();
if (!_flow_calls.empty) {
_current_flow_call = _flow_calls.front;
_at = 0;
}
}
}
void copy(Range)(ref Range source, ref Range target) {
target.FZ._flow_calls = source._flow_calls.save();
target.FZ._at = source.FZ._at;
target.FZ._current_flow_call = source._current_flow_call;
debug {
target._read_name = _read_name;
}
}
}
}
/// Retrieving flow signal intensities from ZM tags is also available.
alias FZbaseInfo ZMbaseInfo;
/// Provides additional properties
/// * position
/// * cigar_operation
/// * cigar_operation_offset
template CIGARbaseInfo(R, Options...) {
mixin template resultProperties() {
enum CigarExtraProperties = staticIndexOf!(Option.cigarExtra, Options) != -1;
static if (CigarExtraProperties)
{
/// Current CIGAR operation
CigarOperation cigar_operation() @property {
return _cigar[_operation_index];
}
/// CIGAR operations before current one
auto cigar_before() @property {
return _cigar[0 .. _operation_index];
}
/// CIGAR operations after current one
auto cigar_after() @property {
return _cigar[_operation_index + 1 .. _cigar.length];
}
}
else
{
/// Current CIGAR operation
CigarOperation cigar_operation() @property const {
return _current_cigar_op;
}
}
/// Position of the corresponding base on the reference.
/// If current CIGAR operation is not one of 'M', '=', 'X',
/// returns the position of the previous mapped base.
uint position() @property const {
return _reference_position;
}
/// Offset in current CIGAR operation, starting from 0.
uint cigar_operation_offset() @property const {
return _cigar_operation_offset;
}
private {
int _operation_index = void;
uint _reference_position = void;
uint _cigar_operation_offset = void;
static if (CigarExtraProperties)
{
ReversableRange!(identity, const(CigarOperation)[]) _cigar = void;
}
else
{
CigarOperation _current_cigar_op;
}
}
}
mixin template rangeMethods() {
enum CigarExtraProperties = staticIndexOf!(Option.cigarExtra, Options) != -1;
private {
CigarOperation _current_cigar_op = void;
ulong _cur_cig_op_len = void;
bool _cur_cig_op_is_ref_cons = void;
int _index = void;
uint _at = void;
uint _ref_pos = void;
ReversableRange!(identity, const(CigarOperation)[]) _cigar = void;
}
/// Current CIGAR operation, available to all extensions
const(CigarOperation) current_cigar_operation() @property const {
return _current_cigar_op;
}
void setup(Args...)(const ref R read, Args)
{
_cigar = reversableRange(read.cigar, read.is_reverse_strand);
_index = -1;
_ref_pos = reverse_strand ? (read.position + read.basesCovered() - 1)
: read.position;
_moveToNextCigarOperator();
assert(_index >= 0);
}
void populate(Result)(ref Result result) {
result._reference_position = _ref_pos;
result._cigar_operation_offset = _at;
static if (CigarExtraProperties)
{
result._cigar = _cigar;
result._operation_index = _index;
}
else
{
result._current_cigar_op = _current_cigar_op;
}
}
void update(const ref R read)
{
++_at;
if (_cur_cig_op_is_ref_cons) {
_ref_pos += reverse_strand ? -1 : 1;
}
if (_at == _cur_cig_op_len) {
_moveToNextCigarOperator();
}
}
void copy(Range)(const ref Range source, ref Range target) {
target.CIGAR._cigar = source.CIGAR._cigar;
target.CIGAR._index = source.CIGAR._index;
target.CIGAR._current_cigar_op = source.CIGAR._current_cigar_op;
target.CIGAR._cur_cig_op_len = source.CIGAR._cur_cig_op_len;
target.CIGAR._cur_cig_op_is_ref_cons = source.CIGAR._cur_cig_op_is_ref_cons;
target.CIGAR._at = source.CIGAR._at;
target.CIGAR._ref_pos = source.CIGAR._ref_pos;
}
private void _moveToNextCigarOperator() {
_at = 0;
for (++_index; _index < _cigar.length; ++_index)
{
_current_cigar_op = _cigar[_index];
_cur_cig_op_is_ref_cons = _current_cigar_op.is_reference_consuming;
_cur_cig_op_len = _current_cigar_op.length;
if (_current_cigar_op.is_query_consuming)
break;
if (_cur_cig_op_is_ref_cons)
{
if (reverse_strand)
_ref_pos -= _cur_cig_op_len;
else
_ref_pos += _cur_cig_op_len;
}
}
}
}
}

290
bio/std/bam/cigar.d

@ -1,290 +0,0 @@
/*
This file is part of BioD.
Copyright (C) 2012-2016 Artem Tarasov <lomereiter@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module bio.bam.cigar;
import std.algorithm;
import std.range;
import std.conv;
import std.format;
import std.exception;
import std.system;
import std.traits;
import std.array;
import std.bitmanip;
import core.stdc.stdlib;
import bio.core.base;
import bio.core.utils.format;
import bio.bam.abstractreader;
//import bio.bam.cigar;
import bio.bam.writer;
import bio.bam.tagvalue;
import bio.bam.bai.bin;
import bio.bam.md.core;
import bio.bam.utils.array;
import bio.bam.utils.value;
import bio.core.utils.switchendianness;
import bio.bam.thirdparty.msgpack : Packer, unpack;
/**
Represents single CIGAR operation
*/
struct CigarOperation {
static assert(CigarOperation.sizeof == uint.sizeof);
/*
WARNING!
It is very essential that the size of
this struct is EXACTLY equal to uint.sizeof!
The reason is to avoid copying of arrays during alignment parsing.
Namely, when some_pointer points to raw cigar data,
we can just do a cast. This allows to access those data
directly, not doing any memory allocations.
*/
private uint raw; // raw data from BAM
private static ubyte char2op(char c) {
switch(c) {
case 'M': return 0;
case 'I': return 1;
case 'D': return 2;
case 'N': return 3;
case 'S': return 4;
case 'H': return 5;
case 'P': return 6;
case '=': return 7;
case 'X': return 8;
default: return 15; // 15 is used as invalid value
}
}
/// Length must be strictly less than 2^28.
/// $(BR)
/// Operation type must be one of M, I, D, N, S, H, P, =, X.
this(uint length, char operation_type) {
enforce(length < (1<<28), "Too big length of CIGAR operation");
raw = (length << 4) | char2op(operation_type);
}
this(uint _raw) {
raw = _raw;
}
/// Operation length
uint length() @property const nothrow @nogc {
return raw >> 4;
}
/// CIGAR operation as one of MIDNSHP=X.
/// Absent or invalid operation is represented by '?'
char type() @property const nothrow @nogc {
return "MIDNSHP=X????????"[raw & 0xF];
}
// Each pair of bits has first bit set iff the operation is query consuming,
// and second bit set iff it is reference consuming.
// X = P H S N D I M
private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
/// True iff operation is one of M, =, X, I, S
bool is_query_consuming() @property const nothrow @nogc {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
}
/// True iff operation is one of M, =, X, D, N
bool is_reference_consuming() @property const nothrow @nogc {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
}
/// True iff operation is one of M, =, X
bool is_match_or_mismatch() @property const nothrow @nogc {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
}
/// True iff operation is one of 'S', 'H'
bool is_clipping() @property const nothrow @nogc {
return ((raw & 0xF) >> 1) == 2; // 4 or 5
}
void toSam(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
sink.write(length);
sink.write(type);
}
void toString(scope void delegate(const(char)[]) sink) const {
toSam(sink);
}
}
alias CigarOperation[] CigarOperations;
bool is_unavailable(CigarOperations cigar) @property nothrow @nogc {
return (cigar.length == 1 && cigar[0].raw == '*');
}
/// Forward range of extended CIGAR operations, with =/X instead of M
/// Useful for, e.g., detecting positions of mismatches.
struct ExtendedCigarRange(CigarOpRange, MdOpRange) {
static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation));
static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation));
private {