Browse Source

Merge branch 'master' of git://github.com/biod/BioD

bio2
lomereiter 9 years ago
parent
commit
88801851f7
  1. 24
      bio/bam/bai/bin.d
  2. 41
      bio/bam/baifile.d
  3. 4
      bio/bam/randomaccessmanager.d
  4. 55
      bio/bam/read.d
  5. 130
      bio/newick/parser.d
  6. 165
      bio/newick/tree.d
  7. 17
      bio/newick/treenode.d

24
bio/bam/bai/bin.d

@ -51,6 +51,30 @@ struct Bin {
bool is_leaf() @property const nothrow {
return id > BAI_MAX_NONLEAF_BIN_ID;
}
/// Check if bin can overlap with a region
bool canOverlapWith(int begin, int end) const nothrow {
if (id == 0) return true;
if (id > BAI_MAX_BIN_ID) return false;
/// The following code is based on reg2bins() function
if (begin < 0) begin = 0;
auto magic_number = 4681;
auto b = begin >> 14;
auto e = end >> 14;
while (true) {
auto delta = id - magic_number;
if (b <= delta && delta <= e) return true;
magic_number >>= 3;
if (magic_number == 0) return false;
b >>= 3;
e >>= 3;
}
}
}
/// Returns bin number for [beg, end) interval (zero-based).

41
bio/bam/baifile.d

@ -40,8 +40,8 @@ import std.path;
/// Represents index for a single reference
struct Index {
/// Information about bins
Bin[uint] bins;
Bin[] bins;
/// Virtual file offsets of first alignments overlapping 16384-byte windows
/// on the reference sequence. This linear index is used to reduce amount
/// of file seeks for region queries, since with its help one can reduce the
@ -79,22 +79,6 @@ struct Index {
auto min_offset = (_i == -1) ? VirtualOffset(0) : ioffsets[_i];
return min_offset;
}
/// Range of bins that overlap interval [beg, end)
auto getBins(uint beg, uint end) {
assert(beg < end);
if (end >= 1u<<29) end = 1u<<29;
--end;
return chain(repeat(0).takeOne(),
iota(1 + (beg >> 26), 2 + (end >> 26)),
iota(9 + (beg >> 23), 10 + (end >> 23)),
iota(73 + (beg >> 20), 74 + (end >> 20)),
iota(585 + (beg >> 17), 586 + (end >> 17)),
iota(4681 + (beg >> 14), 4682 + (end >> 14)))
.zip(bins.repeat())
.map!"a[0] in a[1]"()
.filter!"a !is null"();
}
}
struct BaiFile {
@ -144,38 +128,39 @@ private:
int n_ref;
_stream.read(n_ref);
indices.length = n_ref;
indices = uninitializedArray!(Index[])(n_ref);
foreach (i; 0 .. n_ref) {
int n_bin;
int n_bin = void;
_stream.read(n_bin);
indices[i].bins = uninitializedArray!(Bin[])(n_bin);
foreach (j; 0 .. n_bin) {
uint id;
uint id = void;
_stream.read(id);
auto bin = Bin(id);
int n_chunk;
int n_chunk = void;
_stream.read(n_chunk);
bin.chunks.length = n_chunk;
bin.chunks = uninitializedArray!(Chunk[])(n_chunk);
foreach (k; 0 .. n_chunk) {
ulong tmp;
ulong tmp = void;
_stream.read(tmp);
bin.chunks[k].beg = VirtualOffset(tmp);
_stream.read(tmp);
bin.chunks[k].end = VirtualOffset(tmp);
}
indices[i].bins[id] = bin;
indices[i].bins[j] = bin;
}
int n_intv;
int n_intv = void;
_stream.read(n_intv);
indices[i].ioffsets.length = n_intv;
indices[i].ioffsets = uninitializedArray!(VirtualOffset[])(n_intv);
foreach (j; 0 .. n_intv) {
ulong tmp;
ulong tmp = void;
_stream.read(tmp);
indices[i].ioffsets[j] = VirtualOffset(tmp);
}

4
bio/bam/randomaccessmanager.d

@ -246,7 +246,9 @@ class RandomAccessManager {
auto min_offset = _bai.indices[ref_id].getMinimumOffset(beg);
Chunk[] bai_chunks;
foreach (b; _bai.indices[ref_id].getBins(beg, end)) {
foreach (b; _bai.indices[ref_id].bins) {
if (!b.canOverlapWith(beg, end))
continue;
foreach (chunk; b.chunks) {
if (chunk.end > min_offset) {

55
bio/bam/read.d

@ -147,6 +147,11 @@ struct CigarOperation {
return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
}
/// True iff operation is one of 'S', 'H'
bool is_clipping() @property const {
return ((raw & 0xF) >> 1) == 2; // 4 or 5
}
private void toSam(Sink)(auto ref Sink sink) const
if (isSomeSink!Sink)
{
@ -779,6 +784,7 @@ struct BamRead {
BamRead result;
result._chunk = this._chunk.dup;
result._is_slice = false;
result._modify_in_place = false;
result._reader = cast()_reader;
return result;
}
@ -1041,7 +1047,17 @@ struct BamRead {
bio.bam.abstractreader.IBamSamReader reader() @property {
return _reader;
}
///
bool is_slice_backed() @property const {
return _is_slice;
}
/// Raw representation of the read. Occasionally useful for dirty hacks!
inout(ubyte)[] raw_data() @property inout {
return _chunk;
}
package ubyte[] _chunk; // holds all the data,
// the access is organized via properties
// (see below)
@ -1054,11 +1070,22 @@ private:
// (Of course this places some restrictions on usage,
// but allows to reduce size of record.)
bool _is_slice() @property const {
return cast(bool)_chunk[_cigar_offset - 1];
return cast(bool)(_chunk[_cigar_offset - 1] & 1);
}
void _is_slice(bool is_slice) @property {
_chunk[_cigar_offset - 1] = is_slice ? 1 : 0;
_chunk[_cigar_offset - 1] &= 0b11111110;
_chunk[_cigar_offset - 1] |= (is_slice ? 1 : 0);
}
// don't call _dup() if the record is modified
bool _modify_in_place() @property const {
return cast(bool)(_chunk[_cigar_offset - 1] & 2);
}
void _modify_in_place(bool in_place) @property {
_chunk[_cigar_offset - 1] &= 0b11111101;
_chunk[_cigar_offset - 1] |= (in_place ? 2 : 0);
}
IBamSamReader _reader;
@ -1178,7 +1205,7 @@ private:
// may point to the same location, but every modified one allocates its
// own chunk of memory.
void _dup() {
if (_is_slice) {
if (_is_slice && !_modify_in_place) {
_chunk = _chunk.dup;
_is_slice = false;
}
@ -1690,3 +1717,23 @@ bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
static assert(isTwoWayCompatible!(compareReadNames, BamRead, string));
static assert(isTwoWayCompatible!(compareCoordinates, BamRead, int));
/// Allows modification of the read in-place even if it's slice-backed.
struct UniqueRead(R) {
R read;
alias read this;
this(R read) {
this.read = read;
this.read._modify_in_place = true;
}
~this() {
this.read._modify_in_place = false;
}
}
/// ditto
auto assumeUnique(R)(auto ref R read) if (isBamRead!R) {
return UniqueRead!R(read);
}

130
bio/newick/parser.d

@ -1,130 +0,0 @@
module bio.newick.parser;
import bio.newick.treenode;
import std.exception;
import std.format;
import std.ascii;
import std.array;
class NewickParseException : Exception {
this(string msg) {
super(msg);
}
}
NewickNode* parse(string s) {
string str = s;
return parseTree(str);
}
private {
bool isValidCharacter(dchar c) {
return isAlphaNum(c) || c == '_';
}
void skipWhiteSpace(ref string s) {
while (true) {
if (s.empty)
throw new NewickParseException("String is empty");
if (isWhite(s.front))
s = s[1 .. $];
else
break;
}
}
NewickNode* parseTree(ref string s) {
skipWhiteSpace(s);
auto ch = s.front;
if (ch == ';')
return new NewickNode;
auto result = parseSubtree(s);
enforce(s.front == ';', "String does not end with ';'");
return result;
}
NewickNode* parseSubtree(ref string s) {
skipWhiteSpace(s);
auto ch = s.front;
switch (ch) {
case '(':
return parseInternalNode(s);
case ',':
case ')':
return new NewickNode;
default:
enforce(isValidCharacter(ch), "Node name contains invalid character");
return parseLeafNode(s);
}
}
NewickNode* parseLeafNode(ref string s, NewickNode* nd=null) {
skipWhiteSpace(s);
auto ch = s.front;
auto node = nd is null ? (new NewickNode) : nd;
size_t i = 0;
while (isValidCharacter(s[i])) {
++i;
}
if (i > 0)
node.name = s[0 .. i];
s = s[i .. $];
ch = s.front;
if (ch == ':')
formattedRead(s, ":%s", &node.distance_to_parent);
return node;
}
NewickNode* parseInternalNode(ref string s) {
skipWhiteSpace(s);
auto ch = s.front;
assert(ch == '(');
s.popFront();
auto node = new NewickNode;
auto children = Appender!(NewickNode*[])();
while (true) {
auto child = parseSubtree(s);
child.parent = node;
children.put(child);
ch = s.front;
if (ch == ',') {
s.popFront();
continue;
} else if (ch == ')') {
s.popFront();
break;
} else {
throw new NewickParseException("Parse error");
}
}
node.children = children.data;
parseLeafNode(s, node);
return node;
}
}

165
bio/newick/tree.d

@ -1,165 +0,0 @@
module bio.newick.tree;
import bio.newick.parser;
import bio.newick.treenode;
import std.algorithm;
import std.range;
/**
Newick tree
*/
class NewickTree {
private {
NewickNode* _root;
}
/** Root node, may be null */
NewickNode* root() @property {
return _root;
}
/** Creates a tree from its string representation */
static NewickTree fromString(string str) {
auto tree = new NewickTree();
tree._root = bio.newick.parser.parse(str);
tree._fillDictionary();
return tree;
}
/** Creates a tree with a given root node */
static NewickTree fromRootNode(NewickNode* root) {
auto tree = new NewickTree();
tree._root = root;
tree._fillDictionary();
return tree;
}
/** Dictionary of nodes */
NewickNode*[string] nodes;
/** Distance between two nodes. Complexity is O(tree depth). */
double distance(NewickNode* n1, NewickNode* n2) {
// FIXME: allocate if necessary
NewickNode*[1024] n1_path; // n1, n1.parent, ..., except root
NewickNode*[1024] n2_path; // n2, n2.parent, ..., except root
int i1, i2;
while (n1 != root) {
n1_path[i1++] = n1;
n1 = n1.parent;
}
while (n2 != root) {
n2_path[i2++] = n2;
n2 = n2.parent;
}
for (--i1, --i2;
i1 >= 0 && i2 >= 0 && n1_path[i1] == n2_path[i2];
--i1, --i2) {}
return reduce!"a+b.distance_to_parent"(0.0, chain(n1_path[0 .. i1 + 1], n2_path[0 .. i2 + 1]));
}
/** ditto */
double distance(string n1, string n2) {
return distance(nodes[n1], nodes[n2]);
}
private {
void _fillDictionary() {
_fillDictRecur(root);
}
void _fillDictRecur(NewickNode* node) {
if (node !is null) {
if (node.name !is null)
nodes[node.name] = node;
foreach (child; node.children) {
_fillDictRecur(child);
}
}
}
}
}
unittest {
import std.math;
auto tree = NewickTree.fromString("(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);");
assert(tree.root !is null);
assert(tree.root.name is null);
assert(approxEqual(tree.distance("A", "B"), 0.3));
assert(approxEqual(tree.distance("B", "C"), 1.0));
assert(approxEqual(tree.distance("C", "D"), 0.7));
assert(approxEqual(tree.distance("A", "C"), 0.9));
assert(approxEqual(tree.distance("B", "D"), 1.1));
assert(approxEqual(tree.distance("A", "D"), 1.0));
assert(tree.root.children.length == 3);
assert(tree.root.children[0].name == "A");
assert(approxEqual(tree.root.children[0].distance_to_parent, 0.1));
assert(tree.root.children[1].name == "B");
assert(approxEqual(tree.root.children[1].distance_to_parent, 0.2));
auto child = tree.root.children[2];
assert(child !is null);
assert(child.parent == tree.root);
assert(child.name is null);
assert(approxEqual(child.distance_to_parent, 0.5));
assert(child.children.length == 2);
assert(child.children[0].name == "C");
assert(approxEqual(child.children[0].distance_to_parent, 0.3));
assert(child.children[1].name == "D");
assert(approxEqual(child.children[1].distance_to_parent, 0.4));
tree = NewickTree.fromString("(A,B,(C,D));");
assert(tree.root !is null);
assert(tree.root.children.length == 3);
assert(tree.root.children[0].name == "A");
assert(tree.root.children[1].name == "B");
assert(tree.root.children[2].name is null);
assert(tree.root.children[2].children[0].name == "C");
assert(tree.root.children[2].children[1].name == "D");
assert(tree.nodes["D"].parent.children[0].name == "C");
tree = NewickTree.fromString("((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;");
assert(tree.root.name == "A");
assert(approxEqual(tree.distance("A", "B"), 0.3));
assert(approxEqual(tree.distance("B", "C"), 1.0));
assert(approxEqual(tree.distance("C", "D"), 0.7));
assert(approxEqual(tree.distance("A", "C"), 0.9));
assert(approxEqual(tree.distance("B", "D"), 1.1));
assert(approxEqual(tree.distance("A", "D"), 1.0));
assert(tree.root.children.length == 1);
child = tree.root.children[0];
assert(child.parent == tree.root);
assert(child.name == "F");
assert(approxEqual(child.distance_to_parent, 0.1));
assert(child.children.length == 2);
assert(child.children[0].name == "B");
assert(approxEqual(child.children[0].distance_to_parent, 0.2));
child = child.children[1];
assert(child.name == "E");
assert(approxEqual(child.distance_to_parent, 0.5));
assert(child.children.length == 2);
assert(child.children[0].name == "C");
assert(approxEqual(child.children[0].distance_to_parent, 0.3));
assert(child.children[1].name == "D");
assert(approxEqual(child.children[1].distance_to_parent, 0.4));
tree = NewickTree.fromString("(,,(,));");
assert(tree.root !is null);
assert(tree.root.children.length == 3);
assert(tree.root.children[0].name is null);
assert(tree.root.children[0].children.length == 0);
assert(tree.root.children[1].name is null);
assert(tree.root.children[1].children.length == 0);
child = tree.root.children[2];
assert(child.parent == tree.root);
assert(child.name is null);
assert(child.children.length == 2);
assert(child.children[0].name is null);
assert(child.children[1].name is null);
}

17
bio/newick/treenode.d

@ -1,17 +0,0 @@
module bio.newick.treenode;
/** Node in a Newick tree */
struct NewickNode {
/** Parent node (null if node is root) */
NewickNode* parent;
/** Node name (null if node is not named) */
string name;
/** Distance to parent node */
double distance_to_parent = 1.0;
/** Child nodes */
NewickNode*[] children;
}
Loading…
Cancel
Save