You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
406 lines
14 KiB
406 lines
14 KiB
import bio.std.hts.bam.reader;
|
|
import bio.std.hts.bam.pileup;
|
|
import std.stdio;
|
|
|
|
void main() {
|
|
|
|
auto bam = new BamReader("test/data/ex1_header.bam");
|
|
auto reads = bam["chr2"][150 .. 160]; // region chr2:149-158
|
|
auto pileup = makePileup(reads,
|
|
false, // reads don't contain MD tags
|
|
155, 158); // specify [start, end) interval
|
|
|
|
foreach (column; pileup) {
|
|
writeln("Reference position: ", column.position);
|
|
writeln(" Coverage: ", column.coverage);
|
|
writeln(" Reads:");
|
|
|
|
foreach (read; column.reads) {
|
|
writefln("%30s\t%s\t%.2d\t%s\t%2s/%2s\t%2s/%2s\t%10s\t%s %s",
|
|
read.name,
|
|
read.current_base,
|
|
read.current_base_quality,
|
|
read.cigar_operation,
|
|
read.cigar_operation_offset + 1, read.cigar_operation.length,
|
|
read.query_offset + 1, read.sequence.length,
|
|
read.cigarString(),
|
|
read.cigar_before, read.cigar_after);
|
|
}
|
|
}
|
|
|
|
import bio.std.hts.bam.cigar;
|
|
import bio.std.hts.bam.reader;
|
|
import bio.std.hts.bam.writer;
|
|
import bio.std.hts.sam.reader;
|
|
import bio.std.hts.sam.header;
|
|
import bio.std.hts.bam.md.core;
|
|
import bio.std.hts.bam.md.reconstruct;
|
|
import bio.std.hts.bam.pileup;
|
|
import bio.std.hts.bam.baseinfo;
|
|
import bio.std.hts.bam.validation.samheader;
|
|
import bio.std.hts.bam.validation.alignment;
|
|
import bio.std.hts.utils.samheadermerger;
|
|
import bio.std.hts.sam.utils.recordparser;
|
|
import bio.core.bgzf.block;
|
|
import bio.core.bgzf.inputstream;
|
|
import bio.core.bgzf.outputstream;
|
|
import bio.core.utils.roundbuf;
|
|
import bio.core.utils.range;
|
|
import bio.core.utils.tmpfile;
|
|
import bio.core.utils.stream;
|
|
import bio.core.sequence;
|
|
import bio.core.base;
|
|
import bio.core.tinymap;
|
|
import bio.core.utils.roundbuf;
|
|
|
|
import std.path;
|
|
import std.range;
|
|
import std.stdio;
|
|
// import undead.stream;
|
|
import std.algorithm;
|
|
import std.array;
|
|
import std.conv;
|
|
import std.exception;
|
|
import std.math;
|
|
import std.typetuple;
|
|
import std.regex;
|
|
|
|
auto fn = buildPath(dirName(__FILE__), "data", "tags.bam");
|
|
auto bf = new BamReader(fn);
|
|
foreach (read; bf.reads) {
|
|
auto line = read.to!string();
|
|
auto read2 = parseAlignmentLine(line, bf.header);
|
|
if (read != read2 && isValid(read)) {
|
|
stderr.writeln(read.name);
|
|
}
|
|
assert(read == read2 || !isValid(read));
|
|
}
|
|
|
|
fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
|
|
bf = new BamReader(fn);
|
|
auto readx = bf.reads;
|
|
readx.front;
|
|
readx.popFront();
|
|
readx.popFront();
|
|
|
|
{
|
|
fn = buildPath(dirName(__FILE__), "data", "b.sam");
|
|
auto sam = new SamReader(fn);
|
|
auto writer = new BamWriter("/dev/null", 0);
|
|
writer.writeSamHeader(sam.header);
|
|
writer.writeReferenceSequenceInfo(sam.reference_sequences);
|
|
foreach (r; sam.reads)
|
|
writer.writeRecord(r);
|
|
writer.finish();
|
|
}
|
|
{
|
|
fn = buildPath(dirName(__FILE__), "data", "bins.bam");
|
|
bf = new BamReader(fn);
|
|
|
|
void compareWithNaiveApproach(int beg, int end) {
|
|
|
|
auto refseq = array(bf["large"][beg .. end]);
|
|
|
|
auto naive = array(filter!((BamRead a) {
|
|
return a.ref_id != -1 &&
|
|
bf.reference(a.ref_id).name == "large" &&
|
|
a.position < end &&
|
|
a.position + a.basesCovered() > beg; })
|
|
(bf.reads!withoutOffsets));
|
|
if (!equal(naive, refseq)) {
|
|
stderr.writeln(beg);
|
|
stderr.writeln(end);
|
|
stderr.writeln(array(map!"a.name"(refseq)));
|
|
stderr.writeln(array(map!"a.name"(naive)));
|
|
}
|
|
assert(equal(refseq, naive));
|
|
}
|
|
|
|
|
|
// Time to kick in GC
|
|
import core.memory;
|
|
stderr.writeln("**** Calling GC");
|
|
GC.collect();
|
|
stderr.writeln("**** Past calling GC");
|
|
}
|
|
CigarOperation[] cigarFromString(string cigar) {
|
|
return match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOperation(m[1].to!uint, m[2].to!char)).array;
|
|
}
|
|
|
|
{
|
|
stderr.writeln("Running unittests...");
|
|
// stderr.writeln("Testing extracting SAM header...");
|
|
|
|
fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
|
|
bf = new BamReader(fn);
|
|
assert(bf.header.format_version == "1.3");
|
|
assert(bf.header.sorting_order == SortingOrder.coordinate);
|
|
assert(bf.header.sequences.length == 2);
|
|
assert(bf.header.getSequenceIndex("chr1") == 0);
|
|
assert(bf.header.sequences["chr2"].length == 1584);
|
|
|
|
fn = buildPath(dirName(__FILE__), "data", "bins.bam");
|
|
bf = new BamReader(fn);
|
|
assert(bf.header.sorting_order == SortingOrder.unknown);
|
|
assert(bf.header.sequences.length == 3);
|
|
assert(bf.header.read_groups.length == 0);
|
|
assert(bf.header.getSequenceIndex("large") == 2);
|
|
assert(bf.header.sequences["small"].length == 65536);
|
|
|
|
// Time to kick in GC
|
|
import core.memory;
|
|
stderr.writeln("**** Calling GC");
|
|
GC.collect();
|
|
stderr.writeln("**** Past calling GC");
|
|
|
|
{
|
|
// stderr.writeln("Testing alignment parsing...");
|
|
fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
|
|
bf = new BamReader(fn);
|
|
auto reads2 = bf.reads;
|
|
auto read = reads2.front;
|
|
assert(equal(read.sequence, "CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA"));
|
|
assert(equal(map!"cast(char)(a + 33)"(read.base_qualities),
|
|
"<<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<;"));
|
|
assert(bf.reference(read.ref_id).name == "chr1");
|
|
assert(read.name == "EAS56_57:6:190:289:82");
|
|
assert(read.flag == 69);
|
|
assert(read.position == 99);
|
|
assert(read.mapping_quality == 0);
|
|
reads2.popFront();
|
|
reads2.popFront();
|
|
assert(reads2.front.cigarString() == "35M");
|
|
assert(reads2.front.to!string() == "EAS51_64:3:190:727:308 99 chr1 103 99 35M = 263 195 GGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<::<<<844 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0");
|
|
assert(bf.header.getSequenceIndex("chr1") == read.ref_id);
|
|
}
|
|
|
|
assert(bf.reads.front.name == "EAS56_57:6:190:289:82");
|
|
stderr.writeln("**** Calling GCx");
|
|
GC.collect();
|
|
stderr.writeln("**** Past calling GCx");
|
|
|
|
// stderr.writeln("Testing tag parsing...");
|
|
fn = buildPath(dirName(__FILE__), "data", "tags.bam");
|
|
bf = new BamReader(fn);
|
|
foreach (alignment; bf.reads) {
|
|
auto name = alignment.name;
|
|
assert(name[0..4] == "tag_");
|
|
char[] tag;
|
|
name = name[4..$];
|
|
while (name[0] != ':') {
|
|
tag ~= name[0];
|
|
name = name[1..$];
|
|
}
|
|
name = name[1..$];
|
|
auto value = alignment[tag.idup].toSam();
|
|
if (name != value) {
|
|
stderr.writeln("tag: ", tag, "\tname: ", name, "\tvalue: ", value);
|
|
stderr.writeln("value bam_typeid: ", alignment[tag.idup].bam_typeid);
|
|
}
|
|
|
|
assert(name == value);
|
|
}
|
|
|
|
// stderr.writeln("Testing exception handling...");
|
|
fn = buildPath(dirName(__FILE__), "data", "duplicated_block_size.bam");
|
|
assertThrown!BgzfException(new BamReader(fn));
|
|
fn = buildPath(dirName(__FILE__), "data", "no_block_size.bam");
|
|
assertThrown!BgzfException(new BamReader(fn));
|
|
fn = buildPath(dirName(__FILE__), "data", "wrong_extra_gzip_length.bam");
|
|
assertThrown!BgzfException(new BamReader(fn));
|
|
fn = buildPath(dirName(__FILE__), "data", "wrong_bc_subfield_length.bam");
|
|
assertThrown!BgzfException(reduce!"a+b.sequence_length"(0, (new BamReader(fn)).reads!withoutOffsets));
|
|
fn = buildPath(dirName(__FILE__), "data", "corrupted_zlib_archive.bam");
|
|
import bio.core.utils.zlib;
|
|
assertThrown!ZlibException(walkLength((new BamReader(fn)).reads));
|
|
// stderr.writeln("Testing random access...");
|
|
|
|
fn = buildPath(dirName(__FILE__), "data", "bins.bam");
|
|
bf = new BamReader(fn);
|
|
|
|
|
|
// Time to kick in GC
|
|
import core.memory;
|
|
stderr.writeln("**** Calling GC");
|
|
GC.collect();
|
|
stderr.writeln("**** Past calling GC");
|
|
|
|
{
|
|
auto fst_offset_tiny = bf["tiny"].startVirtualOffset();
|
|
auto fst_offset_small = bf["small"].startVirtualOffset();
|
|
auto fst_offset_large = bf["large"].startVirtualOffset();
|
|
|
|
auto fst_read_tiny = bf.getReadAt(fst_offset_tiny);
|
|
auto fst_read_small = bf.getReadAt(fst_offset_small);
|
|
auto fst_read_large = bf.getReadAt(fst_offset_large);
|
|
|
|
assert(fst_read_tiny.name == "tiny:r1:0..1:len1:bin4681:hexbin0x1249");
|
|
assert(fst_read_small.name == "small:r1:0..1:len1:bin4681:hexbin0x1249");
|
|
assert(fst_read_large.name == "large:r1:0..1:len1:bin4681:hexbin0x1249");
|
|
}
|
|
|
|
// stderr.writeln("Testing Value code...");
|
|
Value v = 5;
|
|
assert(v.is_integer);
|
|
assert(v.toSam() == "i:5");
|
|
assert(v == 5);
|
|
assert(v == "5");
|
|
assert(v != [1,2,3]);
|
|
v = "abc";
|
|
assert(v.is_string);
|
|
assert(v.toSam() == "Z:abc");
|
|
assert(v == "abc");
|
|
v = [1, 2, 3];
|
|
assert(v.is_numeric_array);
|
|
assert(v.toSam() == "B:i,1,2,3");
|
|
assert(v == [1,2,3]);
|
|
assert(v == "[1, 2, 3]");
|
|
v = [1.5, 2.3, 17.0];
|
|
assert(v.is_numeric_array);
|
|
assert(v.toSam() == "B:f,1.5,2.3,17");
|
|
assert(approxEqual(to!(float[])(v), [1.5, 2.3, 17]));
|
|
v = 5.6;
|
|
assert(v.is_float);
|
|
assert(v.toSam() == "f:5.6");
|
|
assert(approxEqual(to!float(v), 5.6));
|
|
v = -17;
|
|
assert(v.is_signed);
|
|
assert(v.toSam() == "i:-17");
|
|
assert(v == -17);
|
|
assert(v == "-17");
|
|
v = 297u;
|
|
assert(v.is_unsigned);
|
|
assert(v.toSam() == "i:297");
|
|
assert(v == 297);
|
|
assert(v == "297");
|
|
|
|
short[] array_of_shorts = [4, 5, 6];
|
|
v = array_of_shorts;
|
|
assert(v.is_numeric_array);
|
|
assert(v.toSam() == "B:s,4,5,6");
|
|
assert(to!(short[])(v) == array_of_shorts);
|
|
assert(v == [4,5,6]);
|
|
assert(v == "[4, 5, 6]");
|
|
|
|
v = null;
|
|
assert(v.is_nothing);
|
|
|
|
v = "0eabcf123";
|
|
v.setHexadecimalFlag();
|
|
assert(v.is_hexadecimal_string);
|
|
assert(v == "0eabcf123");
|
|
|
|
// stderr.writeln("Testing parseAlignmentLine/toSam functions...");
|
|
fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
|
|
bf = new BamReader(fn);
|
|
foreach (read; bf.reads) {
|
|
auto line = read.to!string();
|
|
auto read2 = parseAlignmentLine(line, bf.header);
|
|
read2.associateWithReader(bf);
|
|
if (read != read2) {
|
|
stderr.writeln(read);
|
|
stderr.writeln(read2);
|
|
stderr.writeln(read.raw_data);
|
|
stderr.writeln(read2.raw_data);
|
|
}
|
|
assert(read == read2);
|
|
}
|
|
|
|
fn = buildPath(dirName(__FILE__), "data", "tags.bam");
|
|
bf = new BamReader(fn);
|
|
foreach (read; bf.reads) {
|
|
auto line = read.to!string();
|
|
auto read2 = parseAlignmentLine(line, bf.header);
|
|
if (read != read2 && isValid(read)) {
|
|
stderr.writeln(read.name);
|
|
}
|
|
assert(read == read2 || !isValid(read));
|
|
}
|
|
|
|
// stderr.writeln("Testing BAM writing...");
|
|
fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
|
|
bf = new BamReader(fn);
|
|
{
|
|
string tmp = tmpFile("12035913820619231129310.bam");
|
|
auto stream = new bio.core.utils.stream.File(tmp, "wb+");
|
|
|
|
auto writer = new BamWriter(stream);
|
|
|
|
writer.writeSamHeader(bf.header);
|
|
writer.writeReferenceSequenceInfo(bf.reference_sequences);
|
|
|
|
foreach (read; bf.reads)
|
|
writer.writeRecord(read);
|
|
|
|
writer.flush();
|
|
|
|
stream.seekSet(0);
|
|
assert(walkLength((new BamReader(stream)).reads) == 3270);
|
|
stream.close();
|
|
}
|
|
|
|
// stderr.writeln("Testing SAM reading...");
|
|
{
|
|
auto sf = new SamReader(buildPath(dirName(__FILE__), "data", "ex1_header.sam"));
|
|
assert(sf.reads.front.ref_id == 0);
|
|
assert(equal(sf.reads, bf.reads!withoutOffsets));
|
|
}
|
|
|
|
// stderr.writeln("Testing pileup (high-level aspects)...");
|
|
{
|
|
// All of pileup functions should automatically filter out unmapped reads.
|
|
|
|
// When reads in a range are aligned to different references,
|
|
// pileup objects should process only the first one.
|
|
bf = new BamReader(fn); // chr1, chr2
|
|
{
|
|
auto pileup2 = makePileup(bf.reads);
|
|
foreach (column; pileup2) {
|
|
foreach (read; column.reads) {
|
|
assert(bf.reference_sequences[read.ref_id].name == "chr1");
|
|
assert(read.ref_id == column.ref_id);
|
|
assert(!read.is_unmapped);
|
|
}
|
|
}
|
|
}
|
|
// However, if pileupColumns is used, columns corresponding to chr1
|
|
// should come first, and after them -- those for chr2
|
|
{
|
|
auto columns = pileupColumns(bf.reads);
|
|
int current_ref_id = -1;
|
|
|
|
// [99 .. 1569] [1 .. 1567]
|
|
int[2] expected_columns = [1470, 1567];
|
|
foreach (column; columns) {
|
|
int ref_id = column.ref_id;
|
|
--expected_columns[ref_id];
|
|
if (ref_id != current_ref_id) {
|
|
assert(ref_id > current_ref_id);
|
|
switch (ref_id) {
|
|
case 0:
|
|
assert(column.reads.front.name == "EAS56_57:6:190:289:82");
|
|
assert(column.position == 99);
|
|
break;
|
|
case 1:
|
|
assert(column.reads.front.name == "B7_591:8:4:841:340");
|
|
assert(column.position == 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
current_ref_id = ref_id;
|
|
}
|
|
if (!column.reads.empty) {
|
|
foreach (read; column.reads) {
|
|
assert(read.ref_id == ref_id);
|
|
assert(!read.is_unmapped);
|
|
}
|
|
}
|
|
}
|
|
assert(expected_columns == [0, 0]);
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
|