You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

376 lines
14 KiB

/*
This file is part of BioD.
Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
BioD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
BioD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
import bio.sam.reader;
import bio.sam.header;
import bio.bam.bgzf.blockrange;
import bio.bam.reader;
import bio.bam.output;
import bio.bam.md.reconstruct;
import bio.bam.pileuprange;
import bio.bam.baseinfo;
import bio.bam.validation.samheader;
import bio.bam.validation.alignment;
import bio.bam.utils.samheadermerger;
import bio.sam.utils.recordparser;
import bio.bam.serialization.sam;
import bio.core.utils.tmpfile;
import std.path;
import std.range;
import std.stdio;
import std.stream;
import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.math;
// End-to-end test of the BAM/SAM API against fixture files located in the
// "data" directory next to this source file. Sections, in order: header
// extraction, alignment parsing, tag parsing, error handling, random access,
// Value conversions, SAM round-tripping, BAM writing, SAM reading, pileup,
// and basesWith. The variables `fn` and `bf` are reassigned from section to
// section and some sections rely on the value left by an earlier one, so the
// order of the sections matters.
unittest {
writeln("Testing extracting SAM header...");
auto fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
auto bf = new BamReader(fn);
assert(bf.header.format_version == "1.3");
assert(bf.header.sorting_order == SortingOrder.coordinate);
assert(bf.header.sequences.length == 2);
assert(bf.header.getSequenceIndex("chr1") == 0);
assert(bf.header.sequences["chr2"].length == 1584);
// Second fixture: three reference sequences, no read groups.
fn = buildPath(dirName(__FILE__), "data", "bins.bam");
bf = new BamReader(fn);
assert(bf.header.sorting_order == SortingOrder.unknown);
assert(bf.header.sequences.length == 3);
assert(bf.header.read_groups.length == 0);
assert(bf.header.getSequenceIndex("large") == 2);
assert(bf.header.sequences["small"].length == 65536);
// Nested scope keeps `reads` and `read` local to the alignment-parsing checks.
{
writeln("Testing alignment parsing...");
fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
bf = new BamReader(fn);
auto reads = bf.reads;
auto read = reads.front;
// Field-by-field checks on the first read of the file.
assert(equal(read.sequence, "CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA"));
// Qualities are compared in their printable form (value + 33).
assert(equal(map!"cast(char)(a + 33)"(read.phred_base_quality),
"<<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<;"));
assert(bf.reference(read.ref_id).name == "chr1");
assert(read.read_name == "EAS56_57:6:190:289:82");
assert(read.flag == 69);
assert(read.position == 99);
assert(read.mapping_quality == 0);
// Advance to the third read and check its CIGAR and full SAM rendering.
reads.popFront();
reads.popFront();
assert(reads.front.cigarString() == "35M");
// NOTE(review): the expected line below separates fields with single spaces;
// SAM records are tab-delimited, so confirm the separators in this literal
// were not altered by whitespace mangling.
assert(toSam(reads.front, bf.reference_sequences) == "EAS51_64:3:190:727:308 99 chr1 103 99 35M = 263 195 GGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<::<<<844 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0");
assert(bf.header.getSequenceIndex("chr1") == read.ref_id);
}
// Requesting bf.reads again is expected to start from the first read,
// even though the `reads` range above was advanced.
assert(bf.reads.front.read_name == "EAS56_57:6:190:289:82");
writeln("Testing tag parsing...");
fn = buildPath(dirName(__FILE__), "data", "tags.bam");
bf = new BamReader(fn);
// Each read name is parsed as "tag_<TAG>:<expected>": the characters between
// the "tag_" prefix and the first ':' form the tag name, and the remainder
// must equal the SAM rendering of that tag's value.
foreach (alignment; bf.reads) {
auto read_name = alignment.read_name;
assert(read_name[0..4] == "tag_");
char[] tag;
read_name = read_name[4..$];
// Collect the tag name one character at a time up to the ':' separator.
while (read_name[0] != ':') {
tag ~= read_name[0];
read_name = read_name[1..$];
}
// Drop the ':' itself; what is left is the expected value.
read_name = read_name[1..$];
string value = toSam(alignment[tag.idup]);
// Print diagnostics before the assertion below fails.
if (read_name != value) {
writeln("tag: ", tag, "\tread_name: ", read_name, "\tvalue: ", value);
writeln("value bam_typeid: ", alignment[tag.idup].bam_typeid);
}
assert(read_name == value);
}
writeln("Testing exception handling...");
// The first three fixtures must be rejected while constructing the reader.
fn = buildPath(dirName(__FILE__), "data", "duplicated_block_size.bam");
assertThrown!BgzfException(new BamReader(fn));
fn = buildPath(dirName(__FILE__), "data", "no_block_size.bam");
assertThrown!BgzfException(new BamReader(fn));
fn = buildPath(dirName(__FILE__), "data", "wrong_extra_gzip_length.bam");
assertThrown!BgzfException(new BamReader(fn));
// For the remaining two fixtures the reads are fully traversed (reduce /
// walkLength) inside the assertThrown expression.
fn = buildPath(dirName(__FILE__), "data", "wrong_bc_subfield_length.bam");
assertThrown!BgzfException(reduce!"a+b.sequence_length"(0, (new BamReader(fn)).reads!withoutOffsets));
fn = buildPath(dirName(__FILE__), "data", "corrupted_zlib_archive.bam");
assertThrown!ZlibException(walkLength((new BamReader(fn)).reads));
writeln("Testing random access...");
fn = buildPath(dirName(__FILE__), "data", "bins.bam");
bf = new BamReader(fn);
// Checks that the region query bf["large"][beg .. end] yields the same reads
// as a linear scan over all reads that keeps those on reference "large"
// with position < end and position + basesCovered() > beg.
// On mismatch, prints the bounds and both lists of read names before failing.
void compareWithNaiveApproach(int beg, int end) {
auto refseq = array(bf["large"][beg .. end]);
auto naive = array(filter!((BamRead a) {
return a.ref_id != -1 &&
bf.reference(a.ref_id).name == "large" &&
a.position < end &&
a.position + a.basesCovered() > beg; })
(bf.reads!withoutOffsets));
if (!equal(naive, refseq)) {
writeln(beg);
writeln(end);
writeln(array(map!"a.read_name"(refseq)));
writeln(array(map!"a.read_name"(naive)));
}
assert(equal(refseq, naive));
}
// Hand-picked regions of varying start and width.
compareWithNaiveApproach(1400, 1500);
compareWithNaiveApproach( 10, 123);
compareWithNaiveApproach( 135, 1236);
compareWithNaiveApproach(1350, 3612);
compareWithNaiveApproach( 643, 1732);
compareWithNaiveApproach( 267, 1463);
compareWithNaiveApproach( 0, 30);
compareWithNaiveApproach(1363, 1612);
compareWithNaiveApproach( 361, 1231);
compareWithNaiveApproach( 322, 612);
compareWithNaiveApproach( 912, 938);
compareWithNaiveApproach( 0, 3000);
compareWithNaiveApproach( 0, 100);
compareWithNaiveApproach( 0, 1000);
compareWithNaiveApproach( 0, 1900);
compareWithNaiveApproach( 1, 279);
// 100-base windows every 50,000 bases, from 50,000 up to 950,000.
for (auto i = 50_000; i < 1_000_000; i += 50_000) {
compareWithNaiveApproach(i, i + 100);
}
// The read found at each reference's start virtual offset must be that
// reference's "r1" read.
{
auto fst_offset_tiny = bf["tiny"].startVirtualOffset();
auto fst_offset_small = bf["small"].startVirtualOffset();
auto fst_offset_large = bf["large"].startVirtualOffset();
auto fst_read_tiny = bf.getReadAt(fst_offset_tiny);
auto fst_read_small = bf.getReadAt(fst_offset_small);
auto fst_read_large = bf.getReadAt(fst_offset_large);
assert(fst_read_tiny.read_name == "tiny:r1:0..1:len1:bin4681:hexbin0x1249");
assert(fst_read_small.read_name == "small:r1:0..1:len1:bin4681:hexbin0x1249");
assert(fst_read_large.read_name == "large:r1:0..1:len1:bin4681:hexbin0x1249");
}
writeln("Testing Value code...");
// For each kind of payload: type predicate, SAM rendering, and comparisons
// against both the native value and its string form.
// Integer.
Value v = 5;
assert(v.is_integer);
assert(toSam(v) == "i:5");
assert(v == 5);
assert(v == "5");
assert(v != [1,2,3]);
// String.
v = "abc";
assert(v.is_string);
assert(toSam(v) == "Z:abc");
assert(v == "abc");
// Array of ints.
v = [1, 2, 3];
assert(v.is_numeric_array);
assert(toSam(v) == "B:i,1,2,3");
assert(v == [1,2,3]);
assert(v == "[1, 2, 3]");
// Array of floating-point numbers; compared approximately after conversion.
v = [1.5, 2.3, 17.0];
assert(v.is_numeric_array);
assert(toSam(v) == "B:f,1.5,2.3,17");
assert(approxEqual(to!(float[])(v), [1.5, 2.3, 17]));
// Single floating-point number.
v = 5.6;
assert(v.is_float);
assert(toSam(v) == "f:5.6");
assert(approxEqual(to!float(v), 5.6));
// Negative integer.
v = -17;
assert(v.is_signed);
assert(toSam(v) == "i:-17");
assert(v == -17);
assert(v == "-17");
// Unsigned integer.
v = 297u;
assert(v.is_unsigned);
assert(toSam(v) == "i:297");
assert(v == 297);
assert(v == "297");
// Array of shorts: rendered with the 's' element type.
short[] array_of_shorts = [4, 5, 6];
v = array_of_shorts;
assert(v.is_numeric_array);
assert(toSam(v) == "B:s,4,5,6");
assert(to!(short[])(v) == array_of_shorts);
assert(v == [4,5,6]);
assert(v == "[4, 5, 6]");
// Null assignment.
v = null;
assert(v.is_nothing);
// A string explicitly flagged as hexadecimal still compares equal to its text.
v = "0eabcf123";
v.setHexadecimalFlag();
assert(v.is_hexadecimal_string);
assert(v == "0eabcf123");
writeln("Test parseAlignmentLine/toSam functions...");
// Round trip: every read rendered with toSam and parsed back with
// parseAlignmentLine must compare equal to the original.
fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
bf = new BamReader(fn);
foreach (read; bf.reads) {
auto line = toSam(read, bf.reference_sequences);
auto read2 = parseAlignmentLine(line, bf.header);
// Print the offending read name before the assertion below fails.
if (read != read2) {
writeln(read.read_name);
}
assert(read == read2);
}
// Same round trip for tags.bam, except that a mismatch is tolerated for
// reads that isValid rejects.
fn = buildPath(dirName(__FILE__), "data", "tags.bam");
bf = new BamReader(fn);
foreach (read; bf.reads) {
auto line = toSam(read, bf.reference_sequences);
auto read2 = parseAlignmentLine(line, bf.header);
if (read != read2 && isValid(read)) {
writeln(read.read_name);
}
assert(read == read2 || !isValid(read));
}
writeln("Test BAM writing...");
fn = buildPath(dirName(__FILE__), "data", "ex1_header.bam");
bf = new BamReader(fn);
// Write all reads of ex1_header.bam to a temporary BAM file, then reopen
// that file and count the reads in it.
{
string tmp = tmpFile("12035913820619231129310.bam");
auto stream = new BufferedFile(tmp, FileMode.Out, 8192);
writeBAM(stream, bf.header.text, bf.reference_sequences, bf.reads!withoutOffsets, 9);
// NOTE(review): the file is read back below while `stream` is still open;
// presumably this seek is here to push buffered output to disk first --
// confirm before reordering or removing it.
stream.seekSet(0);
assert(walkLength((new BamReader(tmp)).reads!withoutOffsets) == 3270);
stream.close();
}
writeln("Test SAM reading...");
// `bf` is still the reader for ex1_header.bam opened in the previous section;
// the SAM fixture must yield the same sequence of reads.
{
auto sf = new SamReader(buildPath(dirName(__FILE__), "data", "ex1_header.sam"));
assert(sf.reads.front.ref_id == 0);
assert(equal(sf.reads, bf.reads!withoutOffsets));
}
writeln("Testing pileup (high-level aspects)...");
{
// All of pileup functions should automatically filter out unmapped reads.
// When reads in a range are aligned to different references,
// pileup objects should process only the first one.
// `fn` still names ex1_header.bam at this point.
bf = new BamReader(fn); // chr1, chr2
{
auto pileup = makePileup(bf.reads);
foreach (column; pileup) {
foreach (read; column.reads) {
assert(bf.reference_sequences[read.ref_id].name == "chr1");
assert(read.ref_id == column.ref_id);
assert(!read.is_unmapped);
}
}
}
// However, if pileupColumns is used, columns corresponding to chr1
// should come first, and after them -- those for chr2
{
auto columns = pileupColumns(bf.reads);
int current_ref_id = -1;
// [99 .. 1569] [1 .. 1567]
// Expected number of columns per reference; each visited column decrements
// its reference's counter, and both must end at exactly zero.
int[2] expected_columns = [1470, 1567];
foreach (column; columns) {
int ref_id = column.ref_id;
--expected_columns[ref_id];
// On the first column of each reference: reference ids must only increase,
// and the first read and starting position are checked.
if (ref_id != current_ref_id) {
assert(ref_id > current_ref_id);
switch (ref_id) {
case 0:
assert(column.reads.front.read_name == "EAS56_57:6:190:289:82");
assert(column.position == 99);
break;
case 1:
assert(column.reads.front.read_name == "B7_591:8:4:841:340");
assert(column.position == 0);
break;
default:
break;
}
current_ref_id = ref_id;
}
// Every read in a column must be mapped and belong to the column's reference.
if (!column.reads.empty) {
foreach (read; column.reads) {
assert(read.ref_id == ref_id);
assert(!read.is_unmapped);
}
}
}
assert(expected_columns == [0, 0]);
}
}
writeln("Testing basesWith functionality...");
{
fn = buildPath(dirName(__FILE__), "data", "mg1655_chunk.bam");
bf = new BamReader(fn);
// Flow order is taken from the first read group returned by the header.
auto flow_order = bf.header.read_groups.values.front.flow_order;
auto reads = array(bf.reads);
// The second read of the file, asserted to be on the forward strand.
auto read = reads[1];
assert(!read.is_reverse_strand);
auto basesFZ = basesWith!"FZ"(read, arg!"FZ"(flow_order));
assert(equal(basesFZ.save, read.sequence));
// Flow-call intensity values of the first 92 bases; the comment under each
// row lists the corresponding bases.
assert(equal(take(map!"a.flow_call.intensity_value"(basesFZ.save), 92),
[219, 219, 194, 194, 92, 107, 83, 198, 198, 78,
// A A C C T G A T T A
292, 292, 292, 81, 79, 78, 95, 99, 315, 315, 315,
// C C C A T C A G T T T
89, 79, 290, 290, 290, 100, 209, 209, 87, 80,
// G C G G G T G G C A
191, 191, 101, 179, 179, 210, 210, 99, 184, 184,
// C C A T T G G T A A
90, 91, 193, 193, 66, 100, 112, 79, 108, 106, 212, 212,
// C A C C A T G C A C A A
90, 96, 111, 94, 64, 94, 187, 187, 84, 110, 98, 102, 100,
// C T A C T C G G T G C T C
93, 89, 205, 205, 107, 98, 96, 91, 203, 203, 68, 180, 180,
// G C G G A C G A C C G T T
118, 246, 246, 91, 102, 94, 116, 90, 99, 101, 298, 298, 298
// C G G T G C T G C T G G G
]));
// bases must be the same
// Unmapped reads and reads without CIGAR operations are skipped. For
// reverse-strand reads the bases are expected in reversed sequence order.
foreach (r; reads) {
if (r.is_unmapped) continue;
if (r.cigar.length == 0) continue;
if (r.is_reverse_strand) {
basesFZ = basesWith!"FZ"(r, arg!"FZ"(flow_order));
assert(equal(basesFZ.save, retro(r.sequence)));
} else {
basesFZ = basesWith!"FZ"(r, arg!"FZ"(flow_order));
assert(equal(basesFZ.save, r.sequence));
}
}
}
}
/// Entry point with an intentionally empty body: the checks in this file
/// live in the `unittest` block, so `main` has nothing to do.
void main() {}