/*
    This file is part of BioD.
    Copyright (C) 2012    Artem Tarasov <lomereiter@gmail.com>

    Permission is hereby granted, free of charge, to any person obtaining a
    copy of this software and associated documentation files (the "Software"),
    to deal in the Software without restriction, including without limitation
    the rights to use, copy, modify, merge, publish, distribute, sublicense,
    and/or sell copies of the Software, and to permit persons to whom the
    Software is furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.
*/
/**
  Module for random access operations on a BAM file.
 */
module bio.bam.randomaccessmanager;

import bio.bam.constants;
import bio.bam.reader;
import bio.bam.read;
import bio.bam.readrange;
import bio.bam.baifile;
import bio.bam.bai.utils.algo;

import bio.core.bgzf.blockrange;
import bio.core.bgzf.virtualoffset;
import bio.core.bgzf.inputstream;

import bio.core.utils.memoize;
import bio.core.utils.range;
import bio.core.utils.stream;

import std.system;
import std.algorithm;
import std.array;
import std.range;
import std.traits;
import std.exception;
import std.parallelism;

// Needed by code below (std.file.getSize; Stream, EndianStream,
// BufferedStream), unless already re-exported by one of the imports above.
import std.file;
import std.stream;
// keeps task pool together with block
struct BgzfBlockAux {
    TaskPool task_pool;

    BgzfBlock block;
    alias block this;

    hash_t toHash() const pure @safe nothrow { return block.toHash(); }

    bool opEquals(const ref BgzfBlockAux other) pure @safe nothrow {
        return block == other.block;
    }

    int opCmp(const ref BgzfBlockAux other) const pure @safe nothrow {
        return block.opCmp(other.block);
    }
}

// BgzfBlockAux -> Task
auto decompressTask(BgzfBlockAux b) {
    auto t = task!decompressBgzfBlock(b.block);
    b.task_pool.put(t);
    return t;
}

// BgzfBlockAux -> Task; memoized with a 512-entry FIFO cache so that
// a recently seen block is not decompressed twice
private alias memoize!(decompressTask, 512,
                       FifoCache, BgzfBlockAux) memDecompressTask;

// (BgzfBlock, TaskPool) -> DecompressedBgzfBlock
auto decompressSerial(BT)(BT block_and_pool) {
    return decompress(block_and_pool).yieldForce();
}

// (BgzfBlock, TaskPool) -> Task
auto decompress(BT)(BT block_and_pool) {
    auto data = BgzfBlockAux(block_and_pool[1], block_and_pool[0]);
    return memDecompressTask(data);
}

// ([BgzfBlock], TaskPool) -> [DecompressedBgzfBlock]
auto parallelUnpack(BR)(BR bgzf_range, TaskPool pool, size_t n_threads = 0) {
    if (n_threads == 0)
        n_threads = max(pool.size(), 1);

    auto tasks = bgzf_range.zip(repeat(pool)).map!decompress();
    return tasks.prefetch(n_threads).map!"a.yieldForce()"();
}

debug {
    import std.stdio;
}

/// Class to which random access tasks are delegated.
class RandomAccessManager {

    void setTaskPool(TaskPool task_pool) {
        _task_pool = task_pool;
    }

    void setBufferSize(size_t buffer_size) {
        _buffer_size = buffer_size;
    }

    /// Constructs a new manager for a BAM file.
    this(string filename) {
        _filename = filename;
    }

    /// ditto
    this(BamReader reader) {
        _reader = reader;
        _filename = reader.filename;
    }

    /// Constructs a new manager with the given index file,
    /// which enables random-access interval queries.
    ///
    /// Params:
    ///     filename = location of BAM file
    ///     bai      = index file
    this(string filename, ref BaiFile bai) {
        _filename = filename;
        _bai = bai;
        _found_index_file = true;
    }

    /// ditto
    this(BamReader reader, ref BaiFile bai) {
        _reader = reader;
        _filename = reader.filename;
        _bai = bai;
        _found_index_file = true;
    }
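
    // Hedged construction sketch (added; kept out of compilation): the two
    // index-aware constructors. File names are placeholders, and BaiFile is
    // assumed to be constructible from a path, as elsewhere in BioD.
    version(none) unittest {
        auto bai = BaiFile("example.bam.bai");
        auto manager = new RandomAccessManager("example.bam", bai);
        // or, reusing an already open reader:
        auto bam = new BamReader("example.bam");
        auto manager2 = new RandomAccessManager(bam, bai);
    }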

    /// If the file ends with an EOF block, returns the virtual offset of the
    /// start of that block; otherwise returns the virtual offset of the
    /// physical end of the file.
    VirtualOffset eofVirtualOffset() const {
        ulong file_offset = std.file.getSize(_filename);
        if (hasEofBlock()) {
            return VirtualOffset(file_offset - BAM_EOF.length, 0);
        } else {
            return VirtualOffset(file_offset, 0);
        }
    }

    /// Returns true if the file ends with an EOF block, and false otherwise.
    bool hasEofBlock() const {
        auto _stream = new bio.core.utils.stream.File(_filename);
        if (_stream.size < BAM_EOF.length) {
            return false;
        }

        ubyte[BAM_EOF.length] buf;
        _stream.seekEnd(-cast(int)BAM_EOF.length);

        _stream.readExact(&buf, BAM_EOF.length);
        if (buf != BAM_EOF) {
            return false;
        }

        return true;
    }
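
    // Hedged usage sketch (added; kept out of compilation): the BGZF EOF
    // block is a special empty block appended to complete BAM files, so its
    // absence usually indicates a truncated file. Path is a placeholder.
    version(none) unittest {
        auto manager = new RandomAccessManager("example.bam");
        if (!manager.hasEofBlock()) {
            // likely truncated; eofVirtualOffset() then falls back to
            // the physical end of the file
        }
    }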

    /// Get a new IChunkInputStream starting from the specified virtual offset.
    IChunkInputStream createStreamStartingFrom(VirtualOffset offset, bool parallel=true) {
        auto _stream = new bio.core.utils.stream.File(_filename);
        auto _compressed_stream = new EndianStream(_stream, Endian.littleEndian);
        _compressed_stream.seekSet(cast(size_t)(offset.coffset));

        auto n_threads = parallel ? max(task_pool.size, 1) : 1;
        auto blocks = BgzfRange(_compressed_stream).parallelUnpack(task_pool, n_threads);

        static auto helper(R)(R decompressed_range, VirtualOffset offset) {
            // skip the first offset.uoffset bytes of the first decompressed block
            auto adjusted_front = AugmentedDecompressedBgzfBlock(decompressed_range.front,
                                                                 offset.uoffset, 0);
            decompressed_range.popFront();
            auto adjusted_range = chain(repeat(adjusted_front, 1),
                                        map!makeAugmentedBlock(decompressed_range));

            return cast(IChunkInputStream)makeChunkInputStream(adjusted_range);
        }

        return helper(blocks, offset);
    }

    /// Get a single read at the given virtual offset.
    /// A new stream is created on every call.
    BamRead getReadAt(VirtualOffset offset) {
        auto stream = createStreamStartingFrom(offset);
        return bamReadRange(stream, _reader).front.dup;
    }

    /// Get the BGZF block at the given file offset.
    BgzfBlock getBgzfBlockAt(ulong offset) {
        auto stream = new bio.core.utils.stream.File(_filename);
        stream.seekSet(offset);
        return BgzfRange(stream).front;
    }

    /// Get reads between two virtual offsets. The first virtual offset must
    /// point to the start of an alignment record.
    ///
    /// The manager's $(D task_pool) (the global taskPool by default) is used
    /// for parallel decompression.
    auto getReadsBetween(VirtualOffset from, VirtualOffset to) {
        IChunkInputStream stream = createStreamStartingFrom(from);

        static bool offsetTooBig(BamReadBlock record, VirtualOffset vo) {
            return record.end_virtual_offset > vo;
        }

        return until!offsetTooBig(bamReadRange!withOffsets(stream, _reader), to);
    }
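
    // Hedged usage sketch (added; kept out of compilation): iterating all
    // reads from a known record boundary up to the EOF block. The starting
    // offset below is a placeholder and must point at a real record start.
    version(none) unittest {
        auto bam = new BamReader("example.bam");
        auto manager = new RandomAccessManager(bam);
        auto start = VirtualOffset(0, 0); // hypothetical record boundary
        foreach (read; manager.getReadsBetween(start, manager.eofVirtualOffset())) {
            // withOffsets policy: each element also carries its
            // start/end virtual offsets
        }
    }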

    bool found_index_file() @property {
        return _found_index_file;
    }
    private bool _found_index_file = false; // set to true in constructors that take an index file

    /// BAI file
    ref const(BaiFile) getBai() const {
        return _bai;
    }

    /// Get BAI chunks containing all alignment records overlapping the specified region.
    Chunk[] getChunks(int ref_id, int beg, int end) {
        enforce(found_index_file, "BAM index file (.bai) must be provided");
        enforce(ref_id >= 0 && ref_id < _bai.indices.length, "Invalid reference sequence index");

        // Select all bins that overlap with [beg, end).
        // From those bins, select all chunks that end to the right of min_offset.
        // Sort these chunks by leftmost coordinate; overlaps between them are
        // removed later via nonOverlappingChunks.

        auto min_offset = _bai.indices[ref_id].getMinimumOffset(beg);

        Chunk[] bai_chunks;
        foreach (b; _bai.indices[ref_id].bins) {
            if (!b.canOverlapWith(beg, end)) {
                continue;
            }

            foreach (chunk; b.chunks) {
                if (chunk.end > min_offset) {
                    bai_chunks ~= chunk;

                    // optimization: no record before min_offset can overlap
                    // the region, so clamp the chunk start to it
                    if (bai_chunks[$-1].beg < min_offset) {
                        bai_chunks[$-1].beg = min_offset;
                    }
                }
            }
        }

        sort(bai_chunks);

        return bai_chunks;
    }
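
    // Hedged usage sketch (added; kept out of compilation): listing the BAI
    // chunks that may contain reads overlapping [1000, 2000) of reference 0.
    // File names are placeholders.
    version(none) unittest {
        auto bai = BaiFile("example.bam.bai");
        auto manager = new RandomAccessManager("example.bam", bai);
        foreach (chunk; manager.getChunks(0, 1000, 2000)) {
            // chunk.beg / chunk.end are virtual offsets into the BAM file
        }
    }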

    /// Fetch alignments with the given reference sequence id, overlapping [beg..end)
    auto getReads(alias IteratePolicy=withOffsets)(int ref_id, uint beg, uint end) {
        auto chunks = array(nonOverlappingChunks(getChunks(ref_id, beg, end)));

        debug {
            /*
            import std.stdio;
            writeln("[random access] chunks:");
            writeln("    ", chunks);
            */
        }

        // General plan:
        //
        // chunk[0] -> bgzfRange[0] |
        // chunk[1] -> bgzfRange[1] |  (2)
        //     ...                  | -> (joiner(bgzfRange), [start/end v.o.])
        // chunk[k] -> bgzfRange[k] |               |
        //        (1)    /* parallel */             V  (3)
        //                          (unpacked blocks, [start/end v.o.])
        //                                          |
        //                                          V  (4)
        //                            (modified unpacked blocks)
        //                                          |
        //                                          V  (5)
        //                                  IChunkInputStream
        //                                          |
        //                                          V  (6)
        //                         filter out non-overlapping records
        //                                          |
        //                                          V
        //                                     that's it!

        auto bgzf_range = getJoinedBgzfRange(chunks);                            // (2)
        auto decompressed_blocks = getUnpackedBlocks(bgzf_range, task_pool);     // (3)
        auto augmented_blocks = getAugmentedBlocks(decompressed_blocks, chunks); // (4)
        IChunkInputStream stream = makeChunkInputStream(augmented_blocks);       // (5)
        auto reads = bamReadRange!IteratePolicy(stream, _reader);
        return filterBamReads(reads, ref_id, beg, end);                          // (6)
    }
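
    // Hedged usage sketch (added; kept out of compilation): the main entry
    // point for region queries. Stage (6) guarantees that only reads truly
    // overlapping [beg, end) come out. File names are placeholders.
    version(none) unittest {
        auto bam = new BamReader("example.bam");
        auto bai = BaiFile("example.bam.bai");
        auto manager = new RandomAccessManager(bam, bai);
        foreach (read; manager.getReads(0, 1000, 2000)) {
            // read.position and read.basesCovered() bound the overlap
        }
    }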

private:

    string _filename;
    BaiFile _bai;
    BamReader _reader;
    TaskPool _task_pool;
    size_t _buffer_size;

    TaskPool task_pool() @property {
        if (_task_pool is null)
            _task_pool = taskPool;
        return _task_pool;
    }

public:

    // Let's implement the plan described above!

    // (1) : (Chunk, Stream) -> [BgzfBlock]
    static struct ChunkToBgzfRange {
        static bool offsetTooBig(BgzfBlock block, ulong offset) {
            return block.start_offset > offset;
        }

        private {
            Chunk _chunk;
            Stream _stream;
            bool _init = false;
            Until!(offsetTooBig, BgzfRange, ulong) _range;
        }

        this(Chunk chunk, Stream stream) {
            _chunk = chunk;
            _stream = stream;
        }

        auto front() @property { init(); return _range.front; }
        void popFront() { init(); _range.popFront(); }
        bool empty() @property { init(); return _range.empty; }

        // seek to the chunk start lazily, on first use: all chunk ranges
        // share a single underlying stream, so seeking eagerly in the
        // constructor would clobber the position of other ranges
        private void init() {
            if (!_init) {
                _stream.seekSet(cast(size_t)_chunk.beg.coffset);
                _range = until!offsetTooBig(BgzfRange(_stream),
                                            _chunk.end.coffset);
                _init = true;
            }
        }
    }

    // (2) : Chunk[] -> [BgzfBlock]
    auto getJoinedBgzfRange(Chunk[] bai_chunks) {
        Stream file = new bio.core.utils.stream.File(_filename);
        Stream stream = new BufferedStream(file, _buffer_size);

        ChunkToBgzfRange[] bgzf_ranges;
        bgzf_ranges.length = bai_chunks.length;
        foreach (i, ref range; bgzf_ranges) {
            range = ChunkToBgzfRange(bai_chunks[i], stream);
        }

        auto bgzf_blocks = joiner(bgzf_ranges);
        return bgzf_blocks;
    }

    // (3) : ([BgzfBlock], TaskPool) -> [DecompressedBgzfBlock]
    static auto getUnpackedBlocks(R)(R bgzf_range, TaskPool pool) {
        version(serial) {
            return bgzf_range.parallelUnpack(pool, 1);
        } else {
            return bgzf_range.parallelUnpack(pool);
        }
    }

    // (4) : ([DecompressedBgzfBlock], Chunk[]) -> [AugmentedDecompressedBgzfBlock]
    //
    // decompressed blocks:
    // [.....][......][......][......][......][......][.....][....]
    //
    // what we need (chunks):
    //    [.........]   [.........]      [...........]     [..]
    //
    // Solution: augment decompressed blocks with skip_start and skip_end members,
    // and teach ChunkInputStream to deal with ranges of such blocks.
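    //
    // Worked illustration (added; numbers are hypothetical): suppose a chunk
    // spans [VirtualOffset(c0, 100) .. VirtualOffset(c2, 40)) over blocks
    // starting at compressed offsets c0, c1, c2. Then the block at c0 gets
    // skip_start = 100, the block at c2 gets
    // skip_end = decompressed_data.length - 40, and the block at c1 passes
    // through untouched.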
    static struct AugmentedBlockRange(R) {
        this(R blocks, Chunk[] bai_chunks) {
            _blocks = blocks;
            if (_blocks.empty) {
                _empty = true;
            } else {
                _cur_block = _blocks.front;
                _blocks.popFront();
            }
            _chunks = bai_chunks[];
        }

        bool empty() @property {
            return _empty;
        }

        AugmentedDecompressedBgzfBlock front() @property {
            AugmentedDecompressedBgzfBlock result;
            result.block = _cur_block;

            if (_chunks.empty) {
                return result;
            }

            if (beg.coffset == result.start_offset) {
                result.skip_start = beg.uoffset;
            }

            if (end.coffset == result.start_offset) {
                auto to_skip = result.decompressed_data.length - end.uoffset;
                assert(to_skip <= ushort.max);
                result.skip_end = cast(ushort)to_skip;
            }

            return result;
        }

        void popFront() {
            if (_cur_block.start_offset == end.coffset) {
                _chunks = _chunks[1 .. $];
            }

            if (_blocks.empty) {
                _empty = true;
                return;
            }

            _cur_block = _blocks.front;
            _blocks.popFront();
        }

        private {
            R _blocks;
            ElementType!R _cur_block;
            bool _empty;

            Chunk[] _chunks;

            VirtualOffset beg() @property {
                return _chunks[0].beg;
            }

            VirtualOffset end() @property {
                return _chunks[0].end;
            }
        }
    }

    static auto getAugmentedBlocks(R)(R decompressed_blocks, Chunk[] bai_chunks) {
        return AugmentedBlockRange!R(decompressed_blocks, bai_chunks);
    }

    static struct BamReadFilter(R) {
        this(R r, int ref_id, uint beg, uint end) {
            _range = r;
            _ref_id = ref_id;
            _beg = beg;
            _end = end;
            findNext();
        }

        bool empty() @property {
            return _empty;
        }

        ElementType!R front() @property {
            return _current_read;
        }

        void popFront() {
            _range.popFront();
            findNext();
        }

    private:
        R _range;
        int _ref_id;
        uint _beg;
        uint _end;
        bool _empty;
        ElementType!R _current_read;

        void findNext() {
            if (_range.empty) {
                _empty = true;
                return;
            }

            while (!_range.empty) {
                _current_read = _range.front;

                // BamReads are sorted first by ref. ID.
                auto current_ref_id = _current_read.ref_id;
                if (current_ref_id > _ref_id) {
                    // no more records for this _ref_id
                    _empty = true;
                    return;
                } else if (current_ref_id < _ref_id) {
                    // skip reads referring to sequences
                    // with ID less than ours
                    _range.popFront();
                    continue;
                }

                if (_current_read.position >= _end) {
                    _empty = true;
                    // As reads are sorted by leftmost coordinate,
                    // none of the remaining alignments in _range
                    // can overlap the interval either.
                    //
                    //                   [-----)
                    //                   . [-----------)
                    //                   .   [---)
                    //                   .     [-------)
                    //                   .  [-)
                    //    [beg ..... end)
                    return;
                }

                if (_current_read.position > _beg) {
                    return; // definitely overlaps
                }

                if (_current_read.position +
                    _current_read.basesCovered() <= _beg)
                {
                    /// ends before the beginning of the region:
                    ///  [-----------)
                    ///               [beg .......... end)
                    _range.popFront();
                    /// Zero-length reads are also considered non-overlapping,
                    /// which is why the comparison with _beg above is strict.
                } else {
                    return; /// _current_read overlaps the region
                }
            }
            _empty = true;
        }
    }

    // Takes a range of alignments sorted by leftmost coordinate,
    // together with an interval [beg, end),
    // and returns a range of only those alignments that overlap the region.
    static auto filterBamReads(R)(R r, int ref_id, uint beg, uint end)
    {
        return BamReadFilter!R(r, ref_id, beg, end);
    }
}