/*
    This file is part of BioD.
    Copyright (C) 2012    Artem Tarasov <lomereiter@gmail.com>

    Permission is hereby granted, free of charge, to any person obtaining a
    copy of this software and associated documentation files (the "Software"),
    to deal in the Software without restriction, including without limitation
    the rights to use, copy, modify, merge, publish, distribute, sublicense,
    and/or sell copies of the Software, and to permit persons to whom the
    Software is furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.
*/
/// $(P This module is used for iterating over columns of alignment.)
/// $(P The function makePileup is called on
/// a range of coordinate-sorted reads mapped to the same reference.
/// It returns an input range of columns.)
/// $(P This returned range can then be iterated with $(D foreach).
/// The first column is located at the same position on the reference
/// as the first base of the first read.
/// $(BR)
/// Each $(D popFront) operation advances the current position on the
/// reference. The default behaviour is to exclude sites with zero coverage
/// from the iteration.)
/// $(P Each column keeps a set of reads that overlap the corresponding position
/// on the reference.
/// If reads contain MD tags, and makePileup was asked
/// to use them, the reference base at the column is also available.)
/// $(BR)
/// Each read preserves all standard read properties
/// but also keeps column-related information, namely
/// <ul>
///     $(LI number of bases consumed from the read sequence so far)
///     $(LI current CIGAR operation and offset in it)
///     $(LI all CIGAR operations before and after the current one)</ul>
/// $(BR)
/// It is clear from the above that the current CIGAR operation cannot be an insertion.
/// The following are suggested ways to check for adjacent insertions:
/// <ul>
///     $(LI $(D cigar_after.length > 0 &&
///              cigar_operation_offset == cigar_operation.length - 1 &&
///              cigar_after[0].type == 'I'))
///     $(LI $(D cigar_before.length > 0 &&
///              cigar_operation_offset == 0 &&
///              cigar_before[$ - 1].type == 'I'))</ul>
/// $(BR)
/// Example:
/// ---------------------------------------------------------
/// import bio.bam.reader, bio.bam.pileup, std.stdio, std.algorithm : count;
///
/// void main() {
///     auto bam = new BamReader("file.bam");       // assume single reference and MD tags
///     auto pileup = bam.reads().makePileup(true); // use MD tags
///     foreach (column; pileup) {
///         auto matches = column.bases.count(column.reference_base);
///         if (matches < column.coverage * 2 / 3)
///             writeln(column.position);           // print positions of possible mismatches
///     }
/// }
/// ---------------------------------------------------------
module bio.bam.pileup;

import bio.bam.read;
import bio.bam.md.reconstruct;
import bio.bam.splitter;

import std.algorithm;
import std.range;
import std.random;
import std.traits;
import std.conv;
import std.array;
import std.exception;
/// Represents a read aligned to a column
struct PileupRead(Read=bio.bam.read.EagerBamRead) {

    Read read; ///
    alias read this;
    private alias read _read;

    /// Current CIGAR operation. One of 'M', '=', 'X', 'D', 'N'.
    /// Use $(D cigar_after)/$(D cigar_before) to determine insertions.
    bio.bam.read.CigarOperation cigar_operation() @property const {
        return _cur_op;
    }

    /// Number of bases consumed from the current CIGAR operation.
    uint cigar_operation_offset() @property const {
        return _cur_op_offset;
    }

    /// CIGAR operations after the current operation
    const(bio.bam.read.CigarOperation)[] cigar_after() @property const {
        return _read.cigar[_cur_op_index + 1 .. $];
    }

    /// CIGAR operations before the current operation
    const(bio.bam.read.CigarOperation)[] cigar_before() @property const {
        return _read.cigar[0 .. _cur_op_index];
    }

    /// If the current CIGAR operation is one of 'M', '=', or 'X', returns the read base
    /// at the current column. Otherwise, returns '-'.
    char current_base() @property const {
        assert(_query_offset <= _read.sequence_length);
        if (_cur_op.is_query_consuming && _cur_op.is_reference_consuming) {
            return _read.sequence[_query_offset];
        } else {
            return '-';
        }
    }

    /// If the current CIGAR operation is one of 'M', '=', or 'X', returns the
    /// Phred-scaled read base quality at the current column.
    /// Otherwise, returns 255.
    ubyte current_base_quality() @property const {
        assert(_query_offset <= _read.sequence_length);
        if (_cur_op.is_query_consuming && _cur_op.is_reference_consuming) {
            return _read.base_qualities[_query_offset];
        } else {
            return 255;
        }
    }

    /// Returns number of bases consumed from the read sequence.
    /// $(BR)
    /// More concisely,
    /// $(UL
    ///     $(LI if current CIGAR operation is 'M', '=', or 'X',
    ///       index of current read base in the read sequence)
    ///     $(LI if current CIGAR operation is 'D' or 'N',
    ///       index of read base after the deletion)
    /// )
    /// (in both cases indices are 0-based)
    int query_offset() @property const {
        assert(_query_offset <= _read.sequence_length);
        return _query_offset;
    }

    /// Returns a duplicate
    PileupRead dup() @property {
        PileupRead r = void;
        r._read = _read; // logically const, thus no .dup here
        r._cur_op_index = _cur_op_index;
        r._cur_op = _cur_op;
        r._cur_op_offset = _cur_op_offset;
        r._query_offset = _query_offset;
        return r;
    }

    private {
        // index of current CIGAR operation in _read.cigar
        uint _cur_op_index;

        // current CIGAR operation
        CigarOperation _cur_op;

        // number of bases consumed from the current CIGAR operation
        uint _cur_op_offset;

        // number of bases consumed from the read sequence
        uint _query_offset;

        this(Read read) {
            _read = read;

            // find first M/=/X/D operation
            auto cigar = _read.cigar;
            for (_cur_op_index = 0; _cur_op_index < cigar.length; ++_cur_op_index) {
                assertCigarIndexIsValid();

                _cur_op = cigar[_cur_op_index];
                if (_cur_op.is_reference_consuming) {
                    if (_cur_op.type != 'N') {
                        break;
                    }
                } else if (_cur_op.is_query_consuming) {
                    _query_offset += _cur_op.length; // skip S and I operations
                }
            }

            assertCigarIndexIsValid();
        }

        // move one base to the right on the reference
        void incrementPosition() {
            ++_cur_op_offset;

            // if current CIGAR operation is D or N, query offset is untouched
            if (_cur_op.is_query_consuming) {
                ++_query_offset;
            }

            if (_cur_op_offset >= _cur_op.length) {
                _cur_op_offset = 0; // reset CIGAR operation offset

                auto cigar = _read.cigar;
                // get next reference-consuming CIGAR operation (M/=/X/D/N)
                for (++_cur_op_index; _cur_op_index < cigar.length; ++_cur_op_index) {
                    _cur_op = cigar[_cur_op_index];
                    if (_cur_op.is_reference_consuming) {
                        break;
                    }

                    if (_cur_op.is_query_consuming) {
                        _query_offset += _cur_op.length;
                    }
                }

                assertCigarIndexIsValid();
            }
        }

        void assertCigarIndexIsValid() {
            assert(_cur_op_index < _read.cigar.length, "Invalid read " ~ _read.name
                                                       ~ " - CIGAR " ~ _read.cigarString()
                                                       ~ ", sequence " ~ to!string(_read.sequence));
        }
    }
}

static assert(isBamRead!(PileupRead!BamRead));
static assert(isBamRead!(PileupRead!EagerBamRead));
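
// Illustrative sketch (assumed caller-side code, not part of this module): the
// column-related properties above can be combined to detect insertions adjacent
// to the current column, mirroring the checks suggested in the module
// documentation. The variable `r` is assumed to be a PileupRead taken from a
// pileup column.
//
//     bool insertion_follows = r.cigar_after.length > 0 &&
//                              r.cigar_operation_offset == r.cigar_operation.length - 1 &&
//                              r.cigar_after[0].type == 'I';
//
//     bool insertion_precedes = r.cigar_before.length > 0 &&
//                               r.cigar_operation_offset == 0 &&
//                               r.cigar_before[$ - 1].type == 'I';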
/// Represents a single pileup column
struct PileupColumn(R) {
    private {
        ulong _position;
        int _ref_id = -1;
        R _reads;
        size_t _n_starting_here;
    }

    /// Reference base. 'N' if not available.
    char reference_base() @property const {
        return _reference_base;
    }

    private char _reference_base = 'N';

    /// Coverage at this position (equals the number of reads)
    size_t coverage() const @property {
        return _reads.length;
    }

    /// Returns reference ID (-1 if unmapped)
    int ref_id() const @property {
        return _ref_id;
    }

    /// Position on the reference
    ulong position() const @property {
        return _position;
    }

    /// Reads overlapping the position, sorted by coordinate
    auto reads() @property {
        return assumeSorted!compareCoordinates(_reads[]);
    }

    /// Reads that have their leftmost mapped position at this column
    auto reads_starting_here() @property {
        return _reads[$ - _n_starting_here .. $];
    }

    /// Shortcut for map!(read => read.current_base)(reads)
    auto bases() @property {
        return map!"a.current_base"(reads);
    }

    /// Shortcut for map!(read => read.current_base_quality)(reads)
    auto base_qualities() @property {
        return map!"a.current_base_quality"(reads);
    }

    /// Shortcut for map!(read => read.mapping_quality)(reads)
    auto read_qualities() @property {
        return map!"a.mapping_quality"(reads);
    }
}
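
// Illustrative sketch (assumed caller-side code, not part of this module): the
// shortcut ranges above compose directly with std.algorithm, e.g. to compute
// the mean Phred base quality at a column obtained from makePileup:
//
//     auto total = reduce!"a + b"(0.0, column.base_qualities);
//     auto mean_quality = column.coverage > 0 ? total / column.coverage : 0.0;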
/**
 * The class for iterating reference bases together with reads overlapping them.
 */
class PileupRange(R, alias TColumn=PileupColumn) {
    alias PileupRead!EagerBamRead Read;
    alias Read[] ReadArray;
    alias TColumn!ReadArray Column;

    private {
        R _reads;
        Column _column;
        Appender!ReadArray _read_buf;
        bool _skip_zero_coverage;
    }

    protected {
        // This is extracted into a method not only to reduce duplication
        // (not so much of it), but to allow overriding it!
        // For that reason it is not marked as final. The overhead of a virtual
        // function is negligible compared to the computations in the EagerBamRead
        // constructor together with inserting a new element into the appender.
        void add(ref BamRead read) {
            _read_buf.put(PileupRead!EagerBamRead(EagerBamRead(read)));
        }
    }

    /**
     * Create new pileup iterator from a range of reads.
     */
    this(R reads, bool skip_zero_coverage) {
        _reads = reads;
        _read_buf = appender!ReadArray();
        _skip_zero_coverage = skip_zero_coverage;

        if (!_reads.empty) {
            initNewReference(); // C++ programmers, don't worry! Virtual tables in D
                                // are populated before constructor body is executed.
        }
    }

    /// Returns PileupColumn struct corresponding to the current position.
    ref Column front() @property {
        return _column;
    }

    /// Whether all reads have been processed.
    bool empty() @property {
        return _reads.empty && _read_buf.data.empty;
    }

    /// Move to next position on the reference.
    void popFront() {
        auto pos = ++_column._position;

        size_t survived = 0;
        auto data = _read_buf.data;
        for (size_t i = 0; i < data.length; ++i) {
            if (data[i].end_position > pos) {
                if (survived < i)
                {
                    data[survived] = data[i];
                }
                ++survived;
            }
        }

        for (size_t i = 0; i < survived; ++i) {
            data[i].incrementPosition();
        }

        // unless range is empty, this value is
        _read_buf.shrinkTo(survived);

        _column._n_starting_here = 0; // updated either in initNewReference()
                                      // or in the loop below

        if (!_reads.empty) {
            if (_reads.front.ref_id != _column._ref_id &&
                survived == 0) // processed all reads aligned to the previous reference
            {
                initNewReference();
            } else {
                size_t n = 0;
                while (!_reads.empty &&
                       _reads.front.position == pos &&
                       _reads.front.ref_id == _column._ref_id)
                {
                    auto read = _reads.front;
                    add(read);
                    _reads.popFront();
                    ++n;
                }
                _column._n_starting_here = n;

                // handle option of skipping sites with zero coverage
                if (survived == 0 && n == 0 && _skip_zero_coverage) {
                    // the name might be misleading but it does the trick
                    initNewReference();
                }
            }
        }

        _column._reads = _read_buf.data;
    }

    protected void initNewReference() {
        auto read = _reads.front;

        _column._position = read.position;
        _column._ref_id = read.ref_id;
        uint n = 1;
        add(read);

        _reads.popFront();

        while (!_reads.empty) {
            read = _reads.front;
            if (read.position == _column._position) {
                add(read);
                ++n;
                _reads.popFront();
            } else {
                break;
            }
        }

        _column._n_starting_here = n;
        _column._reads = _read_buf.data;
    }
}
/// Abstract pileup structure. S is the type of the column range.
struct AbstractPileup(S) {
    S columns;
    /// Pileup columns
    alias columns this;

    private {
        ulong _start_position;
        ulong _end_position;
        int _ref_id;
    }

    /// $(D start_from) parameter provided to a pileup function
    ulong start_position() @property const {
        return _start_position;
    }

    /// $(D end_at) parameter provided to a pileup function
    ulong end_position() @property const {
        return _end_position;
    }

    /// Reference ID of all reads in this pileup.
    int ref_id() @property const {
        return _ref_id;
    }
}

struct TakeUntil(alias pred, Range, Sentinel) if (isInputRange!Range)
{
    private Range _input;
    private Sentinel _sentinel;
    bool _done;

    this(Range input, Sentinel sentinel) {
        _input = input; _sentinel = sentinel; _done = _input.empty || predSatisfied();
    }

    @property bool empty() { return _done; }
    @property auto ref front() { return _input.front; }
    private bool predSatisfied() { return startsWith!pred(_input, _sentinel); }
    void popFront() { _input.popFront(); _done = _input.empty || predSatisfied(); }
}

auto takeUntil(alias pred, Range, Sentinel)(Range range, Sentinel sentinel) {
    return TakeUntil!(pred, Range, Sentinel)(range, sentinel);
}
auto pileupInstance(alias P, R)(R reads, ulong start_from, ulong end_at, bool skip_zero_coverage) {
    auto rs = filter!"!a.is_unmapped"(reads);
    while (!rs.empty) {
        auto r = rs.front;
        if (r.position + r.basesCovered() < start_from) {
            rs.popFront();
        } else {
            break;
        }
    }
    int ref_id = -1;
    if (!rs.empty) {
        ref_id = rs.front.ref_id;
    }
    auto sameref_rs = takeUntil!"a.ref_id != b"(rs, ref_id);
    alias typeof(sameref_rs) ReadRange;
    PileupRange!ReadRange columns = new P!ReadRange(sameref_rs, skip_zero_coverage);
    while (!columns.empty) {
        auto c = columns.front;
        if (c.position < start_from) {
            columns.popFront();
        } else {
            break;
        }
    }
    auto chopped = takeUntil!"a.position >= b"(columns, end_at);
    return AbstractPileup!(typeof(chopped))(chopped, start_from, end_at, ref_id);
}

auto pileupColumns(R)(R reads, bool use_md_tag=false, bool skip_zero_coverage=true) {
    auto rs = filter!"!a.is_unmapped"(reads);
    alias typeof(rs) ReadRange;
    PileupRange!ReadRange columns;
    if (use_md_tag) {
        columns = new PileupRangeUsingMdTag!ReadRange(rs, skip_zero_coverage);
    } else {
        columns = new PileupRange!ReadRange(rs, skip_zero_coverage);
    }
    return columns;
}
/// Tracks current reference base
final static class PileupRangeUsingMdTag(R) :
    PileupRange!(R, PileupColumn)
{
    // The code is similar to that in reconstruct.d but here we can't make
    // an assumption about any particular read having non-zero length on the reference.

    // current chunk of reference
    private alias typeof(_column._reads[].front) Read;
    private ReturnType!(dna!Read) _chunk;

    // end position of the current chunk on the reference (assuming half-open interval)
    private uint _chunk_end_position;

    // next read from which we will extract a reference chunk
    //
    // More precisely,
    // _next_chunk_provider = argmax (read => read.end_position)
    //                        {reads overlapping current column}
    private Read _next_chunk_provider;

    private bool _has_next_chunk_provider = false;

    // coverage at the previous location
    private ulong _prev_coverage;

    // we also track current reference ID
    private int _curr_ref_id = -1;

    ///
    this(R reads, bool skip_zero_coverage) {
        super(reads, skip_zero_coverage);
    }

    // Checks the length of the newly added read and tracks the read whose
    // end position on the reference is the largest.
    //
    // When the reconstructed reference chunk becomes empty, the next one will be
    // constructed from that read. This algorithm minimizes the number
    // of reads for which the MD tag must be decoded.
    protected override void add(ref BamRead read) {
        // the behaviour depends on whether a new contig starts here or not
        bool had_zero_coverage = _prev_coverage == 0;

        super.add(read);

        // get wrapped read
        auto _read = _read_buf.data.back;

        // if we've just moved to another reference sequence, do the setup
        if (_read.ref_id != _curr_ref_id) {
            _curr_ref_id = _read.ref_id;

            _has_next_chunk_provider = true;
            _next_chunk_provider = _read;
            return;
        }

        // two subsequent next_chunk_providers must overlap
        // unless (!) there was a region with zero coverage in-between
        if (_read.position > _chunk_end_position && !had_zero_coverage) {
            return;
        }

        // compare with previous candidate and replace if this one is better
        if (_read.end_position > _chunk_end_position) {
            if (!_has_next_chunk_provider) {
                _has_next_chunk_provider = true;
                _next_chunk_provider = _read;
            } else if (_read.end_position > _next_chunk_provider.end_position) {
                _next_chunk_provider = _read;
            }
        }
    }

    protected override void initNewReference() {
        _prev_coverage = 0;

        super.initNewReference();

        if (_has_next_chunk_provider) {
            // prepare first chunk
            _chunk = dna(_next_chunk_provider);
            _chunk_end_position = _next_chunk_provider.end_position;
            _has_next_chunk_provider = false;
            _column._reference_base = _chunk.front;
            _chunk.popFront();
        } else {
            _column._reference_base = 'N';
        }
    }

    ///
    override void popFront() {
        if (!_chunk.empty) {
            // update current reference base
            _column._reference_base = _chunk.front;
            _chunk.popFront();
        } else {
            _column._reference_base = 'N';
        }

        // update _prev_coverage
        _prev_coverage = _column.coverage;

        // the order is important - maybe we will obtain a new next_chunk_provider
        // during this call to popFront()
        super.popFront();

        // If we have consumed the whole current chunk,
        // we need to obtain the next one if it's possible.
        if (_chunk.empty && _has_next_chunk_provider) {
            _chunk = dna(_next_chunk_provider);

            debug {
                /* import std.stdio;
                   writeln();
                   writeln("position: ", _next_chunk_provider.position);
                   writeln("next chunk: ", to!string(_chunk));
                 */
            }

            _chunk_end_position = _next_chunk_provider.end_position;
            _has_next_chunk_provider = false;

            _chunk.popFrontN(cast(size_t)(_column.position - _next_chunk_provider.position));

            _column._reference_base = _chunk.front;
            _chunk.popFront();
        }
    }
}
/// Creates a pileup range from a range of reads.
/// Note that all reads must be aligned to the same reference.
///
/// See $(D PileupColumn) documentation for description of range elements.
/// Note also that you can't use the $(D std.array.array()) function on a pileup
/// because it won't make deep copies of the underlying data structure.
/// (One might argue that in this case it would be better to use opApply,
/// but typically one would use $(D std.algorithm.map) on pileup columns
/// to obtain some numeric characteristics.)
///
/// Params:
///     use_md_tag = if true, use MD tag together with CIGAR
///                  to recover reference bases
///
///     start_from = position from which to start
///
///     end_at = position before which to stop
///
///     $(BR)
///     That is, the range of positions is the half-open interval
///     $(BR)
///     [max(start_from, first mapped read start position),
///     $(BR)
///      min(end_at, last mapped end position among all reads))
///
///     skip_zero_coverage = if true, skip sites with zero coverage
///
auto makePileup(R)(R reads,
                   bool use_md_tag=false,
                   ulong start_from=0,
                   ulong end_at=ulong.max,
                   bool skip_zero_coverage=true)
{
    if (use_md_tag) {
        return pileupInstance!PileupRangeUsingMdTag(reads, start_from, end_at, skip_zero_coverage);
    } else {
        return pileupInstance!PileupRange(reads, start_from, end_at, skip_zero_coverage);
    }
}
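
// Illustrative sketch (assumed caller-side code; the file name and region are
// assumptions): restricting the pileup to a region, using the half-open interval
// semantics documented above. Imports of bio.bam.reader and std.stdio are assumed,
// as in the module-level example.
//
//     auto bam = new BamReader("file.bam");   // assume a single reference
//     auto pileup = makePileup(bam.reads(), true, 1_000_000, 1_001_000);
//     foreach (column; pileup)
//         writefln("%s\t%s\t%s", column.position, column.coverage, column.reference_base);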
unittest {
    import std.algorithm;
    import std.range;
    import std.array;

    // the set of reads below was taken from 1000 Genomes BAM file
    // NA20828.mapped.ILLUMINA.bwa.TSI.low_coverage.20101123.bam
    // (region 20:1127810-1127819)
    auto readnames = array(iota(10).map!(i => "r" ~ to!string(i))());

    auto sequences = ["ATTATGGACATTGTTTCCGTTATCATCATCATCATCATCATCATCATTATCATC",
                      "GACATTGTTTCCGTTATCATCATCATCATCATCATCATCATCATCATCATCATC",
                      "ATTGTTTCCGTTATCATCATCATCATCATCATCATCATCATCATCATCATCACC",
                      "TGTTTCCGTTATCATCATCATCATCATCATCATCATCATCATCATCATCACCAC",
                      "TCCGTTATCATCATCATCATCATCATCATCATCATCATCATCATCACCACCACC",
                      "GTTATCATCATCATCATCATCATCATCATCATCATCATCATCATCGTCACCCTG",
                      "TCATCATCATCATAATCATCATCATCATCATCATCATCGTCACCCTGTGTTGAG",
                      "TCATCATCATCGTCACCCTGTGTTGAGGACAGAAGTAATTTCCCTTTCTTGGCT",
                      "TCATCATCATCATCACCACCACCACCCTGTGTTGAGGACAGAAGTAATATCCCT",
                      "CACCACCACCCTGTGTTGAGGACAGAAGTAATTTCCCTTTCTTGGCTGGTCACC"];

    // multiple sequence alignment:
    //                                                            ***
    // ATTATGGACATTGTTTCCGTTATCATCATCATCATCATCATCATCATTATCATC
    // GACATTGTTTCCGTTATCATCATCATCATCATCATCATCATCATCATCATCAT---C
    // ATTGTTTCCGTTATCATCATCATCATCATCATCATCATCATCATCATCATCACC
    // TGTTTCCGTTATCATCATCATCATCATCATCATCATCATCATCATCAT---CACCAC
    // TCCGTTATCATCATCATCATCATCATCATCATCATCATCATCAT---CACCACCACC
    // GTTATCATCATCATCATCATCATCATCATCATCATCATCAT---CATCGTCACCCTG
    // ATCATCATCATAATCATCATCATCATCATCAT---CATCGTCACCCTGTGTTGAG
    // TCATCATCATCGTCAC------------------CCTGTGTTGAGGACAGAAGTAATTTCCCTTTCTTGGCT
    // TCATCATCATCATCACCACCACCACCCTGTGTTGAGGACAGAAGTAATATCCCT
    // ---CACCACCACCCTGTGTTGAGGACAGAAGTAATTTCCCTTTCTTGGCTGGTCACC
    //  *         *         *         *         *         *         *         *        *         *
    // 760       770       780       790       800       810       820       830      840       850

    auto cigars = [[CigarOperation(54, 'M')],
                   [CigarOperation(54, 'M')],
                   [CigarOperation(50, 'M'), CigarOperation(3, 'I'), CigarOperation(1, 'M')],
                   [CigarOperation(54, 'M')],
                   [CigarOperation(54, 'M')],
                   [CigarOperation(54, 'M')],
                   [CigarOperation(2, 'S'), CigarOperation(52, 'M')],
                   [CigarOperation(16, 'M'), CigarOperation(15, 'D'), CigarOperation(38, 'M')],
                   [CigarOperation(13, 'M'), CigarOperation(3, 'I'), CigarOperation(38, 'M')],
                   [CigarOperation(54, 'M')]];

    auto positions = [758, 764, 767, 769, 773, 776, 785, 795, 804, 817];

    auto md_tags = ["47C6", "54", "51", "50T3", "46T7", "45A0C7", "11C24A0C14",
                    "11A3T0^CATCATCATCACCAC38", "15T29T5", "2T45T5"];

    BamRead[] reads = new BamRead[10];

    foreach (i; iota(10)) {
        reads[i] = BamRead(readnames[i], sequences[i], cigars[i]);
        reads[i].position = positions[i];
        reads[i].ref_id = 0;
        reads[i]["MD"] = md_tags[i];
    }

    auto first_read_position = reads.front.position;
    auto reference = to!string(dna(reads));

    import std.stdio;
    writeln("Testing pileup (low-level aspects)...");

    auto pileup = makePileup(reads, true, 796, 849, false);
    auto pileup2 = makePileup(reads, true, 0, ulong.max, false);
    assert(pileup.front.position == 796);
    assert(pileup.start_position == 796);
    assert(pileup.end_position == 849);

    while (pileup2.front.position != 796) {
        pileup2.popFront();
    }

    while (!pileup.empty) {
        auto column = pileup.front;
        auto column2 = pileup2.front;
        assert(column.coverage == column2.coverage);
        pileup2.popFront();

        // check that DNA is built correctly from MD tags and CIGAR
        assert(column.reference_base == reference[cast(size_t)(column.position - first_read_position)]);

        switch (column.position) {
            case 796:
                assert(equal(column.bases, "CCCCCCAC"));
                pileup.popFront();
                break;
            case 805:
                assert(equal(column.bases, "TCCCCCCCC"));
                pileup.popFront();
                break;
            case 806:
                assert(equal(column.bases, "AAAAAAAGA"));
                pileup.popFront();
                break;
            case 810:
                // last read is not yet fetched by pileup engine
                assert(column.reads[column.coverage - 2].cigar_after.front.type == 'D');
                pileup.popFront();
                break;
            case 817:
                assert(column.reads[column.coverage - 2].cigar_before.back.type == 'I');
                pileup.popFront();
                break;
            case 821:
                assert(column.reads[column.coverage - 3].cigar_operation.type == 'D');
                assert(equal(column.bases, "AAGG-AA"));
                pileup.popFront();
                break;
            case 826:
                assert(equal(column.bases, "CCCCCC"));
                pileup.popFront();
                break;
            case 849:
                assert(equal(column.bases, "TAT"));
                pileup.popFront();
                assert(pileup.empty);
                break;
            default:
                pileup.popFront();
                break;
        }
    }

    // another set of reads, the same file, region 20:12360032-12360050
    // test the case when reference has some places with zero coverage
    reads = [BamRead("r1", "CCCACATAGAAAGCTTGCTGTTTCTCTGTGGGAAGTTTTAACTTAGGTCAGCTT",
                     [CigarOperation(54, 'M')]),
             BamRead("r2", "TAGAAAGCTTGCTGTTTCTCTGTGGGAAGTTTTAACTTAGGTTAGCTTCATCTA",
                     [CigarOperation(54, 'M')]),
             BamRead("r3", "TTTTTCTTTCTTTCTTTGAAGAAGGCAGATTCCTGGTCCTGCCACTCAAATTTT",
                     [CigarOperation(54, 'M')]),
             BamRead("r4", "TTTCTTTCTTTCTTTGAAGAAGGCAGATTCCTGGTCCTGCCACTCAAATTTTCA",
                     [CigarOperation(54, 'M')])];

    reads[0].position = 979;
    reads[0]["MD"] = "54";
    reads[0].ref_id = 0;

    reads[1].position = 985;
    reads[1]["MD"] = "42C7C3";
    reads[1].ref_id = 0;

    reads[2].position = 1046;
    reads[2]["MD"] = "54";
    reads[2].ref_id = 0;

    reads[3].position = 1048;
    reads[3]["MD"] = "54";
    reads[3].ref_id = 0;

    assert(equal(dna(reads),
                 map!(c => c.reference_base)(makePileup(reads, true, 0, ulong.max, false))));
}
static struct PileupChunkRange(C) {
    private C _chunks;
    private BamRead[] _prev_chunk;
    private BamRead[] _current_chunk;
    private bool _empty;
    private ulong _beg = 0;
    private bool _use_md_tag;
    private ulong _start_from;
    private ulong _end_at;

    this(C chunks, bool use_md_tag, ulong start_from, ulong end_at) {
        _chunks = chunks;
        _use_md_tag = use_md_tag;
        _start_from = start_from;
        _end_at = end_at;
        while (true) {
            if (_chunks.empty) {
                _empty = true;
                break; // no more chunks to take reads from
            } else {
                _current_chunk = _chunks.front;
                _chunks.popFront();

                if (_beg >= end_at) {
                    _empty = true;
                    break;
                }

                auto last_read = _current_chunk[$-1];
                if (last_read.position + last_read.basesCovered() > start_from) {
                    break;
                }
            }
        }
    }

    bool empty() @property {
        return _empty;
    }

    auto front() @property {
        return makePileup(chain(_prev_chunk, _current_chunk),
                          _use_md_tag,
                          max(_beg, _start_from),
                          min(_current_chunk[$-1].position, _end_at));
    }

    void popFront() {
        _prev_chunk = _current_chunk;
        if (_chunks.empty) {
            _empty = true;
            return;
        }
        _current_chunk = _chunks.front;
        _chunks.popFront();

        assert(_prev_chunk.length > 0);
        _beg = _prev_chunk[$-1].position;

        // keep only those reads in _prev_chunk that overlap the last one
        // 1) estimate read length
        int[15] buf = void;
        int read_length = void;
        if (_prev_chunk.length <= 15) {
            for (size_t k = 0; k < _prev_chunk.length; ++k) {
                buf[k] = _prev_chunk[k].sequence_length;
            }
            topN(buf[0.._prev_chunk.length], _prev_chunk.length / 2);
            read_length = buf[_prev_chunk.length / 2];
        } else {
            copy(map!"a.sequence_length"(randomSample(_prev_chunk, 15)), buf[]);
            topN(buf[], 7);
            read_length = buf[7];
            debug {
                import std.stdio;
                stderr.writeln("[pileupChunks] read_length=", read_length);
            }
        }

        // 2) do a binary search for those reads that start from (_beg - 2 * read_length)
        //    (it's an experimental fact that almost no read consumes that much
        //    on a reference sequence)
        auto pos = _beg - 2 * read_length;
        long i = 0;
        long j = _prev_chunk.length - 1;
        // positions of _prev_chunk[0 .. i] are less than pos,
        // positions of _prev_chunk[j + 1 .. $] are greater than or equal to pos.

        while (i <= j) {
            auto m = (i + j) / 2;
            assert(m < _prev_chunk.length);
            auto p = _prev_chunk[m].position;
            if (p >= pos) {
                j = m - 1;
            } else {
                i = m + 1;
            }
        }

        _prev_chunk = _prev_chunk[i .. $];
    }
}
/// This function constructs a range of non-overlapping consecutive pileups from a range of reads
/// so that these pileups can be processed in parallel.
///
/// It's allowed to pass ranges of sorted reads with different ref. IDs,
/// they won't get mixed in any chunk.
///
/// Params:
///     use_md_tag = recover reference bases from MD tag and CIGAR
///
///     block_size = approximate amount of memory that each pileup will consume,
///                  given in bytes. (Usually consumption will be a bit higher.)
///
///     start_from = position of the first column of the first chunk
///
///     end_at = position after the last column of the last chunk
///
///     $(BR)
///     WARNING:     block size should be big enough so that every block will share
///                  some reads only with adjacent blocks.
///                  $(BR)
///                  As such, it is not recommended to reduce the $(I block_size).
///                  But there might be a need to increase it in case of very high coverage.
auto pileupChunks(R)(R reads, bool use_md_tag=false, size_t block_size=16_384_000,
                     ulong start_from=0, ulong end_at=ulong.max) {
    auto chunks = chunksConsumingLessThan(reads, block_size);
    return PileupChunkRange!(typeof(chunks))(chunks, use_md_tag, start_from, end_at);
}
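
// Illustrative sketch (assumed caller-side code; std.parallelism and core.atomic
// are standard library modules, while the file name and the per-column work are
// assumptions): since the chunks produced above are independent pileups, they can
// be distributed over a task pool and the per-chunk results combined afterwards.
//
//     import std.parallelism, core.atomic, bio.bam.reader;
//
//     auto bam = new BamReader("file.bam");
//     shared ulong total_coverage;
//     foreach (chunk; parallel(pileupChunks(bam.reads(), true), 1)) {
//         ulong local = 0;
//         foreach (column; chunk)
//             local += column.coverage;
//         atomicOp!"+="(total_coverage, local); // combine per-chunk results
//     }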