You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

284 lines
9.0 KiB

  1. /*
  2. This file is part of BioD.
  3. Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
  4. Permission is hereby granted, free of charge, to any person obtaining a
  5. copy of this software and associated documentation files (the "Software"),
  6. to deal in the Software without restriction, including without limitation
  7. the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. and/or sell copies of the Software, and to permit persons to whom the
  9. Software is furnished to do so, subject to the following conditions:
  10. The above copyright notice and this permission notice shall be included in
  11. all copies or substantial portions of the Software.
  12. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  17. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  18. DEALINGS IN THE SOFTWARE.
  19. */
  20. module bio.bam.bai.indexing;
  21. import bio.bam.reader;
  22. import bio.bam.constants;
  23. import bio.bam.bai.bin;
  24. import bio.bam.bai.chunk;
  25. import std.stream;
  26. import std.array;
  27. import std.algorithm;
  28. import std.system;
  29. import std.exception;
  // An alignment covers a run of bases on a reference: it starts at one
  // position and ends at another. To build the linear index we need to
  // know which fixed-size window each of those two positions falls into.
  //
  // (K = 16384)
  //
  // [0, K)[K, 2K)[2K, 3K)...  <- windows
  //    [.......)              <- alignment
  //
  // Maps a position to the number of its window; negative positions are
  // mapped to window 0.
  private size_t toLinearIndexOffset(int position) {
      if (position < 0) {
          return 0;
      }
      return position / BAI_LINEAR_INDEX_WINDOW_SIZE;
  }
  /// Progress callback that does nothing.
  ///
  /// The argument is $(D lazy) and the body is empty, so the expression
  /// passed in is never evaluated.
  void defaultProgressBarFunc(lazy float dummy)
  {
  }
  /// Writes BAM index to the $(D stream)
  ///
  /// The output is little-endian and consists of the BAI magic string, the
  /// number of references, and then one entry per reference: the bins with
  /// their chunks, followed by the linear index.
  ///
  /// Params:
  ///     bam             = reader that provides the reference sequences and
  ///                       the reads together with their virtual offsets
  ///     stream          = destination of the index data
  ///     progressBarFunc = optional callback, passed as is to
  ///                       $(D bam.readsWithProgress)
  ///
  /// Throws: $(D Exception) if the reads are not sorted by reference ID and
  ///         position, or if a read ends outside of the linear index.
  void createIndex(BamReader bam, ref Stream stream, void delegate(lazy float p) progressBarFunc=null) {
      // to!string is used for error messages below
      import std.conv : to;

      auto endian_stream = new EndianStream(stream, Endian.littleEndian);

      auto refs = bam.reference_sequences;
      auto nrefs = refs.length;

      endian_stream.writeString(BAI_MAGIC);   // write BAI magic string
      endian_stream.write(cast(int)nrefs);    // and number of references

      // Entry for a reference without any reads: no bins, no intervals.
      void writeEmptyReference() {
          endian_stream.write(cast(int)0); // n_bins
          endian_stream.write(cast(int)0); // n_intv
      }

      // BAM file contains no alignments at all or all reads are unmapped
      if (bam.reads!withOffsets.empty ||
          bam.reads!withOffsets.front.read.ref_id < 0) {
          foreach (i; 0 .. nrefs) {
              writeEmptyReference();
          }
          return;
      }

      // OK, now let's deal with non-degenerate case
      auto alignment_blocks = bam.readsWithProgress!withOffsets(progressBarFunc);

      auto prev_block = alignment_blocks.front;
      alignment_blocks.popFront();

      // this is the main character hereafter
      auto prev_read = prev_block.read;

      // array of linear offsets for the current reference entry
      ulong[BAI_MAX_BIN_ID - BAI_MAX_NONLEAF_BIN_ID + 1] linear_index;
      // (maximum index in linear_index where data was written) + 1
      size_t linear_index_write_length;

      // map: bin ID -> array of chunks
      Chunk[][uint] chunks;

      auto first_ref_id = prev_block.read.ref_id;
      auto current_chunk_beg = prev_block.start_virtual_offset;
      assert(first_ref_id >= 0);

      // references preceding the first one that has reads are empty
      foreach (i; 0 .. first_ref_id) {
          writeEmptyReference();
      }

      // Records the start virtual offset of prev_block in every linear index
      // window overlapped by prev_read, unless the window is already filled.
      void updateLinearIndex() {
          assert(prev_read.ref_id >= 0);

          size_t beg, end;

          if (prev_read.is_unmapped) {
              end = beg = toLinearIndexOffset(prev_read.position);
          } else {
              beg = toLinearIndexOffset(prev_read.position);
              end = toLinearIndexOffset(prev_read.position + prev_read.basesCovered() - 1);
          }

          // Fail with a clear message instead of indexing past the end of
          // the fixed-size array.
          enforce(end < linear_index.length,
                  "read '" ~ prev_read.name ~ "' ends outside of the linear index" ~
                  " (position: " ~ to!string(prev_read.position) ~
                  ", bases covered: " ~ to!string(prev_read.basesCovered()) ~ ")");

          foreach (i; beg .. end + 1) {
              if (linear_index[i] == 0UL) {
                  linear_index[i] = cast(ulong)prev_block.start_virtual_offset;
              }
          }

          if (end + 1 > linear_index_write_length) {
              linear_index_write_length = end + 1;
          }
      }

      // Writes the number of linear index entries followed by the entries.
      void dumpCurrentLinearIndex() {
          endian_stream.write(cast(int)linear_index_write_length);

          //
          // There might be untouched places in linear index
          // with virtual offset equal to zero.
          // However, it's not a good idea to leave those zeros,
          // since we can start lookup from the last non-zero virtual offset
          // encountered before the untouched window.
          //
          ulong last_voffset = 0;

          foreach (voffset; linear_index[0 .. linear_index_write_length])
          {
              // voffset is a copy, so the array itself is left untouched
              if (voffset == 0) {
                  voffset = last_voffset;
              } else {
                  last_voffset = voffset;
              }
              endian_stream.write(voffset);
          }
      }

      // Writes bins, chunks and linear index collected for the current
      // reference and resets the collected data.
      void dumpCurrentReference() {
          endian_stream.write(cast(int)chunks.length);

          foreach (bin_id, bin_chunks; chunks) {
              // updateChunks() never leaves a bin without chunks, so the
              // number of bins written above matches what is written here
              if (bin_chunks.length > 0) {
                  endian_stream.write(bin_id);
                  endian_stream.write(cast(int)bin_chunks.length);
                  foreach (chunk; bin_chunks) {
                      endian_stream.write(cast(ulong)chunk.beg);
                      endian_stream.write(cast(ulong)chunk.end);
                  }
              }
          }

          dumpCurrentLinearIndex();

          // reset data
          linear_index[] = 0;
          linear_index_write_length = 0;
          chunks = null;
          current_chunk_beg = prev_block.end_virtual_offset;
      }

      // adds chunk to the current bin (which is determined from prev_read)
      void updateChunks() {
          auto current_chunk_end = prev_block.end_virtual_offset;

          auto bin_id = prev_read.bin.id;

          if (bin_id !in chunks) {
              chunks[bin_id] = [];
          }
          auto cs = chunks[bin_id];

          // the new chunk is merged into the last chunk of the bin when it
          // begins at the same compressed offset where that chunk ends
          bool canMergeWithPreviousChunk() {
              assert(cs.length > 0);
              auto last_chunk = cs[$ - 1];
              return last_chunk.end.coffset == current_chunk_beg.coffset;
          }

          if (cs.length == 0 || !canMergeWithPreviousChunk()) {
              chunks[bin_id] ~= Chunk(current_chunk_beg, current_chunk_end);
          } else {
              chunks[bin_id][$ - 1].end = current_chunk_end;
          }

          current_chunk_beg = current_chunk_end;
      }

      foreach (block; alignment_blocks) {
          auto read = block.read;

          // This and all the following reads are unmapped.
          // The check must precede the reference change handling below:
          // the last reference is dumped right after the loop, and dumping
          // it here as well would write one reference entry too many.
          if (read.ref_id < 0) {
              break;
          }

          // new reference, so write data for previous one(s)
          if (read.ref_id != prev_read.ref_id) {
              updateLinearIndex();
              updateChunks();
              dumpCurrentReference();

              foreach (i; prev_read.ref_id + 1 .. read.ref_id)
                  writeEmptyReference();
          }

          // start position is unavailable, skip
          if (read.position < 0) {
              prev_block = block;
              prev_read = read;
              continue;
          }

          // check if the BAM file is indeed sorted
          if ((read.ref_id == prev_read.ref_id &&
               read.position < prev_read.position) ||
              (read.ref_id < prev_read.ref_id))
          {
              throw new Exception("BAM file is not properly sorted: " ~
                                  "read '" ~ read.name ~ "'" ~
                                  " must be before read '" ~
                                  prev_read.name ~
                                  "' (at virtual offset " ~
                                  to!string(prev_block.start_virtual_offset)~
                                  ")");
          }

          // ---------------------------------------------------------------------

          if (read.ref_id == prev_read.ref_id) {
              updateLinearIndex();

              // a chunk is closed only when the bin changes
              if (read.bin.id != prev_read.bin.id) {
                  updateChunks();
              }
          }

          // ---------------------------------------------------------------------

          prev_block = block;
          prev_read = read;
      }

      // after the loop, prev_read is the last read with ref_id >= 0
      assert(prev_read.ref_id >= 0);
      updateLinearIndex();
      updateChunks();
      dumpCurrentReference();

      // write the rest
      foreach (i; prev_read.ref_id + 1 .. nrefs) {
          writeEmptyReference();
      }
  }