/*
    This file is part of BioD.
    Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>

    Permission is hereby granted, free of charge, to any person obtaining a
    copy of this software and associated documentation files (the "Software"),
    to deal in the Software without restriction, including without limitation
    the rights to use, copy, modify, merge, publish, distribute, sublicense,
    and/or sell copies of the Software, and to permit persons to whom the
    Software is furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.
*/
module bio.core.bgzf.blockrange;

public import bio.core.bgzf.block;

import bio.bam.constants;

import std.stream;
import std.array : uninitializedArray;
import std.conv;
import std.algorithm : max;

/// Exception type, thrown in case of encountering corrupt BGZF blocks
class BgzfException : Exception {
    this(string msg) { super(msg); }
}

/**
  Range for iterating over BGZF blocks coming from any Stream
 */
struct BgzfRange {
    // /////////////////////////////////////////////////////////////////////////
    //
    // | Here is the general picture of what happens.
    // |
    // | First of all, BgzfRange reads bytes from the stream and determines
    // | boundaries of BGZF blocks. Elements of the range are blocks of
    // | compressed data, together with their start offsets in the file.
    // |
    // | After that, blocks are decompressed, and another range comes into play.
    // | Start offsets are still provided together with the blocks, because they
    // | are needed for random access, to calculate virtual offsets of alignments.
    //
    // - A virtual offset is a pair of numbers which uniquely identifies the
    // - location of an individual alignment record in the file. The first is
    // - the start offset of the BGZF block in which the alignment record begins,
    // - and the second is the offset within the decompressed data of that block.
    // - Blocks are required to contain no more than 65536 bytes of uncompressed
    // - data, and virtual offsets are stored as uint64_t numbers as follows:
    // - [ {block offset (48 bits)} {offset in decompressed data (16 bits)} ]
    // -
    // - The relatively small size of BGZF blocks makes for fast random access,
    // - while still allowing good compression (about 3x).
    //
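    // - A worked example (illustrative only): for a record that starts 1234
    // - bytes into the decompressed contents of the block whose first byte
    // - sits at file offset 65280, the virtual offset is
    // -     (65280UL << 16) | 1234
    // - and the two components can be recovered with voffset >> 16 and
    // - voffset & 0xFFFF.
    //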
    // | Now that we have a range of decompressed blocks, those blocks have to
    // | be made into a proper input stream of bytes. The BGZF specification
    // | does not deal with alignment records, it deals with blocks of
    // | arbitrary data. Therefore it's possible that some alignments will be
    // | split across block boundaries, though nowadays most software which
    // | produces BAM files avoids that.
    // |
    // | ChunkInputStream joins decompressed blocks into a stream, providing a
    // | virtualTell() method, which returns the current virtual offset.
    // |
    // | Then, in the case of a BAM file, the SAM header is read first, followed
    // | by some basic information about reference sequences (names and lengths),
    // | so that alignment records can refer to them by integer indices instead
    // | of duplicating this information.
    // |
    // | After that come the alignment records. They are typically quite small,
    // | about 100-1000 bytes depending on sequence length and the number of tags.
    // |
    // | In order to avoid copying memory, and, more importantly, allocating it
    // | so frequently (the GC is not well suited for tens of thousands of
    // | allocations per second), the input stream provides yet another method,
    // | readSlice, which tries to return a slice of the underlying decompressed
    // | block, and allocates memory only in the rare case when that's impossible.
    //
    // - There are also two possible policies for iterating over alignments:
    // - either packed together with their virtual offsets, or without them.
    // - The first one is used for random access, the second one for serial
    // - reading.
    //
    // -----------------------------------------------------------------------
    // Picture summarizing the above description:
    //
    //   stream of raw bytes
    //       -> BgzfRange: compressed BGZF blocks (each ~20kB) with start offsets
    //       -> range of decompressed BGZF blocks (each up to ~65kB)
    //       -> input stream of decompressed bytes
    //       -> SAM header, reference sequence info, then alignment records
    //
    // /////////////////////////////////////////////////////////////////////////
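
    // A minimal usage sketch (illustrative only; the file name and the use of
    // std.stream.BufferedFile are assumptions, not part of this module):
    //
    //     auto blocks = BgzfRange(new BufferedFile("example.bam"));
    //     foreach (block; blocks) {
    //         // block.start_offset - offset of the block in the file
    //         // block.input_size   - size of its uncompressed data
    //     }
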
    /**
      Constructs range from stream
     */
    this(Stream stream) {
        _stream = stream;
        _seekable = stream.seekable;
        loadNextBlock();
    }

    /**
      Returns: offset of the start of the current BGZF block
               in underlying stream. If the stream is non-seekable,
               the result is always 0.
     */
    @property ulong start_offset() { return _start_offset; }

    bool empty() @property {
        return _empty;
    }

    void popFront() {
        loadNextBlock();
    }

    BgzfBlock front() @property {
        return _current_block;
    }

    private:

    Stream _stream;
    ulong _start_offset;

    bool _empty = false;
    bool _seekable = false;

    BgzfBlock _current_block;

    void throwBgzfException(string msg) {
        throw new BgzfException("Error reading BGZF block starting from offset " ~
                                to!string(_start_offset) ~ ": " ~ msg);
    }

    void loadNextBlock() {
        if (_seekable) {
            _start_offset = _stream.position;
        }

        if (_stream.eof()) {
            _empty = true; // indicate that range is now empty
            version(development) {
                import std.stdio;
                stderr.writeln("[info][BGZF range] EOF, current offset is ", _stream.position);
            }
            return;
        }

        try {
            uint bgzf_magic = void;
            // TODO: fix byte order if needed
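            // The BGZF magic covers the first four bytes of the gzip header:
            // ID1 = 0x1f, ID2 = 0x8b, CM = 8 (deflate) and FLG = 4 (FEXTRA set),
            // compared below as a single uint; as the TODO above notes, this
            // assumes a little-endian host.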
            auto bytes_read = _stream.read((cast(ubyte*)&bgzf_magic)[0 .. 4]);

            if (bytes_read == 0) {
                _empty = true;
                version(development) {
                    import std.stdio;
                    stderr.writeln("[info][BGZF range] end of stream, current offset is ", _stream.position);
                }
                return;
                // TODO: check if last BGZF block was empty, and if not throw a warning
            }

            if (bgzf_magic != BGZF_MAGIC) {
                throwBgzfException("wrong BGZF magic");
            }

            ushort gzip_extra_length = void;

            if (_seekable) {
                _stream.seekCur(uint.sizeof + 2 * ubyte.sizeof);
            } else {
                uint gzip_mod_time = void;
                ubyte gzip_extra_flags = void;
                ubyte gzip_os = void;
                _stream.read(gzip_mod_time);
                _stream.read(gzip_extra_flags);
                _stream.read(gzip_os);
            }

            _stream.read(gzip_extra_length);

            ushort bsize = void; // total Block SIZE minus 1
            bool found_block_size = false;

            // read extra subfields
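            // Each subfield consists of SI1 (1 byte), SI2 (1 byte),
            // SLEN (2 bytes, little-endian) and SLEN bytes of data;
            // the BGZF 'BC' subfield carries the total block size minus 1.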
            size_t len = 0;
            while (len < gzip_extra_length) {
                ubyte si1 = void;    // Subfield Identifier 1
                ubyte si2 = void;    // Subfield Identifier 2
                ushort slen = void;  // Subfield LENgth

                _stream.read(si1);
                _stream.read(si2);
                _stream.read(slen);

                if (si1 == BAM_SI1 && si2 == BAM_SI2) {
                    // found 'BC' as subfield identifier
                    if (slen != 2) {
                        throwBgzfException("wrong BC subfield length: " ~
                                           to!string(slen) ~ "; expected 2");
                    }

                    if (found_block_size) {
                        throwBgzfException("duplicate field with block size");
                    }

                    // read block size
                    _stream.read(bsize);
                    found_block_size = true;

                    // skip the rest
                    if (_seekable) {
                        _stream.seekCur(slen - bsize.sizeof);
                    } else {
                        _stream.readString(slen - bsize.sizeof);
                    }
                } else {
                    // this subfield has nothing to do with block size, just skip
                    if (_seekable) {
                        _stream.seekCur(slen);
                    } else {
                        _stream.readString(slen);
                    }
                }

                len += si1.sizeof + si2.sizeof + slen.sizeof + slen;
            }

            if (len != gzip_extra_length) {
                throwBgzfException("total length of subfields in bytes (" ~
                                   to!string(len) ~
                                   ") is not equal to gzip_extra_length (" ~
                                   to!string(gzip_extra_length) ~ ")");
            }

            if (!found_block_size) {
                throwBgzfException("block size was not found in any subfield");
            }

            // read compressed data
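            // A BGZF block is a 12-byte fixed gzip header, XLEN bytes of extra
            // field, the compressed data, and an 8-byte footer (CRC32 + ISIZE).
            // Since bsize stores the total block size minus 1, the compressed
            // data occupies bsize - gzip_extra_length - 19 bytes.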
            auto cdata_size = bsize - gzip_extra_length - 19;
            if (cdata_size > BGZF_MAX_BLOCK_SIZE) {
                throwBgzfException("compressed data size is more than " ~
                                   to!string(BGZF_MAX_BLOCK_SIZE) ~
                                   " bytes, which is not allowed by " ~
                                   "current BAM specification");
            }

            _current_block.bsize = bsize;
            _current_block.cdata_size = cast(ushort)cdata_size;

            ubyte[BGZF_MAX_BLOCK_SIZE] _buffer = void;
            _stream.readExact(_buffer.ptr, cdata_size);

            _stream.read(_current_block.crc32);
            _stream.read(_current_block.input_size);

            // now, this is a feature: allocate max(input_size, cdata_size).
            // this way, only 1 allocation is done per block instead of 2.
            // (see comments in bio.core.bgzf.block about reusing this memory)
            auto _buf_size = max(_current_block.input_size, cdata_size);
            _current_block._buffer = uninitializedArray!(ubyte[])(_buf_size);

            // copy compressed data to the start of the block
            _current_block._buffer[0 .. cdata_size] = _buffer[0 .. cdata_size];

            _current_block.start_offset = start_offset;

            return;
        } catch (ReadException e) {
            throwBgzfException("stream error: " ~ e.msg);
        }

        assert(0);
    }
}