You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

164 lines
5.5 KiB

  1. /*
  2. This file is part of BioD.
  3. Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
  4. Permission is hereby granted, free of charge, to any person obtaining a
  5. copy of this software and associated documentation files (the "Software"),
  6. to deal in the Software without restriction, including without limitation
  7. the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. and/or sell copies of the Software, and to permit persons to whom the
  9. Software is furnished to do so, subject to the following conditions:
  10. The above copyright notice and this permission notice shall be included in
  11. all copies or substantial portions of the Software.
  12. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  17. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  18. DEALINGS IN THE SOFTWARE.
  19. */
  20. module bio.core.bgzf.block;
  21. import bio.bam.constants;
  22. import std.array : uninitializedArray;
  23. import std.conv;
  24. import std.zlib : crc32, ZlibException;
  25. import etc.c.zlib;
  26. import std.exception;
  /**
    Structure representing BGZF block.
    In general, users shouldn't use it, as it is EXTREMELY low-level.

    Hashing, equality and ordering are all defined purely in terms of the
    stored CRC32 value: two blocks compare equal whenever their $(D crc32)
    fields match, regardless of offsets or buffer contents.
   */
  struct BgzfBlock {
      // field types are as in the SAM/BAM specification
      // ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t

      public ulong start_offset; /// start offset in the file, in bytes

      /// end offset in the file, in bytes
      public ulong end_offset() @property const {
          return start_offset + bsize + 1;
      }

      public ushort bsize; /// total Block SIZE minus one

      public ushort cdata_size; /// compressed data size

      /// A buffer is used to reduce number of allocations.
      ///
      /// Its size is max(bsize + 1, input_size)
      /// Initially, it contains compressed data, but is rewritten
      /// during decompressBgzfBlock -- indeed, who cares about
      /// compressed data after it has been uncompressed?
      public ubyte[] _buffer = void;

      /// The first $(D cdata_size) bytes of the buffer.
      ///
      /// If block has been already decompressed, result is undefined.
      public ubyte[] compressed_data() @property {
          return _buffer[0 .. cast(size_t)cdata_size];
      }

      public uint crc32;      /// checksum stored with the block
      public uint input_size; /// size of uncompressed data

      /// Hash of the block; simply the stored CRC32.
      hash_t toHash() const pure @safe nothrow {
          // since the block can be either compressed or decompressed,
          // returning CRC sum is the easiest and safest thing to do
          return crc32;
      }

      /// Two blocks are equal when their CRC32 values are equal.
      ///
      /// Declared $(D const), like toHash and opCmp, so that const and
      /// immutable blocks can be compared as well.
      bool opEquals(const ref BgzfBlock other) const pure @safe nothrow {
          return crc32 == other.crc32;
      }

      /// Orders blocks by their CRC32 values.
      ///
      /// Returns: -1, 0 or 1 when this block's CRC32 is respectively less
      ///          than, equal to, or greater than that of $(D other).
      int opCmp(const ref BgzfBlock other) const pure @safe nothrow {
          return crc32 < other.crc32 ? -1 :
                 crc32 > other.crc32 ?  1 : 0;
      }
  }
  /**
    Result of decompressing a single BgzfBlock.

    The file offsets are carried along with the data so that the current
    virtual offset can still be determined when blocks are decompressed
    in parallel.
   */
  struct DecompressedBgzfBlock {
      /// Offset in the file at which the originating block starts, in bytes.
      ulong start_offset;

      /// Offset in the file at which the originating block ends, in bytes.
      ulong end_offset;

      /// Uncompressed contents of the block.
      ubyte[] decompressed_data;
  }
  /// Function for BGZF block decompression.
  ///
  /// Reuses buffer allocated for storing compressed data,
  /// i.e. after execution buffer of the passed $(D block)
  /// is overwritten with uncompressed data.
  ///
  /// Params:
  ///     block = block whose $(D compressed_data) is inflated; its
  ///             $(D _buffer) must be able to hold $(D input_size) bytes
  ///
  /// Returns: the block's start and end offsets together with a slice of
  ///          $(D block._buffer) holding the uncompressed bytes. When
  ///          $(D block.input_size) is zero an empty array is returned
  ///          and nothing is decompressed.
  ///
  /// Throws: $(D ZlibException) when zlib reports a failure;
  ///         $(D Exception) when the declared uncompressed size exceeds
  ///         BGZF_MAX_BLOCK_SIZE, when the buffer is too small, or when the
  ///         decompressed length or CRC32 does not match the block's fields.
  DecompressedBgzfBlock decompressBgzfBlock(BgzfBlock block) {
      immutable ulong block_end = block.end_offset;

      if (block.input_size == 0) {
          // EOF marker
          // TODO: add check for correctness of EOF marker
          return DecompressedBgzfBlock(block.start_offset,
                                       block_end,
                                       cast(ubyte[])[]);
      }

      // check that block follows BAM specification
      enforce(block.input_size <= BGZF_MAX_BLOCK_SIZE,
              "Uncompressed block size must be within " ~
              to!string(BGZF_MAX_BLOCK_SIZE) ~ " bytes");

      // The result is copied back into block._buffer at the end. This is
      // checked with enforce rather than assert so that the copy can never
      // run past the buffer, even in builds where asserts are removed.
      enforce(block._buffer.length >= block.input_size,
              "BGZF block buffer is too small to hold uncompressed data");

      // Inflate into a temporary stack buffer: the compressed bytes live in
      // block._buffer and must not be overwritten while still being read.
      ubyte[BGZF_MAX_BLOCK_SIZE] uncompressed_buf = void;

      // for convenience, provide a slice
      auto uncompressed = uncompressed_buf[0 .. block.input_size];

      // set input data (must be done before inflateInit2)
      etc.c.zlib.z_stream zs;
      zs.next_in = cast(typeof(zs.next_in))block.compressed_data.ptr;
      zs.avail_in = to!uint(block.compressed_data.length);

      int err = etc.c.zlib.inflateInit2(&zs, /* winbits = */-15);
      if (err != Z_OK) {
          throw new ZlibException(err);
      }

      // uncompress it into the buffer on the stack
      zs.next_out = cast(typeof(zs.next_out))uncompressed.ptr;
      zs.avail_out = block.input_size;

      immutable inflate_status = etc.c.zlib.inflate(&zs, Z_FINISH);
      immutable bytes_written = zs.total_out;

      // Release zlib's internal state before any check below can throw,
      // so the stream is never leaked.
      immutable end_status = etc.c.zlib.inflateEnd(&zs);

      if (inflate_status != Z_STREAM_END) {
          throw new ZlibException(inflate_status);
      }
      if (end_status != Z_OK) {
          throw new ZlibException(end_status);
      }

      // The block comes from a file, so these are input validation rather
      // than internal invariants. A short stream would otherwise leave
      // uninitialized stack bytes in the result.
      enforce(bytes_written == block.input_size,
              "BGZF block decompressed to an unexpected number of bytes");
      enforce(block.crc32 == crc32(0, uncompressed[]),
              "CRC32 mismatch in BGZF block");

      // Now copy back to block._buffer, overwriting existing data.
      block._buffer[0 .. block.input_size] = uncompressed[];

      return DecompressedBgzfBlock(block.start_offset,
                                   block_end,
                                   block._buffer[0 .. block.input_size]);
  }