You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

1691 lines
56 KiB

  1. /*
  2. This file is part of BioD.
  3. Copyright (C) 2012-2013 Artem Tarasov <lomereiter@gmail.com>
  4. Permission is hereby granted, free of charge, to any person obtaining a
  5. copy of this software and associated documentation files (the "Software"),
  6. to deal in the Software without restriction, including without limitation
  7. the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. and/or sell copies of the Software, and to permit persons to whom the
  9. Software is furnished to do so, subject to the following conditions:
  10. The above copyright notice and this permission notice shall be included in
  11. all copies or substantial portions of the Software.
  12. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  17. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  18. DEALINGS IN THE SOFTWARE.
  19. */
  20. /// $(P $(D BamRead) type provides convenient interface for working with SAM/BAM records.)
  21. ///
  22. /// $(P All flags, tags, and fields can be accessed and modified.)
  23. ///
  24. /// Examples:
  25. /// ---------------------------
  26. /// import std.conv;
  27. /// ...
  28. /// assert(!read.is_unmapped); // check flag
  29. /// assert(read.ref_id != -1); // access field
  30. ///
  31. /// int edit_distance = to!int(read["NM"]); // access tag
  32. /// read["NM"] = 0; // modify tag
  33. /// read["NM"] = null; // remove tag
  34. /// read["NM"] = null; // no-op
  35. ///
  36. /// foreach (tag, value; read) // iterate over tags
  37. /// writeln(tag, " ", value); // and print their keys and values
  38. ///
  39. /// read.sequence = "AGCAGACTACGTGTGCATAC"; // sets base qualities to 255
  40. /// assert(read.base_qualities[0] == 255);
  41. /// read.is_unmapped = true; // set flag
  42. /// read.ref_id = -1; // set field
  43. /// ---------------------------
  44. module bio.bam.read;
  45. import bio.core.base;
  46. import bio.core.utils.format;
  47. import bio.bam.abstractreader;
  48. import bio.bam.writer;
  49. import bio.bam.tagvalue;
  50. import bio.bam.bai.bin;
  51. import bio.bam.md.core;
  52. import bio.bam.utils.array;
  53. import bio.bam.utils.value;
  54. import bio.core.utils.switchendianness;
  55. import bio.bam.thirdparty.msgpack : Packer, unpack;
  56. import std.algorithm;
  57. import std.range;
  58. import std.conv;
  59. import std.format;
  60. import std.exception;
  61. import std.system;
  62. import std.traits;
  63. import std.array;
  64. import std.c.stdlib;
  65. /**
  66. Represents single CIGAR operation
  67. */
  68. struct CigarOperation {
  69. static assert(CigarOperation.sizeof == uint.sizeof);
  70. /*
  71. WARNING!
  72. It is very essential that the size of
  73. this struct is EXACTLY equal to uint.sizeof!
  74. The reason is to avoid copying of arrays during alignment parsing.
  75. Namely, when some_pointer points to raw cigar data,
  76. we can just do a cast. This allows to access those data
  77. directly, not doing any memory allocations.
  78. */
  79. private uint raw; // raw data from BAM
  80. private static ubyte char2op(char c) {
  81. switch(c) {
  82. case 'M': return 0;
  83. case 'I': return 1;
  84. case 'D': return 2;
  85. case 'N': return 3;
  86. case 'S': return 4;
  87. case 'H': return 5;
  88. case 'P': return 6;
  89. case '=': return 7;
  90. case 'X': return 8;
  91. default: return 15; // 15 is used as invalid value
  92. }
  93. }
  94. /// Length must be strictly less than 2^28.
  95. /// $(BR)
  96. /// Operation type must be one of M, I, D, N, S, H, P, =, X.
  97. this(uint length, char operation_type) {
  98. enforce(length < (1<<28), "Too big length of CIGAR operation");
  99. raw = (length << 4) | char2op(operation_type);
  100. }
  101. /// Operation length
  102. uint length() @property const nothrow {
  103. return raw >> 4;
  104. }
  105. /// CIGAR operation as one of MIDNSHP=X.
  106. /// Absent or invalid operation is represented by '?'
  107. char type() @property const nothrow {
  108. return "MIDNSHP=X????????"[raw & 0xF];
  109. }
  110. // Each pair of bits has first bit set iff the operation is query consuming,
  111. // and second bit set iff it is reference consuming.
  112. // X = P H S N D I M
  113. private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
  114. /// True iff operation is one of M, =, X, I, S
  115. bool is_query_consuming() @property const {
  116. return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
  117. }
  118. /// True iff operation is one of M, =, X, D, N
  119. bool is_reference_consuming() @property const {
  120. return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
  121. }
  122. /// True iff operation is one of M, =, X
  123. bool is_match_or_mismatch() @property const {
  124. return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
  125. }
  126. private void toSam(Sink)(auto ref Sink sink) const
  127. if (isSomeSink!Sink)
  128. {
  129. sink.write(length);
  130. sink.write(type);
  131. }
  132. void toString(scope void delegate(const(char)[]) sink) const {
  133. toSam(sink);
  134. }
  135. }
  136. /// Forward range of extended CIGAR operations, with =/X instead of M
  137. /// Useful for, e.g., detecting positions of mismatches.
  138. struct ExtendedCigarRange(CigarOpRange, MdOpRange) {
  139. static assert(isInputRange!CigarOpRange && is(Unqual!(ElementType!CigarOpRange) == CigarOperation));
  140. static assert(isInputRange!MdOpRange && is(Unqual!(ElementType!MdOpRange) == MdOperation));
  141. private {
  142. CigarOpRange _cigar;
  143. MdOpRange _md_ops;
  144. CigarOperation _front_cigar_op;
  145. MdOperation _front_md_op;
  146. uint _n_mismatches;
  147. bool _empty;
  148. }
  149. ///
  150. this(CigarOpRange cigar, MdOpRange md_ops) {
  151. _cigar = cigar;
  152. _md_ops = md_ops;
  153. fetchNextCigarOp();
  154. fetchNextMdOp();
  155. }
  156. /// Forward range primitives
  157. bool empty() @property const {
  158. return _empty;
  159. }
  160. /// ditto
  161. CigarOperation front() @property {
  162. debug {
  163. import std.stdio;
  164. writeln(_front_cigar_op, " - ", _front_md_op);
  165. }
  166. if (_front_cigar_op.type != 'M')
  167. return _front_cigar_op;
  168. if (_n_mismatches == 0) {
  169. assert(_front_md_op.is_match);
  170. uint len = min(_front_md_op.match, _front_cigar_op.length);
  171. return CigarOperation(len, '=');
  172. }
  173. assert(_front_md_op.is_mismatch);
  174. return CigarOperation(min(_n_mismatches, _front_cigar_op.length), 'X');
  175. }
  176. /// ditto
  177. ExtendedCigarRange save() @property {
  178. typeof(return) r = void;
  179. r._cigar = _cigar.save;
  180. r._md_ops = _md_ops.save;
  181. r._front_cigar_op = _front_cigar_op;
  182. r._front_md_op = _front_md_op;
  183. r._n_mismatches = _n_mismatches;
  184. r._empty = _empty;
  185. return r;
  186. }
  187. /// ditto
  188. void popFront() {
  189. if (!_front_cigar_op.is_match_or_mismatch) {
  190. if (_front_cigar_op.is_reference_consuming)
  191. fetchNextMdOp();
  192. fetchNextCigarOp();
  193. return;
  194. }
  195. auto len = _front_cigar_op.length;
  196. if (_n_mismatches > 0) {
  197. enforce(_front_md_op.is_mismatch);
  198. if (len > _n_mismatches) {
  199. _front_cigar_op = CigarOperation(len - _n_mismatches, 'M');
  200. _n_mismatches = 0;
  201. fetchNextMdOp();
  202. } else if (len < _n_mismatches) {
  203. _n_mismatches -= len;
  204. fetchNextCigarOp();
  205. } else {
  206. fetchNextCigarOp();
  207. fetchNextMdOp();
  208. }
  209. } else {
  210. enforce(_front_md_op.is_match);
  211. auto n_matches = _front_md_op.match;
  212. if (len > n_matches) {
  213. _front_cigar_op = CigarOperation(len - n_matches, 'M');
  214. fetchNextMdOp();
  215. } else if (len < n_matches) {
  216. _front_md_op.match -= len;
  217. fetchNextCigarOp();
  218. } else {
  219. fetchNextCigarOp();
  220. fetchNextMdOp();
  221. }
  222. }
  223. }
  224. private {
  225. void fetchNextCigarOp() {
  226. if (_cigar.empty) {
  227. _empty = true;
  228. return;
  229. }
  230. _front_cigar_op = _cigar.front;
  231. _cigar.popFront();
  232. }
  233. void fetchNextMdOp() {
  234. if (_md_ops.empty)
  235. return;
  236. _n_mismatches = 0;
  237. _front_md_op = _md_ops.front;
  238. _md_ops.popFront();
  239. if (_front_md_op.is_mismatch) {
  240. _n_mismatches = 1;
  241. while (!_md_ops.empty && _md_ops.front.is_mismatch) {
  242. _md_ops.popFront();
  243. _n_mismatches += 1;
  244. }
  245. }
  246. }
  247. }
  248. }
  249. auto makeExtendedCigar(CigarOpRange, MdOpRange)(CigarOpRange cigar, MdOpRange md_ops) {
  250. return ExtendedCigarRange!(CigarOpRange, MdOpRange)(cigar, md_ops);
  251. }
  252. /**
  253. BAM record representation.
  254. */
  255. struct BamRead {
  256. mixin TagStorage;
  257. /// Reference index in BAM file header
  258. @property int ref_id() const nothrow { return _refID; }
  259. /// ditto
  260. @property void ref_id(int n) { _dup(); _refID = n; }
  261. /// 0-based leftmost coordinate of the first matching base
  262. @property int position() const nothrow { return _pos; }
  263. /// ditto
  264. @property void position(int n) { _dup(); _pos = n; _recalculate_bin(); }
  265. /// Indexing bin which this read belongs to. Recalculated when position is changed.
  266. @property bio.bam.bai.bin.Bin bin() const nothrow { return Bin(_bin); }
  267. /// Mapping quality. Equals to 255 if not available, otherwise
  268. /// equals to rounded -10 * log10(P {mapping position is wrong}).
  269. @property ubyte mapping_quality() const nothrow { return _mapq; }
  270. /// ditto
  271. @property void mapping_quality(ubyte n) { _dup(); _mapq = n; }
  272. /// Flag bits (should be used on very rare occasions, see flag getters/setters below)
  273. @property ushort flag() const nothrow { return _flag; }
  274. /// ditto
  275. @property void flag(ushort n) { _dup(); _flag = n; }
  276. /// Sequence length. In fact, sequence.length can be used instead, but that might be
  277. /// slower if the compiler is not smart enough to optimize away unrelated stuff.
  278. @property int sequence_length() const nothrow { return _l_seq; }
  279. /// Mate reference ID
  280. @property int mate_ref_id() const nothrow { return _next_refID; }
  281. /// ditto
  282. @property void mate_ref_id(int n) { _dup(); _next_refID = n; }
  283. /// Mate position
  284. @property int mate_position() const nothrow { return _next_pos; }
  285. /// ditto
  286. @property void mate_position(int n) { _dup(); _next_pos = n; }
  287. /// Template length
  288. @property int template_length() const nothrow { return _tlen; }
  289. /// ditto
  290. @property void template_length(int n) { _dup(); _tlen = n; }
  291. // ------------------------ FLAG GETTERS/SETTERS -------------------------------------- //
  292. /// Template having multiple segments in sequencing
  293. @property bool is_paired() const nothrow { return cast(bool)(flag & 0x1); }
  294. /// ditto
  295. @property void is_paired(bool b) { _setFlag( 0, b); }
  296. /// Each segment properly aligned according to the aligner
  297. @property bool proper_pair() const nothrow { return cast(bool)(flag & 0x2); }
  298. /// ditto
  299. @property void proper_pair(bool b) { _setFlag( 1, b); }
  300. /// Segment unmapped
  301. @property bool is_unmapped() const nothrow { return cast(bool)(flag & 0x4); }
  302. /// ditto
  303. @property void is_unmapped(bool b) { _setFlag( 2, b); }
  304. /// Next segment in the template unmapped
  305. @property bool mate_is_unmapped() const nothrow { return cast(bool)(flag & 0x8); }
  306. /// ditto
  307. @property void mate_is_unmapped(bool b) { _setFlag( 3, b); }
  308. /// Sequence being reverse complemented
  309. @property bool is_reverse_strand() const nothrow { return cast(bool)(flag & 0x10); }
  310. /// ditto
  311. @property void is_reverse_strand(bool b) { _setFlag( 4, b); }
  312. /// Sequence of the next segment in the template being reversed
  313. @property bool mate_is_reverse_strand() const nothrow { return cast(bool)(flag & 0x20); }
  314. /// ditto
  315. @property void mate_is_reverse_strand(bool b) { _setFlag( 5, b); }
  316. /// The first segment in the template
  317. @property bool is_first_of_pair() const nothrow { return cast(bool)(flag & 0x40); }
  318. /// ditto
  319. @property void is_first_of_pair(bool b) { _setFlag( 6, b); }
  320. /// The last segment in the template
  321. @property bool is_second_of_pair() const nothrow { return cast(bool)(flag & 0x80); }
  322. /// ditto
  323. @property void is_second_of_pair(bool b) { _setFlag( 7, b); }
  324. /// Secondary alignment
  325. @property bool is_secondary_alignment() const nothrow { return cast(bool)(flag & 0x100); }
  326. /// ditto
  327. @property void is_secondary_alignment(bool b) { _setFlag( 8, b); }
  328. /// Not passing quality controls
  329. @property bool failed_quality_control() const nothrow { return cast(bool)(flag & 0x200); }
  330. /// ditto
  331. @property void failed_quality_control(bool b) { _setFlag( 9, b); }
  332. /// PCR or optical duplicate
  333. @property bool is_duplicate() const nothrow { return cast(bool)(flag & 0x400); }
  334. /// ditto
  335. @property void is_duplicate(bool b) { _setFlag(10, b); }
  336. /// Convenience function, returns '+' or '-' indicating the strand.
  337. @property char strand() const nothrow {
  338. return is_reverse_strand ? '-' : '+';
  339. }
  340. /// ditto
  341. @property void strand(char c) {
  342. enforce(c == '-' || c == '+', "Strand must be '-' or '+'");
  343. is_reverse_strand = c == '-';
  344. }
  345. /// Read name, length must be in 1..255 interval.
  346. @property string name() const nothrow {
  347. // notice -1: the string is zero-terminated, so we should strip that '\0'
  348. return cast(string)(_chunk[_read_name_offset .. _read_name_offset + _l_read_name - 1]);
  349. }
  350. /// ditto
  351. @property void name(string new_name) {
  352. enforce(new_name.length >= 1 && new_name.length <= 255,
  353. "name length must be in 1-255 range");
  354. _dup();
  355. bio.bam.utils.array.replaceSlice(_chunk,
  356. _chunk[_read_name_offset .. _read_name_offset + _l_read_name - 1],
  357. cast(ubyte[])new_name);
  358. _l_read_name = cast(ubyte)(new_name.length + 1);
  359. }
  360. /// List of CIGAR operations
  361. @property const(CigarOperation)[] cigar() const nothrow {
  362. return cast(const(CigarOperation)[])(_chunk[_cigar_offset .. _cigar_offset +
  363. _n_cigar_op * CigarOperation.sizeof]);
  364. }
  365. /// ditto
  366. @property void cigar(const(CigarOperation)[] c) {
  367. enforce(c.length < 65536, "Too many CIGAR operations, must be <= 65535");
  368. _dup();
  369. bio.bam.utils.array.replaceSlice(_chunk,
  370. _chunk[_cigar_offset .. _cigar_offset + _n_cigar_op * CigarOperation.sizeof],
  371. cast(ubyte[])c);
  372. _n_cigar_op = cast(ushort)(c.length);
  373. _recalculate_bin();
  374. }
  375. /// Extended CIGAR where M operators are replaced with =/X based
  376. /// on information from MD tag. Throws if the read doesn't have MD
  377. /// tag.
  378. auto extended_cigar() @property const {
  379. Value md = this["MD"];
  380. enforce(md.is_string);
  381. return makeExtendedCigar(cigar, mdOperations(*cast(string*)(&md)));
  382. }
  383. /// The number of reference bases covered by this read.
  384. /// $(BR)
  385. /// Returns 0 if the read is unmapped.
  386. int basesCovered() const {
  387. if (this.is_unmapped) {
  388. return 0; // actually, valid alignments should have empty cigar string
  389. }
  390. return reduce!"a + b.length"(0, filter!"a.is_reference_consuming"(cigar));
  391. }
  392. /// Human-readable representation of CIGAR string (same as in SAM format)
  393. string cigarString() const {
  394. char[] str;
  395. // guess size of resulting string
  396. str.reserve(_n_cigar_op * 3);
  397. foreach (cigar_op; cigar) {
  398. str ~= to!string(cigar_op.length);
  399. str ~= cigar_op.type;
  400. }
  401. return cast(string)str;
  402. }
  403. private @property const(ubyte)[] raw_sequence_data() const nothrow {
  404. return _chunk[_seq_offset .. _seq_offset + (_l_seq + 1) / 2];
  405. }
  406. /// Read-only random-access range for access to sequence data.
  407. static struct SequenceResult {
  408. private size_t _index;
  409. private ubyte[] _data = void;
  410. private size_t _len = void;
  411. private bool _use_first_4_bits = void;
  412. this(const(ubyte[]) data, size_t len, bool use_first_4_bits=true) {
  413. _data = cast(ubyte[])data;
  414. _len = len;
  415. _use_first_4_bits = use_first_4_bits;
  416. }
  417. ///
  418. @property bool empty() const {
  419. return _index >= _len;
  420. }
  421. ///
  422. @property bio.core.base.Base front() const {
  423. return opIndex(0);
  424. }
  425. ///
  426. @property bio.core.base.Base back() const {
  427. return opIndex(_len - 1);
  428. }
  429. /*
  430. I have no fucking idea why this tiny piece of code
  431. does NOT get inlined by stupid DMD compiler.
  432. Therefore I use string mixin instead.
  433. (hell yeah! Back to the 90s! C macros rulez!)
  434. private size_t _getActualPosition(size_t index) const
  435. {
  436. if (_use_first_4_bits) {
  437. // [0 1] [2 3] [4 5] [6 7] ...
  438. // |
  439. // V
  440. // 0 1 2 3
  441. return index >> 1;
  442. } else {
  443. // [. 0] [1 2] [3 4] [5 6] ...
  444. // |
  445. // V
  446. // 0 1 2 3
  447. return (index >> 1) + (index & 1);
  448. }
  449. }*/
  450. private static string _getActualPosition(string index) {
  451. return "((" ~ index ~") >> 1) + " ~
  452. "(_use_first_4_bits ? 0 : ((" ~ index ~ ") & 1))";
  453. }
  454. private bool _useFirst4Bits(size_t index) const
  455. {
  456. auto res = index % 2 == 0;
  457. if (!_use_first_4_bits) {
  458. res = !res;
  459. }
  460. return res;
  461. }
  462. ///
  463. @property SequenceResult save() const {
  464. return SequenceResult(_data[mixin(_getActualPosition("_index")) .. $],
  465. _len - _index,
  466. _useFirst4Bits(_index));
  467. }
  468. ///
  469. SequenceResult opSlice(size_t i, size_t j) const {
  470. return SequenceResult(_data[mixin(_getActualPosition("_index + i")) .. $],
  471. j - i,
  472. _useFirst4Bits(_index + i));
  473. }
  474. ///
  475. @property bio.core.base.Base opIndex(size_t i) const {
  476. auto pos = _index + i;
  477. if (_use_first_4_bits)
  478. {
  479. if (pos & 1)
  480. return Base.fromInternalCode(_data[pos >> 1] & 0xF);
  481. else
  482. return Base.fromInternalCode(_data[pos >> 1] >> 4);
  483. }
  484. else
  485. {
  486. if (pos & 1)
  487. return Base.fromInternalCode(_data[(pos >> 1) + 1] >> 4);
  488. else
  489. return Base.fromInternalCode(_data[pos >> 1] & 0xF);
  490. }
  491. assert(false);
  492. }
  493. ///
  494. void popFront() {
  495. ++_index;
  496. }
  497. ///
  498. void popBack() {
  499. --_len;
  500. }
  501. ///
  502. @property size_t length() const {
  503. return _len - _index;
  504. }
  505. }
  506. /// Random-access range of characters
  507. @property SequenceResult sequence() const {
  508. return SequenceResult(raw_sequence_data, sequence_length);
  509. }
  510. static assert(isRandomAccessRange!(ReturnType!sequence));
  511. /// Sets query sequence. Sets all base qualities to 255 (i.e. unknown).
  512. @property void sequence(string seq)
  513. {
  514. _dup();
  515. auto raw_length = (seq.length + 1) / 2;
  516. // set sequence
  517. auto replacement = uninitializedArray!(ubyte[])(raw_length + seq.length);
  518. replacement[raw_length .. $] = 0xFF;
  519. for (size_t i = 0; i < raw_length; ++i) {
  520. replacement[i] = cast(ubyte)(Base(seq[2 * i]).internal_code << 4);
  521. if (seq.length > 2 * i + 1)
  522. replacement[i] |= cast(ubyte)(Base(seq[2 * i + 1]).internal_code);
  523. }
  524. bio.bam.utils.array.replaceSlice(_chunk,
  525. _chunk[_seq_offset .. _tags_offset],
  526. replacement);
  527. _l_seq = cast(int)seq.length;
  528. }
  529. /// Quality data (phred-based scores)
  530. @property const(ubyte)[] base_qualities() const nothrow {
  531. return _chunk[_qual_offset .. _qual_offset + _l_seq * char.sizeof];
  532. }
  533. /// Set quality data - array length must be of the same length as the sequence.
  534. @property void base_qualities(const(ubyte)[] quality) {
  535. enforce(quality.length == _l_seq, "Quality data must be of the same length as sequence");
  536. _dup();
  537. _chunk[_qual_offset .. _qual_offset + _l_seq] = quality;
  538. }
  539. /*
  540. Constructs the struct from memory chunk
  541. */
  542. this(ubyte[] chunk) {
  543. // Switching endianness lazily is not a good idea:
  544. //
  545. // 1) switching byte order is pretty fast
  546. // 2) lazy switching for arrays can kill the performance,
  547. // it has to be done once
  548. // 3) the code will be too complicated, whereas there're
  549. // not so many users of big-endian systems
  550. //
  551. // In summa, BAM is little-endian format, so big-endian
  552. // users will suffer anyway, it's unavoidable.
  553. _chunk = chunk;
  554. this._is_slice = true;
  555. if (std.system.endian != Endian.littleEndian) {
  556. switchChunkEndianness();
  557. // Dealing with tags is the responsibility of TagStorage.
  558. fixTagStorageByteOrder();
  559. }
  560. }
  561. // Doesn't touch tags, only fields.
  562. // @@@TODO: NEEDS TESTING@@@
  563. private void switchChunkEndianness() {
  564. // First 8 fields are 32-bit integers:
  565. //
  566. // 0) refID int
  567. // 1) pos int
  568. // 2) bin_mq_nl uint
  569. // 3) flag_nc uint
  570. // 4) l_seq int
  571. // 5) next_refID int
  572. // 6) next_pos int
  573. // 7) tlen int
  574. // ----------------------------------------------------
  575. // (after them name follows which is string)
  576. //
  577. switchEndianness(_chunk.ptr, 8 * uint.sizeof);
  578. // Then we need to switch endianness of CIGAR data:
  579. switchEndianness(_chunk.ptr + _cigar_offset,
  580. _n_cigar_op * uint.sizeof);
  581. }
  582. private size_t calculateChunkSize(string read_name,
  583. string sequence,
  584. in CigarOperation[] cigar)
  585. {
  586. return 8 * int.sizeof
  587. + (read_name.length + 1) // tailing '\0'
  588. + uint.sizeof * cigar.length
  589. + ubyte.sizeof * ((sequence.length + 1) / 2)
  590. + ubyte.sizeof * sequence.length;
  591. }
  592. /// Construct alignment from basic information about it.
  593. ///
  594. /// Other fields can be set afterwards.
  595. this(string read_name, // info for developers:
  596. string sequence, // these 3 fields are needed
  597. in CigarOperation[] cigar) // to calculate size of _chunk
  598. {
  599. enforce(read_name.length < 256, "Too long read name, length must be <= 255");
  600. enforce(cigar.length < 65536, "Too many CIGAR operations, must be <= 65535");
  601. if (this._chunk is null) {
  602. this._chunk = new ubyte[calculateChunkSize(read_name, sequence, cigar)];
  603. }
  604. this._refID = -1; // set default values
  605. this._pos = -1; // according to SAM/BAM
  606. this._mapq = 255; // specification
  607. this._next_refID = -1;
  608. this._next_pos = -1;
  609. this._tlen = 0;
  610. this._l_read_name = cast(ubyte)(read_name.length + 1); // tailing '\0'
  611. this._n_cigar_op = cast(ushort)(cigar.length);
  612. this._l_seq = cast(int)(sequence.length);
  613. // now all offsets can be calculated through corresponding properties
  614. // set default quality
  615. _chunk[_qual_offset .. _qual_offset + sequence.length] = 0xFF;
  616. // set CIGAR data
  617. auto _len = cigar.length * CigarOperation.sizeof;
  618. _chunk[_cigar_offset .. _cigar_offset + _len] = cast(ubyte[])(cigar);
  619. // set read_name
  620. auto _offset = _read_name_offset;
  621. _chunk[_offset .. _offset + read_name.length] = cast(ubyte[])read_name;
  622. _chunk[_offset + read_name.length] = cast(ubyte)'\0';
  623. this._is_slice = false;
  624. this.sequence = sequence;
  625. }
  626. // Low-level constructor for setting tag data on construction.
  627. // This allows to use less reallocations when creating an alignment
  628. // from scratch, by reusing memory for collecting tags.
  629. // Typically, you would use this constructor in conjunction with
  630. // bio.bam.utils.tagstoragebuilder module.
  631. this(string read_name,
  632. string sequence,
  633. in CigarOperation[] cigar,
  634. in ubyte[] tag_data)
  635. {
  636. _chunk = new ubyte[calculateChunkSize(read_name, sequence, cigar)
  637. + tag_data.length];
  638. this(read_name, sequence, cigar);
  639. _chunk[_tags_offset .. $] = tag_data;
  640. }
  641. /// Deep copy of the record.
  642. BamRead dup() @property const {
  643. BamRead result;
  644. result._chunk = this._chunk.dup;
  645. result._is_slice = false;
  646. result._reader = cast()_reader;
  647. return result;
  648. }
  649. /// Compare two alignments, including tags
  650. /// (the tags must follow in the same order for equality).
  651. bool opEquals(const ref BamRead other) const pure nothrow {
  652. // don't forget about _is_slice trick
  653. auto m = _cigar_offset;
  654. return _chunk[0 .. m - 1] == other._chunk[0 .. m - 1] &&
  655. _chunk[m .. $] == other._chunk[m .. $];
  656. }
  657. /// ditto
  658. bool opEquals(BamRead other) const pure nothrow {
  659. auto m = _cigar_offset;
  660. return _chunk[0 .. m - 1] == other._chunk[0 .. m - 1] &&
  661. _chunk[m .. $] == other._chunk[m .. $];
  662. }
  663. /// Size of the alignment record when output to stream in BAM format.
  664. /// Includes block_size as well (see SAM/BAM specification)
  665. @property size_t size_in_bytes() const {
  666. return int.sizeof + _chunk.length;
  667. }
  668. package void write(ref BamWriter writer) {
  669. writer.writeInteger(cast(int)(_chunk.length));
  670. ubyte old_byte = _chunk[_cigar_offset - 1];
  671. _chunk[_cigar_offset - 1] = 0;
  672. if (std.system.endian != Endian.littleEndian) {
  673. switchChunkEndianness();
  674. writer.writeByteArray(_chunk[0 .. _tags_offset]);
  675. switchChunkEndianness();
  676. } else {
  677. writer.writeByteArray(_chunk[0 .. _tags_offset]);
  678. }
  679. _chunk[_cigar_offset - 1] = old_byte;
  680. writeTags(writer);
  681. }
  682. /// Packs message in the following format:
  683. /// $(BR)
  684. /// MsgPack array with elements
  685. /// $(OL
  686. /// $(LI name - string)
  687. /// $(LI flag - ushort)
  688. /// $(LI reference sequence id - int)
  689. /// $(LI leftmost mapping position (1-based) - int)
  690. /// $(LI mapping quality - ubyte)
  691. /// $(LI array of CIGAR operation lengths - int[])
  692. /// $(LI array of CIGAR operation types - ubyte[])
  693. /// $(LI mate reference sequence id - int)
  694. /// $(LI mate position (1-based) - int)
  695. /// $(LI template length - int)
  696. /// $(LI segment sequence - string)
  697. /// $(LI phred-base quality - ubyte[])
  698. /// $(LI tags - map: string -> value))
  699. void toMsgpack(Packer)(ref Packer packer) const {
  700. packer.beginArray(13);
  701. packer.pack(cast(ubyte[])name);
  702. packer.pack(flag);
  703. packer.pack(ref_id);
  704. packer.pack(position + 1);
  705. packer.pack(mapping_quality);
  706. packer.pack(array(map!"a.length"(cigar)));
  707. packer.pack(array(map!"a.type"(cigar)));
  708. packer.pack(mate_ref_id);
  709. packer.pack(mate_position);
  710. packer.pack(template_length);
  711. packer.pack(to!string(sequence));
  712. packer.pack(base_qualities);
  713. packer.beginMap(tagCount());
  714. foreach (key, value; this) {
  715. packer.pack(key);
  716. packer.pack(value);
  717. }
  718. }
  719. /// String representation.
  720. /// $(BR)
  721. /// Possible formats are SAM ("%s") and JSON ("%j")
  722. void toString(scope void delegate(const(char)[]) sink, FormatSpec!char fmt) const {
  723. if (size_in_bytes < 10000 && fmt.spec == 's') {
  724. auto p = cast(char*)alloca(size_in_bytes * 5);
  725. char* end = p;
  726. toSam(end);
  727. sink(p[0 .. end - p]);
  728. } else if (size_in_bytes < 5000 && fmt.spec == 'j') {
  729. auto p = cast(char*)alloca(size_in_bytes * 10 + 1000);
  730. char* end = p;
  731. toJson(end);
  732. sink(p[0 .. end - p]);
  733. } else if (fmt.spec == 's') {
  734. toSam(sink);
  735. } else if (fmt.spec == 'j') {
  736. toJson(sink);
  737. } else {
  738. throw new FormatException("unknown format specifier");
  739. }
  740. }
  741. /// ditto
  742. void toSam(Sink)(auto ref Sink sink) const
  743. if (isSomeSink!Sink)
  744. {
  745. sink.write(name);
  746. sink.write('\t');
  747. sink.write(flag);
  748. sink.write('\t');
  749. if (ref_id == -1 || _reader is null)
  750. sink.write('*');
  751. else
  752. sink.write(_reader.reference_sequences[ref_id].name);
  753. sink.write('\t');
  754. sink.write(position + 1);
  755. sink.write('\t');
  756. sink.write(mapping_quality);
  757. sink.write('\t');
  758. if (cigar.length == 0)
  759. sink.write('*');
  760. else
  761. foreach (op; cigar)
  762. op.toSam(sink);
  763. sink.write('\t');
  764. if (mate_ref_id == ref_id) {
  765. if (mate_ref_id == -1)
  766. sink.write("*\t");
  767. else
  768. sink.write("=\t");
  769. } else {
  770. if (mate_ref_id == -1 || _reader is null) {
  771. sink.write("*\t");
  772. } else {
  773. auto mate_name = _reader.reference_sequences[mate_ref_id].name;
  774. sink.write(mate_name);
  775. sink.write("\t");
  776. }
  777. }
  778. sink.write(mate_position + 1);
  779. sink.write('\t');
  780. sink.write(template_length);
  781. sink.write('\t');
  782. if (sequence_length == 0)
  783. sink.write('*');
  784. else
  785. foreach (char c; sequence)
  786. sink.write(c);
  787. sink.write('\t');
  788. if (base_qualities.length == 0 || base_qualities[0] == 0xFF)
  789. sink.write('*');
  790. else
  791. foreach (qual; base_qualities)
  792. sink.write(cast(char)(qual + 33));
  793. foreach (k, v; this) {
  794. sink.write('\t');
  795. sink.write(k);
  796. sink.write(':');
  797. v.toSam(sink);
  798. }
  799. }
  800. /// ditto
  801. string toSam()() const {
  802. return to!string(this);
  803. }
  804. /// JSON representation
  805. void toJson(Sink)(auto ref Sink sink) const
  806. if (isSomeSink!Sink)
  807. {
  808. sink.write(`{"qname":`); sink.writeJson(name);
  809. sink.write(`,"flag":`); sink.write(flag);
  810. sink.write(`,"rname":`);
  811. if (ref_id == -1 || _reader is null)
  812. sink.write(`"*"`);
  813. else
  814. sink.writeJson(_reader.reference_sequences[ref_id].name);
  815. sink.write(`,"pos":`); sink.write(position + 1);
  816. sink.write(`,"mapq":`); sink.write(mapping_quality);
  817. sink.write(`,"cigar":"`);
  818. if (cigar.empty)
  819. sink.write('*');
  820. else
  821. foreach (op; cigar)
  822. op.toSam(sink);
  823. sink.write('"');
  824. sink.write(`,"rnext":`);
  825. if (mate_ref_id == ref_id) {
  826. if (mate_ref_id == -1)
  827. sink.write(`"*"`);
  828. else
  829. sink.write(`"="`);
  830. } else if (mate_ref_id == -1 || _reader is null) {
  831. sink.write(`"*"`);
  832. } else {
  833. sink.writeJson(_reader.reference_sequences[mate_ref_id].name);
  834. }
  835. sink.write(`,"pnext":`); sink.write(mate_position + 1);
  836. sink.write(`,"tlen":`); sink.write(template_length);
  837. sink.write(`,"seq":"`);
  838. if (sequence_length == 0)
  839. sink.write('*');
  840. else
  841. foreach (char c; sequence)
  842. sink.write(c);
  843. sink.write('"');
  844. sink.write(`,"qual":`);
  845. sink.writeJson(base_qualities);
  846. sink.write(`,"tags":{`);
  847. bool not_first = false;
  848. foreach (k, v; this) {
  849. if (not_first)
  850. sink.write(',');
  851. sink.writeJson(k);
  852. sink.write(':');
  853. v.toJson(sink);
  854. not_first = true;
  855. }
  856. sink.write("}}");
  857. }
  858. /// ditto
  859. string toJson()() const {
  860. auto w = appender!(char[])();
  861. toJson((const(char)[] s) { w.put(s); });
  862. return cast(string)w.data;
  863. }
  864. /// Associates read with BAM reader. This is done automatically
  865. /// if this read is obtained through BamReader/Reference methods.
  866. void associateWithReader(bio.bam.abstractreader.IBamSamReader reader) {
  867. _reader = reader;
  868. }
  869. /// Associated BAM/SAM reader.
  870. bio.bam.abstractreader.IBamSamReader reader() @property {
  871. return _reader;
  872. }
  873. package ubyte[] _chunk; // holds all the data,
  874. // the access is organized via properties
  875. // (see below)
  876. private:
  877. // by specs, name ends with '\0'
  878. // let's use this byte for something useful!
  879. //
  880. // (Of course this places some restrictions on usage,
  881. // but allows to reduce size of record.)
  882. bool _is_slice() @property const {
  883. return cast(bool)_chunk[_cigar_offset - 1];
  884. }
  885. void _is_slice(bool is_slice) @property {
  886. _chunk[_cigar_offset - 1] = is_slice ? 1 : 0;
  887. }
  888. IBamSamReader _reader;
  889. // Official field names from SAM/BAM specification.
  890. // For internal use only
  891. @property int _refID() const nothrow {
  892. return *(cast( int*)(_chunk.ptr + int.sizeof * 0));
  893. }
  894. @property int _pos() const nothrow {
  895. return *(cast( int*)(_chunk.ptr + int.sizeof * 1));
  896. }
  897. @property uint _bin_mq_nl() const nothrow pure @system {
  898. return *(cast(uint*)(_chunk.ptr + int.sizeof * 2));
  899. }
  900. @property uint _flag_nc() const nothrow {
  901. return *(cast(uint*)(_chunk.ptr + int.sizeof * 3));
  902. }
  903. @property int _l_seq() const nothrow {
  904. return *(cast( int*)(_chunk.ptr + int.sizeof * 4));
  905. }
  906. @property int _next_refID() const nothrow {
  907. return *(cast( int*)(_chunk.ptr + int.sizeof * 5));
  908. }
  909. @property int _next_pos() const nothrow {
  910. return *(cast( int*)(_chunk.ptr + int.sizeof * 6));
  911. }
  912. @property int _tlen() const nothrow {
  913. return *(cast( int*)(_chunk.ptr + int.sizeof * 7));
  914. }
  915. // Setters, also only for internal use
  916. @property void _refID(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 0)) = n; }
  917. @property void _pos(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 1)) = n; }
  918. @property void _bin_mq_nl(uint n) { *(cast(uint*)(_chunk.ptr + int.sizeof * 2)) = n; }
  919. @property void _flag_nc(uint n) { *(cast(uint*)(_chunk.ptr + int.sizeof * 3)) = n; }
  920. @property void _l_seq(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 4)) = n; }
  921. @property void _next_refID(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 5)) = n; }
  922. @property void _next_pos(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 6)) = n; }
  923. @property void _tlen(int n) { *(cast( int*)(_chunk.ptr + int.sizeof * 7)) = n; }
  924. // Additional useful properties, also from SAM/BAM specification
  925. //
  926. // The layout of bin_mq_nl and flag_nc is as follows
  927. // (upper bits -------> lower bits):
  928. //
  929. // bin_mq_nl [ { bin (16b) } { mapping quality (8b) } { read name length (8b) } ]
  930. //
  931. // flag_nc [ { flag (16b) } { n_cigar_op (16b) } ]
  932. //
  933. @property ushort _bin() const nothrow {
  934. return _bin_mq_nl >> 16;
  935. }
  936. @property ubyte _mapq() const nothrow {
  937. return (_bin_mq_nl >> 8) & 0xFF;
  938. }
  939. @property ubyte _l_read_name() const nothrow pure {
  940. return _bin_mq_nl & 0xFF;
  941. }
  942. @property ushort _flag() const nothrow {
  943. return _flag_nc >> 16;
  944. }
  945. @property ushort _n_cigar_op() const nothrow {
  946. return _flag_nc & 0xFFFF;
  947. }
  948. // Setters for those properties
  949. @property void _bin(ushort n) { _bin_mq_nl = (_bin_mq_nl & 0xFFFF) | (n << 16); }
  950. @property void _mapq(ubyte n) { _bin_mq_nl = (_bin_mq_nl & ~0xFF00) | (n << 8); }
  951. @property void _l_read_name(ubyte n) { _bin_mq_nl = (_bin_mq_nl & ~0xFF ) | n; }
  952. @property void _flag(ushort n) { _flag_nc = (_flag_nc & 0xFFFF) | (n << 16); }
  953. @property void _n_cigar_op(ushort n) { _flag_nc = (_flag_nc & ~0xFFFF) | n; }
  954. // Offsets of various arrays in bytes.
  955. // Currently, are computed each time, so if speed will be an issue,
  956. // they can be made fields instead of properties.
  957. @property size_t _read_name_offset() const nothrow pure {
  958. return 8 * int.sizeof;
  959. }
  960. @property size_t _cigar_offset() const nothrow pure {
  961. return _read_name_offset + _l_read_name * char.sizeof;
  962. }
  963. @property size_t _seq_offset() const nothrow {
  964. return _cigar_offset + _n_cigar_op * uint.sizeof;
  965. }
  966. @property size_t _qual_offset() const nothrow {
  967. return _seq_offset + (_l_seq + 1) / 2;
  968. }
  969. // Offset of auxiliary data
  970. @property size_t _tags_offset() const nothrow {
  971. return _qual_offset + _l_seq;
  972. }
  973. // Sets n-th flag bit to boolean value b.
  974. void _setFlag(int n, bool b) {
  975. assert(n < 16);
  976. // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching
  977. ushort mask = cast(ushort)(1 << n);
  978. _flag = (_flag & ~mask) | ((-cast(int)b) & mask);
  979. }
  980. // If _chunk is still a slice, not an array, duplicate it.
  981. // Used when some part of alignment record is modified by user.
  982. //
  983. // Basically, it's sort of copy-on-write: a lot of read-only alignments
  984. // may point to the same location, but every modified one allocates its
  985. // own chunk of memory.
  986. void _dup() {
  987. if (_is_slice) {
  988. _chunk = _chunk.dup;
  989. _is_slice = false;
  990. }
  991. }
  992. // Calculates bin number.
  993. void _recalculate_bin() {
  994. _bin = reg2bin(position, position + basesCovered());
  995. }
  996. }
  997. /// Lazy tag storage.
  998. ///
  999. /// Provides hash-like access and opportunity to iterate
  1000. /// storage like an associative array.
  1001. mixin template TagStorage() {
  1002. // Provides access to chunk of memory which contains tags.
  1003. // This way, every time _tags_offset gets updated
  1004. // (due to update of cigar string/read name/sequence and memory move),
  1005. // the change is reflected automatically in tag storage.
  1006. private @property const(ubyte)[] _tags_chunk() const {
  1007. return _chunk[_tags_offset .. $];
  1008. }
  1009. /// Hash-like access to tags. Time complexity is $(BIGOH number of tags).
  1010. /// $(BR)
  1011. /// If tag with such $(I key) is not found, returned value 'is nothing'.
  1012. /// $(BR)
  1013. /// If key length is different from 2, exception is thrown.
  1014. /// $(BR)
  1015. /// Special case when $(I value) represents nothing is used for removing tag
  1016. /// (assuming that no more than one with this key is presented in the record).
  1017. ///
  1018. /// Examples:
  1019. /// ----------------------------
  1020. /// auto v = read["NM"];
  1021. /// assert(v.is_integer);
  1022. ///
  1023. /// auto v = read["MN"];
  1024. /// assert(v.is_nothing); // no such tag
  1025. ///
  1026. /// read["NM"] = 3; // converted to bio.bam.tagvalue.Value implicitly
  1027. ///
  1028. /// read["NM"] = null; // removes tag
  1029. /// assert(read["NM"].is_nothing);
  1030. /// ----------------------------
  1031. bio.bam.tagvalue.Value opIndex(string key) const {
  1032. enforce(key.length == 2, "Key length must be 2");
  1033. auto __tags_chunk = _tags_chunk; // _tags_chunk is evaluated lazily
  1034. if (__tags_chunk.length < 4)
  1035. return Value(null);
  1036. size_t offset = 0;
  1037. while (offset + 1 < __tags_chunk.length) {
  1038. if (__tags_chunk[offset .. offset + 2] == key) {
  1039. offset += 2;
  1040. return readValue(offset, __tags_chunk);
  1041. } else {
  1042. offset += 2;
  1043. skipValue(offset, __tags_chunk);
  1044. }
  1045. }
  1046. return Value(null);
  1047. }
  1048. /// ditto
  1049. void opIndexAssign(T)(T value, string key)
  1050. if (is(T == Value) || __traits(compiles, GetTypeId!T))
  1051. {
  1052. static if(is(T == Value)) {
  1053. enforce(key.length == 2, "Key length must be 2");
  1054. auto __tags_chunk = _tags_chunk;
  1055. _dup();
  1056. size_t offset = 0;
  1057. while (offset + 1 < __tags_chunk.length) {
  1058. if (__tags_chunk[offset .. offset + 2] == key) {
  1059. if (value.is_nothing) {
  1060. // special case - remove tag
  1061. removeValueAt(offset);
  1062. } else {
  1063. replaceValueAt(offset + 2, value);
  1064. }
  1065. return;
  1066. } else {
  1067. offset += 2;
  1068. skipValue(offset, __tags_chunk);
  1069. }
  1070. }
  1071. if (!value.is_nothing)
  1072. appendTag(key, value);
  1073. } else {
  1074. opIndexAssign(Value(value), key);
  1075. }
  1076. }
  1077. /// Append new tag to the end, skipping check if it already exists. $(BIGOH 1)
  1078. void appendTag(string key, Value value) {
  1079. auto oldlen = _chunk.length;
  1080. _chunk.length = _chunk.length + sizeInBytes(value) + 2 * char.sizeof;
  1081. _chunk[oldlen .. oldlen + 2] = cast(ubyte[])key;
  1082. emplaceValue(_chunk.ptr + oldlen + 2, value);
  1083. }
  1084. /// Remove all tags
  1085. void clearAllTags() {
  1086. _chunk.length = _tags_offset;
  1087. }
  1088. /// Number of tags. $(BIGOH number of tags)
  1089. size_t tagCount() {
  1090. size_t result = 0;
  1091. size_t offset = 0;
  1092. auto __tags_chunk = _tags_chunk;
  1093. while (offset + 1 < __tags_chunk.length) {
  1094. offset += 2;
  1095. skipValue(offset, __tags_chunk);
  1096. result += 1;
  1097. }
  1098. return result;
  1099. }
  1100. // replace existing tag
  1101. private void replaceValueAt(size_t offset, Value value) {
  1102. // offset points to the beginning of the value
  1103. auto begin = offset;
  1104. auto __tags_chunk = _tags_chunk;
  1105. skipValue(offset, __tags_chunk); // now offset is updated and points to the end
  1106. auto end = offset;
  1107. prepareSlice(_chunk, __tags_chunk[begin .. end], sizeInBytes(value));
  1108. emplaceValue(_chunk.ptr + _tags_offset + begin, value);
  1109. }
  1110. // remove existing tag
  1111. private void removeValueAt(size_t begin) {
  1112. // offset points to the beginning of the value
  1113. auto offset = begin + 2;
  1114. auto __tags_chunk = _tags_chunk;
  1115. skipValue(offset, __tags_chunk);
  1116. auto end = offset;
  1117. // this does the job (see prepareSlice code)
  1118. prepareSlice(_chunk, __tags_chunk[begin .. end], 0);
  1119. }
  1120. /// Provides opportunity to iterate over tags.
  1121. int opApply(scope int delegate(const ref string k, const ref Value v) dg) const {
  1122. size_t offset = 0;
  1123. auto __tags_chunk = _tags_chunk;
  1124. while (offset + 1 < __tags_chunk.length) {
  1125. auto key = cast(string)__tags_chunk[offset .. offset + 2];
  1126. offset += 2;
  1127. auto val = readValue(offset, __tags_chunk);
  1128. auto res = dg(key, val);
  1129. if (res != 0) {
  1130. return res;
  1131. }
  1132. }
  1133. return 0;
  1134. }
  1135. /// Returns the number of tags. Time complexity is $(BIGOH number of tags)
  1136. size_t tagCount() const {
  1137. size_t res = 0;
  1138. size_t offset = 0;
  1139. auto __tags_chunk = _tags_chunk;
  1140. while (offset + 1 < __tags_chunk.length) {
  1141. offset += 2;
  1142. skipValue(offset, __tags_chunk);
  1143. res += 1;
  1144. }
  1145. return res;
  1146. }
  1147. private void writeTags(BamWriter writer) {
  1148. if (std.system.endian == Endian.littleEndian) {
  1149. writer.writeByteArray(_tags_chunk[]);
  1150. } else {
  1151. fixTagStorageByteOrder();
  1152. writer.writeByteArray(_tags_chunk[]);
  1153. fixTagStorageByteOrder();
  1154. }
  1155. }
  1156. // Reads value which starts from (_tags_chunk.ptr + offset) address,
  1157. // and updates offset to the end of value. O(1)
  1158. private Value readValue(ref size_t offset, const(ubyte)[] tags_chunk) const {
  1159. string readValueArrayTypeHelper() {
  1160. char[] cases;
  1161. foreach (c2t; ArrayElementTagValueTypes) {
  1162. cases ~=
  1163. "case '"~c2t.ch~"':".dup~
  1164. " auto begin = offset;"~
  1165. " auto end = offset + length * "~c2t.ValueType.stringof~".sizeof;"~
  1166. " offset = end;"~
  1167. " return Value(cast("~c2t.ValueType.stringof~"[])(tags_chunk[begin .. end]));";
  1168. }
  1169. return to!string("switch (elem_type) {" ~ cases ~
  1170. " default: throw new UnknownTagTypeException(to!string(elem_type));"~
  1171. "}");
  1172. }
  1173. string readValuePrimitiveTypeHelper() {
  1174. char[] cases;
  1175. foreach (c2t; PrimitiveTagValueTypes) {
  1176. cases ~= "case '"~c2t.ch~"':"~
  1177. " auto p = tags_chunk.ptr + offset;"~
  1178. " auto value = *(cast("~c2t.ValueType.stringof~"*)p);"~
  1179. " offset += value.sizeof;"~
  1180. " return Value(value);".dup;
  1181. }
  1182. return to!string("switch (type) {" ~ cases ~
  1183. " default: throw new UnknownTagTypeException(to!string(type));"~
  1184. "}");
  1185. }
  1186. char type = cast(char)tags_chunk[offset++];
  1187. if (type == 'Z' || type == 'H') {
  1188. auto begin = offset;
  1189. while (tags_chunk[offset++] != 0) {}
  1190. // return string with stripped '\0'
  1191. auto v = Value(cast(string)tags_chunk[begin .. offset - 1]);
  1192. if (type == 'H') {
  1193. v.setHexadecimalFlag();
  1194. }
  1195. return v;
  1196. } else if (type == 'B') {
  1197. char elem_type = cast(char)tags_chunk[offset++];
  1198. uint length = *(cast(uint*)(tags_chunk.ptr + offset));
  1199. offset += uint.sizeof;
  1200. mixin(readValueArrayTypeHelper());
  1201. } else {
  1202. mixin(readValuePrimitiveTypeHelper());
  1203. }
  1204. }
  1205. // Increases offset so that it points to the next value. O(1).
  1206. private void skipValue(ref size_t offset, const(ubyte)[] tags_chunk) const {
  1207. char type = cast(char)tags_chunk[offset++];
  1208. if (type == 'Z' || type == 'H') {
  1209. while (tags_chunk[offset++] != 0) {}
  1210. } else if (type == 'B') {
  1211. char elem_type = cast(char)tags_chunk[offset++];
  1212. auto length = *(cast(uint*)(tags_chunk.ptr + offset));
  1213. offset += uint.sizeof + charToSizeof(elem_type) * length;
  1214. } else {
  1215. offset += charToSizeof(type);
  1216. }
  1217. }
  1218. /*
  1219. Intended to be used in constructor for initial endianness fixing
  1220. in case the library is used on big-endian system.
  1221. NOT TESTED AT ALL!!!
  1222. */
  1223. private void fixTagStorageByteOrder() {
  1224. /* TODO: TEST ON BIG-ENDIAN SYSTEM!!! */
  1225. const(ubyte)* p = _tags_chunk.ptr;
  1226. const(ubyte)* end = p + _chunk.length;
  1227. while (p < end) {
  1228. p += 2; // skip tag name
  1229. char type = *(cast(char*)p);
  1230. ++p; // skip type
  1231. if (type == 'Z' || type == 'H') {
  1232. while (*p != 0) { // zero-terminated
  1233. ++p; // string
  1234. }
  1235. ++p; // skip '\0'
  1236. } else if (type == 'B') { // array
  1237. char elem_type = *(cast(char*)p);
  1238. uint size = charToSizeof(elem_type);
  1239. switchEndianness(p, uint.sizeof);
  1240. uint length = *(cast(uint*)p);
  1241. p += uint.sizeof; // skip length
  1242. if (size != 1) {
  1243. for (auto j = 0; j < length; j++) {
  1244. switchEndianness(p, size);
  1245. p += size;
  1246. }
  1247. } else {
  1248. // skip
  1249. p += length;
  1250. }
  1251. } else {
  1252. uint size = charToSizeof(type);
  1253. if (size != 1) {
  1254. switchEndianness(p, size);
  1255. p += size;
  1256. } else {
  1257. ++p;
  1258. }
  1259. }
  1260. }
  1261. }
  1262. }
  1263. unittest {
  1264. import bio.bam.utils.tagstoragebuilder;
  1265. import std.algorithm;
  1266. import std.stdio;
  1267. import std.math;
  1268. writeln("Testing BamRead behaviour...");
  1269. auto read = BamRead("readname",
  1270. "AGCTGACTACGTAATAGCCCTA",
  1271. [CigarOperation(22, 'M')]);
  1272. assert(read.sequence_length == 22);
  1273. assert(read.cigar.length == 1);
  1274. assert(read.cigarString() == "22M");
  1275. assert(read.name == "readname");
  1276. assert(equal(read.sequence(), "AGCTGACTACGTAATAGCCCTA"));
  1277. read.name = "anothername";
  1278. assert(read.name == "anothername");
  1279. assert(read.cigarString() == "22M");
  1280. read.base_qualities = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
  1281. 13, 14, 15, 16, 17, 18, 19, 20, 21, 22];
  1282. assert(reduce!"a+b"(0, read.base_qualities) == 253);
  1283. read["RG"] = 15;
  1284. assert(read["RG"] == 15);
  1285. read["X1"] = [1, 2, 3, 4, 5];
  1286. assert(read["X1"] == [1, 2, 3, 4, 5]);
  1287. read.cigar = [CigarOperation(20, 'M'), CigarOperation(2, 'X')];
  1288. assert(read.cigarString() == "20M2X");
  1289. read["RG"] = cast(float)5.6;
  1290. assert(approxEqual(to!float(read["RG"]), 5.6));
  1291. read.sequence = "AGCTGGCTACGTAATAGCCCT";
  1292. assert(read.sequence_length == 21);
  1293. assert(read.base_qualities.length == 21);
  1294. assert(read.base_qualities[20] == 255);
  1295. assert(equal(read.sequence(), "AGCTGGCTACGTAATAGCCCT"));
  1296. assert(retro(read.sequence)[2] == 'C');
  1297. assert(retro(read.sequence)[0] == 'T');
  1298. assert(read.sequence[4] == 'G');
  1299. assert(read.sequence[0] == 'A');
  1300. assert(equal(read.sequence[0..8], "AGCTGGCT"));
  1301. assert(equal(read.sequence[3..5], "TG"));
  1302. assert(equal(read.sequence[3..9][1..4], "GGC"));
  1303. read["X1"] = 42;
  1304. assert(read["X1"] == 42);
  1305. assert(read.tagCount() == 2);
  1306. read["X1"] = null;
  1307. assert(read["X1"].is_nothing);
  1308. assert(read.tagCount() == 1);
  1309. read.sequence = "GTAAGCTGGCACTAGCAGCCT";
  1310. read.cigar = [CigarOperation(read.sequence_length, 'M')];
  1311. read["RG"] = null;
  1312. read["RG"] = "readgroup1";
  1313. assert(read.tagCount() == 1);
  1314. read["RG"] = null;
  1315. assert(read.tagCount() == 0);
  1316. // Test tagstoragebuilder
  1317. auto builder = new TagStorageBuilder();
  1318. builder.put("X0", Value(24));
  1319. builder.put("X1", Value("abcd"));
  1320. builder.put("X2", Value([1,2,3]));
  1321. read = BamRead("readname",
  1322. "AGCTGACTACGTAATAGCCCTA",
  1323. [CigarOperation(22, 'M')],
  1324. builder.data);
  1325. assert(read["X0"] == 24);
  1326. assert(read["X1"] == "abcd");
  1327. assert(read["X2"] == [1,2,3]);
  1328. assert(read.tagCount() == 3);
  1329. // Test MsgPack serialization/deserialization
  1330. {
  1331. import std.typecons;
  1332. auto packer = bio.bam.thirdparty.msgpack.packer(Appender!(ubyte[])());
  1333. read.toMsgpack(packer);
  1334. auto data = packer.stream.data;
  1335. auto rec = unpack(data).via.array;
  1336. assert(rec[0] == "readname");
  1337. assert(rec[5].as!(int[]) == [22]);
  1338. assert(rec[6].as!(ubyte[]) == ['M']);
  1339. assert(rec[10].as!(ubyte[]) == to!string(read.sequence));
  1340. }
  1341. read.clearAllTags();
  1342. assert(read.tagCount() == 0);
  1343. }
  1344. /// $(P BamRead wrapper which precomputes $(D end_position) = $(D position) + $(D basesCovered()).)
  1345. ///
  1346. /// $(P Computation of basesCovered() takes quite a few cycles. Therefore in places where this
  1347. /// property is frequently accessed, it makes sense to precompute it for later use.)
  1348. ///
  1349. /// $(P The idea is that this should be a drop-in replacement for BamRead in algorithms,
  1350. /// as the struct uses 'alias this' construction for the wrapped read.)
  1351. struct EagerBamRead {
  1352. ///
  1353. this(BamRead read) {
  1354. this.read = read;
  1355. this.end_position = read.position + read.basesCovered();
  1356. }
  1357. ///
  1358. BamRead read;
  1359. ///
  1360. alias read this;
  1361. /// End position on the reference, computed as position + basesCovered().
  1362. int end_position;
  1363. ///
  1364. EagerBamRead dup() @property const {
  1365. return EagerBamRead(read.dup);
  1366. }
  1367. }
  1368. static assert(is(EagerBamRead : BamRead));
  1369. /// Checks if $(D T) behaves like $(D BamRead)
  1370. template isBamRead(T)
  1371. {
  1372. static if (is(Unqual!T : BamRead))
  1373. enum isBamRead = true;
  1374. else
  1375. enum isBamRead = __traits(compiles,
  1376. {
  1377. T t; bool p;
  1378. p = t.ref_id == 1; p = t.position == 2; p = t.bin.id == 3;
  1379. p = t.mapping_quality == 4; p = t.flag == 5; p = t.sequence_length == 6;
  1380. p = t.mate_ref_id == 7; p = t.mate_position == 8; p = t.template_length == 9;
  1381. p = t.is_paired; p = t.proper_pair; p = t.is_unmapped;
  1382. p = t.mate_is_unmapped; p = t.mate_is_reverse_strand; p = t.is_first_of_pair;
  1383. p = t.is_second_of_pair; p = t.is_secondary_alignment; p = t.failed_quality_control;
  1384. p = t.is_duplicate; p = t.strand == '+'; p = t.name == "";
  1385. p = t.cigar[0].type == 'M'; p = t.basesCovered() > 42; p = t.cigarString() == "";
  1386. p = t.sequence[0] == 'A'; p = t.base_qualities[0] == 0;
  1387. });
  1388. }
  1389. /// $(P Comparison function for 'queryname' sorting order
  1390. /// (return whether first read is 'less' than second))
  1391. ///
  1392. /// $(P This function can be called on:
  1393. /// $(UL
  1394. /// $(LI two reads)
  1395. /// $(LI read and string in any order)))
  1396. bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
  1397. if (isBamRead!R1 && isBamRead!R2)
  1398. {
  1399. return a1.name < a2.name;
  1400. }
  1401. bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
  1402. if (isBamRead!R1 && isSomeString!R2)
  1403. {
  1404. return a1.name < a2;
  1405. }
  1406. bool compareReadNames(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
  1407. if (isSomeString!R1 && isBamRead!R2)
  1408. {
  1409. return a1 < a2.name;
  1410. }
  1411. /// $(P Comparison function for 'coordinate' sorting order
  1412. /// (returns whether first read is 'less' than second))
  1413. ///
  1414. /// $(P This function can be called on:
  1415. /// $(UL
  1416. /// $(LI two reads (in this case, reference IDs are also taken into account))
  1417. /// $(LI read and integer in any order)))
  1418. bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
  1419. if (isBamRead!R1 && isBamRead!R2)
  1420. {
  1421. if (a1.ref_id == -1) return false; // unmapped reads should be last
  1422. if (a2.ref_id == -1) return true;
  1423. if (a1.ref_id < a2.ref_id) return true;
  1424. if (a1.ref_id > a2.ref_id) return false;
  1425. if (a1.position < a2.position) return true;
  1426. return false;
  1427. }
  1428. bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
  1429. if (isBamRead!R1 && isIntegral!R2)
  1430. {
  1431. return a1.position < a2;
  1432. }
  1433. bool compareCoordinates(R1, R2)(const auto ref R1 a1, const auto ref R2 a2)
  1434. if (isIntegral!R1 && isBamRead!R2)
  1435. {
  1436. return a1 < a2.position;
  1437. }
  1438. static assert(isTwoWayCompatible!(compareReadNames, BamRead, string));
  1439. static assert(isTwoWayCompatible!(compareCoordinates, BamRead, int));