You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

321 lines
9.7 KiB

  1. /*
  2. This file is part of BioD.
  3. Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
  4. Permission is hereby granted, free of charge, to any person obtaining a
  5. copy of this software and associated documentation files (the "Software"),
  6. to deal in the Software without restriction, including without limitation
  7. the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. and/or sell copies of the Software, and to permit persons to whom the
  9. Software is furnished to do so, subject to the following conditions:
  10. The above copyright notice and this permission notice shall be included in
  11. all copies or substantial portions of the Software.
  12. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  17. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  18. DEALINGS IN THE SOFTWARE.
  19. */
  20. module bio.bam.iontorrent.flowcall;
  21. import bio.bam.tagvalue;
  22. import bio.bam.iontorrent.flowindex;
  23. import bio.core.base;
  24. import bio.core.utils.range;
  25. import std.array;
  26. import std.typecons;
  27. import std.range;
  28. import std.algorithm;
  29. import std.exception;
  /// Identifies the BAM tag from which flow signal intensities are taken.
  enum FlowGramTag : ubyte {
      FZ = 0, /// intensities come from the FZ tag
      ZM = 1  /// intensities come from the ZM tag
  }
  /// Scale factor applied to intensity values stored under the given tag:
  /// 100 for FZ, 256 for any other tag value (i.e. ZM).
  float multiplier(FlowGramTag tag) {
      if (tag == FlowGramTag.FZ) {
          return 100.0;
      }
      return 256.0;
  }
  /// A single flow: the nucleotide that was flowed together with the
  /// signal measured for it.
  struct FlowCall {
      private {
          // raw (scaled) intensity, exactly as passed to the constructor
          short _signal_intensity;

          // Both the base code and the tag must fit into one nibble each,
          // otherwise the packing below stops working.
          static assert(Base.ValueSetSize <= 16 && FlowGramTag.max < 16,
                        "implementation of FlowCall should be changed?");

          // packed byte: high nibble = tag, low nibble = base internal code
          ubyte _storage;

          // Decode the nucleotide from the low nibble.
          Base _base() @property const {
              return Base.fromInternalCode(_storage & 0xF);
          }

          // Replace the low nibble with the code of `b`, keeping the tag.
          void _base(Base b) @property {
              _storage = cast(ubyte)((_storage & 0xF0) | b.internal_code);
          }

          // Decode the tag from the high nibble.
          FlowGramTag _tag() @property const {
              return cast(FlowGramTag)(_storage >> 4);
          }

          // Replace the high nibble with `tag`, keeping the base.
          void _tag(FlowGramTag tag) @property {
              _storage = cast(ubyte)((_storage & 0xF) | (tag << 4));
          }

          // Build a flow call from a raw intensity, a base and the source tag.
          this(short signal_intensity, Base b, FlowGramTag tag) {
              _signal_intensity = signal_intensity;
              _storage = cast(ubyte)((tag << 4) | b.internal_code);
          }
      }

      /// Nucleotide
      Base base() @property const {
          return _base;
      }

      /// Signal intensity, normalized to homopolymer lengths
      /// (the raw value divided by the multiplier of the source tag).
      float intensity() @property const {
          immutable scale = multiplier(_tag);
          return _signal_intensity / scale;
      }

      /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tag,
      /// and 256.0 for ZM tag.
      /// More efficient, because this is how intensities are stored in FZ/ZM tag.
      short intensity_value() @property const {
          return _signal_intensity;
      }
  }
  /// Flow call associated with a read: a FlowCall plus the position and
  /// length of the called homopolymer and the index of the flow.
  struct ReadFlowCall {
      private {
          FlowCall _fc;        // base, tag and raw intensity
          ushort _offset;      // value reported by offset()
          ushort _called_len;  // value reported by length()
          ushort _flow_index;  // value reported by flow_index()

          // Assemble a read flow call from its components.
          this(Base b, short signal_intensity, ushort offset,
               ushort called, ushort flow_index, FlowGramTag tag)
          {
              _flow_index = flow_index;
              _called_len = called;
              _offset = offset;
              _fc = FlowCall(signal_intensity, b, tag);
          }
      }

      /// Called nucleotide
      Base base() @property const {
          return _fc.base;
      }

      /// Set base to its complement
      void complement() {
          auto flipped = _fc._base.complement;
          _fc._base = flipped;
      }

      /// Called homopolymer length
      ushort length() @property const {
          return _called_len;
      }

      /// Zero-based position of the first nucleotide in the run,
      /// relative to start of the read. Takes strandness into account.
      ushort offset() @property const {
          return _offset;
      }

      /// Signal intensity, normalized to homopolymer lengths
      float intensity() @property const {
          return _fc.intensity;
      }

      /// round(intensity * Multiplier) where Multiplier is 100.0 for FZ tags,
      /// and 256.0 for ZM tags.
      /// More efficient, because this is how intensities are stored in FZ/ZM tag.
      short intensity_value() @property const {
          return _fc.intensity_value;
      }

      /// Flow index (0-based)
      size_t flow_index() @property const {
          return _flow_index;
      }
  }
  /// Get flow calls from signal intensities and flow order.
  ///
  /// Pairs each intensity with the flow-order character at the same position
  /// and tags every resulting FlowCall with `tag`. The result is a lazy range.
  auto flowCalls(short[] intensities, string flow_order, FlowGramTag tag) {
      // Converts one (intensity, nucleotide character, tag) tuple to a FlowCall.
      static FlowCall toFlowCall(Triple)(Triple t) {
          return FlowCall(t[0], Base(t[1]), t[2]);
      }

      auto triples = zip(intensities, flow_order, repeat(tag));
      return triples.map!toFlowCall;
  }
  /// Range of ReadFlowCall values produced by walking a read sequence one
  /// homopolymer run at a time and locating, for each run, the next flow in
  /// the flow order whose nucleotide matches.
  ///
  /// Params (constructor):
  ///     seq                = read sequence (a non-class range; consumed from the
  ///                          back when reverse_strand is set)
  ///     intensities        = raw flow intensities, indexed like flow_order
  ///     reverse_strand     = walk the sequence backwards and complement bases
  ///     flow_order         = flow order string, indexed like intensities
  ///     first_base_overlap = amount subtracted from the intensity of the
  ///                          first reported flow only
  ///     zf                 = value added to the internal flow index when
  ///                          reporting ReadFlowCall.flow_index
  ///     tag                = tag the intensities were taken from
  struct ReadFlowCallRange(S)
      if (!is(S == class))
  {
      private {
          string _flow_order = void;
          short[] _intensities = void;
          bool _rev = void;
          S _sequence = void;
          int _zf = void;
          Base _current_base = void;
          ushort _current_length = void;
          size_t _current_flow_index;
          ushort _current_offset;
          ushort _overlap = void;
          FlowGramTag _tag = void;
          bool _empty = false;

          // consumes next homopolymer from the sequence,
          // and updates _current_base, _current_flow_index,
          // _current_length appropriately
          void _doSetup() {
              if (_sequence.empty) {
                  _empty = true;
                  return;
              }

              _current_length = 1;

              // setup current base and current length
              if (!_rev) {
                  _current_base = _sequence.front;
                  _sequence.popFront();
                  while (!_sequence.empty && _sequence.front == _current_base) {
                      _sequence.popFront();
                      ++_current_length;
                  }
              } else {
                  // The run is compared in its stored (uncomplemented) form,
                  // so the complement is applied only after it is consumed.
                  _current_base = _sequence.back;
                  _sequence.popBack();
                  while (!_sequence.empty && _sequence.back == _current_base) {
                      _sequence.popBack();
                      ++_current_length;
                  }
                  _current_base = _current_base.complement;
              }

              // setup current flow index: advance to the next flow whose
              // nucleotide equals the current base (or to the end if none does;
              // front() reports that case as an error)
              for ( ; _current_flow_index < _flow_order.length; ++_current_flow_index) {
                  if (_flow_order[_current_flow_index] == _current_base) {
                      break;
                  }
              }
          }
      }

      this(S seq, short[] intensities, bool reverse_strand,
           string flow_order, ushort first_base_overlap, int zf, FlowGramTag tag)
      {
          _sequence = seq;
          _intensities = intensities;
          _rev = reverse_strand;
          _flow_order = flow_order;
          _zf = zf;
          _overlap = first_base_overlap;
          _tag = tag;

          // _doSetup() marks the range empty itself when the sequence is empty.
          _doSetup();
      }

      /// True when all homopolymer runs of the sequence have been consumed.
      bool empty() @property const {
          return _empty;
      }

      /// Flow call for the current homopolymer run.
      ///
      /// Throws: Exception if no intensity exists for the current flow index.
      ReadFlowCall front() @property const {
          enforce(_current_flow_index < _intensities.length,
                  "Inconsistency between FZ/ZM tag and read bases");

          // Truncated to 16 bits, matching the storage type of FlowCall.
          auto intensity = cast(short)(_intensities[_current_flow_index] - _overlap);

          return ReadFlowCall(_current_base, intensity, _current_offset,
                              _current_length,
                              cast(ushort)(_current_flow_index + _zf), _tag);
      }

      /// Advance to the next homopolymer run.
      void popFront() {
          _current_offset += _current_length;
          ++_current_flow_index;
          _overlap = 0; // after first base it is always zero
          _doSetup();
      }

      /// Independent copy of the range in its current state.
      ReadFlowCallRange!S save() @property {
          ReadFlowCallRange!S r = this;
          // A plain struct copy is not guaranteed to detach the underlying
          // sequence; use its own save() whenever it is a forward range.
          static if (isForwardRange!S) {
              r._sequence = _sequence.save;
          }
          return r;
      }
  }
  // Convenience factory: lets the sequence type S be inferred from `seq`
  // instead of being spelled out at the call site.
  private ReadFlowCallRange!S readFlowCallRange(S)(S seq, short[] intensities, bool rev,
                                                   string flow_order, ushort overlap, int zf,
                                                   FlowGramTag tag)
  {
      return typeof(return)(seq, intensities, rev, flow_order, overlap, zf, tag);
  }
  /// Get read flow calls. Takes ZF tag and strandness into account.
  ///
  /// Tag name is an optional argument because it is not standard and will likely
  /// be changed in the future (there was a proposal on samtools mailing list
  /// to introduce standard FB tag).
  ///
  /// Params:
  ///     read         = read providing tag lookup by name, `sequence` and
  ///                    `is_reverse_strand`
  ///     flow_order   = full flow order string
  ///     key_sequence = key sequence; its trailing bases may overlap with the
  ///                    first called base of the read
  ///     tag          = name of the tag holding the flow start offset
  ///
  /// Returns: a ReadFlowCallRange over the read sequence.
  ///
  /// Throws: Exception if neither FZ nor ZM is present, or if the flow start
  ///         offset lies outside the flow order / intensity array.
  auto readFlowCalls(R)(R read, string flow_order, string key_sequence, string tag="ZF") {
      auto zf = cast(int)read[tag];
      auto fz_value = read["FZ"];
      auto zm_value = read["ZM"];

      enforce(!(fz_value.is_nothing && zm_value.is_nothing),
              "Neither FZ nor ZM tag is presented in a mapped read");

      // FZ is preferred whenever it is present.
      auto fg_tag = fz_value.is_nothing ? FlowGramTag.ZM : FlowGramTag.FZ;

      // NOTE(review): reinterprets the tag value in place as short[]; this relies
      // on the memory layout of the tag value type, which is not visible here.
      short[] flow_int = *cast(short[]*)(fg_tag == FlowGramTag.ZM ? &zm_value : &fz_value);

      // Validate the offset before slicing so that malformed input is reported
      // as a regular exception instead of an out-of-bounds slice.
      enforce(zf >= 0 && zf <= flow_order.length && zf <= flow_int.length,
              "Flow start offset is out of range of flow order or FZ/ZM tag");

      flow_order = flow_order[zf .. $];
      auto intensities = flow_int[zf .. $];

      // key sequence is required because its last base can overlap with first called base
      ushort overlap = 0;

      // An empty sequence has no first base to compare with; the resulting
      // range is simply empty, so front/back must not be touched.
      if (!read.sequence.empty) {
          Base5 base = read.is_reverse_strand ? read.sequence.back.complement
                                              : read.sequence.front;
          immutable per_base = cast(int)(multiplier(fg_tag));
          foreach_reverse (c; key_sequence) {
              if (c != base)
                  break;
              overlap += per_base;
          }
      }

      return readFlowCallRange(read.sequence, intensities, read.is_reverse_strand,
                               flow_order, overlap, zf, fg_tag);
  }