You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

540 lines
18 KiB

  1. /*
  2. This file is part of BioD.
  3. Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
  4. Permission is hereby granted, free of charge, to any person obtaining a
  5. copy of this software and associated documentation files (the "Software"),
  6. to deal in the Software without restriction, including without limitation
  7. the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. and/or sell copies of the Software, and to permit persons to whom the
  9. Software is furnished to do so, subject to the following conditions:
  10. The above copyright notice and this permission notice shall be included in
  11. all copies or substantial portions of the Software.
  12. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  17. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  18. DEALINGS IN THE SOFTWARE.
  19. */
  20. /// BAM records may carry arbitrary information in tags.
  21. /// $(BR)
  22. /// The $(D Value) type provides a convenient way to work with this information.
  23. ///
  24. /// Example:
  25. /// --------------------------------
  26. /// import bio.bam.reader, bio.bam.tagvalue;
  27. /// ...
  28. /// auto bam = new BamReader("file.bam");
  29. /// Value v = bam.reads.front["MD"];
  30. /// assert(v.is_string);
  31. /// v = 5;
  32. /// assert(v.is_signed); // because 5 is of type int which is signed
  33. /// assert(v == "5"); // converted to string and then compared
  34. /// v = "abc";
  35. /// assert(v.is_string);
  36. /// v = [1, 2, 3]; // integer and float arrays are supported
  37. /// assert(v.is_numeric_array);
  38. /// v = [1.5f, 2.3f, 17.0f]; // double[] arrays must be converted to float[]
  39. /// assert(v.is_numeric_array);
  40. /// v = 5.6;
  41. /// assert(v.is_float);
  42. /// v = -17;
  43. /// assert(v.is_signed);
  44. /// ----------------------------------
  45. module bio.bam.tagvalue;
  46. public import std.conv;
  47. import std.typetuple;
  48. import std.exception;
  49. import std.format;
  50. import std.array;
  51. import bio.core.utils.format;
  52. import bio.bam.thirdparty.msgpack;
  /*
    Compile-time pairing of a SAM/BAM type character with the D type
    that stores values of that kind.
  */
  struct CharToType(char c, T) {
      /**
        type which corresponds to the symbol
        according to SAM/BAM specification
      */
      alias T ValueType;

      /** symbol */
      enum char ch = c;
  }
  /**
    Thrown in case of unrecognized tag type.
  */
  class UnknownTagTypeException : Exception {
      /// Creates the exception; msg is passed unchanged to Exception.
      this(string msg) {
          super(msg);
      }
  }
  /*
    Element types allowed in numeric arrays: every primitive type
    except 'A' (char).
  */
  alias TypeTuple!(
      CharToType!('c', byte),
      CharToType!('C', ubyte),
      CharToType!('s', short),
      CharToType!('S', ushort),
      CharToType!('i', int),
      CharToType!('I', uint),
      CharToType!('f', float)
  ) ArrayElementTagValueTypes;

  /*
    All primitive tag value types: 'A' followed by the array element types.
    TypeTuple flattens the nested tuple, so this is a flat list of eight
    CharToType entries in the order A, c, C, s, S, i, I, f.
  */
  alias TypeTuple!(
      CharToType!('A', char),
      ArrayElementTagValueTypes
  ) PrimitiveTagValueTypes;

  /*
    String-like tag value types; both 'Z' and 'H' are stored as string.
  */
  alias TypeTuple!(
      CharToType!('Z', string),
      CharToType!('H', string)
  ) StringTagValueTypes;
  /*
    Useful in TagStorage implementations, for skipping elements.

    Params:
        c = primitive type identifier

    Returns: size of corresponding type in bytes

    Throws: UnknownTagTypeException (carrying c as its message) when c
            is not one of the identifiers in PrimitiveTagValueTypes
  */
  uint charToSizeof(char c) {
      switch (c) {
          // The foreach runs over a type tuple, so it is unrolled at
          // compile time into one case label per primitive type.
          foreach (entry; PrimitiveTagValueTypes) {
              case entry.ch:
                  return cast(uint) entry.ValueType.sizeof;
          }
          default:
              throw new UnknownTagTypeException(to!string(c));
      }
  }
  /*
    Associates a type with its ubyte identifier.

    (Currently, ubyte is enough, but that might change in the future.)
  */
  struct TypeId(T, ubyte id) {
      /// The D type being identified.
      alias T Type;

      /// Its numeric identifier.
      enum ubyte Id = id;
  }
  /*
    Structure of type identifier, from the least significant bit upwards:

      bit 0      0 = primitive           1 = array/string
      bit 1      0 = something           1 = null/nothing
      bit 2      0 = numeric             1 = char (primitive branch)
                                             or string (array/string branch)
      bit 3      0 = integer             1 = float
                 for strings:            0 = 'Z', 1 = 'H'
      bit 4      0 = unsigned            1 = signed
      bits 5-7   size in bytes of the type (for arrays: of the element)

    (TypeId >> 5) == elementType.sizeof

    Let's take uint as an example, 0b100_0_0_0_0_0, reading left to right:
      100  4 bytes
      0    unsigned
      0    integer
      0    numeric
      0    something
      0    primitive
  */
  alias TypeTuple!(
      TypeId!(char,         0b001_00_1_00),

      TypeId!(ubyte,        0b001_0_0000),
      TypeId!(ushort,       0b010_0_0000),
      TypeId!(uint,         0b100_0_0000),

      TypeId!(byte,         0b001_1_0000),
      TypeId!(short,        0b010_1_0000),
      TypeId!(int,          0b100_1_0000),

      TypeId!(float,        0b100_01_000),

      TypeId!(ubyte[],      0b001_000_01),
      TypeId!(ushort[],     0b010_000_01),
      TypeId!(uint[],       0b100_000_01),

      TypeId!(byte[],       0b001_100_01),
      TypeId!(short[],      0b010_100_01),
      TypeId!(int[],        0b100_100_01),

      TypeId!(float[],      0b100_01_001),

      // string appears twice: the first entry ('Z') is the one found when
      // looking a type up by staticIndexOf; the second is the 'H' variant.
      TypeId!(string,       0b001_00_101),
      TypeId!(string,       0b001_01_101),

      TypeId!(typeof(null), 0b0000_0010)
  ) TypeIdMap;

  // Identifier of the hexadecimal ('H') string variant; equals the second
  // string entry of TypeIdMap.
  private immutable hexStringTag = 0b001_01_101;
  // Projects a TypeId-like entry onto its Type member, so that it can be
  // mapped over TypeIdMap with staticMap.
  private template GetType(Entry) {
      alias Entry.Type GetType;
  }
  /// Get tag for type T.
  ///
  /// Looks T up among the types listed in TypeIdMap and yields the
  /// identifier of the first matching entry.
  /// Useful for comparison with tag field of Value struct.
  ///
  /// Example:
  /// -----------------------------------
  /// Value v = "zzz";
  /// assert(v.tag == GetTypeId!string);
  /// -----------------------------------
  template GetTypeId(T) {
      ///
      enum GetTypeId =
          TypeIdMap[
              staticIndexOf!(T, staticMap!(GetType, TypeIdMap))
          ].Id;
  }
  /*
    Builds the source text of the storage union that Value mixes in.

    The union gets one member per primitive type (named by its type
    character), one per string type ('Z', 'H') and one array member per
    array element type (named 'B' followed by the element character).
    The text ends with the declaration of a field `u` of that union type.
  */
  string generateUnion() {
      string decls = "union U {";

      foreach (entry; PrimitiveTagValueTypes)
          decls ~= entry.ValueType.stringof ~ " " ~ entry.ch ~ ";";

      foreach (entry; StringTagValueTypes)
          decls ~= entry.ValueType.stringof ~ " " ~ entry.ch ~ ";";

      foreach (entry; ArrayElementTagValueTypes)
          decls ~= entry.ValueType.stringof ~ "[] B" ~ entry.ch ~ ";";

      decls ~= "}; U u;";
      return decls;
  }
  // Maps an element type to the corresponding dynamic array type.
  template ArrayOf(Element) {
      alias Element[] ArrayOf;
  }
  /*
    Builds the source text of the opAssign overloads that Value mixes in:
    one per primitive type, one for string (stored as 'Z'), and one per
    numeric array type. Each overload stores the value in the matching
    union member and updates both _tag and bam_typeid.
  */
  string injectOpAssign() {
      // Source of a single overload.
      //   paramType = D type of the assigned value
      //   field     = name of the union member receiving it
      //   tagId     = numeric tag, already rendered as text
      //   bamType   = character stored in bam_typeid
      static string overload(string paramType, string field,
                             string tagId, char bamType)
      {
          return "final void opAssign(" ~ paramType ~ " value) {" ~
                 " this.u." ~ field ~ " = value;" ~
                 " this._tag = " ~ tagId ~ ";" ~
                 " this.bam_typeid = '" ~ bamType ~ "';" ~
                 "}";
      }

      string code;

      foreach (entry; PrimitiveTagValueTypes) {
          code ~= overload(entry.ValueType.stringof,
                           "" ~ entry.ch,
                           to!string(GetTypeId!(entry.ValueType)),
                           entry.ch);
      }

      code ~= overload("string", "Z", to!string(GetTypeId!string), 'Z');

      foreach (entry; ArrayElementTagValueTypes) {
          code ~= overload(entry.ValueType.stringof ~ "[]",
                           "B" ~ entry.ch,
                           to!string(GetTypeId!(ArrayOf!(entry.ValueType))),
                           entry.ch);
      }

      return code;
  }
  /*
    Builds the source text of the opCast(T) template that Value mixes in.

    The generated body is a `static if` chain over the requested type T:
      - a numeric or char type: switch over _tag for every primitive
        payload and convert it with to!T;
      - a numeric array type: switch over _tag for every array payload
        and convert it with to!T;
      - string: return the stored string as is, otherwise convert the
        primitive or array payload with to!string.
    In every generated switch the default branch throws ConvException.

    Every fragment is joined with an explicit `~`; adjacent string
    literals are not concatenated implicitly by current D compilers.
  */
  string injectOpCast() {

      // switch (_tag) over all primitive payloads, converting to T
      string injectSwitchPrimitive(string requested_type)
      {
          string sw = `switch (_tag) {`;
          foreach (t2; PrimitiveTagValueTypes) {
              sw ~= `case GetTypeId!` ~ t2.ValueType.stringof ~ `: ` ~
                    ` return to!T(u.` ~ t2.ch ~ `);`;
          }
          sw ~= ` default: throw new ConvException("Cannot convert Value to ` ~
                requested_type ~ `");` ~
                `}`;
          return sw;
      }

      // switch (_tag) over all numeric array payloads, converting to T
      string injectSwitchArrayElement(string requested_type)
      {
          string sw = `switch (_tag) {`;
          foreach (t2; ArrayElementTagValueTypes) {
              sw ~= `case GetTypeId!(` ~ t2.ValueType.stringof ~ `[]): ` ~
                    ` return to!T(u.B` ~ t2.ch ~ `);`;
          }
          sw ~= ` default: throw new ConvException("Cannot convert Value to ` ~
                requested_type ~ `");` ~
                `}`;
          return sw;
      }

      string cs = "static if";

      foreach (t; TypeTuple!(byte, ubyte, short, ushort, int, uint,
                             char, float, double, real, long, ulong))
      {
          cs ~= `(is(T == ` ~ t.stringof ~ `)) {` ~
                injectSwitchPrimitive(t.stringof) ~
                `} else static if`;
      }

      foreach (t; ArrayElementTagValueTypes) {
          cs ~= `(is(T == ` ~ t.ValueType.stringof ~ `[])) {` ~
                injectSwitchArrayElement(t.ValueType.stringof ~ "[]") ~
                `} else static if `;
      }

      cs ~= `(is(T == string)) {` ~
            ` if (is_string) {` ~
            ` return bam_typeid == 'Z' ? u.Z : u.H;` ~
            ` } else if (is_integer || is_float || is_character) {` ~
            ` ` ~ injectSwitchPrimitive("string") ~
            ` } else {` ~
            injectSwitchArrayElement("string") ~
            ` }` ~
            `}`;

      return "final T opCast(T)() const {" ~ cs ~ "}";
  }
  /**
    Struct for representing tag values.

    Tagged union, allows to store
    8/16/32-bit integers, floats, chars, strings,
    and arrays of integers/floats.
  */
  struct Value {

      /*
        Notice that having union first allows to do simple casts,
        without using opCast(). That's a bit hackish but
        allows for better speed.
      */
      private mixin(generateUnion());

      /**
        If this is an array, one of [cCsSiIf].
        Otherwise, one of [AcCsSiIfZH]

        See SAM/BAM specification for details.
      */
      public char bam_typeid;

      /*
        WARNING:

        Currently, type identifier for (u)int requires 8 bits.
        Fortunately, SAM/BAM specification doesn't use bigger integer types.
        However, in case of need to extend the hierarchy, the type
        should be changed from ubyte to something bigger.
      */
      ubyte _tag;

      /// Designates the type of currently stored value.
      ///
      /// Supposed to be used externally for checking type with GetTypeId.
      ubyte tag() @property const {
          return _tag;
      }

      // Generated members: opAssign overloads for every supported payload
      // type, and the opCast(T) conversion template.
      mixin(injectOpAssign());
      mixin(injectOpCast());

      /// Copies type character, tag and payload from another Value.
      final void opAssign(Value v) {
          bam_typeid = v.bam_typeid;
          _tag = v._tag;
          u = v.u;
      }

      /// ditto
      /// Assigning null only changes the tag; bam_typeid and the payload
      /// are left as they were.
      final void opAssign(typeof(null) n) {
          _tag = GetTypeId!(typeof(null));
      }

      /// Converts the stored value to T and compares the result with val.
      /// Returns false when the conversion throws ConvException.
      ///
      /// NOTE(review): when T is Value itself, `to!T(this) == val` looks
      /// like it re-enters this method - confirm before comparing two
      /// Values directly.
      final bool opEquals(T)(const T val) {
          try {
              return to!T(this) == val;
          } catch (ConvException e) {
              return false;
          }
      }

      /// String form of the stored value, same as casting to string.
      string toString() const {
          return opCast!string();
      }

      /// Constructs a Value by assigning value to it.
      this(T)(T value) {
          opAssign(value);
      }

      /// sets 'H' tag instead of default 'Z'. Is not expected to be used much.
      ///
      /// Fails the enforce check if the value is not a string.
      void setHexadecimalFlag() {

          enforce(this.is_string);

          bam_typeid = 'H';
          _tag = hexStringTag;

          // Z and H are members of the same union, so this leaves the
          // stored string untouched. It used to sit behind a
          // `_tag != 0b111` test, which could never be false once _tag
          // holds hexStringTag.
          u.H = u.Z;
      }

      /// Holds $(D null). Represents non-existing tag. Such values are used to remove tags.
      bool is_nothing() @property const { return _tag == GetTypeId!(typeof(null)); }

      /// char
      bool is_character() @property const { return _tag == GetTypeId!char; }

      /// float
      bool is_float() @property const { return _tag == GetTypeId!float; }

      /// ubyte[]/byte[]/ushort[]/short[]/uint[]/int[]/float[]
      bool is_numeric_array() @property const { return (_tag & 0b111) == 0b001; }

      /// ubyte[]/byte[]/ushort[]/short[]/uint[]/int[]
      bool is_array_of_integers() @property const { return (_tag & 0b1111) == 0b0001; }

      /// float[]
      bool is_array_of_floats() @property const { return (_tag & 0b1111) == 0b1001; }

      /// ubyte/byte/ushort/short/uint/int
      bool is_integer() @property const { return (_tag & 0b1111) == 0; }

      /// ubyte/ushort/uint
      bool is_unsigned() @property const { return (_tag & 0b11111) == 0; }

      /// byte/short/int
      bool is_signed() @property const { return (_tag & 0b11111) == 0b10000; }

      /// 'Z' or 'H' tag
      bool is_string() @property const { return (_tag & 0b111) == 0b101; }

      /// 'H' tag
      bool is_hexadecimal_string() @property const { return (_tag & 0b1101) == 0b1101; }

      /// Serializes value in MessagePack format
      ///
      /// The payload is read through a pointer cast of the union, picked
      /// by the current tag. A char is packed as ubyte, strings as char[],
      /// and a null value as null. Unknown tags pack nothing.
      public void toMsgpack(Packer)(ref Packer packer) const {
          switch (_tag) {
              case GetTypeId!byte:     packer.pack(*cast(byte*)(&u)); break;
              case GetTypeId!ubyte:    packer.pack(*cast(ubyte*)(&u)); break;
              case GetTypeId!short:    packer.pack(*cast(short*)(&u)); break;
              case GetTypeId!ushort:   packer.pack(*cast(ushort*)(&u)); break;
              case GetTypeId!int:      packer.pack(*cast(int*)(&u)); break;
              case GetTypeId!uint:     packer.pack(*cast(uint*)(&u)); break;

              case GetTypeId!float:    packer.pack(*cast(float*)(&u)); break;
              case GetTypeId!string:   packer.pack(*cast(char[]*)(&u)); break;
              case hexStringTag:       packer.pack(*cast(char[]*)(&u)); break;
              case GetTypeId!char:     packer.pack(*cast(ubyte*)(&u)); break;

              case GetTypeId!(byte[]):   packer.pack(*cast(byte[]*)(&u)); break;
              case GetTypeId!(ubyte[]):  packer.pack(*cast(ubyte[]*)(&u)); break;
              case GetTypeId!(short[]):  packer.pack(*cast(short[]*)(&u)); break;
              case GetTypeId!(ushort[]): packer.pack(*cast(ushort[]*)(&u)); break;
              case GetTypeId!(int[]):    packer.pack(*cast(int[]*)(&u)); break;
              case GetTypeId!(uint[]):   packer.pack(*cast(uint[]*)(&u)); break;
              case GetTypeId!(float[]):  packer.pack(*cast(float[]*)(&u)); break;

              case GetTypeId!(typeof(null)): packer.pack(null); break;
              default: break;
          }
      }

      /// SAM representation
      string toSam()() const {
          auto buffer = appender!(char[])();
          toSam((const(char)[] s) { buffer.put(s); });
          return cast(string)buffer.data;
      }

      /// ditto
      ///
      /// Writes "i:" plus the number for integers, "B:" plus the element
      /// type character and comma-separated elements for numeric arrays,
      /// and "f:", "Z:", "H:" or "A:" plus the payload for float, string,
      /// hexadecimal string and char. Any other tag writes nothing.
      void toSam(Sink)(auto ref Sink sink) const
          if (isSomeSink!Sink)
      {
          if (is_integer) {
              sink.write("i:");
              switch (_tag) {
                  case GetTypeId!byte:   sink.write(*cast(byte*)(&u)); break;
                  case GetTypeId!ubyte:  sink.write(*cast(ubyte*)(&u)); break;
                  case GetTypeId!short:  sink.write(*cast(short*)(&u)); break;
                  case GetTypeId!ushort: sink.write(*cast(ushort*)(&u)); break;
                  case GetTypeId!int:    sink.write(*cast(int*)(&u)); break;
                  case GetTypeId!uint:   sink.write(*cast(uint*)(&u)); break;
                  default: break;
              }
          } else if (is_numeric_array) {
              sink.write("B:");
              sink.write(bam_typeid);
              sink.write(',');
              switch (_tag) {
                  case GetTypeId!(byte[]):   sink.writeArray(*cast(byte[]*)(&u), ','); break;
                  case GetTypeId!(ubyte[]):  sink.writeArray(*cast(ubyte[]*)(&u), ','); break;
                  case GetTypeId!(short[]):  sink.writeArray(*cast(short[]*)(&u), ','); break;
                  case GetTypeId!(ushort[]): sink.writeArray(*cast(ushort[]*)(&u), ','); break;
                  case GetTypeId!(int[]):    sink.writeArray(*cast(int[]*)(&u), ','); break;
                  case GetTypeId!(uint[]):   sink.writeArray(*cast(uint[]*)(&u), ','); break;
                  case GetTypeId!(float[]):  sink.writeArray(*cast(float[]*)(&u), ','); break;
                  default: break;
              }
          } else {
              switch (_tag) {
                  case GetTypeId!float:  sink.write("f:"); sink.write(*cast(float*)(&u)); break;
                  case GetTypeId!string: sink.write("Z:"); sink.write(*cast(const(char)[]*)(&u)); break;
                  case hexStringTag:     sink.write("H:"); sink.write(*cast(const(char)[]*)(&u)); break;
                  case GetTypeId!char:   sink.write("A:"); sink.write(*cast(char*)(&u)); break;
                  default: break;
              }
          }
      }

      /// JSON representation
      string toJson()() const {
          auto buffer = appender!(char[])();
          toJson((const(char)[] s) { buffer.put(s); });
          return cast(string)buffer.data;
      }

      /// ditto
      ///
      /// Passes the payload selected by the current tag to writeJson.
      /// Tags without a case here, including null, write nothing.
      void toJson(Sink)(auto ref Sink sink) const
          if (isSomeSink!Sink)
      {
          switch (_tag) {
              case GetTypeId!byte:   sink.writeJson(*cast(byte*)(&u)); break;
              case GetTypeId!ubyte:  sink.writeJson(*cast(ubyte*)(&u)); break;
              case GetTypeId!short:  sink.writeJson(*cast(short*)(&u)); break;
              case GetTypeId!ushort: sink.writeJson(*cast(ushort*)(&u)); break;
              case GetTypeId!int:    sink.writeJson(*cast(int*)(&u)); break;
              case GetTypeId!uint:   sink.writeJson(*cast(uint*)(&u)); break;

              case GetTypeId!(byte[]):   sink.writeJson(*cast(byte[]*)(&u)); break;
              case GetTypeId!(ubyte[]):  sink.writeJson(*cast(ubyte[]*)(&u)); break;
              case GetTypeId!(short[]):  sink.writeJson(*cast(short[]*)(&u)); break;
              case GetTypeId!(ushort[]): sink.writeJson(*cast(ushort[]*)(&u)); break;
              case GetTypeId!(int[]):    sink.writeJson(*cast(int[]*)(&u)); break;
              case GetTypeId!(uint[]):   sink.writeJson(*cast(uint[]*)(&u)); break;
              case GetTypeId!(float[]):  sink.writeJson(*cast(float[]*)(&u)); break;

              case GetTypeId!float:  sink.writeJson(*cast(float*)(&u)); break;
              case GetTypeId!string: sink.writeJson(*cast(string*)(&u)); break;
              case hexStringTag:     sink.writeJson(*cast(string*)(&u)); break;
              case GetTypeId!char:   sink.writeJson(*cast(char*)(&u)); break;
              default: break;
          }
      }
  }