You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

382 lines
14 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. /*
  2. This file is part of BioD.
  3. Copyright (C) 2012 Artem Tarasov <lomereiter@gmail.com>
  4. BioD is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. BioD is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  15. */
  16. %%{
  17. machine sam_alignment;
  18. action update_sign { current_sign = fc == '-' ? -1 : 1; }
  19. action init_integer { int_value = 0; }
  20. action consume_next_digit { int_value *= 10; int_value += fc - '0'; }
  21. action take_sign_into_account { int_value *= current_sign; current_sign = 1; }
  22. sign = [\-+];
  23. uint = ([0-9]{1,18}) > init_integer $ consume_next_digit ;
  24. int = (sign >update_sign)? uint % take_sign_into_account ;
  25. action mark_float_start { float_beg = p - line.ptr; }
  26. action update_float_value {
  27. float_value = to!float(line[float_beg .. p - line.ptr]);
  28. }
  29. float = ((sign? ((digit* '.'? digit+ ([eE] sign? digit+)?) | "inf") ) | "nan")
  30. > mark_float_start % update_float_value ;
  31. invalid_field = [^\t]* ; # TODO: make class with onError methods and pass it to parseAlignmentLine
  32. action qname_start { read_name_beg = p - line.ptr; }
  33. action qname_end { read_name_end = p - line.ptr; }
  34. qname = '*' | (([!-?A-~]{1,255})** > qname_start % qname_end) ;
  35. action set_flag { flag = to!ushort(int_value); }
  36. action set_pos { pos = to!uint(int_value); }
  37. action set_mapping_quality { mapping_quality = to!ubyte(int_value); }
  38. flag = uint % set_flag;
  39. action rname_start { rname_beg = p - line.ptr; }
  40. action rname_end {
  41. ref_id = header.getSequenceIndex(line[rname_beg .. p - line.ptr]);
  42. }
  43. rname = '*' | (([!-()+-<>-~] [!-~]*) > rname_start % rname_end);
  44. pos = uint % set_pos;
  45. mapq = uint % set_mapping_quality;
  46. action cigar_set_op_length { cigar_op_len = to!uint(int_value); }
  47. action cigar_set_op_chr { cigar_op_chr = fc; }
  48. action cigar_put_operation {
  49. cigar.put(CigarOperation(cigar_op_len, cigar_op_chr));
  50. }
  51. cigar = '*' | (uint % cigar_set_op_length
  52. [MIDNSHPX=] > cigar_set_op_chr % cigar_put_operation)+ ;
  53. action set_same_mate_ref_id {
  54. mate_ref_id = ref_id;
  55. }
  56. action rnext_start { rnext_beg = p - line.ptr; }
  57. action rnext_end {
  58. mate_ref_id = header.getSequenceIndex(line[rnext_beg .. p - line.ptr]);
  59. }
  60. rnext = '*' | ('=' % set_same_mate_ref_id) |
  61. (([!-()+-<>-~][!-~]*) > rnext_start % rnext_end) ;
  62. action set_mate_pos { mate_pos = to!uint(int_value); }
  63. action set_template_length { template_length = to!int(int_value); }
  64. pnext = uint % set_mate_pos;
  65. tlen = int % set_template_length;
  66. action sequence_start { sequence_beg = p - line.ptr; }
  67. action sequence_end { sequence = line[sequence_beg .. p - line.ptr]; }
  68. seq = '*' | ([A-Za-z=.]+ > sequence_start % sequence_end) ;
  69. qual = [!-~]+ ;
  70. action allocate_quality_array {
  71. if (sequence.length > 1024) {
  72. qual_ptr = (new ubyte[sequence.length]).ptr;
  73. } else {
  74. qual_ptr = cast(ubyte*)alloca(sequence.length);
  75. if (!qual_ptr) {
  76. qual_ptr = (new ubyte[sequence.length]).ptr;
  77. }
  78. }
  79. qual_index = 0;
  80. }
  81. action convert_next_character_to_prob {
  82. qual_ptr[qual_index++] = cast(ubyte)(fc - 33);
  83. }
  84. mandatoryfields = (qname | invalid_field) '\t'
  85. (flag | invalid_field) '\t'
  86. (rname | invalid_field) '\t'
  87. (pos | invalid_field) '\t'
  88. (mapq | invalid_field) '\t'
  89. (cigar | invalid_field) '\t'
  90. (rnext | invalid_field) '\t'
  91. (pnext | invalid_field) '\t'
  92. (tlen | invalid_field) '\t'
  93. (seq | invalid_field) '\t'
  94. ((qual > allocate_quality_array
  95. $ convert_next_character_to_prob) | invalid_field) ;
  96. action set_charvalue { current_tagvalue = Value(fc); }
  97. action set_integervalue {
  98. if (int_value < 0) {
  99. if (int_value >= byte.min) {
  100. current_tagvalue = Value(to!byte(int_value));
  101. } else if (int_value >= short.min) {
  102. current_tagvalue = Value(to!short(int_value));
  103. } else if (int_value >= int.min) {
  104. current_tagvalue = Value(to!int(int_value));
  105. } else {
  106. throw new Exception("integer out of range");
  107. }
  108. } else {
  109. if (int_value <= ubyte.max) {
  110. current_tagvalue = Value(to!ubyte(int_value));
  111. } else if (int_value <= ushort.max) {
  112. current_tagvalue = Value(to!ushort(int_value));
  113. } else if (int_value <= uint.max) {
  114. current_tagvalue = Value(to!uint(int_value));
  115. } else {
  116. throw new Exception("integer out of range");
  117. }
  118. }
  119. }
  120. action start_tagvalue { tagvalue_beg = p - line.ptr; }
  121. action set_floatvalue {
  122. current_tagvalue = Value(float_value);
  123. }
  124. action set_stringvalue {
  125. current_tagvalue = Value(line[tagvalue_beg .. p - line.ptr]);
  126. }
  127. action set_hexstringvalue {
  128. current_tagvalue = Value(line[tagvalue_beg .. p - line.ptr]);
  129. current_tagvalue.setHexadecimalFlag();
  130. }
  131. charvalue = [!-~] > set_charvalue ;
  132. integervalue = int % set_integervalue;
  133. floatvalue = float % set_floatvalue ;
  134. action start_arrayvalue {
  135. // it might be not the best idea to use outbuffer;
  136. // the better idea might be two-pass approach
  137. // when first pass is for counting commas, and
  138. // the second is for filling allocated array
  139. outbuffer.data.length = 0;
  140. outbuffer.offset = 0;
  141. arraytype = fc;
  142. }
  143. action put_integer_to_array {
  144. // here, we assume that compiler is smart enough to move switch out of loop.
  145. switch (arraytype) {
  146. case 'c': outbuffer.write(to!byte(int_value)); break;
  147. case 'C': outbuffer.write(to!ubyte(int_value)); break;
  148. case 's': outbuffer.write(to!short(int_value)); break;
  149. case 'S': outbuffer.write(to!ushort(int_value)); break;
  150. case 'i': outbuffer.write(to!int(int_value)); break;
  151. case 'I': outbuffer.write(to!uint(int_value)); break;
  152. default: assert(0);
  153. }
  154. }
  155. action put_float_to_array {
  156. outbuffer.write(float_value);
  157. }
  158. action set_arrayvalue {
  159. switch (arraytype) {
  160. case 'c': current_tagvalue = Value(cast(byte[])(outbuffer.toBytes())); break;
  161. case 'C': current_tagvalue = Value(cast(ubyte[])(outbuffer.toBytes())); break;
  162. case 's': current_tagvalue = Value(cast(short[])(outbuffer.toBytes())); break;
  163. case 'S': current_tagvalue = Value(cast(ushort[])(outbuffer.toBytes())); break;
  164. case 'i': current_tagvalue = Value(cast(int[])(outbuffer.toBytes())); break;
  165. case 'I': current_tagvalue = Value(cast(uint[])(outbuffer.toBytes())); break;
  166. case 'f': current_tagvalue = Value(cast(float[])(outbuffer.toBytes())); break;
  167. default: assert(0);
  168. }
  169. }
  170. stringvalue = [ !-~]+ > start_tagvalue % set_stringvalue ;
  171. hexstringvalue = xdigit+ > start_tagvalue % set_hexstringvalue ;
  172. integerarrayvalue = [cCsSiI] > start_arrayvalue (',' int % put_integer_to_array)+ % set_arrayvalue;
  173. floatarrayvalue = [f] > start_arrayvalue (',' float % put_float_to_array)+ % set_arrayvalue;
  174. arrayvalue = integerarrayvalue | floatarrayvalue ;
  175. tagvalue = ("A:" charvalue) |
  176. ("i:" integervalue) |
  177. ("f:" floatvalue) |
  178. ("Z:" stringvalue) |
  179. ("H:" hexstringvalue) |
  180. ("B:" arrayvalue) ;
  181. action tag_key_start { tag_key_beg = p - line.ptr; }
  182. action tag_key_end { current_tag = line[tag_key_beg .. p - line.ptr]; }
  183. action append_tag_value { builder.put(current_tag, current_tagvalue); }
  184. tag = (alpha alnum) > tag_key_start % tag_key_end ;
  185. optionalfield = (tag ':' tagvalue % append_tag_value) | invalid_field ;
  186. optionalfields = optionalfield ('\t' optionalfield)* ;
  187. alignment := mandatoryfields ('\t' optionalfields)? ;
  188. write data;
  189. }%%
  190. import bio.sam.header;
  191. import bio.bam.read;
  192. import bio.bam.tagvalue;
  193. import bio.bam.utils.tagstoragebuilder;
  194. import std.array;
  195. import std.conv;
  196. import std.typecons;
  197. import std.outbuffer;
  198. import std.c.stdlib;
  199. class AlignmentBuildStorage {
  200. Appender!(CigarOperation[]) cigar_appender;
  201. OutBuffer outbuffer;
  202. TagStorageBuilder tag_storage_builder;
  203. this() {
  204. cigar_appender = appender!(CigarOperation[])();
  205. outbuffer = new OutBuffer();
  206. tag_storage_builder = TagStorageBuilder.create();
  207. }
  208. void clear() {
  209. cigar_appender.clear();
  210. tag_storage_builder.clear();
  211. outbuffer.data.length = 0;
  212. outbuffer.offset = 0;
  213. }
  214. }
  215. BamRead parseAlignmentLine(string line, SamHeader header,
  216. AlignmentBuildStorage b=null) {
  217. char* p = cast(char*)line.ptr;
  218. char* pe = p + line.length;
  219. char* eof = pe;
  220. int cs;
  221. if (b is null) {
  222. b = new AlignmentBuildStorage();
  223. } else {
  224. b.clear();
  225. }
  226. byte current_sign = 1;
  227. size_t read_name_beg; // position of beginning of QNAME
  228. size_t read_name_end; // position past the end of QNAME
  229. size_t sequence_beg; // position of SEQ start
  230. string sequence; // SEQ
  231. uint cigar_op_len; // length of CIGAR operation
  232. char cigar_op_chr; // CIGAR operation
  233. size_t cigar_op_len_start; // position of start of CIGAR operation
  234. auto cigar = b.cigar_appender;
  235. long int_value; // for storing temporary integers
  236. float float_value; // for storing temporary floats
  237. size_t float_beg; // position of start of current float
  238. auto outbuffer = b.outbuffer; // used to build tag values which hold arrays
  239. char arraytype; // type of last array tag value
  240. ushort flag;
  241. uint pos;
  242. uint mate_pos;
  243. ubyte mapping_quality;
  244. int template_length;
  245. ubyte* qual_ptr = null;
  246. size_t qual_index;
  247. string current_tag;
  248. Value current_tagvalue;
  249. size_t tag_key_beg, tagvalue_beg;
  250. size_t rname_beg, rnext_beg;
  251. int ref_id = -1;
  252. int mate_ref_id = -1;
  253. auto builder = b.tag_storage_builder;
  254. %%write init;
  255. %%write exec;
  256. auto read = BamRead(line[read_name_beg .. read_name_end],
  257. sequence,
  258. cigar.data,
  259. builder.data);
  260. if (qual_ptr !is null && qual_index == sequence.length) {
  261. read.base_qualities = qual_ptr[0 .. sequence.length];
  262. }
  263. read.flag = flag;
  264. read.mapping_quality = mapping_quality;
  265. read.position = pos - 1; // we use 0-based coordinates, not 1-based
  266. read.template_length = template_length;
  267. read.mate_position = mate_pos - 1; // also 0-based
  268. read.ref_id = ref_id;
  269. read.mate_ref_id = mate_ref_id;
  270. return read;
  271. }
  272. unittest {
  273. import std.algorithm;
  274. import std.math;
  275. auto line = "ERR016155.15021091\t185\t20\t60033\t25\t66S35M\t=\t60033\t0\tAGAAAAAACTGGAAGTTAATAGAGTGGTGACTCAGATCCAGTGGTGGAAGGGTAAGGGATCTTGGAACCCTATAGAGTTGCTGTGTGCCAGGGCCAGATCC\t#####################################################################################################\tX0:i:1\tX1:i:0\tXC:i:35\tMD:Z:17A8A8\tRG:Z:ERR016155\tAM:i:0\tNM:i:2\tSM:i:25\tXT:A:U\tBQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\tY0:B:c,1,2,3\tY1:B:f,13.263,-3.1415,52.63461";
  276. auto header = new SamHeader("@SQ\tSN:20\tLN:1234567");
  277. auto alignment = parseAlignmentLine(line, header);
  278. assert(alignment.name == "ERR016155.15021091");
  279. assert(equal(alignment.sequence(), "AGAAAAAACTGGAAGTTAATAGAGTGGTGACTCAGATCCAGTGGTGGAAGGGTAAGGGATCTTGGAACCCTATAGAGTTGCTGTGTGCCAGGGCCAGATCC"));
  280. assert(alignment.cigarString() == "66S35M");
  281. assert(alignment.flag == 185);
  282. assert(alignment.position == 60032);
  283. assert(alignment.mapping_quality == 25);
  284. assert(alignment.mate_position == 60032);
  285. assert(alignment.ref_id == 0);
  286. assert(alignment.mate_ref_id == 0);
  287. assert(to!ubyte(alignment["AM"]) == 0);
  288. assert(to!ubyte(alignment["SM"]) == 25);
  289. assert(to!string(alignment["MD"]) == "17A8A8");
  290. assert(equal(to!(byte[])(alignment["Y0"]), [1, 2, 3]));
  291. assert(equal!approxEqual(to!(float[])(alignment["Y1"]), [13.263, -3.1415, 52.63461]));
  292. assert(to!char(alignment["XT"]) == 'U');
  293. import std.stdio;
  294. import bio.bam.reference;
  295. auto info = ReferenceSequenceInfo("20", 1234567);
  296. auto invalid_cigar_string = "1\t100\t20\t50000\t30\tMZABC\t=\t50000\t0\tACGT\t####";
  297. alignment = parseAlignmentLine(invalid_cigar_string, header);
  298. assert(equal(alignment.sequence(), "ACGT"));
  299. auto invalid_tag_and_qual = "2\t100\t20\t5\t40\t27M30X5D\t=\t3\t10\tACT\t !\n\tX1:i:7\tX3:i:zzz\tX4:i:5";
  300. alignment = parseAlignmentLine(invalid_tag_and_qual, header);
  301. assert(alignment.base_qualities == [255, 255, 255]); // i.e. invalid
  302. assert(to!ubyte(alignment["X1"]) == 7);
  303. assert(alignment["X3"].is_nothing);
  304. assert(to!ubyte(alignment["X4"]) == 5);
  305. }