You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

265 lines
7.9 KiB

import std.array;
import std.string;
import std.algorithm;
import std.stdio;
import std.range;
import std.conv;
import std.math;
import std.typecons;
import std.exception;
import std.path;
import std.file;
import fai;
/*
A text-based single-letter format for representing
nucleotide (nt) and amino-acid (aa) sequences.
The ">" symbol/character marks the start of a fasta entry.
Each fasta entry comprise of an alphanumeric definition line followed by a
newline character and a single or multiline sequence of IUPAC codes used to
represent nucleotide or amino-acid sequences.
An example of a nucleotide fasta file
>Entry1_ID header field1|field2|...
TTGACGGGTTTTTGTCCTGATT
>Entry2_ID header field1|field2|...
ATTTTGGGTTACTGTTGGTTTTTGGGC
TODO:
1. Allow reading gzipped fasta files.
*/
struct FastaRecord {
string header;
string sequence;
ulong lineLen;
string lineTerm = "\n";
// split the header to array of fields
@property string[] headerFields(){
return split(header, "|").map!strip.array;
}
// return the length of the sequence
@property ulong len() {
return sequence.length;
}
string toString() {
string seq;
ulong i = lineLen;
for (; i<sequence.length; i+=lineLen) {
seq~=sequence[i-lineLen..i]~"\n";
}
seq~=sequence[i-lineLen..$];
return format(">%s\n%s", header, seq);
}
unittest {
auto recString = q"(
>chr2
acgtgagtgcacgtgagtgcacgtgagtgc
acgtgagtgcacgtgagtgc
)".outdent().strip();
auto header = "chr2";
auto seq = "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc";
auto lineLen = 30;
auto rec = FastaRecord(header, seq, lineLen);
assert (rec.toString() == recString);
assert (rec.len == seq.length);
}
}
struct Region {
string reference;
ulong beg, end;
@property len() {
return end - beg;
}
string toString() {
if ( end == 0 ) {
if ( beg == 0 )
return reference;
else
return format("%s:%s", reference, beg+1);
}
return format("%s:%s-%s", reference, beg+1, end);
}
this(string q) {
auto res = q.split(":");
reference = res[0];
if ( res.length > 2) {
throw new Exception(format("Wrong format for region: %s", q));
} else if (res.length > 1) {
auto boundaries = res[1].replace(",","").split("-").map!(to!ulong);
beg = boundaries[0]-1;
if ( boundaries.length == 2 ) {
end = boundaries[1];
}
}
}
unittest {
assert ( Region("chr2:1-10").toString() == "chr2:1-10" );
assert ( Region("chr2:1-10").len == 10 );
assert ( Region("chr2").toString() == "chr2" );
assert ( Region("chr2:10").toString() == "chr2:10" );
assertThrown!Exception( Region("chr2:1:10") );
auto region1 = Region("chr1:1,000-2000");
assert(region1.reference == "chr1");
assert(region1.beg == 999);
assert(region1.end == 2000);
auto region2 = Region("chr2");
assert(region2.reference == "chr2");
assert(region2.beg == 0);
assert(region2.end == 0);
auto region3 = Region("chr3:1,000,000");
assert(region3.reference == "chr3");
assert(region3.beg == 999_999);
assert(region3.end == 0);
}
}
auto fastaRecords(string filename) {
File f = File(filename);
FastaRecord[] records;
string lineTerm = f.byLine(KeepTerminator.yes).take(1).front.endsWith("\r\n") ? "\r\n" : "\n";
f.seek(0);
ulong offset;
foreach(line; f.byLine(KeepTerminator.no, lineTerm)) {
offset+= line.length + lineTerm.length;
if ( line.startsWith(">") ) {
records~=FastaRecord();
records[$-1].lineTerm = lineTerm;
records[$-1].header ~= line[1..$];
} else {
if ( records[$-1].lineLen == 0 ) {
records[$-1].lineLen = line.length;
}
records[$-1].sequence ~= line;
}
}
return records;
}
unittest {
auto testFa = tempDir.buildPath("test.fa");
scope(exit) testFa.remove;
File(testFa, "w").writeln(q"(
>chr1
acgtgagtgc
>chr2
acgtgagtgcacgtgagtgcacgtgagtgc
acgtgagtgcacgtgagtgc
>chr3 hrsv | Kilifi | partial sequence
CATGTTATTACAAGTAGTGATATTTGCCCTAATAATAATATTGTAGTGAAATCCAATTTCACAACAATGC
)".outdent().strip());
auto records = fastaRecords(testFa);
assert ( records.length == 3 );
assert ( records[0].header == "chr1" );
assert ( records[0].sequence == "acgtgagtgc" );
assert ( records[0].lineLen == 10 );
assert ( records[0].toString == q"(
>chr1
acgtgagtgc
)".outdent().strip());
assert ( records[1].header == "chr2" );
assert ( records[1].sequence == "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc" );
assert ( records[1].lineLen == 30 );
assert ( records[1].toString == q"(
>chr2
acgtgagtgcacgtgagtgcacgtgagtgc
acgtgagtgcacgtgagtgc
)".outdent().strip());
assert(records[2].header == "chr3 hrsv | Kilifi | partial sequence" );
assert(records[2].headerFields.length == 3);
assert(records[2].headerFields == ["chr3 hrsv","Kilifi","partial sequence"]);
}
auto fastaRegions(string filename, string[] queries) {
File f = File(filename);
FaiRecord[string] index = makeIndex(readFai(filename~=".fai"));
Region[] regions = to!(Region[])(queries);
return fetchFastaRegions(f, index, regions);
}
auto fetchFastaRegions(File fasta, FaiRecord[string] index, Region[] regions) {
FastaRecord[] records;
foreach (region; regions) {
string seq;
ulong len;
if ( region.reference in index ) {
auto reference = index[region.reference];
fasta.seek(reference.offset+region.beg+region.beg/reference.lineLen);
size_t bufLen;
if ( region.end == 0 )
bufLen = reference.seqLen + reference.seqLen/reference.lineLen;
else
bufLen = region.len + region.len/reference.lineLen;
seq = fasta.rawRead(new char[bufLen]).replace(reference.lineTerm,"").idup;
len = reference.lineLen;
}
records ~= FastaRecord(region.to!string, seq, len);
}
return records;
}
unittest {
auto testFa = tempDir.buildPath("test.fa");
scope(exit) testFa.remove;
File(testFa, "w").writeln(q"(
>chr1
acgtgagtgc
>chr2
acgtgagtgcacgtgagtgcacgtgagtgc
acgtgagtgcacgtgagtgc
)".outdent().strip());
auto faiString = "
chr1\t10\t6\t10\t11
chr2\t50\t23\t30\t31
".outdent().strip();
auto testIndex = tempDir.buildPath("test.fa.fai");
scope(exit) testIndex.remove;
File(testIndex, "w").writeln(faiString);
auto regions = fastaRegions(testFa, ["chr1:4-6", "chr2:36-45"]);
assert ( regions.length == 2 );
assert ( regions[0].header == "chr1:4-6" );
assert ( regions[0].len == 3 );
assert ( regions[0].sequence == "tga" );
assert ( regions[0].lineLen == 10 );
assert ( regions[1].header == "chr2:36-45" );
assert ( regions[1].len == 10 );
assert ( regions[1].sequence == "agtgcacgtg" );
assert ( regions[1].lineLen == 30 );
regions = fastaRegions(testFa, ["chr1"]);
assert ( regions.length == 1 );
assert ( regions[0].header == "chr1" );
assert ( regions[0].len == 10 );
assert ( regions[0].sequence == "acgtgagtgc" );
assert ( regions[0].lineLen == 10 );
regions = fastaRegions(testFa, ["chr2"]);
assert ( regions.length == 1 );
assert ( regions[0].header == "chr2" );
assert ( regions[0].len == 50 );
assert ( regions[0].sequence == "acgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgcacgtgagtgc" );
assert ( regions[0].lineLen == 30 );
}