/**
 * Ruby-style `String#scan`: collects the capture groups of every match of
 * `regex` in this string.
 *
 * @param {RegExp} regex - Pattern to apply; must carry the `g` flag.
 * @returns {Array<Array<string|undefined>>} One entry per match, each holding
 *   that match's capture groups in order (an empty array when the pattern has
 *   no capture groups).
 * @throws {TypeError} If `regex` does not have the global flag set.
 */
String.prototype.scan = function (regex) {
  if (!regex.global) {
    throw new TypeError("regex must have 'global' flag set");
  }
  const matches = [];
  // `replace` is used purely to visit every match; its return value is discarded.
  this.replace(regex, (...args) => {
    // Replacer arguments are: match, p1..pn, offset, whole string and, only
    // when the pattern has named groups, a trailing groups object.
    const hasGroupsObject = typeof args[args.length - 1] === "object";
    const trailing = hasGroupsObject ? 3 : 2;
    matches.push(args.slice(1, args.length - trailing));
  });
  return matches;
};
/**
 * Builds the reverse complement of a DNA string.
 *
 * @param {string} dna - Sequence made of the upper-case letters A, T, C and G.
 * @returns {string} The complemented sequence read back to front, or the
 *   literal text "Invalid sequence" when `dna` contains any other character.
 */
function reverseComplement(dna) {
  if (/[^ATCG]/.test(dna)) {
    return "Invalid sequence";
  }
  const pairing = { A: "T", T: "A", C: "G", G: "C" };
  let result = "";
  // Walk from the last base to the first so the output is already reversed.
  for (let pos = dna.length - 1; pos >= 0; pos -= 1) {
    result += pairing[dna[pos]];
  }
  return result;
}
/**
 * Produces the six reading frames of a sequence.
 *
 * Frames "1", "2" and "3" are `seq` starting at its 1st, 2nd and 3rd
 * character; frames "-1", "-2" and "-3" are the same offsets applied to the
 * value returned by `reverseComplement(seq)`. Every frame runs to the end of
 * its strand.
 *
 * @param {string} seq - Sequence to split into frames.
 * @returns {Object<string, string>} Frame label mapped to frame text.
 */
function getFrames(seq) {
  const rev = reverseComplement(seq);
  // `slice(n)` keeps everything from index n onward. The earlier
  // `substring(n, length - 1)` had an exclusive end index and silently
  // dropped the final nucleotide of the shifted frames.
  return {
    "1": seq,
    "2": seq.slice(1),
    "3": seq.slice(2),
    "-1": rev,
    "-2": rev.slice(1),
    "-3": rev.slice(2),
  };
}
/**
 * Scans a single reading frame for open reading frames (ORFs).
 *
 * The frame is cut into complete triplets. An ORF opens at the first "ATG"
 * seen while none is open and closes at the next triplet that the global
 * `codons` table maps to "*". The last triplet of the frame is never
 * inspected: it only terminates a still-open ORF, whose stop coordinate is
 * then prefixed with ">" to mark it as running off the end.
 *
 * @param {string} seq - Frame text (already shifted / reverse-complemented).
 * @param {number} frame - Frame number: 1, 2, 3, -1, -2 or -3.
 * @param {number} min - Minimum ORF size in bases, counting the closing triplet.
 * @param {number} len - Length of the full original sequence; used to map
 *   negative-frame positions back onto forward-strand coordinates.
 * @returns {Array<[string, string, string]>} One `[start, stop, orfSeq]`
 *   entry per ORF, with 1-based coordinates as strings and `orfSeq` holding
 *   the ORF's nucleotides without the closing triplet.
 */
function findOrfInFrame(seq, frame, min, len) {
  const orfs = [];
  // Only whole codons count; a trailing 1-2 nt remainder is ignored, and an
  // empty frame yields no triplets instead of a null match.
  const triplets = seq.match(/.{3}/g) || [];
  const lastIndex = triplets.length - 1;
  const isNeg = frame < 0;
  const offset = { 1: 1, 2: 3, 3: 5 }[Math.abs(frame)];
  // Negative frames are scanned on the reverse strand; translate back.
  const toForward = (pos) => (isNeg ? len - pos - offset : pos);

  let start = 0;
  let orfSeq = "";

  for (let i = 0; i < triplets.length; i += 1) {
    const triplet = triplets[i];
    const isOpen = orfSeq !== "";
    const stop = i * 3 + frame + 2;
    const longEnough = orfSeq.length + 3 >= min;

    if (i === lastIndex) {
      // End of the frame: report an unterminated ORF as open-ended.
      if (isOpen && longEnough) {
        orfs.push([String(toForward(start)), `>${toForward(stop)}`, orfSeq]);
      }
      break;
    }

    if (!isOpen) {
      if (triplet === "ATG") {
        orfSeq = triplet;
        start = i * 3 + frame;
      }
    } else if (codons[triplet] !== "*") {
      orfSeq += triplet;
    } else {
      if (longEnough) {
        orfs.push([String(toForward(start)), String(toForward(stop)), orfSeq]);
      }
      // A stop codon always ends the current ORF, even one too short to
      // report; otherwise the next ORF would swallow this one.
      orfSeq = "";
    }
  }
  return orfs;
}
/**
 * Finds ORFs in all six reading frames of a sequence.
 *
 * Frames are scanned in the order 1, 2, 3, -1, -2, -3 and the ORFs found are
 * numbered consecutively ("ORF1", "ORF2", ...) in that order.
 *
 * @param {string} seq - Sequence to scan.
 * @param {number} min - Minimum ORF size in bases, passed to `findOrfInFrame`.
 * @returns {Object<string, {start: string, stop: string, strand: string,
 *   seq: string, bp: number, aa: number, frame: number}>} ORF label mapped to
 *   its description. `seq` is the ORF's triplets looked up in the global
 *   `codons` table and joined; `bp` counts the closing triplet, `aa` does not.
 */
function findOrfs(seq, min) {
  const frames = getFrames(seq);
  const len = seq.length;
  const allOrfs = {};
  let count = 1;

  // All six frames are scanned; limiting this list to [1] was a debugging
  // leftover, as was the console.log of the result.
  for (const frame of [1, 2, 3, -1, -2, -3]) {
    const orfs = findOrfInFrame(frames[String(frame)], frame, min, len);
    for (const [start, stop, orfSeq] of orfs) {
      const triplets = orfSeq.match(/.{1,3}/g) || [];
      allOrfs[`ORF${count}`] = {
        start,
        stop,
        strand: frame > 0 ? "+" : "-",
        seq: triplets.map((codon) => codons[codon]).join(""),
        bp: orfSeq.length + 3,
        aa: orfSeq.length / 3,
        frame,
      };
      count += 1;
    }
  }
  return allOrfs;
}
describe("Example Test Cases", function(){
var seq = "AAAAGAGAAGCTGCAAGTCATGGATTTGGAAAAACATCAGGGAATTCATTTAAAGTAAATAGCTGCAAAGACCACATTGGAAAGTCAATGCCAAATGTCCTAGAAGATGAAGTATATGAAACAGTTGTAGATACCTCTGAAGAAGATAGTTTTTCATTATGTTTTTCTAAATGTAGAACAAAAAATCTACAAAAAGTAAGAACTAGCAAGACTAGGAAAAAAATTTTCCATGAAGCAAACGCTGATGAATGTGAAAAATCTAAAAACCAAGTGAAAGAAAAATACTCATTTGTATCTGAAGTGGAACCAAATGATACTGATCCATTAGATTCAAATGTAGCAAATCAGAAGCCCTTTGAGAGTGGAAGTGACAAAATCTCCAAGGAAGTTGTACCGTCTTTGGCCTGTGAATGGTCTCAACTAACCCTTTCAGGTCTAAATGGAGCCCAGATGGAGAAAATACCCCTATTGCATATTTCTTCATGTGACCAAAATATTTCAGAAAAAGACCTATTAGACACAGAGAACAAAAGAAAGAAAGATTTTCTTACTTCAGAGAATTCTTTGCCACGTATTTCTAGCCTACCAAAATCAGAGAAGCCATTAAATGAGGAAACAGTGGTAAATAAGAGAGATGAAGAGCAGCATCTTGAATCTCATACAGACTGCATTCTTGCAGTAAAGCAGGCAATATCTGGAACTTCTCCAGTGGCTTCTTCATTTCAGGGTATCAAAAAGTCTATATTCAGAATAAGAGAATCACCTAAAGAGACTTTCAATGCAAGTTTTTCAGGTCATATGACTGATCCAAACTTTAAAAAAGAAACTGAAGCCTCTGAAAGTGGACTGGAAATACATACTGTTTGCTCACAGAAGGAGGACTCCTTATGTCCAAATTTAATTGATAATGGAAGCTGGCCAGCCACCACCACACAGAATTCTGTAGCTTTGAAGAATGCAGGTTTAATATCCACTTTGAAAAAGAAAACAAATAAGTTTATTTATGCTATACATGATGAAACATCTTATAAAGGAAAAAAAATACCGAAAGACCAAAAATCAGAACTAATTAACTGTTCAGCCCAGTTTGAAGCAAATGCTTTTGAAGCACCACTTACATTTGCAAATGCTGATTCAGGTTTATTGCATTCTTCTGTGAAAAGAAGCTGTTCACAGAATGATTCTGAAGAACCAACTTTGTCCTTAACTAGCTCTTTTGGGACAATTCTGAGGAAATGTTCTAGAAATGAAACATGTTCTAATAATACAGTAATCTCTCAGGATCTTGATTATAAAGAAGCAAAATGTAATAAGGAAAAACTACAGTTATTTATTACCCCAGAAGCTGATTCTCTGTCATGCCTGCAGGAAGGACAGTGTGAAAATGATCCAAAAAGCAAAAAAGTTTCAGATATAAAAGAAGAGGTCTTGGCTGCAGCATGTCACCCAGTACAACATTCAAAAGTGGAATACAGTGATACTGACTTTCAATCCCAGAAAAGTCTTTTATATGATCATGAAAATGCCAGCACTCTTATTTTAACTCCTACTTCCAAGGATGTTCTGTCAAACCTAGTCATGATTTCTAGAGGCAAAGAATCATACAAAATGTCAGACAAGCTCAAAGGTAACAATTATGAATCTGATGTTGAATTAACCAAAAATATTCCCATGGAAAAGAATCAAGATGTATGTGCTTTAAATGAAAATTATAAAAACGTTGAGCTGTTGCCACCTGAAAAATACATGAGAGTAGCATCACCTTCAAGAAAGGTACAATTCAACCAAAACACAAATCTAAGAGTAATCCAAAAAAATCAAGAAGAAACTACTTCAATTTCAAAAATAACTGTCAATCCAGACTCTGAAGAACTTTTCTCAGACAATGAGAATAATTTTGTCTTCCAAGTAGCTAATGAAAGGAATAATCTTGCTTTAGGAAATACTAAGGAACTTCATGAAACAGACTTGACTTGTGTAAACGAACCC
ATTTTCAAGAACTAAAAAGAGAAGCTGCAAGTCATGGATTTGGAAAAACATCAGGGAATTCATTTAAAGTAAATAGCTGCAAAGACCACATTGGAAAGTCAATGCCAAATGTCCTAGAAGATGAAGTATATGAAACAGTTGTAGATACCTCTGAAGAAGATAGTTTTTCATTATGTTTTTCTAAATGTAGAACAAAAAATCTACAAAAAGTAAGAACTAGCAAGACTAGGAAAAAAATTTTCCATGAAGCAAACGCTGATGAATGTGAAAAATCTAAAAACCAAGTGAAAGAAAAATACTCATTTGTATCTGAAGTGGAACCAAATGATACTGATCCATTAGATTCAAATGTAGCAAATCAGAAGCCCTTTGAGAGTGGAAGTGACAAAATCTCCAAGGAAGTTGTACCGTCTTTGGCCTGTGAATGGTCTCAACTAACCCTTTCAGGTCTAAATGGAGCCCAGATGGAGAAAATACCCCTATTGCATATTTCTTCATGTGACCAAAATATTTCAGAAAAAGACCTATTAGACACAGAGAACAAAAGAAAGAAAGATTTTCTTACTTCAGAGAATTCTTTGCCACGTATTTCTAGCCTACCAAAATCAGAGAAGCCATTAAATGAGGAAACAGTGGTAAATAAGAGAGATGAAGAGCAGCATCTTGAATCTCATACAGACTGCATTCTTGCAGTAAAGCAGGCAATATCTGGAACTTCTCCAGTGGCTTCTTCATTTCAGGGTATCAAAAAGTCTATATTCAGAATAAGAGAATCACCTAAAGAGACTTTCAATGCAAGTTTTTCAGGTCATATGACTGATCCAAACTTTAAAAAAGAAACTGAAGCCTCTGAAAGTGGACTGGAAATACATACTGTTTGCTCACAGAAGGAGGACTCCTTATGTCCAAATTTAATTGATAATGGAAGCTGGCCAGCCACCACCACACAGAATTCTGTAGCTTTGAAGAATGCAGGTTTAATATCCACTTTGAAAAAGAAAACAAATAAGTTTATTTATGCTATACATGATGAAACATCTTATAAAGGAAAAAAAATACCGAAAGACCAAAAATCAGAACTAATTAACTGTTCAGCCCAGTTTGAAGCAAATGCTTTTGAAGCACCACTTACATTTGCAAATGCTGATTCAGGTTTATTGCATTCTTCTGTGAAAAGAAGCTGTTCACAGAATGATTCTGAAGAACCAACTTTGTCCTTAACTAGCTCTTTTGGGACAATTCTGAGGAAATGTTCTAGAAATGAAACATGTTCTAATAATACAGTAATCTCTCAGGATCTTGATTATAAAGAAGCAAAATGTAATAAGGAAAAACTACAGTTATTTATTACCCCAGAAGCTGATTCTCTGTCATGCCTGCAGGAAGGACAGTGTGAAAATGATCCAAAAAGCAAAAAAGTTTCAGATATAAAAGAAGAGGTCTTGGCTGCAGCATGTCACCCAGTACAACATTCAAAAGTGGAATACAGTGATACTGACTTTCAATCCCAGAAAAGTCTTTTATATGATCATGAAAATGCCAGCACTCTTATTTTAACTCCTACTTCCAAGGATGTTCTGTCAAACCTAGTCATGATTTCTAGAGGCAAAGAATCATACAAAATGTCAGACAAGCTCAAAGGTAACAATTATGAATCTGATGTTGAATTAACCAAAAATATTCCCATGGAAAAGAATCAAGATGTATGTGCTTTAAATGAAAATTATAAAAACGTTGAGCTGTTGCCACCTGAAAAATACATGAGAGTAGCATCACCTTCAAGAAAGGTACAATTCAACCAAAACACAAATCTAAGAGTAATCCAAAAAAATCAAGAAGAAACTACTTCAATTTCAAAAATAACTGTCAATCCAGACTCTGAAGAACTTTTCTCAGACAATGAGAATAATTTTGTCTTCCAAGTAGCTAATGAAAGGAATAATCTTGCTTTAGGAAATACTAAGGAACTTCATGAAACAGACTTGACTTGTGTAAACGAA
CCCATTTTCAAGAACTA"
var res = {
"ORF1" : {
'start' : "88",
'stop' : "2004",
'strand' : "+",
'seq' : "MPNVLEDEVYETVVDTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEPNDTDPLDSNVANQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCDQNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDCILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGLEIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSYKGKKIPKDQKSELINCSAQFEANAFEAPLTFANADSGLLHSSVKRSCSQNDSEEPTLSLTSSFGTILRKCSRNETCSNNTVISQDLDYKEAKCNKEKLQLFITPEADSLSCLQEGQCENDPKSKKVSDIKEEVLAAACHPVQHSKVEYSDTDFQSQKSLLYDHENASTLILTPTSKDVLSNLVMISRGKESYKMSDKLKGNNYESDVELTKNIPMEKNQDVCALNENYKNVELLPPEKYMRVASPSRKVQFNQNTNLRVIQKNQEETTSISKITVNPDSEELFSDNENNFVFQVANERNNLALGNTKELHETDLTCVNEPIFKN",
'bp' : 1917,
'aa' : 638,
'frame' : 1,
},
"ORF2" : {
'start' : "2443",
'stop' : "2520",
'strand' : "+",
'seq' : "MEPRWRKYPYCIFLHVTKIFQKKTY",
'bp' : 78,
'aa' : 25,
'frame' : 1
},
"ORF3" : {
'start' : "440",
'stop' : "517",
'strand' : "+",
'seq' : "MEPRWRKYPYCIFLHVTKIFQKKTY",
'bp' : 78,
'aa' : 25,
'frame' : 2},
"ORF4" : {
'start' : "2414",
'stop' : "2491",
'strand' : "+",
'seq' : "MVSTNPFRSKWSPDGENTPIAYFFM",
'bp' : 78,
'aa' : 25,
'frame' : 2
},
"ORF5" : {
'start' : "411",
'stop' : "488",
'strand' : "+",
'seq' : "MVSTNPFRSKWSPDGENTPIAYFFM",
'bp' : 78,
'aa' : 25,
'frame' : 3
},
"ORF6" : {
'start' : "2091",
'stop' : ">4004",
'strand' : "+",
'seq' : "MPNVLEDEVYETVVDTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEPNDTDPLDSNVANQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCDQNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDCILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGLEIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSYKGKKIPKDQKSELINCSAQFEANAFEAPLTFANADSGLLHSSVKRSCSQNDSEEPTLSLTSSFGTILRKCSRNETCSNNTVISQDLDYKEAKCNKEKLQLFITPEADSLSCLQEGQCENDPKSKKVSDIKEEVLAAACHPVQHSKVEYSDTDFQSQKSLLYDHENASTLILTPTSKDVLSNLVMISRGKESYKMSDKLKGNNYESDVELTKNIPMEKNQDVCALNENYKNVELLPPEKYMRVASPSRKVQFNQNTNLRVIQKNQEETTSISKITVNPDSEELFSDNENNFVFQVANERNNLALGNTKELHETDLTCVNEPIFK",
'bp' : 1914,
'aa' : 637,
'frame' : 3
},
"ORF7" : {
'start' : "1759",
'stop' : "1622",
'strand' : "-",
'seq' : "MLLSCIFQVATAQRFYNFHLKHIHLDSFPWEYFWLIQHQIHNCYL",
'bp' : 138,
'aa' : 45,
'frame' : -1,
},
"ORF8" : {
'start' : "1441",
'stop' : "1334",
'strand' : "-",
'seq' : "MLQPRPLLLYLKLFCFLDHFHTVLPAGMTENQLLG",
'bp' : 108,
'aa' : 35,
'frame' : -1
},
"ORF9" : {
'start' : "721",
'stop' : "578",
'strand' : "-",
'seq' : "MKKPLEKFQILPALLQECSLYEIQDAALHLSYLPLFPHLMASLILVG",
'bp' : 144,
'aa' : 47,
'frame' : -1
},
"ORF10" : {
'start' : "484",
'stop' : "260",
'strand' : "-",
'seq' : "MKKYAIGVFSPSGLHLDLKGLVETIHRPKTVQLPWRFCHFHSQRASDLLHLNLMDQYHLVPLQIQMSIFLSLGF",
'bp' : 225,
'aa' : 74,
'frame' : -1},
"ORF11" : {
'start' : "157",
'stop' : "50",
'strand' : "-",
'seq' : "MKNYLLQRYLQLFHILHLLGHLALTFQCGLCSYLL",
'bp' : 108,
'aa' : 35,
'frame' : -1
},
"ORF12" : {
'start' : "3762",
'stop' : "3625",
'strand' : "-",
'seq' : "MLLSCIFQVATAQRFYNFHLKHIHLDSFPWEYFWLIQHQIHNCYL",
'bp' : 138,
'aa' : 45,
'frame' : -2
},
"ORF13" : {
'start' : "3444",
'stop' : "3337",
'strand' : "-",
'seq' : "MLQPRPLLLYLKLFCFLDHFHTVLPAGMTENQLLG",
'bp' : 108,
'aa' : 35,
'frame' : -2
},
"ORF14" : {
'start' : "2724",
'stop' : "2581",
'strand' : "-",
'seq' : "MKKPLEKFQILPALLQECSLYEIQDAALHLSYLPLFPHLMASLILVG",
'bp' : 144,
'aa' : 47,
'frame' : -2
},
"ORF15" : {
'start' : "2487",
'stop' : "2263",
'strand' : "-",
'seq' : "MKKYAIGVFSPSGLHLDLKGLVETIHRPKTVQLPWRFCHFHSQRASDLLHLNLMDQYHLVPLQIQMSIFLSLGF",
'bp' : 225,
'aa' : 74,
'frame' : -2
},
"ORF16" : {
'start' : "2160",
'stop' : "2053",
'strand' : "-",
'seq' : "MKNYLLQRYLQLFHILHLLGHLALTFQCGLCSYLL",
'bp' : 108,
'aa' : 35,
'frame' : -2
},
"ORF17" : {
'start' : "2052",
'stop' : "1936",
'strand' : "-",
'seq' : "MNSLMFFQIHDLQLLFLVLENGFVYTSQVCFMKFLSIS",
'bp' : 117,
'aa' : 38,
'frame' : -2
}
}
it("Testing for Basic Functionality", function(){
Test.assertSimilar(findOrfs(seq,75), res);
});
});
# Returns the six reading frames of +seq+ as a hash keyed by the symbols
# :"1", :"2", :"3", :"-1", :"-2" and :"-3".
#
# Positive frames are +seq+ from its 1st, 2nd and 3rd character onward;
# negative frames apply the same offsets to the reversed string in which
# A/T and G/C have been swapped (other characters are left untouched).
def get_frames(seq)
  rev = seq.tr('ATGC', 'TACG').reverse
  frames = {}
  { 1 => seq, -1 => rev }.each do |sign, strand|
    (0..2).each do |shift|
      label = (sign * (shift + 1)).to_s.to_sym
      # The unshifted frame is the strand object itself, not a copy.
      frames[label] = shift.zero? ? strand : strand[shift..-1]
    end
  end
  frames
end
# Scans a single reading frame for open reading frames (ORFs).
#
# The frame is cut into complete triplets. An ORF opens at the first "ATG"
# seen while none is open and closes at the next triplet that the global
# $codons table maps to "*". The last triplet of the frame is never
# inspected: it only terminates a still-open ORF, whose stop coordinate is
# then prefixed with ">" to mark it as running off the end.
#
# seq   - frame text (already shifted / reverse-complemented)
# frame - frame number: 1, 2, 3, -1, -2 or -3
# min   - minimum ORF size in bases, counting the closing triplet
# len   - length of the full original sequence; used to map negative-frame
#         positions back onto forward-strand coordinates
#
# Returns an array of [start, stop, orf_seq] triples with 1-based coordinates
# as strings; orf_seq holds the ORF's nucleotides without the closing triplet.
# Nothing is written to standard output.
def find_orf_in_frame(seq, frame, min, len)
  orfs = []
  tri_nts = seq.scan(/.{3}/)
  last_index = tri_nts.size - 1
  is_neg = frame < 0
  offset = { 1 => 1, 2 => 3, 3 => 5 }[frame.abs]
  # Negative frames are scanned on the reverse strand; translate back.
  to_forward = ->(pos) { is_neg ? len - pos - offset : pos }

  start = 0
  orf_seq = ""

  tri_nts.each_with_index do |tri_nt, i|
    stop = (i * 3) + frame + 2
    long_enough = (orf_seq.length + 3) >= min

    if i == last_index
      # End of the frame: report an unterminated ORF as open-ended.
      if !orf_seq.empty? && long_enough
        orfs << [to_forward.call(start).to_s, ">#{to_forward.call(stop)}", orf_seq]
      end
      break
    end

    if orf_seq.empty?
      # Outside an ORF only a start codon matters; a stray stop codon must
      # not produce an empty ORF entry.
      if tri_nt == "ATG"
        orf_seq += tri_nt
        start = (i * 3) + frame
      end
    elsif $codons[tri_nt] != "*"
      orf_seq += tri_nt
    else
      if long_enough
        orfs << [to_forward.call(start).to_s, to_forward.call(stop).to_s, orf_seq]
      end
      # A stop codon always ends the current ORF, reported or not.
      orf_seq = ""
    end
  end
  orfs
end
# Finds ORFs in all six reading frames of +seq+.
#
# Frames are scanned in the order 1, 2, 3, -1, -2, -3 and the ORFs found are
# numbered consecutively ("ORF1", "ORF2", ...) in that order.
#
# seq - sequence to scan
# min - minimum ORF size in bases, passed to find_orf_in_frame
#
# Returns a hash mapping each ORF label to a hash with the keys :start,
# :stop, :strand, :seq, :bp, :aa and :frame. :seq is the ORF's triplets
# looked up in the global $codons table and joined; :bp counts the closing
# triplet, :aa does not.
def find_orfs(seq, min)
  frames = get_frames(seq)
  len = seq.length
  all_orfs = {}
  count = 0

  # All six frames are scanned; restricting this list to [1] was a debugging
  # leftover.
  [1, 2, 3, -1, -2, -3].each do |frame|
    # A frame can be nil when the sequence is shorter than the frame shift.
    frame_seq = frames[frame.to_s.to_sym].to_s
    find_orf_in_frame(frame_seq, frame, min, len).each do |start, stop, orf_seq|
      count += 1
      all_orfs["ORF#{count}"] = {
        start: start,
        stop: stop,
        strand: frame > 0 ? "+" : "-",
        seq: orf_seq.scan(/.{3}/).map { |codon| $codons[codon] }.join,
        bp: orf_seq.length + 3,
        aa: orf_seq.length / 3,
        frame: frame,
      }
    end
  end
  all_orfs
end
seq = "AAAAGAGAAGCTGCAAGTCATGGATTTGGAAAAACATCAGGGAATTCATTTAAAGTAAATAGCTGCAAAGACCACATTGGAAAGTCAATGCCAAATGTCCTAGAAGATGAAGTATATGAAACAGTTGTAGATACCTCTGAAGAAGATAGTTTTTCATTATGTTTTTCTAAATGTAGAACAAAAAATCTACAAAAAGTAAGAACTAGCAAGACTAGGAAAAAAATTTTCCATGAAGCAAACGCTGATGAATGTGAAAAATCTAAAAACCAAGTGAAAGAAAAATACTCATTTGTATCTGAAGTGGAACCAAATGATACTGATCCATTAGATTCAAATGTAGCAAATCAGAAGCCCTTTGAGAGTGGAAGTGACAAAATCTCCAAGGAAGTTGTACCGTCTTTGGCCTGTGAATGGTCTCAACTAACCCTTTCAGGTCTAAATGGAGCCCAGATGGAGAAAATACCCCTATTGCATATTTCTTCATGTGACCAAAATATTTCAGAAAAAGACCTATTAGACACAGAGAACAAAAGAAAGAAAGATTTTCTTACTTCAGAGAATTCTTTGCCACGTATTTCTAGCCTACCAAAATCAGAGAAGCCATTAAATGAGGAAACAGTGGTAAATAAGAGAGATGAAGAGCAGCATCTTGAATCTCATACAGACTGCATTCTTGCAGTAAAGCAGGCAATATCTGGAACTTCTCCAGTGGCTTCTTCATTTCAGGGTATCAAAAAGTCTATATTCAGAATAAGAGAATCACCTAAAGAGACTTTCAATGCAAGTTTTTCAGGTCATATGACTGATCCAAACTTTAAAAAAGAAACTGAAGCCTCTGAAAGTGGACTGGAAATACATACTGTTTGCTCACAGAAGGAGGACTCCTTATGTCCAAATTTAATTGATAATGGAAGCTGGCCAGCCACCACCACACAGAATTCTGTAGCTTTGAAGAATGCAGGTTTAATATCCACTTTGAAAAAGAAAACAAATAAGTTTATTTATGCTATACATGATGAAACATCTTATAAAGGAAAAAAAATACCGAAAGACCAAAAATCAGAACTAATTAACTGTTCAGCCCAGTTTGAAGCAAATGCTTTTGAAGCACCACTTACATTTGCAAATGCTGATTCAGGTTTATTGCATTCTTCTGTGAAAAGAAGCTGTTCACAGAATGATTCTGAAGAACCAACTTTGTCCTTAACTAGCTCTTTTGGGACAATTCTGAGGAAATGTTCTAGAAATGAAACATGTTCTAATAATACAGTAATCTCTCAGGATCTTGATTATAAAGAAGCAAAATGTAATAAGGAAAAACTACAGTTATTTATTACCCCAGAAGCTGATTCTCTGTCATGCCTGCAGGAAGGACAGTGTGAAAATGATCCAAAAAGCAAAAAAGTTTCAGATATAAAAGAAGAGGTCTTGGCTGCAGCATGTCACCCAGTACAACATTCAAAAGTGGAATACAGTGATACTGACTTTCAATCCCAGAAAAGTCTTTTATATGATCATGAAAATGCCAGCACTCTTATTTTAACTCCTACTTCCAAGGATGTTCTGTCAAACCTAGTCATGATTTCTAGAGGCAAAGAATCATACAAAATGTCAGACAAGCTCAAAGGTAACAATTATGAATCTGATGTTGAATTAACCAAAAATATTCCCATGGAAAAGAATCAAGATGTATGTGCTTTAAATGAAAATTATAAAAACGTTGAGCTGTTGCCACCTGAAAAATACATGAGAGTAGCATCACCTTCAAGAAAGGTACAATTCAACCAAAACACAAATCTAAGAGTAATCCAAAAAAATCAAGAAGAAACTACTTCAATTTCAAAAATAACTGTCAATCCAGACTCTGAAGAACTTTTCTCAGACAATGAGAATAATTTTGTCTTCCAAGTAGCTAATGAAAGGAATAATCTTGCTTTAGGAAATACTAAGGAACTTCATGAAACAGACTTGACTTGTGTAAACGAACCCATTT
TCAAGAACTAAAAAGAGAAGCTGCAAGTCATGGATTTGGAAAAACATCAGGGAATTCATTTAAAGTAAATAGCTGCAAAGACCACATTGGAAAGTCAATGCCAAATGTCCTAGAAGATGAAGTATATGAAACAGTTGTAGATACCTCTGAAGAAGATAGTTTTTCATTATGTTTTTCTAAATGTAGAACAAAAAATCTACAAAAAGTAAGAACTAGCAAGACTAGGAAAAAAATTTTCCATGAAGCAAACGCTGATGAATGTGAAAAATCTAAAAACCAAGTGAAAGAAAAATACTCATTTGTATCTGAAGTGGAACCAAATGATACTGATCCATTAGATTCAAATGTAGCAAATCAGAAGCCCTTTGAGAGTGGAAGTGACAAAATCTCCAAGGAAGTTGTACCGTCTTTGGCCTGTGAATGGTCTCAACTAACCCTTTCAGGTCTAAATGGAGCCCAGATGGAGAAAATACCCCTATTGCATATTTCTTCATGTGACCAAAATATTTCAGAAAAAGACCTATTAGACACAGAGAACAAAAGAAAGAAAGATTTTCTTACTTCAGAGAATTCTTTGCCACGTATTTCTAGCCTACCAAAATCAGAGAAGCCATTAAATGAGGAAACAGTGGTAAATAAGAGAGATGAAGAGCAGCATCTTGAATCTCATACAGACTGCATTCTTGCAGTAAAGCAGGCAATATCTGGAACTTCTCCAGTGGCTTCTTCATTTCAGGGTATCAAAAAGTCTATATTCAGAATAAGAGAATCACCTAAAGAGACTTTCAATGCAAGTTTTTCAGGTCATATGACTGATCCAAACTTTAAAAAAGAAACTGAAGCCTCTGAAAGTGGACTGGAAATACATACTGTTTGCTCACAGAAGGAGGACTCCTTATGTCCAAATTTAATTGATAATGGAAGCTGGCCAGCCACCACCACACAGAATTCTGTAGCTTTGAAGAATGCAGGTTTAATATCCACTTTGAAAAAGAAAACAAATAAGTTTATTTATGCTATACATGATGAAACATCTTATAAAGGAAAAAAAATACCGAAAGACCAAAAATCAGAACTAATTAACTGTTCAGCCCAGTTTGAAGCAAATGCTTTTGAAGCACCACTTACATTTGCAAATGCTGATTCAGGTTTATTGCATTCTTCTGTGAAAAGAAGCTGTTCACAGAATGATTCTGAAGAACCAACTTTGTCCTTAACTAGCTCTTTTGGGACAATTCTGAGGAAATGTTCTAGAAATGAAACATGTTCTAATAATACAGTAATCTCTCAGGATCTTGATTATAAAGAAGCAAAATGTAATAAGGAAAAACTACAGTTATTTATTACCCCAGAAGCTGATTCTCTGTCATGCCTGCAGGAAGGACAGTGTGAAAATGATCCAAAAAGCAAAAAAGTTTCAGATATAAAAGAAGAGGTCTTGGCTGCAGCATGTCACCCAGTACAACATTCAAAAGTGGAATACAGTGATACTGACTTTCAATCCCAGAAAAGTCTTTTATATGATCATGAAAATGCCAGCACTCTTATTTTAACTCCTACTTCCAAGGATGTTCTGTCAAACCTAGTCATGATTTCTAGAGGCAAAGAATCATACAAAATGTCAGACAAGCTCAAAGGTAACAATTATGAATCTGATGTTGAATTAACCAAAAATATTCCCATGGAAAAGAATCAAGATGTATGTGCTTTAAATGAAAATTATAAAAACGTTGAGCTGTTGCCACCTGAAAAATACATGAGAGTAGCATCACCTTCAAGAAAGGTACAATTCAACCAAAACACAAATCTAAGAGTAATCCAAAAAAATCAAGAAGAAACTACTTCAATTTCAAAAATAACTGTCAATCCAGACTCTGAAGAACTTTTCTCAGACAATGAGAATAATTTTGTCTTCCAAGTAGCTAATGAAAGGAATAATCTTGCTTTAGGAAATACTAAGGAACTTCATGAAACAGACTTGACTTGTGTAAACGAACCCA
TTTTCAAGAACTA"
Test.describe('Basic Tests') do
res = {"ORF1"=>{:start=>"88", :stop=>"2004", :strand=>"+", :seq=>"MPNVLEDEVYETVVDTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEPNDTDPLDSNVANQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCDQNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDCILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGLEIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSYKGKKIPKDQKSELINCSAQFEANAFEAPLTFANADSGLLHSSVKRSCSQNDSEEPTLSLTSSFGTILRKCSRNETCSNNTVISQDLDYKEAKCNKEKLQLFITPEADSLSCLQEGQCENDPKSKKVSDIKEEVLAAACHPVQHSKVEYSDTDFQSQKSLLYDHENASTLILTPTSKDVLSNLVMISRGKESYKMSDKLKGNNYESDVELTKNIPMEKNQDVCALNENYKNVELLPPEKYMRVASPSRKVQFNQNTNLRVIQKNQEETTSISKITVNPDSEELFSDNENNFVFQVANERNNLALGNTKELHETDLTCVNEPIFKN", :bp=>1917, :aa=>638, :frame=>1}, "ORF2"=>{:start=>"2443", :stop=>"2520", :strand=>"+", :seq=>"MEPRWRKYPYCIFLHVTKIFQKKTY", :bp=>78, :aa=>25, :frame=>1}, "ORF3"=>{:start=>"440", :stop=>"517", :strand=>"+", :seq=>"MEPRWRKYPYCIFLHVTKIFQKKTY", :bp=>78, :aa=>25, :frame=>2}, "ORF4"=>{:start=>"2414", :stop=>"2491", :strand=>"+", :seq=>"MVSTNPFRSKWSPDGENTPIAYFFM", :bp=>78, :aa=>25, :frame=>2}, "ORF5"=>{:start=>"411", :stop=>"488", :strand=>"+", :seq=>"MVSTNPFRSKWSPDGENTPIAYFFM", :bp=>78, :aa=>25, :frame=>3}, "ORF6"=>{:start=>"2091", :stop=>">4004", :strand=>"+", :seq=>"MPNVLEDEVYETVVDTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEPNDTDPLDSNVANQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCDQNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDCILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGLEIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSYKGKKIPKDQKSELINCSAQFEANAFEAPLTFANADSGLLHSSVKRSCSQNDSEEPTLSLTSSFGTILRKCSRNETCSNNTVISQDLDYKEAKCNKEKLQLFITPEADSLSCLQEGQCENDPKSKKVSDIKEEVLAAACHPVQHSKVEYSDTDFQSQKSLLYDHENASTLILTPTSKDVLSNLVMISRGKESYKMSDKLKGNNYESDVELTKNIPMEKNQDVCALNENYKNVELLPPEKYMRVASPSRKVQFNQNTNLRVIQKNQEETTSISKITVNPDSEELFSDNENNFVFQVANERNNLALGNTKELHETDLTCVNEPIFK", :bp=>1914, :aa=>637, :frame=>3}, "ORF7"=>{:start=>"1759", :stop=>"1622", 
:strand=>"-", :seq=>"MLLSCIFQVATAQRFYNFHLKHIHLDSFPWEYFWLIQHQIHNCYL", :bp=>138, :aa=>45, :frame=>-1}, "ORF8"=>{:start=>"1441", :stop=>"1334", :strand=>"-", :seq=>"MLQPRPLLLYLKLFCFLDHFHTVLPAGMTENQLLG", :bp=>108, :aa=>35, :frame=>-1}, "ORF9"=>{:start=>"721", :stop=>"578", :strand=>"-", :seq=>"MKKPLEKFQILPALLQECSLYEIQDAALHLSYLPLFPHLMASLILVG", :bp=>144, :aa=>47, :frame=>-1}, "ORF10"=>{:start=>"484", :stop=>"260", :strand=>"-", :seq=>"MKKYAIGVFSPSGLHLDLKGLVETIHRPKTVQLPWRFCHFHSQRASDLLHLNLMDQYHLVPLQIQMSIFLSLGF", :bp=>225, :aa=>74, :frame=>-1}, "ORF11"=>{:start=>"157", :stop=>"50", :strand=>"-", :seq=>"MKNYLLQRYLQLFHILHLLGHLALTFQCGLCSYLL", :bp=>108, :aa=>35, :frame=>-1}, "ORF12"=>{:start=>"3762", :stop=>"3625", :strand=>"-", :seq=>"MLLSCIFQVATAQRFYNFHLKHIHLDSFPWEYFWLIQHQIHNCYL", :bp=>138, :aa=>45, :frame=>-2}, "ORF13"=>{:start=>"3444", :stop=>"3337", :strand=>"-", :seq=>"MLQPRPLLLYLKLFCFLDHFHTVLPAGMTENQLLG", :bp=>108, :aa=>35, :frame=>-2}, "ORF14"=>{:start=>"2724", :stop=>"2581", :strand=>"-", :seq=>"MKKPLEKFQILPALLQECSLYEIQDAALHLSYLPLFPHLMASLILVG", :bp=>144, :aa=>47, :frame=>-2}, "ORF15"=>{:start=>"2487", :stop=>"2263", :strand=>"-", :seq=>"MKKYAIGVFSPSGLHLDLKGLVETIHRPKTVQLPWRFCHFHSQRASDLLHLNLMDQYHLVPLQIQMSIFLSLGF", :bp=>225, :aa=>74, :frame=>-2}, "ORF16"=>{:start=>"2160", :stop=>"2053", :strand=>"-", :seq=>"MKNYLLQRYLQLFHILHLLGHLALTFQCGLCSYLL", :bp=>108, :aa=>35, :frame=>-2}, "ORF17"=>{:start=>"2052", :stop=>"1936", :strand=>"-", :seq=>"MNSLMFFQIHDLQLLFLVLENGFVYTSQVCFMKFLSIS", :bp=>117, :aa=>38, :frame=>-2}}
Test.it('Testing for Basic Functionality') do
Test.assert_equals(find_orfs(seq,75), res)
end
end
# Test.describe("Random tests") do
# def sol_get_frames(seq)
# rev = seq.gsub(/[ATGC]/, 'A' => 'T', 'T' => 'A', 'G' => 'C', 'C' => 'G').reverse
# { "1": seq, "2": seq[1..-1], "3": seq[2..-1],
# "-1": rev, "-2": rev[1..-1], "-3": rev[2..-1] }
# end
# def sol_find_orf_in_frame(seq,frame,min,len)
# orfs = []
# tri_nts = seq.scan(/.{3}/)
# start = stop = seq = ""
# seq = seq.reverse.tr("ACTG","TGAC")
# is_neg = frame < 0
# neg_offset = {"1" => 1, "2" => 3, "3" => 5}
# offset = neg_offset[frame.abs().to_s]
# tri_nts.each_with_index do |tri_nt,i|
# if (i == tri_nts.size - 1)
# stop = ((i*3)+frame+2)
# start = (len - start.to_i - offset) if is_neg
# stop = (len - stop.to_i - offset) if is_neg
# orfs << [start.to_s,">#{stop}",seq] if (seq.length + 3) >= min
# break
# end
# if seq == "" && tri_nt == "ATG"
# seq += tri_nt
# start = (i * 3) + frame
# elsif (!$codons.select {|k,v| v == "*"}.keys.include?(tri_nt)) && seq != ""
# seq += tri_nt
# elsif $codons.select {|k,v| v == "*"}.keys.include?(tri_nt)
# if (seq.length + 3) >= min
# stop = ((i*3)+frame+2)
# start = (len - start.to_i - offset) if is_neg
# stop = (len - stop.to_i - offset) if is_neg
# orfs << [start.to_s,stop.to_s,seq]
# end
# start = stop = seq = ""
# end
# end
# orfs
# end
# def sol_find_orfs(seq,min)
# frames = sol_get_frames(seq)
# len = seq.length
# all_orfs = Hash.new()
# count = 1
# [1,2,3,-1,-2,-3].each do |frame|
# is_neg = frame < 0 ? true : false
# seq = frames[frame.to_s.to_sym]
# orfs = sol_find_orf_in_frame(seq,frame,min,len)
# orfs.each do |orf|
# orf_info = {
# start: orf[0],
# stop: orf[1],
# strand: frame > 0 ? "+" : "-",
# seq: orf[2].scan(/.{3}/).map{|a| $codons[a]}.join(),
# bp: orf[2].length + 3,
# aa: orf[2].length / 3,
# frame: frame,
# }
# all_orfs["ORF#{count}"] = orf_info
# count += 1
# end
# end
# all_orfs
# end
# def generate_random_seq(limit)
# seq = ""
# nb_sub_seq = rand(3..6)
# all = $codons.keys
# no_stop = $codons.select {|k,v| v != "*"}.keys
# nb_sub_seq.times do |n|
# sub_seq = ""
# is_orf = [true,false].sample
# limit.times do |t|
# if is_orf
# sub_seq += sub_seq == "" ? "ATG" : no_stop.sample
# else
# sub_seq += all.sample
# end
# end
# seq += sub_seq
# end
# seq
# end
# 10.times do |x|
# limit = rand(50..150)
# r_seq = generate_random_seq(limit)
# Test.it("Testing for #{seq} and #{limit}") do
# Test.assert_equals(find_orfs(r_seq,limit), sol_find_orfs(r_seq,limit),"It should work for random inputs too")
# end
# end
# end