Skip to content

Commit

Permalink
correct tests
Browse files Browse the repository at this point in the history
  • Loading branch information
camilogarciabotero committed Jan 9, 2023
1 parent 604643c commit 2fa6108
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 59 deletions.
2 changes: 1 addition & 1 deletion src/GeneFinder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ include("algorithms/simplefinder.jl")
export simplefind, simplefind_extended, simplecds_generator, simpleprot_generator

include("types.jl")
export ORF, Codon, stopcodons, startcodon, extended_startcodons
export ORF, Codon, CDS, Protein, stopcodons, startcodon, extended_startcodons

include("helpers.jl")
export eachcodon, hasprematurestop
Expand Down
112 changes: 54 additions & 58 deletions src/algorithms/simplefinder.jl
Original file line number Diff line number Diff line change
@@ -1,28 +1,48 @@
"""
simplefind(sequence::LongDNA)
simplefind(sequence::LongDNA; alternative_start::Bool=false)
The simplest algorithm that finds ORFs in a DNA sequence.
The simplefind function takes a LongDNA sequence and returns a Vector{ORF} containing the ORFs found in the sequence. It searches the sequence for start codons (ATG) and stops when it finds a stop codon (TAG, TAA, or TGA), adding each ORF it finds to the vector. The function also searches the reverse complement of the sequence, so it finds ORFs on both strands.
This function has not ORFs size and overlapping condition contraints. Thus it might consider `aa"M*"` a posible encoding protein from the resulting ORFs.
Extending the starting codons search to include ATG, GTG, and TTG.
In E. coli (K-12 strain), ATG is used in about 83 % of its genes, GTG in 14 % and TTG in 3 % of the cases @ref[axelson-fisk_comparative_2015]
"""
function simplefind(sequence::LongDNA)
function simplefind(sequence::LongDNA; alternative_start::Bool=false)
orf = nothing
orfs = Vector{ORF}()
seqbound = length(sequence) - 2 #3

for strand in ['+', '-']
seq = strand == '-' ? reverse_complement(sequence) : sequence

start_codon_indices = findall(startcodon, seq)

for i in start_codon_indices
for j in i.start:3:seqbound
if seq[j:j+2] stopcodons
push!(orfs, orf)
break
if alternative_start == false
for strand in ['+', '-']
seq = strand == '-' ? reverse_complement(sequence) : sequence

start_codon_indices = findall(startcodon, seq)

for i in start_codon_indices
for j in i.start:3:seqbound
if seq[j:j+2] stopcodons
push!(orfs, orf)
break
end
orf = ORF(i.start:j+5, strand)
end
end
end
else
# using the extended start codon matrix
for strand in ['+', '-']
seq = strand == '-' ? reverse_complement(sequence) : sequence

start_codon_indices = findall(extended_startcodons, seq)

for i in start_codon_indices
for j in i:3:seqbound
if seq[j:j+2] stopcodons
push!(orfs, orf)
break
end
orf = ORF(i:j+5, strand)
end
orf = ORF(i.start:j+5, strand)
end
end
end
Expand All @@ -48,36 +68,6 @@ end
end


"""
simplefind_extended(sequence::LongDNA)
Similar to `simplefind` implementation that finds ORFs in a DNA sequence, but extending the starting codons search to include ATG, GTG, and TTG.
In E. coli (K-12 strain), ATG is used in about 83 % of its genes, GTG in 14 % and TTG in 3 % of the cases @ref[axelson-fisk_comparative_2015]
"""
function simplefind_extended(sequence::LongDNA)
orf = nothing
orfs = Vector{ORF}()
seqbound = length(sequence) - 2 #3

for strand in ['+', '-']
seq = strand == '-' ? reverse_complement(sequence) : sequence

start_codon_indices = findall(extended_startcodons, seq)

for i in start_codon_indices
for j in i:3:seqbound
if seq[j:j+2] stopcodons
push!(orfs, orf)
break
end
orf = ORF(i:j+5, strand)
end
end
end
return orfs
end

"""
simplecds_generator(sequence::LongDNA; alternative_start::Bool=false)
Expand All @@ -89,18 +79,18 @@ The `simplecds_generator` is a generator function that takes a `LongDNA` sequenc
and then it extracts the actual CDS sequence from each ORF.
The function also searches the reverse complement of the sequence, so it finds CDSs on both strands.
"""
function simplecds_generator(sequence::LongDNA; alternative_start::Bool = false)
orfs = alternative_start == false ? simplefind(sequence) : simplefind_extended(sequence)
function simplecds_generator(sequence::LongDNA; alternative_start::Bool = false, min_len = 6)
orfs = simplefind(sequence; alternative_start)
reversedseq = reverse_complement(sequence)
cds = (i.strand == '+' ? sequence[i.location] : reversedseq[i.location] for i in orfs)
cds = (i.strand == '+' ? CDS(sequence[i.location],i) : CDS(reversedseq[i.location],i) for i in orfs if length(i.location) >= min_len)
return cds
end

function simplecds_generator(sequence::String; alternative_start::Bool=false)
function simplecds_generator(sequence::String; alternative_start::Bool=false, min_len = 6)
sequence = LongDNA{4}(sequence)
orfs = alternative_start == false ? simplefind(sequence) : simplefind_extended(sequence)
orfs = simplefind(sequence; alternative_start)
reversedseq = reverse_complement(sequence)
cds = (i.strand == '+' ? sequence[i.location] : reversedseq[i.location] for i in orfs)
cds = (i.strand == '+' ? CDS(sequence[i.location],i) : CDS(reversedseq[i.location],i) for i in orfs if length(i.location) >= min_len)
return cds
end

Expand All @@ -111,7 +101,10 @@ end
cds01 = collect(simplecds_generator(seq01))

@test length(cds01) == 5
@test cds01 == [dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG", dna"ATGCATGCATGCATGCTAGTAACTAGCTAG", dna"ATGCATGCATGCTAG", dna"ATGCATGCTAGTAACTAG", dna"ATGCTAGTAACTAGCTAG"]
# @test cds01 == [dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG", dna"ATGCATGCATGCATGCTAGTAACTAGCTAG", dna"ATGCATGCATGCTAG", dna"ATGCATGCTAGTAACTAG", dna"ATGCTAGTAACTAGCTAG"]
# @test cds01[1] == [CDS(dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG", ORF(1:33, '+')), CDS(dna"ATGCATGCATGCATGCTAGTAACTAGCTAG", ORF(4:33, '+')), CDS(dna"ATGCATGCATGCTAG", ORF(8:22, '+')), CDS(dna"ATGCATGCTAGTAACTAG", ORF(12:29, '+')), CDS(dna"ATGCTAGTAACTAGCTAG", ORF(16:33, '+'))]
@test cds01[1].sequence == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG"
@test cds01[1].orf == ORF(1:33, '+')
end

"""
Expand All @@ -121,18 +114,18 @@ As its name suggest this generator function that iterates over the sequence to f
The `simpleprot_generator` function takes a `LongDNA` sequence and returns a `Vector{CDS}` containing the
coding sequences (CDSs) found in the sequence.
"""
function simpleprot_generator(sequence::LongDNA; alternative_start::Bool = false, code::GeneticCode = BioSequences.standard_genetic_code)
orfs = alternative_start == false ? simplefind(sequence) : simplefind_extended(sequence)
function simpleprot_generator(sequence::LongDNA; alternative_start::Bool=false, code::GeneticCode=BioSequences.standard_genetic_code, min_len = 6)
orfs = simplefind(sequence; alternative_start)
reversedseq = reverse_complement(sequence)
proteins = (i.strand == '+' ? translate(sequence[i.location]; code) : translate(reversedseq[i.location]; code) for i in orfs)
proteins = (i.strand == '+' ? Protein(translate(sequence[i.location]; alternative_start, code), i) : Protein(translate(reversedseq[i.location]; alternative_start, code), i) for i in orfs if length(i.location) >= min_len)
return proteins
end

function simpleprot_generator(sequence::String; alternative_start::Bool = false, code::GeneticCode = BioSequences.standard_genetic_code)
function simpleprot_generator(sequence::String; alternative_start::Bool = false, code::GeneticCode = BioSequences.standard_genetic_code, min_len = 6)
sequence = LongDNA{4}(sequence)
orfs = alternative_start == false ? simplefind(sequence) : simplefind_extended(sequence)
orfs = simplefind(sequence; alternative_start)
reversedseq = reverse_complement(sequence)
proteins = (i.strand == '+' ? translate(sequence[i.location]; code) : translate(reversedseq[i.location]; code) for i in orfs)
proteins = (i.strand == '+' ? Protein(translate(sequence[i.location]; alternative_start, code), i) : Protein(translate(reversedseq[i.location]; alternative_start, code), i) for i in orfs if length(i.location) >= min_len)
return proteins
end

Expand All @@ -143,5 +136,8 @@ end
proteins01 = collect(simpleprot_generator(seq01))

@test length(proteins01) == 5
@test proteins01 == [aa"MMHACMLVTS*", aa"MHACMLVTS*", aa"MHAC*", aa"MHASN*", aa"MLVTS*"]
# @test proteins01 == [aa"MMHACMLVTS*", aa"MHACMLVTS*", aa"MHAC*", aa"MHASN*", aa"MLVTS*"]

@test proteins01[1].sequence == aa"MMHACMLVTS*"
@test proteins01[1].orf == ORF(1:33, '+')
end

2 comments on commit 2fa6108

@camilogarciabotero
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/75412

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.0.6 -m "<description of version>" 2fa610833b6edf6cf4d1cff79bee463a4cb60f69
git push origin v0.0.6

Please sign in to comment.