From 2fa610833b6edf6cf4d1cff79bee463a4cb60f69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= <ca.garcia2@uniandes.edu.co>
Date: Mon, 9 Jan 2023 13:36:30 -0500
Subject: [PATCH] correct tests

---
 src/GeneFinder.jl              |   2 +-
 src/algorithms/simplefinder.jl | 112 ++++++++++++++++-----------------
 2 files changed, 55 insertions(+), 59 deletions(-)

diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl
index 941bcc4..04380b7 100644
--- a/src/GeneFinder.jl
+++ b/src/GeneFinder.jl
@@ -8,7 +8,7 @@ include("algorithms/simplefinder.jl")
 export simplefind, simplefind_extended, simplecds_generator, simpleprot_generator
 
 include("types.jl")
-export ORF, Codon, stopcodons, startcodon, extended_startcodons
+export ORF, Codon, CDS, Protein, stopcodons, startcodon, extended_startcodons
 
 include("helpers.jl")
 export eachcodon, hasprematurestop
diff --git a/src/algorithms/simplefinder.jl b/src/algorithms/simplefinder.jl
index efc74a8..208aac9 100644
--- a/src/algorithms/simplefinder.jl
+++ b/src/algorithms/simplefinder.jl
@@ -1,28 +1,48 @@
 """
-    simplefind(sequence::LongDNA)
+    simplefind(sequence::LongDNA; alternative_start::Bool=false)
 
 The simplest algorithm that finds ORFs in a DNA sequence.
 
 The simplefind function takes a LongDNA sequence and returns a Vector{ORF} containing the ORFs found in the sequence. It searches the sequence for start codons (ATG) and stops when it finds a stop codon (TAG, TAA, or TGA), adding each ORF it finds to the vector. The function also searches the reverse complement of the sequence, so it finds ORFs on both strands.
     This function has not ORFs size and overlapping condition contraints. Thus it might consider `aa"M*"` a posible encoding protein from the resulting ORFs.
+        Extending the starting codons search to include ATG, GTG, and TTG.
+    In E. coli (K-12 strain), ATG is used in about 83 % of its genes, GTG in 14 % and TTG in 3 % of the cases @ref[axelson-fisk_comparative_2015]
 """
-function simplefind(sequence::LongDNA)
+function simplefind(sequence::LongDNA; alternative_start::Bool=false)
     orf = nothing
     orfs = Vector{ORF}()
     seqbound = length(sequence) - 2 #3
-
-    for strand in ['+', '-']
-        seq = strand == '-' ? reverse_complement(sequence) : sequence
-
-        start_codon_indices = findall(startcodon, seq)
-
-        for i in start_codon_indices
-            for j in i.start:3:seqbound
-                if seq[j:j+2] ∈ stopcodons
-                    push!(orfs, orf)
-                    break
+    if alternative_start == false
+        for strand in ['+', '-']
+            seq = strand == '-' ? reverse_complement(sequence) : sequence
+
+            start_codon_indices = findall(startcodon, seq)
+
+            for i in start_codon_indices
+                for j in i.start:3:seqbound
+                    if seq[j:j+2] ∈ stopcodons
+                        push!(orfs, orf)
+                        break
+                    end
+                    orf = ORF(i.start:j+5, strand)
+                end
+            end
+        end
+    else
+        # using the extended start codon matrix 
+        for strand in ['+', '-']
+            seq = strand == '-' ? reverse_complement(sequence) : sequence
+    
+            start_codon_indices = findall(extended_startcodons, seq)
+    
+            for i in start_codon_indices
+                for j in i:3:seqbound
+                    if seq[j:j+2] ∈ stopcodons
+                        push!(orfs, orf)
+                        break
+                    end
+                    orf = ORF(i:j+5, strand)
                 end
-                orf = ORF(i.start:j+5, strand)
             end
         end
     end
@@ -48,36 +68,6 @@ end
 end
 
 
-"""
-    simplefind_extended(sequence::LongDNA)
-
-Similar to `simplefind` implementation that finds ORFs in a DNA sequence, but extending the starting codons search to include ATG, GTG, and TTG.
-    In E. coli (K-12 strain), ATG is used in about 83 % of its genes, GTG in 14 % and TTG in 3 % of the cases @ref[axelson-fisk_comparative_2015]
-
-"""
-function simplefind_extended(sequence::LongDNA)
-    orf = nothing
-    orfs = Vector{ORF}()
-    seqbound = length(sequence) - 2 #3
-
-    for strand in ['+', '-']
-        seq = strand == '-' ? reverse_complement(sequence) : sequence
-
-        start_codon_indices = findall(extended_startcodons, seq)
-
-        for i in start_codon_indices
-            for j in i:3:seqbound
-                if seq[j:j+2] ∈ stopcodons
-                    push!(orfs, orf)
-                    break
-                end
-                orf = ORF(i:j+5, strand)
-            end
-        end
-    end
-    return orfs
-end
-
 """
     simplecds_generator(sequence::LongDNA; alternative_start::Bool=false)
 
@@ -89,18 +79,18 @@ The `simplecds_generator` is a generator function that takes a `LongDNA` sequenc
     and then it extracts the actual CDS sequence from each ORF. 
     The function also searches the reverse complement of the sequence, so it finds CDSs on both strands.
 """
-function simplecds_generator(sequence::LongDNA; alternative_start::Bool = false)
-    orfs = alternative_start == false ? simplefind(sequence) : simplefind_extended(sequence)
+function simplecds_generator(sequence::LongDNA; alternative_start::Bool = false, min_len = 6)
+    orfs = simplefind(sequence; alternative_start)
     reversedseq = reverse_complement(sequence)
-    cds = (i.strand == '+' ? sequence[i.location] : reversedseq[i.location] for i in orfs)
+    cds = (i.strand == '+' ? CDS(sequence[i.location],i) : CDS(reversedseq[i.location],i) for i in orfs if length(i.location) >= min_len)
     return cds
 end
 
-function simplecds_generator(sequence::String; alternative_start::Bool=false)
+function simplecds_generator(sequence::String; alternative_start::Bool=false, min_len = 6)
     sequence = LongDNA{4}(sequence)
-    orfs = alternative_start == false ? simplefind(sequence) : simplefind_extended(sequence)
+    orfs = simplefind(sequence; alternative_start)
     reversedseq = reverse_complement(sequence)
-    cds = (i.strand == '+' ? sequence[i.location] : reversedseq[i.location] for i in orfs)
+    cds = (i.strand == '+' ? CDS(sequence[i.location],i) : CDS(reversedseq[i.location],i) for i in orfs if length(i.location) >= min_len)
     return cds
 end
 
@@ -111,7 +101,10 @@ end
     cds01 = collect(simplecds_generator(seq01))
 
     @test length(cds01) == 5
-    @test cds01 == [dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG", dna"ATGCATGCATGCATGCTAGTAACTAGCTAG", dna"ATGCATGCATGCTAG", dna"ATGCATGCTAGTAACTAG", dna"ATGCTAGTAACTAGCTAG"]
+    # @test cds01 == [dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG", dna"ATGCATGCATGCATGCTAGTAACTAGCTAG", dna"ATGCATGCATGCTAG", dna"ATGCATGCTAGTAACTAG", dna"ATGCTAGTAACTAGCTAG"]
+    # @test cds01[1] == [CDS(dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG", ORF(1:33, '+')), CDS(dna"ATGCATGCATGCATGCTAGTAACTAGCTAG", ORF(4:33, '+')), CDS(dna"ATGCATGCATGCTAG", ORF(8:22, '+')), CDS(dna"ATGCATGCTAGTAACTAG", ORF(12:29, '+')), CDS(dna"ATGCTAGTAACTAGCTAG", ORF(16:33, '+'))]
+    @test cds01[1].sequence == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG"
+    @test cds01[1].orf == ORF(1:33, '+')
 end
 
 """
@@ -121,18 +114,18 @@ As its name suggest this generator function that iterates over the sequence to f
     The `simpleprot_generator` function takes a `LongDNA` sequence and returns a `Vector{CDS}` containing the 
     coding sequences (CDSs) found in the sequence. 
 """
-function simpleprot_generator(sequence::LongDNA; alternative_start::Bool = false, code::GeneticCode = BioSequences.standard_genetic_code)
-    orfs = alternative_start == false ? simplefind(sequence) : simplefind_extended(sequence)
+function simpleprot_generator(sequence::LongDNA; alternative_start::Bool=false, code::GeneticCode=BioSequences.standard_genetic_code, min_len = 6)
+    orfs = simplefind(sequence; alternative_start)
     reversedseq = reverse_complement(sequence)
-    proteins = (i.strand == '+' ? translate(sequence[i.location]; code) : translate(reversedseq[i.location]; code) for i in orfs)
+    proteins = (i.strand == '+' ? Protein(translate(sequence[i.location]; alternative_start, code), i) : Protein(translate(reversedseq[i.location]; alternative_start, code), i) for i in orfs if length(i.location) >= min_len)
     return proteins
 end
 
-function simpleprot_generator(sequence::String; alternative_start::Bool = false, code::GeneticCode = BioSequences.standard_genetic_code)
+function simpleprot_generator(sequence::String; alternative_start::Bool = false, code::GeneticCode = BioSequences.standard_genetic_code, min_len = 6)
     sequence = LongDNA{4}(sequence)
-    orfs = alternative_start == false ? simplefind(sequence) : simplefind_extended(sequence)
+    orfs = simplefind(sequence; alternative_start)
     reversedseq = reverse_complement(sequence)
-    proteins = (i.strand == '+' ? translate(sequence[i.location]; code) : translate(reversedseq[i.location]; code) for i in orfs)
+    proteins = (i.strand == '+' ? Protein(translate(sequence[i.location]; alternative_start, code), i) : Protein(translate(reversedseq[i.location]; alternative_start, code), i) for i in orfs if length(i.location) >= min_len)
     return proteins
 end
 
@@ -143,5 +136,8 @@ end
     proteins01 = collect(simpleprot_generator(seq01))
     
     @test length(proteins01) == 5
-    @test proteins01 == [aa"MMHACMLVTS*", aa"MHACMLVTS*", aa"MHAC*", aa"MHASN*", aa"MLVTS*"]
+    # @test proteins01 == [aa"MMHACMLVTS*", aa"MHACMLVTS*", aa"MHAC*", aa"MHASN*", aa"MLVTS*"]
+
+    @test proteins01[1].sequence == aa"MMHACMLVTS*"
+    @test proteins01[1].orf == ORF(1:33, '+')
 end
\ No newline at end of file