correct tests and add new writer

camilogarciabotero · Jan 10, 2023 · d13ff08 · d13ff08
1 parent f5b1c51
commit d13ff08
Show file tree

Hide file tree

Showing 8 changed files with 605 additions and 32 deletions.
diff --git a/docs/src/simplefinder.html b/docs/src/simplefinder.html
diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl
@@ -8,7 +8,7 @@ include("algorithms/simplefinder.jl")
 export simplefind, simplefind_extended, simplecds_generator, simpleprot_generator
 
 include("types.jl")
-export ORF, Codon, CDS, Protein, stopcodons, startcodon, extended_startcodons
+export ORF, Codon, CDS, Protein
 
 include("helpers.jl")
 export eachcodon, hasprematurestop
@@ -17,6 +17,9 @@ include("findgenes.jl")
 export locationgenerator, locationgenerator_extended, orfgenerator, cdsgenerator, proteingenerator
 
 include("io.jl")
-export write_cds, write_proteins
+export write_cds, write_proteins, write_bed
+
+include("constants.jl")
+export STOPCODONS, STARTCODON, EXTENDED_STARTCODONS
 
 end
diff --git a/src/algorithms/simplefinder.jl b/src/algorithms/simplefinder.jl
@@ -16,11 +16,11 @@ function simplefind(sequence::LongDNA; alternative_start::Bool=false)
         for strand in ['+', '-']
             seq = strand == '-' ? reverse_complement(sequence) : sequence
 
-            start_codon_indices = findall(startcodon, seq)
+            start_codon_indices = findall(STARTCODON, seq)
 
             for i in start_codon_indices
                 for j in i.start:3:seqbound
-                    if seq[j:j+2] ∈ stopcodons
+                    if seq[j:j+2] ∈ STOPCODONS
                         push!(orfs, orf)
                         break
                     end
@@ -33,11 +33,11 @@ function simplefind(sequence::LongDNA; alternative_start::Bool=false)
         for strand in ['+', '-']
             seq = strand == '-' ? reverse_complement(sequence) : sequence
 
-            start_codon_indices = findall(extended_startcodons, seq)
+            start_codon_indices = findall(EXTENDED_STARTCODONS, seq)
 
             for i in start_codon_indices
                 for j in i:3:seqbound
-                    if seq[j:j+2] ∈ stopcodons
+                    if seq[j:j+2] ∈ STOPCODONS
                         push!(orfs, orf)
                         break
                     end

diff --git a/src/constants.jl b/src/constants.jl
@@ -0,0 +1,3 @@
+const STOPCODONS = [Codon("TAG"), Codon("TAA"), Codon("TGA")]  # codon triplets tested with `∈` to detect where an ORF ends
+const STARTCODON = ExactSearchQuery(Codon("ATG"), iscompatible)  # query given to `findall` when `alternative_start` is false
+const EXTENDED_STARTCODONS = PWMSearchQuery([Codon("ATG"), Codon("GTG"), Codon("TTG")], 1.0)  # query used when `alternative_start` is true; NOTE(review): 1.0 is presumably the score threshold — confirm
diff --git a/src/findgenes.jl b/src/findgenes.jl
@@ -16,14 +16,14 @@ Returns:
 function locationgenerator(sequence::LongDNA; alternative_start::Bool=false)
     seqbound = length(sequence) - 2
     if alternative_start == false
-        start_codon_indices = findall(startcodon, sequence)
+        start_codon_indices = findall(STARTCODON, sequence)
         @inbounds begin
-            (i.start:j+2 for i in start_codon_indices for j in i.start:3:seqbound if sequence[j:j+2] ∈ stopcodons && !hasprematurestop(sequence[i.start:j+2]))
+            (i.start:j+2 for i in start_codon_indices for j in i.start:3:seqbound if sequence[j:j+2] ∈ STOPCODONS && !hasprematurestop(sequence[i.start:j+2]))
         end
     else
-        start_codon_indices = findall(extended_startcodons, sequence)
+        start_codon_indices = findall(EXTENDED_STARTCODONS, sequence)
         @inbounds begin
-            (i:j+2 for i in start_codon_indices for j in i:3:seqbound if sequence[j:j+2] ∈ stopcodons && !hasprematurestop(sequence[i:j+2]))
+            (i:j+2 for i in start_codon_indices for j in i:3:seqbound if sequence[j:j+2] ∈ STOPCODONS && !hasprematurestop(sequence[i:j+2]))
         end
     end
 end

diff --git a/src/helpers.jl b/src/helpers.jl
@@ -1,7 +1,3 @@
-using BioSequences
-using TestItems
-include("types.jl")
-
 """
     eachcodon(sequence::LongDNA)
 
@@ -32,7 +28,7 @@ Returns a boolean indicating whether the `sequence` has more than one stop codon
 function hasprematurestop(sequence::LongDNA)::Bool
     stop_codon_count = 0
     @inbounds for codon in eachcodon(sequence)
-        if codon ∈ stopcodons
+        if codon ∈ STOPCODONS
             stop_codon_count += 1
         end
     end

diff --git a/src/io.jl b/src/io.jl
@@ -1,13 +1,33 @@
+"""
+    write_bed(file::String, seq::LongDNA; alternative_start=false)
+
+Write one tab-separated line (`start`, `stop`, `strand`) per ORF returned by `simplefind`.
+
+Parameters:
+- `file`: path of the output file; it is opened in `"w"` mode.
+- `seq`: the `LongDNA` sequence handed to `simplefind`.
+- `alternative_start`: forwarded unchanged to `simplefind`; default is `false`.
+
+"""
+function write_bed(file::String, seq::LongDNA; alternative_start=false)
+   open(file, "w") do f
+      for orf in simplefind(seq; alternative_start)  # plain loop: `@simd` gains nothing around I/O
+         write(f, "$(orf.location.start)\t$(orf.location.stop)\t$(orf.strand)\n")
+      end
+   end
+end
+
+
 """
    write_cds(file::String, seq::LongDNA; alternative_start=false, min_len = 6)
 
 Write a file containing the coding sequences (CDSs) of a given DNA sequence to the specified file.
 
-## Parameters
+Parameters:
 - `file`: A string representing the file path and name where the CDSs should be written.
 - `seq`: A `LongDNA` object representing the DNA sequence from which the CDSs should be extracted.
 
-## Keyword Arguments
+Keyword Arguments:
 - `alternative_start`: A boolean value indicating whether alternative start codons should be used when identifying CDSs. Default is `false`.
 - `min_len`: An integer representing the minimum length that a CDS must have in order to be included in the output file. Default is `6`.
 """
@@ -19,17 +39,16 @@ function write_cds(file::String, seq::LongDNA; alternative_start=false, min_len
     end
 end
 
-
 """
    write_proteins(file::String, seq::LongDNA; alternative_start = false, code::GeneticCode = BioSequences.standard_genetic_code, min_len = 6)
 
 Write a file containing the protein sequences encoded by the coding sequences (CDSs) of a given DNA sequence to the specified file.
 
-## Parameters
+Parameters:
 - `file`: A string representing the file path and name where the protein sequences should be written.
 - `seq`: A `LongDNA` object representing the DNA sequence from which the CDSs and protein sequences should be extracted.
 
-## Keyword Arguments
+Keyword Arguments:
 - `alternative_start`: A boolean value indicating whether alternative start codons should be used when identifying CDSs and translating them into protein sequences. Default is `false`.
 - `code`: A `GeneticCode` object representing the genetic code that should be used to translate the CDSs into protein sequences. Default is the standard genetic code.
 - `min_len`: An integer representing the minimum length that a protein sequence must have in order to be included in the output file. Default is `6`.
@@ -42,6 +61,9 @@ function write_proteins(file::String, seq::LongDNA; alternative_start = false, c
    end
 end
 
+
+
+
 # FASTA.Writer(open("some_file.fna", "w")) do writer
 #     write(writer, record) # a FASTA.Record
 # end
diff --git a/src/types.jl b/src/types.jl
@@ -1,9 +1,8 @@
 # Structs associated with gene models 
-# using GenomicFeatures
+
 abstract type Gene end
 
 # abstract type exon end
-
 # abstract type intron end
 
 """
@@ -66,13 +65,6 @@ function Base.count(codons::Vector{Codon}, sequence::LongDNA)
     return a
 end
 
-const stopcodons = [Codon("TAG"), Codon("TAA"), Codon("TGA")]
-
-const startcodon = ExactSearchQuery(Codon("ATG"), iscompatible)
-
-const extended_startcodons = PWMSearchQuery([Codon("ATG"), Codon("GTG"), Codon("TTG")], 1.0)
-
-
 # """
 
 """
@@ -92,6 +84,11 @@ struct CDS
 end
 
 """
+    struct Protein
+        sequence::LongSequence
+        orf::ORF
+    end
+    
 Similarly to the `CDS` struct, the `Protein` struct represents a encoded protein sequence in a DNA sequence. 
     It has three fields:
 
@@ -103,9 +100,6 @@ struct Protein
     orf::ORF
 end
 
-
-
-
 # Similarly to the `CDS` struct, the `Protein` struct represents a encoded protein sequence in a DNA sequence. 
 #     It has three fields: