Skip to content

Commit

Permalink
correct tests and add new writer
Browse files Browse the repository at this point in the history
  • Loading branch information
camilogarciabotero committed Jan 10, 2023
1 parent f5b1c51 commit d13ff08
Show file tree
Hide file tree
Showing 8 changed files with 605 additions and 32 deletions.
555 changes: 555 additions & 0 deletions docs/src/simplefinder.html

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions src/GeneFinder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ include("algorithms/simplefinder.jl")
export simplefind, simplefind_extended, simplecds_generator, simpleprot_generator

include("types.jl")
export ORF, Codon, CDS, Protein, stopcodons, startcodon, extended_startcodons
export ORF, Codon, CDS, Protein

include("helpers.jl")
export eachcodon, hasprematurestop
Expand All @@ -17,6 +17,9 @@ include("findgenes.jl")
export locationgenerator, locationgenerator_extended, orfgenerator, cdsgenerator, proteingenerator

include("io.jl")
export write_cds, write_proteins
export write_cds, write_proteins, write_bed

include("constants.jl")
export STOPCODONS, STARTCODON, EXTENDED_STARTCODONS

end
8 changes: 4 additions & 4 deletions src/algorithms/simplefinder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ function simplefind(sequence::LongDNA; alternative_start::Bool=false)
for strand in ['+', '-']
seq = strand == '-' ? reverse_complement(sequence) : sequence

start_codon_indices = findall(startcodon, seq)
start_codon_indices = findall(STARTCODON, seq)

for i in start_codon_indices
for j in i.start:3:seqbound
if seq[j:j+2] ∈ stopcodons
if seq[j:j+2] ∈ STOPCODONS
push!(orfs, orf)
break
end
Expand All @@ -33,11 +33,11 @@ function simplefind(sequence::LongDNA; alternative_start::Bool=false)
for strand in ['+', '-']
seq = strand == '-' ? reverse_complement(sequence) : sequence

start_codon_indices = findall(extended_startcodons, seq)
start_codon_indices = findall(EXTENDED_STARTCODONS, seq)

for i in start_codon_indices
for j in i:3:seqbound
if seq[j:j+2] ∈ stopcodons
if seq[j:j+2] ∈ STOPCODONS
push!(orfs, orf)
break
end
Expand Down
3 changes: 3 additions & 0 deletions src/constants.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Codons treated as ORF terminators; the finders and `hasprematurestop` test
# codons for membership in this vector.
const STOPCODONS = [Codon("TAG"), Codon("TAA"), Codon("TGA")]
# Query passed to `findall` to locate canonical `ATG` start codons; symbols are
# compared with `iscompatible`.
const STARTCODON = ExactSearchQuery(Codon("ATG"), iscompatible)
# Query passed to `findall` when `alternative_start` is enabled; built from the
# start codons ATG, GTG and TTG with a score threshold of 1.0.
# NOTE(review): PWMSearchQuery is normally built from a position weight matrix —
# confirm that a vector of codons is an accepted input here.
const EXTENDED_STARTCODONS = PWMSearchQuery([Codon("ATG"), Codon("GTG"), Codon("TTG")], 1.0)
8 changes: 4 additions & 4 deletions src/findgenes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ Returns:
function locationgenerator(sequence::LongDNA; alternative_start::Bool=false)
seqbound = length(sequence) - 2
if alternative_start == false
start_codon_indices = findall(startcodon, sequence)
start_codon_indices = findall(STARTCODON, sequence)
@inbounds begin
(i.start:j+2 for i in start_codon_indices for j in i.start:3:seqbound if sequence[j:j+2] ∈ stopcodons && !hasprematurestop(sequence[i.start:j+2]))
(i.start:j+2 for i in start_codon_indices for j in i.start:3:seqbound if sequence[j:j+2] ∈ STOPCODONS && !hasprematurestop(sequence[i.start:j+2]))
end
else
start_codon_indices = findall(extended_startcodons, sequence)
start_codon_indices = findall(EXTENDED_STARTCODONS, sequence)
@inbounds begin
(i:j+2 for i in start_codon_indices for j in i:3:seqbound if sequence[j:j+2] ∈ stopcodons && !hasprematurestop(sequence[i:j+2]))
(i:j+2 for i in start_codon_indices for j in i:3:seqbound if sequence[j:j+2] ∈ STOPCODONS && !hasprematurestop(sequence[i:j+2]))
end
end
end
Expand Down
6 changes: 1 addition & 5 deletions src/helpers.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
using BioSequences
using TestItems
include("types.jl")

"""
eachcodon(sequence::LongDNA)
Expand Down Expand Up @@ -32,7 +28,7 @@ Returns a boolean indicating whether the `sequence` has more than one stop codon
function hasprematurestop(sequence::LongDNA)::Bool
stop_codon_count = 0
@inbounds for codon in eachcodon(sequence)
if codon ∈ stopcodons
if codon ∈ STOPCODONS
stop_codon_count += 1
end
end
Expand Down
32 changes: 27 additions & 5 deletions src/io.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,33 @@
"""
    write_bed(file::String, seq::LongDNA; alternative_start=false)

Write the ORFs that `simplefind` reports for `seq` to `file`, one per line.

Each line holds three tab-separated columns: ORF start, ORF stop and strand.
The file is opened in `"w"` mode, so any existing content is overwritten.
No chromosome/sequence-name column is written.

# Arguments
- `file`: path of the file to write.
- `seq`: the `LongDNA` sequence to search for ORFs.

# Keywords
- `alternative_start`: forwarded unchanged to `simplefind`. Default is `false`.
"""
function write_bed(file::String, seq::LongDNA; alternative_start=false)
    open(file, "w") do f
        # Plain loop on purpose: the records must be written in order, and
        # `@simd` (which permits reordering iterations) buys nothing for I/O.
        for orf in simplefind(seq; alternative_start)
            # Second column is the ORF end; it previously repeated the start.
            # Assumes `orf.location` is a range exposing `.start`/`.stop`
            # (e.g. a `UnitRange`) — TODO confirm against the `ORF` definition.
            write(f, "$(orf.location.start)\t$(orf.location.stop)\t$(orf.strand)\n")
        end
    end
end


"""
write_cds(file::String, seq::LongDNA; alternative_start=false, min_len = 6)
Write a file containing the coding sequences (CDSs) of a given DNA sequence to the specified file.
## Parameters
Parameters:
- `file`: A string representing the file path and name where the CDSs should be written.
- `seq`: A `LongDNA` object representing the DNA sequence from which the CDSs should be extracted.
## Keyword Arguments
Keyword Arguments:
- `alternative_start`: A boolean value indicating whether alternative start codons should be used when identifying CDSs. Default is `false`.
- `min_len`: An integer representing the minimum length that a CDS must have in order to be included in the output file. Default is `6`.
"""
Expand All @@ -19,17 +39,16 @@ function write_cds(file::String, seq::LongDNA; alternative_start=false, min_len
end
end


"""
write_proteins(file::String, seq::LongDNA; alternative_start = false, code::GeneticCode = BioSequences.standard_genetic_code, min_len = 6)
Write a file containing the protein sequences encoded by the coding sequences (CDSs) of a given DNA sequence to the specified file.
## Parameters
Parameters:
- `file`: A string representing the file path and name where the protein sequences should be written.
- `seq`: A `LongDNA` object representing the DNA sequence from which the CDSs and protein sequences should be extracted.
## Keyword Arguments
Keyword Arguments:
- `alternative_start`: A boolean value indicating whether alternative start codons should be used when identifying CDSs and translating them into protein sequences. Default is `false`.
- `code`: A `GeneticCode` object representing the genetic code that should be used to translate the CDSs into protein sequences. Default is the standard genetic code.
- `min_len`: An integer representing the minimum length that a protein sequence must have in order to be included in the output file. Default is `6`.
Expand All @@ -42,6 +61,9 @@ function write_proteins(file::String, seq::LongDNA; alternative_start = false, c
end
end




# FASTA.Writer(open("some_file.fna", "w")) do writer
# write(writer, record) # a FASTA.Record
# end
18 changes: 6 additions & 12 deletions src/types.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# Structs associated with gene models
# using GenomicFeatures

abstract type Gene end

# abstract type exon end

# abstract type intron end

"""
Expand Down Expand Up @@ -66,13 +65,6 @@ function Base.count(codons::Vector{Codon}, sequence::LongDNA)
return a
end

const stopcodons = [Codon("TAG"), Codon("TAA"), Codon("TGA")]

const startcodon = ExactSearchQuery(Codon("ATG"), iscompatible)

const extended_startcodons = PWMSearchQuery([Codon("ATG"), Codon("GTG"), Codon("TTG")], 1.0)


# """

"""
Expand All @@ -92,6 +84,11 @@ struct CDS
end

"""
struct Protein
sequence::LongSequence
orf::ORF
end
Similarly to the `CDS` struct, the `Protein` struct represents an encoded protein sequence in a DNA sequence.
It has three fields:
Expand All @@ -103,9 +100,6 @@ struct Protein
orf::ORF
end




# Similarly to the `CDS` struct, the `Protein` struct represents a encoded protein sequence in a DNA sequence.
# It has three fields:

Expand Down

0 comments on commit d13ff08

Please sign in to comment.