Skip to content

Commit 3e56ded

Browse files
committed
Merge branch 'feature/transcodingstreams' into develop
2 parents 47e1237 + d3a1327 commit 3e56ded

File tree

7 files changed

+185
-91
lines changed

7 files changed

+185
-91
lines changed

Project.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,23 @@ version = "0.1.1"
66
[deps]
77
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
88
BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
9-
BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81"
9+
BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
1010
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
11-
BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
1211
FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
1312
GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
1413
Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d"
14+
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
1515
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
1616

1717
[compat]
1818
Automa = "0.7, 0.8"
1919
BGZFStreams = "0.3"
20-
BioCore = "2"
20+
BioGenerics = "0.1"
2121
BioSequences = "2"
22-
BufferedStreams = "1"
2322
FASTX = "1"
2423
GenomicFeatures = "2"
2524
Indexes = "0.1"
25+
TranscodingStreams = "0.9.5"
2626
URIParser = "0.4"
2727
julia = "1"
2828

docs/src/man/gff3.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,30 @@ end
3434
close(reader)
3535
```
3636

37+
The iterator interface demonstrated above allocates a new object for each record, which may become a bottleneck when reading data from a file.
38+
In-place reading reuses a single pre-allocated object for every record, so much less memory is allocated while reading:
39+
40+
```julia
41+
# Import the GFF3 module.
42+
using GFF3
43+
44+
# Open a GFF3 file.
45+
reader = open(GFF3.Reader, "data.gff3")
46+
47+
# Pre-allocate record.
48+
record = GFF3.Record()
49+
50+
# Iterate over records.
51+
while !eof(reader)
52+
empty!(record)
53+
read!(reader, record)
54+
# do something
55+
end
56+
57+
# Finally, close the reader.
58+
close(reader)
59+
```
60+
3761
If you are interested in directives (which start with '#') in addition to genomic features, you need to pass `skip_directives=false` when constructing a GFF3 reader:
3862
```julia
3963
# Set skip_directives to false (this is set to true by default).

src/GFF3.jl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,18 @@
33

44
module GFF3
55

6-
using BioCore
6+
using BioGenerics
77
using Indexes
88
using FASTX.FASTA #TODO: move responsibility to FASTX.jl.
9+
using TranscodingStreams
910

1011
import Automa
1112
import Automa.RegExp: @re_str
13+
import Automa.Stream: @mark, @markpos, @relpos, @abspos
14+
1215
import BGZFStreams
13-
import BioCore.Exceptions: missingerror
16+
import BioGenerics.Exceptions: missingerror
1417
import BioSequences
15-
import BufferedStreams
1618
import GenomicFeatures: GenomicFeatures, Interval, IntervalCollection
1719
import URIParser
1820

src/reader.jl

Lines changed: 127 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# GFF3 Reader
22
# ===========
33

4-
mutable struct Reader <: BioCore.IO.AbstractReader
5-
state::BioCore.Ragel.State
4+
mutable struct Reader{S <: TranscodingStream} <: BioGenerics.IO.AbstractReader
5+
state::BioGenerics.Automa.State{S}
66
index::Union{Indexes.Tabix, Nothing}
77
save_directives::Bool
88
targets::Vector{Symbol}
@@ -11,8 +11,9 @@ mutable struct Reader <: BioCore.IO.AbstractReader
1111
directive_count::Int
1212
preceding_directive_count::Int
1313

14-
function Reader(input::BufferedStreams.BufferedInputStream, index=nothing, save_directives::Bool=false, skip_features::Bool=false, skip_directives::Bool=true, skip_comments::Bool=true)
15-
if isa(index, Indexes.Tabix) && !isa(input.source, BGZFStreams.BGZFStream)
14+
function Reader(input::S, index=nothing, save_directives::Bool=false, skip_features::Bool=false, skip_directives::Bool=true, skip_comments::Bool=true) where S <: TranscodingStream
15+
16+
if isa(index, Indexes.Tabix) && !isa(input.stream, BGZFStreams.BGZFStream)
1617
throw(ArgumentError("not a BGZF stream"))
1718
end
1819
targets = Symbol[]
@@ -25,7 +26,7 @@ mutable struct Reader <: BioCore.IO.AbstractReader
2526
if !skip_comments
2627
push!(targets, :comment)
2728
end
28-
return new(BioCore.Ragel.State(body_machine.start_state, input), index, save_directives, targets, false, Record[], 0, 0)
29+
return new{S}(BioGenerics.Automa.State(input, body_machine.start_state, 1, false), index, save_directives, targets, false, Record[], 0, 0)
2930
end
3031
end
3132

@@ -63,7 +64,15 @@ function Reader(input::IO; index=nothing, save_directives::Bool=false, skip_feat
6364
if isa(index, AbstractString)
6465
index = Indexes.Tabix(index)
6566
end
66-
return Reader(BufferedStreams.BufferedInputStream(input), index, save_directives, skip_features, skip_directives, skip_comments)
67+
68+
if isa(input, TranscodingStream)
69+
return Reader(input, index, save_directives, skip_features, skip_directives, skip_comments)
70+
end
71+
72+
stream = TranscodingStreams.NoopStream(input)
73+
74+
return Reader(stream, index, save_directives, skip_features, skip_directives, skip_comments)
75+
6776
end
6877

6978
function Reader(filepath::AbstractString; index=:auto, save_directives::Bool=false, skip_features::Bool=false, skip_directives::Bool=true, skip_comments::Bool=true)
@@ -85,19 +94,35 @@ function Base.eltype(::Type{<:Reader})
8594
return Record
8695
end
8796

88-
function BioCore.IO.stream(reader::Reader)
97+
function BioGenerics.IO.stream(reader::Reader)
8998
return reader.state.stream
9099
end
91100

92101
function Base.eof(reader::Reader)
93-
return reader.state.finished || eof(reader.state.stream)
102+
return reader.state.filled || eof(reader.state.stream)
94103
end
95104

96105
function Base.close(reader::Reader)
97106
# make trailing directives accessible
98107
reader.directive_count = reader.preceding_directive_count
99108
reader.preceding_directive_count = 0
100-
close(BioCore.IO.stream(reader))
109+
close(BioGenerics.IO.stream(reader))
110+
end
111+
112+
function Base.read!(reader::Reader, record::Record)
113+
return readrecord!(reader.state.stream, reader, record)
114+
end
115+
116+
function index!(record::Record)
117+
stream = TranscodingStreams.NoopStream(IOBuffer(record.data))
118+
return index!(stream, record)
119+
end
120+
121+
function Base.iterate(reader::Reader, nextone::Record = Record())
122+
if BioGenerics.IO.tryread!(reader, nextone) === nothing
123+
return nothing
124+
end
125+
return copy(nextone), empty!(nextone)
101126
end
102127

103128
function IntervalCollection(reader::Reader)
@@ -148,6 +173,14 @@ function getfasta(reader::Reader) #TODO: move responsibility to FASTX.jl.
148173
return FASTA.Reader(reader.state.stream)
149174
end
150175

176+
function appendfrom!(dst, dpos, src, spos, n)
177+
if length(dst) < dpos + n - 1
178+
resize!(dst, dpos + n - 1)
179+
end
180+
unsafe_copyto!(dst, dpos, src, spos, n)
181+
return dst
182+
end
183+
151184
const record_machine, body_machine = (function ()
152185
cat = Automa.RegExp.cat
153186
rep = Automa.RegExp.rep
@@ -157,27 +190,27 @@ const record_machine, body_machine = (function ()
157190

158191
feature = let
159192
seqid = re"[a-zA-Z0-9.:^*$@!+_?\-|%]*"
160-
seqid.actions[:enter] = [:mark]
193+
seqid.actions[:enter] = [:pos]
161194
seqid.actions[:exit] = [:feature_seqid]
162195

163196
source = re"[ -~]*"
164-
source.actions[:enter] = [:mark]
197+
source.actions[:enter] = [:pos]
165198
source.actions[:exit] = [:feature_source]
166199

167200
type_ = re"[ -~]*"
168-
type_.actions[:enter] = [:mark]
201+
type_.actions[:enter] = [:pos]
169202
type_.actions[:exit] = [:feature_type_]
170203

171204
start = re"[0-9]+|\."
172-
start.actions[:enter] = [:mark]
205+
start.actions[:enter] = [:pos]
173206
start.actions[:exit] = [:feature_start]
174207

175208
end_ = re"[0-9]+|\."
176-
end_.actions[:enter] = [:mark]
209+
end_.actions[:enter] = [:pos]
177210
end_.actions[:exit] = [:feature_end_]
178211

179212
score = re"[ -~]*[0-9][ -~]*|\."
180-
score.actions[:enter] = [:mark]
213+
score.actions[:enter] = [:pos]
181214
score.actions[:exit] = [:feature_score]
182215

183216
strand = re"[+\-?]|\."
@@ -189,7 +222,7 @@ const record_machine, body_machine = (function ()
189222
attributes = let
190223
char = re"[^=;,\t\r\n]"
191224
key = rep1(char)
192-
key.actions[:enter] = [:mark]
225+
key.actions[:enter] = [:pos]
193226
key.actions[:exit] = [:feature_attribute_key]
194227
val = rep(char)
195228
attr = cat(key, '=', val, rep(cat(',', val)))
@@ -216,7 +249,7 @@ const record_machine, body_machine = (function ()
216249
comment.actions[:exit] = [:comment]
217250

218251
record = alt(feature, directive, comment)
219-
record.actions[:enter] = [:anchor]
252+
record.actions[:enter] = [:mark]
220253
record.actions[:exit] = [:record]
221254

222255
blank = re"[ \t]*"
@@ -238,46 +271,57 @@ const record_machine, body_machine = (function ()
238271
end)()
239272

240273
const record_actions = Dict(
241-
:feature_seqid => :(record.seqid = (mark:p-1) .- offset),
242-
:feature_source => :(record.source = (mark:p-1) .- offset),
243-
:feature_type_ => :(record.type_ = (mark:p-1) .- offset),
244-
:feature_start => :(record.start = (mark:p-1) .- offset),
245-
:feature_end_ => :(record.end_ = (mark:p-1) .- offset),
246-
:feature_score => :(record.score = (mark:p-1) .- offset),
247-
:feature_strand => :(record.strand = p - offset),
248-
:feature_phase => :(record.phase = p - offset),
249-
:feature_attribute_key => :(push!(record.attribute_keys, (mark:p-1) .- offset)),
274+
:mark => :(@mark),
275+
:pos => :(pos = @relpos(p)),
276+
:feature_seqid => :(record.seqid = pos:@relpos(p-1)),
277+
:feature_source => :(record.source = pos:@relpos(p-1)),
278+
:feature_type_ => :(record.type_ = pos:@relpos(p-1)),
279+
:feature_start => :(record.start = pos:@relpos(p-1)),
280+
:feature_end_ => :(record.end_ = pos:@relpos(p-1)),
281+
:feature_score => :(record.score = pos:@relpos(p-1)),
282+
:feature_strand => :(record.strand = @relpos(p)),
283+
:feature_phase => :(record.phase = @relpos(p)),
284+
:feature_attribute_key => :(push!(record.attribute_keys, pos:@relpos(p-1))),
250285
:feature => :(record.kind = :feature),
251286
:directive => :(record.kind = :directive),
252287
:comment => :(record.kind = :comment),
253288
:record => quote
254-
BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1)
255-
record.filled = (offset+1:p-1) .- offset
256-
end,
257-
:anchor => :(),
258-
:mark => :(mark = p)
289+
appendfrom!(record.data, 1, data, @markpos, p-@markpos)
290+
record.filled = 1:(p-@markpos)
291+
end
259292
)
260293

261-
BioCore.ReaderHelper.generate_index_function(
262-
Record,
294+
context = Automa.CodeGenContext(
295+
generator = :goto,
296+
checkbounds = false,
297+
loopunroll = 0
298+
)
299+
300+
Automa.Stream.generate_reader(
301+
:index!,
263302
record_machine,
264-
quote
265-
mark = offset = 0
266-
end,
267-
record_actions
303+
arguments = (:(record::Record),),
304+
actions = record_actions,
305+
context = context,
306+
returncode = quote
307+
if cs == 0
308+
return record
309+
end
310+
throw(ArgumentError(string("failed to index ", eltype(record), " ~>", repr(String(data[p:min(p+7,p_end)])))))
311+
end
268312
) |> eval
269313

270-
BioCore.ReaderHelper.generate_read_function(
271-
Reader,
314+
315+
Automa.Stream.generate_reader(
316+
:readrecord!,
272317
body_machine,
273-
quote
274-
mark = offset = 0
275-
end,
276-
merge(record_actions,
318+
arguments = (:(reader::Reader), :(record::Record)),
319+
actions = merge(record_actions,
277320
Dict(
321+
:countline => :(linenum += 1),
278322
:record => quote
279-
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
280-
record.filled = (offset+1:p-1) .- offset
323+
appendfrom!(record.data, 1, data, @markpos, p-@markpos)
324+
record.filled = 1:(p-@markpos)
281325
if isfeature(record)
282326
reader.directive_count = reader.preceding_directive_count
283327
reader.preceding_directive_count = 0
@@ -288,9 +332,10 @@ BioCore.ReaderHelper.generate_read_function(
288332
end
289333
if is_fasta_directive(record)
290334
reader.found_fasta = true
291-
reader.state.finished = true
335+
reader.state.filled = true
292336
end
293337
end
338+
294339
if record.kind ∈ reader.targets
295340
found_record = true
296341
@escape
@@ -299,14 +344,46 @@ BioCore.ReaderHelper.generate_read_function(
299344
:body => quote
300345
if data[p] == UInt8('>')
301346
reader.found_fasta = true
302-
reader.state.finished = true
347+
reader.state.filled = true
303348
# HACK: any better way?
304349
cs = 0
305350
@goto exit
306351
end
307352
end,
308-
:countline => :(linenum += 1),
309-
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1)
310353
)
311-
)
354+
),
355+
context = context,
356+
initcode = quote
357+
cs = reader.state.state
358+
linenum = reader.state.linenum
359+
found_record = false
360+
end,
361+
loopcode = quote
362+
if found_record
363+
@goto __return__
364+
end
365+
end,
366+
returncode = quote
367+
368+
reader.state.state = cs
369+
# reader.state.filled |= cs == 0 # Note: if set to true, remains true.
370+
reader.state.linenum = linenum
371+
372+
if found_record
373+
return record
374+
end
375+
376+
if cs == 0 || eof(stream)
377+
throw(EOFError())
378+
end
379+
380+
if cs < 0
381+
error(eltype(Reader), " file format error on line ", linenum, " ~>", repr(String(data[p:min(p+7,p_end)])))
382+
end
383+
384+
if p > p_eof ≥ 0
385+
error("incomplete $(typeof(reader)) input on line ", linenum)
386+
end
387+
388+
end
312389
) |> eval

0 commit comments

Comments
 (0)