Skip to content

Commit a0f8429

Browse files
authored
Merge pull request #248 from JuliaGPU/tb/late_opt
Perform late optimization, after deferred codegen and byval lowering.
2 parents 7dfb27f + d57f5f5 commit a0f8429

File tree

4 files changed

+93
-96
lines changed

4 files changed

+93
-96
lines changed

Diff for: src/driver.jl

+66-69
Original file line numberDiff line numberDiff line change
@@ -166,68 +166,23 @@ const __llvm_initialized = Ref(false)
166166
runtime_fns = LLVM.name.(defs(runtime))
167167
end
168168

169-
@timeit_debug to "LLVM middle-end" begin
170-
# target-specific libraries
169+
@timeit_debug to "Library linking" begin
171170
if libraries
171+
# target-specific libraries
172172
undefined_fns = LLVM.name.(decls(ir))
173173
@timeit_debug to "target libraries" link_libraries!(job, ir, undefined_fns)
174-
end
175-
176-
if optimize
177-
@timeit_debug to "optimization" optimize!(job, ir)
178-
179-
# optimization may have replaced functions, so look the entry point up again
180-
entry = functions(ir)[entry_fn]
181-
end
182174

183-
if libraries
184-
undefined_fns = LLVM.name.(decls(ir))
175+
# GPU run-time library
185176
if any(fn -> fn in runtime_fns, undefined_fns)
186177
@timeit_debug to "runtime library" link_library!(ir, runtime)
187178
end
188179
end
189-
190-
if ccall(:jl_is_debugbuild, Cint, ()) == 1
191-
@timeit_debug to "verification" verify(ir)
192-
end
193-
194-
if only_entry
195-
# replace non-entry function definitions with a declaration
196-
for f in functions(ir)
197-
f == entry && continue
198-
isdeclaration(f) && continue
199-
LLVM.isintrinsic(f) && continue
200-
empty!(f)
201-
end
202-
end
203-
204-
# remove everything except for the entry and any exported global variables
205-
@timeit_debug to "clean-up" begin
206-
exports = String[entry_fn]
207-
for gvar in globals(ir)
208-
push!(exports, LLVM.name(gvar))
209-
end
210-
211-
ModulePassManager() do pm
212-
internalize!(pm, exports)
213-
214-
# eliminate all unused internal functions
215-
global_optimizer!(pm)
216-
global_dce!(pm)
217-
strip_dead_prototypes!(pm)
218-
219-
# merge constants (such as exception messages) from the runtime
220-
constant_merge!(pm)
221-
222-
run!(pm, ir)
223-
end
224-
end
225180
end
226181

227-
entry = finish_module!(job, ir, entry)
228-
229182
# deferred code generation
230-
if !only_entry && deferred_codegen && haskey(functions(ir), "deferred_codegen")
183+
do_deferred_codegen = !only_entry && deferred_codegen &&
184+
haskey(functions(ir), "deferred_codegen")
185+
if do_deferred_codegen
231186
dyn_marker = functions(ir)["deferred_codegen"]
232187

233188
cache = Dict{CompilerJob, String}(job => entry_fn)
@@ -257,7 +212,7 @@ const __llvm_initialized = Ref(false)
257212
for dyn_job in keys(worklist)
258213
# cached compilation
259214
dyn_entry_fn = get!(cache, dyn_job) do
260-
dyn_ir, dyn_meta = codegen(:llvm, dyn_job; optimize,
215+
dyn_ir, dyn_meta = codegen(:llvm, dyn_job; optimize=false,
261216
deferred_codegen=false, parent_job=job)
262217
dyn_entry_fn = LLVM.name(dyn_meta.entry)
263218
merge!(compiled, dyn_meta.compiled)
@@ -281,28 +236,70 @@ const __llvm_initialized = Ref(false)
281236
end
282237
end
283238

284-
ModulePassManager() do pm
285-
# inline and optimize the call to the deferred code. in particular we want to
286-
# remove unnecessary alloca's that are created by pass-by-ref semantics.
287-
instruction_combining!(pm)
288-
always_inliner!(pm)
289-
scalar_repl_aggregates_ssa!(pm)
290-
promote_memory_to_register!(pm)
291-
gvn!(pm)
239+
# all deferred compilations should have been resolved
240+
@compiler_assert isempty(uses(dyn_marker)) job
241+
unsafe_delete!(ir, dyn_marker)
242+
end
292243

293-
# merge constants (such as exception messages) from each entry
294-
constant_merge!(pm)
244+
@timeit_debug to "IR post-processing" begin
245+
entry = finish_module!(job, ir, entry)
295246

296-
# merge duplicate functions, since each compilation invocation emits everything
297-
# XXX: ideally we want to avoid emitting these in the first place
298-
merge_functions!(pm)
247+
if optimize
248+
@timeit_debug to "optimization" optimize!(job, ir)
299249

300-
run!(pm, ir)
250+
# optimization may have replaced functions, so look the entry point up again
251+
entry = functions(ir)[entry_fn]
301252
end
302253

303-
# all deferred compilations should have been resolved
304-
@compiler_assert isempty(uses(dyn_marker)) job
305-
unsafe_delete!(ir, dyn_marker)
254+
if ccall(:jl_is_debugbuild, Cint, ()) == 1
255+
@timeit_debug to "verification" verify(ir)
256+
end
257+
258+
@timeit_debug to "clean-up" begin
259+
# replace non-entry function definitions with a declaration
260+
if only_entry
261+
for f in functions(ir)
262+
f == entry && continue
263+
isdeclaration(f) && continue
264+
LLVM.isintrinsic(f) && continue
265+
empty!(f)
266+
end
267+
end
268+
269+
# remove everything except for the entry and any exported global variables
270+
exports = String[entry_fn]
271+
for gvar in globals(ir)
272+
push!(exports, LLVM.name(gvar))
273+
end
274+
275+
ModulePassManager() do pm
276+
internalize!(pm, exports)
277+
278+
# eliminate all unused internal functions
279+
global_optimizer!(pm)
280+
global_dce!(pm)
281+
strip_dead_prototypes!(pm)
282+
283+
# merge constants (such as exception messages)
284+
constant_merge!(pm)
285+
286+
if do_deferred_codegen
287+
# inline and optimize the call to the deferred code. in particular we want to
288+
# remove unnecessary alloca's that are created by pass-by-ref semantics.
289+
instruction_combining!(pm)
290+
always_inliner!(pm)
291+
scalar_repl_aggregates_ssa!(pm)
292+
promote_memory_to_register!(pm)
293+
gvn!(pm)
294+
295+
# merge duplicate functions, since each compilation invocation emits everything
296+
# XXX: ideally we want to avoid emitting these in the first place
297+
merge_functions!(pm)
298+
end
299+
300+
run!(pm, ir)
301+
end
302+
end
306303
end
307304

308305
return ir, (; entry, compiled)

Diff for: src/gcn.jl

+18-20
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,6 @@ runtime_slug(job::CompilerJob{GCNCompilerTarget}) = "gcn-$(job.target.dev_isa)$(
3535
const gcn_intrinsics = () # TODO: ("vprintf", "__assertfail", "malloc", "free")
3636
isintrinsic(::CompilerJob{GCNCompilerTarget}, fn::String) = in(fn, gcn_intrinsics)
3737

38-
# we have to fake our target early in the pipeline because Julia's optimization passes
39-
# weren't designed for a non-0 stack addrspace, and the AMDGPU target is very strict
40-
# about which addrspaces are permitted for various code patterns
41-
function process_module!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module)
42-
triple!(mod, llvm_triple(NativeCompilerTarget()))
43-
datalayout!(mod, julia_datalayout(NativeCompilerTarget()))
44-
end
45-
4638
function process_entry!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module, entry::LLVM.Function)
4739
entry = invoke(process_entry!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
4840

@@ -58,6 +50,24 @@ function add_lowering_passes!(job::CompilerJob{GCNCompilerTarget}, pm::LLVM.Pass
5850
add!(pm, ModulePass("LowerThrowExtra", lower_throw_extra!))
5951
end
6052

53+
function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}),
54+
mod::LLVM.Module, entry::LLVM.Function)
55+
# we have to fake our target early in the pipeline because Julia's optimization passes
56+
# weren't designed for a non-0 stack addrspace, and the AMDGPU target is very strict
57+
# about which addrspaces are permitted for various code patterns
58+
triple!(mod, llvm_triple(NativeCompilerTarget()))
59+
datalayout!(mod, julia_datalayout(NativeCompilerTarget()))
60+
61+
entry = invoke(finish_module!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
62+
63+
if job.source.kernel
64+
# work around bad byval codegen (JuliaGPU/GPUCompiler.jl#92)
65+
entry = lower_byval(job, mod, entry)
66+
end
67+
68+
return entry
69+
end
70+
6171
# We need to do alloca rewriting (from 0 to 5) after Julia's optimization
6272
# passes because of two reasons:
6373
# 1. Debug builds call the target verifier first, which would trip if AMDGPU
@@ -80,18 +90,6 @@ function optimize_module!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module)
8090
end
8191
end
8292

83-
function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}),
84-
mod::LLVM.Module, entry::LLVM.Function)
85-
entry = invoke(finish_module!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
86-
87-
if job.source.kernel
88-
# work around bad byval codegen (JuliaGPU/GPUCompiler.jl#92)
89-
entry = lower_byval(job, mod, entry)
90-
end
91-
92-
return entry
93-
end
94-
9593

9694
## LLVM passes
9795

Diff for: test/native.jl

+6-6
Original file line numberDiff line numberDiff line change
@@ -284,18 +284,18 @@ end
284284
@testset "LazyCodegen" begin
285285
import .LazyCodegen: call_delayed
286286

287-
global flag = Ref(false) # otherwise f is a closure and we can't
288-
# pass it to `Val`...
289-
f() = (flag[]=true; nothing)
287+
f(A) = (A[] += 42; nothing)
290288

289+
global flag = [0]
291290
function caller()
292-
call_delayed(f)
291+
call_delayed(f, flag::Vector{Int})
293292
end
294293
@test caller() === nothing
295-
@test flag[]
294+
@test flag[] == 42
296295

297296
ir = sprint(io->native_code_llvm(io, caller, Tuple{}, dump_module=true))
298-
@test occursin(r"define void @julia_f_\d+", ir)
297+
@test occursin(r"add i64 %\d+, 42", ir)
298+
# NOTE: can't just look for `julia_f` here, since it may be inlined and optimized away.
299299

300300
add(x, y) = x+y
301301
function call_add(x, y)

Diff for: test/util.jl

+3-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ end
3535
module TestRuntime
3636
# dummy methods
3737
signal_exception() = return
38-
malloc(sz) = C_NULL
38+
# HACK: if malloc returns 0 or traps, all calling functions (like jl_box_*)
39+
# get reduced to a trap, which really messes with our test suite.
40+
malloc(sz) = Ptr{Cvoid}(Int(0xDEADBEEF))
3941
report_oom(sz) = return
4042
report_exception(ex) = return
4143
report_exception_name(ex) = return

0 commit comments

Comments
 (0)