Merge pull request #248 from JuliaGPU/tb/late_opt

maleadt · web-flow · commit a0f8429ebcb9 · 2021-09-24T07:01:39.000+02:00
Perform late optimization, after deferred codegen and byval lowering.
diff --git a/src/driver.jl b/src/driver.jl
@@ -166,68 +166,23 @@ const __llvm_initialized = Ref(false)
         runtime_fns = LLVM.name.(defs(runtime))
     end
 
-    @timeit_debug to "LLVM middle-end" begin
-        # target-specific libraries
+    @timeit_debug to "Library linking" begin
         if libraries
+            # target-specific libraries
             undefined_fns = LLVM.name.(decls(ir))
             @timeit_debug to "target libraries" link_libraries!(job, ir, undefined_fns)
-        end
-
-        if optimize
-            @timeit_debug to "optimization" optimize!(job, ir)
-
-            # optimization may have replaced functions, so look the entry point up again
-            entry = functions(ir)[entry_fn]
-        end
 
-        if libraries
-            undefined_fns = LLVM.name.(decls(ir))
+            # GPU run-time library
             if any(fn -> fn in runtime_fns, undefined_fns)
                 @timeit_debug to "runtime library" link_library!(ir, runtime)
             end
         end
-
-        if ccall(:jl_is_debugbuild, Cint, ()) == 1
-            @timeit_debug to "verification" verify(ir)
-        end
-
-        if only_entry
-            # replace non-entry function definitions with a declaration
-            for f in functions(ir)
-                f == entry && continue
-                isdeclaration(f) && continue
-                LLVM.isintrinsic(f) && continue
-                empty!(f)
-            end
-        end
-
-        # remove everything except for the entry and any exported global variables
-        @timeit_debug to "clean-up" begin
-            exports = String[entry_fn]
-            for gvar in globals(ir)
-                push!(exports, LLVM.name(gvar))
-            end
-
-            ModulePassManager() do pm
-                internalize!(pm, exports)
-
-                # eliminate all unused internal functions
-                global_optimizer!(pm)
-                global_dce!(pm)
-                strip_dead_prototypes!(pm)
-
-                # merge constants (such as exception messages) from the runtime
-                constant_merge!(pm)
-
-                run!(pm, ir)
-            end
-        end
     end
 
-    entry = finish_module!(job, ir, entry)
-
     # deferred code generation
-    if !only_entry && deferred_codegen && haskey(functions(ir), "deferred_codegen")
+    do_deferred_codegen = !only_entry && deferred_codegen &&
+                          haskey(functions(ir), "deferred_codegen")
+    if do_deferred_codegen
         dyn_marker = functions(ir)["deferred_codegen"]
 
         cache = Dict{CompilerJob, String}(job => entry_fn)
@@ -257,7 +212,7 @@ const __llvm_initialized = Ref(false)
             for dyn_job in keys(worklist)
                 # cached compilation
                 dyn_entry_fn = get!(cache, dyn_job) do
-                    dyn_ir, dyn_meta = codegen(:llvm, dyn_job; optimize,
+                    dyn_ir, dyn_meta = codegen(:llvm, dyn_job; optimize=false,
                                                deferred_codegen=false, parent_job=job)
                     dyn_entry_fn = LLVM.name(dyn_meta.entry)
                     merge!(compiled, dyn_meta.compiled)
@@ -281,28 +236,70 @@ const __llvm_initialized = Ref(false)
             end
         end
 
-        ModulePassManager() do pm
-            # inline and optimize the call to the deferred code. in particular we want to
-            # remove unnecessary alloca's that are created by pass-by-ref semantics.
-            instruction_combining!(pm)
-            always_inliner!(pm)
-            scalar_repl_aggregates_ssa!(pm)
-            promote_memory_to_register!(pm)
-            gvn!(pm)
+        # all deferred compilations should have been resolved
+        @compiler_assert isempty(uses(dyn_marker)) job
+        unsafe_delete!(ir, dyn_marker)
+    end
 
-            # merge constants (such as exception messages) from each entry
-            constant_merge!(pm)
+    @timeit_debug to "IR post-processing" begin
+        entry = finish_module!(job, ir, entry)
 
-            # merge duplicate functions, since each compilation invocation emits everything
-            # XXX: ideally we want to avoid emitting these in the first place
-            merge_functions!(pm)
+        if optimize
+            @timeit_debug to "optimization" optimize!(job, ir)
 
-            run!(pm, ir)
+            # optimization may have replaced functions, so look the entry point up again
+            entry = functions(ir)[entry_fn]
         end
 
-        # all deferred compilations should have been resolved
-        @compiler_assert isempty(uses(dyn_marker)) job
-        unsafe_delete!(ir, dyn_marker)
+        if ccall(:jl_is_debugbuild, Cint, ()) == 1
+            @timeit_debug to "verification" verify(ir)
+        end
+
+        @timeit_debug to "clean-up" begin
+            # replace non-entry function definitions with a declaration
+            if only_entry
+                for f in functions(ir)
+                    f == entry && continue
+                    isdeclaration(f) && continue
+                    LLVM.isintrinsic(f) && continue
+                    empty!(f)
+                end
+            end
+
+            # remove everything except for the entry and any exported global variables
+            exports = String[entry_fn]
+            for gvar in globals(ir)
+                push!(exports, LLVM.name(gvar))
+            end
+
+            ModulePassManager() do pm
+                internalize!(pm, exports)
+
+                # eliminate all unused internal functions
+                global_optimizer!(pm)
+                global_dce!(pm)
+                strip_dead_prototypes!(pm)
+
+                # merge constants (such as exception messages)
+                constant_merge!(pm)
+
+                if do_deferred_codegen
+                    # inline and optimize the call to the deferred code. in particular we want to
+                    # remove unnecessary alloca's that are created by pass-by-ref semantics.
+                    instruction_combining!(pm)
+                    always_inliner!(pm)
+                    scalar_repl_aggregates_ssa!(pm)
+                    promote_memory_to_register!(pm)
+                    gvn!(pm)
+
+                    # merge duplicate functions, since each compilation invocation emits everything
+                    # XXX: ideally we want to avoid emitting these in the first place
+                    merge_functions!(pm)
+                end
+
+                run!(pm, ir)
+            end
+        end
     end
 
     return ir, (; entry, compiled)
diff --git a/src/gcn.jl b/src/gcn.jl
@@ -35,14 +35,6 @@ runtime_slug(job::CompilerJob{GCNCompilerTarget}) = "gcn-$(job.target.dev_isa)$(
 const gcn_intrinsics = () # TODO: ("vprintf", "__assertfail", "malloc", "free")
 isintrinsic(::CompilerJob{GCNCompilerTarget}, fn::String) = in(fn, gcn_intrinsics)
 
-# we have to fake our target early in the pipeline because Julia's optimization passes
-# weren't designed for a non-0 stack addrspace, and the AMDGPU target is very strict
-# about which addrspaces are permitted for various code patterns
-function process_module!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module)
-    triple!(mod, llvm_triple(NativeCompilerTarget()))
-    datalayout!(mod, julia_datalayout(NativeCompilerTarget()))
-end
-
 function process_entry!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module, entry::LLVM.Function)
     entry = invoke(process_entry!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
 
@@ -58,6 +50,24 @@ function add_lowering_passes!(job::CompilerJob{GCNCompilerTarget}, pm::LLVM.Pass
     add!(pm, ModulePass("LowerThrowExtra", lower_throw_extra!))
 end
 
+function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}),
+                        mod::LLVM.Module, entry::LLVM.Function)
+    # we have to fake our target early in the pipeline because Julia's optimization passes
+    # weren't designed for a non-0 stack addrspace, and the AMDGPU target is very strict
+    # about which addrspaces are permitted for various code patterns
+    triple!(mod, llvm_triple(NativeCompilerTarget()))
+    datalayout!(mod, julia_datalayout(NativeCompilerTarget()))
+
+    entry = invoke(finish_module!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
+
+    if job.source.kernel
+        # work around bad byval codegen (JuliaGPU/GPUCompiler.jl#92)
+        entry = lower_byval(job, mod, entry)
+    end
+
+    return entry
+end
+
 # We need to do alloca rewriting (from 0 to 5) after Julia's optimization
 # passes because of two reasons:
 # 1. Debug builds call the target verifier first, which would trip if AMDGPU
@@ -80,18 +90,6 @@ function optimize_module!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module)
     end
 end
 
-function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}),
-                        mod::LLVM.Module, entry::LLVM.Function)
-    entry = invoke(finish_module!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
-
-    if job.source.kernel
-        # work around bad byval codegen (JuliaGPU/GPUCompiler.jl#92)
-        entry = lower_byval(job, mod, entry)
-    end
-
-    return entry
-end
-
 
 ## LLVM passes
 
diff --git a/test/native.jl b/test/native.jl
@@ -284,18 +284,18 @@ end
 @testset "LazyCodegen" begin
     import .LazyCodegen: call_delayed
 
-    global flag = Ref(false) # otherwise f is a closure and we can't
-                             # pass it to `Val`...
-    f() = (flag[]=true; nothing)
+    f(A) = (A[] += 42; nothing)
 
+    global flag = [0]
     function caller()
-        call_delayed(f)
+        call_delayed(f, flag::Vector{Int})
     end
     @test caller() === nothing
-    @test flag[]
+    @test flag[] == 42
 
     ir = sprint(io->native_code_llvm(io, caller, Tuple{}, dump_module=true))
-    @test occursin(r"define void @julia_f_\d+", ir)
+    @test occursin(r"add i64 %\d+, 42", ir)
+    # NOTE: can't just look for `julia_f_` here, since it may be inlined and optimized away.
 
     add(x, y) = x+y
     function call_add(x, y)
diff --git a/test/util.jl b/test/util.jl
@@ -35,7 +35,9 @@ end
 module TestRuntime
     # dummy methods
     signal_exception() = return
-    malloc(sz) = C_NULL
+    # HACK: if malloc returns 0 or traps, all calling functions (like jl_box_*)
+    #       get reduced to a trap, which really messes with our test suite.
+    malloc(sz) = Ptr{Cvoid}(Int(0xDEADBEEF))
     report_oom(sz) = return
     report_exception(ex) = return
     report_exception_name(ex) = return