Delay the GCN triple hack to right before optimization.

maleadt · maleadt · commit b8cf0a445e44 · 2021-09-23T17:49:55.000+02:00
diff --git a/src/driver.jl b/src/driver.jl
@@ -234,23 +234,6 @@ const __llvm_initialized = Ref(false)
                     unsafe_delete!(LLVM.parent(call), call)
                 end
             end
-
-            # clean-up
-            ModulePassManager() do pm
-                # inline and optimize the call to the deferred code. in particular we want to
-                # remove unnecessary alloca's that are created by pass-by-ref semantics.
-                instruction_combining!(pm)
-                always_inliner!(pm)
-                scalar_repl_aggregates_ssa!(pm)
-                promote_memory_to_register!(pm)
-                gvn!(pm)
-
-                # merge duplicate functions, since each compilation invocation emits everything
-                # XXX: ideally we want to avoid emitting these in the first place
-                merge_functions!(pm)
-
-                run!(pm, ir)
-            end
         end
 
         # all deferred compilations should have been resolved
@@ -300,6 +283,20 @@ const __llvm_initialized = Ref(false)
                 # merge constants (such as exception messages)
                 constant_merge!(pm)
 
+                if do_deferred_codegen
+                    # inline and optimize the call to the deferred code. in particular we want to
+                    # remove unnecessary alloca's that are created by pass-by-ref semantics.
+                    instruction_combining!(pm)
+                    always_inliner!(pm)
+                    scalar_repl_aggregates_ssa!(pm)
+                    promote_memory_to_register!(pm)
+                    gvn!(pm)
+
+                    # merge duplicate functions, since each compilation invocation emits everything
+                    # XXX: ideally we want to avoid emitting these in the first place
+                    merge_functions!(pm)
+                end
+
                 run!(pm, ir)
             end
         end
diff --git a/src/gcn.jl b/src/gcn.jl
@@ -35,14 +35,6 @@ runtime_slug(job::CompilerJob{GCNCompilerTarget}) = "gcn-$(job.target.dev_isa)$(
 const gcn_intrinsics = () # TODO: ("vprintf", "__assertfail", "malloc", "free")
 isintrinsic(::CompilerJob{GCNCompilerTarget}, fn::String) = in(fn, gcn_intrinsics)
 
-# we have to fake our target early in the pipeline because Julia's optimization passes
-# weren't designed for a non-0 stack addrspace, and the AMDGPU target is very strict
-# about which addrspaces are permitted for various code patterns
-function process_module!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module)
-    triple!(mod, llvm_triple(NativeCompilerTarget()))
-    datalayout!(mod, julia_datalayout(NativeCompilerTarget()))
-end
-
 function process_entry!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module, entry::LLVM.Function)
     entry = invoke(process_entry!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
 
@@ -58,6 +50,24 @@ function add_lowering_passes!(job::CompilerJob{GCNCompilerTarget}, pm::LLVM.Pass
     add!(pm, ModulePass("LowerThrowExtra", lower_throw_extra!))
 end
 
+function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}),
+                        mod::LLVM.Module, entry::LLVM.Function)
+    # we have to fake our target before optimization because Julia's optimization passes
+    # weren't designed for a non-0 stack addrspace, and the AMDGPU target is very strict
+    # about which addrspaces are permitted for various code patterns
+    triple!(mod, llvm_triple(NativeCompilerTarget()))
+    datalayout!(mod, julia_datalayout(NativeCompilerTarget()))
+
+    entry = invoke(finish_module!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
+
+    if job.source.kernel
+        # work around bad byval codegen (JuliaGPU/GPUCompiler.jl#92)
+        entry = lower_byval(job, mod, entry)
+    end
+
+    return entry
+end
+
 # We need to do alloca rewriting (from 0 to 5) after Julia's optimization
 # passes because of two reasons:
 # 1. Debug builds call the target verifier first, which would trip if AMDGPU
@@ -80,18 +90,6 @@ function optimize_module!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module)
     end
 end
 
-function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}),
-                        mod::LLVM.Module, entry::LLVM.Function)
-    entry = invoke(finish_module!, Tuple{CompilerJob, LLVM.Module, LLVM.Function}, job, mod, entry)
-
-    if job.source.kernel
-        # work around bad byval codegen (JuliaGPU/GPUCompiler.jl#92)
-        entry = lower_byval(job, mod, entry)
-    end
-
-    return entry
-end
-
 
 ## LLVM passes