@@ -85,13 +85,31 @@ runtime_slug(@nospecialize(job::CompilerJob{PTXCompilerTarget})) =
85
85
" -exitable=$(job. target. exitable) "
86
86
87
87
# Post-process the LLVM module `mod` of a PTX compilation job.
#
# Two things happen here:
#   1. a (currently inert) pass over all functions where the PTX device calling
#      convention would be applied; the actual call is commented out, with a
#      reference to JuliaGPU/GPUCompiler.jl#97.
#   2. any global variable named `sm_major`, `sm_minor`, `ptx_major` or `ptx_minor`
#      that already exists in the module gets the matching component of
#      `job.target.cap` / `job.target.ptx` as its initializer.
#
# Evaluates to `nothing`.
function process_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
                         mod::LLVM.Module)
    ctx = context(mod)

    # calling convention
    if LLVM.version() >= v"8"
        for f in functions(mod)
            # JuliaGPU/GPUCompiler.jl#97
            #callconv!(f, LLVM.API.LLVMPTXDeviceCallConv)
        end
    end

    # Expose the device capability and PTX ISA version as module-level values so that
    # device code can 'query' them, relying on LLVM to fold the checks away and emit
    # static code. Only globals that are actually referenced (i.e. already present in
    # the module) are touched: creating them unconditionally would result in duplicate
    # declarations.
    target = job.target
    properties = ("sm_major"  => target.cap.major,
                  "sm_minor"  => target.cap.minor,
                  "ptx_major" => target.ptx.major,
                  "ptx_minor" => target.ptx.minor)

    module_globals = globals(mod)
    for (property, number) in properties
        haskey(module_globals, property) || continue

        slot = module_globals[property]
        initializer!(slot, ConstantInt(LLVM.Int32Type(ctx), number))
        # private linkage allows the value to be inlined
        linkage!(slot, LLVM.API.LLVMPrivateLinkage)
    end

    return nothing
end
96
114
97
115
function process_entry! (@nospecialize (job:: CompilerJob{PTXCompilerTarget} ),
@@ -142,12 +160,6 @@ function process_entry!(@nospecialize(job::CompilerJob{PTXCompilerTarget}),
142
160
# calling convention
143
161
callconv! (entry, LLVM. API. LLVMPTXKernelCallConv)
144
162
end
145
- else
146
- # we can't look up device functions using the CUDA APIs, so alias them to a global
147
- gv = GlobalVariable (mod, llvmtype (entry), LLVM. name (entry) * " _slot" )
148
- initializer! (gv, entry)
149
- linkage! (gv, LLVM. API. LLVMLinkOnceODRLinkage)
150
- set_used! (mod, gv)
151
163
end
152
164
153
165
return entry
@@ -161,6 +173,10 @@ function add_lowering_passes!(@nospecialize(job::CompilerJob{PTXCompilerTarget})
161
173
162
174
# even if we support `unreachable`, we still prefer `exit` to `trap`
163
175
add! (pm, ModulePass (" HideTrap" , hide_trap!))
176
+
177
+ # we emit properties (of the device and ptx isa) as private global constants,
178
+ # so run the optimizer so that they are inlined before the rest of the optimizer runs.
179
+ global_optimizer! (pm)
164
180
end
165
181
166
182
function optimize_module! (@nospecialize (job:: CompilerJob{PTXCompilerTarget} ),
0 commit comments