Commit 58206fa

b1tg and chenyuxyz authored
add amd llvm compiler (tinygrad#9519)
Co-authored-by: b1tg <b1tg@users.noreply.github.com>
Co-authored-by: chenyu <chenyu@fastmail.com>
1 parent d8d65e2 commit 58206fa

3 files changed: +58 -0


.github/workflows/test.yml (+11)

@@ -549,6 +549,7 @@ jobs:
        deps: testing_minimal${{matrix.backend=='ptx'&&',cuda'||matrix.backend=='triton'&&',triton'||''}}
        opencl: ${{ matrix.backend == 'gpu' && 'true' }}
        amd: ${{ matrix.backend == 'amd' && 'true' }}
+       llvm: ${{ (matrix.backend == 'amd' || matrix.backend == 'llvm') && 'true' }}
        cuda: ${{ (matrix.backend == 'ptx' || matrix.backend == 'triton' || matrix.backend == 'nv') && 'true' }}
    - name: Set env
      run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nPTX=1\nMOCKGPU=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nNV=1\nMOCKGPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
@@ -565,6 +566,9 @@ jobs:
    - name: Run pytest (amd)
      if: matrix.backend=='amd'
      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_hcq.py test/external/external_test_am.py --durations=20
+   - name: Run pytest (amd with llvm backend)
+     if: matrix.backend=='amd'
+     run: python -m pytest -n=auto test/test_amd_llvm.py --durations=20
    - name: Run TRANSCENDENTAL math
      run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
    - name: Run process replay tests
@@ -588,6 +592,7 @@ jobs:
        python-version: '3.11'
        amd: 'true'
        cuda: 'true'
+       llvm: 'true'
    - name: Run real world test
      run: JIT=2 METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20
    - name: Test models (Metal)
@@ -615,6 +620,12 @@ jobs:
        FORWARD_ONLY: 1
      run: |
        python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
+   - name: Run pytest (amd with llvm backend)
+     env:
+       MOCKGPU: 1
+       AMD: 1
+     run: |
+       python -m pytest -n=auto test/test_amd_llvm.py --durations=20
    - name: Run pytest (ptx)
      env:
        MOCKGPU: 1
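
The two new steps run the same test file in two setups: on the real-AMD runner, guarded by if: matrix.backend=='amd', and on the emulated runner with MOCKGPU and AMD set in the step environment. As a rough local equivalent of the emulated step (illustrative only, not part of this commit; assumes the tinygrad repo root as the working directory, pytest plus pytest-xdist installed, and an LLVM build new enough for the AMDGPU backend):

# Rough local equivalent of the "Run pytest (amd with llvm backend)" CI step.
import os, subprocess

env = dict(os.environ, MOCKGPU="1", AMD="1")   # same variables the workflow sets for the emulated job
subprocess.run(["python", "-m", "pytest", "-n=auto", "test/test_amd_llvm.py", "--durations=20"],
               env=env, check=True)            # mirrors the workflow's run: line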

test/test_amd_llvm.py (+32)

@@ -0,0 +1,32 @@
+import unittest
+import numpy as np
+from tinygrad import Device
+from tinygrad.helpers import flat_mv
+if Device.DEFAULT=="AMD":
+  from tinygrad.runtime.ops_amd import AMDAllocator, AMDDevice, AMDProgram
+  from tinygrad.runtime.support.compiler_amd import AMDLLVMCompiler
+
+@unittest.skipUnless(Device.DEFAULT == "AMD", "Runs only on AMD")
+class TestAMDLLVM(unittest.TestCase):
+  def test_compiler(self):
+    src = '''
+; https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/imm.ll
+define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
+entry:
+  store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
+  ret void
+}
+'''
+    device = AMDDevice()
+    compiler = AMDLLVMCompiler("gfx1100")
+    obj = compiler.compile(src)
+    allocator = AMDAllocator(device)
+    a = allocator.alloc(1*8)
+    prog = AMDProgram(device, "test", obj)
+    prog(a, wait=True)
+    na = np.empty(1, np.uint64)
+    allocator._copyout(flat_mv(na.data), a)
+    assert na == [0x1234567800000005]
+
+if __name__ == '__main__':
+  unittest.main()
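
One detail worth noting in the test: the decimal immediate stored by the IR kernel and the hex value asserted after the copy-out are the same 64-bit constant, with 0x12345678 in the high half and 0x00000005 in the low half. A quick check, illustrative only and not part of the commit:

# Sanity check on the test's expected value.
val = 1311768464867721221                        # i64 immediate stored by the kernel
assert val == 0x1234567800000005                 # value the test reads back from the buffer
assert val >> 32 == 0x12345678 and val & 0xffffffff == 0x5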

tinygrad/runtime/support/compiler_amd.py (+15)

@@ -1,6 +1,7 @@
 import ctypes, subprocess
 import tinygrad.runtime.autogen.comgr as comgr
 from tinygrad.device import Compiler, CompileError
+from tinygrad.runtime.ops_llvm import LLVMCompiler

 def amdgpu_disassemble(lib:bytes):
   asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
@@ -68,3 +69,17 @@ def compile(self, src:str) -> bytes:
     try: return compile_hip(src, self.arch, src.split('\n', 1)[0].strip() == '.text')
     except RuntimeError as e: raise CompileError(e) from e
   def disassemble(self, lib:bytes): amdgpu_disassemble(lib)
+
+class AMDLLVMCompiler(LLVMCompiler):
+  jit = False
+  target_arch = "AMDGPU"
+  def __init__(self, arch: str):
+    self.arch = arch
+    super().__init__(self.arch, "+cumode")
+  def __reduce__(self): return (AMDLLVMCompiler, (self.arch,))
+  def compile(self, src:str) -> bytes:
+    try: return super().compile(src)
+    except RuntimeError as e:
+      if "undefined value '@llvm.amdgcn." in str(e): raise CompileError(str(e) + "AMD with LLVM backend requires LLVM >= 18") from e
+      raise CompileError(e) from e
+  def disassemble(self, lib:bytes): amdgpu_disassemble(lib)
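
Two small notes on the class above. The __reduce__ override means an AMDLLVMCompiler pickles down to just its arch string and is rebuilt by calling the constructor again, which is what lets it cross process boundaries (for example when compile jobs are handed to worker processes). A minimal sketch, illustrative only and assuming an environment where the LLVM-backed compiler can actually be constructed:

# __reduce__ lets the compiler round-trip through pickle using only the target arch string.
import pickle
from tinygrad.runtime.support.compiler_amd import AMDLLVMCompiler

c = AMDLLVMCompiler("gfx1100")
c2 = pickle.loads(pickle.dumps(c))               # rebuilt as AMDLLVMCompiler(c.arch)
assert isinstance(c2, AMDLLVMCompiler) and c2.arch == "gfx1100"

The compile override only converts LLVM's RuntimeError into tinygrad's CompileError, adding a more specific hint when the message mentions an unknown @llvm.amdgcn intrinsic, the symptom of an LLVM older than 18.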
