
Commit 09e7708

minimum change for rdna4 [pr] (tinygrad#9455)
1 parent be21616 commit 09e7708

File tree

3 files changed, +31 -2 lines changed


extra/hip_large_kernel.py (+25)

@@ -0,0 +1,25 @@
+from tinygrad.device import Device, Buffer
+from tinygrad.dtype import dtypes, _to_np_dtype
+
+dev = Device.default
+mbin = dev.compiler.compile("""
+typedef long unsigned int size_t;
+extern "C" __attribute__((device, const)) size_t __ockl_get_group_id(unsigned int);
+extern "C" __attribute__((global)) void __attribute__((amdgpu_flat_work_group_size(1, 1))) write_ones(signed char* data0) {
+  int gidx0 = __ockl_get_group_id(0); /* 16 */
+  int gidx1 = __ockl_get_group_id(1); /* 1026048 */
+  *(data0+(gidx0+gidx1*1)) = 1;
+}
+""")
+dev.compiler.disassemble(mbin)
+buf0 = Buffer(Device.DEFAULT, 1*65537, dtypes.uint8).ensure_allocated()
+
+prg = dev.runtime("write_ones", mbin)
+prg(buf0._buf, global_size=(1,65537,1), local_size=(1,1,1), wait=True)
+
+import numpy as np
+def to_np(buf): return np.frombuffer(buf.as_buffer().cast(buf.dtype.base.fmt), dtype=_to_np_dtype(buf.dtype.base))
+
+big = to_np(buf0)
+print(big)
+print((big-1).nonzero())
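This script is a standalone repro: it launches write_ones over a (1, 65537, 1) grid, so every byte of the 65537-byte buffer should end up as 1, and (big-1).nonzero() prints any index the GPU never wrote. Dimension 1 deliberately exceeds 65535, the per-dimension grid limit that the renderer change below encodes in global_max. A minimal sketch of that bounds check (the fits helper is hypothetical, not tinygrad API):

# a sketch of the per-dimension grid check implied by this commit;
# GLOBAL_MAX mirrors AMDRenderer.global_max below, fits() is a hypothetical helper
GLOBAL_MAX = (2147483647, 65535, 65535)

def fits(global_size):
  # a launch is valid only if every grid dimension is within the device limit
  return all(g <= m for g, m in zip(global_size, GLOBAL_MAX))

print(fits((1, 65537, 1)))  # False: dim 1 exceeds 65535, the case this repro exercises
print(fits((1, 65535, 1)))  # True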

tinygrad/renderer/cstyle.py (+4 -1)

@@ -397,13 +397,16 @@ def cast_float_to_bf16(x: UOp) -> UOp:
 class AMDRenderer(CStyleLanguage):
   device = "AMD"
   shared_max = 65536
+  # NOTE: this is only really needed on gfx12, even though gfx11 reports the same limitation
+  global_max = (2147483647, 65535, 65535)
   # https://gpuopen.com/learn/wmma_on_rdna3/
   tensor_cores = [TensorCore(dims=(16,16,16), threads=32, elements_per_thread=(16,16,8), dtype_in=di, dtype_out=do,
     opts=("l0","l0","l0","l0","l1","u1","u1","u1"), swizzle=(((4,9,10,11,0),(1,2,3,5,6,7,8)), ((0,1,2,3,4),(9,10,11,5,6,7,8))))
     for di,do in [(dtypes.half,dtypes.float),(dtypes.half,dtypes.half)]]
 
   def __init__(self, arch:str): # gfx942 => MI300, gfx1100 => RX 7900
-    self.tensor_cores, self.arch = AMDRenderer.tensor_cores, arch
+    # TODO: fix tensor cores for gfx1201
+    self.tensor_cores, self.arch = AMDRenderer.tensor_cores if arch != "gfx1201" else [], arch
   def __reduce__(self): return self.__class__, (self.arch,)
 
   # language options
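With this change the renderer advertises the hardware grid limits via global_max and, per the TODO, disables tensor cores on gfx1201 (RDNA4) as a stopgap until its WMMA path is fixed. A quick illustration of the gating, assuming tinygrad at this commit:

# illustrating the arch gating added above
from tinygrad.renderer.cstyle import AMDRenderer

assert AMDRenderer("gfx1201").tensor_cores == []       # RDNA4: tensor cores disabled for now
assert len(AMDRenderer("gfx1100").tensor_cores) == 2   # RDNA3: half->float and half->half WMMA kept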

tinygrad/runtime/ops_amd.py (+2 -1)

@@ -685,7 +685,8 @@ def __init__(self, device:str=""):
     self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
     self.target = int(self.dev_iface.props['gfx_target_version'])
     self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
-    if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
+    if self.target < 100300 or self.target >= 130000: raise RuntimeError(f"Unsupported arch: {self.arch}")
+    if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")
 
     self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
     self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
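The support window widens from < 120000 (gfx10.3/gfx11 only) to < 130000, admitting gfx12xx (RDNA4) targets. A small sketch of the target-to-arch mapping used above, assuming KFD reports gfx_target_version as major*10000 + minor*100 + stepping:

# the same "gfx%d%x%x" mapping as AMDDevice above, spelled out
def arch_of(target: int) -> str:
  return "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)

assert arch_of(110000) == "gfx1100"  # RDNA3, already supported
assert arch_of(120001) == "gfx1201"  # RDNA4, rejected by the old >= 120000 bound
assert 100300 <= 120001 < 130000     # now passes the widened check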
