
Commit 09e7708

minimum change for rdna4 [pr] (tinygrad#9455)
1 parent be21616 commit 09e7708

File tree

3 files changed, +31 -2 lines changed


extra/hip_large_kernel.py (+25)

@@ -0,0 +1,25 @@
+from tinygrad.device import Device, Buffer
+from tinygrad.dtype import dtypes, _to_np_dtype
+
+dev = Device.default
+mbin = dev.compiler.compile("""
+typedef long unsigned int size_t;
+extern "C" __attribute__((device, const)) size_t __ockl_get_group_id(unsigned int);
+extern "C" __attribute__((global)) void __attribute__((amdgpu_flat_work_group_size(1, 1))) write_ones(signed char* data0) {
+  int gidx0 = __ockl_get_group_id(0); /* 16 */
+  int gidx1 = __ockl_get_group_id(1); /* 1026048 */
+  *(data0+(gidx0+gidx1*1)) = 1;
+}
+""")
+dev.compiler.disassemble(mbin)
+buf0 = Buffer(Device.DEFAULT, 1*65537, dtypes.uint8).ensure_allocated()
+
+prg = dev.runtime("write_ones", mbin)
+prg(buf0._buf, global_size=(1,65537,1), local_size=(1,1,1), wait=True)
+
+import numpy as np
+def to_np(buf): return np.frombuffer(buf.as_buffer().cast(buf.dtype.base.fmt), dtype=_to_np_dtype(buf.dtype.base))
+
+big = to_np(buf0)
+print(big)
+print((big-1).nonzero())
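This script is a standalone repro: it launches write_ones over a (1, 65537, 1) grid, so every byte of the 65537-byte buffer should end up as 1, and (big-1).nonzero() prints any index the GPU never wrote. Dimension 1 deliberately exceeds 65535, the per-dimension grid limit that the renderer change below encodes in global_max. A minimal sketch of that bounds check (the fits helper is hypothetical, not tinygrad API):

# a sketch of the per-dimension grid check implied by this commit;
# GLOBAL_MAX mirrors AMDRenderer.global_max below, fits() is a hypothetical helper
GLOBAL_MAX = (2147483647, 65535, 65535)

def fits(global_size):
  # a launch is valid only if every grid dimension is within the device limit
  return all(g <= m for g, m in zip(global_size, GLOBAL_MAX))

print(fits((1, 65537, 1)))  # False: dim 1 exceeds 65535, the case this repro exercises
print(fits((1, 65535, 1)))  # True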

tinygrad/renderer/cstyle.py (+4 -1)

@@ -397,13 +397,16 @@ def cast_float_to_bf16(x: UOp) -> UOp:
 class AMDRenderer(CStyleLanguage):
   device = "AMD"
   shared_max = 65536
+  # NOTE: this is only really needed on gfx12, even though gfx11 reports the same limitation
+  global_max = (2147483647, 65535, 65535)
   # https://gpuopen.com/learn/wmma_on_rdna3/
   tensor_cores = [TensorCore(dims=(16,16,16), threads=32, elements_per_thread=(16,16,8), dtype_in=di, dtype_out=do,
     opts=("l0","l0","l0","l0","l1","u1","u1","u1"), swizzle=(((4,9,10,11,0),(1,2,3,5,6,7,8)), ((0,1,2,3,4),(9,10,11,5,6,7,8))))
     for di,do in [(dtypes.half,dtypes.float),(dtypes.half,dtypes.half)]]
 
   def __init__(self, arch:str): # gfx942 => MI300, gfx1100 => RX 7900
-    self.tensor_cores, self.arch = AMDRenderer.tensor_cores, arch
+    # TODO: fix tensor cores for gfx1201
+    self.tensor_cores, self.arch = AMDRenderer.tensor_cores if arch != "gfx1201" else [], arch
   def __reduce__(self): return self.__class__, (self.arch,)
 
   # language options
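With this change the renderer advertises the hardware grid limits via global_max and, per the TODO, disables tensor cores on gfx1201 (RDNA4) as a stopgap until its WMMA path is fixed. A quick illustration of the gating, assuming tinygrad at this commit:

# illustrating the arch gating added above
from tinygrad.renderer.cstyle import AMDRenderer

assert AMDRenderer("gfx1201").tensor_cores == []       # RDNA4: tensor cores disabled for now
assert len(AMDRenderer("gfx1100").tensor_cores) == 2   # RDNA3: half->float and half->half WMMA kept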

tinygrad/runtime/ops_amd.py (+2 -1)

@@ -685,7 +685,8 @@ def __init__(self, device:str=""):
     self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
     self.target = int(self.dev_iface.props['gfx_target_version'])
     self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
-    if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
+    if self.target < 100300 or self.target >= 130000: raise RuntimeError(f"Unsupported arch: {self.arch}")
+    if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")
 
     self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
     self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
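The support window widens from < 120000 (gfx10.3/gfx11 only) to < 130000, admitting gfx12xx (RDNA4) targets. A small sketch of the target-to-arch mapping used above, assuming KFD reports gfx_target_version as major*10000 + minor*100 + stepping:

# the same "gfx%d%x%x" mapping as AMDDevice above, spelled out
def arch_of(target: int) -> str:
  return "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)

assert arch_of(110000) == "gfx1100"  # RDNA3, already supported
assert arch_of(120001) == "gfx1201"  # RDNA4, rejected by the old >= 120000 bound
assert 100300 <= 120001 < 130000     # now passes the widened check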
