Skip to content

Commit c4c29c8

Browse files
authored
nv: parse elf attrs (tinygrad#9275)
* better * hm * hm * fixed
1 parent 6350725 commit c4c29c8

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

tinygrad/runtime/ops_nv.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ class NVProgram(HCQProgram):
195195
def __init__(self, dev:NVDevice, name:str, lib:bytes):
196196
self.dev, self.name, self.lib = dev, name, lib
197197

198-
if MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
198+
if MOCKGPU: image, sections, relocs, cbuf0_size = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [], 0x160 # type: ignore
199199
else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)
200200

201201
# NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
@@ -208,10 +208,10 @@ def __init__(self, dev:NVDevice, name:str, lib:bytes):
208208
if sh.name == f".text.{self.name}":
209209
self.prog_addr, self.prog_sz, self.regs_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, max(sh.header.sh_info>>24, 16)
210210
elif m:=re.match(r'\.nv\.constant(\d+)', sh.name): self.constbufs[int(m.group(1))] = (self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size)
211-
elif sh.name == ".nv.info":
212-
for off in range(0, sh.header.sh_size, 12):
213-
typ, _, val = struct.unpack_from("III", sh.content, off)
214-
if typ & 0xffff == 0x1204: self.lcmem_usage = val + 0x240
211+
elif sh.name.startswith(".nv.info"):
212+
for typ, param, data in self._parse_elf_info(sh):
213+
if sh.name == f".nv.info.{name}" and param == 0xa: cbuf0_size = struct.unpack_from("IH", data)[1] # EIATTR_PARAM_CBANK
214+
elif sh.name == ".nv.info" and param == 0x12: self.lcmem_usage = struct.unpack_from("II", data)[1] + 0x240 # EIATTR_MIN_STACK_SIZE
215215

216216
# Ensure device has enough local memory to run the program
217217
self.dev._ensure_has_local_memory(self.lcmem_usage)
@@ -226,7 +226,7 @@ def __init__(self, dev:NVDevice, name:str, lib:bytes):
226226

227227
ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
228228

229-
self.constbuffer_0 = [0] * 88
229+
self.constbuffer_0 = [0] * (cbuf0_size // 4)
230230
self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]
231231

232232
smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
@@ -248,9 +248,15 @@ def __init__(self, dev:NVDevice, name:str, lib:bytes):
248248
# Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
249249
self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32
250250

251-
# NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
251+
# NV's kernargs is constbuffer, then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
252252
super().__init__(NVArgsState, self.dev, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))
253253

254+
def _parse_elf_info(self, sh, start_off=0):
255+
while start_off < sh.header.sh_size:
256+
typ, param, sz = struct.unpack_from("BBH", sh.content, start_off)
257+
yield typ, param, sh.content[start_off+4:start_off+sz+4] if typ == 0x4 else sz
258+
start_off += (sz if typ == 0x4 else 0) + 4
259+
254260
def __del__(self):
  # Nothing to release if `lib_gpu` was never assigned (e.g. construction stopped before the allocation).
  if not hasattr(self, 'lib_gpu'): return
  # Hand the buffer back to the device allocator, using the same cpu-accessible spec passed here originally.
  gpu_image = self.lib_gpu
  self.dev.allocator.free(gpu_image, gpu_image.size, BufferSpec(cpu_access=True))
256262

0 commit comments

Comments
 (0)