@@ -195,7 +195,7 @@ class NVProgram(HCQProgram):
195
195
def __init__ (self , dev :NVDevice , name :str , lib :bytes ):
196
196
self .dev , self .name , self .lib = dev , name , lib
197
197
198
- if MOCKGPU : image , sections , relocs = memoryview (bytearray (lib ) + b'\x00 ' * (4 - len (lib )% 4 )).cast ("I" ), [], [] # type: ignore
198
+ if MOCKGPU : image , sections , relocs , cbuf0_size = memoryview (bytearray (lib ) + b'\x00 ' * (4 - len (lib )% 4 )).cast ("I" ), [], [], 0x160 # type: ignore
199
199
else : image , sections , relocs = elf_loader (self .lib , force_section_align = 128 )
200
200
201
201
# NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
@@ -208,10 +208,10 @@ def __init__(self, dev:NVDevice, name:str, lib:bytes):
208
208
if sh .name == f".text.{ self .name } " :
209
209
self .prog_addr , self .prog_sz , self .regs_usage = self .lib_gpu .va_addr + sh .header .sh_addr , sh .header .sh_size , max (sh .header .sh_info >> 24 , 16 )
210
210
elif m := re .match (r'\.nv\.constant(\d+)' , sh .name ): self .constbufs [int (m .group (1 ))] = (self .lib_gpu .va_addr + sh .header .sh_addr , sh .header .sh_size )
211
- elif sh .name == ".nv.info" :
212
- for off in range ( 0 , sh . header . sh_size , 12 ):
213
- typ , _ , val = struct .unpack_from ("III " , sh . content , off )
214
- if typ & 0xffff == 0x1204 : self .lcmem_usage = val + 0x240
211
+ elif sh .name . startswith ( ".nv.info" ) :
212
+ for typ , param , data in self . _parse_elf_info ( sh ):
213
+ if sh . name == f".nv.info. { name } " and param == 0xa : cbuf0_size = struct .unpack_from ("IH " , data )[ 1 ] # EIATTR_PARAM_CBANK
214
+ elif sh . name == ".nv.info" and param == 0x12 : self .lcmem_usage = struct . unpack_from ( "II" , data )[ 1 ] + 0x240 # EIATTR_MIN_STACK_SIZE
215
215
216
216
# Ensure device has enough local memory to run the program
217
217
self .dev ._ensure_has_local_memory (self .lcmem_usage )
@@ -226,7 +226,7 @@ def __init__(self, dev:NVDevice, name:str, lib:bytes):
226
226
227
227
ctypes .memmove (self .lib_gpu .va_addr , mv_address (image ), image .nbytes )
228
228
229
- self .constbuffer_0 = [0 ] * 88
229
+ self .constbuffer_0 = [0 ] * ( cbuf0_size // 4 )
230
230
self .constbuffer_0 [6 :12 ] = [* data64_le (self .dev .shared_mem_window ), * data64_le (self .dev .local_mem_window ), * data64_le (0xfffdc0 )]
231
231
232
232
smem_cfg = min (shmem_conf * 1024 for shmem_conf in [32 , 64 , 100 ] if shmem_conf * 1024 >= self .shmem_usage ) // 4096 + 1
@@ -248,9 +248,15 @@ def __init__(self, dev:NVDevice, name:str, lib:bytes):
248
248
# Registers allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
249
249
self .max_threads = ((65536 // round_up (max (1 , self .regs_usage ) * 32 , 256 )) // 4 ) * 4 * 32
250
250
251
- # NV's kernargs is constbuffer (size 0x160) , then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
251
+ # NV's kernargs is constbuffer, then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
252
252
super ().__init__ (NVArgsState , self .dev , self .name , kernargs_alloc_size = round_up (self .constbufs [0 ][1 ], 1 << 8 ) + (8 << 8 ))
253
253
254
def _parse_elf_info(self, sh, start_off=0):
  """Iterate over the records packed into an ``.nv.info*`` ELF section.

  Every record starts with a 4-byte header decoded with the struct format
  ``"BBH"``: two single-byte fields (``typ``, ``param``) followed by one
  16-bit field (``sz``).

  * When ``typ == 0x4`` the 16-bit field is a byte count: that many bytes
    follow the header and are yielded as a slice of ``sh.content``.
  * For any other ``typ`` the record is just the header, and the 16-bit
    field itself is yielded as the value.

  NOTE(review): 0x4 looks like the "sized value" record format of NVIDIA's
  .nv.info encoding, and ``param`` the EIATTR_* attribute code -- confirm
  against the toolchain documentation.

  Args:
    sh: section object exposing ``header.sh_size`` (section length in bytes)
      and ``content`` (a bytes-like buffer holding the section data).
    start_off: byte offset inside ``sh.content`` at which parsing starts.

  Yields:
    ``(typ, param, value)`` tuples, where ``value`` is either a slice of
    ``sh.content`` (``typ == 0x4``) or the integer from the header.
  """
  offset = start_off  # walk with a separate cursor; leave the argument untouched
  while offset < sh.header.sh_size:
    typ, param, sz = struct.unpack_from("BBH", sh.content, offset)
    has_payload = typ == 0x4
    # The conditional binds tighter than the tuple commas, so only the third
    # element switches between the payload slice and the inline integer.
    yield typ, param, sh.content[offset + 4:offset + 4 + sz] if has_payload else sz
    # Skip the 4-byte header plus the payload, when there is one.
    offset += 4 + (sz if has_payload else 0)
259
+
254
260
def __del__ (self ):
255
261
if hasattr (self , 'lib_gpu' ): self .dev .allocator .free (self .lib_gpu , self .lib_gpu .size , BufferSpec (cpu_access = True ))
256
262
0 commit comments