Skip to content

Commit 70db8c3

Browse files
authored
hcq: dyn alloc signals (tinygrad#9238)
* hcq: dyn alloc signals * types and uniqueue devs * typing * mypy * mypy one more time * test * make fds to not intersect in mockgpu between drivers
1 parent 6610ad5 commit 70db8c3

File tree

6 files changed

+59
-43
lines changed

6 files changed

+59
-43
lines changed

test/mockgpu/nv/nvdriver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def __init__(self, gpus=6):
5959
self.root_handle = None
6060

6161
self.gpus = {}
62-
self.next_fd = (1 << 30)
62+
self.next_fd = (1 << 29)
6363
self.next_handle = 1
6464

6565
self.object_by_handle = {}

test/test_hcq.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,5 +499,17 @@ def test_on_device_hang(self):
499499
assert "0xDEADBEE1" in str(ctx.exception)
500500
os.environ.pop("MOCKGPU_EMU_FAULTADDR")
501501

502+
def test_multidevice(self):
503+
try: amd_dev = Device["AMD"]
504+
except Exception: self.skipTest("no AMD device, test skipped")
505+
506+
try: nv_dev = Device["NV"]
507+
except Exception: self.skipTest("no NV device, test skipped")
508+
509+
x = amd_dev.signal_t()
510+
y = nv_dev.signal_t()
511+
assert type(x) is amd_dev.signal_t
512+
assert type(y) is nv_dev.signal_t
513+
502514
if __name__ == "__main__":
503515
unittest.main()

tinygrad/runtime/ops_amd.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from __future__ import annotations
2-
from typing import Any, cast
2+
from typing import Any, cast, ClassVar
33
import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
44
assert sys.platform != 'win32'
55
from dataclasses import dataclass
@@ -27,10 +27,7 @@ def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2
2727

2828
class AMDSignal(HCQSignal):
2929
def __init__(self, base_addr:int|None=None, **kwargs):
30-
super().__init__(AMDDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=100)
31-
32-
def __del__(self):
33-
if isinstance(self.base_addr, int): AMDDevice.signals_pool.append(self.base_addr)
30+
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
3431

3532
def _sleep(self, time_spent_waiting_ms:int):
3633
# Reasonable to sleep for long workloads (which take more than 2s) and only for timeline signals.
@@ -562,9 +559,11 @@ def on_device_hang(self):
562559
def device_fini(self): self.adev.fini()
563560

564561
class AMDDevice(HCQCompiled):
562+
devices: ClassVar[list[HCQCompiled]] = []
563+
signal_pages: ClassVar[list[Any]] = []
564+
signal_pool: ClassVar[list[int]] = []
565+
565566
driverless:bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
566-
signals_page:Any = None
567-
signals_pool:list[int] = []
568567

569568
def __init__(self, device:str=""):
570569
self.device_id = int(device.split(":")[1]) if ":" in device else 0
@@ -573,11 +572,6 @@ def __init__(self, device:str=""):
573572
self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
574573
if self.target < 100300 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
575574

576-
if AMDDevice.signals_page is None:
577-
AMDDevice.signals_page = self.dev_iface.alloc(16 * 65536, host=True, uncached=True, cpu_access=True)
578-
AMDDevice.signals_pool = [AMDDevice.signals_page.va_addr + off for off in range(0, AMDDevice.signals_page.size, 16)]
579-
else: self.dev_iface.map(AMDDevice.signals_page)
580-
581575
self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] - 1
582576
self.max_wave_id = self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1
583577
self.has_scratch_base_registers = self.target >= 110000

tinygrad/runtime/ops_nv.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22
import os, ctypes, contextlib, re, functools, mmap, struct, array, sys
33
assert sys.platform != 'win32'
4-
from typing import Any, cast, Union, Type
4+
from typing import Any, cast, Union, Type, ClassVar
55
from dataclasses import dataclass
66
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
77
from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
@@ -73,10 +73,7 @@ def make_qmd_struct_type():
7373

7474
class NVSignal(HCQSignal):
7575
def __init__(self, base_addr:int|None=None, **kwargs):
76-
super().__init__(NVDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=1000, value_off=0, timestamp_off=8)
77-
78-
def __del__(self):
79-
if isinstance(self.base_addr, int): NVDevice.signals_pool.append(self.base_addr)
76+
super().__init__(base_addr, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
8077

8178
class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']):
8279
def __init__(self):
@@ -285,12 +282,14 @@ class GPFifo:
285282

286283
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
287284
class NVDevice(HCQCompiled[NVSignal]):
285+
devices: ClassVar[list[HCQCompiled]] = []
286+
signal_pages: ClassVar[list[Any]] = []
287+
signal_pool: ClassVar[list[int]] = []
288+
288289
root = None
289290
fd_ctl: HWInterface
290291
fd_uvm: HWInterface
291292
gpus_info: Union[list, ctypes.Array] = []
292-
signals_page: Any = None
293-
signals_pool: list[int] = []
294293

295294
# TODO: Need a proper allocator for va addresses
296295
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
@@ -433,11 +432,6 @@ def __init__(self, device:str=""):
433432
try: uvm.enable_peer_access(self.fd_uvm, gpuUuidA=self.gpu_uuid, gpuUuidB=dev.gpu_uuid)
434433
except RuntimeError as e: raise RuntimeError(str(e) + f". Make sure GPUs #{self.gpu_minor} & #{dev.gpu_minor} have P2P enabled between.") from e
435434

436-
if NVDevice.signals_page is None:
437-
NVDevice.signals_page = self._gpu_alloc(16 * 65536, cpu_access=True, uncached=True)
438-
NVDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, NVDevice.signals_page.size, 16)]
439-
else: self._gpu_map(NVDevice.signals_page)
440-
441435
channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
442436
channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.nvdevice, channel_params).hObjectNew
443437

tinygrad/runtime/ops_qcom.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os, ctypes, functools, mmap, struct, array, math, sys
33
assert sys.platform != 'win32'
44
from types import SimpleNamespace
5-
from typing import Any, cast
5+
from typing import Any, cast, ClassVar
66
from tinygrad.device import BufferSpec
77
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
88
from tinygrad.runtime.support.hcq import HWInterface
@@ -38,10 +38,7 @@ def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disa
3838

3939
class QCOMSignal(HCQSignal):
4040
def __init__(self, base_addr:int|None=None, **kwargs):
41-
super().__init__(QCOMDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=19.2)
42-
43-
def __del__(self):
44-
if isinstance(self.base_addr, int): QCOMDevice.signals_pool.append(self.base_addr)
41+
super().__init__(base_addr, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice)
4542

4643
def _sleep(self, time_spent_waiting_ms:int):
4744
# Sleep only for timeline signals. Do it immediately to free the CPU.
@@ -320,16 +317,16 @@ def _free(self, opaque, options:BufferSpec):
320317
self.dev._gpu_free(opaque)
321318

322319
class QCOMDevice(HCQCompiled):
323-
signals_page: Any = None
324-
signals_pool: list[int] = []
320+
devices: ClassVar[list[HCQCompiled]] = []
321+
signal_pages: ClassVar[list[Any]] = []
322+
signal_pool: ClassVar[list[int]] = []
323+
325324
gpu_id: int = 0
326325
dummy_addr: int = 0
327326

328327
def __init__(self, device:str=""):
329328
self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR)
330329
QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)
331-
QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
332-
QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)]
333330

334331
flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
335332
| kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)

tinygrad/runtime/support/hcq.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from __future__ import annotations
2-
from typing import cast, Type, TypeVar, Generic, Any
2+
from typing import cast, Type, TypeVar, Generic, Any, ClassVar
33
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
44
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
55
from tinygrad.renderer import Renderer
@@ -203,15 +203,20 @@ def submit(self, dev:DeviceType, var_vals:dict[Variable, int]|None=None):
203203
def _submit(self, dev:DeviceType): raise NotImplementedError("need _submit")
204204

205205
class HCQSignal(Generic[DeviceType]):
206-
def __init__(self, base_addr:sint=0, value:int=0, timeline_for_device:DeviceType|None=None, timestamp_divider=1, value_off=0, timestamp_off=8):
207-
self.base_addr, self.value_addr, self.timestamp_addr = base_addr, base_addr+value_off, base_addr+timestamp_off
206+
def __init__(self, base_addr:sint|None=None, value:int=0, dev_t:Type[DeviceType]|None=None, timeline_for_device:DeviceType|None=None,
207+
timestamp_divider=1, value_off=0, timestamp_off=8):
208+
self.base_addr = dev_t._alloc_signal_addr() if dev_t is not None and base_addr is None else base_addr
209+
self.value_addr, self.timestamp_addr, self.dev_t = self.base_addr+value_off, self.base_addr+timestamp_off, dev_t
208210
self.timestamp_divider:decimal.Decimal = decimal.Decimal(timestamp_divider)
209211
self.timeline_for_device:DeviceType|None = timeline_for_device
210212

211-
if isinstance(base_addr, int):
213+
if isinstance(self.base_addr, int):
212214
self.value_mv, self.timestamp_mv = to_mv(self.value_addr, 8).cast('Q'), to_mv(self.timestamp_addr, 8).cast('Q')
213215
self.value_mv[0] = value
214216

217+
def __del__(self):
218+
if isinstance(self.base_addr, int) and self.dev_t is not None: self.dev_t.signal_pool.append(self.base_addr)
219+
215220
@property
216221
def value(self) -> int: return self.value_mv[0]
217222

@@ -332,23 +337,29 @@ class HCQCompiled(Compiled, Generic[SignalType]):
332337
"""
333338
A base class for devices compatible with the HCQ (Hardware Command Queue) API.
334339
"""
335-
devices: list[HCQCompiled] = []
340+
devices: ClassVar[list[HCQCompiled]] = []
341+
signal_pages: ClassVar[list[Any]] = []
342+
signal_pool: ClassVar[list[int]] = []
336343

337344
def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
338345
comp_queue_t:Type[HWQueue], copy_queue_t:Type[HWQueue]|None):
339346
self.device_id:int = int(device.split(":")[1]) if ":" in device else 0
347+
348+
from tinygrad.runtime.graph.hcq import HCQGraph
349+
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
350+
351+
# Map signals if any
352+
for sig_page in self.signal_pages: cast(HCQAllocator, self.allocator).map(sig_page)
353+
self.devices.append(self)
354+
340355
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
341356
self.timeline_value:int = 1
342357
self.timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
343358
self._shadow_timeline_signal:SignalType = self.signal_t(value=0, timeline_for_device=self)
344359
self.sig_prof_records:list[tuple[HCQSignal, HCQSignal, str, bool]] = []
345360

346-
from tinygrad.runtime.graph.hcq import HCQGraph
347-
super().__init__(device, allocator, renderer, compiler, runtime, HCQGraph)
348-
349361
self.kernargs_page:HCQBuffer = self.allocator.alloc(16 << 20, BufferSpec(cpu_access=True))
350362
self.kernargs_allocator:BumpAllocator = BumpAllocator(self.kernargs_page.size, base=cast(int, self.kernargs_page.va_addr), wrap=True)
351-
self.devices.append(self)
352363

353364
def synchronize(self):
354365
try: self.timeline_signal.wait(self.timeline_value - 1)
@@ -361,6 +372,14 @@ def synchronize(self):
361372
Compiled.profile_events += [ProfileRangeEvent(self.device, name, st.timestamp, en.timestamp, cp) for st,en,name,cp in self.sig_prof_records]
362373
self.sig_prof_records = []
363374

375+
@classmethod
376+
def _alloc_signal_addr(cls) -> int:
377+
if not cls.signal_pool:
378+
cls.signal_pages.append(alc:=cls.devices[0].allocator.alloc(0x1000, BufferSpec(host=True, uncached=True, cpu_access=True)))
379+
cls.signal_pool += [alc.va_addr + off for off in range(0, alc.size, 16)]
380+
for dev in cls.devices: cast(HCQAllocator, dev.allocator).map(alc)
381+
return cls.signal_pool.pop()
382+
364383
def _at_profile_finalize(self):
365384
def _sync(d:HCQCompiled, q_t:Type[HWQueue]):
366385
q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.timeline_value).submit(d)

0 commit comments

Comments
 (0)