diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 131cf1d9e3359..da0f35f9ab6ef 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -26,6 +26,8 @@ jobs:
         key: llvm-speed
         deps: testing_minimal
         llvm: 'true'
+    - name: External Benchmark Schedule
+      run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
     - name: Speed Test
      run: LLVM=1 LLVMOPT=1 python3 test/test_speed_v_torch.py
     - name: Speed Test (BEAM=2)
diff --git a/test/mockgpu/nv/nvgpu.py b/test/mockgpu/nv/nvgpu.py
index 8a7f8e3074fbf..f450a16ab1401 100644
--- a/test/mockgpu/nv/nvgpu.py
+++ b/test/mockgpu/nv/nvgpu.py
@@ -92,7 +92,7 @@ def execute_qmd(self, qmd_addr):
     qmd = qmd_struct_t.from_address(qmd_addr)
     prg_addr = qmd.program_address_lower + (qmd.program_address_upper << 32)
     const0 = to_mv(qmd.constant_buffer_addr_lower_0 + (qmd.constant_buffer_addr_upper_0 << 32), 0x160).cast('I')
-    args_cnt, vals_cnt = const0[0], const0[1]
+    args_cnt, vals_cnt = const0[80], const0[81]
     args_addr = qmd.constant_buffer_addr_lower_0 + (qmd.constant_buffer_addr_upper_0 << 32) + 0x160
     args = to_mv(args_addr, args_cnt*8).cast('Q')
     vals = to_mv(args_addr + args_cnt*8, vals_cnt*4).cast('I')
diff --git a/test/test_uops.py b/test/test_uops.py
index 7c658a582a049..c41767500e5ce 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -10,7 +10,7 @@
 from tinygrad.ops import Ops, UOp, UPat, KernelInfo, exec_alu # noqa F401
 from tinygrad.spec import spec
 from tinygrad.renderer import ProgramSpec
-from tinygrad.engine.schedule import to_si
+from tinygrad.engine.schedule import fix_kernel_ops
 from tinygrad.engine.realize import CompiledRunner, lower_schedule_item, get_kernel
 from tinygrad.codegen.linearize import linearize_uop
 from tinygrad.codegen.rewriter import full_graph_rewrite, sym
@@ -487,7 +487,7 @@ def test_simple_order_with_special(self):
 class TestUPatHelpers(unittest.TestCase):
   def test_location(self):
     self.assertEqual(sym.patterns[-1][0].location[0].replace("\\", "/").split("/")[-1], "rewriter.py")
-    self.assertEqual(to_si.patterns[0][0].location[0].replace("\\", "/").split("/")[-1], "schedule.py")
+    self.assertEqual(fix_kernel_ops.patterns[0][0].location[0].replace("\\", "/").split("/")[-1], "schedule.py")
     self.assertEqual(spec.patterns[0][0].location[0].replace("\\", "/").split("/")[-1], "ops.py")
     with self.assertRaises(AssertionError): # TODO: location UPat files created in test/*?
       test_upat = UPat(Ops.CONST, dtypes.bool)
diff --git a/tinygrad/codegen/rewriter.py b/tinygrad/codegen/rewriter.py
index 0fd9cbfb52606..c0eb205cda699 100644
--- a/tinygrad/codegen/rewriter.py
+++ b/tinygrad/codegen/rewriter.py
@@ -54,8 +54,7 @@ def fold_expanded(ex, buf):
               rootsrc[0] if isinstance(rootsrc, tuple) else None)
     else:
       # for non image, we upcast the index pointer
-      new_src[0] = new_src[0].cast(new_src[0].dtype.base.vec(fold_length).ptr(size=new_src[0].dtype.size//fold_length,
-                                                                              local=new_src[0].dtype.local))
+      new_src[0] = new_src[0].cast(new_src[0].dtype.base.vec(fold_length).ptr(size=new_src[0].dtype.size, local=new_src[0].dtype.local))
     # generate the folded new_srcs
     if is_load:
       new_load = UOp(Ops.LOAD, load_1.dtype.vec(fold_length), tuple(new_src))
@@ -271,6 +270,8 @@ def reduce_collapse(acc:UOp, ret:UOp, alu:UOp):
    lambda gep,alu: UOp(alu.op, alu.dtype.scalar().vec(gep.dtype.count), tuple(x.gep(gep.arg) for x in alu.src), alu.arg)),
   # push some GEPs through WMMAs
   (UPat(Ops.GEP, src=(UPat(Ops.WMMA, name="wmma"),), name="gep"), gep_through_wmma),
+  # CAT can't be rendered. it's a VECTORIZE on vectors, we expand to a single VECTORIZE with GEPs (TODO: move this later)
+  (UPat(Ops.CAT, name="x"), lambda x: UOp(Ops.VECTORIZE, x.dtype, tuple(y.gep(i) for y in x.src for i in range(y.dtype.count)))),
   # tensor core with a 0 input is acc
   (UPat(Ops.WMMA, src=(UPat.const(None, 0.0), UPat.var(), UPat.var("acc"))), lambda acc: acc),
   (UPat(Ops.WMMA, src=(UPat.var(), UPat.const(None, 0.0), UPat.var("acc"))), lambda acc: acc),
@@ -378,8 +379,7 @@ def do_expand(root:UOp):
       new_srcs.append(src)
     elif src.dtype.count > 1:
       # put any input dtype > 1 grouped together
-      new_srcs.append(UOp(Ops.VECTORIZE,
-                          src.dtype.scalar().vec(expand_sz*src.dtype.count), tuple(src.gep(i) for i in range(src.dtype.count))*expand_sz))
+      new_srcs.append(UOp(Ops.CAT, src.dtype.scalar().vec(expand_sz*src.dtype.count), (src,)*expand_sz))
     else:
       # repeat the arg
       new_srcs.append(src.broadcast(expand_sz))
diff --git a/tinygrad/device.py b/tinygrad/device.py
index 2845e9c1c1da8..66f65e0352b9f 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -227,7 +227,9 @@ def _offset(self, buf, size:int, offset:int): return from_mv(self._as_buffer(buf

 # CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
 class CPUProgram:
-  helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
+  rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
+  atomic_lib = ctypes.CDLL(ctypes.util.find_library('atomic')) if sys.platform == "linux" else None
+
   def __init__(self, name:str, lib:bytes):
     if sys.platform == "win32":
       PAGE_EXECUTE_READWRITE = 0x40
@@ -243,15 +245,15 @@ def __init__(self, name:str, lib:bytes):
     # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
     self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

-    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
+    if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
     self.mem.write(lib)
-    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
+    if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(True)

     # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
     # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
     # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
     # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
-    CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
+    CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))

     self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 0ec55bbab2135..d61809e388ca9 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -356,21 +356,19 @@ def check_load_st(glbl:UOp, view:UOp):
   raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
                      +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))

-to_si = PatternMatcher([
-  # BUFFER -> DEFINE_GLOBAL
+fix_kernel_ops = PatternMatcher([
+  # BUFFER becomes DEFINE_GLOBAL
   (UPat(Ops.BUFFER, name="x"), _append_buf),
-  # simplify and unbind the final VIEWs
+  # BIND in shapetracker becomes DEFINE_VAR
   (UPat(Ops.VIEW, name="x"), _append_st_vars),
-  # don't need SINK on COPY or BUFFER_VIEW
+  # remove SINK from COPY and BUFFER_VIEW
   (UPat(Ops.SINK, src=(UPat.store(UPat.var("b"), UPat(), UPat((Ops.COPY, Ops.BUFFER_VIEW), name="x")),)), lambda b,x: x.replace(src=(b, *x.src))),
-  # don't need contiguous or assign anymore
+  # remove CONTIGUOUS/ASSIGN/DEVICE/PRELOAD
   (UPat(Ops.CONTIGUOUS, src=(UPat.var("x"),)), lambda x: x),
   (UPat(Ops.ASSIGN, src=(UPat(), UPat.var("x"),)), lambda x: x),
-  # don't need DEVICE anymore
   (UPat(Ops.VIEW, name="view", src=(UPat(Ops.DEVICE),)), lambda view: view.replace(src=())),
-  # PRELOAD becomes LOAD
-  (UPat(Ops.PRELOAD, name="root"), lambda root:root.replace(op=Ops.LOAD)),
-  # once images are loaded they become the base dtype
+  (UPat(Ops.PRELOAD, name="x"), lambda x: x.replace(op=Ops.LOAD)),
+  # no ImageDType after load
   (UPat(GroupOp.All-{Ops.DEFINE_GLOBAL}, name="x"), lambda x: x.replace(dtype=x.dtype.base) if isinstance(x.dtype, ImageDType) else None),
   # if this kernel also assigns to the loaded buffer, ensure we can index it correctly
   (UPat(Ops.LOAD, src=(UPat.var("glbl"), UPat.var("view"))), check_load_st),
@@ -384,19 +382,19 @@ def unbind_variable(ctx:dict[Variable, int], bind:UOp, var:UOp, val:UOp):
 def schedule_uop(pre:UOp, ctx:ScheduleContext) -> UOp:
   # unbind_vars + push views to edges
   sink = graph_rewrite(graph_rewrite(pre, unbind_vars+view_left, ctx=ctx.var_vals), view_right)
-  # remove extra uops from SINK + substitue BUFFER with DEFINE_GLOBAL
-  ast = graph_rewrite(sink, to_si, si_ctx:=KernelContext(ctx.var_vals))
   # deal with ASSIGN
   if len(ctx.assigns) != 0:
-    assign_preloads = ctx.preloads[si_ctx.bufs[0].buffer]
+    assign_preloads = ctx.preloads[pre.src[0].buf_uop.buffer]
     for x in list(sink.toposort)[::-1]:
       # we only allow a kernel to depend on either the before ASSIGN or after ASSIGN version of a BUFFER
       if x.op is Ops.LOAD and x.buf_uop in assign_preloads: raise RuntimeError("cycle detected in graph")
       # PRELOAD tells the toposort this kernel should run before ASSIGN
       if x.op is Ops.PRELOAD: assign_preloads[x.buf_uop] = None
+  # fix_kernel_ops
+  sink = graph_rewrite(sink, fix_kernel_ops, si_ctx:=KernelContext(ctx.var_vals))
   # NOTE: we only add the metadata for fused tensors
   metadata = tuple(dedup(m for x in pre.toposort if x.op is not Ops.BUFFER and (m:=ctx.ops_metadata.get(x)) is not None))
-  return UOp(Ops.KERNEL, src=tuple(si_ctx.bufs), arg=Kernel(ast, metadata))
+  return UOp(Ops.KERNEL, src=tuple(si_ctx.bufs), arg=Kernel(sink, metadata))

 PROCESS_REPLAY_CAPTURE:dict[str, bytes] = {}
 if CAPTURE_PROCESS_REPLAY:
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index b96eb814f27ca..c1f2d457ff765 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -118,7 +118,7 @@ class Ops(FastEnum):
   REDUCE_AXIS = auto()

   # helper ops
-  GEP = auto(); VECTORIZE = auto() # noqa: E702
+  GEP = auto(); VECTORIZE = auto(); CAT = auto() # noqa: E702

   # UnaryOps
   CAST = auto(); BITCAST = auto(); EXP2 = auto(); LOG2 = auto(); SIN = auto(); SQRT = auto(); RECIP = auto(); NEG = auto() # noqa: E702
@@ -152,6 +152,7 @@ class Ops(FastEnum):
   # device
   DEVICE = auto()
   MULTI = auto()
+  CUSTOM = auto()

 class GroupOp:
   Unary = {Ops.EXP2, Ops.LOG2, Ops.SIN, Ops.SQRT, Ops.RECIP, Ops.NEG}
@@ -593,7 +594,7 @@ def const_factor(self) -> int:
     if self.op is Ops.ADD: return math.gcd(self.src[0].const_factor(), self.src[1].const_factor())
     if self.op is Ops.MUL: return self.src[0].arg if self.src[0].op is Ops.CONST else self.src[1].arg if self.src[1].op is Ops.CONST else 1
     return 1
-  def divides(self, v) -> UOp|None:
+  def divides(self, v:int) -> UOp|None:
     if v==1: return self
     if self.op is Ops.CONST: return self.const_like(self.arg//v) if self.arg%v == 0 else None
     if self.op is Ops.VCONST: return self.const_like(tuple(x//v for x in self.arg)) if all(x%v == 0 for x in self.arg) else None
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 2a21e86e46109..efd8dc8d7d85a 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -52,6 +52,8 @@
   (UPat(Ops.GEP, name="x"), lambda ctx,x: ctx[x.src[0]] + \
     (f"[{x.arg[0]}]" if x.src[0].dtype.count > (8 if ctx.device in {"CUDA", "NV"} else 4) or ctx.device in {'CLANG', 'DSP'} else \
      f".{'xyzwabcd'[x.arg[0]]}")),
+  # custom passes through with format
+  (UPat(Ops.CUSTOM, name="x"), lambda ctx,x: x.arg.format(*[ctx[y] for y in x.src])),
 ])

 extra_pm = PatternMatcher([
@@ -148,7 +150,7 @@ def render(self, name:str, uops:list[UOp]) -> str:
       assert l is not None, f"failed to render {u.op} {u.dtype} {[(x.op,x.dtype) for x in u.src]} {u.arg}"

       if u.op in {Ops.ENDIF, Ops.ENDRANGE}: depth -= 1
-      if u.op in {Ops.CONST, Ops.GEP, Ops.INDEX} or \
+      if u.op in {Ops.CONST, Ops.GEP, Ops.INDEX, Ops.CUSTOM} or \
         (u.op in {Ops.VECTORIZE, *GroupOp.ALU, Ops.CAST, Ops.BITCAST} and child_count[u] == 1 and not getenv("EXPAND_SSA")):
         r[u] = l
       else:
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 7f6606d4d1b95..daba3e35b7f26 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -5,7 +5,7 @@
 from dataclasses import dataclass
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
 from tinygrad.ops import sint
-from tinygrad.device import BufferSpec
+from tinygrad.device import BufferSpec, CPUProgram
 from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
 from tinygrad.renderer.cstyle import AMDRenderer
 from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
@@ -151,8 +151,7 @@ def _submit(self, dev:AMDDevice):
     for i, value in enumerate(cmds):
       dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
     dev.compute_queue.put_value += len(cmds)
-    dev.compute_queue.write_ptr[0] = dev.compute_queue.put_value
-    dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
+    dev.compute_queue.signal_doorbell()

 class AMDCopyQueue(HWQueue):
   def __init__(self, max_copy_size=0x40000000):
@@ -236,8 +235,7 @@ def _submit(self, dev:AMDDevice):
       dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])

     dev.sdma_queue.put_value += rem_packet_cnt * 4
-    dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value
-    dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value
+    dev.sdma_queue.signal_doorbell()

 class AMDProgram(HCQProgram):
   def __init__(self, dev:AMDDevice, name:str, lib:bytes):
@@ -298,6 +296,13 @@ class AMDQueueDesc:
   doorbell: memoryview
   put_value: int = 0

+  def signal_doorbell(self):
+    self.write_ptr[0] = self.put_value
+
+    # Ensure all prior writes are visible to the GPU.
+    if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
+    self.doorbell[0] = self.put_value
+
 class KFDIface:
   kfd:HWInterface|None = None
   event_page:HCQBuffer|None = None
diff --git a/tinygrad/runtime/ops_disk.py b/tinygrad/runtime/ops_disk.py
index 0a01e22a08e5e..7039389a1d818 100644
--- a/tinygrad/runtime/ops_disk.py
+++ b/tinygrad/runtime/ops_disk.py
@@ -67,7 +67,7 @@ def __init__(self, device:DiskDevice, size:int, offset=0):
     self.device, self.size, self.offset = device, size, offset
   def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
   def _buf(self) -> memoryview:
-    assert hasattr(self.device, "mem"), "DiskBuffer wasn't opened"
+    assert hasattr(self.device, "mem"), f"DiskBuffer wasn't opened: {self.device.device}"
     return memoryview(self.device.mem)[self.offset:self.offset+self.size]

 MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
diff --git a/tinygrad/runtime/ops_dsp.py b/tinygrad/runtime/ops_dsp.py
index 0b2a53f7db852..0b00ec72324a0 100644
--- a/tinygrad/runtime/ops_dsp.py
+++ b/tinygrad/runtime/ops_dsp.py
@@ -9,11 +9,20 @@
 from tinygrad.runtime.autogen import libc, qcom_dsp
 if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import

+from tinygrad.ops import PatternMatcher, UPat
+
+dsp_pm = PatternMatcher([
+  (UPat(Ops.VECTORIZE, src=UPat.var("y"))*UPat.var("x"), lambda x,y: UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")*x),
+  (UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
+   lambda d: d.replace(src=(UOp(Ops.CUSTOM, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:]))
+])
+
 class DSPRenderer(ClangRenderer):
   device = "DSP"
   supports_float4 = True
   buffer_suffix = " restrict __attribute__((align_value(128)))"
   kernel_prefix = "__attribute__((noinline)) "
+  extra_matcher = dsp_pm+ClangRenderer.extra_matcher
   type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
   code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
                  Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 368b3c03b9429..e5222878d1669 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -6,7 +6,7 @@
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
 from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
 from tinygrad.ops import sint
-from tinygrad.device import BufferSpec
+from tinygrad.device import BufferSpec, CPUProgram
 from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import NVRenderer
@@ -122,6 +122,8 @@ def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo):

     gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
     gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count
+
+    if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
     dev.gpu_mmio[0x90 // 4] = gpfifo.token
     gpfifo.put_value += 1

@@ -141,6 +143,7 @@ def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ..
     self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
     self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
+    self.bind_sints_to_ptr(*local_size, *global_size, ptr=args_state.ptr, fmt='I')
     qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)

     if self.active_qmd is None:
@@ -188,7 +191,7 @@ def _submit(self, dev:NVDevice):
     self._submit_to_gpfifo(dev, dev.dma_gpfifo)

 class NVArgsState(CLikeArgsState):
   def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
-    if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
+    if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)]
     super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)

 class NVProgram(HCQProgram):
diff --git a/tinygrad/spec.py b/tinygrad/spec.py
index 41ce1341e28a5..567875d664d42 100644
--- a/tinygrad/spec.py
+++ b/tinygrad/spec.py
@@ -115,7 +115,7 @@
   # NOTE: for testing, we let sinks be anything
   #(UPat(UOps.SINK, src=UPat(UOps.STORE)), lambda: True),
   (UPat(Ops.SINK, dtypes.void), lambda: True),
-  (UPat(Ops.NOOP), lambda: True),
+  (UPat((Ops.NOOP, Ops.CUSTOM)), lambda: True),

   # PTX LOAD/STORE
   (UPat((Ops.LOAD, Ops.STORE), src=(UPat(dtype=dtypes.int64),), allow_any_len=True), lambda: True),