[pull] master from tinygrad:master #97

Merged 8 commits on Feb 11, 2025
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
@@ -26,6 +26,8 @@ jobs:
key: llvm-speed
deps: testing_minimal
llvm: 'true'
+ - name: External Benchmark Schedule
+ run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
- name: Speed Test
run: LLVM=1 LLVMOPT=1 python3 test/test_speed_v_torch.py
- name: Speed Test (BEAM=2)
2 changes: 1 addition & 1 deletion test/mockgpu/nv/nvgpu.py
@@ -92,7 +92,7 @@ def execute_qmd(self, qmd_addr):
qmd = qmd_struct_t.from_address(qmd_addr)
prg_addr = qmd.program_address_lower + (qmd.program_address_upper << 32)
const0 = to_mv(qmd.constant_buffer_addr_lower_0 + (qmd.constant_buffer_addr_upper_0 << 32), 0x160).cast('I')
- args_cnt, vals_cnt = const0[0], const0[1]
+ args_cnt, vals_cnt = const0[80], const0[81]
args_addr = qmd.constant_buffer_addr_lower_0 + (qmd.constant_buffer_addr_upper_0 << 32) + 0x160
args = to_mv(args_addr, args_cnt*8).cast('Q')
vals = to_mv(args_addr + args_cnt*8, vals_cnt*4).cast('I')
4 changes: 2 additions & 2 deletions test/test_uops.py
@@ -10,7 +10,7 @@
from tinygrad.ops import Ops, UOp, UPat, KernelInfo, exec_alu # noqa F401
from tinygrad.spec import spec
from tinygrad.renderer import ProgramSpec
- from tinygrad.engine.schedule import to_si
+ from tinygrad.engine.schedule import fix_kernel_ops
from tinygrad.engine.realize import CompiledRunner, lower_schedule_item, get_kernel
from tinygrad.codegen.linearize import linearize_uop
from tinygrad.codegen.rewriter import full_graph_rewrite, sym
@@ -487,7 +487,7 @@ def test_simple_order_with_special(self):
class TestUPatHelpers(unittest.TestCase):
def test_location(self):
self.assertEqual(sym.patterns[-1][0].location[0].replace("\\", "/").split("/")[-1], "rewriter.py")
- self.assertEqual(to_si.patterns[0][0].location[0].replace("\\", "/").split("/")[-1], "schedule.py")
+ self.assertEqual(fix_kernel_ops.patterns[0][0].location[0].replace("\\", "/").split("/")[-1], "schedule.py")
self.assertEqual(spec.patterns[0][0].location[0].replace("\\", "/").split("/")[-1], "ops.py")
with self.assertRaises(AssertionError): # TODO: location UPat files created in test/*?
test_upat = UPat(Ops.CONST, dtypes.bool)
8 changes: 4 additions & 4 deletions tinygrad/codegen/rewriter.py
@@ -54,8 +54,7 @@ def fold_expanded(ex, buf):
rootsrc[0] if isinstance(rootsrc, tuple) else None)
else:
# for non image, we upcast the index pointer
- new_src[0] = new_src[0].cast(new_src[0].dtype.base.vec(fold_length).ptr(size=new_src[0].dtype.size//fold_length,
- local=new_src[0].dtype.local))
+ new_src[0] = new_src[0].cast(new_src[0].dtype.base.vec(fold_length).ptr(size=new_src[0].dtype.size, local=new_src[0].dtype.local))
# generate the folded new_srcs
if is_load:
new_load = UOp(Ops.LOAD, load_1.dtype.vec(fold_length), tuple(new_src))
@@ -271,6 +270,8 @@ def reduce_collapse(acc:UOp, ret:UOp, alu:UOp):
lambda gep,alu: UOp(alu.op, alu.dtype.scalar().vec(gep.dtype.count), tuple(x.gep(gep.arg) for x in alu.src), alu.arg)),
# push some GEPs through WMMAs
(UPat(Ops.GEP, src=(UPat(Ops.WMMA, name="wmma"),), name="gep"), gep_through_wmma),
+ # CAT can't be rendered. it's a VECTORIZE on vectors, we expand to a single VECTORIZEs with GEPs (TODO: move this later)
+ (UPat(Ops.CAT, name="x"), lambda x: UOp(Ops.VECTORIZE, x.dtype, tuple(y.gep(i) for y in x.src for i in range(y.dtype.count)))),
# tensor core with a 0 input is acc
(UPat(Ops.WMMA, src=(UPat.const(None, 0.0), UPat.var(), UPat.var("acc"))), lambda acc: acc),
(UPat(Ops.WMMA, src=(UPat.var(), UPat.const(None, 0.0), UPat.var("acc"))), lambda acc: acc),
@@ -378,8 +379,7 @@ def do_expand(root:UOp):
new_srcs.append(src)
elif src.dtype.count > 1:
# put any input dtype > 1 grouped together
- new_srcs.append(UOp(Ops.VECTORIZE,
- src.dtype.scalar().vec(expand_sz*src.dtype.count), tuple(src.gep(i) for i in range(src.dtype.count))*expand_sz))
+ new_srcs.append(UOp(Ops.CAT, src.dtype.scalar().vec(expand_sz*src.dtype.count), (src,)*expand_sz))
else:
# repeat the arg
new_srcs.append(src.broadcast(expand_sz))
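For readers following the CAT change above: do_expand now groups a repeated wide source with Ops.CAT, and the new rewriter rule flattens CAT into a single renderable VECTORIZE of scalar GEPs. A minimal sketch of that flattening outside the rewriter (not part of this PR; UOp.const and dtypes.float.vec are assumed available as in tinygrad):

from tinygrad.dtype import dtypes
from tinygrad.ops import Ops, UOp

x = UOp.const(dtypes.float.vec(4), 1.0)          # stand-in for a 4-wide source
cat = UOp(Ops.CAT, dtypes.float.vec(8), (x, x))  # what do_expand now emits for a repeat of 2
# the new rule: one VECTORIZE of scalar GEPs, which the C-style renderers can emit
flat = UOp(Ops.VECTORIZE, cat.dtype, tuple(y.gep(i) for y in cat.src for i in range(y.dtype.count)))
assert flat.dtype.count == 8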
10 changes: 6 additions & 4 deletions tinygrad/device.py
@@ -227,7 +227,9 @@ def _offset(self, buf, size:int, offset:int): return from_mv(self._as_buffer(buf

# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
class CPUProgram:
- helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
+ rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
+ atomic_lib = ctypes.CDLL(ctypes.util.find_library('atomic')) if sys.platform == "linux" else None

def __init__(self, name:str, lib:bytes):
if sys.platform == "win32":
PAGE_EXECUTE_READWRITE = 0x40
@@ -243,15 +245,15 @@ def __init__(self, name:str, lib:bytes):
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

- if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
+ if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
self.mem.write(lib)
- if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
+ if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(True)

# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
- CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
+ CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))

self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))

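Context for the new atomic_lib handle (a sketch, not part of the diff): it lets the HCQ backends below issue a sequentially consistent fence before ringing a GPU doorbell. Assuming libatomic is present on a Linux host, the ctypes usage mirrors the class attribute added above:

import ctypes, ctypes.util, sys

# rt_lib covers __clear_cache (libgcc_s / libSystem); libatomic provides atomic_thread_fence
atomic_lib = ctypes.CDLL(ctypes.util.find_library('atomic')) if sys.platform == "linux" else None
if atomic_lib is not None:
    atomic_lib.atomic_thread_fence(5)  # 5 is __ATOMIC_SEQ_CST in the GCC/Clang memory-order enum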
24 changes: 11 additions & 13 deletions tinygrad/engine/schedule.py
@@ -356,21 +356,19 @@ def check_load_st(glbl:UOp, view:UOp):
raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
+colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))

- to_si = PatternMatcher([
- # BUFFER -> DEFINE_GLOBAL
+ fix_kernel_ops = PatternMatcher([
+ # BUFFER becomes DEFINE_GLOBAL
(UPat(Ops.BUFFER, name="x"), _append_buf),
- # simplify and unbind the final VIEWs
+ # BIND in shapetracker becomes DEFINE_VAR
(UPat(Ops.VIEW, name="x"), _append_st_vars),
- # don't need SINK on COPY or BUFFER_VIEW
+ # remove SINK from COPY and BUFFER_VIEW
(UPat(Ops.SINK, src=(UPat.store(UPat.var("b"), UPat(), UPat((Ops.COPY, Ops.BUFFER_VIEW), name="x")),)), lambda b,x: x.replace(src=(b, *x.src))),
- # don't need contiguous or assign anymore
+ # remove CONTIGUOUS/ASSIGN/DEVICE/PRELOAD
(UPat(Ops.CONTIGUOUS, src=(UPat.var("x"),)), lambda x: x),
(UPat(Ops.ASSIGN, src=(UPat(), UPat.var("x"),)), lambda x: x),
- # don't need DEVICE anymore
(UPat(Ops.VIEW, name="view", src=(UPat(Ops.DEVICE),)), lambda view: view.replace(src=())),
- # PRELOAD becomes LOAD
- (UPat(Ops.PRELOAD, name="root"), lambda root:root.replace(op=Ops.LOAD)),
- # once images are loaded they become the base dtype
+ (UPat(Ops.PRELOAD, name="x"), lambda x: x.replace(op=Ops.LOAD)),
+ # no ImageDType after load
(UPat(GroupOp.All-{Ops.DEFINE_GLOBAL}, name="x"), lambda x: x.replace(dtype=x.dtype.base) if isinstance(x.dtype, ImageDType) else None),
# if this kernel also assigns to the loaded buffer, ensure we can index it correctly
(UPat(Ops.LOAD, src=(UPat.var("glbl"), UPat.var("view"))), check_load_st),
@@ -384,19 +382,19 @@ def unbind_variable(ctx:dict[Variable, int], bind:UOp, var:UOp, val:UOp):
def schedule_uop(pre:UOp, ctx:ScheduleContext) -> UOp:
# unbind_vars + push views to edges
sink = graph_rewrite(graph_rewrite(pre, unbind_vars+view_left, ctx=ctx.var_vals), view_right)
- # remove extra uops from SINK + substitue BUFFER with DEFINE_GLOBAL
- ast = graph_rewrite(sink, to_si, si_ctx:=KernelContext(ctx.var_vals))
# deal with ASSIGN
if len(ctx.assigns) != 0:
- assign_preloads = ctx.preloads[si_ctx.bufs[0].buffer]
+ assign_preloads = ctx.preloads[pre.src[0].buf_uop.buffer]
for x in list(sink.toposort)[::-1]:
# we only allow a kernel to depend on either the before ASSIGN or after ASSIGN version of a BUFFER
if x.op is Ops.LOAD and x.buf_uop in assign_preloads: raise RuntimeError("cycle detected in graph")
# PRELOAD tells the toposort this kernel should run before ASSIGN
if x.op is Ops.PRELOAD: assign_preloads[x.buf_uop] = None
+ # fix_kernel_ops
+ sink = graph_rewrite(sink, fix_kernel_ops, si_ctx:=KernelContext(ctx.var_vals))
# NOTE: we only add the metadata for fused tensors
metadata = tuple(dedup(m for x in pre.toposort if x.op is not Ops.BUFFER and (m:=ctx.ops_metadata.get(x)) is not None))
- return UOp(Ops.KERNEL, src=tuple(si_ctx.bufs), arg=Kernel(ast, metadata))
+ return UOp(Ops.KERNEL, src=tuple(si_ctx.bufs), arg=Kernel(sink, metadata))

PROCESS_REPLAY_CAPTURE:dict[str, bytes] = {}
if CAPTURE_PROCESS_REPLAY:
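The schedule change is mostly a rename (to_si becomes fix_kernel_ops) plus a reordering: assign_preloads is now looked up from pre.src[0].buf_uop rather than the rewritten kernel context, so fix_kernel_ops can run after the cycle check and the KERNEL wraps the rewritten sink directly. As a reminder of how one of these rules fires, here is a toy PRELOAD-to-LOAD rewrite (a sketch; a bare PRELOAD with no sources is illustrative only, and graph_rewrite is assumed importable from tinygrad.ops):

from tinygrad.dtype import dtypes
from tinygrad.ops import Ops, UOp, UPat, PatternMatcher, graph_rewrite

pm = PatternMatcher([(UPat(Ops.PRELOAD, name="x"), lambda x: x.replace(op=Ops.LOAD))])
preload = UOp(Ops.PRELOAD, dtypes.float, ())      # real PRELOADs carry buffer/view sources
assert graph_rewrite(preload, pm).op is Ops.LOAD  # PRELOAD only orders kernels; in the AST it is a LOAD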
5 changes: 3 additions & 2 deletions tinygrad/ops.py
@@ -118,7 +118,7 @@ class Ops(FastEnum):
REDUCE_AXIS = auto()

# helper ops
- GEP = auto(); VECTORIZE = auto() # noqa: E702
+ GEP = auto(); VECTORIZE = auto(); CAT = auto() # noqa: E702

# UnaryOps
CAST = auto(); BITCAST = auto(); EXP2 = auto(); LOG2 = auto(); SIN = auto(); SQRT = auto(); RECIP = auto(); NEG = auto() # noqa: E702
@@ -152,6 +152,7 @@ class Ops(FastEnum):
# device
DEVICE = auto()
MULTI = auto()
+ CUSTOM = auto()

class GroupOp:
Unary = {Ops.EXP2, Ops.LOG2, Ops.SIN, Ops.SQRT, Ops.RECIP, Ops.NEG}
@@ -593,7 +594,7 @@ def const_factor(self) -> int:
if self.op is Ops.ADD: return math.gcd(self.src[0].const_factor(), self.src[1].const_factor())
if self.op is Ops.MUL: return self.src[0].arg if self.src[0].op is Ops.CONST else self.src[1].arg if self.src[1].op is Ops.CONST else 1
return 1
- def divides(self, v) -> UOp|None:
+ def divides(self, v:int) -> UOp|None:
if v==1: return self
if self.op is Ops.CONST: return self.const_like(self.arg//v) if self.arg%v == 0 else None
if self.op is Ops.VCONST: return self.const_like(tuple(x//v for x in self.arg)) if all(x%v == 0 for x in self.arg) else None
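A quick check of the newly annotated divides, with behavior read off the constant branch shown above (UOp.const is assumed available as in tinygrad):

from tinygrad.dtype import dtypes
from tinygrad.ops import UOp

c12 = UOp.const(dtypes.int, 12)
assert c12.divides(4).arg == 3  # evenly divisible: returns the folded constant 12//4
assert c12.divides(5) is None   # not divisible: None lets callers fall back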
4 changes: 3 additions & 1 deletion tinygrad/renderer/cstyle.py
@@ -52,6 +52,8 @@
(UPat(Ops.GEP, name="x"), lambda ctx,x: ctx[x.src[0]] + \
(f"[{x.arg[0]}]" if x.src[0].dtype.count > (8 if ctx.device in {"CUDA", "NV"} else 4) or ctx.device in {'CLANG', 'DSP'} else \
f".{'xyzwabcd'[x.arg[0]]}")),
+ # custom passes through with format
+ (UPat(Ops.CUSTOM, name="x"), lambda ctx,x: x.arg.format(*[ctx[y] for y in x.src])),
])

extra_pm = PatternMatcher([
@@ -148,7 +150,7 @@ def render(self, name:str, uops:list[UOp]) -> str:
assert l is not None, f"failed to render {u.op} {u.dtype} {[(x.op,x.dtype) for x in u.src]} {u.arg}"

if u.op in {Ops.ENDIF, Ops.ENDRANGE}: depth -= 1
- if u.op in {Ops.CONST, Ops.GEP, Ops.INDEX} or \
+ if u.op in {Ops.CONST, Ops.GEP, Ops.INDEX, Ops.CUSTOM} or \
(u.op in {Ops.VECTORIZE, *GroupOp.ALU, Ops.CAST, Ops.BITCAST} and child_count[u] == 1 and not getenv("EXPAND_SSA")):
r[u] = l
else:
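The CUSTOM rule treats the UOp's arg as a str.format template over the already-rendered sources, which is all the lambda above does with ctx[y]. A plain-Python illustration with made-up rendered source names (not renderer output):

arg, rendered_srcs = "{0}", ["val0"]                # dsp_pm below wraps a VECTORIZE source as CUSTOM with arg "{0}"
print(arg.format(*rendered_srcs))                   # -> val0
print("__builtin_HEXAGON_V6_vd0_128B()".format())  # a zero-source CUSTOM renders as the literal intrinsic call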
15 changes: 10 additions & 5 deletions tinygrad/runtime/ops_amd.py
@@ -5,7 +5,7 @@
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
from tinygrad.ops import sint
- from tinygrad.device import BufferSpec
+ from tinygrad.device import BufferSpec, CPUProgram
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
@@ -151,8 +151,7 @@ def _submit(self, dev:AMDDevice):
for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value

dev.compute_queue.put_value += len(cmds)
- dev.compute_queue.write_ptr[0] = dev.compute_queue.put_value
- dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
+ dev.compute_queue.signal_doorbell()

class AMDCopyQueue(HWQueue):
def __init__(self, max_copy_size=0x40000000):
@@ -236,8 +235,7 @@ def _submit(self, dev:AMDDevice):
dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
dev.sdma_queue.put_value += rem_packet_cnt * 4

- dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value
- dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value
+ dev.sdma_queue.signal_doorbell()

class AMDProgram(HCQProgram):
def __init__(self, dev:AMDDevice, name:str, lib:bytes):
@@ -298,6 +296,13 @@ class AMDQueueDesc:
doorbell: memoryview
put_value: int = 0

+ def signal_doorbell(self):
+ self.write_ptr[0] = self.put_value

+ # Ensure all prior writes are visible to the GPU.
+ if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
+ self.doorbell[0] = self.put_value

class KFDIface:
kfd:HWInterface|None = None
event_page:HCQBuffer|None = None
2 changes: 1 addition & 1 deletion tinygrad/runtime/ops_disk.py
@@ -67,7 +67,7 @@ def __init__(self, device:DiskDevice, size:int, offset=0):
self.device, self.size, self.offset = device, size, offset
def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
def _buf(self) -> memoryview:
- assert hasattr(self.device, "mem"), "DiskBuffer wasn't opened"
+ assert hasattr(self.device, "mem"), f"DiskBuffer wasn't opened: {self.device.device}"
return memoryview(self.device.mem)[self.offset:self.offset+self.size]

MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
9 changes: 9 additions & 0 deletions tinygrad/runtime/ops_dsp.py
@@ -9,11 +9,20 @@
from tinygrad.runtime.autogen import libc, qcom_dsp
if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import

+ from tinygrad.ops import PatternMatcher, UPat

+ dsp_pm = PatternMatcher([
+ (UPat(Ops.VECTORIZE, src=UPat.var("y"))*UPat.var("x"), lambda x,y: UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")*x),
+ (UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
+ lambda d: d.replace(src=(UOp(Ops.CUSTOM, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:]))
+ ])

class DSPRenderer(ClangRenderer):
device = "DSP"
supports_float4 = True
buffer_suffix = " restrict __attribute__((align_value(128)))"
kernel_prefix = "__attribute__((noinline)) "
+ extra_matcher = dsp_pm+ClangRenderer.extra_matcher
type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
7 changes: 5 additions & 2 deletions tinygrad/runtime/ops_nv.py
@@ -6,7 +6,7 @@
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
from tinygrad.ops import sint
- from tinygrad.device import BufferSpec
+ from tinygrad.device import BufferSpec, CPUProgram
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.cstyle import NVRenderer
@@ -122,6 +122,8 @@ def _submit_to_gpfifo(self, dev:NVDevice, gpfifo:GPFifo):

gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self._q) << 42) | (1 << 41)
gpfifo.controls.GPPut = (gpfifo.put_value + 1) % gpfifo.entries_count

+ if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
dev.gpu_mmio[0x90 // 4] = gpfifo.token
gpfifo.put_value += 1

@@ -141,6 +143,7 @@ def exec(self, prg:NVProgram, args_state:NVArgsState, global_size:tuple[sint, ..

self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I')
self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H')
+ self.bind_sints_to_ptr(*local_size, *global_size, ptr=args_state.ptr, fmt='I')
qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr)

if self.active_qmd is None:
@@ -188,7 +191,7 @@ def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)

class NVArgsState(CLikeArgsState):
def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
- if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)]
+ if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)]
super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0)

class NVProgram(HCQProgram):
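The mockgpu change at the top of this diff and the NVArgsState change here appear to be two halves of the same move: the buffer and value counts now sit at 32-bit word offset 80 of constant buffer 0 instead of offset 0. A toy layout check (offsets from the diff, values made up):

constbuffer_0 = [0] * 88                                    # stand-in for the start of const buffer 0
constbuffer_0[80:82] = [3, 2]                               # NVArgsState writes len(bufs), len(vals) here
args_cnt, vals_cnt = constbuffer_0[80], constbuffer_0[81]   # what the mock's execute_qmd now reads
assert (args_cnt, vals_cnt) == (3, 2) and 80*4 < 0x160      # still inside the 0x160 bytes the mock maps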
2 changes: 1 addition & 1 deletion tinygrad/spec.py
@@ -115,7 +115,7 @@
# NOTE: for testing, we let sinks be anything
#(UPat(UOps.SINK, src=UPat(UOps.STORE)), lambda: True),
(UPat(Ops.SINK, dtypes.void), lambda: True),
- (UPat(Ops.NOOP), lambda: True),
+ (UPat((Ops.NOOP, Ops.CUSTOM)), lambda: True),

# PTX LOAD/STORE
(UPat((Ops.LOAD, Ops.STORE), src=(UPat(dtype=dtypes.int64),), allow_any_len=True), lambda: True),