Commit a8e54df

benchmark single kernel launch (tinygrad#8921)
* benchmark kernel launch
* don't realize unneeded
* faster
* faster metal
* fix mypy
* without sync
* no div 0
* lru cache that
* no sync in the profile
1 parent 3e082d4 commit a8e54df

File tree: 6 files changed, +54 -9 lines

New file (benchmark script, +38 lines):

@@ -0,0 +1,38 @@
+import time
+from tinygrad import Tensor, TinyJit, Device, Context
+from tinygrad.helpers import Profiling, Timing, GlobalCounters
+
+# python3 test/test_speed_v_torch.py TestSpeed.test_add_a
+
+@TinyJit
+def plus(a:Tensor, b:Tensor): return a+b
+
+if __name__ == "__main__":
+  a = Tensor([1]).realize()
+  b = Tensor([1]).realize()
+  for i in range(5):
+    with Timing(prefix=f"{i}:"):
+      c = plus(a,b)
+      Device[c.device].synchronize()
+    assert c.item() == 2
+  for i in range(5):
+    st = time.perf_counter()
+    c = plus(a,b)
+    et = time.perf_counter() - st
+    print(f"nosync {i}: {et*1e6:.2f} us")
+  Device[c.device].synchronize()
+  for i in range(5):
+    st = time.perf_counter()
+    c = plus(a,b)
+    Device[c.device].synchronize()
+    et = time.perf_counter() - st
+    print(f"precise {i}: {et*1e6:.2f} us")
+  assert GlobalCounters.time_sum_s == 0
+  with Context(DEBUG=2):
+    st = time.perf_counter()
+    c = plus(a,b)
+    Device[c.device].synchronize()
+    et = time.perf_counter() - st
+    print(f"kernel {GlobalCounters.time_sum_s*1e3:.2f} ms / full {et*1e3:.2f} ms -- {et/(GlobalCounters.time_sum_s+1e-12):.2f} x")
+  with Profiling():
+    c = plus(a,b)
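
The script times the same jitted add three ways: wall-clock with a device sync inside the Timing block, enqueue-only ("nosync"), and enqueue plus sync ("precise"), then compares the kernel time accumulated under DEBUG=2 against full wall-clock time and ends with a Python-level profile. A minimal sketch of the sync/no-sync measurement pattern, with hypothetical fn and sync callables standing in for the jitted call and Device[c.device].synchronize():

import time

def bench(fn, sync, n=5):
  # "nosync": only the Python-side cost of enqueueing work
  for i in range(n):
    st = time.perf_counter()
    fn()
    print(f"nosync {i}: {(time.perf_counter()-st)*1e6:.2f} us")
  sync()  # drain queued work so it doesn't leak into the next loop
  # "precise": wait for the device, so execution time is included
  for i in range(n):
    st = time.perf_counter()
    fn()
    sync()
    print(f"precise {i}: {(time.perf_counter()-st)*1e6:.2f} us")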

test/test_speed_v_torch.py (+6 -2)

@@ -202,8 +202,12 @@ def test_mul_sum(self):
     def f(a, b): return (a*b).sum()
     helper_test_generic_square('mul_sum', 4096, f, f)
 
-  def test_add(self):
-    for N in [1, 1024, 4096]:
+  def test_add_a(self):
+    def f(a, b): return a + b
+    helper_test_generic_square('add', 1, f, f)
+
+  def test_add_big(self):
+    for N in [1024, 4096]:
       def f(a, b): return a + b
       helper_test_generic_square('add', N, f, f)
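
Splitting test_add keeps the single-element case, where launch overhead dominates, in its own test_add_a, while the large-tensor sweep moves to test_add_big; as the comment in the new benchmark script notes, the small case can be run directly with python3 test/test_speed_v_torch.py TestSpeed.test_add_a.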

tinygrad/engine/jit.py (+1 -1)

@@ -193,7 +193,7 @@ def __call__(self, input_buffers:list[Buffer], var_vals:dict[Variable, int]) ->
 def _prepare_jit_inputs(args, kwargs):
   input_tensors: list[tuple[int|str, Tensor]] = [(name,t) for name,t in list(enumerate(args))+sorted(kwargs.items()) if t.__class__ is Tensor]
   names, tensors = [name for name,_ in input_tensors], [t for _,t in input_tensors]
-  if tensors: Tensor.realize(*tensors)
+  if len(unrealized_tensors := [x for x in tensors if not x.lazydata.is_realized]): Tensor.realize(*unrealized_tensors)
   # TODO: should we be unpacking multi here?
   lbs: list[UOp] = flatten([t.lazydata.src if t.lazydata.op is Ops.MULTI else [t.lazydata] for t in tensors])
   input_buffers: list[Buffer] = [lb.base.realized for lb in lbs if lb.base.realized is not None]
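
Realizing JIT inputs is now skipped when every input is already realized (the "don't realize unneeded" item in the commit message), so Tensor.realize is only called for tensors that still need a buffer. A rough standalone sketch of the guard, reusing the lazydata.is_realized check from the diff in a hypothetical helper:

from tinygrad import Tensor

def maybe_realize(tensors: list[Tensor]) -> None:
  # realize only the inputs whose lazy data is not yet backed by a buffer;
  # if the filtered list is empty, realize (and its scheduling) never runs
  if len(unrealized := [t for t in tensors if not t.lazydata.is_realized]):
    Tensor.realize(*unrealized)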

tinygrad/ops.py (+2 -2)

@@ -512,8 +512,8 @@ def metadata(self): return all_metadata.get(self, None)
 
   @property
   def base(self) -> UOp:
-    if self.op in GroupOp.Movement: return self.src[0].base
-    return self.src[0].base if self.op is Ops.VIEW and len(self.src) == 1 else self
+    if (self.op is Ops.VIEW and len(self.src) == 1) or self.op in GroupOp.Movement: return self.src[0].base
+    return self
   def view(self, new_st:ShapeTracker) -> UOp: return UOp(Ops.VIEW, self.dtype, (self.base,), new_st)
 
   def _mop(self, op:Ops, arg):
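
The behaviour of base is unchanged: a VIEW with exactly one source and any movement op both resolve to src[0].base, and every other UOp is its own base; the two equivalent branches are simply merged into one condition with a plain return self afterwards.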

tinygrad/runtime/ops_metal.py (+6 -4)

@@ -42,6 +42,7 @@ def msg(ptr: objc_id, selector: str, /, *args: Any, restype: type[T] = objc_id)
   sender.restype = restype
   return sender(ptr, sel(selector), *args)
 
+@functools.lru_cache(None)
 def to_ns_str(s: str): return msg(libobjc.objc_getClass(b"NSString"), "stringWithUTF8String:", s.encode(), restype=objc_instance)
 def from_ns_str(s): return bytes(msg(s, "UTF8String", restype=ctypes.c_char_p)).decode()
 
@@ -146,21 +147,22 @@ def __init__(self, dev:MetalDevice, name:str, lib:bytes):
     self.pipeline_state = msg(self.dev.sysdevice, "newComputePipelineStateWithDescriptor:options:reflection:error:",
       descriptor, MTLPipelineOption.MTLPipelineOptionNone, None, ctypes.byref(error_pipeline_creation:=objc_instance()), restype=objc_instance)
     error_check(error_pipeline_creation)
+    # cache these msg calls
+    self.max_total_threads: int = cast(int, msg(self.pipeline_state, "maxTotalThreadsPerThreadgroup", restype=ctypes.c_ulong))
 
   def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
-    max_total_threads = msg(self.pipeline_state, "maxTotalThreadsPerThreadgroup", restype=ctypes.c_ulong)
-    if prod(local_size) > cast(int, max_total_threads):
+    if prod(local_size) > self.max_total_threads:
       exec_width = msg(self.pipeline_state, "threadExecutionWidth", restype=ctypes.c_ulong)
       memory_length = msg(self.pipeline_state, "staticThreadgroupMemoryLength", restype=ctypes.c_ulong)
-      raise RuntimeError(f"local size {local_size} bigger than {max_total_threads} with exec width {exec_width} memory length {memory_length}")
+      raise RuntimeError(f"local size {local_size} bigger than {self.max_total_threads} with exec width {exec_width} memory length {memory_length}")
     command_buffer = msg(self.dev.mtl_queue, "commandBuffer", restype=objc_instance)
     encoder = msg(command_buffer, "computeCommandEncoder", restype=objc_instance)
     msg(encoder, "setComputePipelineState:", self.pipeline_state)
     for i,a in enumerate(bufs): msg(encoder, "setBuffer:offset:atIndex:", a.buf, a.offset, i)
     for i,a in enumerate(vals, start=len(bufs)): msg(encoder, "setBytes:length:atIndex:", bytes(ctypes.c_int(a)), 4, i)
     msg(encoder, "dispatchThreadgroups:threadsPerThreadgroup:", to_struct(*global_size), to_struct(*local_size))
     msg(encoder, "endEncoding")
-    msg(command_buffer, "setLabel:", to_ns_str(self.name))
+    msg(command_buffer, "setLabel:", to_ns_str(self.name)) # TODO: is this always needed?
     msg(command_buffer, "commit")
     self.dev.mtl_buffers_in_flight.append(command_buffer)
     if wait:
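
Both Metal changes pull objc round-trips out of the per-launch path: to_ns_str results are memoized with functools.lru_cache, and maxTotalThreadsPerThreadgroup is read once when the pipeline state is built instead of on every call. A minimal sketch of the memoization idea, with a stand-in conversion instead of the real objc call:

import functools

@functools.lru_cache(maxsize=None)
def to_label(s: str) -> str:
  # stand-in for to_ns_str: the result for a given Python string never changes,
  # so repeated launches of the same kernel become a dict lookup
  return f"<NSString:{s}>"

to_label("add_kernel")
to_label("add_kernel")  # second call hits the cache, no conversion work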

tinygrad/shape/shapetracker.py (+1)

@@ -109,6 +109,7 @@ def var_vals(self) -> dict[Variable, int]: return merge_dicts([dict([v.unbind()]
 
   def unbind(self) -> tuple[ShapeTracker, dict[Variable, int]]:
     unbound_views, var_vals = zip(*[v.unbind() for v in self.views])
+    if all(len(x) == 0 for x in var_vals): return self, {}
     return ShapeTracker(tuple(unbound_views)), merge_dicts(var_vals)
 
   def real_strides(self, ignore_valid=False) -> tuple[Optional[sint], ...]: return views_to_real_strides(self.views, ignore_valid)
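
unbind gains a fast path: when no view has bound variables, every per-view unbind yields an empty dict, and the early return hands back the original ShapeTracker instead of constructing a new one and merging empty dicts on each call.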
