import numpy as np
import ctypes
from tinygrad import Tensor, GlobalCounters, Context
from tinygrad.engine.realize import lower_schedule, CompiledRunner
from tinygrad.device import CPUProgram
from dataclasses import replace
from keystone import Ks, KS_ARCH_ARM64, KS_MODE_LITTLE_ENDIAN

# only the memory access, over 100 GB/s! (sometimes)
# note: the loop below never accumulates the loaded q registers, so the value it stores
# is 0.0; it exists purely to measure load bandwidth. the allclose check at the bottom
# is only expected to pass with the C kernel further down.
reduce_asm = """
movi v0.2d, #0000000000000000
mov w9, #0x30
mov w10, #0x20
mov x8, #-0x10
movi v1.2d, #0000000000000000
movk w9, #0x300, lsl #16
movi v2.2d, #0000000000000000
movk w10, #0x200, lsl #16
movi v3.2d, #0000000000000000
mov w11, #0x1000000
mov w12, #0x3ffff0
loop:
ldp q4, q5, [x1]
add x13, x1, x11
add x15, x1, x10
add x14, x1, x9
add x8, x8, #0x10
cmp x8, x12
ldp q6, q7, [x1, #0x20]
add x1, x1, #0x40
ldp q4, q5, [x13]
ldp q6, q7, [x13, #0x20]
ldp q4, q5, [x15, #-0x20]
ldp q6, q7, [x15]
ldp q4, q5, [x14, #-0x30]
ldp q6, q7, [x14, #-0x10]
b.lo loop
fadd v0.4s, v1.4s, v0.4s
fadd v0.4s, v2.4s, v0.4s
fadd v0.4s, v3.4s, v0.4s
dup v1.4s, v0.s[1]
dup v2.4s, v0.s[2]
fadd v1.4s, v0.4s, v1.4s
dup v0.4s, v0.s[3]
fadd v1.4s, v2.4s, v1.4s
fadd v0.4s, v0.4s, v1.4s
str s0, [x0]
ret
"""
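# reading off the constants above: x1 is the input buffer, walked as four streams
# 0x1000000 bytes (16 MiB) apart; each iteration issues eight ldp q-pair loads
# (8*32 = 256 bytes) and the loop runs for 0x40000 iterations, i.e. the full
# 4096*4096*4 bytes = 64 MiB of input.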

# assemble the kernel above into raw arm64 machine code with keystone
ks = Ks(KS_ARCH_ARM64, KS_MODE_LITTLE_ENDIAN)
arm_bytecode, _ = ks.asm(reduce_asm)
arm_bytecode = bytes(arm_bytecode)
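
# (optional sketch, not part of the original script) round-trip the assembled bytes
# through capstone's disassembler as a sanity check, if capstone is installed.
try:
  from capstone import Cs, CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN
  for insn in Cs(CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN).disasm(arm_bytecode, 0):
    print(f"{insn.address:#05x}: {insn.mnemonic} {insn.op_str}")
except Exception:
  pass
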
reduce_src = """
// data1 is 16M inputs
typedef float float4 __attribute__((aligned(32),vector_size(16)));
void reduce(float* restrict data0, float* restrict data1) {
  float4 acc0 = {0.0f, 0.0f, 0.0f, 0.0f};
  float4 acc1 = {0.0f, 0.0f, 0.0f, 0.0f};
  float4 acc2 = {0.0f, 0.0f, 0.0f, 0.0f};
  float4 acc3 = {0.0f, 0.0f, 0.0f, 0.0f};
  float4 acc4 = {0.0f, 0.0f, 0.0f, 0.0f};
  float4 acc5 = {0.0f, 0.0f, 0.0f, 0.0f};
  float4 acc6 = {0.0f, 0.0f, 0.0f, 0.0f};
  float4 acc7 = {0.0f, 0.0f, 0.0f, 0.0f};
  float* data1_1 = data1+4194304;
  float* data1_2 = data1+(4194304*2);
  float* data1_3 = data1+(4194304*3);
  for (int ridx0 = 0; ridx0 < 16777216/4; ridx0+=16) {
    float4 val0 = *(float4*)((data1+(ridx0+0)));
    float4 val1 = *(float4*)((data1+(ridx0+4)));
    float4 val2 = *(float4*)((data1+(ridx0+8)));
    float4 val3 = *(float4*)((data1+(ridx0+12)));
    acc0 += val0;
    acc1 += val1;
    acc2 += val2;
    acc3 += val3;
    val0 = *(float4*)((data1_1+(ridx0+0)));
    val1 = *(float4*)((data1_1+(ridx0+4)));
    val2 = *(float4*)((data1_1+(ridx0+8)));
    val3 = *(float4*)((data1_1+(ridx0+12)));
    acc4 += val0;
    acc5 += val1;
    acc6 += val2;
    acc7 += val3;
    val0 = *(float4*)((data1_2+(ridx0+0)));
    val1 = *(float4*)((data1_2+(ridx0+4)));
    val2 = *(float4*)((data1_2+(ridx0+8)));
    val3 = *(float4*)((data1_2+(ridx0+12)));
    acc0 += val0;
    acc1 += val1;
    acc2 += val2;
    acc3 += val3;
    val0 = *(float4*)((data1_3+(ridx0+0)));
    val1 = *(float4*)((data1_3+(ridx0+4)));
    val2 = *(float4*)((data1_3+(ridx0+8)));
    val3 = *(float4*)((data1_3+(ridx0+12)));
    acc4 += val0;
    acc5 += val1;
    acc6 += val2;
    acc7 += val3;
  }
  float4 out = acc0+acc1+acc2+acc3+acc4+acc5+acc6+acc7;
  *(data0+0) = out[0]+out[1]+out[2]+out[3];
}
"""
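# the C version mirrors the assembly: four pointers 16 MiB apart and eight independent
# float4 accumulators so the adds don't pile up into a single dependency chain. each
# call still reads 64 MiB, so 100 GB/s corresponds to roughly 0.67 ms per reduce.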

if __name__ == "__main__":
  a = Tensor(np_array:=(np.random.default_rng().random((4096, 4096), dtype=np.float32)-0.5)).realize()
  with Context(SPLIT_REDUCEOP=0):
    # TODO: make it easy to alter the OptOps for a ScheduleItem
    GlobalCounters.reset()
    out = a.sum()
    sis = out.schedule()
    for i,ei in enumerate(lower_schedule(sis)):
      if i == 0:
        # change the source code
        prg_spec = ei.prg.p
        prg_spec = replace(prg_spec, name="reduce", src=reduce_src)
        prg = CompiledRunner(prg_spec)
        # change the assembly
        #prg._prg = CPUProgram(prg_spec.name, arm_bytecode)
        print("buffer at:", hex(ctypes.addressof(ei.bufs[1]._buf)))
        ei = replace(ei, prg=prg)
      ei.run()
    print(out.item())
    np.testing.assert_allclose(out.item(), np_array.sum(), atol=1, rtol=1e-4)
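
    # (sketch, not part of the original script) rough bandwidth check for the replaced
    # kernel; assumes the schedule was a single item, so `ei` is still the custom reduce,
    # and that ei.run() can be re-invoked on the already-realized buffers.
    import time
    for _ in range(5):
      st = time.perf_counter()
      ei.run()
      print(f"~{4096*4096*4/(time.perf_counter()-st)/1e9:.1f} GB/s")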