From 6811688d2952e74fdf53082e459033fceba0f88b Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Wed, 12 Feb 2025 18:27:35 +0200 Subject: [PATCH 1/5] disallow VIEW(BUFFER) in tensor [pr] (#9041) --- tinygrad/spec.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tinygrad/spec.py b/tinygrad/spec.py index 567875d664d42..89d46df9075d5 100644 --- a/tinygrad/spec.py +++ b/tinygrad/spec.py @@ -32,11 +32,6 @@ # NOTE: the arg here specifies clone=True, which prevents folding same device copy (UPat(Ops.COPY, name="copy", src=(UPat(Ops.DEVICE), UPat.var("x"))), lambda copy,x: isinstance(copy.arg, bool) and copy.dtype == x.dtype), - # VIEW(BUFFER) applies a ShapeTracker on top of the underlying device buffer - # NOTE: VIEW size exactly matches the underlying BUFFER, tensor doesn't apply movement ops to the VIEW - (UPat(Ops.VIEW, name="view", src=(UPat(Ops.BUFFER, name="buf"),)), - lambda view,buf: view.dtype == buf.dtype and view.size == buf.size and view.st.contiguous), - # ASSIGN changes the value of a realized buffer (UPat(Ops.ASSIGN, name="assign", src=(UPat.var("target"), UPat.var("new_val"))), lambda assign,target,new_val: target.is_realized and (assign.dtype == target.dtype == new_val.dtype)), From f53b8196480fba732408ad915a4e4e907c863b28 Mon Sep 17 00:00:00 2001 From: chenyu Date: Wed, 12 Feb 2025 12:53:23 -0500 Subject: [PATCH 2/5] UOps. -> Ops. [pr] (#9044) updated the comments and doc except extra --- test/test_linearizer.py | 20 ++++++++++---------- test/test_linearizer_failures.py | 6 +++--- tinygrad/codegen/kernel.py | 4 ++-- tinygrad/ops.py | 2 +- tinygrad/renderer/__init__.py | 4 ++-- tinygrad/spec.py | 2 +- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 6084eb7c9aefb..29fea76f5a32e 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -521,7 +521,7 @@ def test_var_multireduce(self): first_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((3, 27, 1, 32)).expand((3, 27, 32, 32)).to_uop())) first_reduce = UOp(Ops.REDUCE_AXIS, dtypes.float, (first_x,), (Ops.ADD, (3,))) neg_mean = first_reduce * ast_const(dtypes.float, -0.03125, (3, 27, 32, 1)) - # store = UOp(UOps.STORE, src=(g0, ShapeTracker.from_shape((3, 27, 32, 1)).to_uop(), mean)) + # store = UOp(Ops.STORE, src=(g0, ShapeTracker.from_shape((3, 27, 32, 1)).to_uop(), mean)) # verify_lazyop(store) second_x = UOp(Ops.LOAD, dtypes.float, (g1, x.lazydata.st.reshape((3, 27, 32, 1)).to_uop())) squares = (second_x+neg_mean)*(second_x+neg_mean) @@ -854,7 +854,7 @@ def test_two_nested_range(self): ranges = [i for i,u in enumerate(lin.uops) if u.op is Ops.RANGE] assert len(ranges) == 1 # NOTE: it collapses now # RANGE -> LOAD -> RANGE -> ASSIGN - #assert any(x.op is UOps.LOAD for x in lin.uops[ranges[0]:ranges[1]]) + #assert any(x.op is Ops.LOAD for x in lin.uops[ranges[0]:ranges[1]]) def test_three_nested_range(self): a = Tensor.randn(2, ).realize() @@ -865,7 +865,7 @@ def test_three_nested_range(self): # RANGE -> RANGE -> LOAD -> RANGE -> ASSIGN # NOTE: nothing should toposort between the first two ranges #assert ranges[0]+1 == ranges[1] - #assert any(x.op is UOps.LOAD for x in lin.uops[ranges[1]:ranges[2]]) + #assert any(x.op is Ops.LOAD for x in lin.uops[ranges[1]:ranges[2]]) def test_two_nested_range_alt_indexing(self): a = Tensor([2, 2]).realize() @@ -895,14 +895,14 @@ def test_range_outer_op_before_phi_nested_range(self): assert len(ranges) == 1 # NOTE: it collapses now #if getenv("PTX"): # LOAD -> 
RANGE -> CAST -> ALU -> ALU -> LOAD -> ALU -> RANGE -> ALU -> ASSIGN - # assert lin.uops[ranges[0]-2].op is UOps.LOAD + # assert lin.uops[ranges[0]-2].op is Ops.LOAD # assert ranges[1] == ranges[0]+6 - # assert [x.op for x in lin.uops[ranges[1]-2:ranges[1]]] == [UOps.LOAD, UOps.ALU] + # assert [x.op for x in lin.uops[ranges[1]-2:ranges[1]]] == [Ops.LOAD, Ops.ALU] # LOAD -> RANGE -> LOAD -> ALU -> RANGE -> ASSIGN #else: - # assert lin.uops[ranges[0]-2].op is UOps.LOAD + # assert lin.uops[ranges[0]-2].op is Ops.LOAD # assert ranges[1] == ranges[0]+3 - # assert [x.op for x in lin.uops[ranges[1]-2:ranges[1]]] == [UOps.LOAD, UOps.ALU] + # assert [x.op for x in lin.uops[ranges[1]-2:ranges[1]]] == [Ops.LOAD, Ops.ALU] def test_range_outer_op_after_phi(self): a = Tensor.randn(4, 1).realize() @@ -1306,7 +1306,7 @@ def test_grouped_store_phis(self): # check that the float4 cast collapses store_vals = [u.src[-1] for u in k.uops if u.op is Ops.STORE] for val in store_vals: - assert val.dtype == dtypes.float.vec(4) # and val.op is not UOps.VECTORIZE + assert val.dtype == dtypes.float.vec(4) # and val.op is not Ops.VECTORIZE @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals") @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared") @@ -1345,7 +1345,7 @@ def get_recursive(uop): return set.union(set(uop.src), [uop], *[get_recursive(v) barrier = [u for u in k.uops if u.op is Ops.BARRIER][0] # check that the float4 cast collapses for all stores for store in local_stores+global_stores: - assert store.src[-1].dtype.count > 1 # and store.src[2].op is not UOps.VECTORIZE + assert store.src[-1].dtype.count > 1 # and store.src[2].op is not Ops.VECTORIZE # # check the children's vins # TODO: src ALU are not the same, should it? # assert barrier.src == tuple(local_stores) @@ -1362,7 +1362,7 @@ def test_grouped_store_local_only(self): # the float4 value stores directly in lds and we skip upcast self.assertEqual(stores[0].src[-1].dtype, dtypes.float.vec(4)) - #assert stores[0].src[-1].op is not UOps.VECTORIZE + #assert stores[0].src[-1].op is not Ops.VECTORIZE # the global store doesn't change assert stores[1].src[-1].dtype == dtypes.float diff --git a/test/test_linearizer_failures.py b/test/test_linearizer_failures.py index 8bacd0b6203bc..b37bb524b3bdd 100644 --- a/test/test_linearizer_failures.py +++ b/test/test_linearizer_failures.py @@ -117,7 +117,7 @@ def test_failure_6(self): ast_const(dtypes.int, 10, st_src=( UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(10, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=()),)),)),)),)) opts = [Opt(op=OptOps.UPCAST, axis=0, arg=2), Opt(op=OptOps.UPCAST, axis=0, arg=0)] - # COMPILE FAILED, KeyError: UOps.CONST + # COMPILE FAILED, KeyError: Ops.CONST helper_test_lin(Kernel(ast), opts, failed_platforms=[]) def test_failure_7(self): @@ -804,7 +804,7 @@ def test_failure_32(self): helper_test_lin(Kernel(ast), opts=opts, failed_platforms=[], atol=0.1, rtol=0.05) def test_failure_33(self): - # UOps.UNMUL left after linearize + # Ops.UNMUL left after linearize ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0, src=()), @@ -868,7 +868,7 @@ def test_failure_35(self): self.test_failure_34(True) # from world fuzz_linearizer: PYTHONPATH=. 
METAL=1 FUZZ_ALL_ACTIONS=1 DEPTH=1 FUZZ_N=100 FUZZ_NTH=84 python3 ./test/external/fuzz_linearizer.py def test_failure_36(self): - # UOps.UNMUL left after linearize + # Ops.UNMUL left after linearize ast = UOp(Ops.SINK, dtypes.void, arg=None, src=( UOp(Ops.STORE, dtypes.void, arg=None, src=( UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(), arg=0, src=()), diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 646eb42b58a11..888edcc850a63 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -325,8 +325,8 @@ def apply_tensor_cores(self, use_tensor_cores=1, extra_opts:Optional[list[Opt]]= -1: iterates through all available tensor cores in order and uses the first one that matches the requirements (dims and dtypes) [0-N]: uses only the n'th tensor core available; useful for search tc_opt -- controls which kinds of kernels may be eligible for tensor cores application (default 2 during BEAM, 0 otherwise) - 0: applies to only kernels with a single reduce axis and direct UOps.LOAD into Ops.MUL - 1: allows kernels with multiple reduce axes and also multiplication of UOps.CAST'd buffers + 0: applies to only kernels with a single reduce axis and direct Ops.LOAD into Ops.MUL + 1: allows kernels with multiple reduce axes and also multiplication of Ops.CAST'd buffers 2: allows kernels with M, N, K axes that are not multiples of the tensor core dimensions by applying padding those axes as needed """ if tc_select is None: tc_select = TC_SELECT.value diff --git a/tinygrad/ops.py b/tinygrad/ops.py index c1f2d457ff765..4788c6715fa25 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -638,7 +638,7 @@ def _min_max(self) -> tuple[ConstType, ConstType]: if self.op is Ops.RANGE: return self.src[0].vmin, (self.src[1]-1).vmax if self.op is Ops.BIND: return self.src[0]._min_max # ignore the bound value if self.op in {Ops.UNROLL, Ops.VECTORIZE}: return min(x.vmin for x in self.src), max(x.vmax for x in self.src) - # TODO: UOps.SPECIAL is UOps.DEFINE_VAR + # TODO: Ops.SPECIAL is Ops.DEFINE_VAR if self.op is Ops.SPECIAL: return 0, self.arg[1]-1 if isinstance(self.arg[1], int) else self.arg[1].vmax if self.op is Ops.CONST: return self.arg, self.arg if self.op is Ops.VCONST: return (min(self.arg), max(self.arg)) diff --git a/tinygrad/renderer/__init__.py b/tinygrad/renderer/__init__.py index e01301483a09c..884c6c4fc0a33 100644 --- a/tinygrad/renderer/__init__.py +++ b/tinygrad/renderer/__init__.py @@ -121,8 +121,8 @@ class Renderer: has_local: bool = True has_shared: bool = True # NOTE: these two should be in (x,y,z) order to match the max_sizes argument in get_grouped_dims - global_max: Optional[tuple[int, ...]] = (0x8FFFFFFF,) * (3) # TODO: UOps.SPECIAL int32 indexes right now - local_max: Optional[tuple[int, ...]] = (0x8FFFFFFF,) * (3) # TODO: UOps.SPECIAL int32 indexes right now + global_max: Optional[tuple[int, ...]] = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now + local_max: Optional[tuple[int, ...]] = (0x8FFFFFFF,) * (3) # TODO: Ops.SPECIAL int32 indexes right now shared_max: int = 32768 tensor_cores: list[TensorCore] = [] extra_matcher: Optional[PatternMatcher] = None diff --git a/tinygrad/spec.py b/tinygrad/spec.py index 89d46df9075d5..2355aec5428e8 100644 --- a/tinygrad/spec.py +++ b/tinygrad/spec.py @@ -108,7 +108,7 @@ (UPat(Ops.BARRIER, dtypes.void, src=UPat(Ops.STORE, allow_any_len=True)), lambda: True), # NOTE: all pointers must be local # NOTE: for testing, we let sinks be anything - #(UPat(UOps.SINK, src=UPat(UOps.STORE)), lambda: True), + 
#(UPat(Ops.SINK, src=UPat(Ops.STORE)), lambda: True), (UPat(Ops.SINK, dtypes.void), lambda: True), (UPat((Ops.NOOP, Ops.CUSTOM)), lambda: True), From b3eab0305535529cedda1d77528f1551de175679 Mon Sep 17 00:00:00 2001 From: rmtew Date: Thu, 13 Feb 2025 08:41:00 +1300 Subject: [PATCH 3/5] Three things to get Windows CI working correctly: (#9047) - Ensure that the set backend environment variable is persisted to the next step via $GITHUB_ENV - It doesn't actually persist for Windows unless shell is explicitly set to bash. - Add the assertion to ensure the selected backend is actually used. --- .github/workflows/test.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2709522dffdb5..ea8e174bfd2fa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -612,7 +612,10 @@ jobs: key: windows-minimal deps: testing_minimal - name: Set env - run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1'}}" + shell: bash + run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1'}}" >> $GITHUB_ENV - name: Run pytest (${{ matrix.backend }}) shell: bash - run: python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20 + run: | + python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT" + python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20 From c15486cf3989a26b07fe6674d6d2f7a9820e6cfb Mon Sep 17 00:00:00 2001 From: chenyu Date: Wed, 12 Feb 2025 14:41:16 -0500 Subject: [PATCH 4/5] remove contiguous in test_subbuffer_used [pr] (#9046) test works without contiguous --- test/test_subbuffer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_subbuffer.py b/test/test_subbuffer.py index 40fb7ad3a365d..9227c321dab6f 100644 --- a/test/test_subbuffer.py +++ b/test/test_subbuffer.py @@ -36,8 +36,7 @@ def test_subbuffer_len(self): def test_subbuffer_used(self): t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize() - # TODO: why does it needs contiguous - vt = t[2:4].contiguous().realize() + vt = t[2:4].realize() out = (vt + 100).tolist() assert out == [102, 103] From f4f56d7c1552d680fc85e780647cbcdca69d8741 Mon Sep 17 00:00:00 2001 From: chenyu Date: Wed, 12 Feb 2025 15:49:58 -0500 Subject: [PATCH 5/5] move time_linearizer to extra.optimization.helpers [pr] (#9048) no longer used in tinygrad --- examples/handcode_opt.py | 3 ++- extra/optimization/helpers.py | 21 +++++++++++++++++++++ extra/optimization/rl.py | 4 ++-- extra/optimization/search.py | 4 ++-- extra/optimization/test_net.py | 4 ++-- extra/optimization/test_time_linearizer.py | 4 ++-- test/external/external_benchmark_hcopt.py | 4 ++-- test/external/speed_beam_v_hcopt.py | 4 ++-- test/external/verify_kernel.py | 3 +-- test/test_linearizer_overflows.py | 4 ++-- test/test_search.py | 3 ++- tinygrad/engine/search.py | 19 +------------------ 12 files changed, 41 insertions(+), 36 deletions(-) diff --git a/examples/handcode_opt.py b/examples/handcode_opt.py index b0f0ca6238c33..48c54b56703bb 100644 --- a/examples/handcode_opt.py +++ b/examples/handcode_opt.py @@ -5,8 +5,9 @@ from tinygrad.codegen.kernel import Kernel from tinygrad.ops import Ops, sym_infer from tinygrad.device import Compiled -from tinygrad.engine.search import time_linearizer, beam_search, bufs_from_lin +from tinygrad.engine.search import beam_search, bufs_from_lin from tinygrad.helpers import 
DEBUG, ansilen, getenv, colored, TRACEMETA +from extra.optimization.helpers import time_linearizer def get_sched_resnet(): mdl = ResNet50() diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py index 85a070f42b9d0..dc6defb2a6489 100644 --- a/extra/optimization/helpers.py +++ b/extra/optimization/helpers.py @@ -100,3 +100,24 @@ def lin_to_feats(lin:Kernel, use_sts=True): else: assert len(ret) == 274, f"wrong len {len(ret)}" return ret + +from tinygrad.device import Device, Buffer +from tinygrad.engine.search import _ensure_buffer_alloc, _time_program +from tinygrad.helpers import to_function_name, CACHELEVEL, diskcache_put + +def time_linearizer(lin:Kernel, rawbufs:list[Buffer], allow_test_size=True, max_global_size=65536, cnt=3, disable_cache=False, clear_l2=False) -> float: # noqa: E501 + key = {"ast": lin.ast.key, "opts": str(lin.applied_opts), "allow_test_size": allow_test_size, + "max_global_size": max_global_size, "clear_l2": clear_l2, "device": lin.opts.device, "suffix": lin.opts.suffix} + if not disable_cache and CACHELEVEL >= 2 and (val:=diskcache_get("time_linearizer", key)) is not None: return min(val) + + dev = Device[lin.opts.device] + assert dev.compiler is not None + + rawbufs = _ensure_buffer_alloc(rawbufs) + var_vals: dict[Variable, int] = {k:int(k.vmax+k.vmin)//2 for k in lin.ast.variables()} + p = lin.to_program() + tms = _time_program(p, dev.compiler.compile(p.src), var_vals, rawbufs, + max_global_size=max_global_size if allow_test_size else None, clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name)) + + if CACHELEVEL >= 2: diskcache_put("time_linearizer", key, tms) + return min(tms) diff --git a/extra/optimization/rl.py b/extra/optimization/rl.py index df0b791a71d80..232002c21796e 100644 --- a/extra/optimization/rl.py +++ b/extra/optimization/rl.py @@ -3,10 +3,10 @@ import math, random from tinygrad.tensor import Tensor from tinygrad.nn.state import get_parameters, get_state_dict, safe_save, safe_load, load_state_dict -from tinygrad.engine.search import actions, bufs_from_lin, time_linearizer, get_kernel_actions +from tinygrad.engine.search import actions, bufs_from_lin, get_kernel_actions from tinygrad.nn.optim import Adam from extra.optimization.extract_policynet import PolicyNet -from extra.optimization.helpers import load_worlds, ast_str_to_lin, lin_to_feats +from extra.optimization.helpers import load_worlds, ast_str_to_lin, lin_to_feats, time_linearizer if __name__ == "__main__": net = PolicyNet() diff --git a/extra/optimization/search.py b/extra/optimization/search.py index 9b8aec9bf5c1d..469487da58d90 100644 --- a/extra/optimization/search.py +++ b/extra/optimization/search.py @@ -1,11 +1,11 @@ import argparse -from extra.optimization.helpers import ast_str_to_lin +from extra.optimization.helpers import ast_str_to_lin, time_linearizer from tinygrad import dtypes from tinygrad.helpers import BEAM, getenv from tinygrad.device import Device, Compiled from tinygrad.codegen.kernel import Kernel -from tinygrad.engine.search import time_linearizer, beam_search, bufs_from_lin +from tinygrad.engine.search import beam_search, bufs_from_lin if __name__ == '__main__': diff --git a/extra/optimization/test_net.py b/extra/optimization/test_net.py index 59844440766b7..0c5b53f99dae0 100644 --- a/extra/optimization/test_net.py +++ b/extra/optimization/test_net.py @@ -6,8 +6,8 @@ from tinygrad.helpers import getenv, colored from tinygrad.tensor import Tensor from tinygrad.nn.state import get_parameters, get_state_dict, safe_save, safe_load, 
load_state_dict -from tinygrad.engine.search import bufs_from_lin, time_linearizer, actions, get_kernel_actions -from extra.optimization.helpers import load_worlds, ast_str_to_lin, lin_to_feats +from tinygrad.engine.search import bufs_from_lin, actions, get_kernel_actions +from extra.optimization.helpers import load_worlds, ast_str_to_lin, lin_to_feats, time_linearizer from extra.optimization.extract_policynet import PolicyNet from extra.optimization.pretrain_valuenet import ValueNet diff --git a/extra/optimization/test_time_linearizer.py b/extra/optimization/test_time_linearizer.py index effabf1e1e34c..4bfb1f0f03ebf 100644 --- a/extra/optimization/test_time_linearizer.py +++ b/extra/optimization/test_time_linearizer.py @@ -1,5 +1,5 @@ -from extra.optimization.helpers import load_worlds, ast_str_to_lin -from tinygrad.engine.search import bufs_from_lin, time_linearizer, get_kernel_actions +from extra.optimization.helpers import load_worlds, ast_str_to_lin, time_linearizer +from tinygrad.engine.search import bufs_from_lin, get_kernel_actions if __name__ == "__main__": ast_strs = load_worlds() diff --git a/test/external/external_benchmark_hcopt.py b/test/external/external_benchmark_hcopt.py index 252b0531c67c3..eb42c7af39f28 100644 --- a/test/external/external_benchmark_hcopt.py +++ b/test/external/external_benchmark_hcopt.py @@ -1,7 +1,7 @@ import random from tinygrad.helpers import getenv -from tinygrad.engine.search import time_linearizer, beam_search, bufs_from_lin -from extra.optimization.helpers import load_worlds, ast_str_to_lin +from tinygrad.engine.search import beam_search, bufs_from_lin +from extra.optimization.helpers import load_worlds, ast_str_to_lin, time_linearizer def optimize_kernel(k): # TODO: update this diff --git a/test/external/speed_beam_v_hcopt.py b/test/external/speed_beam_v_hcopt.py index 447a891ea73cb..c6f0b430fd31e 100644 --- a/test/external/speed_beam_v_hcopt.py +++ b/test/external/speed_beam_v_hcopt.py @@ -1,7 +1,7 @@ from tinygrad import Device from tinygrad.helpers import getenv, DEBUG, BEAM -from tinygrad.engine.search import beam_search, time_linearizer, bufs_from_lin -from extra.optimization.helpers import load_worlds, ast_str_to_lin +from tinygrad.engine.search import beam_search, bufs_from_lin +from extra.optimization.helpers import load_worlds, ast_str_to_lin, time_linearizer if __name__ == "__main__": filter_reduce = bool(getenv("FILTER_REDUCE")) diff --git a/test/external/verify_kernel.py b/test/external/verify_kernel.py index e8c99e74151d0..5a6fb23ba0fc2 100644 --- a/test/external/verify_kernel.py +++ b/test/external/verify_kernel.py @@ -1,10 +1,9 @@ import argparse from collections import defaultdict -from extra.optimization.helpers import kern_str_to_lin +from extra.optimization.helpers import kern_str_to_lin, time_linearizer from test.external.fuzz_linearizer import compare_linearizer from tinygrad.helpers import colored from tinygrad.codegen.kernel import Kernel -from tinygrad.engine.search import time_linearizer # Use this with the LOGKERNS options to verify that all executed kernels are valid and evaluate to the same ground truth results diff --git a/test/test_linearizer_overflows.py b/test/test_linearizer_overflows.py index f5fb749956985..4b6b871f2219f 100644 --- a/test/test_linearizer_overflows.py +++ b/test/test_linearizer_overflows.py @@ -4,8 +4,8 @@ from tinygrad import dtypes, Device from tinygrad.helpers import CI from tinygrad.codegen.kernel import Kernel -from tinygrad.engine.search import Opt, OptOps -from tinygrad.engine.search import 
time_linearizer, bufs_from_lin +from tinygrad.engine.search import Opt, OptOps, bufs_from_lin +from extra.optimization.helpers import time_linearizer # stuff needed to unpack a kernel from tinygrad.ops import UOp, Ops diff --git a/test/test_search.py b/test/test_search.py index d9b7241cd2e87..da2d55cfadd0f 100644 --- a/test/test_search.py +++ b/test/test_search.py @@ -4,7 +4,7 @@ from tinygrad.codegen.kernel import Opt, OptOps from tinygrad.codegen.kernel import Kernel from tinygrad.ops import UOp, Ops -from tinygrad.engine.search import time_linearizer, bufs_from_lin, actions, beam_search +from tinygrad.engine.search import bufs_from_lin, actions, beam_search from tinygrad.device import Device, Buffer from tinygrad.tensor import Tensor from tinygrad.dtype import dtypes @@ -12,6 +12,7 @@ from tinygrad.engine.realize import capturing from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View +from extra.optimization.helpers import time_linearizer class TestTimeLinearizer(unittest.TestCase): @unittest.skipIf(Device.DEFAULT == "WEBGPU", "WebGPU timestamps are low precision, tm is 0") diff --git a/tinygrad/engine/search.py b/tinygrad/engine/search.py index 30e0efe8c6145..e1a2d9569d1aa 100644 --- a/tinygrad/engine/search.py +++ b/tinygrad/engine/search.py @@ -4,7 +4,7 @@ from dataclasses import replace from tinygrad.ops import UOp, Ops, Variable, sym_infer from tinygrad.device import Device, Buffer, Compiler -from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name +from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored from tinygrad.helpers import IGNORE_BEAM_CACHE, TC_SEARCH_OVER_SHAPE from tinygrad.dtype import ImageDType, PtrDType from tinygrad.codegen.kernel import Kernel, Opt, OptOps, KernelOptError @@ -197,20 +197,3 @@ def try_exec(local_size): ret = min([(try_exec(local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))]) assert not math.isinf(ret[0]), "all optimize_local_size exec failed" return ret[1] - -def time_linearizer(lin:Kernel, rawbufs:list[Buffer], allow_test_size=True, max_global_size=65536, cnt=3, disable_cache=False, clear_l2=False) -> float: # noqa: E501 - key = {"ast": lin.ast.key, "opts": str(lin.applied_opts), "allow_test_size": allow_test_size, - "max_global_size": max_global_size, "clear_l2": clear_l2, "device": lin.opts.device, "suffix": lin.opts.suffix} - if not disable_cache and CACHELEVEL >= 2 and (val:=diskcache_get("time_linearizer", key)) is not None: return min(val) - - dev = Device[lin.opts.device] - assert dev.compiler is not None - - rawbufs = _ensure_buffer_alloc(rawbufs) - var_vals: dict[Variable, int] = {k:int(k.vmax+k.vmin)//2 for k in lin.ast.variables()} - p = lin.to_program() - tms = _time_program(p, dev.compiler.compile(p.src), var_vals, rawbufs, - max_global_size=max_global_size if allow_test_size else None, clear_l2=clear_l2, cnt=cnt, name=to_function_name(lin.name)) - - if CACHELEVEL >= 2: diskcache_put("time_linearizer", key, tms) - return min(tms)
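Usage note for patch 5: after this series, callers import time_linearizer from extra.optimization.helpers while bufs_from_lin stays in tinygrad.engine.search, as the updated test imports above show. A minimal sketch of the post-series call pattern, assuming the load_worlds/ast_str_to_lin helpers imported by those tests and the time_linearizer signature from the moved code; argument values are illustrative only:

    # sketch: time one kernel using the post-series import paths
    from extra.optimization.helpers import load_worlds, ast_str_to_lin, time_linearizer
    from tinygrad.engine.search import bufs_from_lin

    lin = ast_str_to_lin(load_worlds()[0])   # build a Kernel from a serialized ast string
    rawbufs = bufs_from_lin(lin)             # allocate buffers matching the kernel's args
    tm = time_linearizer(lin, rawbufs, allow_test_size=False, cnt=10, disable_cache=True)
    print(f"{tm*1e6:.2f} us")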