Commit f72a87f

add proper support for Ops.IGNORE to remove store masks (tinygrad#9692)
* add proper support for Ops.IGNORE to remove store masks
* remove useless NHWC
* revert that
1 parent 3b8d923 · commit f72a87f

File tree

3 files changed: +31 −10 lines


extra/onnx.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -724,8 +724,6 @@ def QuantizeLinear(x:Tensor, y_scale:Tensor, y_zero_point:Tensor|int=0, axis:int
     ret = _clamp_cast((x / y_scale + 0.4999999 + y_zero_point).int(), out_dtype)
   else:
     ret = _clamp_cast(((x / y_scale).round() + y_zero_point), out_dtype)
-  # you need both NHWC=1 DONT_GROUP_REDUCES=1 for this to work
-  if getenv("NHWC") and len(ret.shape) == 4: return ret.permute(0,2,3,1).contiguous().permute(0,3,1,2)
   return ret.contiguous()
 
 def DynamicQuantizeLinear(x: Tensor):
@@ -737,10 +735,6 @@ def DynamicQuantizeLinear(x: Tensor):
   return y, scale, zero_point
 
 def DequantizeLinear(x:Tensor, x_scale:Tensor, x_zero_point:Tensor|int=0, axis:int=1, block_size:int=0):
-  WEIGHT_SHIFT = 4
-  if getenv("NHWC") and len(x.shape) == 4 and x.shape[2:] == (1,1) and x.shape[1]%WEIGHT_SHIFT == 0:
-    # DSP swizzle memory
-    x = x.reshape(x.shape[0], x.shape[1]//WEIGHT_SHIFT, WEIGHT_SHIFT).permute(1,0,2).contiguous().permute(1,0,2).reshape(x.shape)
   x_scale, x_zero_point = _prepare_quantize(x, x_scale, x_zero_point, axis, block_size)
   return ((x.int() - x_zero_point) * x_scale).cast(x_scale.dtype)
```

tinygrad/codegen/expander.py

Lines changed: 30 additions & 2 deletions
```diff
@@ -116,9 +116,37 @@ def _gate_srcs(u:UOp, gate:UOp) -> UOp:
   (UPat(Ops.STORE, name="root"), create_gate),
 ])
 
+# **** IGNORE support ****
+
+pm_store_ignore = PatternMatcher([
+  (UPat().index(UPat(), UPat(name="mask")).store(UPat()).named("store"),
+   lambda store,mask: store.replace(src=(store.src[0], UOp(Ops.IGNORE, src=(store.src[1], mask)))) if store.src[1].op is not Ops.IGNORE else None),
+])
+
+pm_move_ignore = PatternMatcher([
+  # IGNORE on SELF is nothing
+  (UPat(Ops.IGNORE, src=(UPat(name="x"), UPat(name="x"))), lambda x: x.const_like(True)),
+  # IGNORE on a CONST is nothing
+  (UPat(Ops.IGNORE, src=(UPat((Ops.CONST, Ops.VCONST), name="c"), UPat())), lambda c: c),
+  # move the IGNOREs
+  (UPat(Ops.IGNORE, src=(UPat((*GroupOp.ALU, Ops.CAST, Ops.VECTORIZE), name="alu"), UPat.var("mask")), name="ig"),
+   lambda ig,alu,mask: alu.replace(src=tuple(UOp(Ops.IGNORE, x.dtype, (x, mask)) for x in alu.src))),
+])
+
+pm_delete_ignore = PatternMatcher([
+  # IGNORE on SELF is nothing
+  (UPat(Ops.IGNORE, src=(UPat(name="x"), UPat())), lambda x: x),
+])
+
 def expand_rewrite(sink:UOp) -> UOp:
   # initial symbolic + migrate indexing (remove this)
   sink = graph_rewrite(sink, sym+migrate_indexing)
 
-  # expand
-  return graph_rewrite(sink, sym+expander)
+  # store IGNORE
+  sink = graph_rewrite(sink, pm_store_ignore, name="store_ignore")
+
+  # move IGNORE
+  sink = graph_rewrite(sink, pm_move_ignore, name="move_ignore")
+
+  # expand + remove surviving ignores
+  return graph_rewrite(sink, pm_delete_ignore+sym+expander)
```
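
Taken together, the three new matchers form a push-then-cancel pipeline: `pm_store_ignore` wraps the stored value of every masked store in an `Ops.IGNORE` that carries the store mask, `pm_move_ignore` pushes that wrapper down through ALU/CAST/VECTORIZE sources, cancelling it where it meets the mask itself (IGNORE on SELF becomes a true constant) or a constant, and `pm_delete_ignore` strips whatever survives into expansion. A toy sketch of the same push-down idea on a plain expression tree (the `Node` type and the tiny op set are illustrative stand-ins, not tinygrad's UOp/UPat machinery):

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class Node:
  op: str        # "ADD" | "MUL" | "LT" | "CONST" | "VAR" | "IGNORE"
  src: tuple = ()
  arg: object = None

ALU = {"ADD", "MUL", "LT"}

def move_ignore(n: Node) -> Node:
  if n.op != "IGNORE":
    return Node(n.op, tuple(move_ignore(s) for s in n.src), n.arg)
  x, mask = n.src
  # IGNORE on SELF is nothing (structural equality stands in for same-node matching)
  if x == mask: return Node("CONST", arg=True)
  if x.op == "CONST": return x              # IGNORE on a CONST is nothing
  if x.op in ALU:                           # move the IGNOREs into the sources
    return Node(x.op, tuple(move_ignore(Node("IGNORE", (s, mask))) for s in x.src), x.arg)
  return n                                  # e.g. a VAR: leave it for deletion

def delete_ignore(n: Node) -> Node:
  n = Node(n.op, tuple(delete_ignore(s) for s in n.src), n.arg)
  return n.src[0] if n.op == "IGNORE" else n

gidx = Node("VAR", arg="gidx")
mask = Node("LT", (gidx, Node("CONST", arg=4)))
val = Node("MUL", (mask, gidx))   # a stored value that re-computes its own mask
out = delete_ignore(move_ignore(Node("IGNORE", (val, mask))))
print(out)  # MUL(CONST True, VAR gidx): the embedded mask collapsed to a constant
```

Occurrences of the mask inside the masked computation simplify to a true constant, which is what lets the later rewrites drop redundant store masks, per the commit title.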

tinygrad/renderer/__init__.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -111,8 +111,7 @@ def __post_init__(self):
       # NOTE: you have to set local_size and global_size to the base [1,1,1] outside this
       if u.arg[0][0] == 'i': self.local_size = None
       special_size = self.local_size if u.arg[0][0] == 'l' else self.global_size
-      assert special_size is not None
-      special_size[int(u.arg[0][-1])] = u.arg[1]
+      if special_size is not None: special_size[int(u.arg[0][-1])] = u.arg[1]
     self.vars = sorted(self.vars, key=lambda v: v.arg)
     self.outs = sorted(dedup(self.outs))
     self.ins = sorted(dedup(self.ins))
```
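
For context on the renderer change: once an 'i' special has set local_size to None, a later 'l' special made the old assert fire; the guarded write just skips it. A minimal stand-alone sketch of that interaction, assuming SPECIAL args of the form ('gidx0', 16) with arg[0][0] as the kind and arg[0][-1] as the axis (inferred from this hunk alone, not checked against the full class):

```python
# stand-alone illustration, not the real __post_init__
global_size: list | None = [1, 1, 1]
local_size: list | None = [1, 1, 1]

# contrived sequence of assumed SPECIAL args (name, size)
for name, size in [("gidx0", 256), ("iidx1", 4), ("lidx0", 16)]:
  if name[0] == 'i': local_size = None   # 'i' specials disable local sizing
  special_size = local_size if name[0] == 'l' else global_size
  # the old `assert special_size is not None` would fire on 'lidx0' here;
  # the guarded write simply skips it instead
  if special_size is not None: special_size[int(name[-1])] = size

print(global_size, local_size)  # [256, 4, 1] None
```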
