Commit 96bff0b

contiguous is no longer needed in SGD [pr] (tinygrad#8760)
* contiguous is no longer needed in SGD [pr]
* add allow condition
1 parent efc7971 commit 96bff0b

File tree

3 files changed: +12 −12 lines changed

Diff for: test/test_schedule.py

+6 −6

@@ -323,7 +323,7 @@ def test_fold_conv_batchnorm(self):
 
   def test_fold_conv_batchnorm_optim(self):
     # this is too high
-    for optim, cnt in [(nn.optim.Adam, 18), (nn.optim.SGD, 15)]:
+    for optim, cnt in [(nn.optim.Adam, 18), (nn.optim.SGD, 11)]:
       with self.subTest(optim=optim.__name__):
         with Tensor.train():
           img = Tensor.ones(1,3,4,4)

@@ -1070,7 +1070,7 @@ def test_sgd_conv_fuse(self):
       opt = nn.optim.SGD(nn.state.get_parameters(c1))
       opt.zero_grad()
       c1(img).relu().sum().backward()
-      check_schedule(opt.schedule_step(), 5)
+      check_schedule(opt.schedule_step(), 3)
 
   def test_sgd_2convs_fuse(self):
     with Tensor.train():

@@ -1081,7 +1081,7 @@ def test_sgd_2convs_fuse(self):
       opt = nn.optim.SGD(nn.state.get_parameters([c1, c2]))
       opt.zero_grad()
       c2(c1(img).relu()).relu().sum().backward()
-      check_schedule(opt.schedule_step(), 9)
+      check_schedule(opt.schedule_step(), 7)
 
   def test_fold_2convs_sgd_nesterov_momentum_wd(self):
     with Tensor.train():

@@ -1092,7 +1092,7 @@ def test_fold_2convs_sgd_nesterov_momentum_wd(self):
       opt = nn.optim.SGD(nn.state.get_parameters([c1, c2]), nesterov=True, momentum=0.9, weight_decay=0.1)
       opt.zero_grad()
       c2(c1(img).relu()).relu().sum().backward()
-      check_schedule(opt.schedule_step(), 11)
+      check_schedule(opt.schedule_step(), 9)
 
   def test_sgd_4convs_fuse(self):
     with Tensor.train():

@@ -1105,7 +1105,7 @@ def test_sgd_4convs_fuse(self):
       opt = nn.optim.SGD(nn.state.get_parameters([c1, c2, c3, c4]))
       opt.zero_grad()
       c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
-      check_schedule(opt.schedule_step(), 21)
+      check_schedule(opt.schedule_step(), 17)
 
   def test_sgd_4convs_fuse_conv_bw(self):
     with Tensor.train():

@@ -1118,7 +1118,7 @@ def test_sgd_4convs_fuse_conv_bw(self):
       opt = nn.optim.SGD(nn.state.get_parameters([c1, c2, c3, c4]))
      opt.zero_grad()
       c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
-      with Context(FUSE_CONV_BW=1): check_schedule(opt.schedule_step(), 18)
+      with Context(FUSE_CONV_BW=1): check_schedule(opt.schedule_step(), 14)
 
   @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
   def test_prefer_half_buffer(self):
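For context on the counts above: check_schedule builds the schedule for the given tensors and asserts how many kernels it contains. Below is a minimal sketch of that flow, assuming a recent tinygrad API; the shapes are illustrative, and the real check_schedule helper in this file also filters the schedule, so its count can differ from a raw len().

# sketch only: illustrative shapes, not copied from the tests
from tinygrad import Tensor
from tinygrad.nn import Conv2d
from tinygrad.nn.optim import SGD
from tinygrad.nn.state import get_parameters

with Tensor.train():
  img = Tensor.ones(1, 3, 4, 4)
  c1 = Conv2d(3, 16, 3)
  opt = SGD(get_parameters(c1))
  opt.zero_grad()
  c1(img).relu().sum().backward()
  step = opt.schedule_step()           # list[Tensor] making up one SGD step
  sched = step[0].schedule(*step[1:])  # ScheduleItems (kernels) needed to realize it
  print(len(sched))                    # smaller after this commit, since grads skip .contiguous()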

Diff for: tinygrad/engine/schedule.py

+6 −3

@@ -222,10 +222,13 @@ def schedule_uop(pre:UOp, ctx:ScheduleContext) -> ScheduleItem:
       assign_preloads[x.buf_uop] = None
       # if this kernel also assigns to the buffer, we only allow either contiguous or masked views for the LOAD
       if x.buf_uop in store_bufs and not (st:=x.st_arg).contiguous:
+        # if it has a single view and it becomes contiguous when you shrink expanded axes, it's fine
+        if len(st.views) == 1 and st.shrink(tuple((0,1) if st == 0 else (0,s) for s,st in zip(st.shape, st.views[0].strides))).contiguous: pass
         # if it has a single view and it's equal when you shrink a contig, it's fine
-        if len(st.views) != 1 or (mask:=st.views[0].mask) is None or ShapeTracker.from_shape(st.shape).shrink(mask) != st.shrink(mask):
-          raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
-                             +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
+        elif len(st.views) == 1 and (mask:=st.views[0].mask) is not None and ShapeTracker.from_shape(st.shape).shrink(mask) == st.shrink(mask): pass
+        # otherwise, it's not fine
+        else: raise RuntimeError("self operand of augmented assign must be contiguous.\nhelp: consider using .contiguous():\n"
+                                 +colored(" - a += a.T\n", "red")+colored(" + a += a.T.contiguous()", "green"))
   # capture process replay
   if CAPTURE_PROCESS_REPLAY:
     with Context(PICKLE_BUFFERS=0): PROCESS_REPLAY_CAPTURE[str(pre.key)] = pickle.dumps((pre, ContextVar._cache, ast))
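The new allow condition accepts a self-operand LOAD whose only non-contiguity comes from broadcasting: a single view whose stride-0 (expanded) axes, once shrunk back to size 1, leave a contiguous view. A small illustration of that check with ShapeTracker follows; the shapes are illustrative assumptions, not taken from the commit.

# sketch: a (1,16) buffer read through a broadcast to (4,16) has one view with stride 0 on axis 0
from tinygrad.shape.shapetracker import ShapeTracker

st = ShapeTracker.from_shape((1, 16)).expand((4, 16))
print(st.contiguous)  # False: the broadcast read is not a plain contiguous load
# shrink every expanded (stride-0) axis back to size 1, mirroring the new check above
shrunk = st.shrink(tuple((0, 1) if stride == 0 else (0, s) for s, stride in zip(st.shape, st.views[0].strides)))
print(shrunk.contiguous)  # True under this assumption: the augmented assign no longer needs .contiguous()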

Diff for: tinygrad/nn/optim.py

+0 −3

@@ -77,9 +77,6 @@ def __init__(self, params:list[Tensor], lr=0.001, momentum=0.9, weight_decay=1e-
 
   def schedule_step_with_grads(self, grads:list[Tensor]) -> list[Tensor]:
     for i, (t, g) in enumerate(zip(self.params, grads)):
-      # contiguous is needed since the grads can allegedly form a "diamond"
-      # TODO: fix this in lazy.py
-      g = g.contiguous()
       if self.tcoef != 0:
         r1 = t.detach().square().sum().sqrt()
         r2 = g.square().sum().sqrt()
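With the contiguous call gone, the gradient no longer has to be realized into its own buffer before the update, so the scheduler can fuse the gradient computation into the assign kernel; the kernel counts in test_schedule.py above drop accordingly. A simplified sketch of one parameter update, not the full LARS/SGD implementation with momentum and weight decay:

# simplified sketch of one SGD parameter update (lr and the tensors are illustrative)
def sgd_update(t, g, lr=0.01):
  # previously: g = g.contiguous()  # forced an extra kernel per parameter
  return t.assign(t.detach() - lr * g)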
