2
2
import time , pprint
3
3
from dataclasses import dataclass , replace
4
4
from tinygrad .helpers import all_same , colored , getenv , DEBUG , GlobalCounters , ansilen , BEAM , NOOPT , all_int , CAPTURING , Metadata , TRACEMETA
5
- from tinygrad .helpers import DEVECTORIZE , time_to_str
5
+ from tinygrad .helpers import DEVECTORIZE , time_to_str , VALIDATE_WITH_CPU
6
6
from tinygrad .ops import Ops , PatternMatcher , UOp , UPat , Variable , sym_infer
7
7
from tinygrad .device import Device , Buffer
8
8
from tinygrad .renderer import Renderer , ProgramSpec , Estimates
@@ -150,10 +150,10 @@ def run(self, _var_vals:Optional[dict[Variable, int]]=None, wait=False, jit=Fals
150
150
])
151
151
def lower_schedule_item(si:ScheduleItem) -> ExecItem:
  # lower one ScheduleItem: rewrite its AST into a (Runner, bufs) pair, then wrap it with the item's metadata
  runner_and_bufs = cast(tuple[Runner, list], si_lowerer.rewrite(si.ast, si.bufs))
  return ExecItem(*runner_and_bufs, si.metadata)
152
152
153
- def lower_schedule (schedule :list [ScheduleItem ]) -> Generator [ExecItem , None , None ]:
153
+ def lower_schedule (schedule :list [ScheduleItem ]) -> Generator [tuple [ ScheduleItem , ExecItem ] , None , None ]:
154
154
while len (schedule ):
155
155
si = schedule .pop (0 )
156
- try : yield lower_schedule_item (si )
156
+ try : yield ( si , lower_schedule_item (si ) )
157
157
except Exception as e :
158
158
if DEBUG >= 2 :
159
159
print (f"error lowering { si .ast .op } " )
@@ -166,6 +166,21 @@ def lower_schedule(schedule:list[ScheduleItem]) -> Generator[ExecItem, None, Non
166
166
capturing : list = [] # put classes with an add method in here
167
167
168
168
def run_schedule(schedule:list[ScheduleItem], var_vals:Optional[dict[Variable, int]]=None, do_update_stats=True):
  """Lower every ScheduleItem and execute it on its device.

  If VALIDATE_WITH_CPU is set, each SINK kernel is additionally re-run on CPU copies of its
  buffers and the device result is compared against the CPU result
  (NOTE: this assumes the kernel output is buffer 0).
  """
  for si, ei in lower_schedule(schedule):
    # anything registered in `capturing` (e.g. the JIT) records the ExecItem before it runs
    if len(capturing) and CAPTURING: capturing[0].add(ei)
    if not (VALIDATE_WITH_CPU and si.ast.op is Ops.SINK):
      ei.run(var_vals, do_update_stats=do_update_stats)
      continue
    # mirror the kernel's buffers on CPU, seeding them with the current device contents
    cpu_bufs: tuple[Buffer, ...] = tuple(Buffer("CPU", b.size, b.dtype) for b in si.bufs)
    for cpu_b, dev_b in zip(cpu_bufs, si.bufs):
      if dev_b.is_allocated(): cpu_b.ensure_allocated().copyin(dev_b.as_buffer())

    # run on the device
    ei.run(var_vals, do_update_stats=do_update_stats)

    # run the same AST against the CPU buffers, then compare buffer 0 of each side
    lower_schedule_item(ScheduleItem(si.ast, cpu_bufs, si.metadata)).run(var_vals, do_update_stats=do_update_stats)
    import numpy as np
    np.testing.assert_allclose(cpu_bufs[0].numpy(), si.bufs[0].numpy(), rtol=1e-3, atol=1e-3)
0 commit comments