Skip to content

[pull] master from tinygrad:master #60

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 28 additions & 17 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -622,10 +622,7 @@ jobs:
wintests:
strategy:
fail-fast: false
matrix:
backend: [llvm]

name: Tests on Windows (${{ matrix.backend }})
name: Tests on Windows (llvm+clang)
runs-on: windows-latest
timeout-minutes: 45
steps:
Expand All @@ -641,21 +638,35 @@ jobs:
uses: actions/cache@v4
with:
path: ${{ env.Python3_ROOT_DIR }}\Lib\site-packages
key: windows-${{ matrix.backend }}-packages-${{ hashFiles('**/setup.py') }}
key: windows-packages-${{ hashFiles('**/setup.py') }}
- name: Set env
shell: bash
run: |
if [ "${{ matrix.backend }}" = "clang" ]; then
echo "CLANG=1" >> $GITHUB_ENV
elif [ "${{ matrix.backend }}" = "llvm" ]; then
echo "LLVM=1" >> $GITHUB_ENV
fi
- name: Install dependencies
run: pip install --user -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Check Device.DEFAULT and print some source
env:
DEBUG: 5
LLVM: 1
PYTHONPATH: ${{ github.workspace }}
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Check Device.DEFAULT and print some source (llvm)
shell: bash
run: |
python3 test/test_ops.py TestOps.test_add
- name: Run pytest
env:
DEBUG: 5
LLVM: 1
run: python -m pytest -n=auto test/test_tiny.py --durations=20
PYTHONPATH=${{ github.workspace }} LLVM=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'LLVM', Device.DEFAULT"
DEBUG=5 PYTHONPATH=${{ github.workspace }} LLVM=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
- name: Check Device.DEFAULT and print some source (clang)
shell: bash
run: |
PYTHONPATH=${{ github.workspace }} CLANG=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'CLANG', Device.DEFAULT"
DEBUG=5 PYTHONPATH=${{ github.workspace }} CLANG=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
- name: Run pytest (llvm)
shell: bash
run: |
DEBUG=5 LLVM=1 python -m pytest -n=auto test/test_tiny.py --durations=20
- name: Run pytest (clang)
shell: bash
run: |
DEBUG=5 CLANG=1 python -m pytest -n=auto test/test_tiny.py --durations=20

#testunicorn:
# name: ARM64 unicorn Test
Expand Down
3 changes: 1 addition & 2 deletions test/unit/test_verify_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,7 @@ def test_const_view_always_valid(self):
buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), 0)
a = UOp.const(dtypes.int, 0).replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg="CLANG"),), ShapeTracker.from_shape(())),))
st = UOp.store(buf, ShapeTracker.from_shape(()).to_uop(), a.cast(dtypes.float))
# lowerer asserts because it does not remove ShapeTracker on CONST(VIEW(DEVICE))
with self.assertRaises(AssertionError): helper_test_verify_ast(st)
helper_test_verify_ast(st)

if __name__ == '__main__':
unittest.main()
7 changes: 6 additions & 1 deletion tinygrad/codegen/lowerer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from tinygrad.dtype import dtypes, PtrDType
from tinygrad.ops import KernelInfo, UOp, Ops, graph_rewrite, PatternMatcher, UPat, sint, identity_element, sint_to_uop
from tinygrad.renderer import Renderer
from tinygrad.helpers import all_int, prod, partition, flatten
from tinygrad.helpers import all_int, prod, partition, flatten, unwrap

# returns the axes to create new_shape if new_shape can be created by combining axis from old_shape
def get_contraction(old_shape:tuple[sint, ...], new_shape:tuple[sint, ...]) -> list[list[int]]|None:
Expand Down Expand Up @@ -122,8 +122,13 @@ def lower_load_store(ctx: IndexContext, x: UOp):
if oidx is not ridx: valid = valid * oidx.eq(0)
return UOp(Ops.STORE, dtypes.void, (buf.index(idx, valid), x.src[2]))

def lower_const(x:UOp):
  # Strip the VIEW source off a CONST/DEFINE_VAR so it becomes a plain value in the lowered graph.
  # This is only legal when no view carries a mask; a masked view would imply a VALID guard we'd be dropping.
  unmasked = all(v.mask is None for v in unwrap(x.st).views)
  assert unmasked, f"VIEW in CONST/DEFINE_VAR source must be unmasked, got {x.st}"
  return x.replace(src=())

pm_lowerer = PatternMatcher([
(UPat(Ops.REDUCE_AXIS, name="x"), lower_reduce_axis),
(UPat((Ops.CONST, Ops.DEFINE_VAR), src=(UPat(Ops.VIEW),), name="x"), lower_const),
(UPat(Ops.VALID, src=(UPat(Ops.VIEW),), name="x"), lambda ctx,x: x.st_arg.to_indexed_uops(ctx.idxs)[1]),
# rewrite LOAD/STORE VIEW to LOAD/STORE with indexed
(UPat((Ops.LOAD, Ops.STORE), src=(UPat(), UPat(Ops.VIEW)), allow_any_len=True, name="x"), lower_load_store),
Expand Down
42 changes: 25 additions & 17 deletions tinygrad/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,23 +222,31 @@ def _offset(self, buf, size:int, offset:int): return from_mv(self._as_buffer(buf
class CPUProgram:
helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32' if sys.platform == "win32" else 'gcc_s'))
def __init__(self, name:str, lib:bytes):
assert sys.platform != "win32", "clang is not supported for windows yet"
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)

# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))

self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
if sys.platform == "win32":
PAGE_EXECUTE_READWRITE = 0x40
MEM_COMMIT = 0x1000
MEM_RESERVE = 0x2000
ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_uint64
ptr = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0), ctypes.c_int(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
ctypes.memmove(ptr, lib, len(lib))
self.fxn = ctypes.CFUNCTYPE(None)(ptr)
else:
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)

# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))

self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))

def __call__(self, *bufs, vals=(), wait=False):
args = list(bufs) + list(vals)
Expand Down
2 changes: 1 addition & 1 deletion tinygrad/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def cpu_objdump(lib, objdump_tool='objdump'):
def capstone_flatdump(lib: bytes):
import capstone
match platform.machine():
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'x86_64' | 'AMD64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
for instr in cs.disasm(lib, 0):
Expand Down
5 changes: 3 additions & 2 deletions tinygrad/renderer/cstyle.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Optional, Union, Literal, Callable, cast
import os, math
import os, math, sys
from collections import defaultdict, Counter
from tinygrad.ops import GroupOp, Ops, UOp, PatternMatcher, UPat
from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX
Expand Down Expand Up @@ -178,7 +178,8 @@ class ClangRenderer(CStyleLanguage):
tensor_cores = [TensorCore(dims=(sz,sz,1), threads=1, elements_per_thread=(sz,sz,sz*sz), dtype_in=dt, dtype_out=dt,
swizzle=(None, ((),(4,5,6,7,0,1,2,3))), opts=("u0","u0","u0","u0","u1","u1","u1","u1"))
for dt,sz in [(dt, 64 // dt.itemsize) for dt in [dtypes.float]]]

if sys.platform == 'win32':
kernel_prefix = "__attribute__((ms_abi)) "
def render_vector_prefix(self, dt:DType) -> str:
return f"typedef {self.render_dtype(dt.scalar())} {self.render_dtype(dt)} __attribute__((aligned({(sz:=dt.itemsize)}),vector_size({sz})));"

Expand Down
5 changes: 3 additions & 2 deletions tinygrad/runtime/ops_clang.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import platform, tempfile, pathlib, subprocess
import platform, tempfile, pathlib, subprocess, sys
from tinygrad.helpers import cpu_objdump, capstone_flatdump
from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
from tinygrad.runtime.support.elf import jit_loader
Expand Down Expand Up @@ -26,7 +26,8 @@ def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
def compile(self, src:str) -> bytes:
# -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
# x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
target = 'x86_64' if sys.platform == 'win32' else platform.machine()
args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
return jit_loader(obj)
Expand Down
2 changes: 1 addition & 1 deletion tinygrad/runtime/support/am/amdev.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,8 @@ def __init__(self, devfmt, vram_bar:memoryview, doorbell_bar:memoryview, mmio_ba
if DEBUG >= 2: print(f"am {self.devfmt}: boot done")

def fini(self):
self.smu.set_clocks(perf=False)
for ip in [self.sdma, self.gfx]: ip.fini()
self.smu.set_clocks(perf=False)

def paddr2cpu(self, paddr:int) -> int: return mv_address(self.vram) + paddr
def paddr2mc(self, paddr:int) -> int: return self.gmc.mc_base + paddr
Expand Down
7 changes: 3 additions & 4 deletions tinygrad/viz/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,8 @@
const g = new dagreD3.graphlib.Graph({ compound: true }).setGraph({ rankdir: "LR" }).setDefaultEdgeLabel(function() { return {}; });
g.setNode("addition", {label: "", clusterLabelPos: "top", style: additions.length !== 0 ? "fill: rgba(26, 27, 38, 0.5);" : "display: none;"});
for (const [k,u] of Object.entries(graph)) {
g.setNode(k, {label: u[0], style: `fill: ${u[4]};` });
for (const src of u[2]) {
g.setNode(k, {label: u[0], style: `fill: ${u[2]};` });
for (const src of u[1]) {
g.setEdge(src, k, {curve: d3.curveBasis})
}
if (additions.includes(parseInt(k))) {
Expand Down Expand Up @@ -307,8 +307,7 @@
if (i === currentKernel) {
requestAnimationFrame(() => kernelUl.scrollIntoView({ behavior: "auto", block: "nearest" }));
}
const p = Object.assign(document.createElement("p"), { id: `kernel-${key}`, innerText: key ?? "UNPARENTED",
style: "cursor: pointer;"});
const p = Object.assign(document.createElement("p"), { id: `kernel-${key}`, innerText: key, style: "cursor: pointer;"});
kernelUl.appendChild(p)
items.forEach((u, j) => {
const rwUl = Object.assign(document.createElement("ul"), { innerText: `${toPath(u.loc)} - ${u.match_count}`, key: `uop-rewrite-${j}`,
Expand Down
11 changes: 6 additions & 5 deletions tinygrad/viz/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
# VIZ API

class GraphRewriteMetadata(TypedDict):
loc: tuple[str, int] # [path, lineno] calling graph_rewrite
match_count: int # total match count in this context
loc: tuple[str, int] # [path, lineno] calling graph_rewrite
match_count: int # total match count in this context

class GraphRewriteDetails(GraphRewriteMetadata):
graphs: list[dict] # JSON serialized UOp at every rewrite step
Expand All @@ -36,9 +36,10 @@ def pcall(fxn:Callable[..., str], *args, **kwargs) -> str:
try: return fxn(*args, **kwargs)
except Exception as e: return f"ERROR: {e}"

def uop_to_json(x:UOp) -> dict[int, tuple[str, str, list[int], str, str]]:
def uop_to_json(x:UOp) -> dict[int, tuple[str, list[int], str]]:
assert isinstance(x, UOp)
graph: dict[int, tuple[str, str, list[int], str, str]] = {}
# NOTE: this is [id, [label, src_ids, color]]
graph: dict[int, tuple[str, list[int], str]] = {}
excluded: set[UOp] = set()
for u in (toposort:=x.toposort):
if u.op in {Ops.CONST, Ops.DEVICE}: excluded.update((u,) + u.src)
Expand All @@ -53,7 +54,7 @@ def uop_to_json(x:UOp) -> dict[int, tuple[str, str, list[int], str, str]]:
if x in excluded:
if x.op is Ops.CONST and dtypes.is_float(u.dtype): label += f"\nCONST{idx} {x.arg:g}"
else: label += f"\n{x.op.name}{idx} {x.arg}"
graph[id(u)] = (label, str(u.dtype), [id(x) for x in u.src if x not in excluded], str(u.arg), uops_colors.get(u.op, "#ffffff"))
graph[id(u)] = (label, [id(x) for x in u.src if x not in excluded], uops_colors.get(u.op, "#ffffff"))
return graph

def get_metadata(keys:list[Any], contexts:list[list[TrackedGraphRewrite]]) -> list[tuple[str, list[GraphRewriteMetadata]]]:
Expand Down
Loading