diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 537cc04e6e5e6..f8531fb6b03d3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -622,10 +622,7 @@ jobs: wintests: strategy: fail-fast: false - matrix: - backend: [llvm] - - name: Tests on Windows (${{ matrix.backend }}) + name: Tests on Windows (llvm+clang) runs-on: windows-latest timeout-minutes: 45 steps: @@ -641,21 +638,35 @@ jobs: uses: actions/cache@v4 with: path: ${{ env.Python3_ROOT_DIR }}\Lib\site-packages - key: windows-${{ matrix.backend }}-packages-${{ hashFiles('**/setup.py') }} + key: windows-packages-${{ hashFiles('**/setup.py') }} + - name: Set env + shell: bash + run: | + if [ "${{ matrix.backend }}" = "clang" ]; then + echo "CLANG=1" >> $GITHUB_ENV + elif [ "${{ matrix.backend }}" = "llvm" ]; then + echo "LLVM=1" >> $GITHUB_ENV + fi - name: Install dependencies - run: pip install --user -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu - - name: Check Device.DEFAULT and print some source - env: - DEBUG: 5 - LLVM: 1 - PYTHONPATH: ${{ github.workspace }} + run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu + - name: Check Device.DEFAULT and print some source (llvm) + shell: bash run: | - python3 test/test_ops.py TestOps.test_add - - name: Run pytest - env: - DEBUG: 5 - LLVM: 1 - run: python -m pytest -n=auto test/test_tiny.py --durations=20 + PYTHONPATH=${{ github.workspace }} LLVM=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'LLVM', Device.DEFAULT" + DEBUG=5 PYTHONPATH=${{ github.workspace }} LLVM=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add + - name: Check Device.DEFAULT and print some source (clang) + shell: bash + run: | + PYTHONPATH=${{ github.workspace }} CLANG=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'CLANG', Device.DEFAULT" + DEBUG=5 PYTHONPATH=${{ github.workspace }} CLANG=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add + - name: Run pytest (llvm) + shell: bash + run: | + DEBUG=5 LLVM=1 python -m pytest -n=auto test/test_tiny.py --durations=20 + - name: Run pytest (clang) + shell: bash + run: | + DEBUG=5 CLANG=1 python -m pytest -n=auto test/test_tiny.py --durations=20 #testunicorn: # name: ARM64 unicorn Test diff --git a/test/unit/test_verify_ast.py b/test/unit/test_verify_ast.py index 2dd103e975aee..255c2fab33712 100644 --- a/test/unit/test_verify_ast.py +++ b/test/unit/test_verify_ast.py @@ -92,8 +92,7 @@ def test_const_view_always_valid(self): buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), 0) a = UOp.const(dtypes.int, 0).replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg="CLANG"),), ShapeTracker.from_shape(())),)) st = UOp.store(buf, ShapeTracker.from_shape(()).to_uop(), a.cast(dtypes.float)) - # lowerer asserts because it does not remove ShapeTracker on CONST(VIEW(DEVICE)) - with self.assertRaises(AssertionError): helper_test_verify_ast(st) + helper_test_verify_ast(st) if __name__ == '__main__': unittest.main() diff --git a/tinygrad/codegen/lowerer.py b/tinygrad/codegen/lowerer.py index 429769fd1dda6..147ba33bce826 100644 --- a/tinygrad/codegen/lowerer.py +++ b/tinygrad/codegen/lowerer.py @@ -5,7 +5,7 @@ from tinygrad.dtype import dtypes, PtrDType from tinygrad.ops import KernelInfo, UOp, Ops, graph_rewrite, PatternMatcher, UPat, sint, identity_element, sint_to_uop from tinygrad.renderer import Renderer -from tinygrad.helpers import all_int, prod, partition, flatten +from tinygrad.helpers import all_int, prod, partition, flatten, unwrap # returns the axes to create new_shape if new_shape can be created by combining axis from old_shape def get_contraction(old_shape:tuple[sint, ...], new_shape:tuple[sint, ...]) -> list[list[int]]|None: @@ -122,8 +122,13 @@ def lower_load_store(ctx: IndexContext, x: UOp): if oidx is not ridx: valid = valid * oidx.eq(0) return UOp(Ops.STORE, dtypes.void, (buf.index(idx, valid), x.src[2])) +def lower_const(x:UOp): + assert all(v.mask is None for v in unwrap(x.st).views), f"VIEW in CONST/DEFINE_VAR source must be unmasked, got {x.st}" + return x.replace(src=()) + pm_lowerer = PatternMatcher([ (UPat(Ops.REDUCE_AXIS, name="x"), lower_reduce_axis), + (UPat((Ops.CONST, Ops.DEFINE_VAR), src=(UPat(Ops.VIEW),), name="x"), lower_const), (UPat(Ops.VALID, src=(UPat(Ops.VIEW),), name="x"), lambda ctx,x: x.st_arg.to_indexed_uops(ctx.idxs)[1]), # rewrite LOAD/STORE VIEW to LOAD/STORE with indexed (UPat((Ops.LOAD, Ops.STORE), src=(UPat(), UPat(Ops.VIEW)), allow_any_len=True, name="x"), lower_load_store), diff --git a/tinygrad/device.py b/tinygrad/device.py index 04182a3c33681..e7b502432b7be 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -222,23 +222,31 @@ def _offset(self, buf, size:int, offset:int): return from_mv(self._as_buffer(buf class CPUProgram: helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32' if sys.platform == "win32" else 'gcc_s')) def __init__(self, name:str, lib:bytes): - assert sys.platform != "win32", "clang is not supported for windows yet" - from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE - # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/ - # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np) - self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC) - - if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False) - self.mem.write(lib) - if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True) - - # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang. - # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately - # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux - # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5 - CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib))) - - self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem)) + if sys.platform == "win32": + PAGE_EXECUTE_READWRITE = 0x40 + MEM_COMMIT = 0x1000 + MEM_RESERVE = 0x2000 + ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_uint64 + ptr = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0), ctypes.c_int(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE) + ctypes.memmove(ptr, lib, len(lib)) + self.fxn = ctypes.CFUNCTYPE(None)(ptr) + else: + from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE + # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/ + # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np) + self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC) + + if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False) + self.mem.write(lib) + if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True) + + # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang. + # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately + # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux + # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5 + CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib))) + + self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem)) def __call__(self, *bufs, vals=(), wait=False): args = list(bufs) + list(vals) diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index 5895b0cc68c42..9c055acd87c02 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -272,7 +272,7 @@ def cpu_objdump(lib, objdump_tool='objdump'): def capstone_flatdump(lib: bytes): import capstone match platform.machine(): - case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64) + case 'x86_64' | 'AMD64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64) case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM) case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}") for instr in cs.disasm(lib, 0): diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 4f2e604f05689..6b9a9f266d334 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -1,5 +1,5 @@ from typing import Optional, Union, Literal, Callable, cast -import os, math +import os, math, sys from collections import defaultdict, Counter from tinygrad.ops import GroupOp, Ops, UOp, PatternMatcher, UPat from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX @@ -178,7 +178,8 @@ class ClangRenderer(CStyleLanguage): tensor_cores = [TensorCore(dims=(sz,sz,1), threads=1, elements_per_thread=(sz,sz,sz*sz), dtype_in=dt, dtype_out=dt, swizzle=(None, ((),(4,5,6,7,0,1,2,3))), opts=("u0","u0","u0","u0","u1","u1","u1","u1")) for dt,sz in [(dt, 64 // dt.itemsize) for dt in [dtypes.float]]] - + if sys.platform == 'win32': + kernel_prefix = "__attribute__((ms_abi)) " def render_vector_prefix(self, dt:DType) -> str: return f"typedef {self.render_dtype(dt.scalar())} {self.render_dtype(dt)} __attribute__((aligned({(sz:=dt.itemsize)}),vector_size({sz})));" diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py index 689c04c460866..2baf5723820d1 100644 --- a/tinygrad/runtime/ops_clang.py +++ b/tinygrad/runtime/ops_clang.py @@ -1,4 +1,4 @@ -import platform, tempfile, pathlib, subprocess +import platform, tempfile, pathlib, subprocess, sys from tinygrad.helpers import cpu_objdump, capstone_flatdump from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram from tinygrad.runtime.support.elf import jit_loader @@ -26,7 +26,8 @@ def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey) def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it - args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib'] + target = 'x86_64' if sys.platform == 'win32' else platform.machine() + args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib'] arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else [] obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj) diff --git a/tinygrad/runtime/support/am/amdev.py b/tinygrad/runtime/support/am/amdev.py index 9b14e024f83fa..9a3480c7efff2 100644 --- a/tinygrad/runtime/support/am/amdev.py +++ b/tinygrad/runtime/support/am/amdev.py @@ -296,8 +296,8 @@ def __init__(self, devfmt, vram_bar:memoryview, doorbell_bar:memoryview, mmio_ba if DEBUG >= 2: print(f"am {self.devfmt}: boot done") def fini(self): - self.smu.set_clocks(perf=False) for ip in [self.sdma, self.gfx]: ip.fini() + self.smu.set_clocks(perf=False) def paddr2cpu(self, paddr:int) -> int: return mv_address(self.vram) + paddr def paddr2mc(self, paddr:int) -> int: return self.gmc.mc_base + paddr diff --git a/tinygrad/viz/index.html b/tinygrad/viz/index.html index 08b7f4d00a1f7..f1de08f22ebba 100644 --- a/tinygrad/viz/index.html +++ b/tinygrad/viz/index.html @@ -247,8 +247,8 @@ const g = new dagreD3.graphlib.Graph({ compound: true }).setGraph({ rankdir: "LR" }).setDefaultEdgeLabel(function() { return {}; }); g.setNode("addition", {label: "", clusterLabelPos: "top", style: additions.length !== 0 ? "fill: rgba(26, 27, 38, 0.5);" : "display: none;"}); for (const [k,u] of Object.entries(graph)) { - g.setNode(k, {label: u[0], style: `fill: ${u[4]};` }); - for (const src of u[2]) { + g.setNode(k, {label: u[0], style: `fill: ${u[2]};` }); + for (const src of u[1]) { g.setEdge(src, k, {curve: d3.curveBasis}) } if (additions.includes(parseInt(k))) { @@ -307,8 +307,7 @@ if (i === currentKernel) { requestAnimationFrame(() => kernelUl.scrollIntoView({ behavior: "auto", block: "nearest" })); } - const p = Object.assign(document.createElement("p"), { id: `kernel-${key}`, innerText: key ?? "UNPARENTED", - style: "cursor: pointer;"}); + const p = Object.assign(document.createElement("p"), { id: `kernel-${key}`, innerText: key, style: "cursor: pointer;"}); kernelUl.appendChild(p) items.forEach((u, j) => { const rwUl = Object.assign(document.createElement("ul"), { innerText: `${toPath(u.loc)} - ${u.match_count}`, key: `uop-rewrite-${j}`, diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py index 02dbee0a5e2b0..13f4fa0915ad8 100755 --- a/tinygrad/viz/serve.py +++ b/tinygrad/viz/serve.py @@ -19,8 +19,8 @@ # VIZ API class GraphRewriteMetadata(TypedDict): - loc: tuple[str, int] # [path, lineno] calling graph_rewrite - match_count: int # total match count in this context + loc: tuple[str, int] # [path, lineno] calling graph_rewrite + match_count: int # total match count in this context class GraphRewriteDetails(GraphRewriteMetadata): graphs: list[dict] # JSON serialized UOp at every rewrite step @@ -36,9 +36,10 @@ def pcall(fxn:Callable[..., str], *args, **kwargs) -> str: try: return fxn(*args, **kwargs) except Exception as e: return f"ERROR: {e}" -def uop_to_json(x:UOp) -> dict[int, tuple[str, str, list[int], str, str]]: +def uop_to_json(x:UOp) -> dict[int, tuple[str, list[int], str]]: assert isinstance(x, UOp) - graph: dict[int, tuple[str, str, list[int], str, str]] = {} + # NOTE: this is [id, [label, src_ids, color]] + graph: dict[int, tuple[str, list[int], str]] = {} excluded: set[UOp] = set() for u in (toposort:=x.toposort): if u.op in {Ops.CONST, Ops.DEVICE}: excluded.update((u,) + u.src) @@ -53,7 +54,7 @@ def uop_to_json(x:UOp) -> dict[int, tuple[str, str, list[int], str, str]]: if x in excluded: if x.op is Ops.CONST and dtypes.is_float(u.dtype): label += f"\nCONST{idx} {x.arg:g}" else: label += f"\n{x.op.name}{idx} {x.arg}" - graph[id(u)] = (label, str(u.dtype), [id(x) for x in u.src if x not in excluded], str(u.arg), uops_colors.get(u.op, "#ffffff")) + graph[id(u)] = (label, [id(x) for x in u.src if x not in excluded], uops_colors.get(u.op, "#ffffff")) return graph def get_metadata(keys:list[Any], contexts:list[list[TrackedGraphRewrite]]) -> list[tuple[str, list[GraphRewriteMetadata]]]: