Skip to content

[pull] master from tinygrad:master #60

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 28 additions & 17 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -622,10 +622,7 @@ jobs:
wintests:
strategy:
fail-fast: false
matrix:
backend: [llvm]

name: Tests on Windows (${{ matrix.backend }})
name: Tests on Windows (llvm+clang)
runs-on: windows-latest
timeout-minutes: 45
steps:
Expand All @@ -641,21 +638,35 @@ jobs:
uses: actions/cache@v4
with:
path: ${{ env.Python3_ROOT_DIR }}\Lib\site-packages
key: windows-${{ matrix.backend }}-packages-${{ hashFiles('**/setup.py') }}
key: windows-packages-${{ hashFiles('**/setup.py') }}
- name: Set env
shell: bash
run: |
if [ "${{ matrix.backend }}" = "clang" ]; then
echo "CLANG=1" >> $GITHUB_ENV
elif [ "${{ matrix.backend }}" = "llvm" ]; then
echo "LLVM=1" >> $GITHUB_ENV
fi
- name: Install dependencies
run: pip install --user -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Check Device.DEFAULT and print some source
env:
DEBUG: 5
LLVM: 1
PYTHONPATH: ${{ github.workspace }}
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Check Device.DEFAULT and print some source (llvm)
shell: bash
run: |
python3 test/test_ops.py TestOps.test_add
- name: Run pytest
env:
DEBUG: 5
LLVM: 1
run: python -m pytest -n=auto test/test_tiny.py --durations=20
PYTHONPATH=${{ github.workspace }} LLVM=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'LLVM', Device.DEFAULT"
DEBUG=5 PYTHONPATH=${{ github.workspace }} LLVM=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
- name: Check Device.DEFAULT and print some source (clang)
shell: bash
run: |
PYTHONPATH=${{ github.workspace }} CLANG=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'CLANG', Device.DEFAULT"
DEBUG=5 PYTHONPATH=${{ github.workspace }} CLANG=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
- name: Run pytest (llvm)
shell: bash
run: |
DEBUG=5 LLVM=1 python -m pytest -n=auto test/test_tiny.py --durations=20
- name: Run pytest (clang)
shell: bash
run: |
DEBUG=5 CLANG=1 python -m pytest -n=auto test/test_tiny.py --durations=20

#testunicorn:
# name: ARM64 unicorn Test
Expand Down
3 changes: 1 addition & 2 deletions test/unit/test_verify_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,7 @@ def test_const_view_always_valid(self):
buf = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), (), 0)
a = UOp.const(dtypes.int, 0).replace(src=(UOp(Ops.VIEW, dtypes.void, (UOp(Ops.DEVICE, arg="CLANG"),), ShapeTracker.from_shape(())),))
st = UOp.store(buf, ShapeTracker.from_shape(()).to_uop(), a.cast(dtypes.float))
# lowerer asserts because it does not remove ShapeTracker on CONST(VIEW(DEVICE))
with self.assertRaises(AssertionError): helper_test_verify_ast(st)
helper_test_verify_ast(st)

if __name__ == '__main__':
unittest.main()
7 changes: 6 additions & 1 deletion tinygrad/codegen/lowerer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from tinygrad.dtype import dtypes, PtrDType
from tinygrad.ops import KernelInfo, UOp, Ops, graph_rewrite, PatternMatcher, UPat, sint, identity_element, sint_to_uop
from tinygrad.renderer import Renderer
from tinygrad.helpers import all_int, prod, partition, flatten
from tinygrad.helpers import all_int, prod, partition, flatten, unwrap

# returns the axes to create new_shape if new_shape can be created by combining axis from old_shape
def get_contraction(old_shape:tuple[sint, ...], new_shape:tuple[sint, ...]) -> list[list[int]]|None:
Expand Down Expand Up @@ -122,8 +122,13 @@ def lower_load_store(ctx: IndexContext, x: UOp):
if oidx is not ridx: valid = valid * oidx.eq(0)
return UOp(Ops.STORE, dtypes.void, (buf.index(idx, valid), x.src[2]))

def lower_const(x:UOp):
  # Strip the VIEW source off a CONST/DEFINE_VAR so it becomes a plain value in the lowered graph.
  # This is only legal when no view carries a mask; a masked view would imply a VALID guard we'd be dropping.
  unmasked = all(v.mask is None for v in unwrap(x.st).views)
  assert unmasked, f"VIEW in CONST/DEFINE_VAR source must be unmasked, got {x.st}"
  return x.replace(src=())

pm_lowerer = PatternMatcher([
(UPat(Ops.REDUCE_AXIS, name="x"), lower_reduce_axis),
(UPat((Ops.CONST, Ops.DEFINE_VAR), src=(UPat(Ops.VIEW),), name="x"), lower_const),
(UPat(Ops.VALID, src=(UPat(Ops.VIEW),), name="x"), lambda ctx,x: x.st_arg.to_indexed_uops(ctx.idxs)[1]),
# rewrite LOAD/STORE VIEW to LOAD/STORE with indexed
(UPat((Ops.LOAD, Ops.STORE), src=(UPat(), UPat(Ops.VIEW)), allow_any_len=True, name="x"), lower_load_store),
Expand Down
42 changes: 25 additions & 17 deletions tinygrad/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,23 +222,31 @@ def _offset(self, buf, size:int, offset:int): return from_mv(self._as_buffer(buf
class CPUProgram:
helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32' if sys.platform == "win32" else 'gcc_s'))
def __init__(self, name:str, lib:bytes):
assert sys.platform != "win32", "clang is not supported for windows yet"
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)

# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))

self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
if sys.platform == "win32":
PAGE_EXECUTE_READWRITE = 0x40
MEM_COMMIT = 0x1000
MEM_RESERVE = 0x2000
ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_uint64
ptr = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0), ctypes.c_int(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
ctypes.memmove(ptr, lib, len(lib))
self.fxn = ctypes.CFUNCTYPE(None)(ptr)
else:
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)

# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))

self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))

def __call__(self, *bufs, vals=(), wait=False):
args = list(bufs) + list(vals)
Expand Down
2 changes: 1 addition & 1 deletion tinygrad/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def cpu_objdump(lib, objdump_tool='objdump'):
def capstone_flatdump(lib: bytes):
import capstone
match platform.machine():
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'x86_64' | 'AMD64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
for instr in cs.disasm(lib, 0):
Expand Down
5 changes: 3 additions & 2 deletions tinygrad/renderer/cstyle.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Optional, Union, Literal, Callable, cast
import os, math
import os, math, sys
from collections import defaultdict, Counter
from tinygrad.ops import GroupOp, Ops, UOp, PatternMatcher, UPat
from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX
Expand Down Expand Up @@ -178,7 +178,8 @@ class ClangRenderer(CStyleLanguage):
tensor_cores = [TensorCore(dims=(sz,sz,1), threads=1, elements_per_thread=(sz,sz,sz*sz), dtype_in=dt, dtype_out=dt,
swizzle=(None, ((),(4,5,6,7,0,1,2,3))), opts=("u0","u0","u0","u0","u1","u1","u1","u1"))
for dt,sz in [(dt, 64 // dt.itemsize) for dt in [dtypes.float]]]

if sys.platform == 'win32':
kernel_prefix = "__attribute__((ms_abi)) "
def render_vector_prefix(self, dt:DType) -> str:
return f"typedef {self.render_dtype(dt.scalar())} {self.render_dtype(dt)} __attribute__((aligned({(sz:=dt.itemsize)}),vector_size({sz})));"

Expand Down
5 changes: 3 additions & 2 deletions tinygrad/runtime/ops_clang.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import platform, tempfile, pathlib, subprocess
import platform, tempfile, pathlib, subprocess, sys
from tinygrad.helpers import cpu_objdump, capstone_flatdump
from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
from tinygrad.runtime.support.elf import jit_loader
Expand Down Expand Up @@ -26,7 +26,8 @@ def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
def compile(self, src:str) -> bytes:
# -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
# x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
target = 'x86_64' if sys.platform == 'win32' else platform.machine()
args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
return jit_loader(obj)
Expand Down
2 changes: 1 addition & 1 deletion tinygrad/runtime/support/am/amdev.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,8 @@ def __init__(self, devfmt, vram_bar:memoryview, doorbell_bar:memoryview, mmio_ba
if DEBUG >= 2: print(f"am {self.devfmt}: boot done")

def fini(self):
self.smu.set_clocks(perf=False)
for ip in [self.sdma, self.gfx]: ip.fini()
self.smu.set_clocks(perf=False)

def paddr2cpu(self, paddr:int) -> int: return mv_address(self.vram) + paddr
def paddr2mc(self, paddr:int) -> int: return self.gmc.mc_base + paddr
Expand Down
7 changes: 3 additions & 4 deletions tinygrad/viz/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,8 @@
const g = new dagreD3.graphlib.Graph({ compound: true }).setGraph({ rankdir: "LR" }).setDefaultEdgeLabel(function() { return {}; });
g.setNode("addition", {label: "", clusterLabelPos: "top", style: additions.length !== 0 ? "fill: rgba(26, 27, 38, 0.5);" : "display: none;"});
for (const [k,u] of Object.entries(graph)) {
g.setNode(k, {label: u[0], style: `fill: ${u[4]};` });
for (const src of u[2]) {
g.setNode(k, {label: u[0], style: `fill: ${u[2]};` });
for (const src of u[1]) {
g.setEdge(src, k, {curve: d3.curveBasis})
}
if (additions.includes(parseInt(k))) {
Expand Down Expand Up @@ -307,8 +307,7 @@
if (i === currentKernel) {
requestAnimationFrame(() => kernelUl.scrollIntoView({ behavior: "auto", block: "nearest" }));
}
const p = Object.assign(document.createElement("p"), { id: `kernel-${key}`, innerText: key ?? "UNPARENTED",
style: "cursor: pointer;"});
const p = Object.assign(document.createElement("p"), { id: `kernel-${key}`, innerText: key, style: "cursor: pointer;"});
kernelUl.appendChild(p)
items.forEach((u, j) => {
const rwUl = Object.assign(document.createElement("ul"), { innerText: `${toPath(u.loc)} - ${u.match_count}`, key: `uop-rewrite-${j}`,
Expand Down
11 changes: 6 additions & 5 deletions tinygrad/viz/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
# VIZ API

class GraphRewriteMetadata(TypedDict):
loc: tuple[str, int] # [path, lineno] calling graph_rewrite
match_count: int # total match count in this context
loc: tuple[str, int] # [path, lineno] calling graph_rewrite
match_count: int # total match count in this context

class GraphRewriteDetails(GraphRewriteMetadata):
graphs: list[dict] # JSON serialized UOp at every rewrite step
Expand All @@ -36,9 +36,10 @@ def pcall(fxn:Callable[..., str], *args, **kwargs) -> str:
try: return fxn(*args, **kwargs)
except Exception as e: return f"ERROR: {e}"

def uop_to_json(x:UOp) -> dict[int, tuple[str, str, list[int], str, str]]:
def uop_to_json(x:UOp) -> dict[int, tuple[str, list[int], str]]:
assert isinstance(x, UOp)
graph: dict[int, tuple[str, str, list[int], str, str]] = {}
# NOTE: this is [id, [label, src_ids, color]]
graph: dict[int, tuple[str, list[int], str]] = {}
excluded: set[UOp] = set()
for u in (toposort:=x.toposort):
if u.op in {Ops.CONST, Ops.DEVICE}: excluded.update((u,) + u.src)
Expand All @@ -53,7 +54,7 @@ def uop_to_json(x:UOp) -> dict[int, tuple[str, str, list[int], str, str]]:
if x in excluded:
if x.op is Ops.CONST and dtypes.is_float(u.dtype): label += f"\nCONST{idx} {x.arg:g}"
else: label += f"\n{x.op.name}{idx} {x.arg}"
graph[id(u)] = (label, str(u.dtype), [id(x) for x in u.src if x not in excluded], str(u.arg), uops_colors.get(u.op, "#ffffff"))
graph[id(u)] = (label, [id(x) for x in u.src if x not in excluded], uops_colors.get(u.op, "#ffffff"))
return graph

def get_metadata(keys:list[Any], contexts:list[list[TrackedGraphRewrite]]) -> list[tuple[str, list[GraphRewriteMetadata]]]:
Expand Down
Loading