From 7bdd312382f5630de2b98f1dd696b7277ecfa2cf Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Fri, 22 Mar 2024 22:10:18 +0100 Subject: [PATCH 01/10] hw: Keep IO fixed regardless of configuration --- .../src/snitch_cluster_wrapper.sv.tpl | 16 ++++++---------- target/snitch_cluster/test/testharness.sv | 4 ++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index 293417ff68..c40f504406 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -210,28 +210,24 @@ ${ssr_cfg(core, "'{{{indirection:d}, {isect_master:d}, {isect_master_idx:d}, {is ${ssr_cfg(core, '{reg_idx}', '/*None*/ 0', ',')}\ }; + // Forward potentially optional configuration parameters + localparam logic [9:0] CfgBaseHartId = (${to_sv_hex(cfg['cluster_base_hartid'], 10)}); + localparam addr_t CfgClusterBaseAddr = (${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])}); + endpackage // verilog_lint: waive-stop package-filename module ${cfg['name']}_wrapper ( input logic clk_i, input logic rst_ni, -% if cfg['enable_debug']: input logic [${cfg['pkg_name']}::NrCores-1:0] debug_req_i, -% endif input logic [${cfg['pkg_name']}::NrCores-1:0] meip_i, input logic [${cfg['pkg_name']}::NrCores-1:0] mtip_i, input logic [${cfg['pkg_name']}::NrCores-1:0] msip_i, -% if cfg['cluster_base_expose']: input logic [9:0] hart_base_id_i, input logic [${cfg['addr_width']-1}:0] cluster_base_addr_i, -% endif -% if cfg['timing']['iso_crossings']: input logic clk_d2_bypass_i, -% endif -% if cfg['sram_cfg_expose']: input ${cfg['pkg_name']}::sram_cfgs_t sram_cfgs_i, -%endif input ${cfg['pkg_name']}::narrow_in_req_t narrow_in_req_i, output ${cfg['pkg_name']}::narrow_in_resp_t narrow_in_resp_o, output ${cfg['pkg_name']}::narrow_out_req_t narrow_out_req_o, @@ -354,8 +350,8 @@ module ${cfg['name']}_wrapper ( .hart_base_id_i, 
.cluster_base_addr_i, % else: - .hart_base_id_i (${to_sv_hex(cfg['cluster_base_hartid'], 10)}), - .cluster_base_addr_i (${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])}), + .hart_base_id_i (snitch_cluster_pkg::CfgBaseHartId), + .cluster_base_addr_i (snitch_cluster_pkg::CfgClusterBaseAddr), % endif % if cfg['timing']['iso_crossings']: .clk_d2_bypass_i, diff --git a/target/snitch_cluster/test/testharness.sv b/target/snitch_cluster/test/testharness.sv index afc6972ed1..dbde824efc 100644 --- a/target/snitch_cluster/test/testharness.sv +++ b/target/snitch_cluster/test/testharness.sv @@ -29,6 +29,10 @@ module testharness import snitch_cluster_pkg::*; ( .meip_i ('0), .mtip_i ('0), .msip_i (msip), + .hart_base_id_i (CfgBaseHartId), + .cluster_base_addr_i (CfgClusterBaseAddr), + .clk_d2_bypass_i (1'b0), + .sram_cfgs_i (snitch_cluster_pkg::sram_cfgs_t'('0)), .narrow_in_req_i (narrow_in_req), .narrow_in_resp_o (narrow_in_resp), .narrow_out_req_o (narrow_out_req), From ecdc4657dbc78b05657e5bee4608a6e113d3358b Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Fri, 22 Mar 2024 23:03:54 +0100 Subject: [PATCH 02/10] target/snitch_cluster: Add Occamy-like config with SSSRs --- target/snitch_cluster/cfg/sssr.hjson | 153 +++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 target/snitch_cluster/cfg/sssr.hjson diff --git a/target/snitch_cluster/cfg/sssr.hjson b/target/snitch_cluster/cfg/sssr.hjson new file mode 100644 index 0000000000..ee297960a9 --- /dev/null +++ b/target/snitch_cluster/cfg/sssr.hjson @@ -0,0 +1,153 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Occamy-like Cluster configuration (+alias) for SSSR experiments +{ + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 1, + }, + + cluster: { + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + user_width: 5, // clog2(total number of clusters) + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + alias_region_enable: true, + dma_data_width: 512, + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + dma_user_width: 1, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, + // Timing parameters + timing: { + lat_comp_fp32: 2, + lat_comp_fp64: 3, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 2, + lat_sdotp: 3, + fpu_pipe_config: "BEFORE", + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. 
+ register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. + compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + ssr_nr_credits: 4, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + // isa: "rv32ema", + xdma: true, + xssr: false, + xfrep: false, + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } +} 
From 9629cb985f5c7ea03bd2e6cf55a342cd93f5ea67 Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Fri, 22 Mar 2024 23:04:26 +0100 Subject: [PATCH 03/10] sw: Add SARIS kernels --- sw/saris/.gitignore | 3 + sw/saris/Makefile | 126 +++++ sw/saris/README.md | 1 + sw/saris/eval.json | 396 ++++++++++++++ sw/saris/runtime/crt0.S | 159 ++++++ sw/saris/runtime/dma.h | 75 +++ sw/saris/runtime/link.ld | 42 ++ sw/saris/runtime/runtime.h | 137 +++++ sw/saris/runtime/runtime.hpp | 20 + sw/saris/runtime/sssr.h | 189 +++++++ sw/saris/stencils/istc.common.hpp | 181 ++++++ sw/saris/stencils/istc.issr.hpp | 879 ++++++++++++++++++++++++++++++ sw/saris/stencils/istc.par.hpp | 239 ++++++++ sw/saris/util/eval.cpp.tpl | 55 ++ sw/saris/util/evalgen.py | 312 +++++++++++ 15 files changed, 2814 insertions(+) create mode 100644 sw/saris/.gitignore create mode 100644 sw/saris/Makefile create mode 100644 sw/saris/README.md create mode 100644 sw/saris/eval.json create mode 100644 sw/saris/runtime/crt0.S create mode 100644 sw/saris/runtime/dma.h create mode 100644 sw/saris/runtime/link.ld create mode 100644 sw/saris/runtime/runtime.h create mode 100644 sw/saris/runtime/runtime.hpp create mode 100644 sw/saris/runtime/sssr.h create mode 100644 sw/saris/stencils/istc.common.hpp create mode 100644 sw/saris/stencils/istc.issr.hpp create mode 100644 sw/saris/stencils/istc.par.hpp create mode 100644 sw/saris/util/eval.cpp.tpl create mode 100644 sw/saris/util/evalgen.py diff --git a/sw/saris/.gitignore b/sw/saris/.gitignore new file mode 100644 index 0000000000..7d0ba6408d --- /dev/null +++ b/sw/saris/.gitignore @@ -0,0 +1,3 @@ +bin +dump +gen diff --git a/sw/saris/Makefile b/sw/saris/Makefile new file mode 100644 index 0000000000..e9bfb82500 --- /dev/null +++ b/sw/saris/Makefile @@ -0,0 +1,126 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +# Paul Scheffler +# Luca Colagrande + +all: + +############### +# Environment # +############### + +# NOTE: This needs to be a specific revision of PULP RISCV LLVM 15: +# TODO: add commit link here +LLVM_BINROOT ?= /home/paulsc/dev/llvm-ssr/llvm-iis/install/bin +PYTHON3 ?= python3 + +SARISDIR ?= . +GENDIR ?= $(SARISDIR)/gen +UTILDIR ?= $(SARISDIR)/util +BINDIR ?= $(SARISDIR)/bin +DUMPDIR ?= $(SARISDIR)/dump +RTDIR ?= $(SARISDIR)/runtime + +# We depend on the printf submodule +PRINTFDIR ?= $(SARISDIR)/../deps/printf + +############################ +# Compiler (LLVM 15) Setup # +############################ + +RISCV_MARCH ?= \ +rv32imafd_zfh_xfrep_xssr_xdma_xfalthalf_xfquarter_xfaltquarter_xfvecsingle_xfvechalf_$\ +xfvecalthalf_xfvecquarter_xfvecaltquarter_xfauxhalf_xfauxalthalf_xfauxquarter_xfauxaltquarter_$\ +xfauxvecsingle_xfauxvechalf_xfauxvecalthalf_xfauxvecquarter_xfauxvecaltquarter_xfexpauxvechalf_$\ +xfexpauxvecalthalf_xfexpauxvecquarter_xfexpauxvecaltquarter + +RISCV_MABI ?= ilp32d + +RISCV_CC ?= $(LLVM_BINROOT)/clang +RISCV_CXX ?= $(LLVM_BINROOT)/clang++ +RISCV_OBJDUMP ?= $(LLVM_BINROOT)/llvm-objdump +RISCV_STRIP ?= $(LLVM_BINROOT)/llvm-strip + +RISCV_STACK ?= 2048 +RISCV_FLAGS ?= -mcpu=snitch -march=$(RISCV_MARCH) -Ofast -flto -mabi=$(RISCV_MABI) \ + -Wframe-larger-than=$(RISCV_STACK) -nostdlib -mcmodel=medany -I$(RTDIR) \ + -I$(SARISDIR)/stencils -I$(PRINTFDIR) -ffreestanding -fno-builtin \ + -ffunction-sections + +RISCV_CFLAGS ?= $(RISCV_FLAGS) +# Loop unrolling optimization +RISCV_CFLAGS += -mllvm --allow-unroll-and-jam +RISCV_CFLAGS += -mllvm --unroll-allow-partial +RISCV_CFLAGS += -mllvm --unroll-runtime +# Tree height reduction options +RISCV_CFLAGS += -mllvm --enable-fp-thr +RISCV_CFLAGS += -mllvm --thr-max-depth=5 +RISCV_CFLAGS += -mllvm --thr-se-leaves +RISCV_CFLAGS += -mllvm --thr-fuse-bias +RISCV_CFLAGS += -mllvm --thr-se-factor=2 +RISCV_CFLAGS += -mllvm --thr-re-factor=1 +# Machine scheduler and PostRA 
options +RISCV_CFLAGS += -mllvm --post-RA-scheduler +RISCV_CFLAGS += -mllvm --enable-misched +RISCV_CFLAGS += -mllvm --enable-post-misched +RISCV_CFLAGS += -mllvm --misched-postra + +RISCV_CCFLAGS ?= $(RISCV_CFLAGS) -std=gnu11 +RISCV_CXXFLAGS ?= $(RISCV_CFLAGS) -std=gnu++14 +RISCV_LDFLAGS ?= -fuse-ld=$(LLVM_BINROOT)/ld.lld -flto -static -lm $(RISCV_FLAGS) \ + -Wl,--fatal-warnings -Wl,-z,stack-size=$(RISCV_STACK) +RISCV_DMPFLAGS ?= --mcpu=snitch + +############################ +# SARIS Program Build Flow # +############################ + +.SECONDEXPANSION: +.DELETE_ON_ERROR: + +# Extracting word nr. $(1) from $(2)-separated list $(3) +pw = $(word $(1), $(subst $(2), ,$(3))) + +$(GENDIR) $(BINDIR) $(DUMPDIR): + mkdir -p $@ + +$(BINDIR)/crt0.o: $(SARISDIR)/runtime/crt0.S | $(BINDIR) + $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@ + +$(BINDIR)/istc.%.c.o: $(GENDIR)/$$(call pw,1,.,$$*).cpp | $(BINDIR) + $(RISCV_CXX) $(RISCV_CXXFLAGS) -c $< -o $@ + +.PRECIOUS: $(BINDIR)/%.elf +$(BINDIR)/istc.%.elf: $(BINDIR)/istc.%.c.o $(BINDIR)/crt0.o $(RTDIR)/link.ld | $(BINDIR) + $(RISCV_CC) $(RISCV_LDFLAGS) -o $@ $< $(BINDIR)/crt0.o -T$(RTDIR)/link.ld + $(RISCV_STRIP) $@ -g -S -d --strip-debug -R .comment -R .riscv.attributes + +.PRECIOUS: $(DUMPDIR)/%.dump +$(DUMPDIR)/%.dump: $(BINDIR)/%.elf | $(DUMPDIR) + @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .text -d $< >$@ + @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .misc -s $< | tail -n +3 >>$@ + @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .tcdm -s $< | tail -n +3 >>$@ + @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .tcdmc -s $< | tail -n +3 >>$@ + +# Phony for program and dump build +prog.%: $(BINDIR)/%.elf $(DUMPDIR)/%.dump + @echo -e '\x1b[44;33;1mBUILT: $*\x1b[0m' + +clean: + rm -rf $(BINDIR) $(DUMPDIR) $(GENDIR) + +############################ +# SARIS Program Generation # +############################ + +.PRECIOUS: $(GENDIR)/%.cpp +$(GENDIR)/%.cpp: $(UTILDIR)/evalgen.py $(SARISDIR)/eval.json $(UTILDIR)/eval.cpp.tpl | $(GENDIR) + $(PYTHON3) $^ $* > $@ + 
+EVAL_NAMES ?= $(shell jq -r 'keys | join(" ")' $(SARISDIR)/eval.json) +ISTC_PROGS += $(patsubst %,istc.%,$(EVAL_NAMES)) + +# Default: compile all SARIS programs in eval.json +all: $(addprefix prog.,$(ISTC_PROGS)) diff --git a/sw/saris/README.md b/sw/saris/README.md new file mode 100644 index 0000000000..464090415c --- /dev/null +++ b/sw/saris/README.md @@ -0,0 +1 @@ +# TODO diff --git a/sw/saris/eval.json b/sw/saris/eval.json new file mode 100644 index 0000000000..f1b102588b --- /dev/null +++ b/sw/saris/eval.json @@ -0,0 +1,396 @@ +{ + + "pb_jacobi_2d_ml_par": { + "radius": 1, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "kernels": [ + [1, "istcp_pb_jacobi_2d(core_id, &Ap2ml, &Bp2ml)"], + [1, "istcp_pb_jacobi_2d(core_id, &Ap2ml, &Bp2ml)"] + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "pb_jacobi_2d_ml_issr": { + "radius": 1, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "kernels": [ + [1, "istci_pb_jacobi_2d(core_id, &Ai2ml, &Bi2ml)"], + [1, "istci_pb_jacobi_2d(core_id, &Ai2ml, &Bi2ml)"] + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_j2d5pt_ml_par": { + "radius": 1, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "kernels": [ + [1, 
"istcp_an5d_j2d5pt(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_j2d5pt(core_id, &Ap22ml[0])"] + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_j2d5pt_ml_issr": { + "radius": 1, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "kernels": [ + [1, "istci_an5d_j2d5pt(core_id, &Ai22ml[0])"], + [1, "istci_an5d_j2d5pt(core_id, &Ai22ml[0])"] + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_j2d9pt_ml_par": { + "radius": 2, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "kernels": [ + [1, "istcp_an5d_j2d9pt(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_j2d9pt(core_id, &Ap22ml[0])"] + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_j2d9pt_ml_issr": { + "radius": 2, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "kernels": [ + [1, "istci_an5d_j2d9pt(core_id, &Ai22ml[0])"], + [1, "istci_an5d_j2d9pt(core_id, &Ai22ml[0])"] + + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_j2d9pt_gol_ml_par": { + "radius": 1, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": 
["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "kernels": [ + [1, "istcp_an5d_j2d9pt_gol(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_j2d9pt_gol(core_id, &Ap22ml[0])"] + + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_j2d9pt_gol_ml_issr": { + "radius": 1, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "kernels": [ + [1, "istci_an5d_j2d9pt_gol(core_id, &Ai22ml[0])"], + [1, "istci_an5d_j2d9pt_gol(core_id, &Ai22ml[0])"] + + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_j3d27pt_ml_par": { + "radius": 1, + "grids": { + "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]}, + "kernels": [ + [1, "istcp_an5d_j3d27pt(core_id, &Ap32ml[0])"], + [1, "istcp_an5d_j3d27pt(core_id, &Ap32ml[0])"] + ], + "touch": ["Ap3ml", "Bp3ml"], + "dma": ["Cp3ml", "Dp3xl"] + }, + + "an5d_j3d27pt_ml_issr": { + "radius": 1, + "grids": { + "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]}, + "kernels": [ + [1, 
"istci_an5d_j3d27pt(core_id, &Ai32ml[0])"], + [1, "istci_an5d_j3d27pt(core_id, &Ai32ml[0])"] + + ], + "touch": ["Ai3ml", "Bi3ml"], + "dma": ["Ci3ml", "Di3xl"] + }, + + + + "an5d_star2d3r_ml_par": { + "radius": 3, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "params": {"r": 3}, + "kernels": [ + [1, "istcp_an5d_star2dXr(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_star2dXr(core_id, &Ap22ml[0])"] + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_star2d3r_ml_issr": { + "radius": 3, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "params": {"r": 3}, + "kernels": [ + [1, "istci_an5d_star2dXr(core_id, &Ai22ml[0])"], + [1, "istci_an5d_star2dXr(core_id, &Ai22ml[0])"] + + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_box2d1r_ml_par": { + "radius": 1, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "params": {"r": 1}, + "kernels": [ + [1, "istcp_an5d_box2dXr(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_box2dXr(core_id, &Ap22ml[0])"] + + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_box2d1r_ml_issr": { + "radius": 1, + "grids": { + "Ai2ml": {"seed": 
1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "params": {"r": 1}, + "kernels": [ + [1, "istci_an5d_box2dXr(core_id, &Ai22ml[0])"], + [1, "istci_an5d_box2dXr(core_id, &Ai22ml[0])"] + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_star3d2r_ml_par": { + "radius": 2, + "grids": { + "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]}, + "params": {"r": 2}, + "kernels": [ + [1, "istcp_an5d_star3dXr(core_id, &Ap32ml[0])"], + [1, "istcp_an5d_star3dXr(core_id, &Ap32ml[0])"] + ], + "touch": ["Ap3ml", "Bp3ml"], + "dma": ["Cp3ml", "Dp3xl"] + }, + + "an5d_star3d2r_ml_issr": { + "radius": 2, + "grids": { + "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]}, + "params": {"r": 2}, + "kernels": [ + [1, "istci_an5d_star3dXr(core_id, &Ai32ml[0])"], + [1, "istci_an5d_star3dXr(core_id, &Ai32ml[0])"] + + ], + "touch": ["Ai3ml", "Bi3ml"], + "dma": ["Ci3ml", "Di3xl"] + }, + + + + "an5d_box3d1r_ml_par": { + "radius": 1, + "grids": { + "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", 
"3ml"], "attrs": "TCDMDECL"}, + "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]}, + "params": {"r": 1}, + "kernels": [ + [1, "istcp_an5d_box3dXr(core_id, &Ap32ml[0])"], + [1, "istcp_an5d_box3dXr(core_id, &Ap32ml[0])"] + ], + "touch": ["Ap3ml", "Bp3ml"], + "dma": ["Cp3ml", "Dp3xl"] + }, + + "an5d_box3d1r_ml_issr": { + "radius": 1, + "grids": { + "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]}, + "params": {"r": 1}, + "kernels": [ + [1, "istci_an5d_box3dXr(core_id, &Ai32ml[0])"], + [1, "istci_an5d_box3dXr(core_id, &Ai32ml[0])"] + ], + "touch": ["Ai3ml", "Bi3ml"], + "dma": ["Ci3ml", "Di3xl"] + }, + + + + "minimod_acoustic_iso_cd_ml_par": { + "radius": 4, + "grids": { + "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Dp3ml": {"seed": 1339, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "F3ml": {"seed": 1338, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "G3ml": {"seed": 1340, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Hp3xl": {"seed": 1341, "dims": ["3xl", "3xl", "3xl"]}, + "I3xl": {"seed": 1342, "dims": [16, 16, 16]}, + "Ep3ml": {"seed": 1343, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Jp3xl": {"seed": 1344, "dims": [16, 16, 16]} + }, + "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]}, + "kernels": [ + [1, "istcp_minimod_acoustic_iso_cd(core_id, &Ap32ml[0], &F3ml)"], + [1, "istcp_minimod_acoustic_iso_cd(core_id, &Ap32ml[0], &F3ml)"] + ], + "touch": ["Ap3ml", "Bp3ml", "F3ml", "Dp3ml"], + "dma": [ + ["Cp3ml", "Hp3xl", "out"], + ["G3ml", "I3xl", "in", 0], 
+ ["Cp3ml", "Hp3xl", "in"], + ["Ep3ml", "Jp3xl", "in"] + ] + }, + + "minimod_acoustic_iso_cd_ml_issr": { + "radius": 4, + "grids": { + "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Di3ml": {"seed": 1339, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "F3ml": {"seed": 1338, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "G3ml": {"seed": 1340, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Hp3xl": {"seed": 1341, "dims": ["3xl", "3xl", "3xl"]}, + "I3xl": {"seed": 1342, "dims": [16, 16, 16]}, + "Ep3ml": {"seed": 1343, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Jp3xl": {"seed": 1344, "dims": [16, 16, 16]} + }, + "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]}, + "kernels": [ + [1, "istci_minimod_acoustic_iso_cd(core_id, &Ai32ml[0], &F3ml)"], + [1, "istci_minimod_acoustic_iso_cd(core_id, &Ai32ml[0], &F3ml)"] + ], + "touch": ["Ai3ml", "Bi3ml", "F3ml", "Di3ml"], + "dma": [ + ["Cp3ml", "Hp3xl", "out"], + ["G3ml", "I3xl", "in", 0], + ["Cp3ml", "Hp3xl", "in"], + ["Ep3ml", "Jp3xl", "in"] + ] + } + +} diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S new file mode 100644 index 0000000000..79efb0cbbe --- /dev/null +++ b/sw/saris/runtime/crt0.S @@ -0,0 +1,159 @@ +# HTIF sections +.pushsection .htif,"aw",@progbits; +.align 6; .global tohost; tohost: .dword 0; +.align 6; .global fromhost; fromhost: .dword 0; + +.globl _start +.section .text._start +_start: + # Set global pointer + .option push + .option norelax + la gp, __global_pointer + .option pop + + # Prepare main arguments for single cluster + csrr a0, mhartid + la a1, __const_num_cores + la a2, __const_tcdm_start + la a3, __const_tcdm_end + + # Set stack pointer; 1KiB per core + # Offset by 8B to prevent bank collisions + slli t0, a0, 10 + addi sp, a3, -8 + sub sp, sp, t0 + slli t0, a0, 3 + sub sp, sp, t0 + + # check if the core has the 
F-extension + csrr t0, misa + andi t0, t0, (1 << 5) + beqz t0, _clr_ireg + +_skip_dmcc_work: + # Skip the coming two steps unless we are the DMA core + # NOTE: this assumes the DMA core being the last in the cluster + addi t0, a1, -1 + bne a0, t0, _dmcc_work_sync + +_preload_tcdm: + # Preload thread-local storage (TCDM) using DMA + la t0, __const_tcdm_losta + la t1, __const_tcdm_loend + sub t3, t1, t0 + # Branch off if no tcdm data + beqz t3, _preload_tcdmc + # Launch copy to base of TCDM + dmsrc t0, zero + dmdst a2, zero + dmcpyi zero, t3, 0 + # Await DMA + 1: + dmstati t0, 2 + bnez t0, 1b + +_preload_tcdmc: + # Preload thread-local storage (TCDM) using DMA + la t0, __const_tcdmc_losta + la t1, __const_tcdmc_loend + sub t3, t1, t0 + # Get tcdmc base, branch off if no tcdmc data + la t2, __const_tcdm_startc + beqz t3, _dmcc_work_sync + # Launch copy to past end of TCDM + dmsrc t0, zero + dmdst t2, zero + dmcpyi zero, t3, 0 + # Await DMA + 1: + dmstati t0, 2 + bnez t0, 1b + +_dmcc_work_sync: + # Synchronize cores so data is ready + csrr x0, 0x7C2 + + # Reset float regs if present +_clr_freg: + fcvt.d.w f0, x0 + fmv.d f1, f0 + fmv.d f2, f0 + fmv.d f3, f0 + fmv.d f4, f0 + fmv.d f5, f0 + fmv.d f6, f0 + fmv.d f7, f0 + fmv.d f8, f0 + fmv.d f9, f0 + fmv.d f10, f0 + fmv.d f11, f0 + fmv.d f12, f0 + fmv.d f13, f0 + fmv.d f14, f0 + fmv.d f15, f0 + fmv.d f16, f0 + fmv.d f17, f0 + fmv.d f18, f0 + fmv.d f19, f0 + fmv.d f20, f0 + fmv.d f10, f0 + fmv.d f21, f0 + fmv.d f22, f0 + fmv.d f23, f0 + fmv.d f24, f0 + fmv.d f25, f0 + fmv.d f26, f0 + fmv.d f27, f0 + fmv.d f28, f0 + fmv.d f29, f0 + fmv.d f30, f0 + fmv.d f31, f0 + + # Reset remaining int regs +_clr_ireg: + li tp, 0 + li t0, 0 + li t1, 0 + li t2, 0 + li t3, 0 + li t4, 0 + li t5, 0 + li t6, 0 + li a6, 0 + li a7, 0 + li s0, 0 + li s1, 0 + li s2, 0 + li s3, 0 + li s4, 0 + li s5, 0 + li s6, 0 + li s7, 0 + li s8, 0 + li s9, 0 + li s10, 0 + li s11, 0 + + # Call main + call smain + +_eoc: + # Synchronize cores + csrr x0, 0x7C2 + # 
Only core 0 (of all cores) returns + csrr t0, mhartid + bnez t0, _done + # Write termination bit and return code (a0) to tohost + slli a0, a0, 1 + ori a0, a0, 1 + la t0, tohost + sw a0, 0(t0) + # Go to sleep +_done: + wfi + + +.globl _putcb +.section .data._putcb +_putcb: diff --git a/sw/saris/runtime/dma.h b/sw/saris/runtime/dma.h new file mode 100644 index 0000000000..80956b0f73 --- /dev/null +++ b/sw/saris/runtime/dma.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include + +// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers. +static inline uint32_t __rt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, + size_t size) { + register uint32_t reg_txid; // 10 + asm volatile("dmsrc %[sl], %[sh]" :: [sh]"r"(src >> 32), [sl]"r"(src)); + asm volatile("dmdst %[dl], %[dh]" :: [dh]"r"(dst >> 32), [dl]"r"(dst)); + asm volatile("dmcpyi %[id], %[sz], 0" : [id]"=r"(reg_txid) : [sz]"r"(size)); + return reg_txid; +} + +// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers. +static inline uint32_t __rt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, + size_t size, size_t dst_stride, + size_t src_stride, size_t repeat) { + register uint32_t reg_txid; // 10 + asm volatile("dmsrc %[sl], %[sh]" :: [sh]"r"(src >> 32), [sl]"r"(src)); + asm volatile("dmdst %[dl], %[dh]" :: [dh]"r"(dst >> 32), [dl]"r"(dst)); + asm volatile("dmstr %[rd], %[rs]" :: [rd]"r"(dst_stride), [rs]"r"(src_stride)); + asm volatile("dmrep %[rp]" :: [rp]"r"(repeat)); + asm volatile("dmcpyi %[id], %[sz], 2" : [id]"=r"(reg_txid) : [sz]"r"(size)); + return reg_txid; +} + +// Initiate an asynchronous 1D DMA transfer. +static inline uint32_t __rt_dma_start_1d(void *dst, const void *src, size_t size) { + return __rt_dma_start_1d_wideptr((size_t)dst, (size_t)src, size); +} + +// Initiate an asynchronous 2D DMA transfer. 
+static inline uint32_t __rt_dma_start_2d(void *dst, const void *src, size_t size, + size_t dst_stride, size_t src_stride, + size_t repeat) { + return __rt_dma_start_2d_wideptr((size_t)dst, (size_t)src, size, dst_stride, + src_stride, repeat); +} + +// Last completed ID +static inline volatile uint32_t __rt_dma_completed_id() { + register uint32_t cid; + asm volatile( + "dmstati %[cid], 0 \n " // 0=status.completed_id + : [cid]"=&r"(cid) :: "memory" + ); + // TODO: Fix off-by-one bug in DMA hardware! + return cid+1; +} + +// Block until a transfer finishes. +static inline void __rt_dma_wait(uint32_t tid) { + register uint32_t tmp; + // TODO: Fix off-by-one bug in DMA hardware! + tid++; + asm volatile( + "1: \n" + "dmstati %[tmp], 0 \n " // 0=status.completed_id + "bgt %[tid], %[tmp], 1b \n" // branch back if ID to wait for > last completed ID + : [tmp]"=&r"(tmp) : [tid]"r"(tid) + ); +} + +// Block until all operations on the DMA cease. +static inline void __rt_dma_wait_all() { + register uint32_t tmp; + asm volatile( + "1: \n" + "dmstati %[tmp], 2 \n " // 2=status.busy + "bne %[tmp], zero, 1b \n" + : [tmp]"=&r"(tmp) : + ); +} diff --git a/sw/saris/runtime/link.ld b/sw/saris/runtime/link.ld new file mode 100644 index 0000000000..5788547bdd --- /dev/null +++ b/sw/saris/runtime/link.ld @@ -0,0 +1,42 @@ +OUTPUT_ARCH( "riscv" ) +ENTRY(_start) + +MEMORY +{ + /* Reserve upper 9*1Ki = 9Ki of TCDM for stack, plus some padding. + This can be expanded to allocate the full CC TCDM as needed. + A 2 KiB RO is provided in the TCDM for small data + consts. */ + tcdm (rw) : ORIGIN = 0x10000000, LENGTH = 0x1CC00 + tcdmc (r) : ORIGIN = 0x1001CC00, LENGTH = 2K + dram (rwxa) : ORIGIN = 0x90000000, LENGTH = 1024M + dtxt (rwxa) : ORIGIN = 0x80000000, LENGTH = 1024M +} + +SECTIONS +{ + /DISCARD/ : { *(.riscv.attributes) *(.comment) *(.rela.*) *(.sym.*) } + + .text : { *(.text._start) *(.text) *(.text.*); . 
= ALIGN(16); } >dtxt + .misc : { *(.data) *(.data.*) *(.putcb) } >dram + .tcdm : { *(.tcdm) *(.l1) } >tcdm AT>dram + .tcdmc : { *(.sdata) *(.sdata.*) *(.rodata) *(.rodata.*) } >tcdmc AT>dram + + /* Global and stack pointer: in TCDM */ + __global_pointer = ADDR(.tcdmc) + SIZEOF(.tcdmc) / 2; + + /* Memory Layout Constants */ + __const_num_cores = 9; + __const_tcdm_start = ORIGIN(tcdm); + __const_tcdm_startc = ORIGIN(tcdmc); + __const_tcdm_end = ORIGIN(tcdm) + 128K; + __const_dram_start = ORIGIN(dram); + + /* TCDM Loading */ + __const_tcdm_losta = LOADADDR(.tcdm); + __const_tcdm_loend = LOADADDR(.tcdm) + SIZEOF(.tcdm); + __const_tcdmc_losta = LOADADDR(.tcdmc); + __const_tcdmc_loend = LOADADDR(.tcdmc) + SIZEOF(.tcdmc); + + /* HTIF section for FESVR */ + .htif : { } >dram +} diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h new file mode 100644 index 0000000000..883bacb2ae --- /dev/null +++ b/sw/saris/runtime/runtime.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include "dma.h" +#include "sssr.h" + +#define PRINTF_NTOA_BUFFER_SIZE 12 +#define PRINTF_DISABLE_SUPPORT_LONG_LONG 1 + +#include "printf.h" + +extern uintptr_t volatile tohost, fromhost; + +extern void *__const_tcdm_start; +extern void *__const_dram_start; + +// Use this to identify and differentiate TCDM data and pointers +#define TCDMSPC __attribute__((address_space(1))) +#define TCDMSEC __attribute__((section(".l1"))) +#define TCDM TCDMSPC +#define TCDMDECL TCDMSPC TCDMSEC + +static inline volatile uint32_t __rt_get_hartid() { + uint32_t register r; + asm volatile ("csrr %0, mhartid" : "=r"(r)); + return r; +} +// Rudimentary string buffer for putchar calls. 
+extern uint32_t _putcb;
+#define PUTC_BUFFER_LEN (1024 - sizeof(size_t))
+
+// Per-hart putchar buffer header: fill level plus scratch words used to
+// assemble the HTIF syscall descriptor on flush.
+typedef struct {
+    size_t size;
+    uint64_t syscall_mem[8];
+} putc_buffer_header_t;
+
+typedef struct {
+    putc_buffer_header_t hdr;
+    char data[PUTC_BUFFER_LEN];
+} putc_buffer_t;
+
+// Array of per-hart buffers backed by the `_putcb` symbol; indexed by mhartid
+// in _putchar below. NOTE(review): assumes the linker reserves one
+// putc_buffer_t per hart at `_putcb` -- confirm against the .putcb section.
+static volatile putc_buffer_t *const putc_buffer = (putc_buffer_t *const)(void *)&_putcb;
+
+// Provide an implementation for putchar: buffer the character, then flush the
+// buffer to the host via an HTIF sys_write syscall when it is full or on '\n'.
+void _putchar(char character) {
+    volatile putc_buffer_t *buf = &putc_buffer[__rt_get_hartid()];
+    buf->data[buf->hdr.size++] = character;
+    if (buf->hdr.size == PUTC_BUFFER_LEN || character == '\n') {
+        buf->hdr.syscall_mem[0] = 64;                    // sys_write
+        buf->hdr.syscall_mem[1] = 1;                     // file descriptor (1 = stdout)
+        buf->hdr.syscall_mem[2] = (uintptr_t)&buf->data; // buffer
+        buf->hdr.syscall_mem[3] = buf->hdr.size;         // length
+
+        // Hand the descriptor to the host and spin until it acknowledges.
+        tohost = (uintptr_t)buf->hdr.syscall_mem;
+        while (fromhost == 0)
+            ;
+        fromhost = 0;
+
+        buf->hdr.size = 0;
+    }
+}
+
+// Print a (null-terminated) string
+static inline void __rt_print(const char* buf) {
+    for (; *buf; ++buf) _putchar(*buf);
+}
+
+// Print a decimal number
+static inline void __rt_print_dec_uint(uint32_t val) {
+    const int DEC_BUF_LEN = 10;
+    char out [DEC_BUF_LEN];
+    int out_msd;  // index of the most significant digit captured below
+    int i;
+    // Capture digits from least to most significant, right-aligned in `out`.
+    for (i=DEC_BUF_LEN-2; i >= 0; --i) {
+        char digit = (val % 10);
+        out[i] = digit + '0';
+        val /= 10;
+        out_msd = i;
+        if (val == 0) break;
+    }
+    out[DEC_BUF_LEN-1] = '\0';
+    // Print digits starting at the most significant one.
+    __rt_print(out + out_msd);
+}
+
+// Cluster-local barrier
+static inline void __rt_barrier() {
+    asm volatile("csrr x0, 0x7C2" ::: "memory");
+}
+
+// Full memory fence
+static inline void __rt_fence() {
+    asm volatile("fence" ::: "memory");
+}
+
+#define __RT_FPU_FENCE "fmv.x.w zero, fa0\n"
+
+// Fence waiting for FPU to catch up to core
+static inline void __rt_fpu_fence() {
+    asm volatile(__RT_FPU_FENCE ::: "memory");
+}
+
+// Full FPU fence: round-trips fa0 through an integer register so the core
+// stalls until the FPU has caught up. NOTE(review): the original comment said
+// "Cluster-local barrier", which mismatches the asm below -- this synchronizes
+// core and FPU only, not other harts.
+static inline void __rt_fpu_fence_full() {
+    uint32_t register tmp;
+    asm volatile (
+        
"fmv.x.w %[tmp], fa0 \n" + "mv zero, %[tmp] \n" + : [tmp]"=r"(tmp) :: "memory" + ); +} + +// Memcopy using FPU +static inline void __rt_memcpy_fpu(double* dst, double* src, size_t lend) { + #pragma clang loop unroll_count(8) + for (int i = 0; i < lend; i++) + *(volatile double*)(dst + i) = *(volatile double*)(src + i); +} + +// Monotonically increasing cycle count +static inline volatile uint32_t __rt_get_timer() { + uint32_t register r; + asm volatile ("csrr %0, mcycle" : "=r"(r)); + return r; +} + +// Sleep for multiples of 10 (Deca) cycles +static inline void __rt_shortsleep(uint32_t Dcycles) { + for (int i = 0; i < Dcycles; ++i) { + asm volatile ("nop; nop; nop; nop; nop; nop; nop; nop; nop; nop" ::: "memory"); + } +} + +// Include putchar code directly (header-only implementation) +#include "printf.c" diff --git a/sw/saris/runtime/runtime.hpp b/sw/saris/runtime/runtime.hpp new file mode 100644 index 0000000000..df501ff20e --- /dev/null +++ b/sw/saris/runtime/runtime.hpp @@ -0,0 +1,20 @@ +#pragma once + +// C linkage macros +#ifdef __cplusplus +#define EXTERN_C extern "C" +#define EXTERN_C_BEGIN extern "C" { +#define EXTERN_C_END } +#else +#define EXTERN_C +#define EXTERN_C_BEGIN +#define EXTERN_C_END +#endif + +// Include C runtime, ignoring benign CXX-only warnings +EXTERN_C_BEGIN +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-register" +#include "runtime.h" +#pragma GCC diagnostic pop +EXTERN_C_END diff --git a/sw/saris/runtime/sssr.h b/sw/saris/runtime/sssr.h new file mode 100644 index 0000000000..171ccb454f --- /dev/null +++ b/sw/saris/runtime/sssr.h @@ -0,0 +1,189 @@ +#pragma once + +// Registers +#define __RT_SSSR_REG_STATUS 0 +#define __RT_SSSR_REG_REPEAT 1 + +#define __RT_SSSR_REG_BOUND_0 2 +#define __RT_SSSR_REG_BOUND_1 3 +#define __RT_SSSR_REG_BOUND_2 4 +#define __RT_SSSR_REG_BOUND_3 5 + +#define __RT_SSSR_REG_STRIDE_0 6 +#define __RT_SSSR_REG_STRIDE_1 7 +#define __RT_SSSR_REG_STRIDE_2 8 +#define __RT_SSSR_REG_STRIDE_3 9 + 
+#define __RT_SSSR_REG_IDX_CFG 10 +#define __RT_SSSR_REG_IDX_BASE 11 +#define __RT_SSSR_REG_IDX_ISECT 12 + +#define __RT_SSSR_REG_RPTR_INDIR 16 +#define __RT_SSSR_REG_RPTR_SLV 17 +#define __RT_SSSR_REG_RPTR_MST_NOSLV 18 +#define __RT_SSSR_REG_RPTR_MST_TOSLV 19 + +#define __RT_SSSR_REG_WPTR_INDIR 20 +#define __RT_SSSR_REG_WPTR_SLV 21 +#define __RT_SSSR_REG_WPTR_MST_NOSLV 22 +#define __RT_SSSR_REG_WPTR_MST_TOSLV 23 + +#define __RT_SSSR_REG_RPTR_0 24 +#define __RT_SSSR_REG_RPTR_1 25 +#define __RT_SSSR_REG_RPTR_2 26 +#define __RT_SSSR_REG_RPTR_3 27 + +#define __RT_SSSR_REG_WPTR_0 28 +#define __RT_SSSR_REG_WPTR_1 29 +#define __RT_SSSR_REG_WPTR_2 30 +#define __RT_SSSR_REG_WPTR_3 31 + +// Enable and disable +#define __RT_SSSR_ENABLE "csrsi 0x7C0, 1\n" +#define __RT_SSSR_DISABLE "csrci 0x7C0, 1\n" + +// Write configuration registers +// To write to all SSRs, use ssridx=31 +#define __RT_SSSR_IDXALL 31 +#define __RT_SSSR_SCFGWI_INT(valreg,ssridx,regidx) "scfgwi "#valreg", "#ssridx" | "#regidx"<<5\n" +#define __RT_SSSR_SCFGWI(valreg,ssridx,regname) __RT_SSSR_SCFGWI_INT(valreg,ssridx,regname) + +// Read configuration registers +#define __RT_SSSR_SCFGRI_INT(valreg,ssridx,regidx) "scfgri "#valreg", "#ssridx" | "#regidx"<<5\n" +#define __RT_SSSR_SCFGRI(valreg,ssridx,regname) __RT_SSSR_SCFGRI_INT(valreg,ssridx,regname) + +// Assemble index configuration word +#define __RT_SSSR_IDXSIZE_U8 0 +#define __RT_SSSR_IDXSIZE_U16 1 +#define __RT_SSSR_IDXSIZE_U32 2 +#define __RT_SSSR_IDXSIZE_U64 3 +#define __RT_SSSR_IDX_NOMERGE 0 +#define __RT_SSSR_IDX_MERGE 1 +#define __RT_SSSR_IDX_CFG(size,shift,flags) (((flags & 0xFFFF)<<16) | ((shift & 0xFF)<<8) | (size & 0xFF) ) + +// Block until job is done +// TODO: Replace with (shadowed) blocking read or write +#define __RT_SSSR_WAIT_DONE(tempreg, ssridx) \ + "1:" __RT_SSSR_SCFGRI(tempreg,ssridx,__RT_SSSR_REG_STATUS) \ + "srli "#tempreg", "#tempreg", 31 \n" \ + "beqz "#tempreg", 1b \n" + +// Allocates the specified registers and fakes them as +// 
outputs of an SSSR enable, enforcing an order. +#define __RT_SSSR_BLOCK_BEGIN \ + { \ + register double _rt_sssr_0 asm("ft0"); \ + register double _rt_sssr_1 asm("ft1"); \ + register double _rt_sssr_2 asm("ft2"); \ + asm volatile(__RT_SSSR_ENABLE : "+f"(_rt_sssr_0), "+f"(_rt_sssr_1), "+f"(_rt_sssr_2) :: "memory"); + +// Disables the SSSRs, taking as fake inputs the allocated +// registers for the SSRs and thus allowing reallocation. +#define __RT_SSSR_BLOCK_END \ + asm volatile(__RT_SSSR_DISABLE : "+f"(_rt_sssr_0), "+f"(_rt_sssr_1), "+f"(_rt_sssr_2) :: "memory"); \ + } + +static inline void __rt_sssr_cfg_write(uint32_t val, uint32_t ssridx, uint32_t regidx) { + asm volatile ( + __RT_SSSR_SCFGWI_INT(%[valreg],%[ssridx],%[regidx]) + :: [valreg]"r"(val), [ssridx]"i"(ssridx), [regidx]"i"(regidx) : "memory" + ); +} + +static inline void __rt_sssr_cfg_write_ptr(void* val, uint32_t ssridx, uint32_t regidx) { + __rt_sssr_cfg_write((uintptr_t)val, ssridx, regidx); +} + +static inline uint32_t __rt_sssr_cfg_read(uint32_t ssridx, uint32_t regidx) { + uint32_t ret; + asm volatile ( + __RT_SSSR_SCFGRI_INT(%[retreg],%[ssridx],%[regidx]) + : [retreg]"=r"(ret) : [ssridx]"i"(ssridx), [regidx]"i"(regidx) : "memory" + ); + return ret; +} + +static inline void __rt_sssr_enable() { + asm volatile(__RT_SSSR_ENABLE ::: "memory"); +} + +static inline void __rt_sssr_disable() { + asm volatile(__RT_SSSR_DISABLE ::: "memory"); +} + +static inline uint16_t __rt_sssr_ptoi(void* ptr) { + // We assume TCDM alignment here; TCDM address offset is ignored + // as it will be masked in the SSR at at the latest + return (uint16_t)((uintptr_t)ptr >> 3); +} + +static inline void __rt_sssr_bound_stride_1d( + uint32_t ssridx, + uint32_t b0, uint32_t s0 +) { + // argument bounds and strides are *non-inclusive* for convenience + __rt_sssr_cfg_write(--b0, ssridx, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(s0, ssridx, __RT_SSSR_REG_STRIDE_0); +} + +static inline void __rt_sssr_bound_stride_2d( + uint32_t 
ssridx, + uint32_t b0, uint32_t s0, + uint32_t b1, uint32_t s1 +) { + // argument bounds and strides are *non-inclusive* for convenience + __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1); + uint32_t a = 0; + __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0); + a += s0 * b0; + __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1); +} + +static inline void __rt_sssr_bound_stride_3d( + uint32_t ssridx, + uint32_t b0, uint32_t s0, + uint32_t b1, uint32_t s1, + uint32_t b2, uint32_t s2 +) { + // argument bounds and strides are *non-inclusive* for convenience + __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1); + __rt_sssr_cfg_write(--b2 , ssridx, __RT_SSSR_REG_BOUND_2); + uint32_t a = 0; + __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0); + a += s0 * b0; + __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1); + a += s1 * b1; + __rt_sssr_cfg_write(s2-a, ssridx, __RT_SSSR_REG_STRIDE_2); +} + +static inline void __rt_sssr_bound_stride_4d( + uint32_t ssridx, + uint32_t b0, uint32_t s0, + uint32_t b1, uint32_t s1, + uint32_t b2, uint32_t s2, + uint32_t b3, uint32_t s3 +) { + // argument bounds and strides are *non-inclusive* for convenience + __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1); + __rt_sssr_cfg_write(--b2 , ssridx, __RT_SSSR_REG_BOUND_2); + __rt_sssr_cfg_write(--b3 , ssridx, __RT_SSSR_REG_BOUND_3); + uint32_t a = 0; + __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0); + a += s0 * b0; + __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1); + a += s1 * b1; + __rt_sssr_cfg_write(s2-a, ssridx, __RT_SSSR_REG_STRIDE_2); + a += s2 * b2; + __rt_sssr_cfg_write(s3-a, ssridx, __RT_SSSR_REG_STRIDE_3); +} + +static inline void __rt_sssr_wait_done(uint32_t ssridx) { + uint32_t tmp; + asm volatile ( + 
__RT_SSSR_WAIT_DONE(%[tmpreg],%[ssridx]) + : [tmpreg]"+&r"(tmp) : [ssridx]"i"(ssridx) : "memory" + ); +} diff --git a/sw/saris/stencils/istc.common.hpp b/sw/saris/stencils/istc.common.hpp new file mode 100644 index 0000000000..042005a741 --- /dev/null +++ b/sw/saris/stencils/istc.common.hpp @@ -0,0 +1,181 @@ +#include +#include +#include + +#pragma once + +// ============ +// Macros +// ============ + +// ST and S contain temporal and spatial dimension constants, SP parallelization and unroll constants, C value constants of type `d_t` +#define RCP *__restrict__ const +#define PRM static constexpr int +#define PRMD static constexpr double +#define PRMX constexpr int +#define PRMXD constexpr double +struct __istc_dstr{PRM __dummy=0;}; +PRMX __istc_dstr::__dummy; +#define KNL template \ + static __attribute__((noinline)) void +#define IDXA volatile __attribute__ ((__aligned__(8))) i_t +#define COFA volatile __attribute__ ((__aligned__(8))) d_t + +// Shorten indexing code a bit +#define I(ptr) __rt_sssr_ptoi(ptr) +// Further simplify RCP deref magic (selexp indexes into A) +#define J(A, selexp) I(&(*A) selexp) + +// Shorten unroll for loops and canonical axis loops +#define PRAGMA(X) _Pragma(#X) +#define foru(unroll) \ + PRAGMA(clang loop unroll_count(unroll)) \ + for +#define forp(unroll, i, init, pte, stride) for (int i = init; i < pte; i += stride) +#define forpu(unroll, i, init, pte, stride) foru(unroll) (int i = init; i < pte; i += stride) +// Axis assist macro: shortcut for most axes (requires KNL_IDS) +#define forpx(axis, ii, init, pte) forp(sp::u##axis, ii, i##axis+init, pte, sp::p##axis) +#define forpux(axis, ii, init, pte) forpu(sp::u##axis, ii, i##axis+init, pte, sp::p##axis) +// Same as forpux, but explicitly control unroll (e.g. 1). Helps when kernels +// get so large that register allocation suffocates and addresses stack-swap. 
+#define forpex(unroll, axis, ii, init, pte) forpu(unroll, ii, i##axis+init, pte, sp::p##axis) +// For manual unrolling: simply combines strides +#define form(i, init, pte, stride) for (int i = init; i < pte; i += stride) + +// Macro to define core constants +#define KNL_IDS(cid) \ + const uint32_t ix = cid % sp::px; \ + const uint32_t iy = (cid / sp::px) % sp::py; \ + const uint32_t iz = cid / (sp::px * sp::py); + +#define sodt sizeof(d_t) + +// Macro for core constants with *local* unroll +#define KNL_IDS_LOC(cid) \ + KNL_IDS(cid) \ + uint32_t lx = ix * sp::ux; \ + uint32_t ly = iy * sp::uy; \ + uint32_t lz = iz * sp::uz; \ + constexpr uint32_t jmpz = sp::pz*sp::uz; \ + constexpr uint32_t jmpy = sp::py*sp::uy; \ + constexpr uint32_t jmpx = sp::px*sp::ux; + +// ======================== +// Dimension defaults +// ======================== + +#define SU(name, dim) \ + struct name {PRM n=dim; PRM nx=dim; PRM ny=dim; PRM nz=dim;}; \ + PRMX name::n, name::nx, name::ny, name::nz; + +// Keep these dimensions aligned with data generation +SU(s1s, 1000) +SU(s1sm, 1728) +SU(s1m, 2744) +SU(s1ml, 4096) +SU(s1l, 5832) + +SU(s2s, 32) +SU(s2sm, 42) +SU(s2m, 52) +SU(s2ml, 64) +SU(s2l, 76) + +SU(s3s, 10) +SU(s3sm, 12) +SU(s3m, 14) +SU(s3ml, 16) +SU(s3l, 18) + +#define ST(name, steps) \ + struct name {PRM t=steps;}; \ + PRMX name::t; + +ST(st1, 1) +ST(st4, 4) +ST(st12, 12) + +#define SP(name, ncores, parz, pary, parx, unrz, unry, unrx, unru) \ + struct name {PRM nc=ncores; PRM px=parx; PRM py=pary; PRM pz=parz; PRM ux=unrx; PRM uy=unry; PRM uz=unrz; PRM uu=unru;}; \ + PRMX name::nc, name::px, name::py, name::pz, name::ux, name::uy, name::uz, name::uu; + +SP(sp1, 8, 1, 1, 8, 1, 1, 4, 8) +SP(sp2, 8, 1, 2, 4, 1, 2, 2, 8) +SP(sp3, 8, 2, 2, 2, 1, 2, 2, 8) + +// ============= +// Helpers +// ============= + +inline void __istc_barrier() { + __rt_barrier(); +} + +inline double __istc_sgnjx(double rs1, double rs2) { + double rd; + asm volatile("fsgnjx.d %[rd], %[rs1], %[rs2]" : [rd]"=f"(rd) 
: [rs1]"f"(rs1), [rs2]"f"(rs2)); + return rd; +} + +// Implements `sign(a) == sign(b) ? 0 : a` using only FP operations and no conditional logic +inline double __istc_ternclip(double a, double b) { + // If `sign(a) == sign(b)`, then ainj is +|a|, otherwise |-a| + double ainj = __istc_sgnjx(a, b); + // This gives us +|a| if the condition holds, otherwise 0 + double ainj_clip = fmax(ainj, 0.0); + // Inject original sign of a into the clipped result, yielding a or (+/-) 0 + return copysign(ainj_clip, a); +} + +// ================== +// ISSR helpers +// ================== + +inline void __istc_setup_issrs(uint32_t idxsize, uint32_t i0l, uint32_t i1l) { + __rt_sssr_cfg_write(__RT_SSSR_IDX_CFG(idxsize, 0, 0), __RT_SSSR_IDXALL, __RT_SSSR_REG_IDX_CFG); + __rt_sssr_cfg_write(i0l-1, 0, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(i1l-1, 1, __RT_SSSR_REG_BOUND_0); +} + + +inline void __istc_iter_issrs(void* base, void* i0, void* i1) { + __rt_sssr_cfg_write_ptr(base, __RT_SSSR_IDXALL, __RT_SSSR_REG_IDX_BASE); + __rt_sssr_cfg_write_ptr(i0, 0, __RT_SSSR_REG_RPTR_INDIR); + __rt_sssr_cfg_write_ptr(i1, 1, __RT_SSSR_REG_RPTR_INDIR); +} + +// ========================== +// Verification helpers +// ========================== + +inline void __istc_cmp_grids( + uint32_t core_id, uint32_t core_num, uint32_t core_stride, + TCDM double* grid1, TCDM double* grid2, uint32_t len, double rel_eps, + TCDM volatile uint32_t* err_sema +) { + __rt_barrier(); + uint32_t errors = 0; + uint32_t stride = core_num * core_stride; + #pragma clang loop unroll_count(16) + for (int i = core_id; i < len; i += stride) + errors += (fabs(grid1[i] - grid2[i]) > fabs(rel_eps * grid1[i])); + __atomic_fetch_add(err_sema, errors, __ATOMIC_RELAXED); + __rt_barrier(); +} + +volatile void __attribute__((noinline)) __istc_touch_grid( + uint32_t core_id, uint32_t core_num, uint32_t core_stride, + TCDM double* grid, uint32_t len, TCDM volatile uint32_t* ret_sema +) { + __rt_barrier(); + uint32_t ret_loc; + double sum = 
0.0; + uint32_t stride = core_num * core_stride; + #pragma clang loop unroll_count(16) + for (int i = core_id; i < len; i += stride) + sum += grid[i]; + asm volatile("fcvt.w.d t1, %1; sub %0, t1, t1" : "=r"(ret_loc) : "f"(sum) : "memory", "t1"); + __atomic_fetch_add(ret_sema, ret_loc, __ATOMIC_RELAXED); + __rt_barrier(); +} diff --git a/sw/saris/stencils/istc.issr.hpp b/sw/saris/stencils/istc.issr.hpp new file mode 100644 index 0000000000..c74d76b4dc --- /dev/null +++ b/sw/saris/stencils/istc.issr.hpp @@ -0,0 +1,879 @@ +#include "istc.common.hpp" + +// =============== +// Polybench +// =============== + +KNL istci_pb_jacobi_2d( + const int cid, + TCDM d_t (RCP A)[s::n][s::n], + TCDM d_t (RCP B)[s::n][s::n] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sx+sy; + constexpr uint32_t b = dx, l = dy, cc = dx+dy, r = cc+dx, tt = cc+dy; + // Indices include padding on axes (do not init arrays to prevent memcpy) + IDXA i0[10], i1[10]; + /*b*/ i0[ 0] = b; i0[ 1] = b + sx; i0[ 2] = b + sy; i0[ 3] = b + sb; + /*l*/ i0[ 4] = l; i0[ 5] = l + sx; i0[ 6] = l + sy; i0[ 7] = l + sb; + /*c*/ i0[ 8] = cc; i0[ 9] = cc + sy; + /*r*/ i1[ 0] = r; i1[ 1] = r + sx; i1[ 2] = r + sy; i1[ 3] = r + sb; + /*t*/ i1[ 4] = tt; i1[ 5] = tt + sx; i1[ 6] = tt + sy; i1[ 7] = tt + sb; + /*c*/ i1[ 8] = cc + sx; i1[ 9] = cc + sb; + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 10, 10); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (i, ly, s::n-2, jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (j, lx, s::n-2, jmpx) { + __istc_iter_issrs((void*)&(*A)[i][j], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*B)[i+1][lx+1], 
2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // br0..3 = b0..3 + r0..3 and lt0..3 = l0..3 + t0..3 + "frep.i %[c7], 1, 7, 0b001 \n" + "fadd.d fa0, ft0, ft1 \n" + // p0..3 = br0..3 + lt0..3 + "frep.i %[c3], 1, 3, 0b111 \n" + "fadd.d fa0, fa0, fa4 \n" + // tt0..3 = p0..3 + c0..3 + "fadd.d fa0, fa0, ft0 \n" + "fadd.d fa1, fa1, ft1 \n" + "fadd.d fa2, fa2, ft0 \n" + "fadd.d fa3, fa3, ft1 \n" + // res0..3 = 0.2 * tt0..3 + "frep.i %[c3], 1, 3, 0b100 \n" + "fmul.d ft2, %[cf], fa0 \n" + :: [c7]"r"(7), [c3]"r"(3), [cf]"f"(0.2) + : "memory", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +// ========== +// AN5D +// ========== + +KNL istci_an5d_j2d5pt( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t b = dx, l = dy, cc = dx+dy, r = cc+dx, tt = cc+dy; + // Indices include padding on axes (do not init arrays to prevent memcpy) + IDXA i0[10], i1[10]; + /*c*/ i0[ 0] = cc; i0[ 1] = cc+sy; /*b*/ i0[ 2] = b; i0[ 3] = b+sy; + /*l*/ i0[ 4] = l; i0[ 5] = l+sy; /*r*/ i0[ 6] = r; i0[ 7] = r+sy; + /*t*/ i0[ 8] = tt; i0[ 9] = tt+sy; + /*c*/ i1[ 0] = cc+sx; i1[ 1] = cc+sb; /*b*/ i1[ 2] = b+sx; i1[ 3] = b+sb; + /*l*/ i1[ 4] = l+sx; i1[ 5] = l+sb; /*r*/ i1[ 6] = r+sx; i1[ 7] = r+sb; + /*t*/ i1[ 8] = tt+sx; i1[ 9] = tt+sb; + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 10, 10); + + // Avoid constant FP division + register d_t fac asm("ft7") = 1.0 / c::c0; + // Use stacked registers for FREP + register d_t cb asm("ft3") = c::ym[0]; + register d_t cl asm("ft4") = c::xm[0]; + register d_t cr asm("ft5") = c::xp[0]; + register d_t ct asm("ft6") = 
c::yp[0]; + register d_t cc_ asm("ft8") = c::cc; + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-2, jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (x, lx, s::n-2, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+1][lx+1], 2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, %[cc], ft0 \n" + "fmul.d fa1, %[cc], ft1 \n" + "fmul.d fa2, %[cc], ft0 \n" + "fmul.d fa3, %[cc], ft1 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft3, ft0, fa0 \n" + "fmadd.d fa1, ft3, ft1, fa1 \n" + "fmadd.d fa2, ft3, ft0, fa2 \n" + "fmadd.d fa3, ft3, ft1, fa3 \n" + // Final scaling and writeback + "frep.i %[c3], 1, 3, 0b100 \n" + "fmul.d ft2, %[fc], fa0 \n" + : [cb]"+&f"(cb), [cl]"+&f"(cl), [cr]"+&f"(cr), [ct]"+&f"(ct), + [cc]"+&f"(cc_), [fc]"+&f"(fac) + : [c3]"r"(3) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_j2d9pt( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t cc = 2*dy+2*dx, + b0 = cc-dy, b1 = cc-2*dy, + l0 = cc-dx, l1 = cc-2*dx, + r0 = cc+dx, r1 = cc+2*dx, + t0 = cc+dy, t1 = cc+2*dy; + // Indices include padding on axes (do not init arrays to prevent memcpy) + IDXA i0[18], i1[18]; + /*cc*/ i0[ 0] = cc; i0[ 1] = cc+sy; + /*b0*/ i0[ 2] = b0; i0[ 3] = b0+sy; /*l0*/ i0[ 4] = l0; i0[ 5] = l0+sy; + /*r0*/ i0[ 6] = r0; i0[ 7] = r0+sy; 
/*t0*/ i0[ 8] = t0; i0[ 9] = t0+sy; + /*b1*/ i0[10] = b1; i0[11] = b1+sy; /*l1*/ i0[12] = l1; i0[13] = l1+sy; + /*r1*/ i0[14] = r1; i0[15] = r1+sy; /*t1*/ i0[16] = t1; i0[17] = t1+sy; + /*cc*/ i1[ 0] = cc+sx; i1[ 1] = cc+sb; + /*b0*/ i1[ 2] = b0+sx; i1[ 3] = b0+sb; /*l0*/ i1[ 4] = l0+sx; i1[ 5] = l0+sb; + /*r0*/ i1[ 6] = r0+sx; i1[ 7] = r0+sb; /*t0*/ i1[ 8] = t0+sx; i1[ 9] = t0+sb; + /*b1*/ i1[10] = b1+sx; i1[11] = b1+sb; /*l1*/ i1[12] = l1+sx; i1[13] = l1+sb; + /*r1*/ i1[14] = r1+sx; i1[15] = r1+sb; /*t1*/ i1[16] = t1+sx; i1[17] = t1+sb; + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 18, 18); + + // Avoid constant FP division + register d_t fac asm("fa4") = 1.0 / c::c0; + // Use stacked registers for FREP + register d_t cb0 asm("ft3") = c::ym[0]; + register d_t cl0 asm("ft4") = c::xm[0]; + register d_t cr0 asm("ft5") = c::xp[0]; + register d_t ct0 asm("ft6") = c::yp[0]; + register d_t cb1 asm("ft8") = c::ym[1]; + register d_t cl1 asm("ft9") = c::xm[1]; + register d_t cr1 asm("ft10") = c::xp[1]; + register d_t ct1 asm("ft11") = c::yp[1]; + register d_t cc_ asm("fa5") = c::cc; + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-4,jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-4+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (x, lx, s::n-4, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+2][lx+2], 2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, %[cc], ft0 \n" + "fmul.d fa1, %[cc], ft1 \n" + "fmul.d fa2, %[cc], ft0 \n" + "fmul.d fa3, %[cc], ft1 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft3, ft0, fa0 \n" + "fmadd.d fa1, ft3, ft1, fa1 \n" + "fmadd.d fa2, ft3, ft0, fa2 \n" + "fmadd.d fa3, ft3, ft1, fa3 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft8, ft0, fa0 \n" + "fmadd.d fa1, 
ft8, ft1, fa1 \n" + "fmadd.d fa2, ft8, ft0, fa2 \n" + "fmadd.d fa3, ft8, ft1, fa3 \n" + // Final scaling and writeback + "frep.i %[c3], 1, 3, 0b100 \n" + "fmul.d ft2, %[fc], fa0 \n" + : [cb0]"+&f"(cb0), [cl0]"+&f"(cl0), [cr0]"+&f"(cr0), [ct0]"+&f"(ct0), + [cb1]"+&f"(cb1), [cl1]"+&f"(cl1), [cr1]"+&f"(cr1), [ct1]"+&f"(ct1), + [cc]"+&f"(cc_), [fc]"+&f"(fac) + : [c3]"r"(3) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_j2d9pt_gol( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t + bl = 0, bc = dx, br = 2*dx, + ml = dy, mc = dx+dy, mr = 2*dx+dy, + tl = 2*dy, tc = dx+2*dy, tr = 2*dx+2*dy; + // Indices include padding on axes (do not init arrays to prevent memcpy) + IDXA i0[18], i1[18]; + /*mc*/ i0[ 0] = mc; i0[ 1] = mc + sy; + /*bl*/ i0[ 2] = bl; i0[ 3] = bl + sy; /*bc*/ i0[ 4] = bc; i0[ 5] = bc + sy; + /*br*/ i0[ 6] = br; i0[ 7] = br + sy; /*ml*/ i0[ 8] = ml; i0[ 9] = ml + sy; + /*mr*/ i0[10] = mr; i0[11] = mr + sy; /*tl*/ i0[12] = tl; i0[13] = tl + sy; + /*tc*/ i0[14] = tc; i0[15] = tc + sy; /*tr*/ i0[16] = tr; i0[17] = tr + sy; + /*mc*/ i1[ 0] = mc + sx; i1[ 1] = mc + sb; + /*bl*/ i1[ 2] = bl + sx; i1[ 3] = bl + sb; /*bc*/ i1[ 4] = bc + sx; i1[ 5] = bc + sb; + /*br*/ i1[ 6] = br + sx; i1[ 7] = br + sb; /*ml*/ i1[ 8] = ml + sx; i1[ 9] = ml + sb; + /*mr*/ i1[10] = mr + sx; i1[11] = mr + sb; /*tl*/ i1[12] = tl + sx; i1[13] = tl + sb; + /*tc*/ i1[14] = tc + sx; i1[15] = tc + sb; /*tr*/ i1[16] = tr + sx; i1[17] = tr + sb; + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 18, 18); + + // Avoid constant FP division + register 
d_t fac asm("fa4") = 1.0 / c::c0; + // Use stacked registers for FREP + register d_t cmc asm("fa5") = c::c[1][1]; + register d_t cbl asm("ft3") = c::c[0][0]; + register d_t cbc asm("ft4") = c::c[0][1]; + register d_t cbr asm("ft5") = c::c[0][2]; + register d_t cml asm("ft6") = c::c[1][0]; + register d_t cmr asm("ft8") = c::c[1][2]; + register d_t ctl asm("ft9") = c::c[2][0]; + register d_t ctc asm("ft10") = c::c[2][1]; + register d_t ctr asm("ft11") = c::c[2][2]; + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-2,jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (x, lx, s::n-2, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+1][lx+1], 2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, %[cmc], ft0 \n" + "fmul.d fa1, %[cmc], ft1 \n" + "fmul.d fa2, %[cmc], ft0 \n" + "fmul.d fa3, %[cmc], ft1 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft3, ft0, fa0 \n" + "fmadd.d fa1, ft3, ft1, fa1 \n" + "fmadd.d fa2, ft3, ft0, fa2 \n" + "fmadd.d fa3, ft3, ft1, fa3 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft8, ft0, fa0 \n" + "fmadd.d fa1, ft8, ft1, fa1 \n" + "fmadd.d fa2, ft8, ft0, fa2 \n" + "fmadd.d fa3, ft8, ft1, fa3 \n" + // Final scaling and writeback + "frep.i %[c3], 1, 3, 0b100 \n" + "fmul.d ft2, %[fc], fa0 \n" + : [cbl]"+&f"(cbl), [cbc]"+&f"(cbc), [cbr]"+&f"(cbr), [cml]"+&f"(cml), + [cmr]"+&f"(cmr), [ctl]"+&f"(ctl), [ctc]"+&f"(ctc), [ctr]"+&f"(ctr), + [cmc]"+&f"(cmc), [fc]"+&f"(fac) + : [c3]"r"(3) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_j3d27pt( + const int cid, + TCDM d_t (RCP 
A[2])[s::nz][s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx; + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*27; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + #pragma unroll + for (int z = 0; z < 3; ++z) + #pragma unroll + for (int y = 0; y < 3; ++y) + #pragma unroll + for (int x = 0; x < 3; ++x) { + uint32_t pt = z*dz + y*dy + x*dx; + /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy; + /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Avoid constant FP division + register d_t fac asm("ft3") = 1.0 / c::c0; + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA ca[27]; + COFA* pa = ca; + #pragma unroll + for (int z = 0; z < 3; ++z) + #pragma unroll + for (int y = 0; y < 3; ++y) + #pragma unroll + for (int x = 0; x < 3; ++x) + *(pa++) = c::c3[z][y][x]; + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + + form (z, lz, s::n-2,jmpz) { + form (y, ly, s::n-2,jmpy) { + __rt_sssr_bound_stride_2d(2, 27, sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: bottom left + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do remaining blocks as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + 
"fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final scaling + "frep.i %[c3], 1, 3, 0b101 \n" + "fmul.d fa0, %[fc], fa0 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + : [fc]"+&f"(fac) + : [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(27-2), [c3]"r"(3), + [wb]"r"(&(*A[(t+1)%2])[z+1][y+1][x+1]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + } + lz = (lz + sp::pz) % jmpz; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_star2dXr( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + static_assert(ci::r >= 1, "Radius must be at least 1!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t cc = ci::r*dy + ci::r*dx; + constexpr uint32_t npoints = 1+4*ci::r; + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*npoints; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + /*cc0*/ *(p0++) = cc; *(p0++) = cc+sy; + /*cc1*/ *(p1++) = cc+sx; *(p1++) = cc+sb; + #pragma unroll + for (int j = 1; j <= ci::r; ++j) { + uint32_t bb = cc-j*dy, ll = cc-j*dx, rr = cc+j*dx, tt = cc+j*dy; + /*bb0*/ *(p0++) = bb; *(p0++) = bb+sy; /*ll0*/ *(p0++) = ll; *(p0++) = ll+sy; + /*rr0*/ *(p0++) = rr; *(p0++) = rr+sy; /*tt0*/ *(p0++) = tt; *(p0++) = tt+sy; + /*bb1*/ *(p1++) = bb+sx; *(p1++) = bb+sb; /*ll1*/ *(p1++) = ll+sx; *(p1++) = ll+sb; + /*rr1*/ *(p1++) = rr+sx; *(p1++) = rr+sb; /*tt1*/ *(p1++) = tt+sx; *(p1++) = tt+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA 
ca[npoints]; + COFA* pa = ca; + /*cc*/ *(pa++) = c::cc; + #pragma unroll + for (int j = 0; j < ci::r; ++j) { + /*bb*/ *(pa++) = c::ym[j]; /*ll*/ *(pa++) = c::xm[j]; + /*rr*/ *(pa++) = c::xp[j]; /*tt*/ *(pa++) = c::yp[j]; + } + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-2*ci::r,jmpy) { + __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2*ci::r, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do directionals as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + "fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2), + [wb]"r"(&(*A[(t+1)%2])[y+ci::r][x+ci::r]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_box2dXr( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + static_assert(ci::r >= 1, "Radius must be at least 1!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t npoints = (2*ci::r+1)*(2*ci::r+1); + // Indices include padding on 
axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*npoints; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + #pragma unroll + for (int y = 0; y < 2*ci::r+1; ++y) + #pragma unroll + for (int x = 0; x < 2*ci::r+1; ++x) { + uint32_t pt = y*dy + x*dx; + /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy; + /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA ca[npoints]; + COFA* pa = ca; + #pragma unroll + for (int y = 0; y < 2*ci::r+1; ++y) + #pragma unroll + for (int x = 0; x < 2*ci::r+1; ++x) + *(pa++) = c::c[y][x]; + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-2*ci::r,jmpy) { + __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2*ci::r, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do directionals as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + "fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2), + [wb]"r"(&(*A[(t+1)%2])[y+ci::r][x+ci::r]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_star3dXr( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] 
+) { + // Assertions and IDs + static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + static_assert(ci::r >= 1, "Radius must be at least 1!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t cc = ci::r*dz + ci::r*dy + ci::r*dx; + constexpr uint32_t npoints = 1+6*ci::r; + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*npoints; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + /*cc0*/ *(p0++) = cc; *(p0++) = cc+sy; + /*cc1*/ *(p1++) = cc+sx; *(p1++) = cc+sb; + #pragma unroll + for (int j = 1; j <= ci::r; ++j) { + uint32_t bb = cc-j*dy, ll=cc-j*dx, rr = cc+j*dx, tt = cc+j*dy, aa = cc-j*dz, ff = cc+j*dz; + /*bb0*/ *(p0++) = bb; *(p0++) = bb+sy; /*ll0*/ *(p0++) = ll; *(p0++) = ll+sy; + /*rr0*/ *(p0++) = rr; *(p0++) = rr+sy; /*tt0*/ *(p0++) = tt; *(p0++) = tt+sy; + /*aa0*/ *(p0++) = aa; *(p0++) = aa+sy; /*ff0*/ *(p0++) = ff; *(p0++) = ff+sy; + /*bb1*/ *(p1++) = bb+sx; *(p1++) = bb+sb; /*ll1*/ *(p1++) = ll+sx; *(p1++) = ll+sb; + /*rr1*/ *(p1++) = rr+sx; *(p1++) = rr+sb; /*tt1*/ *(p1++) = tt+sx; *(p1++) = tt+sb; + /*aa1*/ *(p1++) = aa+sx; *(p1++) = aa+sb; /*ff1*/ *(p1++) = ff+sx; *(p1++) = ff+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA ca[npoints]; + COFA* pa = ca; + /*cc*/ *(pa++) = c::cc; + #pragma unroll + for (int j = 0; j < ci::r; ++j) { + /*bb*/ *(pa++) = c::ym[j]; /*ll*/ *(pa++) = c::xm[j]; + /*rr*/ *(pa++) = c::xp[j]; /*tt*/ *(pa++) = c::yp[j]; + /*aa*/ *(pa++) = c::zm[j]; /*ff*/ *(pa++) = c::zp[j]; + } + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (z, lz, s::n-2*ci::r,jmpz) { + form (y, ly, 
s::n-2*ci::r,jmpy) { + __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2*ci::r, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do directionals as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + "fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2), + [wb]"r"(&(*A[(t+1)%2])[z+ci::r][y+ci::r][x+ci::r]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + } + lz = (lz + sp::pz) % jmpz; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_box3dXr( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + static_assert(ci::r >= 1, "Radius must be at least 1!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t npoints = (2*ci::r+1)*(2*ci::r+1)*(2*ci::r+1); + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*npoints; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + #pragma unroll + for (int z = 0; z < 2*ci::r+1; ++z) + #pragma unroll + for (int y = 0; y < 2*ci::r+1; ++y) + #pragma unroll + for (int x = 0; x < 
2*ci::r+1; ++x) { + uint32_t pt = z*dz + y*dy + x*dx; + /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy; + /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA ca[npoints]; + COFA* pa = ca; + #pragma unroll + for (int z = 0; z < 2*ci::r+1; ++z) + #pragma unroll + for (int y = 0; y < 2*ci::r+1; ++y) + #pragma unroll + for (int x = 0; x < 2*ci::r+1; ++x) + *(pa++) = c::c3[z][y][x]; + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (z, lz, s::n-2*ci::r,jmpz) { + form (y, ly, s::n-2*ci::r,jmpy) { + __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2*ci::r, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do directionals as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + "fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2), + [wb]"r"(&(*A[(t+1)%2])[z+ci::r][y+ci::r][x+ci::r]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + } + lz = (lz + sp::pz) % jmpz; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +// ============= +// Minimod +// ============= + +KNL istci_minimod_acoustic_iso_cd( + const int cid, + TCDM d_t (RCP u[2])[s::nz][s::ny][s::nx], + TCDM d_t (RCP 
f)[s::nz-8][s::ny-8][s::nx-8] +) { + // Assertions and IDs + static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t rad = 4; + constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t cct = rad*dz + rad*dy + rad*dx; + constexpr uint32_t nhpoints = 6*rad; + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*nhpoints+4; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + /*cc0*/ *(p0++) = cct; *(p1++) = cct+sx; + /*cc1*/ *(p0++) = cct+sy; *(p1++) = cct+sb; + #pragma unroll + for (int j = 1; j <= rad; ++j) { + uint32_t ll=cct-j*dx, rr = cct+j*dx, bb = cct-j*dy, tt = cct+j*dy, aa = cct-j*dz, ff = cct+j*dz; + /*ll0*/ *(p0++) = ll; *(p1++) = ll+sx; + /*ll1*/ *(p0++) = ll+sy; *(p1++) = ll+sb; + /*rr0*/ *(p0++) = rr; *(p1++) = rr+sx; + /*rr1*/ *(p0++) = rr+sy; *(p1++) = rr+sb; + /*bb0*/ *(p0++) = bb; *(p1++) = bb+sx; + /*bb1*/ *(p0++) = bb+sy; *(p1++) = bb+sb; + /*tt0*/ *(p0++) = tt; *(p1++) = tt+sx; + /*tt1*/ *(p0++) = tt+sy; *(p1++) = tt+sb; + /*aa0*/ *(p0++) = aa; *(p1++) = aa+sx; + /*aa1*/ *(p0++) = aa+sy; *(p1++) = aa+sb; + /*ff0*/ *(p0++) = ff; *(p1++) = ff+sx; + /*ff1*/ *(p0++) = ff+sy; *(p1++) = ff+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Use registers for coefficients + register d_t cc0 asm("f3"); + register d_t cx0 asm("f4"); + register d_t cy0 asm("f5"); + register d_t cz0 asm("f6"); + register d_t cx1 asm("f7"); + register d_t cy1 asm("f8"); + register d_t cz1 asm("f9"); + register d_t cx2 asm("f10"); + register d_t cy2 asm("f11"); + register d_t cz2 asm("f12"); + register d_t cx3 asm("f13"); + register d_t cy3 asm("f14"); + register d_t cz3 asm("f15"); + + // Preload registers + asm volatile( + "fld f13, -8(%[xp]) \n" + "fld f14, -8(%[yp]) \n" + 
"fld f15, -8(%[zp]) \n" + "fadd.d f3, f13, f14 \n" + "fld f4, 0(%[xp]) \n" + "fld f5, 0(%[yp]) \n" + "fld f6, 0(%[zp]) \n" + "fadd.d f3, f3, f15 \n" + "fld f7, 8(%[xp]) \n" + "fld f8, 8(%[yp]) \n" + "fld f9, 8(%[zp]) \n" + "fmul.d f3, f3, %[cf2]\n" + "fld f10, 16(%[xp]) \n" + "fld f11, 16(%[yp]) \n" + "fld f12, 16(%[zp]) \n" + "fld f13, 24(%[xp]) \n" + "fld f14, 24(%[yp]) \n" + "fld f15, 24(%[zp]) \n" + : "+&f"(cx0), "+&f"(cy0), "+&f"(cz0), "+&f"(cx1), "+&f"(cy1), "+&f"(cz1), + "+&f"(cx2), "+&f"(cy2), "+&f"(cz2), "+&f"(cx3), "+&f"(cy3), "+&f"(cz3), + "+&f"(cc0) + : [xp]"r"(&c::xp[1]), [yp]"r"(&c::yp[1]), [zp]"r"(&c::zp[1]), [cf2]"f"(2.0) + ); + + // introduce variable for tracking impulse offsets + uint32_t lf = cid; + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + // We load last grid's center piece inside the time loop as it keeps changing + int32_t ccoffs = &(*u[(t+1)%2])[rad][rad][rad] - &(*u[t%2])[0][0][0]; + /*cc0*/ i0[ilen-2] = ccoffs; i0[ilen-1] = ccoffs+sy; + /*cc1*/ i1[ilen-2] = ccoffs+sx; i1[ilen-1] = ccoffs+sb; + form (z, lz, s::n-2*rad, jmpz) { + form (y, ly, s::n-2*rad, jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2*rad+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (x, lx, s::n-2*rad, jmpx) { + register d_t fi0 asm("f28") = c::uffac * (*f)[z][y ][x ]; + register d_t fix asm("f29") = c::uffac * (*f)[z][y ][x+1]; + // Set up SSRs + __istc_iter_issrs((void*)&(*u[t%2])[z][y][x], (void*)i0, (void*)i1); + // Load impulses + register d_t fiy asm("f30") = c::uffac * (*f)[z][y+1][x ]; + register d_t fib asm("f31") = c::uffac * (*f)[z][y+1][x+1]; + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*u[(t+1)%2])[z+rad][y+rad][lx+rad], 2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // First add centerpoint + "fmadd.d f28, f3, f0, f28 \n" + "fmadd.d f29, f3, f1, f29 \n" + "fmadd.d f30, f3, f0, f30 \n" + "fmadd.d f31, f3, f1, f31 \n" + // Iterate over points (stagger coeffs) + "frep.o %[c3], 8, 3, 0b010 
\n" + "fmadd.d f28, f4, f0, f28 \n" + "fmadd.d f29, f4, f1, f29 \n" + "fmadd.d f30, f4, f0, f30 \n" + "fmadd.d f31, f4, f1, f31 \n" + "fmadd.d f28, f4, f0, f28 \n" + "fmadd.d f29, f4, f1, f29 \n" + "fmadd.d f30, f4, f0, f30 \n" + "fmadd.d f31, f4, f1, f31 \n" + "frep.o %[c7], 8, 7, 0b010 \n" + "fmadd.d f28, f8, f0, f28 \n" + "fmadd.d f29, f8, f1, f29 \n" + "fmadd.d f30, f8, f0, f30 \n" + "fmadd.d f31, f8, f1, f31 \n" + "fmadd.d f28, f8, f0, f28 \n" + "fmadd.d f29, f8, f1, f29 \n" + "fmadd.d f30, f8, f0, f30 \n" + "fmadd.d f31, f8, f1, f31 \n" + // Final subtraction and writeback + "fsub.d f2, f28, f0 \n" + "fsub.d f2, f29, f1 \n" + "fsub.d f2, f30, f0 \n" + "fsub.d f2, f31, f1 \n" + : "+&f"(cx0), "+&f"(cy0), "+&f"(cz0), "+&f"(cx1), "+&f"(cy1), "+&f"(cz1), + "+&f"(cx2), "+&f"(cy2), "+&f"(cz2), "+&f"(cx3), "+&f"(cy3), "+&f"(cz3), + "+&f"(cc0), + "+&f"(fi0), "+&f"(fix), "+&f"(fiy), "+&f"(fib) + : [c7]"r"(7), [c3]"r"(3) + : "memory" + ); + } + lx = (lx + sp::ux) % jmpx; + } + ly = (ly + sp::uy) % jmpy; + } + lz = (lz + sp::uz) % jmpz; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} diff --git a/sw/saris/stencils/istc.par.hpp b/sw/saris/stencils/istc.par.hpp new file mode 100644 index 0000000000..37ba6fd4e3 --- /dev/null +++ b/sw/saris/stencils/istc.par.hpp @@ -0,0 +1,239 @@ +#include "istc.common.hpp" + +// =============== +// Polybench +// =============== + +KNL istcp_pb_jacobi_2d( + const int cid, + TCDM d_t (RCP A)[s::n][s::n], + TCDM d_t (RCP B)[s::n][s::n] +) { + KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (y, i, 1, s::n-1) + forpex (4, x, j, 1, s::n-1) + (*B)[i][j] = 0.2 * ((*A)[i][j] + (*A)[i][j-1] + (*A)[i][1+j] + (*A)[1+i][j] + (*A)[i-1][j]); + __rt_barrier(); + } +} + + +// ========== +// AN5D +// ========== + +KNL istcp_an5d_j2d5pt( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + KNL_IDS(cid) + // Avoid constant FP division + constexpr d_t fac = 1.0 / c::c0; + for (int t = 0; t < st::t; t++) { + forpx (y, y, 1, s::ny-1) + forpex (4, 
x, x, 1, s::nx-1) + (*A[(t+1)%2])[y][x] = fac * ( + c::ym[0] * (*A[t%2])[y-1][x ] + + c::xm[0] * (*A[t%2])[y ][x-1] + + c::cc * (*A[t%2])[y ][x ] + + c::xp[0] * (*A[t%2])[y ][x+1] + + c::yp[0] * (*A[t%2])[y+1][x ] + ); + __rt_barrier(); + } +} + + +KNL istcp_an5d_j2d9pt( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + KNL_IDS(cid) + // Avoid constant FP division + constexpr d_t fac = 1.0 / c::c0; + for (int t = 0; t < st::t; t++) { + forpx (y, y, 2, s::ny-2) + forpex (2, x, x, 2, s::nx-2) + (*A[(t+1)%2])[y][x] = fac * ( + c::ym[0] * (*A[t%2])[y-1][x ] + c::ym[1] * (*A[t%2])[y-2][x ] + + c::xm[0] * (*A[t%2])[y ][x-1] + c::xm[1] * (*A[t%2])[y ][x-2] + + c::cc * (*A[t%2])[y ][x ] + + c::xp[0] * (*A[t%2])[y ][x+1] + c::xp[1] * (*A[t%2])[y ][x+2] + + c::yp[0] * (*A[t%2])[y+1][x ] + c::yp[1] * (*A[t%2])[y+2][x ] + ); + __rt_barrier(); + } +} + + +KNL istcp_an5d_j2d9pt_gol( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + KNL_IDS(cid) + // Avoid constant FP division + constexpr d_t fac = 1.0 / c::c0; + for (int t = 0; t < st::t; t++) { + forpx (y, y, 1, s::ny-1) + forpex (2, x, x, 1, s::nx-1) { + d_t acc = 0.0; + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) + acc += c::c[dy+1][dx+1] * (*A[t%2])[y+dy][x+dx]; + (*A[(t+1)%2])[y][x] = fac * acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_star2dXr( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (y, y, ci::r, s::ny-ci::r) + forpx (x, x, ci::r, s::nx-ci::r) { + d_t acc = c::cc * (*A[t%2])[y][x]; + #pragma unroll + for (int dr = 0; dr < ci::r; ++dr) { + acc += c::xm[dr] * (*A[t%2])[y][x-1-dr]; + acc += c::xp[dr] * (*A[t%2])[y][x+1+dr]; + acc += c::ym[dr] * (*A[t%2])[y-1-dr][x]; + acc += c::yp[dr] * (*A[t%2])[y+1+dr][x]; + } + (*A[(t+1)%2])[y][x] = acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_box2dXr( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + 
KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (y, y, ci::r, s::ny-ci::r) + forpx (x, x, ci::r, s::nx-ci::r) { + d_t acc = 0.0; + #pragma unroll + for (int dy = -ci::r; dy <= ci::r; ++dy) + #pragma unroll + for (int dx = -ci::r; dx <= ci::r; ++dx) + acc += c::c[dy+ci::r][dx+ci::r] * (*A[t%2])[y+dy][x+dx]; + (*A[(t+1)%2])[y][x] = acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_star3dXr( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] +) { + KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (z, z, ci::r, s::nz-ci::r) + forpx (y, y, ci::r, s::ny-ci::r) + forpx (x, x, ci::r, s::nx-ci::r) { + d_t acc = c::cc * (*A[t%2])[z][y][x]; + #pragma unroll + for (int dr = 0; dr < ci::r; ++dr) { + acc += c::xm[dr] * (*A[t%2])[z][y][x-1-dr]; + acc += c::xp[dr] * (*A[t%2])[z][y][x+1+dr]; + acc += c::ym[dr] * (*A[t%2])[z][y-1-dr][x]; + acc += c::yp[dr] * (*A[t%2])[z][y+1+dr][x]; + acc += c::zm[dr] * (*A[t%2])[z-1-dr][y][x]; + acc += c::zp[dr] * (*A[t%2])[z+1+dr][y][x]; + } + (*A[(t+1)%2])[z][y][x] = acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_box3dXr( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] +) { + KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (z, z, ci::r, s::nz-ci::r) + forpx (y, y, ci::r, s::ny-ci::r) + forpx (x, x, ci::r, s::nx-ci::r) { + d_t acc = 0.0; + for (int dz = -ci::r; dz <= ci::r; ++dz) + #pragma unroll + for (int dy = -ci::r; dy <= ci::r; ++dy) + #pragma unroll + for (int dx = -ci::r; dx <= ci::r; ++dx) + acc += c::c3[dz+ci::r][dy+ci::r][dx+ci::r] * (*A[t%2])[z+dz][y+dy][x+dx]; + (*A[(t+1)%2])[z][y][x] = acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_j3d27pt( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] +) { + KNL_IDS(cid) + // Avoid constant FP division + constexpr d_t fac = 1.0 / c::c0; + for (int t = 0; t < st::t; t++) { + forpx (z, z, 1, s::nz-1) + forpx (y, y, 1, s::ny-1) + forpx (x, x, 1, s::nx-1) { + d_t acc = 0.0; + for (int dz = -1; dz <= 1; ++dz) + #pragma unroll + 
for (int dy = -1; dy <= 1; ++dy) + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) + acc += c::c3[dz+1][dy+1][dx+1] * (*A[t%2])[z+dz][y+dy][x+dx]; + (*A[(t+1)%2])[z][y][x] = fac * acc; + } + __rt_barrier(); + } +} + +// ============= +// Minimod +// ============= + +KNL istcp_minimod_acoustic_iso_cd( + const int cid, + TCDM d_t (RCP u[2])[s::nz][s::ny][s::nx], + TCDM d_t (RCP f)[s::nz-8][s::ny-8][s::nx-8] +) { + KNL_IDS(cid) + constexpr uint32_t rad = 4; + // Compute coefficient of center point + constexpr float cc = 2 * (c::xp[0] + c::yp[0] + c::zp[0]); + for (int t = 0; t < st::t; t++) { + forpx (z, z, rad, s::nz-rad) + forpx (y, y, rad, s::ny-rad) + forpx (x, x, rad, s::nx-rad) { + // Initialize with incorporated impulse (has optional factor) + d_t lapl = c::uffac * (*f)[z-rad][y-rad][x-rad]; + // Compute Laplacian + lapl += cc * (*u[t%2])[z][y][x]; + for (int m = 1; m <= rad; ++m) + lapl += c::xp[m] * ((*u[t%2])[z][y][x-m] + (*u[t%2])[z][y][x+m]) + + c::yp[m] * ((*u[t%2])[z][y-m][x] + (*u[t%2])[z][y+m][x]) + + c::zp[m] * ((*u[t%2])[z-m][y][x] + (*u[t%2])[z+m][y][x]); + (*u[(t+1)%2])[z][y][x] = lapl - (*u[(t+1)%2])[z][y][x]; + } + __rt_barrier(); + } +} diff --git a/sw/saris/util/eval.cpp.tpl b/sw/saris/util/eval.cpp.tpl new file mode 100644 index 0000000000..ad22b628ba --- /dev/null +++ b/sw/saris/util/eval.cpp.tpl @@ -0,0 +1,55 @@ +#include "runtime.hpp" +#include "istc.par.hpp" +#include "istc.issr.hpp" + +${datadecls} +${bundledecls} + +${ctgrids} + +${ciparams} + +TCDMDECL volatile uint32_t err_sema = 0; + +EXTERN_C int smain(uint32_t core_id, uint32_t core_num, void* tcdm_start, void* tcdm_end) { + + // Kick DMCC + if (core_id == core_num-1) { + __rt_barrier(); + +% for i in range(nbarriers): + // Kernel ${i} +${indent(dma_transfers, " "*8)} + __rt_barrier(); +% endfor + goto past_knl; + } + + __rt_barrier(); + __rt_get_timer(); +% for k in kernels: + ${k[1]}; + __rt_get_timer(); +% endfor + +past_knl: +% for name, touch in touches.items(): + if (core_id 
== 0) printf("touching `${name}`\n"); + __istc_touch_grid( + core_id, core_num, ${touch['stride']}, + ${touch['ptr']}, ${touch['len']}, &err_sema + ); +% endfor +% for i, check in enumerate(checks): + if (core_id == 0) printf("Performing check ${i}\n"); + __istc_cmp_grids( + core_id, core_num, ${check['stride']}, + ${check['a']}, ${check['b']}, ${check['len']}, ${check['eps']}, + &err_sema + ); +% endfor + + return err_sema; +} + +${datainits} diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py new file mode 100644 index 0000000000..4af67d6e74 --- /dev/null +++ b/sw/saris/util/evalgen.py @@ -0,0 +1,312 @@ +import sys +import json +import numpy as np +from textwrap import indent +from mako.template import Template + + +CHECK_DEF_STRIDE = 17 +CHECK_DEF_EPS = 1e-7 +ELEMTYPE = 'double' +ELEMS_PER_ROW = 4 + +# Keep these dimensions aligned with code headers +GRID_DIMS = { + 1: { 's': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192 }, + 2: { 's': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128 }, + 3: { 's': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32 }, +} + +CSTRUCT_FMT = 'struct TCDMSPC {prname} {{\n{body}\n}};\n{dtype} {decls};' + +CTSTRUCT_FTYPE = 'TCDM PRMD' +CTSTRUCT_DTYPE = 'TCDM PRMXD' + +CTSTRUCT_DEFAULT_GRIDS = { + 'xm': {'seed': 1513, 'dims': [8]}, + 'xp': {'seed': 1514, 'dims': [8]}, + 'ym': {'seed': 1515, 'dims': [8]}, + 'yp': {'seed': 1516, 'dims': [8]}, + 'zm': {'seed': 1517, 'dims': [8]}, + 'zp': {'seed': 1518, 'dims': [8]}, + 'cc': {'seed': 1519}, + 'c0': {'seed': 1520}, + 'uffac': {'seed': 1521}, + 'c': {'seed': 1522, 'dims': [6, 6]}, + 'c3': {'seed': 1523, 'dims': [3, 3, 3]} +} + +CISTRUCT_FTYPE = 'TCDM PRM' +CISTRUCT_DTYPE = 'TCDM PRMX' + + +def set_seed(seed: int = None): + if seed is not None: + np.random.seed(seed) + + +def resolve_dim(dim: str) -> int: + try: + ret = int(dim) + except ValueError: + # If the string does not match our expectations, this will throw accordingly + return 
GRID_DIMS[int(dim[0])][dim[1:]] + if ret <= 0: + raise ValueError(f'Dimensions must be bigger than 1 (got {ret})') + return ret + + +def resolve_dims(grid_args: list) -> list: + return [resolve_dim(dim) for dim in grid_args] + + +def gen_subscripts(int_dims: list) -> str: + return "".join(f'[{d}]' for d in int_dims) + + +def resolve_check(check: dict, grids: dict): + # Set defaults as needed + if 'eps' not in check: + check['eps'] = CHECK_DEF_EPS + if 'stride' not in check: + check['stride'] = CHECK_DEF_STRIDE + # Resolve grids + for grid in ('a', 'b'): + # If either comparison reference is a known grid, resolve it and adopt its length + gname = check[grid] + if gname in grids: + dims = resolve_dims(grids[gname]['dims']) + check[grid] = f'&{gname}' + '[0]'*len(dims) + tgt_len = np.product(dims) + if 'len' in check: + assert check['len'] == tgt_len, \ + f'Mismatching grid check lengths: {tgt_len} ({grids[gname]}) vs {check["len"]}' + else: + check['len'] = tgt_len + # Make sure we have a length now + assert 'len' in check, f'Could not resolve length for check {check}' + + +def resolve_touches(grids: dict, stride: int = CHECK_DEF_STRIDE) -> dict: + ret = {} + for name, grid in grids.items(): + ret[name] = {'stride': stride} + # Resolve grid + dims = resolve_dims(grid['dims']) + ret[name]['ptr'] = f'&{name}' + '[0]'*len(dims) + ret[name]['len'] = np.product(dims) + return ret + + +# Handles one level of nested array initialization. 
+def generate_array_level(int_dims: list, zero, pos: int = 0) -> str: + # Handle degenerate scalar case + if (len(int_dims) == 0): + return str(np.random.normal(size=1)[0] if not zero else 0.0) + elif pos == len(int_dims)-1: + rand_doubles = np.random.normal(size=int_dims[-1]) if not zero else np.zeros(shape=int_dims[-1]) + elems = [str(d) for d in rand_doubles] + elems_fmt = ",\n".join([", ".join(elems[i:i + ELEMS_PER_ROW]) + for i in range(0, len(elems), ELEMS_PER_ROW)]) + else: + elems = [generate_array_level(int_dims, zero, pos+1) for _ in range(int_dims[pos])] + elems_fmt = ', '.join(elems) + return f'{{\n{indent(elems_fmt, " " * 4*(pos+1))}\n}}' + + +# Returns declaration and initialization separately +def generate_grids(grids: dict) -> (str, str): + decls = [] + inits = [] + for name, args in grids.items(): + # First argument provides generation seed + set_seed(args['seed']) + int_dims = resolve_dims(args['dims']) + subscripts = gen_subscripts(int_dims) + attrs = (args['attrs'] + ' ') if 'attrs' in args else '' + decls.append(f'extern __attribute__((visibility("default"))) {attrs}{ELEMTYPE} {name}{subscripts};') + inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};') + return '\n'.join(decls), '\n'.join(inits) + + +# Returns the instantiation of a parameter static class +def generate_ctstruct(grids: dict, prname = 'ct') -> str: + body = [] + decls = [] + for name, args in grids.items(): + # First argument provides generation seed + set_seed(args['seed']) + int_dims = resolve_dims(args['dims']) if 'dims' in args else [] + subscripts = gen_subscripts(int_dims) + body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};') + decls.append(f'{prname}::{name}{subscripts}') + return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CTSTRUCT_DTYPE, decls=", ".join(decls)) + + +# Returns the instantiation of a parameter static class +def 
generate_cistruct(params: dict, prname = 'ci') -> str: + body = [] + decls = [] + for lval, rval in params.items(): + body.append(f'{CISTRUCT_FTYPE} {lval} = {rval};') + decls.append(f'{prname}::{lval}') + return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CISTRUCT_DTYPE, decls=", ".join(decls)) + + +# Returns declaration and initialization separately +def generate_bundles(bundles: dict, grids: dict) -> str: + decls = [] + for name, grid_names in bundles.items(): + int_dims = resolve_dims(grids[grid_names[0]]['dims']) + if any(int_dims != resolve_dims(grids[g]['dims']) for g in grid_names[1:]): + raise ValueError(f'Bundle {name} has mismatching grid dimensions') + attrs = grids[grid_names[0]]['attrs'] + if any(attrs != grids[g]['attrs'] for g in grid_names[1:]): + raise ValueError(f'Bundle {name} has mismatching attributes') + attrs = (attrs + ' ') if attrs else '' + decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}]){gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};') + return '\n'.join(decls) + + +# Returns a code snippet performing a DMA out transfer +def generate_dma_out(dst_grid: tuple, src_grid: tuple, radius: int) -> str: + ndim = len(dst_grid['dims']) + assert ndim == 3 or ndim == 2, 'Only 2D and 3D grids supported' + + dst_dims = resolve_dims(dst_grid['dims']) + src_dims = resolve_dims(src_grid['dims']) + + args = [] + subscripts = f'[{radius}][{radius}]' + if ndim == 3: + subscripts = f'[{radius} + i]{subscripts}' + args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst + args.append(f'(void *)&({src_grid["uid"]}{subscripts})') # src + args.append(f'{src_dims[0] - radius * 2} * sizeof(double)') # size + args.append(f'{src_dims[0]} * sizeof(double)') # src_stride + args.append(f'{dst_dims[0]} * sizeof(double)') # dst_stride + args.append(f'{src_dims[1] - radius * 2}') # repeat + args = ',\n'.join(args) + + dma_call = f'__rt_dma_start_2d(\n{indent(args, " "*4)}\n);' + dma_transfer = 
f'{dma_call}\n' + + if ndim == 3: + loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' + return loop + else: + return dma_transfer + + +# Returns a code snippet performing a DMA in transfer +def generate_dma_in(dst_grid: tuple, src_grid: tuple, radius: int) -> str: + ndim = len(dst_grid['dims']) + assert ndim == 3 or ndim == 2, 'Only 2D and 3D grids supported' + + dst_dims = resolve_dims(dst_grid['dims']) + src_dims = resolve_dims(src_grid['dims']) + + args = [] + subscripts = f'[0][0]' + if ndim == 3: + subscripts = f'[i]{subscripts}' + args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst + args.append(f'(void *)&({src_grid["uid"]}{subscripts})') # src + args.append(f'{dst_dims[0]} * sizeof(double)') # size + args.append(f'{src_dims[0]} * sizeof(double)') # src_stride + args.append(f'{dst_dims[0]} * sizeof(double)') # dst_stride + args.append(f'{dst_dims[1]}') # repeat + args = ',\n'.join(args) + + dma_call = f'__rt_dma_start_2d(\n{indent(args, " "*4)}\n);' + dma_transfer = f'{dma_call}\n' + + if ndim == 3: + loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' + return loop + else: + return dma_transfer + + +# Returns a grid dictionary from the grids dictionary, +# where the key in the grids dictionary is appended to the value +# as the 'uid' field. 
+def get_grid(grids: dict, grid_uid: str) -> tuple: + grid = grids[grid_uid] + grid['uid'] = grid_uid + return grid + + +def resolve_dma_transfers(transfers: list, radius: int) -> list: + # Uniformize single transfer and multiple transfer cases + if not isinstance(transfers[0], list): + transfers = [transfers] + # Expand bidirectional transfers into unidirectional transfers + unidir_transfers = [] + for transfer in transfers: + if len(transfer) < 3: + unidir_transfers.append([*transfer, "in"]) + unidir_transfers.append([*transfer, "out"]) + else: + unidir_transfers.append(transfer) + # Add default radius if absent + for transfer in unidir_transfers: + if len(transfer) < 4: + transfer.append(radius) + return unidir_transfers + + +# Returns a code snippet performing DMA transfers +def generate_dma_transfers(grids: dict, transfers: list) -> str: + s = '' + for transfer in transfers: + l1_grid_name, l3_grid_name, direction, radius = transfer + l1_grid = get_grid(grids, l1_grid_name) + l3_grid = get_grid(grids, l3_grid_name) + if direction == 'out': + s += generate_dma_out(l3_grid, l1_grid, radius) + elif direction == 'in': + s += generate_dma_in(l1_grid, l3_grid, radius) + else: + raise ValueError() + s += '\n__rt_dma_wait_all();' + return s + + +def main(cfg_file: str, tpl_file: str, program: str): + # Load programs to generate from config + with open(cfg_file) as f: + progs = json.load(f) + # Generate code for test program according to its config entry + cfg = progs[program] + grids = cfg['grids'] + cfg['datadecls'], cfg['datainits'] = generate_grids(grids) + cfg['bundledecls'] = "" + if 'bundles' in cfg: + cfg['bundledecls'] = generate_bundles(cfg['bundles'], grids) + ctgrids = CTSTRUCT_DEFAULT_GRIDS; + if 'ctgrids' in cfg: + ctgrids.update(cfg['ctgrids']) + cfg['ctgrids'] = generate_ctstruct(ctgrids) + cfg['ciparams'] = "" + if 'params' in cfg: + cfg['ciparams'] = generate_cistruct(cfg['params']) + if 'checks' not in cfg: + cfg['checks'] = [] + for check in 
cfg['checks']: + resolve_check(check, grids) + cfg['touches'] = {} + if 'touch' in cfg: + touches = {grid_name: grids[grid_name] for grid_name in cfg['touch']} + cfg['touches'] = resolve_touches(touches) + cfg['dma_transfers'] = '' + if 'dma' in cfg: + transfers = resolve_dma_transfers(cfg['dma'], cfg['radius']) + cfg['dma_transfers'] = generate_dma_transfers(grids, transfers) + cfg["nbarriers"] = sum(k[0] for k in cfg['kernels']) + cfg['indent'] = indent + print(Template(filename=tpl_file).render(**cfg)) + + +if __name__ == '__main__': + main(*sys.argv[1:]) From e73ef430ffa8355c0d9877ec8c2cd14b7d77dbbb Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Tue, 2 Apr 2024 16:38:47 +0200 Subject: [PATCH 04/10] sw/saris: Fix license headers --- sw/saris/runtime/crt0.S | 4 ++++ sw/saris/runtime/dma.h | 4 ++++ sw/saris/runtime/link.ld | 4 ++++ sw/saris/runtime/runtime.h | 4 ++++ sw/saris/runtime/runtime.hpp | 4 ++++ sw/saris/runtime/sssr.h | 4 ++++ sw/saris/stencils/istc.common.hpp | 4 ++++ sw/saris/stencils/istc.issr.hpp | 4 ++++ sw/saris/stencils/istc.par.hpp | 4 ++++ sw/saris/util/eval.cpp.tpl | 4 ++++ sw/saris/util/evalgen.py | 5 +++++ 11 files changed, 45 insertions(+) diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S index 79efb0cbbe..96efe9b49b 100644 --- a/sw/saris/runtime/crt0.S +++ b/sw/saris/runtime/crt0.S @@ -1,3 +1,7 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + # HTIF sections .pushsection .htif,"aw",@progbits; .align 6; .global tohost; tohost: .dword 0; diff --git a/sw/saris/runtime/dma.h b/sw/saris/runtime/dma.h index 80956b0f73..5a664b0ce3 100644 --- a/sw/saris/runtime/dma.h +++ b/sw/saris/runtime/dma.h @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/sw/saris/runtime/link.ld b/sw/saris/runtime/link.ld index 5788547bdd..13fc1570f9 100644 --- a/sw/saris/runtime/link.ld +++ b/sw/saris/runtime/link.ld @@ -1,3 +1,7 @@ +/* Copyright 2024 ETH Zurich and University of Bologna. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: SHL-0.51 */ + OUTPUT_ARCH( "riscv" ) ENTRY(_start) diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h index 883bacb2ae..414fa9e394 100644 --- a/sw/saris/runtime/runtime.h +++ b/sw/saris/runtime/runtime.h @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/sw/saris/runtime/runtime.hpp b/sw/saris/runtime/runtime.hpp index df501ff20e..b9a60e564a 100644 --- a/sw/saris/runtime/runtime.hpp +++ b/sw/saris/runtime/runtime.hpp @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #pragma once // C linkage macros diff --git a/sw/saris/runtime/sssr.h b/sw/saris/runtime/sssr.h index 171ccb454f..78fec8f366 100644 --- a/sw/saris/runtime/sssr.h +++ b/sw/saris/runtime/sssr.h @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #pragma once // Registers diff --git a/sw/saris/stencils/istc.common.hpp b/sw/saris/stencils/istc.common.hpp index 042005a741..e005e39ac7 100644 --- a/sw/saris/stencils/istc.common.hpp +++ b/sw/saris/stencils/istc.common.hpp @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + #include #include #include diff --git a/sw/saris/stencils/istc.issr.hpp b/sw/saris/stencils/istc.issr.hpp index c74d76b4dc..d81614e36c 100644 --- a/sw/saris/stencils/istc.issr.hpp +++ b/sw/saris/stencils/istc.issr.hpp @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #include "istc.common.hpp" // =============== diff --git a/sw/saris/stencils/istc.par.hpp b/sw/saris/stencils/istc.par.hpp index 37ba6fd4e3..26a042d05f 100644 --- a/sw/saris/stencils/istc.par.hpp +++ b/sw/saris/stencils/istc.par.hpp @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #include "istc.common.hpp" // =============== diff --git a/sw/saris/util/eval.cpp.tpl b/sw/saris/util/eval.cpp.tpl index ad22b628ba..edd26e6c5b 100644 --- a/sw/saris/util/eval.cpp.tpl +++ b/sw/saris/util/eval.cpp.tpl @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #include "runtime.hpp" #include "istc.par.hpp" #include "istc.issr.hpp" diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py index 4af67d6e74..df48f00f3d 100644 --- a/sw/saris/util/evalgen.py +++ b/sw/saris/util/evalgen.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + import sys import json import numpy as np From d41cd4e595450b77a4743861a523b8ae246c133b Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Tue, 2 Apr 2024 16:54:27 +0200 Subject: [PATCH 05/10] sw/saris: Fix python lint --- sw/saris/util/evalgen.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py index df48f00f3d..25c2c40b2b 100644 --- a/sw/saris/util/evalgen.py +++ b/sw/saris/util/evalgen.py @@ -17,9 +17,9 @@ # Keep these dimensions aligned with code headers GRID_DIMS = { - 1: { 's': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192 }, - 2: { 's': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128 }, - 3: { 's': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32 }, + 1: {'s': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192}, + 2: {'s': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128}, + 3: {'s': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32}, } CSTRUCT_FMT = 'struct TCDMSPC {prname} {{\n{body}\n}};\n{dtype} {decls};' @@ -85,7 +85,8 @@ def resolve_check(check: dict, grids: dict): tgt_len = np.product(dims) if 'len' in check: assert check['len'] == tgt_len, \ - f'Mismatching grid check lengths: {tgt_len} ({grids[gname]}) vs {check["len"]}' + 'Mismatching grid check lengths:' \ + f'{tgt_len} ({grids[gname]}) vs {check["len"]}' else: check['len'] = tgt_len # Make sure we have a length now @@ -109,7 +110,8 @@ def generate_array_level(int_dims: list, zero, pos: int = 0) -> str: if (len(int_dims) == 0): return str(np.random.normal(size=1)[0] if not zero else 0.0) elif pos == len(int_dims)-1: - rand_doubles = np.random.normal(size=int_dims[-1]) if not zero else np.zeros(shape=int_dims[-1]) + rand_doubles = np.random.normal(size=int_dims[-1]) if \ + not zero else np.zeros(shape=int_dims[-1]) elems = [str(d) for d in rand_doubles] elems_fmt = ",\n".join([", ".join(elems[i:i + 
ELEMS_PER_ROW]) for i in range(0, len(elems), ELEMS_PER_ROW)]) @@ -129,13 +131,15 @@ def generate_grids(grids: dict) -> (str, str): int_dims = resolve_dims(args['dims']) subscripts = gen_subscripts(int_dims) attrs = (args['attrs'] + ' ') if 'attrs' in args else '' - decls.append(f'extern __attribute__((visibility("default"))) {attrs}{ELEMTYPE} {name}{subscripts};') - inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};') + decls.append('extern __attribute__((visibility("default")))' + + f' {attrs}{ELEMTYPE} {name}{subscripts};') + inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} =' + + f'{generate_array_level(int_dims, args["seed"] == 0)};') return '\n'.join(decls), '\n'.join(inits) # Returns the instantiation of a parameter static class -def generate_ctstruct(grids: dict, prname = 'ct') -> str: +def generate_ctstruct(grids: dict, prname='ct') -> str: body = [] decls = [] for name, args in grids.items(): @@ -143,19 +147,22 @@ def generate_ctstruct(grids: dict, prname = 'ct') -> str: set_seed(args['seed']) int_dims = resolve_dims(args['dims']) if 'dims' in args else [] subscripts = gen_subscripts(int_dims) - body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};') + body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = ' + + f'{generate_array_level(int_dims, args["seed"] == 0)};') decls.append(f'{prname}::{name}{subscripts}') - return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CTSTRUCT_DTYPE, decls=", ".join(decls)) + return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), + dtype=CTSTRUCT_DTYPE, decls=", ".join(decls)) # Returns the instantiation of a parameter static class -def generate_cistruct(params: dict, prname = 'ci') -> str: +def generate_cistruct(params: dict, prname='ci') -> str: body = [] decls = [] for lval, rval in params.items(): body.append(f'{CISTRUCT_FTYPE} {lval} = {rval};') 
decls.append(f'{prname}::{lval}') - return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CISTRUCT_DTYPE, decls=", ".join(decls)) + return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), + dtype=CISTRUCT_DTYPE, decls=", ".join(decls)) # Returns declaration and initialization separately @@ -169,7 +176,8 @@ def generate_bundles(bundles: dict, grids: dict) -> str: if any(attrs != grids[g]['attrs'] for g in grid_names[1:]): raise ValueError(f'Bundle {name} has mismatching attributes') attrs = (attrs + ' ') if attrs else '' - decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}]){gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};') + decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}])' + + f'{gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};') return '\n'.join(decls) @@ -197,7 +205,8 @@ def generate_dma_out(dst_grid: tuple, src_grid: tuple, radius: int) -> str: dma_transfer = f'{dma_call}\n' if ndim == 3: - loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' + loop = '#pragma clang loop unroll(disable)\nfor (int i = 0; i < ' + \ + f'{src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' return loop else: return dma_transfer @@ -212,7 +221,7 @@ def generate_dma_in(dst_grid: tuple, src_grid: tuple, radius: int) -> str: src_dims = resolve_dims(src_grid['dims']) args = [] - subscripts = f'[0][0]' + subscripts = '[0][0]' if ndim == 3: subscripts = f'[i]{subscripts}' args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst @@ -227,7 +236,8 @@ def generate_dma_in(dst_grid: tuple, src_grid: tuple, radius: int) -> str: dma_transfer = f'{dma_call}\n' if ndim == 3: - loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' + loop = '#pragma clang loop unroll(disable)\nfor (int i = 0; i < 
' + \ + f'{dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' return loop else: return dma_transfer @@ -289,7 +299,7 @@ def main(cfg_file: str, tpl_file: str, program: str): cfg['bundledecls'] = "" if 'bundles' in cfg: cfg['bundledecls'] = generate_bundles(cfg['bundles'], grids) - ctgrids = CTSTRUCT_DEFAULT_GRIDS; + ctgrids = CTSTRUCT_DEFAULT_GRIDS if 'ctgrids' in cfg: ctgrids.update(cfg['ctgrids']) cfg['ctgrids'] = generate_ctstruct(ctgrids) From a62d6e41ef2dbbcbe7950a96cd517978c87b0b08 Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Tue, 2 Apr 2024 16:59:37 +0200 Subject: [PATCH 06/10] lint: Do not C++ lint SARIS sources --- .github/workflows/lint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 6c4f91184b..65159afabd 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -129,6 +129,7 @@ jobs: - uses: actions/checkout@v3 - uses: DoozyX/clang-format-lint-action@v0.16.2 with: + exclude: './sw/saris' clangFormatVersion: 10 ###################### From 31aa679125f0ea9a5180cbae5bb1dfec7621291e Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Tue, 2 Apr 2024 18:00:11 +0200 Subject: [PATCH 07/10] sw/saris: Remove stub LLVM from makefile --- sw/saris/Makefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sw/saris/Makefile b/sw/saris/Makefile index e9bfb82500..bb1033cd55 100644 --- a/sw/saris/Makefile +++ b/sw/saris/Makefile @@ -11,9 +11,13 @@ all: # Environment # ############### -# NOTE: This needs to be a specific revision of PULP RISCV LLVM 15: -# TODO: add commit link here -LLVM_BINROOT ?= /home/paulsc/dev/llvm-ssr/llvm-iis/install/bin +# NOTE: the LLVM_BINROOT environment variable must point to a specific revision of PULP RISCV +# LLVM 15 (see README.md). After compilation, you can set LLVM_BINROOT in your environment, this +# makefile, or pass it on invocation of `make`. 
+ifndef LLVM_BINROOT
+$(error LLVM_BINROOT is not set; please compile the SARIS version of LLVM 15 (see README.md) and set LLVM_BINROOT to its binary location.)
+endif
+
 PYTHON3 ?= python3
 SARISDIR ?= .

From 2050a2aad569c8046c79be8a9aef5053b5597d69 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 18:00:32 +0200
Subject: [PATCH 08/10] sw/saris: Add README.md

---
 sw/saris/README.md | 50 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/sw/saris/README.md b/sw/saris/README.md
index 464090415c..29dd472152 100644
--- a/sw/saris/README.md
+++ b/sw/saris/README.md
@@ -1 +1,49 @@
-# TODO
+# SARIS Stencil Kernels
+
+This directory contains the baseline- and SSSR-accelerated Snitch cluster stencil kernels used in the evaluation section of the paper _"SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers"_. In our paper, we describe how indirect stream register architectures such as SSSRs can significantly accelerate stencil codes.
+
+If you use our code or compare against our work, please cite us:
+
+```
+TODO
+```
+
+> [!IMPORTANT]
+> - Unlike other software in this repository, compiling this code requires a **custom version of the LLVM 15 toolchain** with some extensions and improvements. The source code for this LLVM fork can be found [here](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0).
+> - The generated example programs are only intended to be used **in RTL simulation of an SSSR-extended cluster**, using the custom cluster configuration `cfg/sssr.hjson`.
+
+## Directory Structure
+
+* `stencils/`: Baseline (`istc.par.hpp`) and SARIS-accelerated (`istc.issr.hpp`) stencil codes.
+* `runtime/`: Additional runtime code and linking configuration needed for compilation.
+* `util/`: Evaluation program generator supporting different grid sizes and kernel calls.
+* `eval.json`: Configuration for test program generator.
+
+## Compile Evaluation Programs
+
+Before you can compile test problems, you need the [SARIS LLVM 15 toolchain](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0) along with `newlib` and `compiler-rt`. The required build steps are outlined [here](https://github.com/pulp-platform/llvm-toolchain-cd/blob/main/README.md).
+
+Then, you can build the test programs specified in `eval.json` by running:
+
+```
+make LLVM_BINROOT=<llvm_install_path>/bin all
+```
+
+By default, `eval.json` specifies RV32G and SSSR-accelerated test programs for all included stencils as specified in our paper. Binaries are generated in `bin/` and disassembled program dumps in `dump/`.
+
+
+## Run Evaluation Programs
+
+Evaluation programs can only be run in RTL simulation of a Snitch cluster using the configuration `cfg/sssr.hjson`. For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`:
+
+```
+make CFG_OVERRIDE=cfg/sssr.hjson bin/snitch_cluster.vsim
+```
+
+Then, the built evaluation programs can be run on this simulation setup as usual, for example:
+
+```
+bin/snitch_cluster.vsim ../../sw/saris/bin/istc.pb_jacobi_2d_ml_issr.elf
+```
+
+Performance metrics can be analyzed using the annotating Snitch tracer (`make traces`). In the default evaluation programs, the section of interest is section 2.
From ab4fe304366da849eca8ba1c27c332c817822913 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 19:07:23 +0200
Subject: [PATCH 09/10] sw/saris: Initialize putchar buffer, fix F extension
 skip

---
 sw/saris/runtime/crt0.S    | 18 +++++++++++++-----
 sw/saris/runtime/runtime.h |  2 +-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S
index 96efe9b49b..7b3b8644cc 100644
--- a/sw/saris/runtime/crt0.S
+++ b/sw/saris/runtime/crt0.S
@@ -30,10 +30,14 @@ _start:
 slli t0, a0, 3
 sub sp, sp, t0

- # check if the core has the F-extension
- csrr t0, misa
- andi t0, t0, (1 << 5)
- beqz t0, _clr_ireg
+.globl _putcb
+_init_putcb:
+ la t0, _putcb
+ # Initialize putchar buffer size of each core to 0
+ slli t1, a0, 10
+ add t0, t0, t1
+ sw zero, 0(t0)
+ sw zero, 4(t0)

 _skip_dmcc_work:
 # Skip the coming two steps unless we are the DMA core
@@ -78,6 +82,11 @@ _dmcc_work_sync:
 # Synchronize cores so data is ready
 csrr x0, 0x7C2

+ # check if the core has the F-extension
+ csrr t0, misa
+ andi t0, t0, (1 << 5)
+ beqz t0, _clr_ireg
+
 # Reset float regs if present
 _clr_freg:
 fcvt.d.w f0, x0
@@ -158,6 +167,5 @@ _done:
 wfi

-.globl _putcb
 .section .data._putcb
 _putcb:
diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h
index 414fa9e394..072cfecbc0 100644
--- a/sw/saris/runtime/runtime.h
+++ b/sw/saris/runtime/runtime.h
@@ -32,7 +32,7 @@ static inline volatile uint32_t __rt_get_hartid() {
 }

 // Rudimentary string buffer for putchar calls.
extern uint32_t _putcb; -#define PUTC_BUFFER_LEN (1024 - sizeof(size_t)) +#define PUTC_BUFFER_LEN (1024 - sizeof(size_t) - 8*sizeof(uint64_t)) typedef struct { size_t size; From ea40640bd389721009a76fe4a19977dff68e1923 Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Fri, 5 Apr 2024 18:11:37 +0200 Subject: [PATCH 10/10] sw/saris: Switch to, adapt default config, add bib placeholders --- README.md | 18 +++ docs/publications.md | 18 +++ sw/saris/README.md | 16 ++- target/snitch_cluster/cfg/default.hjson | 41 +++++-- target/snitch_cluster/cfg/sssr.hjson | 153 ------------------------ 5 files changed, 81 insertions(+), 165 deletions(-) delete mode 100644 target/snitch_cluster/cfg/sssr.hjson diff --git a/README.md b/README.md index 1f7b6459cd..4280d47438 100644 --- a/README.md +++ b/README.md @@ -161,3 +161,21 @@ If you use the Snitch cluster or its extensions in your work, you can cite us: ```

+ +
+SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers +

+ +``` +@misc{scheffler2024saris, + title={SARIS: Accelerating Stencil Computations on Energy-Efficient + RISC-V Compute Clusters with Indirect Stream Registers}, + author={Paul Scheffler and Luca Colagrande and Luca Benini}, + year={2024}, + eprint={}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} +``` + +

diff --git a/docs/publications.md b/docs/publications.md index e4c86b4c6d..2395b70c73 100644 --- a/docs/publications.md +++ b/docs/publications.md @@ -118,4 +118,22 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:

+
+SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers +

+ +``` +@misc{scheffler2024saris, + title={SARIS: Accelerating Stencil Computations on Energy-Efficient + RISC-V Compute Clusters with Indirect Stream Registers}, + author={Paul Scheffler and Luca Colagrande and Luca Benini}, + year={2024}, + eprint={}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} +``` + +

+ diff --git a/sw/saris/README.md b/sw/saris/README.md index 29dd472152..2da223df0d 100644 --- a/sw/saris/README.md +++ b/sw/saris/README.md @@ -5,12 +5,20 @@ This directory contains the baseline- and SSSR-accelerated Snitch cluster stenci If you use our code or compare against our work, please cite us: ``` -TODO +@misc{scheffler2024saris, + title={SARIS: Accelerating Stencil Computations on Energy-Efficient + RISC-V Compute Clusters with Indirect Stream Registers}, + author={Paul Scheffler and Luca Colagrande and Luca Benini}, + year={2024}, + eprint={}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} ``` > [!IMPORTANT] > - Unlike other software in this repository, compiling this code requires a **custom version of the LLVM 15 toolchain** with some extensions and improvements. The source code for this LLVM fork can be found [here](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0). -> - The generated example programs are only intended to be used **in RTL simulation of an SSSR-extended cluster**, using the custom cluster configuration `cfg/sssr.json`. +> - The generated example programs are only intended to be used **in RTL simulation of a default, SSSR-extended cluster**, using the cluster configuration `cfg/default.hjson`. ## Directory Structure @@ -34,10 +42,10 @@ By default, `eval.json` specifies RV32G and SSSR-accelerated test programs for a ## Run Evaluation Programs -Evaluation programs can only be run in RTL simulation of a Snitch cluster using the configuration `cfg/sssr.json`. For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`: +Evaluation programs can only be run in RTL simulation of a Snitch cluster using the default, SSSR-enhanced configuration `cfg/default.json`. 
For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`: ``` -make CFG_OVERRIDE=cfg/sssr.hjson bin/snitch_cluster.vsim +make CFG_OVERRIDE=cfg/default.hjson bin/snitch_cluster.vsim ``` Then, the built evaluation programs can be run on this simulation setup as usual, for example: diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson index adfe7adf9e..2267b57525 100644 --- a/target/snitch_cluster/cfg/default.hjson +++ b/target/snitch_cluster/cfg/default.hjson @@ -16,6 +16,7 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, + user_width: 5, // clog2(total number of clusters) tcdm: { size: 128, banks: 32, @@ -24,14 +25,28 @@ zero_mem_size: 64, // kB alias_region_enable: true, dma_data_width: 512, - dma_axi_req_fifo_depth: 3, - dma_req_fifo_depth: 3, + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + dma_user_width: 1, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, // Timing parameters timing: { - lat_comp_fp32: 3, + lat_comp_fp32: 2, lat_comp_fp64: 3, - lat_comp_fp16: 2, - lat_comp_fp16_alt: 2, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, lat_comp_fp8: 1, lat_comp_fp8_alt: 1, lat_noncomp: 1, @@ -44,7 +59,10 @@ register_core_req: true, register_core_rsp: true, register_offload_req: true, - register_offload_rsp: true + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false }, hives: [ // Hive 0 @@ -94,6 +112,7 @@ xf8alt: true, xfdotp: true, xfvec: true, + ssr_nr_credits: 4, num_int_outstanding_loads: 1, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, @@ -101,8 +120,14 @@ num_sequencer_instructions: 16, num_dtlb_entries: 1, num_itlb_entries: 1, - // Enable 
division/square root unit - // Xdiv_sqrt: true, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], }, dma_core_template: { isa: "rv32imafd", diff --git a/target/snitch_cluster/cfg/sssr.hjson b/target/snitch_cluster/cfg/sssr.hjson deleted file mode 100644 index ee297960a9..0000000000 --- a/target/snitch_cluster/cfg/sssr.hjson +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2023 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Occamy-like Cluster configuration (+alias) for SSSR experiments -{ - nr_s1_quadrant: 1, - s1_quadrant: { - nr_clusters: 1, - }, - - cluster: { - boot_addr: 4096, // 0x1000 - cluster_base_addr: 268435456, // 0x1000_0000 - cluster_base_offset: 0, // 0x0 - cluster_base_hartid: 0, - addr_width: 48, - data_width: 64, - user_width: 5, // clog2(total number of clusters) - tcdm: { - size: 128, - banks: 32, - }, - cluster_periph_size: 64, // kB - zero_mem_size: 64, // kB - alias_region_enable: true, - dma_data_width: 512, - dma_axi_req_fifo_depth: 24, - dma_req_fifo_depth: 8, - narrow_trans: 4, - wide_trans: 32, - dma_user_width: 1, - // We don't need Snitch debugging in Occamy - enable_debug: false, - // We don't need Snitch (core-internal) virtual memory support - vm_support: false, - // Memory configuration inputs - sram_cfg_expose: true, - sram_cfg_fields: { - ema: 3, - emaw: 2, - emas: 1 - }, - // Timing parameters - timing: { - lat_comp_fp32: 2, - lat_comp_fp64: 3, - lat_comp_fp16: 1, - lat_comp_fp16_alt: 1, - lat_comp_fp8: 1, - lat_comp_fp8_alt: 1, - lat_noncomp: 1, - lat_conv: 2, - lat_sdotp: 3, - fpu_pipe_config: "BEFORE", - narrow_xbar_latency: "CUT_ALL_PORTS", - wide_xbar_latency: "CUT_ALL_PORTS", - // Isolate the core. 
- register_core_req: true, - register_core_rsp: true, - register_offload_req: true, - register_offload_rsp: true, - register_fpu_req: true, - register_ext_narrow: false, - register_ext_wide: false - }, - hives: [ - // Hive 0 - { - icache: { - size: 8, // total instruction cache size in kByte - sets: 2, // number of ways - cacheline: 256 // word size in bits - }, - cores: [ - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/dma_core_template" }, - ] - } - ] - }, - dram: { - // 0x8000_0000 - address: 2147483648, - // 0x8000_0000 - length: 2147483648 - }, - peripherals: { - clint: { - // 0xffff_0000 - address: 4294901760, - // 0x0000_1000 - length: 4096 - }, - }, - // Templates. - compute_core_template: { - isa: "rv32imafd", - xssr: true, - xfrep: true, - xdma: false, - xf16: true, - xf16alt: true, - xf8: true, - xf8alt: true, - xfdotp: true, - xfvec: true, - ssr_nr_credits: 4, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - // SSSR configuration below - ssr_intersection: true, - ssr_intersection_triple: [0, 1, 2], - ssrs: [ - {indirection: true}, // Master 0 - {indirection: true}, // Master 1 - {}, // Slave - ], - }, - dma_core_template: { - isa: "rv32imafd", - // Xdiv_sqrt: true, - // isa: "rv32ema", - xdma: true, - xssr: false, - xfrep: false, - xf16: false, - xf16alt: false, - xf8: false, - xf8alt: false, - xfdotp: false, - xfvec: false, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - } -}