From 7bdd312382f5630de2b98f1dd696b7277ecfa2cf Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Fri, 22 Mar 2024 22:10:18 +0100
Subject: [PATCH 01/10] hw: Keep IO fixed regardless of configuration
---
.../src/snitch_cluster_wrapper.sv.tpl | 16 ++++++----------
target/snitch_cluster/test/testharness.sv | 4 ++++
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
index 293417ff68..c40f504406 100644
--- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
+++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
@@ -210,28 +210,24 @@ ${ssr_cfg(core, "'{{{indirection:d}, {isect_master:d}, {isect_master_idx:d}, {is
${ssr_cfg(core, '{reg_idx}', '/*None*/ 0', ',')}\
};
+ // Forward potentially optional configuration parameters
+ localparam logic [9:0] CfgBaseHartId = (${to_sv_hex(cfg['cluster_base_hartid'], 10)});
+ localparam addr_t CfgClusterBaseAddr = (${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])});
+
endpackage
// verilog_lint: waive-stop package-filename
module ${cfg['name']}_wrapper (
input logic clk_i,
input logic rst_ni,
-% if cfg['enable_debug']:
input logic [${cfg['pkg_name']}::NrCores-1:0] debug_req_i,
-% endif
input logic [${cfg['pkg_name']}::NrCores-1:0] meip_i,
input logic [${cfg['pkg_name']}::NrCores-1:0] mtip_i,
input logic [${cfg['pkg_name']}::NrCores-1:0] msip_i,
-% if cfg['cluster_base_expose']:
input logic [9:0] hart_base_id_i,
input logic [${cfg['addr_width']-1}:0] cluster_base_addr_i,
-% endif
-% if cfg['timing']['iso_crossings']:
input logic clk_d2_bypass_i,
-% endif
-% if cfg['sram_cfg_expose']:
input ${cfg['pkg_name']}::sram_cfgs_t sram_cfgs_i,
-%endif
input ${cfg['pkg_name']}::narrow_in_req_t narrow_in_req_i,
output ${cfg['pkg_name']}::narrow_in_resp_t narrow_in_resp_o,
output ${cfg['pkg_name']}::narrow_out_req_t narrow_out_req_o,
@@ -354,8 +350,8 @@ module ${cfg['name']}_wrapper (
.hart_base_id_i,
.cluster_base_addr_i,
% else:
- .hart_base_id_i (${to_sv_hex(cfg['cluster_base_hartid'], 10)}),
- .cluster_base_addr_i (${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])}),
+ .hart_base_id_i (${cfg['pkg_name']}::CfgBaseHartId),
+ .cluster_base_addr_i (${cfg['pkg_name']}::CfgClusterBaseAddr),
% endif
% if cfg['timing']['iso_crossings']:
.clk_d2_bypass_i,
diff --git a/target/snitch_cluster/test/testharness.sv b/target/snitch_cluster/test/testharness.sv
index afc6972ed1..dbde824efc 100644
--- a/target/snitch_cluster/test/testharness.sv
+++ b/target/snitch_cluster/test/testharness.sv
@@ -29,6 +29,10 @@ module testharness import snitch_cluster_pkg::*; (
.meip_i ('0),
.mtip_i ('0),
.msip_i (msip),
+ .hart_base_id_i (CfgBaseHartId),
+ .cluster_base_addr_i (CfgClusterBaseAddr),
+ .clk_d2_bypass_i (1'b0),
+ .sram_cfgs_i (snitch_cluster_pkg::sram_cfgs_t'('0)),
.narrow_in_req_i (narrow_in_req),
.narrow_in_resp_o (narrow_in_resp),
.narrow_out_req_o (narrow_out_req),
From ecdc4657dbc78b05657e5bee4608a6e113d3358b Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Fri, 22 Mar 2024 23:03:54 +0100
Subject: [PATCH 02/10] target/snitch_cluster: Add Occamy-like config with
SSSRs
---
target/snitch_cluster/cfg/sssr.hjson | 153 +++++++++++++++++++++++++++
1 file changed, 153 insertions(+)
create mode 100644 target/snitch_cluster/cfg/sssr.hjson
diff --git a/target/snitch_cluster/cfg/sssr.hjson b/target/snitch_cluster/cfg/sssr.hjson
new file mode 100644
index 0000000000..ee297960a9
--- /dev/null
+++ b/target/snitch_cluster/cfg/sssr.hjson
@@ -0,0 +1,153 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Occamy-like Cluster configuration (+alias) for SSSR experiments
+{
+ nr_s1_quadrant: 1,
+ s1_quadrant: {
+ nr_clusters: 1,
+ },
+
+ cluster: {
+ boot_addr: 4096, // 0x1000
+ cluster_base_addr: 268435456, // 0x1000_0000
+ cluster_base_offset: 0, // 0x0
+ cluster_base_hartid: 0,
+ addr_width: 48,
+ data_width: 64,
+ user_width: 5, // clog2(total number of clusters)
+ tcdm: {
+ size: 128,
+ banks: 32,
+ },
+ cluster_periph_size: 64, // kB
+ zero_mem_size: 64, // kB
+ alias_region_enable: true,
+ dma_data_width: 512,
+ dma_axi_req_fifo_depth: 24,
+ dma_req_fifo_depth: 8,
+ narrow_trans: 4,
+ wide_trans: 32,
+ dma_user_width: 1,
+ // We don't need Snitch debugging in Occamy
+ enable_debug: false,
+ // We don't need Snitch (core-internal) virtual memory support
+ vm_support: false,
+ // Memory configuration inputs
+ sram_cfg_expose: true,
+ sram_cfg_fields: {
+ ema: 3,
+ emaw: 2,
+ emas: 1
+ },
+ // Timing parameters
+ timing: {
+ lat_comp_fp32: 2,
+ lat_comp_fp64: 3,
+ lat_comp_fp16: 1,
+ lat_comp_fp16_alt: 1,
+ lat_comp_fp8: 1,
+ lat_comp_fp8_alt: 1,
+ lat_noncomp: 1,
+ lat_conv: 2,
+ lat_sdotp: 3,
+ fpu_pipe_config: "BEFORE",
+ narrow_xbar_latency: "CUT_ALL_PORTS",
+ wide_xbar_latency: "CUT_ALL_PORTS",
+ // Isolate the core.
+ register_core_req: true,
+ register_core_rsp: true,
+ register_offload_req: true,
+ register_offload_rsp: true,
+ register_fpu_req: true,
+ register_ext_narrow: false,
+ register_ext_wide: false
+ },
+ hives: [
+ // Hive 0
+ {
+ icache: {
+ size: 8, // total instruction cache size in kByte
+ sets: 2, // number of ways
+ cacheline: 256 // word size in bits
+ },
+ cores: [
+ { $ref: "#/compute_core_template" },
+ { $ref: "#/compute_core_template" },
+ { $ref: "#/compute_core_template" },
+ { $ref: "#/compute_core_template" },
+ { $ref: "#/compute_core_template" },
+ { $ref: "#/compute_core_template" },
+ { $ref: "#/compute_core_template" },
+ { $ref: "#/compute_core_template" },
+ { $ref: "#/dma_core_template" },
+ ]
+ }
+ ]
+ },
+ dram: {
+ // 0x8000_0000
+ address: 2147483648,
+ // 0x8000_0000
+ length: 2147483648
+ },
+ peripherals: {
+ clint: {
+ // 0xffff_0000
+ address: 4294901760,
+ // 0x0000_1000
+ length: 4096
+ },
+ },
+ // Templates.
+ compute_core_template: {
+ isa: "rv32imafd",
+ xssr: true,
+ xfrep: true,
+ xdma: false,
+ xf16: true,
+ xf16alt: true,
+ xf8: true,
+ xf8alt: true,
+ xfdotp: true,
+ xfvec: true,
+ ssr_nr_credits: 4,
+ num_int_outstanding_loads: 1,
+ num_int_outstanding_mem: 4,
+ num_fp_outstanding_loads: 4,
+ num_fp_outstanding_mem: 4,
+ num_sequencer_instructions: 16,
+ num_dtlb_entries: 1,
+ num_itlb_entries: 1,
+ // SSSR configuration below
+ ssr_intersection: true,
+ ssr_intersection_triple: [0, 1, 2],
+ ssrs: [
+ {indirection: true}, // Master 0
+ {indirection: true}, // Master 1
+ {}, // Slave
+ ],
+ },
+ dma_core_template: {
+ isa: "rv32imafd",
+ // Xdiv_sqrt: true,
+ // isa: "rv32ema",
+ xdma: true,
+ xssr: false,
+ xfrep: false,
+ xf16: false,
+ xf16alt: false,
+ xf8: false,
+ xf8alt: false,
+ xfdotp: false,
+ xfvec: false,
+ num_int_outstanding_loads: 1,
+ num_int_outstanding_mem: 4,
+ num_fp_outstanding_loads: 4,
+ num_fp_outstanding_mem: 4,
+ num_sequencer_instructions: 16,
+ num_dtlb_entries: 1,
+ num_itlb_entries: 1,
+ }
+}
From 9629cb985f5c7ea03bd2e6cf55a342cd93f5ea67 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Fri, 22 Mar 2024 23:04:26 +0100
Subject: [PATCH 03/10] sw: Add SARIS kernels
---
sw/saris/.gitignore | 3 +
sw/saris/Makefile | 126 +++++
sw/saris/README.md | 1 +
sw/saris/eval.json | 396 ++++++++++++++
sw/saris/runtime/crt0.S | 159 ++++++
sw/saris/runtime/dma.h | 75 +++
sw/saris/runtime/link.ld | 42 ++
sw/saris/runtime/runtime.h | 137 +++++
sw/saris/runtime/runtime.hpp | 20 +
sw/saris/runtime/sssr.h | 189 +++++++
sw/saris/stencils/istc.common.hpp | 181 ++++++
sw/saris/stencils/istc.issr.hpp | 879 ++++++++++++++++++++++++++++++
sw/saris/stencils/istc.par.hpp | 239 ++++++++
sw/saris/util/eval.cpp.tpl | 55 ++
sw/saris/util/evalgen.py | 312 +++++++++++
15 files changed, 2814 insertions(+)
create mode 100644 sw/saris/.gitignore
create mode 100644 sw/saris/Makefile
create mode 100644 sw/saris/README.md
create mode 100644 sw/saris/eval.json
create mode 100644 sw/saris/runtime/crt0.S
create mode 100644 sw/saris/runtime/dma.h
create mode 100644 sw/saris/runtime/link.ld
create mode 100644 sw/saris/runtime/runtime.h
create mode 100644 sw/saris/runtime/runtime.hpp
create mode 100644 sw/saris/runtime/sssr.h
create mode 100644 sw/saris/stencils/istc.common.hpp
create mode 100644 sw/saris/stencils/istc.issr.hpp
create mode 100644 sw/saris/stencils/istc.par.hpp
create mode 100644 sw/saris/util/eval.cpp.tpl
create mode 100644 sw/saris/util/evalgen.py
diff --git a/sw/saris/.gitignore b/sw/saris/.gitignore
new file mode 100644
index 0000000000..7d0ba6408d
--- /dev/null
+++ b/sw/saris/.gitignore
@@ -0,0 +1,3 @@
+bin
+dump
+gen
diff --git a/sw/saris/Makefile b/sw/saris/Makefile
new file mode 100644
index 0000000000..e9bfb82500
--- /dev/null
+++ b/sw/saris/Makefile
@@ -0,0 +1,126 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+# Paul Scheffler
+# Luca Colagrande
+
+all:
+
+###############
+# Environment #
+###############
+
+# NOTE: This needs to be a specific revision of PULP RISCV LLVM 15:
+# TODO: add commit link here
+LLVM_BINROOT ?= /home/paulsc/dev/llvm-ssr/llvm-iis/install/bin
+PYTHON3 ?= python3
+
+SARISDIR ?= .
+GENDIR ?= $(SARISDIR)/gen
+UTILDIR ?= $(SARISDIR)/util
+BINDIR ?= $(SARISDIR)/bin
+DUMPDIR ?= $(SARISDIR)/dump
+RTDIR ?= $(SARISDIR)/runtime
+
+# We depend on the printf submodule
+PRINTFDIR ?= $(SARISDIR)/../deps/printf
+
+############################
+# Compiler (LLVM 15) Setup #
+############################
+
+RISCV_MARCH ?= \
+rv32imafd_zfh_xfrep_xssr_xdma_xfalthalf_xfquarter_xfaltquarter_xfvecsingle_xfvechalf_$\
+xfvecalthalf_xfvecquarter_xfvecaltquarter_xfauxhalf_xfauxalthalf_xfauxquarter_xfauxaltquarter_$\
+xfauxvecsingle_xfauxvechalf_xfauxvecalthalf_xfauxvecquarter_xfauxvecaltquarter_xfexpauxvechalf_$\
+xfexpauxvecalthalf_xfexpauxvecquarter_xfexpauxvecaltquarter
+
+RISCV_MABI ?= ilp32d
+
+RISCV_CC ?= $(LLVM_BINROOT)/clang
+RISCV_CXX ?= $(LLVM_BINROOT)/clang++
+RISCV_OBJDUMP ?= $(LLVM_BINROOT)/llvm-objdump
+RISCV_STRIP ?= $(LLVM_BINROOT)/llvm-strip
+
+RISCV_STACK ?= 2048
+RISCV_FLAGS ?= -mcpu=snitch -march=$(RISCV_MARCH) -Ofast -flto -mabi=$(RISCV_MABI) \
+ -Wframe-larger-than=$(RISCV_STACK) -nostdlib -mcmodel=medany -I$(RTDIR) \
+ -I$(SARISDIR)/stencils -I$(PRINTFDIR) -ffreestanding -fno-builtin \
+ -ffunction-sections
+
+RISCV_CFLAGS ?= $(RISCV_FLAGS)
+# Loop unrolling optimization
+RISCV_CFLAGS += -mllvm --allow-unroll-and-jam
+RISCV_CFLAGS += -mllvm --unroll-allow-partial
+RISCV_CFLAGS += -mllvm --unroll-runtime
+# Tree height reduction options
+RISCV_CFLAGS += -mllvm --enable-fp-thr
+RISCV_CFLAGS += -mllvm --thr-max-depth=5
+RISCV_CFLAGS += -mllvm --thr-se-leaves
+RISCV_CFLAGS += -mllvm --thr-fuse-bias
+RISCV_CFLAGS += -mllvm --thr-se-factor=2
+RISCV_CFLAGS += -mllvm --thr-re-factor=1
+# Machine scheduler and PostRA options
+RISCV_CFLAGS += -mllvm --post-RA-scheduler
+RISCV_CFLAGS += -mllvm --enable-misched
+RISCV_CFLAGS += -mllvm --enable-post-misched
+RISCV_CFLAGS += -mllvm --misched-postra
+
+RISCV_CCFLAGS ?= $(RISCV_CFLAGS) -std=gnu11
+RISCV_CXXFLAGS ?= $(RISCV_CFLAGS) -std=gnu++14
+RISCV_LDFLAGS ?= -fuse-ld=$(LLVM_BINROOT)/ld.lld -flto -static -lm $(RISCV_FLAGS) \
+ -Wl,--fatal-warnings -Wl,-z,stack-size=$(RISCV_STACK)
+RISCV_DMPFLAGS ?= --mcpu=snitch
+
+############################
+# SARIS Program Build Flow #
+############################
+
+.SECONDEXPANSION:
+.DELETE_ON_ERROR:
+
+# Extracting word nr. $(1) from $(2)-separated list $(3)
+pw = $(word $(1), $(subst $(2), ,$(3)))
+
+$(GENDIR) $(BINDIR) $(DUMPDIR):
+ mkdir -p $@
+
+$(BINDIR)/crt0.o: $(SARISDIR)/runtime/crt0.S | $(BINDIR)
+ $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@
+
+$(BINDIR)/istc.%.c.o: $(GENDIR)/$$(call pw,1,.,$$*).cpp | $(BINDIR)
+ $(RISCV_CXX) $(RISCV_CXXFLAGS) -c $< -o $@
+
+.PRECIOUS: $(BINDIR)/%.elf
+$(BINDIR)/istc.%.elf: $(BINDIR)/istc.%.c.o $(BINDIR)/crt0.o $(RTDIR)/link.ld | $(BINDIR)
+ $(RISCV_CC) $(RISCV_LDFLAGS) -o $@ $< $(BINDIR)/crt0.o -T$(RTDIR)/link.ld
+ $(RISCV_STRIP) $@ -g -S -d --strip-debug -R .comment -R .riscv.attributes
+
+.PRECIOUS: $(DUMPDIR)/%.dump
+$(DUMPDIR)/%.dump: $(BINDIR)/%.elf | $(DUMPDIR)
+ @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .text -d $< >$@
+ @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .misc -s $< | tail -n +3 >>$@
+ @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .tcdm -s $< | tail -n +3 >>$@
+ @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .tcdmc -s $< | tail -n +3 >>$@
+
+# Phony for program and dump build
+prog.%: $(BINDIR)/%.elf $(DUMPDIR)/%.dump
+ @echo -e '\x1b[44;33;1mBUILT: $*\x1b[0m'
+
+clean:
+ rm -rf $(BINDIR) $(DUMPDIR) $(GENDIR)
+
+############################
+# SARIS Program Generation #
+############################
+
+.PRECIOUS: $(GENDIR)/%.cpp
+$(GENDIR)/%.cpp: $(UTILDIR)/evalgen.py $(SARISDIR)/eval.json $(UTILDIR)/eval.cpp.tpl | $(GENDIR)
+ $(PYTHON3) $^ $* > $@
+
+EVAL_NAMES ?= $(shell jq -r 'keys | join(" ")' $(SARISDIR)/eval.json)
+ISTC_PROGS += $(patsubst %,istc.%,$(EVAL_NAMES))
+
+# Default: compile all SARIS programs in eval.json
+all: $(addprefix prog.,$(ISTC_PROGS))
diff --git a/sw/saris/README.md b/sw/saris/README.md
new file mode 100644
index 0000000000..464090415c
--- /dev/null
+++ b/sw/saris/README.md
@@ -0,0 +1 @@
+# TODO
diff --git a/sw/saris/eval.json b/sw/saris/eval.json
new file mode 100644
index 0000000000..f1b102588b
--- /dev/null
+++ b/sw/saris/eval.json
@@ -0,0 +1,396 @@
+{
+
+ "pb_jacobi_2d_ml_par": {
+ "radius": 1,
+ "grids": {
+ "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "kernels": [
+ [1, "istcp_pb_jacobi_2d(core_id, &Ap2ml, &Bp2ml)"],
+ [1, "istcp_pb_jacobi_2d(core_id, &Ap2ml, &Bp2ml)"]
+ ],
+ "touch": ["Ap2ml", "Bp2ml"],
+ "dma": ["Cp2ml", "Dp2xl"]
+ },
+
+ "pb_jacobi_2d_ml_issr": {
+ "radius": 1,
+ "grids": {
+ "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "kernels": [
+ [1, "istci_pb_jacobi_2d(core_id, &Ai2ml, &Bi2ml)"],
+ [1, "istci_pb_jacobi_2d(core_id, &Ai2ml, &Bi2ml)"]
+ ],
+ "touch": ["Ai2ml", "Bi2ml"],
+ "dma": ["Ci2ml", "Di2xl"]
+ },
+
+
+
+ "an5d_j2d5pt_ml_par": {
+ "radius": 1,
+ "grids": {
+ "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]},
+ "kernels": [
+ [1, "istcp_an5d_j2d5pt(core_id, &Ap22ml[0])"],
+ [1, "istcp_an5d_j2d5pt(core_id, &Ap22ml[0])"]
+ ],
+ "touch": ["Ap2ml", "Bp2ml"],
+ "dma": ["Cp2ml", "Dp2xl"]
+ },
+
+ "an5d_j2d5pt_ml_issr": {
+ "radius": 1,
+ "grids": {
+ "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]},
+ "kernels": [
+ [1, "istci_an5d_j2d5pt(core_id, &Ai22ml[0])"],
+ [1, "istci_an5d_j2d5pt(core_id, &Ai22ml[0])"]
+ ],
+ "touch": ["Ai2ml", "Bi2ml"],
+ "dma": ["Ci2ml", "Di2xl"]
+ },
+
+
+
+ "an5d_j2d9pt_ml_par": {
+ "radius": 2,
+ "grids": {
+ "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]},
+ "kernels": [
+ [1, "istcp_an5d_j2d9pt(core_id, &Ap22ml[0])"],
+ [1, "istcp_an5d_j2d9pt(core_id, &Ap22ml[0])"]
+ ],
+ "touch": ["Ap2ml", "Bp2ml"],
+ "dma": ["Cp2ml", "Dp2xl"]
+ },
+
+ "an5d_j2d9pt_ml_issr": {
+ "radius": 2,
+ "grids": {
+ "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]},
+ "kernels": [
+ [1, "istci_an5d_j2d9pt(core_id, &Ai22ml[0])"],
+ [1, "istci_an5d_j2d9pt(core_id, &Ai22ml[0])"]
+
+ ],
+ "touch": ["Ai2ml", "Bi2ml"],
+ "dma": ["Ci2ml", "Di2xl"]
+ },
+
+
+
+ "an5d_j2d9pt_gol_ml_par": {
+ "radius": 1,
+ "grids": {
+ "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]},
+ "kernels": [
+ [1, "istcp_an5d_j2d9pt_gol(core_id, &Ap22ml[0])"],
+ [1, "istcp_an5d_j2d9pt_gol(core_id, &Ap22ml[0])"]
+
+ ],
+ "touch": ["Ap2ml", "Bp2ml"],
+ "dma": ["Cp2ml", "Dp2xl"]
+ },
+
+ "an5d_j2d9pt_gol_ml_issr": {
+ "radius": 1,
+ "grids": {
+ "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]},
+ "kernels": [
+ [1, "istci_an5d_j2d9pt_gol(core_id, &Ai22ml[0])"],
+ [1, "istci_an5d_j2d9pt_gol(core_id, &Ai22ml[0])"]
+
+ ],
+ "touch": ["Ai2ml", "Bi2ml"],
+ "dma": ["Ci2ml", "Di2xl"]
+ },
+
+
+
+ "an5d_j3d27pt_ml_par": {
+ "radius": 1,
+ "grids": {
+ "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]}
+ },
+ "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]},
+ "kernels": [
+ [1, "istcp_an5d_j3d27pt(core_id, &Ap32ml[0])"],
+ [1, "istcp_an5d_j3d27pt(core_id, &Ap32ml[0])"]
+ ],
+ "touch": ["Ap3ml", "Bp3ml"],
+ "dma": ["Cp3ml", "Dp3xl"]
+ },
+
+ "an5d_j3d27pt_ml_issr": {
+ "radius": 1,
+ "grids": {
+ "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]}
+ },
+ "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]},
+ "kernels": [
+ [1, "istci_an5d_j3d27pt(core_id, &Ai32ml[0])"],
+ [1, "istci_an5d_j3d27pt(core_id, &Ai32ml[0])"]
+
+ ],
+ "touch": ["Ai3ml", "Bi3ml"],
+ "dma": ["Ci3ml", "Di3xl"]
+ },
+
+
+
+ "an5d_star2d3r_ml_par": {
+ "radius": 3,
+ "grids": {
+ "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]},
+ "params": {"r": 3},
+ "kernels": [
+ [1, "istcp_an5d_star2dXr(core_id, &Ap22ml[0])"],
+ [1, "istcp_an5d_star2dXr(core_id, &Ap22ml[0])"]
+ ],
+ "touch": ["Ap2ml", "Bp2ml"],
+ "dma": ["Cp2ml", "Dp2xl"]
+ },
+
+ "an5d_star2d3r_ml_issr": {
+ "radius": 3,
+ "grids": {
+ "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]},
+ "params": {"r": 3},
+ "kernels": [
+ [1, "istci_an5d_star2dXr(core_id, &Ai22ml[0])"],
+ [1, "istci_an5d_star2dXr(core_id, &Ai22ml[0])"]
+
+ ],
+ "touch": ["Ai2ml", "Bi2ml"],
+ "dma": ["Ci2ml", "Di2xl"]
+ },
+
+
+
+ "an5d_box2d1r_ml_par": {
+ "radius": 1,
+ "grids": {
+ "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]},
+ "params": {"r": 1},
+ "kernels": [
+ [1, "istcp_an5d_box2dXr(core_id, &Ap22ml[0])"],
+ [1, "istcp_an5d_box2dXr(core_id, &Ap22ml[0])"]
+
+ ],
+ "touch": ["Ap2ml", "Bp2ml"],
+ "dma": ["Cp2ml", "Dp2xl"]
+ },
+
+ "an5d_box2d1r_ml_issr": {
+ "radius": 1,
+ "grids": {
+ "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"},
+ "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]}
+ },
+ "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]},
+ "params": {"r": 1},
+ "kernels": [
+ [1, "istci_an5d_box2dXr(core_id, &Ai22ml[0])"],
+ [1, "istci_an5d_box2dXr(core_id, &Ai22ml[0])"]
+ ],
+ "touch": ["Ai2ml", "Bi2ml"],
+ "dma": ["Ci2ml", "Di2xl"]
+ },
+
+
+
+ "an5d_star3d2r_ml_par": {
+ "radius": 2,
+ "grids": {
+ "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]}
+ },
+ "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]},
+ "params": {"r": 2},
+ "kernels": [
+ [1, "istcp_an5d_star3dXr(core_id, &Ap32ml[0])"],
+ [1, "istcp_an5d_star3dXr(core_id, &Ap32ml[0])"]
+ ],
+ "touch": ["Ap3ml", "Bp3ml"],
+ "dma": ["Cp3ml", "Dp3xl"]
+ },
+
+ "an5d_star3d2r_ml_issr": {
+ "radius": 2,
+ "grids": {
+ "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]}
+ },
+ "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]},
+ "params": {"r": 2},
+ "kernels": [
+ [1, "istci_an5d_star3dXr(core_id, &Ai32ml[0])"],
+ [1, "istci_an5d_star3dXr(core_id, &Ai32ml[0])"]
+
+ ],
+ "touch": ["Ai3ml", "Bi3ml"],
+ "dma": ["Ci3ml", "Di3xl"]
+ },
+
+
+
+ "an5d_box3d1r_ml_par": {
+ "radius": 1,
+ "grids": {
+ "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]}
+ },
+ "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]},
+ "params": {"r": 1},
+ "kernels": [
+ [1, "istcp_an5d_box3dXr(core_id, &Ap32ml[0])"],
+ [1, "istcp_an5d_box3dXr(core_id, &Ap32ml[0])"]
+ ],
+ "touch": ["Ap3ml", "Bp3ml"],
+ "dma": ["Cp3ml", "Dp3xl"]
+ },
+
+ "an5d_box3d1r_ml_issr": {
+ "radius": 1,
+ "grids": {
+ "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]}
+ },
+ "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]},
+ "params": {"r": 1},
+ "kernels": [
+ [1, "istci_an5d_box3dXr(core_id, &Ai32ml[0])"],
+ [1, "istci_an5d_box3dXr(core_id, &Ai32ml[0])"]
+ ],
+ "touch": ["Ai3ml", "Bi3ml"],
+ "dma": ["Ci3ml", "Di3xl"]
+ },
+
+
+
+ "minimod_acoustic_iso_cd_ml_par": {
+ "radius": 4,
+ "grids": {
+ "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Dp3ml": {"seed": 1339, "dims": [8, 8, 8], "attrs": "TCDMDECL"},
+ "F3ml": {"seed": 1338, "dims": [8, 8, 8], "attrs": "TCDMDECL"},
+ "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "G3ml": {"seed": 1340, "dims": [8, 8, 8], "attrs": "TCDMDECL"},
+ "Hp3xl": {"seed": 1341, "dims": ["3xl", "3xl", "3xl"]},
+ "I3xl": {"seed": 1342, "dims": [16, 16, 16]},
+ "Ep3ml": {"seed": 1343, "dims": [8, 8, 8], "attrs": "TCDMDECL"},
+ "Jp3xl": {"seed": 1344, "dims": [16, 16, 16]}
+ },
+ "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]},
+ "kernels": [
+ [1, "istcp_minimod_acoustic_iso_cd(core_id, &Ap32ml[0], &F3ml)"],
+ [1, "istcp_minimod_acoustic_iso_cd(core_id, &Ap32ml[0], &F3ml)"]
+ ],
+ "touch": ["Ap3ml", "Bp3ml", "F3ml", "Dp3ml"],
+ "dma": [
+ ["Cp3ml", "Hp3xl", "out"],
+ ["G3ml", "I3xl", "in", 0],
+ ["Cp3ml", "Hp3xl", "in"],
+ ["Ep3ml", "Jp3xl", "in"]
+ ]
+ },
+
+ "minimod_acoustic_iso_cd_ml_issr": {
+ "radius": 4,
+ "grids": {
+ "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "Di3ml": {"seed": 1339, "dims": [8, 8, 8], "attrs": "TCDMDECL"},
+ "F3ml": {"seed": 1338, "dims": [8, 8, 8], "attrs": "TCDMDECL"},
+ "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"},
+ "G3ml": {"seed": 1340, "dims": [8, 8, 8], "attrs": "TCDMDECL"},
+ "Hp3xl": {"seed": 1341, "dims": ["3xl", "3xl", "3xl"]},
+ "I3xl": {"seed": 1342, "dims": [16, 16, 16]},
+ "Ep3ml": {"seed": 1343, "dims": [8, 8, 8], "attrs": "TCDMDECL"},
+ "Jp3xl": {"seed": 1344, "dims": [16, 16, 16]}
+ },
+ "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]},
+ "kernels": [
+ [1, "istci_minimod_acoustic_iso_cd(core_id, &Ai32ml[0], &F3ml)"],
+ [1, "istci_minimod_acoustic_iso_cd(core_id, &Ai32ml[0], &F3ml)"]
+ ],
+ "touch": ["Ai3ml", "Bi3ml", "F3ml", "Di3ml"],
+ "dma": [
+ ["Cp3ml", "Hp3xl", "out"],
+ ["G3ml", "I3xl", "in", 0],
+ ["Cp3ml", "Hp3xl", "in"],
+ ["Ep3ml", "Jp3xl", "in"]
+ ]
+ }
+
+}
diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S
new file mode 100644
index 0000000000..79efb0cbbe
--- /dev/null
+++ b/sw/saris/runtime/crt0.S
@@ -0,0 +1,159 @@
+# HTIF sections
+.pushsection .htif,"aw",@progbits;
+.align 6; .global tohost; tohost: .dword 0;
+.align 6; .global fromhost; fromhost: .dword 0;
+
+.globl _start
+.section .text._start
+_start:
+ # Set global pointer
+ .option push
+ .option norelax
+ la gp, __global_pointer
+ .option pop
+
+ # Prepare main arguments for single cluster
+ csrr a0, mhartid
+ la a1, __const_num_cores
+ la a2, __const_tcdm_start
+ la a3, __const_tcdm_end
+
+ # Set stack pointer; 1KiB per core
+ # Offset by 8B to prevent bank collisions
+ slli t0, a0, 10
+ addi sp, a3, -8
+ sub sp, sp, t0
+ slli t0, a0, 3
+ sub sp, sp, t0
+
+ # check if the core has the F-extension
+ csrr t0, misa
+ andi t0, t0, (1 << 5)
+ beqz t0, _clr_ireg
+
+_skip_dmcc_work:
+ # Skip the coming two steps unless we are the DMA core
+ # NOTE: this assumes the DMA core being the last in the cluster
+ addi t0, a1, -1
+ bne a0, t0, _dmcc_work_sync
+
+_preload_tcdm:
+ # Preload thread-local storage (TCDM) using DMA
+ la t0, __const_tcdm_losta
+ la t1, __const_tcdm_loend
+ sub t3, t1, t0
+ # Branch off if no tcdm data
+ beqz t3, _preload_tcdmc
+ # Launch copy to base of TCDM
+ dmsrc t0, zero
+ dmdst a2, zero
+ dmcpyi zero, t3, 0
+ # Await DMA
+ 1:
+ dmstati t0, 2
+ bnez t0, 1b
+
+_preload_tcdmc:
+ # Preload thread-local storage (TCDM) using DMA
+ la t0, __const_tcdmc_losta
+ la t1, __const_tcdmc_loend
+ sub t3, t1, t0
+ # Get tcdmc base, branch off if no tcdmc data
+ la t2, __const_tcdm_startc
+ beqz t3, _dmcc_work_sync
+ # Launch copy to past end of TCDM
+ dmsrc t0, zero
+ dmdst t2, zero
+ dmcpyi zero, t3, 0
+ # Await DMA
+ 1:
+ dmstati t0, 2
+ bnez t0, 1b
+
+_dmcc_work_sync:
+ # Synchronize cores so data is ready
+ csrr x0, 0x7C2
+
+ # Reset float regs if present
+_clr_freg:
+ fcvt.d.w f0, x0
+ fmv.d f1, f0
+ fmv.d f2, f0
+ fmv.d f3, f0
+ fmv.d f4, f0
+ fmv.d f5, f0
+ fmv.d f6, f0
+ fmv.d f7, f0
+ fmv.d f8, f0
+ fmv.d f9, f0
+ fmv.d f10, f0
+ fmv.d f11, f0
+ fmv.d f12, f0
+ fmv.d f13, f0
+ fmv.d f14, f0
+ fmv.d f15, f0
+ fmv.d f16, f0
+ fmv.d f17, f0
+ fmv.d f18, f0
+ fmv.d f19, f0
+ fmv.d f20, f0
+ fmv.d f10, f0 # FIXME(review): duplicate — f10 already cleared above
+ fmv.d f21, f0
+ fmv.d f22, f0
+ fmv.d f23, f0
+ fmv.d f24, f0
+ fmv.d f25, f0
+ fmv.d f26, f0
+ fmv.d f27, f0
+ fmv.d f28, f0
+ fmv.d f29, f0
+ fmv.d f30, f0
+ fmv.d f31, f0
+
+ # Reset remaining int regs
+_clr_ireg:
+ li tp, 0
+ li t0, 0
+ li t1, 0
+ li t2, 0
+ li t3, 0
+ li t4, 0
+ li t5, 0
+ li t6, 0
+ li a6, 0
+ li a7, 0
+ li s0, 0
+ li s1, 0
+ li s2, 0
+ li s3, 0
+ li s4, 0
+ li s5, 0
+ li s6, 0
+ li s7, 0
+ li s8, 0
+ li s9, 0
+ li s10, 0
+ li s11, 0
+
+ # Call main
+ call smain
+
+_eoc:
+ # Synchronize cores
+ csrr x0, 0x7C2
+ # Only core 0 (of all cores) returns
+ csrr t0, mhartid
+ bnez t0, _done
+ # Write termination bit and return code (a0) to tohost
+ slli a0, a0, 1
+ ori a0, a0, 1
+ la t0, tohost
+ sw a0, 0(t0)
+ # Go to sleep
+_done:
+ wfi
+
+
+.globl _putcb
+.section .data._putcb
+_putcb:
diff --git a/sw/saris/runtime/dma.h b/sw/saris/runtime/dma.h
new file mode 100644
index 0000000000..80956b0f73
--- /dev/null
+++ b/sw/saris/runtime/dma.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <stdint.h>
+#include <stddef.h>
+
+// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers.
+static inline uint32_t __rt_dma_start_1d_wideptr(uint64_t dst, uint64_t src,
+ size_t size) {
+ register uint32_t reg_txid; // 10
+ asm volatile("dmsrc %[sl], %[sh]" :: [sh]"r"(src >> 32), [sl]"r"(src));
+ asm volatile("dmdst %[dl], %[dh]" :: [dh]"r"(dst >> 32), [dl]"r"(dst));
+ asm volatile("dmcpyi %[id], %[sz], 0" : [id]"=r"(reg_txid) : [sz]"r"(size));
+ return reg_txid;
+}
+
+// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers.
+static inline uint32_t __rt_dma_start_2d_wideptr(uint64_t dst, uint64_t src,
+ size_t size, size_t dst_stride,
+ size_t src_stride, size_t repeat) {
+ register uint32_t reg_txid; // 10
+ asm volatile("dmsrc %[sl], %[sh]" :: [sh]"r"(src >> 32), [sl]"r"(src));
+ asm volatile("dmdst %[dl], %[dh]" :: [dh]"r"(dst >> 32), [dl]"r"(dst));
+ asm volatile("dmstr %[rd], %[rs]" :: [rd]"r"(dst_stride), [rs]"r"(src_stride));
+ asm volatile("dmrep %[rp]" :: [rp]"r"(repeat));
+ asm volatile("dmcpyi %[id], %[sz], 2" : [id]"=r"(reg_txid) : [sz]"r"(size));
+ return reg_txid;
+}
+
+// Initiate an asynchronous 1D DMA transfer.
+static inline uint32_t __rt_dma_start_1d(void *dst, const void *src, size_t size) {
+ return __rt_dma_start_1d_wideptr((size_t)dst, (size_t)src, size);
+}
+
+// Initiate an asynchronous 2D DMA transfer.
+static inline uint32_t __rt_dma_start_2d(void *dst, const void *src, size_t size,
+ size_t src_stride, size_t dst_stride,
+ size_t repeat) {
+ return __rt_dma_start_2d_wideptr((size_t)dst, (size_t)src, size, dst_stride,
+ src_stride, repeat);
+}
+
+// Last completed ID
+static inline uint32_t __rt_dma_completed_id() {
+ register uint32_t cid;
+ asm volatile(
+ "dmstati %[cid], 0 \n " // 0=status.completed_id
+ : [cid]"=&r"(cid) :: "memory"
+ );
+ // TODO: Fix off-by-one bug in DMA hardware!
+ return cid+1;
+}
+
+// Block until a transfer finishes.
+static inline void __rt_dma_wait(uint32_t tid) {
+ register uint32_t tmp;
+ // TODO: Fix off-by-one bug in DMA hardware!
+ tid++;
+ asm volatile(
+ "1: \n"
+ "dmstati %[tmp], 0 \n " // 0=status.completed_id
+ "bgt %[tid], %[tmp], 1b \n" // branch back if ID to wait for > last completed ID
+ : [tmp]"=&r"(tmp) : [tid]"r"(tid)
+ );
+}
+
+// Block until all operation on the DMA ceases.
+static inline void __rt_dma_wait_all() {
+ register uint32_t tmp;
+ asm volatile(
+ "1: \n"
+ "dmstati %[tmp], 2 \n " // 2=status.busy
+ "bne %[tmp], zero, 1b \n"
+ : [tmp]"=&r"(tmp) :
+ );
+}
diff --git a/sw/saris/runtime/link.ld b/sw/saris/runtime/link.ld
new file mode 100644
index 0000000000..5788547bdd
--- /dev/null
+++ b/sw/saris/runtime/link.ld
@@ -0,0 +1,42 @@
+OUTPUT_ARCH( "riscv" )
+ENTRY(_start)
+
+MEMORY
+{
+ /* Reserve upper 9*1Ki = 9Ki of TCDM for stack, plus some padding.
+ This can be expanded to allocate the full CC TCDM as needed.
+ A 2 KiB RO is provided in the TCDM for small data + consts. */
+ tcdm (rw) : ORIGIN = 0x10000000, LENGTH = 0x1CC00
+ tcdmc (r) : ORIGIN = 0x1001CC00, LENGTH = 2K
+ dram (rwxa) : ORIGIN = 0x90000000, LENGTH = 1024M
+ dtxt (rwxa) : ORIGIN = 0x80000000, LENGTH = 1024M
+}
+
+SECTIONS
+{
+ /DISCARD/ : { *(.riscv.attributes) *(.comment) *(.rela.*) *(.sym.*) }
+
+ .text : { *(.text._start) *(.text) *(.text.*); . = ALIGN(16); } >dtxt
+ .misc : { *(.data) *(.data.*) *(.putcb) } >dram
+ .tcdm : { *(.tcdm) *(.l1) } >tcdm AT>dram
+ .tcdmc : { *(.sdata) *(.sdata.*) *(.rodata) *(.rodata.*) } >tcdmc AT>dram
+
+ /* Global and stack pointer: in TCDM */
+ __global_pointer = ADDR(.tcdmc) + SIZEOF(.tcdmc) / 2;
+
+ /* Memory Layout Constants */
+ __const_num_cores = 9;
+ __const_tcdm_start = ORIGIN(tcdm);
+ __const_tcdm_startc = ORIGIN(tcdmc);
+ __const_tcdm_end = ORIGIN(tcdm) + 128K;
+ __const_dram_start = ORIGIN(dram);
+
+ /* TCDM Loading */
+ __const_tcdm_losta = LOADADDR(.tcdm);
+ __const_tcdm_loend = LOADADDR(.tcdm) + SIZEOF(.tcdm);
+ __const_tcdmc_losta = LOADADDR(.tcdmc);
+ __const_tcdmc_loend = LOADADDR(.tcdmc) + SIZEOF(.tcdmc);
+
+ /* HTIF section for FESVR */
+ .htif : { } >dram
+}
diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h
new file mode 100644
index 0000000000..883bacb2ae
--- /dev/null
+++ b/sw/saris/runtime/runtime.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include <stdint.h>
+#include <stddef.h>
+#include "dma.h"
+#include "sssr.h"
+
+#define PRINTF_NTOA_BUFFER_SIZE 12
+#define PRINTF_DISABLE_SUPPORT_LONG_LONG 1
+
+#include "printf.h"
+
+extern uintptr_t volatile tohost, fromhost;
+
+extern void *__const_tcdm_start;
+extern void *__const_dram_start;
+
+// Use this to identify and differentiate TCDM data and pointers
+#define TCDMSPC __attribute__((address_space(1)))
+#define TCDMSEC __attribute__((section(".l1")))
+#define TCDM TCDMSPC
+#define TCDMDECL TCDMSPC TCDMSEC
+
+static inline volatile uint32_t __rt_get_hartid() {
+ uint32_t register r;
+ asm volatile ("csrr %0, mhartid" : "=r"(r));
+ return r;
+}
+// Rudimentary string buffer for putchar calls.
+extern uint32_t _putcb;
+#define PUTC_BUFFER_LEN (1024 - sizeof(size_t))
+
+typedef struct {
+ size_t size;
+ uint64_t syscall_mem[8];
+} putc_buffer_header_t;
+
+typedef struct {
+ putc_buffer_header_t hdr;
+ char data[PUTC_BUFFER_LEN];
+} putc_buffer_t;
+
+static volatile putc_buffer_t *const putc_buffer = (putc_buffer_t *const)(void *)&_putcb;
+
+// Provide an implementation for putchar.
+void _putchar(char character) {
+ volatile putc_buffer_t *buf = &putc_buffer[__rt_get_hartid()];
+ buf->data[buf->hdr.size++] = character;
+ if (buf->hdr.size == PUTC_BUFFER_LEN || character == '\n') {
+ buf->hdr.syscall_mem[0] = 64; // sys_write
+ buf->hdr.syscall_mem[1] = 1; // file descriptor (1 = stdout)
+ buf->hdr.syscall_mem[2] = (uintptr_t)&buf->data; // buffer
+ buf->hdr.syscall_mem[3] = buf->hdr.size; // length
+
+ tohost = (uintptr_t)buf->hdr.syscall_mem;
+ while (fromhost == 0)
+ ;
+ fromhost = 0;
+
+ buf->hdr.size = 0;
+ }
+}
+
+// Print a (null-terminated) string
+static inline void __rt_print(const char* buf) {
+ for (; *buf; ++buf) _putchar(*buf);
+}
+
+// Print a decimal number
+static inline void __rt_print_dec_uint(uint32_t val) {
+ const int DEC_BUF_LEN = 10;
+ char out [DEC_BUF_LEN];
+ int out_msd;
+ int i;
+ // Capture digits
+ for (i=DEC_BUF_LEN-2; i >= 0; --i) {
+ char digit = (val % 10);
+ out[i] = digit + '0';
+ val /= 10;
+ out_msd = i;
+ if (val == 0) break;
+ }
+ out[DEC_BUF_LEN-1] = '\0';
+ // Print digits
+ __rt_print(out + out_msd);
+}
+
+// Cluster-local barrier
+static inline void __rt_barrier() {
+ asm volatile("csrr x0, 0x7C2" ::: "memory");
+}
+
+// Full memory fence
+static inline void __rt_fence() {
+ asm volatile("fence" ::: "memory");
+}
+
+#define __RT_FPU_FENCE "fmv.x.w zero, fa0\n"
+
+// Fence waiting for FPU to catch up to core
+static inline void __rt_fpu_fence() {
+ asm volatile(__RT_FPU_FENCE ::: "memory");
+}
+
+// Cluster-local barrier
+static inline void __rt_fpu_fence_full() {
+ uint32_t register tmp;
+ asm volatile (
+ "fmv.x.w %[tmp], fa0 \n"
+ "mv zero, %[tmp] \n"
+ : [tmp]"=r"(tmp) :: "memory"
+ );
+}
+
+// Memcopy using FPU
+static inline void __rt_memcpy_fpu(double* dst, double* src, size_t lend) {
+ #pragma clang loop unroll_count(8)
+ for (int i = 0; i < lend; i++)
+ *(volatile double*)(dst + i) = *(volatile double*)(src + i);
+}
+
+// Monotonically increasing cycle count
+static inline volatile uint32_t __rt_get_timer() {
+ uint32_t register r;
+ asm volatile ("csrr %0, mcycle" : "=r"(r));
+ return r;
+}
+
+// Sleep for multiples of 10 (Deca) cycles
+static inline void __rt_shortsleep(uint32_t Dcycles) {
+ for (int i = 0; i < Dcycles; ++i) {
+ asm volatile ("nop; nop; nop; nop; nop; nop; nop; nop; nop; nop" ::: "memory");
+ }
+}
+
+// Include putchar code directly (header-only implementation)
+#include "printf.c"
diff --git a/sw/saris/runtime/runtime.hpp b/sw/saris/runtime/runtime.hpp
new file mode 100644
index 0000000000..df501ff20e
--- /dev/null
+++ b/sw/saris/runtime/runtime.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+// C linkage macros
+#ifdef __cplusplus
+#define EXTERN_C extern "C"
+#define EXTERN_C_BEGIN extern "C" {
+#define EXTERN_C_END }
+#else
+#define EXTERN_C
+#define EXTERN_C_BEGIN
+#define EXTERN_C_END
+#endif
+
+// Include C runtime, ignoring benign CXX-only warnings
+EXTERN_C_BEGIN
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-register"
+#include "runtime.h"
+#pragma GCC diagnostic pop
+EXTERN_C_END
diff --git a/sw/saris/runtime/sssr.h b/sw/saris/runtime/sssr.h
new file mode 100644
index 0000000000..171ccb454f
--- /dev/null
+++ b/sw/saris/runtime/sssr.h
@@ -0,0 +1,189 @@
+#pragma once
+
+// Registers
+#define __RT_SSSR_REG_STATUS 0
+#define __RT_SSSR_REG_REPEAT 1
+
+#define __RT_SSSR_REG_BOUND_0 2
+#define __RT_SSSR_REG_BOUND_1 3
+#define __RT_SSSR_REG_BOUND_2 4
+#define __RT_SSSR_REG_BOUND_3 5
+
+#define __RT_SSSR_REG_STRIDE_0 6
+#define __RT_SSSR_REG_STRIDE_1 7
+#define __RT_SSSR_REG_STRIDE_2 8
+#define __RT_SSSR_REG_STRIDE_3 9
+
+#define __RT_SSSR_REG_IDX_CFG 10
+#define __RT_SSSR_REG_IDX_BASE 11
+#define __RT_SSSR_REG_IDX_ISECT 12
+
+#define __RT_SSSR_REG_RPTR_INDIR 16
+#define __RT_SSSR_REG_RPTR_SLV 17
+#define __RT_SSSR_REG_RPTR_MST_NOSLV 18
+#define __RT_SSSR_REG_RPTR_MST_TOSLV 19
+
+#define __RT_SSSR_REG_WPTR_INDIR 20
+#define __RT_SSSR_REG_WPTR_SLV 21
+#define __RT_SSSR_REG_WPTR_MST_NOSLV 22
+#define __RT_SSSR_REG_WPTR_MST_TOSLV 23
+
+#define __RT_SSSR_REG_RPTR_0 24
+#define __RT_SSSR_REG_RPTR_1 25
+#define __RT_SSSR_REG_RPTR_2 26
+#define __RT_SSSR_REG_RPTR_3 27
+
+#define __RT_SSSR_REG_WPTR_0 28
+#define __RT_SSSR_REG_WPTR_1 29
+#define __RT_SSSR_REG_WPTR_2 30
+#define __RT_SSSR_REG_WPTR_3 31
+
+// Enable and disable
+#define __RT_SSSR_ENABLE "csrsi 0x7C0, 1\n"
+#define __RT_SSSR_DISABLE "csrci 0x7C0, 1\n"
+
+// Write configuration registers
+// To write to all SSRs, use ssridx=31
+#define __RT_SSSR_IDXALL 31
+#define __RT_SSSR_SCFGWI_INT(valreg,ssridx,regidx) "scfgwi "#valreg", "#ssridx" | "#regidx"<<5\n"
+#define __RT_SSSR_SCFGWI(valreg,ssridx,regname) __RT_SSSR_SCFGWI_INT(valreg,ssridx,regname)
+
+// Read configuration registers
+#define __RT_SSSR_SCFGRI_INT(valreg,ssridx,regidx) "scfgri "#valreg", "#ssridx" | "#regidx"<<5\n"
+#define __RT_SSSR_SCFGRI(valreg,ssridx,regname) __RT_SSSR_SCFGRI_INT(valreg,ssridx,regname)
+
+// Assemble index configuration word
+#define __RT_SSSR_IDXSIZE_U8 0
+#define __RT_SSSR_IDXSIZE_U16 1
+#define __RT_SSSR_IDXSIZE_U32 2
+#define __RT_SSSR_IDXSIZE_U64 3
+#define __RT_SSSR_IDX_NOMERGE 0
+#define __RT_SSSR_IDX_MERGE 1
+#define __RT_SSSR_IDX_CFG(size,shift,flags) (((flags & 0xFFFF)<<16) | ((shift & 0xFF)<<8) | (size & 0xFF) )
+
+// Block until job is done
+// TODO: Replace with (shadowed) blocking read or write
+#define __RT_SSSR_WAIT_DONE(tempreg, ssridx) \
+ "1:" __RT_SSSR_SCFGRI(tempreg,ssridx,__RT_SSSR_REG_STATUS) \
+ "srli "#tempreg", "#tempreg", 31 \n" \
+ "beqz "#tempreg", 1b \n"
+
+// Allocates the specified registers and fakes them as
+// outputs of an SSSR enable, enforcing an order.
+#define __RT_SSSR_BLOCK_BEGIN \
+ { \
+ register double _rt_sssr_0 asm("ft0"); \
+ register double _rt_sssr_1 asm("ft1"); \
+ register double _rt_sssr_2 asm("ft2"); \
+ asm volatile(__RT_SSSR_ENABLE : "+f"(_rt_sssr_0), "+f"(_rt_sssr_1), "+f"(_rt_sssr_2) :: "memory");
+
+// Disables the SSSRs, taking as fake inputs the allocated
+// registers for the SSRs and thus allowing reallocation.
+#define __RT_SSSR_BLOCK_END \
+ asm volatile(__RT_SSSR_DISABLE : "+f"(_rt_sssr_0), "+f"(_rt_sssr_1), "+f"(_rt_sssr_2) :: "memory"); \
+ }
+
+static inline void __rt_sssr_cfg_write(uint32_t val, uint32_t ssridx, uint32_t regidx) {
+ asm volatile (
+ __RT_SSSR_SCFGWI_INT(%[valreg],%[ssridx],%[regidx])
+ :: [valreg]"r"(val), [ssridx]"i"(ssridx), [regidx]"i"(regidx) : "memory"
+ );
+}
+
+static inline void __rt_sssr_cfg_write_ptr(void* val, uint32_t ssridx, uint32_t regidx) {
+ __rt_sssr_cfg_write((uintptr_t)val, ssridx, regidx);
+}
+
+static inline uint32_t __rt_sssr_cfg_read(uint32_t ssridx, uint32_t regidx) {
+ uint32_t ret;
+ asm volatile (
+ __RT_SSSR_SCFGRI_INT(%[retreg],%[ssridx],%[regidx])
+ : [retreg]"=r"(ret) : [ssridx]"i"(ssridx), [regidx]"i"(regidx) : "memory"
+ );
+ return ret;
+}
+
+static inline void __rt_sssr_enable() {
+ asm volatile(__RT_SSSR_ENABLE ::: "memory");
+}
+
+static inline void __rt_sssr_disable() {
+ asm volatile(__RT_SSSR_DISABLE ::: "memory");
+}
+
+static inline uint16_t __rt_sssr_ptoi(void* ptr) {
+ // We assume TCDM alignment here; TCDM address offset is ignored
+ // as it will be masked in the SSR at at the latest
+ return (uint16_t)((uintptr_t)ptr >> 3);
+}
+
+static inline void __rt_sssr_bound_stride_1d(
+ uint32_t ssridx,
+ uint32_t b0, uint32_t s0
+) {
+ // argument bounds and strides are *non-inclusive* for convenience
+ __rt_sssr_cfg_write(--b0, ssridx, __RT_SSSR_REG_BOUND_0);
+ __rt_sssr_cfg_write(s0, ssridx, __RT_SSSR_REG_STRIDE_0);
+}
+
+static inline void __rt_sssr_bound_stride_2d(
+ uint32_t ssridx,
+ uint32_t b0, uint32_t s0,
+ uint32_t b1, uint32_t s1
+) {
+ // argument bounds and strides are *non-inclusive* for convenience
+ __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0);
+ __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1);
+ uint32_t a = 0;
+ __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0);
+ a += s0 * b0;
+ __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1);
+}
+
+static inline void __rt_sssr_bound_stride_3d(
+ uint32_t ssridx,
+ uint32_t b0, uint32_t s0,
+ uint32_t b1, uint32_t s1,
+ uint32_t b2, uint32_t s2
+) {
+ // argument bounds and strides are *non-inclusive* for convenience
+ __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0);
+ __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1);
+ __rt_sssr_cfg_write(--b2 , ssridx, __RT_SSSR_REG_BOUND_2);
+ uint32_t a = 0;
+ __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0);
+ a += s0 * b0;
+ __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1);
+ a += s1 * b1;
+ __rt_sssr_cfg_write(s2-a, ssridx, __RT_SSSR_REG_STRIDE_2);
+}
+
+static inline void __rt_sssr_bound_stride_4d(
+ uint32_t ssridx,
+ uint32_t b0, uint32_t s0,
+ uint32_t b1, uint32_t s1,
+ uint32_t b2, uint32_t s2,
+ uint32_t b3, uint32_t s3
+) {
+ // argument bounds and strides are *non-inclusive* for convenience
+ __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0);
+ __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1);
+ __rt_sssr_cfg_write(--b2 , ssridx, __RT_SSSR_REG_BOUND_2);
+ __rt_sssr_cfg_write(--b3 , ssridx, __RT_SSSR_REG_BOUND_3);
+ uint32_t a = 0;
+ __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0);
+ a += s0 * b0;
+ __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1);
+ a += s1 * b1;
+ __rt_sssr_cfg_write(s2-a, ssridx, __RT_SSSR_REG_STRIDE_2);
+ a += s2 * b2;
+ __rt_sssr_cfg_write(s3-a, ssridx, __RT_SSSR_REG_STRIDE_3);
+}
+
+static inline void __rt_sssr_wait_done(uint32_t ssridx) {
+ uint32_t tmp;
+ asm volatile (
+ __RT_SSSR_WAIT_DONE(%[tmpreg],%[ssridx])
+ : [tmpreg]"+&r"(tmp) : [ssridx]"i"(ssridx) : "memory"
+ );
+}
diff --git a/sw/saris/stencils/istc.common.hpp b/sw/saris/stencils/istc.common.hpp
new file mode 100644
index 0000000000..042005a741
--- /dev/null
+++ b/sw/saris/stencils/istc.common.hpp
@@ -0,0 +1,181 @@
+#include <cstdint>
+#include <cstddef>
+#include <cmath>
+
+#pragma once
+
+// ============
+// Macros
+// ============
+
+// ST and S contain temporal and spatial dimension constants, SP parallelization and unroll constants, C value constants of type `d_t`
+#define RCP *__restrict__ const
+#define PRM static constexpr int
+#define PRMD static constexpr double
+#define PRMX constexpr int
+#define PRMXD constexpr double
+struct __istc_dstr{PRM __dummy=0;};
+PRMX __istc_dstr::__dummy;
+#define KNL template <typename s, typename st, typename sp, typename c = __istc_dstr, typename ci = __istc_dstr, typename d_t = double, typename i_t = uint16_t> \
+    static __attribute__((noinline)) void
+#define IDXA volatile __attribute__ ((__aligned__(8))) i_t
+#define COFA volatile __attribute__ ((__aligned__(8))) d_t
+
+// Shorten indexing code a bit
+#define I(ptr) __rt_sssr_ptoi(ptr)
+// Further simplify RCP deref magic (selexp indexes into A)
+#define J(A, selexp) I(&(*A) selexp)
+
+// Shorten unroll for loops and canonical axis loops
+#define PRAGMA(X) _Pragma(#X)
+#define foru(unroll) \
+ PRAGMA(clang loop unroll_count(unroll)) \
+ for
+#define forp(unroll, i, init, pte, stride) for (int i = init; i < pte; i += stride)
+#define forpu(unroll, i, init, pte, stride) foru(unroll) (int i = init; i < pte; i += stride)
+// Axis assist macro: shortcut for most axes (requires KNL_IDS)
+#define forpx(axis, ii, init, pte) forp(sp::u##axis, ii, i##axis+init, pte, sp::p##axis)
+#define forpux(axis, ii, init, pte) forpu(sp::u##axis, ii, i##axis+init, pte, sp::p##axis)
+// Same as forpux, but explicitly control unroll (e.g. 1). Helps when kernels
+// get so large that register allocation suffocates and addresses stack-swap.
+#define forpex(unroll, axis, ii, init, pte) forpu(unroll, ii, i##axis+init, pte, sp::p##axis)
+// For manual unrolling: simply combines strides
+#define form(i, init, pte, stride) for (int i = init; i < pte; i += stride)
+
+// Macro to define core constants
+#define KNL_IDS(cid) \
+ const uint32_t ix = cid % sp::px; \
+ const uint32_t iy = (cid / sp::px) % sp::py; \
+ const uint32_t iz = cid / (sp::px * sp::py);
+
+#define sodt sizeof(d_t)
+
+// Macro for core constants with *local* unroll
+#define KNL_IDS_LOC(cid) \
+ KNL_IDS(cid) \
+ uint32_t lx = ix * sp::ux; \
+ uint32_t ly = iy * sp::uy; \
+ uint32_t lz = iz * sp::uz; \
+ constexpr uint32_t jmpz = sp::pz*sp::uz; \
+ constexpr uint32_t jmpy = sp::py*sp::uy; \
+ constexpr uint32_t jmpx = sp::px*sp::ux;
+
+// ========================
+// Dimension defaults
+// ========================
+
+#define SU(name, dim) \
+ struct name {PRM n=dim; PRM nx=dim; PRM ny=dim; PRM nz=dim;}; \
+ PRMX name::n, name::nx, name::ny, name::nz;
+
+// Keep these dimensions aligned with data generation
+SU(s1s, 1000)
+SU(s1sm, 1728)
+SU(s1m, 2744)
+SU(s1ml, 4096)
+SU(s1l, 5832)
+
+SU(s2s, 32)
+SU(s2sm, 42)
+SU(s2m, 52)
+SU(s2ml, 64)
+SU(s2l, 76)
+
+SU(s3s, 10)
+SU(s3sm, 12)
+SU(s3m, 14)
+SU(s3ml, 16)
+SU(s3l, 18)
+
+#define ST(name, steps) \
+ struct name {PRM t=steps;}; \
+ PRMX name::t;
+
+ST(st1, 1)
+ST(st4, 4)
+ST(st12, 12)
+
+#define SP(name, ncores, parz, pary, parx, unrz, unry, unrx, unru) \
+ struct name {PRM nc=ncores; PRM px=parx; PRM py=pary; PRM pz=parz; PRM ux=unrx; PRM uy=unry; PRM uz=unrz; PRM uu=unru;}; \
+ PRMX name::nc, name::px, name::py, name::pz, name::ux, name::uy, name::uz, name::uu;
+
+SP(sp1, 8, 1, 1, 8, 1, 1, 4, 8)
+SP(sp2, 8, 1, 2, 4, 1, 2, 2, 8)
+SP(sp3, 8, 2, 2, 2, 1, 2, 2, 8)
+
+// =============
+// Helpers
+// =============
+
+inline void __istc_barrier() {
+ __rt_barrier();
+}
+
+inline double __istc_sgnjx(double rs1, double rs2) {
+ double rd;
+ asm volatile("fsgnjx.d %[rd], %[rs1], %[rs2]" : [rd]"=f"(rd) : [rs1]"f"(rs1), [rs2]"f"(rs2));
+ return rd;
+}
+
+// Implements `sign(a) == sign(b) ? 0 : a` using only FP operations and no conditional logic
+inline double __istc_ternclip(double a, double b) {
+ // If `sign(a) == sign(b)`, then ainj is +|a|, otherwise |-a|
+ double ainj = __istc_sgnjx(a, b);
+ // This gives us +|a| if the condition holds, otherwise 0
+ double ainj_clip = fmax(ainj, 0.0);
+ // Inject original sign of a into the clipped result, yielding a or (+/-) 0
+ return copysign(ainj_clip, a);
+}
+
+// ==================
+// ISSR helpers
+// ==================
+
+inline void __istc_setup_issrs(uint32_t idxsize, uint32_t i0l, uint32_t i1l) {
+ __rt_sssr_cfg_write(__RT_SSSR_IDX_CFG(idxsize, 0, 0), __RT_SSSR_IDXALL, __RT_SSSR_REG_IDX_CFG);
+ __rt_sssr_cfg_write(i0l-1, 0, __RT_SSSR_REG_BOUND_0);
+ __rt_sssr_cfg_write(i1l-1, 1, __RT_SSSR_REG_BOUND_0);
+}
+
+
+inline void __istc_iter_issrs(void* base, void* i0, void* i1) {
+ __rt_sssr_cfg_write_ptr(base, __RT_SSSR_IDXALL, __RT_SSSR_REG_IDX_BASE);
+ __rt_sssr_cfg_write_ptr(i0, 0, __RT_SSSR_REG_RPTR_INDIR);
+ __rt_sssr_cfg_write_ptr(i1, 1, __RT_SSSR_REG_RPTR_INDIR);
+}
+
+// ==========================
+// Verification helpers
+// ==========================
+
+inline void __istc_cmp_grids(
+ uint32_t core_id, uint32_t core_num, uint32_t core_stride,
+ TCDM double* grid1, TCDM double* grid2, uint32_t len, double rel_eps,
+ TCDM volatile uint32_t* err_sema
+) {
+ __rt_barrier();
+ uint32_t errors = 0;
+ uint32_t stride = core_num * core_stride;
+ #pragma clang loop unroll_count(16)
+ for (int i = core_id; i < len; i += stride)
+ errors += (fabs(grid1[i] - grid2[i]) > fabs(rel_eps * grid1[i]));
+ __atomic_fetch_add(err_sema, errors, __ATOMIC_RELAXED);
+ __rt_barrier();
+}
+
+volatile void __attribute__((noinline)) __istc_touch_grid(
+ uint32_t core_id, uint32_t core_num, uint32_t core_stride,
+ TCDM double* grid, uint32_t len, TCDM volatile uint32_t* ret_sema
+) {
+ __rt_barrier();
+ uint32_t ret_loc;
+ double sum = 0.0;
+ uint32_t stride = core_num * core_stride;
+ #pragma clang loop unroll_count(16)
+ for (int i = core_id; i < len; i += stride)
+ sum += grid[i];
+ asm volatile("fcvt.w.d t1, %1; sub %0, t1, t1" : "=r"(ret_loc) : "f"(sum) : "memory", "t1");
+ __atomic_fetch_add(ret_sema, ret_loc, __ATOMIC_RELAXED);
+ __rt_barrier();
+}
diff --git a/sw/saris/stencils/istc.issr.hpp b/sw/saris/stencils/istc.issr.hpp
new file mode 100644
index 0000000000..c74d76b4dc
--- /dev/null
+++ b/sw/saris/stencils/istc.issr.hpp
@@ -0,0 +1,879 @@
+#include "istc.common.hpp"
+
+// ===============
+// Polybench
+// ===============
+
+KNL istci_pb_jacobi_2d(
+ const int cid,
+ TCDM d_t (RCP A)[s::n][s::n],
+ TCDM d_t (RCP B)[s::n][s::n]
+) {
+ // Assertions and IDs
+ static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+ static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+ KNL_IDS_LOC(cid)
+
+ // Define points of stencil and unroll copies
+ constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sx+sy;
+ constexpr uint32_t b = dx, l = dy, cc = dx+dy, r = cc+dx, tt = cc+dy;
+ // Indices include padding on axes (do not init arrays to prevent memcpy)
+ IDXA i0[10], i1[10];
+ /*b*/ i0[ 0] = b; i0[ 1] = b + sx; i0[ 2] = b + sy; i0[ 3] = b + sb;
+ /*l*/ i0[ 4] = l; i0[ 5] = l + sx; i0[ 6] = l + sy; i0[ 7] = l + sb;
+ /*c*/ i0[ 8] = cc; i0[ 9] = cc + sy;
+ /*r*/ i1[ 0] = r; i1[ 1] = r + sx; i1[ 2] = r + sy; i1[ 3] = r + sb;
+ /*t*/ i1[ 4] = tt; i1[ 5] = tt + sx; i1[ 6] = tt + sy; i1[ 7] = tt + sb;
+ /*c*/ i1[ 8] = cc + sx; i1[ 9] = cc + sb;
+ __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 10, 10);
+
+ __RT_SSSR_BLOCK_BEGIN
+ for (int t = 0; t < st::t; t++) {
+ form (i, ly, s::n-2, jmpy) {
+ __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt);
+ bool winit = true;
+ form (j, lx, s::n-2, jmpx) {
+ __istc_iter_issrs((void*)&(*A)[i][j], (void*)i0, (void*)i1);
+ if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*B)[i+1][lx+1], 2, __RT_SSSR_REG_WPTR_2);}
+ asm volatile (
+ // br0..3 = b0..3 + r0..3 and lt0..3 = l0..3 + t0..3
+ "frep.i %[c7], 1, 7, 0b001 \n"
+ "fadd.d fa0, ft0, ft1 \n"
+ // p0..3 = br0..3 + lt0..3
+ "frep.i %[c3], 1, 3, 0b111 \n"
+ "fadd.d fa0, fa0, fa4 \n"
+ // tt0..3 = p0..3 + c0..3
+ "fadd.d fa0, fa0, ft0 \n"
+ "fadd.d fa1, fa1, ft1 \n"
+ "fadd.d fa2, fa2, ft0 \n"
+ "fadd.d fa3, fa3, ft1 \n"
+ // res0..3 = 0.2 * tt0..3
+ "frep.i %[c3], 1, 3, 0b100 \n"
+ "fmul.d ft2, %[cf], fa0 \n"
+ :: [c7]"r"(7), [c3]"r"(3), [cf]"f"(0.2)
+ : "memory", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7"
+ );
+ }
+ lx = (lx + sp::px) % jmpx;
+ }
+ ly = (ly + sp::py) % jmpy;
+ __rt_barrier();
+ }
+ __RT_SSSR_BLOCK_END
+}
+
+
+// ==========
+// AN5D
+// ==========
+
+KNL istci_an5d_j2d5pt(
+ const int cid,
+ TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+ // Assertions and IDs
+ static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+ static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+ KNL_IDS_LOC(cid)
+
+ // Define points of stencil and unroll copies
+ constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx;
+ constexpr uint32_t b = dx, l = dy, cc = dx+dy, r = cc+dx, tt = cc+dy;
+ // Indices include padding on axes (do not init arrays to prevent memcpy)
+ IDXA i0[10], i1[10];
+ /*c*/ i0[ 0] = cc; i0[ 1] = cc+sy; /*b*/ i0[ 2] = b; i0[ 3] = b+sy;
+ /*l*/ i0[ 4] = l; i0[ 5] = l+sy; /*r*/ i0[ 6] = r; i0[ 7] = r+sy;
+ /*t*/ i0[ 8] = tt; i0[ 9] = tt+sy;
+ /*c*/ i1[ 0] = cc+sx; i1[ 1] = cc+sb; /*b*/ i1[ 2] = b+sx; i1[ 3] = b+sb;
+ /*l*/ i1[ 4] = l+sx; i1[ 5] = l+sb; /*r*/ i1[ 6] = r+sx; i1[ 7] = r+sb;
+ /*t*/ i1[ 8] = tt+sx; i1[ 9] = tt+sb;
+ __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 10, 10);
+
+ // Avoid constant FP division
+ register d_t fac asm("ft7") = 1.0 / c::c0;
+ // Use stacked registers for FREP
+ register d_t cb asm("ft3") = c::ym[0];
+ register d_t cl asm("ft4") = c::xm[0];
+ register d_t cr asm("ft5") = c::xp[0];
+ register d_t ct asm("ft6") = c::yp[0];
+ register d_t cc_ asm("ft8") = c::cc;
+
+ __RT_SSSR_BLOCK_BEGIN
+ for (int t = 0; t < st::t; t++) {
+ form (y, ly, s::n-2, jmpy) {
+ __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt);
+ bool winit = true;
+ form (x, lx, s::n-2, jmpx) {
+ __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1);
+ if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+1][lx+1], 2, __RT_SSSR_REG_WPTR_2);}
+ asm volatile (
+ // Initialize accumulators: center
+ "fmul.d fa0, %[cc], ft0 \n"
+ "fmul.d fa1, %[cc], ft1 \n"
+ "fmul.d fa2, %[cc], ft0 \n"
+ "fmul.d fa3, %[cc], ft1 \n"
+ // Do directionals as loop
+ "frep.o %[c3], 4, 3, 0b0010 \n"
+ "fmadd.d fa0, ft3, ft0, fa0 \n"
+ "fmadd.d fa1, ft3, ft1, fa1 \n"
+ "fmadd.d fa2, ft3, ft0, fa2 \n"
+ "fmadd.d fa3, ft3, ft1, fa3 \n"
+ // Final scaling and writeback
+ "frep.i %[c3], 1, 3, 0b100 \n"
+ "fmul.d ft2, %[fc], fa0 \n"
+ : [cb]"+&f"(cb), [cl]"+&f"(cl), [cr]"+&f"(cr), [ct]"+&f"(ct),
+ [cc]"+&f"(cc_), [fc]"+&f"(fac)
+ : [c3]"r"(3)
+ : "memory", "fa0", "fa1", "fa2", "fa3"
+ );
+ }
+ lx = (lx + sp::px) % jmpx;
+ }
+ ly = (ly + sp::py) % jmpy;
+ __rt_barrier();
+ }
+ __RT_SSSR_BLOCK_END
+}
+
+
+KNL istci_an5d_j2d9pt(
+ const int cid,
+ TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+ // Assertions and IDs
+ static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+ static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+ KNL_IDS_LOC(cid)
+
+ // Define points of stencil and unroll copies
+ constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx;
+ constexpr uint32_t cc = 2*dy+2*dx,
+ b0 = cc-dy, b1 = cc-2*dy,
+ l0 = cc-dx, l1 = cc-2*dx,
+ r0 = cc+dx, r1 = cc+2*dx,
+ t0 = cc+dy, t1 = cc+2*dy;
+ // Indices include padding on axes (do not init arrays to prevent memcpy)
+ IDXA i0[18], i1[18];
+ /*cc*/ i0[ 0] = cc; i0[ 1] = cc+sy;
+ /*b0*/ i0[ 2] = b0; i0[ 3] = b0+sy; /*l0*/ i0[ 4] = l0; i0[ 5] = l0+sy;
+ /*r0*/ i0[ 6] = r0; i0[ 7] = r0+sy; /*t0*/ i0[ 8] = t0; i0[ 9] = t0+sy;
+ /*b1*/ i0[10] = b1; i0[11] = b1+sy; /*l1*/ i0[12] = l1; i0[13] = l1+sy;
+ /*r1*/ i0[14] = r1; i0[15] = r1+sy; /*t1*/ i0[16] = t1; i0[17] = t1+sy;
+ /*cc*/ i1[ 0] = cc+sx; i1[ 1] = cc+sb;
+ /*b0*/ i1[ 2] = b0+sx; i1[ 3] = b0+sb; /*l0*/ i1[ 4] = l0+sx; i1[ 5] = l0+sb;
+ /*r0*/ i1[ 6] = r0+sx; i1[ 7] = r0+sb; /*t0*/ i1[ 8] = t0+sx; i1[ 9] = t0+sb;
+ /*b1*/ i1[10] = b1+sx; i1[11] = b1+sb; /*l1*/ i1[12] = l1+sx; i1[13] = l1+sb;
+ /*r1*/ i1[14] = r1+sx; i1[15] = r1+sb; /*t1*/ i1[16] = t1+sx; i1[17] = t1+sb;
+ __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 18, 18);
+
+ // Avoid constant FP division
+ register d_t fac asm("fa4") = 1.0 / c::c0;
+ // Use stacked registers for FREP
+ register d_t cb0 asm("ft3") = c::ym[0];
+ register d_t cl0 asm("ft4") = c::xm[0];
+ register d_t cr0 asm("ft5") = c::xp[0];
+ register d_t ct0 asm("ft6") = c::yp[0];
+ register d_t cb1 asm("ft8") = c::ym[1];
+ register d_t cl1 asm("ft9") = c::xm[1];
+ register d_t cr1 asm("ft10") = c::xp[1];
+ register d_t ct1 asm("ft11") = c::yp[1];
+ register d_t cc_ asm("fa5") = c::cc;
+
+ __RT_SSSR_BLOCK_BEGIN
+ for (int t = 0; t < st::t; t++) {
+ form (y, ly, s::n-4,jmpy) {
+ __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-4+jmpx-lx-sp::ux)/jmpx, jmpx*sodt);
+ bool winit = true;
+ form (x, lx, s::n-4, jmpx) {
+ __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1);
+ if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+2][lx+2], 2, __RT_SSSR_REG_WPTR_2);}
+ asm volatile (
+ // Initialize accumulators: center
+ "fmul.d fa0, %[cc], ft0 \n"
+ "fmul.d fa1, %[cc], ft1 \n"
+ "fmul.d fa2, %[cc], ft0 \n"
+ "fmul.d fa3, %[cc], ft1 \n"
+ // Do directionals as loop
+ "frep.o %[c3], 4, 3, 0b0010 \n"
+ "fmadd.d fa0, ft3, ft0, fa0 \n"
+ "fmadd.d fa1, ft3, ft1, fa1 \n"
+ "fmadd.d fa2, ft3, ft0, fa2 \n"
+ "fmadd.d fa3, ft3, ft1, fa3 \n"
+ // Do directionals as loop
+ "frep.o %[c3], 4, 3, 0b0010 \n"
+ "fmadd.d fa0, ft8, ft0, fa0 \n"
+ "fmadd.d fa1, ft8, ft1, fa1 \n"
+ "fmadd.d fa2, ft8, ft0, fa2 \n"
+ "fmadd.d fa3, ft8, ft1, fa3 \n"
+ // Final scaling and writeback
+ "frep.i %[c3], 1, 3, 0b100 \n"
+ "fmul.d ft2, %[fc], fa0 \n"
+ : [cb0]"+&f"(cb0), [cl0]"+&f"(cl0), [cr0]"+&f"(cr0), [ct0]"+&f"(ct0),
+ [cb1]"+&f"(cb1), [cl1]"+&f"(cl1), [cr1]"+&f"(cr1), [ct1]"+&f"(ct1),
+ [cc]"+&f"(cc_), [fc]"+&f"(fac)
+ : [c3]"r"(3)
+ : "memory", "fa0", "fa1", "fa2", "fa3"
+ );
+ }
+ lx = (lx + sp::px) % jmpx;
+ }
+ ly = (ly + sp::py) % jmpy;
+ __rt_barrier();
+ }
+ __RT_SSSR_BLOCK_END
+}
+
+
+KNL istci_an5d_j2d9pt_gol(
+ const int cid,
+ TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+ // Assertions and IDs
+ static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+ static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+ KNL_IDS_LOC(cid)
+
+ // Define points of stencil and unroll copies
+ constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx;
+ constexpr uint32_t
+ bl = 0, bc = dx, br = 2*dx,
+ ml = dy, mc = dx+dy, mr = 2*dx+dy,
+ tl = 2*dy, tc = dx+2*dy, tr = 2*dx+2*dy;
+ // Indices include padding on axes (do not init arrays to prevent memcpy)
+ IDXA i0[18], i1[18];
+ /*mc*/ i0[ 0] = mc; i0[ 1] = mc + sy;
+ /*bl*/ i0[ 2] = bl; i0[ 3] = bl + sy; /*bc*/ i0[ 4] = bc; i0[ 5] = bc + sy;
+ /*br*/ i0[ 6] = br; i0[ 7] = br + sy; /*ml*/ i0[ 8] = ml; i0[ 9] = ml + sy;
+ /*mr*/ i0[10] = mr; i0[11] = mr + sy; /*tl*/ i0[12] = tl; i0[13] = tl + sy;
+ /*tc*/ i0[14] = tc; i0[15] = tc + sy; /*tr*/ i0[16] = tr; i0[17] = tr + sy;
+ /*mc*/ i1[ 0] = mc + sx; i1[ 1] = mc + sb;
+ /*bl*/ i1[ 2] = bl + sx; i1[ 3] = bl + sb; /*bc*/ i1[ 4] = bc + sx; i1[ 5] = bc + sb;
+ /*br*/ i1[ 6] = br + sx; i1[ 7] = br + sb; /*ml*/ i1[ 8] = ml + sx; i1[ 9] = ml + sb;
+ /*mr*/ i1[10] = mr + sx; i1[11] = mr + sb; /*tl*/ i1[12] = tl + sx; i1[13] = tl + sb;
+ /*tc*/ i1[14] = tc + sx; i1[15] = tc + sb; /*tr*/ i1[16] = tr + sx; i1[17] = tr + sb;
+ __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 18, 18);
+
+ // Avoid constant FP division
+ register d_t fac asm("fa4") = 1.0 / c::c0;
+ // Use stacked registers for FREP
+ register d_t cmc asm("fa5") = c::c[1][1];
+ register d_t cbl asm("ft3") = c::c[0][0];
+ register d_t cbc asm("ft4") = c::c[0][1];
+ register d_t cbr asm("ft5") = c::c[0][2];
+ register d_t cml asm("ft6") = c::c[1][0];
+ register d_t cmr asm("ft8") = c::c[1][2];
+ register d_t ctl asm("ft9") = c::c[2][0];
+ register d_t ctc asm("ft10") = c::c[2][1];
+ register d_t ctr asm("ft11") = c::c[2][2];
+
+ __RT_SSSR_BLOCK_BEGIN
+ for (int t = 0; t < st::t; t++) {
+ form (y, ly, s::n-2,jmpy) {
+ __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt);
+ bool winit = true;
+ form (x, lx, s::n-2, jmpx) {
+ __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1);
+ if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+1][lx+1], 2, __RT_SSSR_REG_WPTR_2);}
+ asm volatile (
+ // Initialize accumulators: center
+ "fmul.d fa0, %[cmc], ft0 \n"
+ "fmul.d fa1, %[cmc], ft1 \n"
+ "fmul.d fa2, %[cmc], ft0 \n"
+ "fmul.d fa3, %[cmc], ft1 \n"
+ // Do directionals as loop
+ "frep.o %[c3], 4, 3, 0b0010 \n"
+ "fmadd.d fa0, ft3, ft0, fa0 \n"
+ "fmadd.d fa1, ft3, ft1, fa1 \n"
+ "fmadd.d fa2, ft3, ft0, fa2 \n"
+ "fmadd.d fa3, ft3, ft1, fa3 \n"
+ // Do directionals as loop
+ "frep.o %[c3], 4, 3, 0b0010 \n"
+ "fmadd.d fa0, ft8, ft0, fa0 \n"
+ "fmadd.d fa1, ft8, ft1, fa1 \n"
+ "fmadd.d fa2, ft8, ft0, fa2 \n"
+ "fmadd.d fa3, ft8, ft1, fa3 \n"
+ // Final scaling and writeback
+ "frep.i %[c3], 1, 3, 0b100 \n"
+ "fmul.d ft2, %[fc], fa0 \n"
+ : [cbl]"+&f"(cbl), [cbc]"+&f"(cbc), [cbr]"+&f"(cbr), [cml]"+&f"(cml),
+ [cmr]"+&f"(cmr), [ctl]"+&f"(ctl), [ctc]"+&f"(ctc), [ctr]"+&f"(ctr),
+ [cmc]"+&f"(cmc), [fc]"+&f"(fac)
+ : [c3]"r"(3)
+ : "memory", "fa0", "fa1", "fa2", "fa3"
+ );
+ }
+ lx = (lx + sp::px) % jmpx;
+ }
+ ly = (ly + sp::py) % jmpy;
+ __rt_barrier();
+ }
+ __RT_SSSR_BLOCK_END
+}
+
+
+// 3D 27-point Jacobi kernel (AN5D benchmark) using indirect SSR streams
+// (ISSRs feeding ft0/ft1), an SSR constant stream (ft2) for coefficients,
+// and FREP hardware loops. Each iteration computes a 2x2 (y,x) unroll of
+// output points; grids double-buffer between A[0] and A[1] every timestep.
+// NOTE(review): the ft0-ft2 stream semantics come from the SSR runtime
+// macros (__istc_*/__rt_sssr_*); claims here are limited to what this
+// file shows.
+KNL istci_an5d_j3d27pt(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx]
+) {
+  // Assertions and IDs
+  static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+  static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+  KNL_IDS_LOC(cid)
+
+  // Define points of stencil and unroll copies
+  constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx;
+  // Indices include padding on axes (do not init arrays to prevent memcpy)
+  constexpr uint32_t ilen = 2*27;
+  IDXA i0[ilen], i1[ilen];
+  IDXA *p0 = i0, *p1 = i1;
+  // Build index tables: i0 serves the (0,0)/(0,+sy) unroll copies, i1 the
+  // (+sx)/(+sb) copies, for each of the 27 neighbor offsets.
+  #pragma unroll
+  for (int z = 0; z < 3; ++z)
+  #pragma unroll
+  for (int y = 0; y < 3; ++y)
+  #pragma unroll
+  for (int x = 0; x < 3; ++x) {
+    uint32_t pt = z*dz + y*dy + x*dx;
+    /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy;
+    /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb;
+  }
+  __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen);
+
+  // Avoid constant FP division
+  register d_t fac asm("ft3") = 1.0 / c::c0;
+  // Buffer constants in order for SSR use (each repeated to cover unroll)
+  COFA ca[27];
+  COFA* pa = ca;
+  #pragma unroll
+  for (int z = 0; z < 3; ++z)
+  #pragma unroll
+  for (int y = 0; y < 3; ++y)
+  #pragma unroll
+  for (int x = 0; x < 3; ++x)
+    *(pa++) = c::c3[z][y][x];
+  __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT);
+
+  __RT_SSSR_BLOCK_BEGIN
+  for (int t = 0; t < st::t; t++) {
+
+    form (z, lz, s::n-2,jmpz) {
+      form (y, ly, s::n-2,jmpy) {
+        __rt_sssr_bound_stride_2d(2, 27, sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, 0);
+        bool winit = true;
+        form (x, lx, s::n-2, jmpx) {
+          __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1);
+          if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);}
+          asm volatile (
+            // Initialize accumulators: bottom left
+            "fmul.d fa0, ft2, ft0 \n"
+            "fmul.d fa1, ft2, ft1 \n"
+            "fmul.d fa2, ft2, ft0 \n"
+            "fmul.d fa3, ft2, ft1 \n"
+            // Do remaining blocks as loop
+            "frep.o %[cd], 4, 3, 0b0000 \n"
+            "fmadd.d fa0, ft2, ft0, fa0 \n"
+            "fmadd.d fa1, ft2, ft1, fa1 \n"
+            "fmadd.d fa2, ft2, ft0, fa2 \n"
+            "fmadd.d fa3, ft2, ft1, fa3 \n"
+            // Final scaling
+            "frep.i %[c3], 1, 3, 0b101 \n"
+            "fmul.d fa0, %[fc], fa0 \n"
+            // Final writeback
+            "fsd fa0, 0 (%[wb]) \n"
+            "fsd fa1, %[sx](%[wb]) \n"
+            "fsd fa2, %[sy](%[wb]) \n"
+            "fsd fa3, %[sb](%[wb]) \n"
+            : [fc]"+&f"(fac)
+            : [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(27-2), [c3]"r"(3),
+              [wb]"r"(&(*A[(t+1)%2])[z+1][y+1][x+1])
+            : "memory", "fa0", "fa1", "fa2", "fa3"
+          );
+        }
+        lx = (lx + sp::px) % jmpx;
+      }
+      ly = (ly + sp::py) % jmpy;
+    }
+    lz = (lz + sp::pz) % jmpz;
+    __rt_barrier();
+  }
+  __RT_SSSR_BLOCK_END
+}
+
+
+// 2D star stencil of runtime-parameterized radius ci::r (1 + 4r points)
+// using ISSR gathers (ft0/ft1), an SSR coefficient stream (ft2), and FREP.
+// 2x2 (y,x) unroll per iteration; double-buffers between A[0] and A[1].
+KNL istci_an5d_star2dXr(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+  // Assertions and IDs
+  static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+  static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+  static_assert(ci::r >= 1, "Radius must be at least 1!");
+  KNL_IDS_LOC(cid)
+
+  // Define points of stencil and unroll copies
+  constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx;
+  constexpr uint32_t cc = ci::r*dy + ci::r*dx;
+  constexpr uint32_t npoints = 1+4*ci::r;
+  // Indices include padding on axes (do not init arrays to prevent memcpy)
+  constexpr uint32_t ilen = 2*npoints;
+  IDXA i0[ilen], i1[ilen];
+  IDXA *p0 = i0, *p1 = i1;
+  // Index order (center, then per radius: bottom, left, right, top) must
+  // match the coefficient order written into ca[] below.
+  /*cc0*/ *(p0++) = cc; *(p0++) = cc+sy;
+  /*cc1*/ *(p1++) = cc+sx; *(p1++) = cc+sb;
+  #pragma unroll
+  for (int j = 1; j <= ci::r; ++j) {
+    uint32_t bb = cc-j*dy, ll = cc-j*dx, rr = cc+j*dx, tt = cc+j*dy;
+    /*bb0*/ *(p0++) = bb; *(p0++) = bb+sy; /*ll0*/ *(p0++) = ll; *(p0++) = ll+sy;
+    /*rr0*/ *(p0++) = rr; *(p0++) = rr+sy; /*tt0*/ *(p0++) = tt; *(p0++) = tt+sy;
+    /*bb1*/ *(p1++) = bb+sx; *(p1++) = bb+sb; /*ll1*/ *(p1++) = ll+sx; *(p1++) = ll+sb;
+    /*rr1*/ *(p1++) = rr+sx; *(p1++) = rr+sb; /*tt1*/ *(p1++) = tt+sx; *(p1++) = tt+sb;
+  }
+  __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen);
+
+  // Buffer constants in order for SSR use (each repeated to cover unroll)
+  COFA ca[npoints];
+  COFA* pa = ca;
+  /*cc*/ *(pa++) = c::cc;
+  #pragma unroll
+  for (int j = 0; j < ci::r; ++j) {
+    /*bb*/ *(pa++) = c::ym[j]; /*ll*/ *(pa++) = c::xm[j];
+    /*rr*/ *(pa++) = c::xp[j]; /*tt*/ *(pa++) = c::yp[j];
+  }
+  __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT);
+
+  __RT_SSSR_BLOCK_BEGIN
+  for (int t = 0; t < st::t; t++) {
+    form (y, ly, s::n-2*ci::r,jmpy) {
+      __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0);
+      bool winit = true;
+      form (x, lx, s::n-2*ci::r, jmpx) {
+        __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1);
+        if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);}
+        asm volatile (
+          // Initialize accumulators: center
+          "fmul.d fa0, ft2, ft0 \n"
+          "fmul.d fa1, ft2, ft1 \n"
+          "fmul.d fa2, ft2, ft0 \n"
+          "fmul.d fa3, ft2, ft1 \n"
+          // Do directionals as loop
+          "frep.o %[cd], 4, 3, 0b0000 \n"
+          "fmadd.d fa0, ft2, ft0, fa0 \n"
+          "fmadd.d fa1, ft2, ft1, fa1 \n"
+          "fmadd.d fa2, ft2, ft0, fa2 \n"
+          "fmadd.d fa3, ft2, ft1, fa3 \n"
+          // Final writeback
+          "fsd fa0, 0 (%[wb]) \n"
+          "fsd fa1, %[sx](%[wb]) \n"
+          "fsd fa2, %[sy](%[wb]) \n"
+          "fsd fa3, %[sb](%[wb]) \n"
+          :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2),
+             [wb]"r"(&(*A[(t+1)%2])[y+ci::r][x+ci::r])
+          : "memory", "fa0", "fa1", "fa2", "fa3"
+        );
+      }
+      lx = (lx + sp::px) % jmpx;
+    }
+    ly = (ly + sp::py) % jmpy;
+    __rt_barrier();
+  }
+  __RT_SSSR_BLOCK_END
+}
+
+
+// 2D box (dense) stencil of radius ci::r ((2r+1)^2 points) using ISSR
+// gathers, an SSR coefficient stream, and FREP. 2x2 (y,x) unroll per
+// iteration; double-buffers between A[0] and A[1].
+KNL istci_an5d_box2dXr(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+  // Assertions and IDs
+  static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+  static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+  static_assert(ci::r >= 1, "Radius must be at least 1!");
+  KNL_IDS_LOC(cid)
+
+  // Define points of stencil and unroll copies
+  constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx;
+  constexpr uint32_t npoints = (2*ci::r+1)*(2*ci::r+1);
+  // Indices include padding on axes (do not init arrays to prevent memcpy)
+  constexpr uint32_t ilen = 2*npoints;
+  IDXA i0[ilen], i1[ilen];
+  IDXA *p0 = i0, *p1 = i1;
+  // Row-major neighbor order here must match the coefficient order below.
+  #pragma unroll
+  for (int y = 0; y < 2*ci::r+1; ++y)
+  #pragma unroll
+  for (int x = 0; x < 2*ci::r+1; ++x) {
+    uint32_t pt = y*dy + x*dx;
+    /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy;
+    /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb;
+  }
+  __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen);
+
+  // Buffer constants in order for SSR use (each repeated to cover unroll)
+  COFA ca[npoints];
+  COFA* pa = ca;
+  #pragma unroll
+  for (int y = 0; y < 2*ci::r+1; ++y)
+  #pragma unroll
+  for (int x = 0; x < 2*ci::r+1; ++x)
+    *(pa++) = c::c[y][x];
+  __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT);
+
+  __RT_SSSR_BLOCK_BEGIN
+  for (int t = 0; t < st::t; t++) {
+    form (y, ly, s::n-2*ci::r,jmpy) {
+      __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0);
+      bool winit = true;
+      form (x, lx, s::n-2*ci::r, jmpx) {
+        __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1);
+        if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);}
+        asm volatile (
+          // Initialize accumulators: center
+          "fmul.d fa0, ft2, ft0 \n"
+          "fmul.d fa1, ft2, ft1 \n"
+          "fmul.d fa2, ft2, ft0 \n"
+          "fmul.d fa3, ft2, ft1 \n"
+          // Do directionals as loop
+          "frep.o %[cd], 4, 3, 0b0000 \n"
+          "fmadd.d fa0, ft2, ft0, fa0 \n"
+          "fmadd.d fa1, ft2, ft1, fa1 \n"
+          "fmadd.d fa2, ft2, ft0, fa2 \n"
+          "fmadd.d fa3, ft2, ft1, fa3 \n"
+          // Final writeback
+          "fsd fa0, 0 (%[wb]) \n"
+          "fsd fa1, %[sx](%[wb]) \n"
+          "fsd fa2, %[sy](%[wb]) \n"
+          "fsd fa3, %[sb](%[wb]) \n"
+          :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2),
+             [wb]"r"(&(*A[(t+1)%2])[y+ci::r][x+ci::r])
+          : "memory", "fa0", "fa1", "fa2", "fa3"
+        );
+      }
+      lx = (lx + sp::px) % jmpx;
+    }
+    ly = (ly + sp::py) % jmpy;
+    __rt_barrier();
+  }
+  __RT_SSSR_BLOCK_END
+}
+
+
+// 3D star stencil of radius ci::r (1 + 6r points) using ISSR gathers, an
+// SSR coefficient stream, and FREP. 2x2 (y,x) unroll per iteration;
+// double-buffers between A[0] and A[1].
+KNL istci_an5d_star3dXr(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx]
+) {
+  // Assertions and IDs
+  static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+  static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+  static_assert(ci::r >= 1, "Radius must be at least 1!");
+  KNL_IDS_LOC(cid)
+
+  // Define points of stencil and unroll copies
+  constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx;
+  constexpr uint32_t cc = ci::r*dz + ci::r*dy + ci::r*dx;
+  constexpr uint32_t npoints = 1+6*ci::r;
+  // Indices include padding on axes (do not init arrays to prevent memcpy)
+  constexpr uint32_t ilen = 2*npoints;
+  IDXA i0[ilen], i1[ilen];
+  IDXA *p0 = i0, *p1 = i1;
+  // Index order (center, then per radius: bb, ll, rr, tt, aa=behind,
+  // ff=front) must match the coefficient order written into ca[] below.
+  /*cc0*/ *(p0++) = cc; *(p0++) = cc+sy;
+  /*cc1*/ *(p1++) = cc+sx; *(p1++) = cc+sb;
+  #pragma unroll
+  for (int j = 1; j <= ci::r; ++j) {
+    uint32_t bb = cc-j*dy, ll=cc-j*dx, rr = cc+j*dx, tt = cc+j*dy, aa = cc-j*dz, ff = cc+j*dz;
+    /*bb0*/ *(p0++) = bb; *(p0++) = bb+sy; /*ll0*/ *(p0++) = ll; *(p0++) = ll+sy;
+    /*rr0*/ *(p0++) = rr; *(p0++) = rr+sy; /*tt0*/ *(p0++) = tt; *(p0++) = tt+sy;
+    /*aa0*/ *(p0++) = aa; *(p0++) = aa+sy; /*ff0*/ *(p0++) = ff; *(p0++) = ff+sy;
+    /*bb1*/ *(p1++) = bb+sx; *(p1++) = bb+sb; /*ll1*/ *(p1++) = ll+sx; *(p1++) = ll+sb;
+    /*rr1*/ *(p1++) = rr+sx; *(p1++) = rr+sb; /*tt1*/ *(p1++) = tt+sx; *(p1++) = tt+sb;
+    /*aa1*/ *(p1++) = aa+sx; *(p1++) = aa+sb; /*ff1*/ *(p1++) = ff+sx; *(p1++) = ff+sb;
+  }
+  __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen);
+
+  // Buffer constants in order for SSR use (each repeated to cover unroll)
+  COFA ca[npoints];
+  COFA* pa = ca;
+  /*cc*/ *(pa++) = c::cc;
+  #pragma unroll
+  for (int j = 0; j < ci::r; ++j) {
+    /*bb*/ *(pa++) = c::ym[j]; /*ll*/ *(pa++) = c::xm[j];
+    /*rr*/ *(pa++) = c::xp[j]; /*tt*/ *(pa++) = c::yp[j];
+    /*aa*/ *(pa++) = c::zm[j]; /*ff*/ *(pa++) = c::zp[j];
+  }
+  __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT);
+
+  __RT_SSSR_BLOCK_BEGIN
+  for (int t = 0; t < st::t; t++) {
+    form (z, lz, s::n-2*ci::r,jmpz) {
+      form (y, ly, s::n-2*ci::r,jmpy) {
+        __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0);
+        bool winit = true;
+        form (x, lx, s::n-2*ci::r, jmpx) {
+          __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1);
+          if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);}
+          asm volatile (
+            // Initialize accumulators: center
+            "fmul.d fa0, ft2, ft0 \n"
+            "fmul.d fa1, ft2, ft1 \n"
+            "fmul.d fa2, ft2, ft0 \n"
+            "fmul.d fa3, ft2, ft1 \n"
+            // Do directionals as loop
+            "frep.o %[cd], 4, 3, 0b0000 \n"
+            "fmadd.d fa0, ft2, ft0, fa0 \n"
+            "fmadd.d fa1, ft2, ft1, fa1 \n"
+            "fmadd.d fa2, ft2, ft0, fa2 \n"
+            "fmadd.d fa3, ft2, ft1, fa3 \n"
+            // Final writeback
+            "fsd fa0, 0 (%[wb]) \n"
+            "fsd fa1, %[sx](%[wb]) \n"
+            "fsd fa2, %[sy](%[wb]) \n"
+            "fsd fa3, %[sb](%[wb]) \n"
+            :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2),
+               [wb]"r"(&(*A[(t+1)%2])[z+ci::r][y+ci::r][x+ci::r])
+            : "memory", "fa0", "fa1", "fa2", "fa3"
+          );
+        }
+        lx = (lx + sp::px) % jmpx;
+      }
+      ly = (ly + sp::py) % jmpy;
+    }
+    lz = (lz + sp::pz) % jmpz;
+    __rt_barrier();
+  }
+  __RT_SSSR_BLOCK_END
+}
+
+
+// 3D box (dense) stencil of radius ci::r ((2r+1)^3 points) using ISSR
+// gathers, an SSR coefficient stream, and FREP. 2x2 (y,x) unroll per
+// iteration; double-buffers between A[0] and A[1].
+// NOTE(review): unlike istci_an5d_star3dXr, sp::uz == 1 is not asserted
+// here although the z axis is not unrolled — confirm uz is guaranteed
+// elsewhere for configurations using this kernel.
+KNL istci_an5d_box3dXr(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx]
+) {
+  // Assertions and IDs
+  static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+  static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+  static_assert(ci::r >= 1, "Radius must be at least 1!");
+  KNL_IDS_LOC(cid)
+
+  // Define points of stencil and unroll copies
+  constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx;
+  constexpr uint32_t npoints = (2*ci::r+1)*(2*ci::r+1)*(2*ci::r+1);
+  // Indices include padding on axes (do not init arrays to prevent memcpy)
+  constexpr uint32_t ilen = 2*npoints;
+  IDXA i0[ilen], i1[ilen];
+  IDXA *p0 = i0, *p1 = i1;
+  // z/y/x-major neighbor order here must match coefficient order below.
+  #pragma unroll
+  for (int z = 0; z < 2*ci::r+1; ++z)
+  #pragma unroll
+  for (int y = 0; y < 2*ci::r+1; ++y)
+  #pragma unroll
+  for (int x = 0; x < 2*ci::r+1; ++x) {
+    uint32_t pt = z*dz + y*dy + x*dx;
+    /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy;
+    /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb;
+  }
+  __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen);
+
+  // Buffer constants in order for SSR use (each repeated to cover unroll)
+  COFA ca[npoints];
+  COFA* pa = ca;
+  #pragma unroll
+  for (int z = 0; z < 2*ci::r+1; ++z)
+  #pragma unroll
+  for (int y = 0; y < 2*ci::r+1; ++y)
+  #pragma unroll
+  for (int x = 0; x < 2*ci::r+1; ++x)
+    *(pa++) = c::c3[z][y][x];
+  __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT);
+
+  __RT_SSSR_BLOCK_BEGIN
+  for (int t = 0; t < st::t; t++) {
+    form (z, lz, s::n-2*ci::r,jmpz) {
+      form (y, ly, s::n-2*ci::r,jmpy) {
+        __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0);
+        bool winit = true;
+        form (x, lx, s::n-2*ci::r, jmpx) {
+          __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1);
+          if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);}
+          asm volatile (
+            // Initialize accumulators: center
+            "fmul.d fa0, ft2, ft0 \n"
+            "fmul.d fa1, ft2, ft1 \n"
+            "fmul.d fa2, ft2, ft0 \n"
+            "fmul.d fa3, ft2, ft1 \n"
+            // Do directionals as loop
+            "frep.o %[cd], 4, 3, 0b0000 \n"
+            "fmadd.d fa0, ft2, ft0, fa0 \n"
+            "fmadd.d fa1, ft2, ft1, fa1 \n"
+            "fmadd.d fa2, ft2, ft0, fa2 \n"
+            "fmadd.d fa3, ft2, ft1, fa3 \n"
+            // Final writeback
+            "fsd fa0, 0 (%[wb]) \n"
+            "fsd fa1, %[sx](%[wb]) \n"
+            "fsd fa2, %[sy](%[wb]) \n"
+            "fsd fa3, %[sb](%[wb]) \n"
+            :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2),
+               [wb]"r"(&(*A[(t+1)%2])[z+ci::r][y+ci::r][x+ci::r])
+            : "memory", "fa0", "fa1", "fa2", "fa3"
+          );
+        }
+        lx = (lx + sp::px) % jmpx;
+      }
+      ly = (ly + sp::py) % jmpy;
+    }
+    lz = (lz + sp::pz) % jmpz;
+    __rt_barrier();
+  }
+  __RT_SSSR_BLOCK_END
+}
+
+
+// =============
+// Minimod
+// =============
+
+// Minimod acoustic isotropic constant-density kernel (radius-4 star
+// Laplacian plus impulse grid f) using ISSR gathers, register-resident
+// coefficients (f3-f15), and FREP. 2x2 (y,x) unroll; the last two index
+// pairs stream the *previous* output grid's center for the final
+// subtraction, and writeback goes through SSR 2 (ft2) as a write stream.
+KNL istci_minimod_acoustic_iso_cd(
+  const int cid,
+  TCDM d_t (RCP u[2])[s::nz][s::ny][s::nx],
+  TCDM d_t (RCP f)[s::nz-8][s::ny-8][s::nx-8]
+) {
+  // Assertions and IDs
+  static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!");
+  static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!");
+  KNL_IDS_LOC(cid)
+
+  // Define points of stencil and unroll copies
+  constexpr uint32_t rad = 4;
+  constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx;
+  constexpr uint32_t cct = rad*dz + rad*dy + rad*dx;
+  constexpr uint32_t nhpoints = 6*rad;
+  // Indices include padding on axes (do not init arrays to prevent memcpy)
+  // (+4 slots are filled per timestep with the old-grid center offsets)
+  constexpr uint32_t ilen = 2*nhpoints+4;
+  IDXA i0[ilen], i1[ilen];
+  IDXA *p0 = i0, *p1 = i1;
+  /*cc0*/ *(p0++) = cct; *(p1++) = cct+sx;
+  /*cc1*/ *(p0++) = cct+sy; *(p1++) = cct+sb;
+  #pragma unroll
+  for (int j = 1; j <= rad; ++j) {
+    uint32_t ll=cct-j*dx, rr = cct+j*dx, bb = cct-j*dy, tt = cct+j*dy, aa = cct-j*dz, ff = cct+j*dz;
+    /*ll0*/ *(p0++) = ll; *(p1++) = ll+sx;
+    /*ll1*/ *(p0++) = ll+sy; *(p1++) = ll+sb;
+    /*rr0*/ *(p0++) = rr; *(p1++) = rr+sx;
+    /*rr1*/ *(p0++) = rr+sy; *(p1++) = rr+sb;
+    /*bb0*/ *(p0++) = bb; *(p1++) = bb+sx;
+    /*bb1*/ *(p0++) = bb+sy; *(p1++) = bb+sb;
+    /*tt0*/ *(p0++) = tt; *(p1++) = tt+sx;
+    /*tt1*/ *(p0++) = tt+sy; *(p1++) = tt+sb;
+    /*aa0*/ *(p0++) = aa; *(p1++) = aa+sx;
+    /*aa1*/ *(p0++) = aa+sy; *(p1++) = aa+sb;
+    /*ff0*/ *(p0++) = ff; *(p1++) = ff+sx;
+    /*ff1*/ *(p0++) = ff+sy; *(p1++) = ff+sb;
+  }
+  __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen);
+
+  // Use registers for coefficients
+  register d_t cc0 asm("f3");
+  register d_t cx0 asm("f4");
+  register d_t cy0 asm("f5");
+  register d_t cz0 asm("f6");
+  register d_t cx1 asm("f7");
+  register d_t cy1 asm("f8");
+  register d_t cz1 asm("f9");
+  register d_t cx2 asm("f10");
+  register d_t cy2 asm("f11");
+  register d_t cz2 asm("f12");
+  register d_t cx3 asm("f13");
+  register d_t cy3 asm("f14");
+  register d_t cz3 asm("f15");
+
+  // Preload registers
+  // (center coefficient f3 = 2*(xp[0]+yp[0]+zp[0]) is computed inline)
+  asm volatile(
+    "fld f13, -8(%[xp]) \n"
+    "fld f14, -8(%[yp]) \n"
+    "fld f15, -8(%[zp]) \n"
+    "fadd.d f3, f13, f14 \n"
+    "fld f4, 0(%[xp]) \n"
+    "fld f5, 0(%[yp]) \n"
+    "fld f6, 0(%[zp]) \n"
+    "fadd.d f3, f3, f15 \n"
+    "fld f7, 8(%[xp]) \n"
+    "fld f8, 8(%[yp]) \n"
+    "fld f9, 8(%[zp]) \n"
+    "fmul.d f3, f3, %[cf2]\n"
+    "fld f10, 16(%[xp]) \n"
+    "fld f11, 16(%[yp]) \n"
+    "fld f12, 16(%[zp]) \n"
+    "fld f13, 24(%[xp]) \n"
+    "fld f14, 24(%[yp]) \n"
+    "fld f15, 24(%[zp]) \n"
+    : "+&f"(cx0), "+&f"(cy0), "+&f"(cz0), "+&f"(cx1), "+&f"(cy1), "+&f"(cz1),
+      "+&f"(cx2), "+&f"(cy2), "+&f"(cz2), "+&f"(cx3), "+&f"(cy3), "+&f"(cz3),
+      "+&f"(cc0)
+    : [xp]"r"(&c::xp[1]), [yp]"r"(&c::yp[1]), [zp]"r"(&c::zp[1]), [cf2]"f"(2.0)
+  );
+
+  // introduce variable for tracking impulse offsets
+  // NOTE(review): lf appears unused in the remainder of this kernel —
+  // confirm whether it is dead code.
+  uint32_t lf = cid;
+
+  __RT_SSSR_BLOCK_BEGIN
+  for (int t = 0; t < st::t; t++) {
+    // We load last grid's center piece inside the time loop as it keeps changing
+    int32_t ccoffs = &(*u[(t+1)%2])[rad][rad][rad] - &(*u[t%2])[0][0][0];
+    /*cc0*/ i0[ilen-2] = ccoffs; i0[ilen-1] = ccoffs+sy;
+    /*cc1*/ i1[ilen-2] = ccoffs+sx; i1[ilen-1] = ccoffs+sb;
+    form (z, lz, s::n-2*rad, jmpz) {
+      form (y, ly, s::n-2*rad, jmpy) {
+        __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2*rad+jmpx-lx-sp::ux)/jmpx, jmpx*sodt);
+        bool winit = true;
+        form (x, lx, s::n-2*rad, jmpx) {
+          register d_t fi0 asm("f28") = c::uffac * (*f)[z][y ][x ];
+          register d_t fix asm("f29") = c::uffac * (*f)[z][y ][x+1];
+          // Set up SSRs
+          __istc_iter_issrs((void*)&(*u[t%2])[z][y][x], (void*)i0, (void*)i1);
+          // Load impulses
+          register d_t fiy asm("f30") = c::uffac * (*f)[z][y+1][x ];
+          register d_t fib asm("f31") = c::uffac * (*f)[z][y+1][x+1];
+          if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*u[(t+1)%2])[z+rad][y+rad][lx+rad], 2, __RT_SSSR_REG_WPTR_2);}
+          asm volatile (
+            // First add centerpoint
+            "fmadd.d f28, f3, f0, f28 \n"
+            "fmadd.d f29, f3, f1, f29 \n"
+            "fmadd.d f30, f3, f0, f30 \n"
+            "fmadd.d f31, f3, f1, f31 \n"
+            // Iterate over points (stagger coeffs)
+            "frep.o %[c3], 8, 3, 0b010 \n"
+            "fmadd.d f28, f4, f0, f28 \n"
+            "fmadd.d f29, f4, f1, f29 \n"
+            "fmadd.d f30, f4, f0, f30 \n"
+            "fmadd.d f31, f4, f1, f31 \n"
+            "fmadd.d f28, f4, f0, f28 \n"
+            "fmadd.d f29, f4, f1, f29 \n"
+            "fmadd.d f30, f4, f0, f30 \n"
+            "fmadd.d f31, f4, f1, f31 \n"
+            "frep.o %[c7], 8, 7, 0b010 \n"
+            "fmadd.d f28, f8, f0, f28 \n"
+            "fmadd.d f29, f8, f1, f29 \n"
+            "fmadd.d f30, f8, f0, f30 \n"
+            "fmadd.d f31, f8, f1, f31 \n"
+            "fmadd.d f28, f8, f0, f28 \n"
+            "fmadd.d f29, f8, f1, f29 \n"
+            "fmadd.d f30, f8, f0, f30 \n"
+            "fmadd.d f31, f8, f1, f31 \n"
+            // Final subtraction and writeback
+            "fsub.d f2, f28, f0 \n"
+            "fsub.d f2, f29, f1 \n"
+            "fsub.d f2, f30, f0 \n"
+            "fsub.d f2, f31, f1 \n"
+            : "+&f"(cx0), "+&f"(cy0), "+&f"(cz0), "+&f"(cx1), "+&f"(cy1), "+&f"(cz1),
+              "+&f"(cx2), "+&f"(cy2), "+&f"(cz2), "+&f"(cx3), "+&f"(cy3), "+&f"(cz3),
+              "+&f"(cc0),
+              "+&f"(fi0), "+&f"(fix), "+&f"(fiy), "+&f"(fib)
+            : [c7]"r"(7), [c3]"r"(3)
+            : "memory"
+          );
+        }
+        // NOTE(review): this kernel advances lx/ly/lz by sp::ux/uy/uz while
+        // the AN5D kernels above use sp::px/py/pz — confirm intentional.
+        lx = (lx + sp::ux) % jmpx;
+      }
+      ly = (ly + sp::uy) % jmpy;
+    }
+    lz = (lz + sp::uz) % jmpz;
+    __rt_barrier();
+  }
+  __RT_SSSR_BLOCK_END
+}
diff --git a/sw/saris/stencils/istc.par.hpp b/sw/saris/stencils/istc.par.hpp
new file mode 100644
index 0000000000..37ba6fd4e3
--- /dev/null
+++ b/sw/saris/stencils/istc.par.hpp
@@ -0,0 +1,239 @@
+#include "istc.common.hpp"
+
+// ===============
+// Polybench
+// ===============
+
+// Polybench-style 5-point Jacobi sweep (scalar/parallel reference).
+// NOTE(review): each timestep writes B from A without swapping buffers —
+// presumably intentional for benchmarking; verify against the harness.
+KNL istcp_pb_jacobi_2d(
+  const int cid,
+  TCDM d_t (RCP A)[s::n][s::n],
+  TCDM d_t (RCP B)[s::n][s::n]
+) {
+  KNL_IDS(cid)
+  for (int t = 0; t < st::t; t++) {
+    forpx (y, i, 1, s::n-1)
+    forpex (4, x, j, 1, s::n-1)
+      (*B)[i][j] = 0.2 * ((*A)[i][j] + (*A)[i][j-1] + (*A)[i][1+j] + (*A)[1+i][j] + (*A)[i-1][j]);
+    __rt_barrier();
+  }
+}
+
+
+// ==========
+// AN5D
+// ==========
+
+// AN5D 2D 5-point Jacobi (radius 1), scalar/parallel reference version.
+// Double-buffers between A[0] and A[1] each timestep.
+KNL istcp_an5d_j2d5pt(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+  KNL_IDS(cid)
+  // Avoid constant FP division
+  constexpr d_t fac = 1.0 / c::c0;
+  for (int t = 0; t < st::t; t++) {
+    forpx (y, y, 1, s::ny-1)
+    forpex (4, x, x, 1, s::nx-1)
+      (*A[(t+1)%2])[y][x] = fac * (
+        c::ym[0] * (*A[t%2])[y-1][x ] +
+        c::xm[0] * (*A[t%2])[y ][x-1] +
+        c::cc * (*A[t%2])[y ][x ] +
+        c::xp[0] * (*A[t%2])[y ][x+1] +
+        c::yp[0] * (*A[t%2])[y+1][x ]
+      );
+    __rt_barrier();
+  }
+}
+
+
+// AN5D 2D 9-point star Jacobi (radius 2), scalar/parallel reference
+// version. Double-buffers between A[0] and A[1] each timestep.
+KNL istcp_an5d_j2d9pt(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+  KNL_IDS(cid)
+  // Avoid constant FP division
+  constexpr d_t fac = 1.0 / c::c0;
+  for (int t = 0; t < st::t; t++) {
+    forpx (y, y, 2, s::ny-2)
+    forpex (2, x, x, 2, s::nx-2)
+      (*A[(t+1)%2])[y][x] = fac * (
+        c::ym[0] * (*A[t%2])[y-1][x ] + c::ym[1] * (*A[t%2])[y-2][x ] +
+        c::xm[0] * (*A[t%2])[y ][x-1] + c::xm[1] * (*A[t%2])[y ][x-2] +
+        c::cc * (*A[t%2])[y ][x ] +
+        c::xp[0] * (*A[t%2])[y ][x+1] + c::xp[1] * (*A[t%2])[y ][x+2] +
+        c::yp[0] * (*A[t%2])[y+1][x ] + c::yp[1] * (*A[t%2])[y+2][x ]
+      );
+    __rt_barrier();
+  }
+}
+
+
+// AN5D 2D 9-point box Jacobi ("game of life" layout: full 3x3
+// neighborhood), scalar/parallel reference version.
+KNL istcp_an5d_j2d9pt_gol(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+  KNL_IDS(cid)
+  // Avoid constant FP division
+  constexpr d_t fac = 1.0 / c::c0;
+  for (int t = 0; t < st::t; t++) {
+    forpx (y, y, 1, s::ny-1)
+    forpex (2, x, x, 1, s::nx-1) {
+      d_t acc = 0.0;
+      #pragma unroll
+      for (int dy = -1; dy <= 1; ++dy)
+      #pragma unroll
+      for (int dx = -1; dx <= 1; ++dx)
+        acc += c::c[dy+1][dx+1] * (*A[t%2])[y+dy][x+dx];
+      (*A[(t+1)%2])[y][x] = fac * acc;
+    }
+    __rt_barrier();
+  }
+}
+
+
+// AN5D 2D star stencil of radius ci::r, scalar/parallel reference
+// version (counterpart of the ISSR kernel istci_an5d_star2dXr).
+KNL istcp_an5d_star2dXr(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+  KNL_IDS(cid)
+  for (int t = 0; t < st::t; t++) {
+    forpx (y, y, ci::r, s::ny-ci::r)
+    forpx (x, x, ci::r, s::nx-ci::r) {
+      d_t acc = c::cc * (*A[t%2])[y][x];
+      #pragma unroll
+      for (int dr = 0; dr < ci::r; ++dr) {
+        acc += c::xm[dr] * (*A[t%2])[y][x-1-dr];
+        acc += c::xp[dr] * (*A[t%2])[y][x+1+dr];
+        acc += c::ym[dr] * (*A[t%2])[y-1-dr][x];
+        acc += c::yp[dr] * (*A[t%2])[y+1+dr][x];
+      }
+      (*A[(t+1)%2])[y][x] = acc;
+    }
+    __rt_barrier();
+  }
+}
+
+
+// AN5D 2D box stencil of radius ci::r, scalar/parallel reference
+// version (counterpart of the ISSR kernel istci_an5d_box2dXr).
+KNL istcp_an5d_box2dXr(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::ny][s::nx]
+) {
+  KNL_IDS(cid)
+  for (int t = 0; t < st::t; t++) {
+    forpx (y, y, ci::r, s::ny-ci::r)
+    forpx (x, x, ci::r, s::nx-ci::r) {
+      d_t acc = 0.0;
+      #pragma unroll
+      for (int dy = -ci::r; dy <= ci::r; ++dy)
+      #pragma unroll
+      for (int dx = -ci::r; dx <= ci::r; ++dx)
+        acc += c::c[dy+ci::r][dx+ci::r] * (*A[t%2])[y+dy][x+dx];
+      (*A[(t+1)%2])[y][x] = acc;
+    }
+    __rt_barrier();
+  }
+}
+
+
+// AN5D 3D star stencil of radius ci::r, scalar/parallel reference
+// version (counterpart of the ISSR kernel istci_an5d_star3dXr).
+KNL istcp_an5d_star3dXr(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx]
+) {
+  KNL_IDS(cid)
+  for (int t = 0; t < st::t; t++) {
+    forpx (z, z, ci::r, s::nz-ci::r)
+    forpx (y, y, ci::r, s::ny-ci::r)
+    forpx (x, x, ci::r, s::nx-ci::r) {
+      d_t acc = c::cc * (*A[t%2])[z][y][x];
+      #pragma unroll
+      for (int dr = 0; dr < ci::r; ++dr) {
+        acc += c::xm[dr] * (*A[t%2])[z][y][x-1-dr];
+        acc += c::xp[dr] * (*A[t%2])[z][y][x+1+dr];
+        acc += c::ym[dr] * (*A[t%2])[z][y-1-dr][x];
+        acc += c::yp[dr] * (*A[t%2])[z][y+1+dr][x];
+        acc += c::zm[dr] * (*A[t%2])[z-1-dr][y][x];
+        acc += c::zp[dr] * (*A[t%2])[z+1+dr][y][x];
+      }
+      (*A[(t+1)%2])[z][y][x] = acc;
+    }
+    __rt_barrier();
+  }
+}
+
+
+// AN5D 3D box stencil of radius ci::r, scalar/parallel reference
+// version (counterpart of the ISSR kernel istci_an5d_box3dXr).
+KNL istcp_an5d_box3dXr(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx]
+) {
+  KNL_IDS(cid)
+  for (int t = 0; t < st::t; t++) {
+    forpx (z, z, ci::r, s::nz-ci::r)
+    forpx (y, y, ci::r, s::ny-ci::r)
+    forpx (x, x, ci::r, s::nx-ci::r) {
+      d_t acc = 0.0;
+      // (outermost dz loop deliberately left without #pragma unroll)
+      for (int dz = -ci::r; dz <= ci::r; ++dz)
+      #pragma unroll
+      for (int dy = -ci::r; dy <= ci::r; ++dy)
+      #pragma unroll
+      for (int dx = -ci::r; dx <= ci::r; ++dx)
+        acc += c::c3[dz+ci::r][dy+ci::r][dx+ci::r] * (*A[t%2])[z+dz][y+dy][x+dx];
+      (*A[(t+1)%2])[z][y][x] = acc;
+    }
+    __rt_barrier();
+  }
+}
+
+
+// AN5D 3D 27-point Jacobi (full 3x3x3 box, radius 1), scalar/parallel
+// reference version (counterpart of the ISSR kernel istci_an5d_j3d27pt).
+KNL istcp_an5d_j3d27pt(
+  const int cid,
+  TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx]
+) {
+  KNL_IDS(cid)
+  // Avoid constant FP division
+  constexpr d_t fac = 1.0 / c::c0;
+  for (int t = 0; t < st::t; t++) {
+    forpx (z, z, 1, s::nz-1)
+    forpx (y, y, 1, s::ny-1)
+    forpx (x, x, 1, s::nx-1) {
+      d_t acc = 0.0;
+      for (int dz = -1; dz <= 1; ++dz)
+      #pragma unroll
+      for (int dy = -1; dy <= 1; ++dy)
+      #pragma unroll
+      for (int dx = -1; dx <= 1; ++dx)
+        acc += c::c3[dz+1][dy+1][dx+1] * (*A[t%2])[z+dz][y+dy][x+dx];
+      (*A[(t+1)%2])[z][y][x] = fac * acc;
+    }
+    __rt_barrier();
+  }
+}
+
+// =============
+// Minimod
+// =============
+
+// Minimod acoustic isotropic constant-density kernel, scalar/parallel
+// reference version (counterpart of istci_minimod_acoustic_iso_cd):
+// radius-4 star Laplacian on u[t%2] plus impulse f, with the new value
+// computed as lapl - u_new_buffer (previous-previous timestep).
+KNL istcp_minimod_acoustic_iso_cd(
+  const int cid,
+  TCDM d_t (RCP u[2])[s::nz][s::ny][s::nx],
+  TCDM d_t (RCP f)[s::nz-8][s::ny-8][s::nx-8]
+) {
+  KNL_IDS(cid)
+  constexpr uint32_t rad = 4;
+  // Compute coefficient of center point in the grid element type; a
+  // `float` intermediate here would needlessly truncate the coefficient
+  // relative to the d_t arithmetic used everywhere else.
+  constexpr d_t cc = 2 * (c::xp[0] + c::yp[0] + c::zp[0]);
+  for (int t = 0; t < st::t; t++) {
+    forpx (z, z, rad, s::nz-rad)
+    forpx (y, y, rad, s::ny-rad)
+    forpx (x, x, rad, s::nx-rad) {
+      // Initialize with incorporated impulse (has optional factor)
+      d_t lapl = c::uffac * (*f)[z-rad][y-rad][x-rad];
+      // Compute Laplacian
+      lapl += cc * (*u[t%2])[z][y][x];
+      for (int m = 1; m <= rad; ++m)
+        lapl += c::xp[m] * ((*u[t%2])[z][y][x-m] + (*u[t%2])[z][y][x+m]) +
+                c::yp[m] * ((*u[t%2])[z][y-m][x] + (*u[t%2])[z][y+m][x]) +
+                c::zp[m] * ((*u[t%2])[z-m][y][x] + (*u[t%2])[z+m][y][x]);
+      (*u[(t+1)%2])[z][y][x] = lapl - (*u[(t+1)%2])[z][y][x];
+    }
+    __rt_barrier();
+  }
+}
diff --git a/sw/saris/util/eval.cpp.tpl b/sw/saris/util/eval.cpp.tpl
new file mode 100644
index 0000000000..ad22b628ba
--- /dev/null
+++ b/sw/saris/util/eval.cpp.tpl
@@ -0,0 +1,55 @@
+#include "runtime.hpp"
+#include "istc.par.hpp"
+#include "istc.issr.hpp"
+
+${datadecls}
+${bundledecls}
+
+${ctgrids}
+
+${ciparams}
+
+TCDMDECL volatile uint32_t err_sema = 0;
+
+// Evaluation harness entry point (Mako template). The last core acts as
+// the DMA/management core (runs the generated DMA transfers between
+// barriers); all other cores run the generated kernel calls between
+// timer reads, then all cores touch/check grids and return err_sema.
+EXTERN_C int smain(uint32_t core_id, uint32_t core_num, void* tcdm_start, void* tcdm_end) {
+
+  // Kick DMCC
+  if (core_id == core_num-1) {
+    __rt_barrier();
+
+% for i in range(nbarriers):
+    // Kernel ${i}
+${indent(dma_transfers, " "*8)}
+    __rt_barrier();
+% endfor
+    goto past_knl;
+  }
+
+  __rt_barrier();
+  __rt_get_timer();
+% for k in kernels:
+  ${k[1]};
+  __rt_get_timer();
+% endfor
+
+past_knl:
+% for name, touch in touches.items():
+  if (core_id == 0) printf("touching `${name}`\n");
+  __istc_touch_grid(
+    core_id, core_num, ${touch['stride']},
+    ${touch['ptr']}, ${touch['len']}, &err_sema
+  );
+% endfor
+% for i, check in enumerate(checks):
+  if (core_id == 0) printf("Performing check ${i}\n");
+  __istc_cmp_grids(
+    core_id, core_num, ${check['stride']},
+    ${check['a']}, ${check['b']}, ${check['len']}, ${check['eps']},
+    &err_sema
+  );
+% endfor
+
+  return err_sema;
+}
+
+${datainits}
diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py
new file mode 100644
index 0000000000..4af67d6e74
--- /dev/null
+++ b/sw/saris/util/evalgen.py
@@ -0,0 +1,312 @@
+import sys
+import json
+import numpy as np
+from textwrap import indent
+from mako.template import Template
+
+
+CHECK_DEF_STRIDE = 17
+CHECK_DEF_EPS = 1e-7
+ELEMTYPE = 'double'
+ELEMS_PER_ROW = 4
+
+# Keep these dimensions aligned with code headers
+GRID_DIMS = {
+ 1: { 's': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192 },
+ 2: { 's': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128 },
+ 3: { 's': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32 },
+}
+
+CSTRUCT_FMT = 'struct TCDMSPC {prname} {{\n{body}\n}};\n{dtype} {decls};'
+
+CTSTRUCT_FTYPE = 'TCDM PRMD'
+CTSTRUCT_DTYPE = 'TCDM PRMXD'
+
+CTSTRUCT_DEFAULT_GRIDS = {
+ 'xm': {'seed': 1513, 'dims': [8]},
+ 'xp': {'seed': 1514, 'dims': [8]},
+ 'ym': {'seed': 1515, 'dims': [8]},
+ 'yp': {'seed': 1516, 'dims': [8]},
+ 'zm': {'seed': 1517, 'dims': [8]},
+ 'zp': {'seed': 1518, 'dims': [8]},
+ 'cc': {'seed': 1519},
+ 'c0': {'seed': 1520},
+ 'uffac': {'seed': 1521},
+ 'c': {'seed': 1522, 'dims': [6, 6]},
+ 'c3': {'seed': 1523, 'dims': [3, 3, 3]}
+}
+
+CISTRUCT_FTYPE = 'TCDM PRM'
+CISTRUCT_DTYPE = 'TCDM PRMX'
+
+
+def set_seed(seed: int = None):
+    """Seed NumPy's global RNG; a None seed leaves the RNG state untouched."""
+    if seed is not None:
+        np.random.seed(seed)
+
+
+def resolve_dim(dim: str) -> int:
+ try:
+ ret = int(dim)
+ except ValueError:
+ # If the string does not match our expectations, this will throw accordingly
+ return GRID_DIMS[int(dim[0])][dim[1:]]
+ if ret <= 0:
+ raise ValueError(f'Dimensions must be bigger than 1 (got {ret})')
+ return ret
+
+
+def resolve_dims(grid_args: list) -> list:
+ return [resolve_dim(dim) for dim in grid_args]
+
+
+def gen_subscripts(int_dims: list) -> str:
+ return "".join(f'[{d}]' for d in int_dims)
+
+
+def resolve_check(check: dict, grids: dict):
+ # Set defaults as needed
+ if 'eps' not in check:
+ check['eps'] = CHECK_DEF_EPS
+ if 'stride' not in check:
+ check['stride'] = CHECK_DEF_STRIDE
+ # Resolve grids
+ for grid in ('a', 'b'):
+ # If either comparison reference is a known grid, resolve it and adopt its length
+ gname = check[grid]
+ if gname in grids:
+ dims = resolve_dims(grids[gname]['dims'])
+ check[grid] = f'&{gname}' + '[0]'*len(dims)
+ tgt_len = np.product(dims)
+ if 'len' in check:
+ assert check['len'] == tgt_len, \
+ f'Mismatching grid check lengths: {tgt_len} ({grids[gname]}) vs {check["len"]}'
+ else:
+ check['len'] = tgt_len
+ # Make sure we have a length now
+ assert 'len' in check, f'Could not resolve length for check {check}'
+
+
+def resolve_touches(grids: dict, stride: int = CHECK_DEF_STRIDE) -> dict:
+ ret = {}
+ for name, grid in grids.items():
+ ret[name] = {'stride': stride}
+ # Resolve grid
+ dims = resolve_dims(grid['dims'])
+ ret[name]['ptr'] = f'&{name}' + '[0]'*len(dims)
+ ret[name]['len'] = np.product(dims)
+ return ret
+
+
+# Handles one level of nested array initialization.
+# Recursively emits a brace-initializer string for dimensions int_dims,
+# filled with values from NumPy's (pre-seeded) global RNG, or zeros when
+# `zero` is set. `pos` is the current nesting depth.
+def generate_array_level(int_dims: list, zero: bool, pos: int = 0) -> str:
+    # Handle degenerate scalar case
+    if (len(int_dims) == 0):
+        return str(np.random.normal(size=1)[0] if not zero else 0.0)
+    elif pos == len(int_dims)-1:
+        rand_doubles = np.random.normal(size=int_dims[-1]) if not zero else np.zeros(shape=int_dims[-1])
+        elems = [str(d) for d in rand_doubles]
+        elems_fmt = ",\n".join([", ".join(elems[i:i + ELEMS_PER_ROW])
+                                for i in range(0, len(elems), ELEMS_PER_ROW)])
+    else:
+        elems = [generate_array_level(int_dims, zero, pos+1) for _ in range(int_dims[pos])]
+        elems_fmt = ', '.join(elems)
+    return f'{{\n{indent(elems_fmt, " " * 4*(pos+1))}\n}}'
+
+
+# Returns declaration and initialization separately
+# (extern declarations and seeded definitions for each grid; a seed of 0
+# zero-initializes the grid instead of using random data).
+def generate_grids(grids: dict) -> (str, str):
+    decls = []
+    inits = []
+    for name, args in grids.items():
+        # First argument provides generation seed
+        set_seed(args['seed'])
+        int_dims = resolve_dims(args['dims'])
+        subscripts = gen_subscripts(int_dims)
+        # 'attrs' is an optional attribute-string prefix for the grid.
+        attrs = (args['attrs'] + ' ') if 'attrs' in args else ''
+        decls.append(f'extern __attribute__((visibility("default"))) {attrs}{ELEMTYPE} {name}{subscripts};')
+        inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};')
+    return '\n'.join(decls), '\n'.join(inits)
+
+
+# Returns the instantiation of a parameter static class
+# (seeded compile-time constant arrays/scalars; 'dims' is optional and
+# defaults to a scalar member).
+def generate_ctstruct(grids: dict, prname = 'ct') -> str:
+    body = []
+    decls = []
+    for name, args in grids.items():
+        # First argument provides generation seed
+        set_seed(args['seed'])
+        int_dims = resolve_dims(args['dims']) if 'dims' in args else []
+        subscripts = gen_subscripts(int_dims)
+        body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};')
+        decls.append(f'{prname}::{name}{subscripts}')
+    return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CTSTRUCT_DTYPE, decls=", ".join(decls))
+
+
+# Returns the instantiation of a parameter static class
+# (literal name/value parameter pairs, e.g. stencil radius 'r').
+def generate_cistruct(params: dict, prname = 'ci') -> str:
+    body = []
+    decls = []
+    for lval, rval in params.items():
+        body.append(f'{CISTRUCT_FTYPE} {lval} = {rval};')
+        decls.append(f'{prname}::{lval}')
+    return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CISTRUCT_DTYPE, decls=", ".join(decls))
+
+
+# Returns declaration and initialization separately
+def generate_bundles(bundles: dict, grids: dict) -> str:
+ decls = []
+ for name, grid_names in bundles.items():
+ int_dims = resolve_dims(grids[grid_names[0]]['dims'])
+ if any(int_dims != resolve_dims(grids[g]['dims']) for g in grid_names[1:]):
+ raise ValueError(f'Bundle {name} has mismatching grid dimensions')
+ attrs = grids[grid_names[0]]['attrs']
+ if any(attrs != grids[g]['attrs'] for g in grid_names[1:]):
+ raise ValueError(f'Bundle {name} has mismatching attributes')
+ attrs = (attrs + ' ') if attrs else ''
+ decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}]){gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};')
+ return '\n'.join(decls)
+
+
+# Returns a code snippet performing a DMA out transfer
+# (interior of src_grid, skipping a halo of `radius`, into dst_grid;
+# 3D grids wrap the 2D transfer in a per-plane loop).
+# NOTE(review): grid dims are indexed as dims[0]=row length, dims[1]=rows,
+# dims[2]=planes — confirm this ordering against the grid specs.
+def generate_dma_out(dst_grid: dict, src_grid: dict, radius: int) -> str:
+    ndim = len(dst_grid['dims'])
+    assert ndim == 3 or ndim == 2, 'Only 2D and 3D grids supported'
+
+    dst_dims = resolve_dims(dst_grid['dims'])
+    src_dims = resolve_dims(src_grid['dims'])
+
+    args = []
+    subscripts = f'[{radius}][{radius}]'
+    if ndim == 3:
+        subscripts = f'[{radius} + i]{subscripts}'
+    args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst
+    args.append(f'(void *)&({src_grid["uid"]}{subscripts})') # src
+    args.append(f'{src_dims[0] - radius * 2} * sizeof(double)') # size
+    args.append(f'{src_dims[0]} * sizeof(double)') # src_stride
+    args.append(f'{dst_dims[0]} * sizeof(double)') # dst_stride
+    args.append(f'{src_dims[1] - radius * 2}') # repeat
+    args = ',\n'.join(args)
+
+    dma_call = f'__rt_dma_start_2d(\n{indent(args, " "*4)}\n);'
+    dma_transfer = f'{dma_call}\n'
+
+    if ndim == 3:
+        loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n'
+        return loop
+    else:
+        return dma_transfer
+
+
+# Returns a code snippet performing a DMA in transfer
+# (full dst_grid, including halo, filled from src_grid; 3D grids wrap
+# the 2D transfer in a per-plane loop).
+# NOTE(review): grid dims are indexed as dims[0]=row length, dims[1]=rows,
+# dims[2]=planes — confirm this ordering against the grid specs.
+def generate_dma_in(dst_grid: dict, src_grid: dict, radius: int) -> str:
+    ndim = len(dst_grid['dims'])
+    assert ndim == 3 or ndim == 2, 'Only 2D and 3D grids supported'
+
+    dst_dims = resolve_dims(dst_grid['dims'])
+    src_dims = resolve_dims(src_grid['dims'])
+
+    args = []
+    subscripts = f'[0][0]'
+    if ndim == 3:
+        subscripts = f'[i]{subscripts}'
+    args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst
+    args.append(f'(void *)&({src_grid["uid"]}{subscripts})') # src
+    args.append(f'{dst_dims[0]} * sizeof(double)') # size
+    args.append(f'{src_dims[0]} * sizeof(double)') # src_stride
+    args.append(f'{dst_dims[0]} * sizeof(double)') # dst_stride
+    args.append(f'{dst_dims[1]}') # repeat
+    args = ',\n'.join(args)
+
+    dma_call = f'__rt_dma_start_2d(\n{indent(args, " "*4)}\n);'
+    dma_transfer = f'{dma_call}\n'
+
+    if ndim == 3:
+        loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n'
+        return loop
+    else:
+        return dma_transfer
+
+
+# Returns a grid dictionary from the grids dictionary,
+# where the key in the grids dictionary is appended to the value
+# as the 'uid' field.
+def get_grid(grids: dict, grid_uid: str) -> tuple:
+ grid = grids[grid_uid]
+ grid['uid'] = grid_uid
+ return grid
+
+
+def resolve_dma_transfers(transfers: list, radius: int) -> list:
+ # Uniformize single transfer and multiple transfer cases
+ if not isinstance(transfers[0], list):
+ transfers = [transfers]
+ # Expand bidirectional transfers into unidirectional transfers
+ unidir_transfers = []
+ for transfer in transfers:
+ if len(transfer) < 3:
+ unidir_transfers.append([*transfer, "in"])
+ unidir_transfers.append([*transfer, "out"])
+ else:
+ unidir_transfers.append(transfer)
+ # Add default radius if absent
+ for transfer in unidir_transfers:
+ if len(transfer) < 4:
+ transfer.append(radius)
+ return unidir_transfers
+
+
+# Returns a code snippet performing DMA transfers
+def generate_dma_transfers(grids: dict, transfers: list) -> str:
+ s = ''
+ for transfer in transfers:
+ l1_grid_name, l3_grid_name, direction, radius = transfer
+ l1_grid = get_grid(grids, l1_grid_name)
+ l3_grid = get_grid(grids, l3_grid_name)
+ if direction == 'out':
+ s += generate_dma_out(l3_grid, l1_grid, radius)
+ elif direction == 'in':
+ s += generate_dma_in(l1_grid, l3_grid, radius)
+ else:
+ raise ValueError()
+ s += '\n__rt_dma_wait_all();'
+ return s
+
+
+def main(cfg_file: str, tpl_file: str, program: str):
+ # Load programs to generate from config
+ with open(cfg_file) as f:
+ progs = json.load(f)
+ # Generate code for test program according to its config entry
+ cfg = progs[program]
+ grids = cfg['grids']
+ cfg['datadecls'], cfg['datainits'] = generate_grids(grids)
+ cfg['bundledecls'] = ""
+ if 'bundles' in cfg:
+ cfg['bundledecls'] = generate_bundles(cfg['bundles'], grids)
+ ctgrids = CTSTRUCT_DEFAULT_GRIDS;
+ if 'ctgrids' in cfg:
+ ctgrids.update(cfg['ctgrids'])
+ cfg['ctgrids'] = generate_ctstruct(ctgrids)
+ cfg['ciparams'] = ""
+ if 'params' in cfg:
+ cfg['ciparams'] = generate_cistruct(cfg['params'])
+ if 'checks' not in cfg:
+ cfg['checks'] = []
+ for check in cfg['checks']:
+ resolve_check(check, grids)
+ cfg['touches'] = {}
+ if 'touch' in cfg:
+ touches = {grid_name: grids[grid_name] for grid_name in cfg['touch']}
+ cfg['touches'] = resolve_touches(touches)
+ cfg['dma_transfers'] = ''
+ if 'dma' in cfg:
+ transfers = resolve_dma_transfers(cfg['dma'], cfg['radius'])
+ cfg['dma_transfers'] = generate_dma_transfers(grids, transfers)
+ cfg["nbarriers"] = sum(k[0] for k in cfg['kernels'])
+ cfg['indent'] = indent
+ print(Template(filename=tpl_file).render(**cfg))
+
+
+if __name__ == '__main__':
+ main(*sys.argv[1:])
From e73ef430ffa8355c0d9877ec8c2cd14b7d77dbbb Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 16:38:47 +0200
Subject: [PATCH 04/10] sw/saris: Fix license headers
---
sw/saris/runtime/crt0.S | 4 ++++
sw/saris/runtime/dma.h | 4 ++++
sw/saris/runtime/link.ld | 4 ++++
sw/saris/runtime/runtime.h | 4 ++++
sw/saris/runtime/runtime.hpp | 4 ++++
sw/saris/runtime/sssr.h | 4 ++++
sw/saris/stencils/istc.common.hpp | 4 ++++
sw/saris/stencils/istc.issr.hpp | 4 ++++
sw/saris/stencils/istc.par.hpp | 4 ++++
sw/saris/util/eval.cpp.tpl | 4 ++++
sw/saris/util/evalgen.py | 5 +++++
11 files changed, 45 insertions(+)
diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S
index 79efb0cbbe..96efe9b49b 100644
--- a/sw/saris/runtime/crt0.S
+++ b/sw/saris/runtime/crt0.S
@@ -1,3 +1,7 @@
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
# HTIF sections
.pushsection .htif,"aw",@progbits;
.align 6; .global tohost; tohost: .dword 0;
diff --git a/sw/saris/runtime/dma.h b/sw/saris/runtime/dma.h
index 80956b0f73..5a664b0ce3 100644
--- a/sw/saris/runtime/dma.h
+++ b/sw/saris/runtime/dma.h
@@ -1,3 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
#pragma once
#include
diff --git a/sw/saris/runtime/link.ld b/sw/saris/runtime/link.ld
index 5788547bdd..13fc1570f9 100644
--- a/sw/saris/runtime/link.ld
+++ b/sw/saris/runtime/link.ld
@@ -1,3 +1,7 @@
+/* Copyright 2024 ETH Zurich and University of Bologna. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
OUTPUT_ARCH( "riscv" )
ENTRY(_start)
diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h
index 883bacb2ae..414fa9e394 100644
--- a/sw/saris/runtime/runtime.h
+++ b/sw/saris/runtime/runtime.h
@@ -1,3 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
#pragma once
#include
diff --git a/sw/saris/runtime/runtime.hpp b/sw/saris/runtime/runtime.hpp
index df501ff20e..b9a60e564a 100644
--- a/sw/saris/runtime/runtime.hpp
+++ b/sw/saris/runtime/runtime.hpp
@@ -1,3 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
#pragma once
// C linkage macros
diff --git a/sw/saris/runtime/sssr.h b/sw/saris/runtime/sssr.h
index 171ccb454f..78fec8f366 100644
--- a/sw/saris/runtime/sssr.h
+++ b/sw/saris/runtime/sssr.h
@@ -1,3 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
#pragma once
// Registers
diff --git a/sw/saris/stencils/istc.common.hpp b/sw/saris/stencils/istc.common.hpp
index 042005a741..e005e39ac7 100644
--- a/sw/saris/stencils/istc.common.hpp
+++ b/sw/saris/stencils/istc.common.hpp
@@ -1,3 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
#include
#include
#include
diff --git a/sw/saris/stencils/istc.issr.hpp b/sw/saris/stencils/istc.issr.hpp
index c74d76b4dc..d81614e36c 100644
--- a/sw/saris/stencils/istc.issr.hpp
+++ b/sw/saris/stencils/istc.issr.hpp
@@ -1,3 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
#include "istc.common.hpp"
// ===============
diff --git a/sw/saris/stencils/istc.par.hpp b/sw/saris/stencils/istc.par.hpp
index 37ba6fd4e3..26a042d05f 100644
--- a/sw/saris/stencils/istc.par.hpp
+++ b/sw/saris/stencils/istc.par.hpp
@@ -1,3 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
#include "istc.common.hpp"
// ===============
diff --git a/sw/saris/util/eval.cpp.tpl b/sw/saris/util/eval.cpp.tpl
index ad22b628ba..edd26e6c5b 100644
--- a/sw/saris/util/eval.cpp.tpl
+++ b/sw/saris/util/eval.cpp.tpl
@@ -1,3 +1,7 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
#include "runtime.hpp"
#include "istc.par.hpp"
#include "istc.issr.hpp"
diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py
index 4af67d6e74..df48f00f3d 100644
--- a/sw/saris/util/evalgen.py
+++ b/sw/saris/util/evalgen.py
@@ -1,3 +1,8 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
import sys
import json
import numpy as np
From d41cd4e595450b77a4743861a523b8ae246c133b Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 16:54:27 +0200
Subject: [PATCH 05/10] sw/saris: Fix python lint
---
sw/saris/util/evalgen.py | 44 ++++++++++++++++++++++++----------------
1 file changed, 27 insertions(+), 17 deletions(-)
diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py
index df48f00f3d..25c2c40b2b 100644
--- a/sw/saris/util/evalgen.py
+++ b/sw/saris/util/evalgen.py
@@ -17,9 +17,9 @@
# Keep these dimensions aligned with code headers
GRID_DIMS = {
- 1: { 's': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192 },
- 2: { 's': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128 },
- 3: { 's': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32 },
+ 1: {'s': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192},
+ 2: {'s': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128},
+ 3: {'s': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32},
}
CSTRUCT_FMT = 'struct TCDMSPC {prname} {{\n{body}\n}};\n{dtype} {decls};'
@@ -85,7 +85,8 @@ def resolve_check(check: dict, grids: dict):
tgt_len = np.product(dims)
if 'len' in check:
assert check['len'] == tgt_len, \
- f'Mismatching grid check lengths: {tgt_len} ({grids[gname]}) vs {check["len"]}'
+                'Mismatching grid check lengths: ' \
+                f'{tgt_len} ({grids[gname]}) vs {check["len"]}'
else:
check['len'] = tgt_len
# Make sure we have a length now
@@ -109,7 +110,8 @@ def generate_array_level(int_dims: list, zero, pos: int = 0) -> str:
if (len(int_dims) == 0):
return str(np.random.normal(size=1)[0] if not zero else 0.0)
elif pos == len(int_dims)-1:
- rand_doubles = np.random.normal(size=int_dims[-1]) if not zero else np.zeros(shape=int_dims[-1])
+ rand_doubles = np.random.normal(size=int_dims[-1]) if \
+ not zero else np.zeros(shape=int_dims[-1])
elems = [str(d) for d in rand_doubles]
elems_fmt = ",\n".join([", ".join(elems[i:i + ELEMS_PER_ROW])
for i in range(0, len(elems), ELEMS_PER_ROW)])
@@ -129,13 +131,15 @@ def generate_grids(grids: dict) -> (str, str):
int_dims = resolve_dims(args['dims'])
subscripts = gen_subscripts(int_dims)
attrs = (args['attrs'] + ' ') if 'attrs' in args else ''
- decls.append(f'extern __attribute__((visibility("default"))) {attrs}{ELEMTYPE} {name}{subscripts};')
- inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};')
+ decls.append('extern __attribute__((visibility("default")))' +
+ f' {attrs}{ELEMTYPE} {name}{subscripts};')
+        inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} = ' +
+                     f'{generate_array_level(int_dims, args["seed"] == 0)};')
return '\n'.join(decls), '\n'.join(inits)
# Returns the instantiation of a parameter static class
-def generate_ctstruct(grids: dict, prname = 'ct') -> str:
+def generate_ctstruct(grids: dict, prname='ct') -> str:
body = []
decls = []
for name, args in grids.items():
@@ -143,19 +147,22 @@ def generate_ctstruct(grids: dict, prname = 'ct') -> str:
set_seed(args['seed'])
int_dims = resolve_dims(args['dims']) if 'dims' in args else []
subscripts = gen_subscripts(int_dims)
- body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};')
+ body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = ' +
+ f'{generate_array_level(int_dims, args["seed"] == 0)};')
decls.append(f'{prname}::{name}{subscripts}')
- return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CTSTRUCT_DTYPE, decls=", ".join(decls))
+ return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4),
+ dtype=CTSTRUCT_DTYPE, decls=", ".join(decls))
# Returns the instantiation of a parameter static class
-def generate_cistruct(params: dict, prname = 'ci') -> str:
+def generate_cistruct(params: dict, prname='ci') -> str:
body = []
decls = []
for lval, rval in params.items():
body.append(f'{CISTRUCT_FTYPE} {lval} = {rval};')
decls.append(f'{prname}::{lval}')
- return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CISTRUCT_DTYPE, decls=", ".join(decls))
+ return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4),
+ dtype=CISTRUCT_DTYPE, decls=", ".join(decls))
# Returns declaration and initialization separately
@@ -169,7 +176,8 @@ def generate_bundles(bundles: dict, grids: dict) -> str:
if any(attrs != grids[g]['attrs'] for g in grid_names[1:]):
raise ValueError(f'Bundle {name} has mismatching attributes')
attrs = (attrs + ' ') if attrs else ''
- decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}]){gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};')
+ decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}])' +
+ f'{gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};')
return '\n'.join(decls)
@@ -197,7 +205,8 @@ def generate_dma_out(dst_grid: tuple, src_grid: tuple, radius: int) -> str:
dma_transfer = f'{dma_call}\n'
if ndim == 3:
- loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n'
+ loop = '#pragma clang loop unroll(disable)\nfor (int i = 0; i < ' + \
+ f'{src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n'
return loop
else:
return dma_transfer
@@ -212,7 +221,7 @@ def generate_dma_in(dst_grid: tuple, src_grid: tuple, radius: int) -> str:
src_dims = resolve_dims(src_grid['dims'])
args = []
- subscripts = f'[0][0]'
+ subscripts = '[0][0]'
if ndim == 3:
subscripts = f'[i]{subscripts}'
args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst
@@ -227,7 +236,8 @@ def generate_dma_in(dst_grid: tuple, src_grid: tuple, radius: int) -> str:
dma_transfer = f'{dma_call}\n'
if ndim == 3:
- loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n'
+ loop = '#pragma clang loop unroll(disable)\nfor (int i = 0; i < ' + \
+ f'{dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n'
return loop
else:
return dma_transfer
@@ -289,7 +299,7 @@ def main(cfg_file: str, tpl_file: str, program: str):
cfg['bundledecls'] = ""
if 'bundles' in cfg:
cfg['bundledecls'] = generate_bundles(cfg['bundles'], grids)
- ctgrids = CTSTRUCT_DEFAULT_GRIDS;
+ ctgrids = CTSTRUCT_DEFAULT_GRIDS
if 'ctgrids' in cfg:
ctgrids.update(cfg['ctgrids'])
cfg['ctgrids'] = generate_ctstruct(ctgrids)
From a62d6e41ef2dbbcbe7950a96cd517978c87b0b08 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 16:59:37 +0200
Subject: [PATCH 06/10] lint: Do not C++ lint SARIS sources
---
.github/workflows/lint.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 6c4f91184b..65159afabd 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -129,6 +129,7 @@ jobs:
- uses: actions/checkout@v3
- uses: DoozyX/clang-format-lint-action@v0.16.2
with:
+ exclude: './sw/saris'
clangFormatVersion: 10
######################
From 31aa679125f0ea9a5180cbae5bb1dfec7621291e Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 18:00:11 +0200
Subject: [PATCH 07/10] sw/saris: Remove stub LLVM from makefile
---
sw/saris/Makefile | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/sw/saris/Makefile b/sw/saris/Makefile
index e9bfb82500..bb1033cd55 100644
--- a/sw/saris/Makefile
+++ b/sw/saris/Makefile
@@ -11,9 +11,13 @@ all:
# Environment #
###############
-# NOTE: This needs to be a specific revision of PULP RISCV LLVM 15:
-# TODO: add commit link here
-LLVM_BINROOT ?= /home/paulsc/dev/llvm-ssr/llvm-iis/install/bin
+# NOTE: the LLVM_BINROOT environment variable must point to a specific revision of PULP RISCV
+# LLVM 15 (see README.md). After compilation, you can set LLVM_BINROOT in your environment, this
+# makefile, or pass it on invocation of `make`.
+ifndef LLVM_BINROOT
+$(error LLVM_BINROOT is not set; please compile the SARIS version of LLVM 15 (see README.md) and set LLVM_BINROOT to its binary location.)
+endif
+
PYTHON3 ?= python3
SARISDIR ?= .
From 2050a2aad569c8046c79be8a9aef5053b5597d69 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 18:00:32 +0200
Subject: [PATCH 08/10] sw/saris: Add README.md
---
sw/saris/README.md | 50 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 49 insertions(+), 1 deletion(-)
diff --git a/sw/saris/README.md b/sw/saris/README.md
index 464090415c..29dd472152 100644
--- a/sw/saris/README.md
+++ b/sw/saris/README.md
@@ -1 +1,49 @@
-# TODO
+# SARIS Stencil Kernels
+
+This directory contains the baseline- and SSSR-accelerated Snitch cluster stencil kernels used in the evaluation section of the paper _"SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers"_. In our paper, we describe how indirect stream register architectures such as SSSRs can significantly accelerate stencil codes.
+
+If you use our code or compare against our work, please cite us:
+
+```
+TODO
+```
+
+> [!IMPORTANT]
+> - Unlike other software in this repository, compiling this code requires a **custom version of the LLVM 15 toolchain** with some extensions and improvements. The source code for this LLVM fork can be found [here](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0).
+> - The generated example programs are only intended to be used **in RTL simulation of an SSSR-extended cluster**, using the custom cluster configuration `cfg/sssr.json`.
+
+## Directory Structure
+
+* `stencils/`: Baseline (`istc.par.hpp`) and SARIS-accelerated (`istc.issr.hpp`) stencil codes.
+* `runtime/`: Additional runtime code and linking configuration needed for compilation.
+* `util/`: Evaluation program generator supporting different grid sizes and kernel calls.
+* `eval.json`: Configuration for test program generator.
+
+## Compile Evaluation Programs
+
+Before you can compile test problems, you need the [SARIS LLVM 15 toolchain](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0) along with `newlib` and `compiler-rt`. The required build steps are outlined [here](https://github.com/pulp-platform/llvm-toolchain-cd/blob/main/README.md).
+
+Then, you can build the test programs specified in `eval.json` by running:
+
+```
+make LLVM_BINROOT=<llvm-install-dir>/bin all
+```
+
+By default, `eval.json` specifies RV32G and SSSR-accelerated test programs for all included stencils as specified in our paper. Binaries are generated in `bin/` and disassembled program dumps in `dump/`.
+
+
+## Run Evaluation Programs
+
+Evaluation programs can only be run in RTL simulation of a Snitch cluster using the configuration `cfg/sssr.json`. For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`:
+
+```
+make CFG_OVERRIDE=cfg/sssr.hjson bin/snitch_cluster.vsim
+```
+
+Then, the built evaluation programs can be run on this simulation setup as usual, for example:
+
+```
+bin/snitch_cluster.vsim ../../sw/saris/bin/istc.pb_jacobi_2d_ml_issr.elf
+```
+
+Performance metrics can be analyzed using the annotating Snitch tracer (`make traces`). In the default evaluation programs, the section of interest is section 2.
From ab4fe304366da849eca8ba1c27c332c817822913 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 19:07:23 +0200
Subject: [PATCH 09/10] sw/saris: Initialize putchar buffer, fix F extension
skip
---
sw/saris/runtime/crt0.S | 18 +++++++++++++-----
sw/saris/runtime/runtime.h | 2 +-
2 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S
index 96efe9b49b..7b3b8644cc 100644
--- a/sw/saris/runtime/crt0.S
+++ b/sw/saris/runtime/crt0.S
@@ -30,10 +30,14 @@ _start:
slli t0, a0, 3
sub sp, sp, t0
- # check if the core has the F-extension
- csrr t0, misa
- andi t0, t0, (1 << 5)
- beqz t0, _clr_ireg
+.globl _putcb
+_init_putcb:
+ la t0, _putcb
+  # Initialize putchar buffer size of each core to 0
+ slli t1, a0, 10
+ add t0, t0, t1
+ sw zero, 0(t0)
+ sw zero, 4(t0)
_skip_dmcc_work:
# Skip the coming two steps unless we are the DMA core
@@ -78,6 +82,11 @@ _dmcc_work_sync:
# Synchronize cores so data is ready
csrr x0, 0x7C2
+ # check if the core has the F-extension
+ csrr t0, misa
+ andi t0, t0, (1 << 5)
+ beqz t0, _clr_ireg
+
# Reset float regs if present
_clr_freg:
fcvt.d.w f0, x0
@@ -158,6 +167,5 @@ _done:
wfi
-.globl _putcb
.section .data._putcb
_putcb:
diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h
index 414fa9e394..072cfecbc0 100644
--- a/sw/saris/runtime/runtime.h
+++ b/sw/saris/runtime/runtime.h
@@ -32,7 +32,7 @@ static inline volatile uint32_t __rt_get_hartid() {
}
// Rudimentary string buffer for putchar calls.
extern uint32_t _putcb;
-#define PUTC_BUFFER_LEN (1024 - sizeof(size_t))
+#define PUTC_BUFFER_LEN (1024 - sizeof(size_t) - 8*sizeof(uint64_t))
typedef struct {
size_t size;
From ea40640bd389721009a76fe4a19977dff68e1923 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Fri, 5 Apr 2024 18:11:37 +0200
Subject: [PATCH 10/10] sw/saris: Switch to, adapt default config, add bib
placeholders
---
README.md | 18 +++
docs/publications.md | 18 +++
sw/saris/README.md | 16 ++-
target/snitch_cluster/cfg/default.hjson | 41 +++++--
target/snitch_cluster/cfg/sssr.hjson | 153 ------------------------
5 files changed, 81 insertions(+), 165 deletions(-)
delete mode 100644 target/snitch_cluster/cfg/sssr.hjson
diff --git a/README.md b/README.md
index 1f7b6459cd..4280d47438 100644
--- a/README.md
+++ b/README.md
@@ -161,3 +161,21 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
```
+
+
+SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers
+
+
+```
+@misc{scheffler2024saris,
+ title={SARIS: Accelerating Stencil Computations on Energy-Efficient
+ RISC-V Compute Clusters with Indirect Stream Registers},
+ author={Paul Scheffler and Luca Colagrande and Luca Benini},
+ year={2024},
+ eprint={},
+ archivePrefix={arXiv},
+ primaryClass={cs.MS}
+}
+```
+
+
diff --git a/docs/publications.md b/docs/publications.md
index e4c86b4c6d..2395b70c73 100644
--- a/docs/publications.md
+++ b/docs/publications.md
@@ -118,4 +118,22 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:
+
+SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers
+
+
+```
+@misc{scheffler2024saris,
+ title={SARIS: Accelerating Stencil Computations on Energy-Efficient
+ RISC-V Compute Clusters with Indirect Stream Registers},
+ author={Paul Scheffler and Luca Colagrande and Luca Benini},
+ year={2024},
+ eprint={},
+ archivePrefix={arXiv},
+ primaryClass={cs.MS}
+}
+```
+
+
+
diff --git a/sw/saris/README.md b/sw/saris/README.md
index 29dd472152..2da223df0d 100644
--- a/sw/saris/README.md
+++ b/sw/saris/README.md
@@ -5,12 +5,20 @@ This directory contains the baseline- and SSSR-accelerated Snitch cluster stenci
If you use our code or compare against our work, please cite us:
```
-TODO
+@misc{scheffler2024saris,
+ title={SARIS: Accelerating Stencil Computations on Energy-Efficient
+ RISC-V Compute Clusters with Indirect Stream Registers},
+ author={Paul Scheffler and Luca Colagrande and Luca Benini},
+ year={2024},
+ eprint={},
+ archivePrefix={arXiv},
+ primaryClass={cs.MS}
+}
```
> [!IMPORTANT]
> - Unlike other software in this repository, compiling this code requires a **custom version of the LLVM 15 toolchain** with some extensions and improvements. The source code for this LLVM fork can be found [here](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0).
-> - The generated example programs are only intended to be used **in RTL simulation of an SSSR-extended cluster**, using the custom cluster configuration `cfg/sssr.json`.
+> - The generated example programs are only intended to be used **in RTL simulation of a default, SSSR-extended cluster**, using the cluster configuration `cfg/default.hjson`.
## Directory Structure
@@ -34,10 +42,10 @@ By default, `eval.json` specifies RV32G and SSSR-accelerated test programs for a
## Run Evaluation Programs
-Evaluation programs can only be run in RTL simulation of a Snitch cluster using the configuration `cfg/sssr.json`. For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`:
+Evaluation programs can only be run in RTL simulation of a Snitch cluster using the default, SSSR-enhanced configuration `cfg/default.hjson`. For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`:
```
-make CFG_OVERRIDE=cfg/sssr.hjson bin/snitch_cluster.vsim
+make CFG_OVERRIDE=cfg/default.hjson bin/snitch_cluster.vsim
```
Then, the built evaluation programs can be run on this simulation setup as usual, for example:
diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson
index adfe7adf9e..2267b57525 100644
--- a/target/snitch_cluster/cfg/default.hjson
+++ b/target/snitch_cluster/cfg/default.hjson
@@ -16,6 +16,7 @@
cluster_base_hartid: 0,
addr_width: 48,
data_width: 64,
+ user_width: 5, // clog2(total number of clusters)
tcdm: {
size: 128,
banks: 32,
@@ -24,14 +25,28 @@
zero_mem_size: 64, // kB
alias_region_enable: true,
dma_data_width: 512,
- dma_axi_req_fifo_depth: 3,
- dma_req_fifo_depth: 3,
+ dma_axi_req_fifo_depth: 24,
+ dma_req_fifo_depth: 8,
+ narrow_trans: 4,
+ wide_trans: 32,
+ dma_user_width: 1,
+ // We don't need Snitch debugging in Occamy
+ enable_debug: false,
+ // We don't need Snitch (core-internal) virtual memory support
+ vm_support: false,
+ // Memory configuration inputs
+ sram_cfg_expose: true,
+ sram_cfg_fields: {
+ ema: 3,
+ emaw: 2,
+ emas: 1
+ },
// Timing parameters
timing: {
- lat_comp_fp32: 3,
+ lat_comp_fp32: 2,
lat_comp_fp64: 3,
- lat_comp_fp16: 2,
- lat_comp_fp16_alt: 2,
+ lat_comp_fp16: 1,
+ lat_comp_fp16_alt: 1,
lat_comp_fp8: 1,
lat_comp_fp8_alt: 1,
lat_noncomp: 1,
@@ -44,7 +59,10 @@
register_core_req: true,
register_core_rsp: true,
register_offload_req: true,
- register_offload_rsp: true
+ register_offload_rsp: true,
+ register_fpu_req: true,
+ register_ext_narrow: false,
+ register_ext_wide: false
},
hives: [
// Hive 0
@@ -94,6 +112,7 @@
xf8alt: true,
xfdotp: true,
xfvec: true,
+ ssr_nr_credits: 4,
num_int_outstanding_loads: 1,
num_int_outstanding_mem: 4,
num_fp_outstanding_loads: 4,
@@ -101,8 +120,14 @@
num_sequencer_instructions: 16,
num_dtlb_entries: 1,
num_itlb_entries: 1,
- // Enable division/square root unit
- // Xdiv_sqrt: true,
+ // SSSR configuration below
+ ssr_intersection: true,
+ ssr_intersection_triple: [0, 1, 2],
+ ssrs: [
+ {indirection: true}, // Master 0
+ {indirection: true}, // Master 1
+ {}, // Slave
+ ],
},
dma_core_template: {
isa: "rv32imafd",
diff --git a/target/snitch_cluster/cfg/sssr.hjson b/target/snitch_cluster/cfg/sssr.hjson
deleted file mode 100644
index ee297960a9..0000000000
--- a/target/snitch_cluster/cfg/sssr.hjson
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright 2023 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Occamy-like Cluster configuration (+alias) for SSSR experiments
-{
- nr_s1_quadrant: 1,
- s1_quadrant: {
- nr_clusters: 1,
- },
-
- cluster: {
- boot_addr: 4096, // 0x1000
- cluster_base_addr: 268435456, // 0x1000_0000
- cluster_base_offset: 0, // 0x0
- cluster_base_hartid: 0,
- addr_width: 48,
- data_width: 64,
- user_width: 5, // clog2(total number of clusters)
- tcdm: {
- size: 128,
- banks: 32,
- },
- cluster_periph_size: 64, // kB
- zero_mem_size: 64, // kB
- alias_region_enable: true,
- dma_data_width: 512,
- dma_axi_req_fifo_depth: 24,
- dma_req_fifo_depth: 8,
- narrow_trans: 4,
- wide_trans: 32,
- dma_user_width: 1,
- // We don't need Snitch debugging in Occamy
- enable_debug: false,
- // We don't need Snitch (core-internal) virtual memory support
- vm_support: false,
- // Memory configuration inputs
- sram_cfg_expose: true,
- sram_cfg_fields: {
- ema: 3,
- emaw: 2,
- emas: 1
- },
- // Timing parameters
- timing: {
- lat_comp_fp32: 2,
- lat_comp_fp64: 3,
- lat_comp_fp16: 1,
- lat_comp_fp16_alt: 1,
- lat_comp_fp8: 1,
- lat_comp_fp8_alt: 1,
- lat_noncomp: 1,
- lat_conv: 2,
- lat_sdotp: 3,
- fpu_pipe_config: "BEFORE",
- narrow_xbar_latency: "CUT_ALL_PORTS",
- wide_xbar_latency: "CUT_ALL_PORTS",
- // Isolate the core.
- register_core_req: true,
- register_core_rsp: true,
- register_offload_req: true,
- register_offload_rsp: true,
- register_fpu_req: true,
- register_ext_narrow: false,
- register_ext_wide: false
- },
- hives: [
- // Hive 0
- {
- icache: {
- size: 8, // total instruction cache size in kByte
- sets: 2, // number of ways
- cacheline: 256 // word size in bits
- },
- cores: [
- { $ref: "#/compute_core_template" },
- { $ref: "#/compute_core_template" },
- { $ref: "#/compute_core_template" },
- { $ref: "#/compute_core_template" },
- { $ref: "#/compute_core_template" },
- { $ref: "#/compute_core_template" },
- { $ref: "#/compute_core_template" },
- { $ref: "#/compute_core_template" },
- { $ref: "#/dma_core_template" },
- ]
- }
- ]
- },
- dram: {
- // 0x8000_0000
- address: 2147483648,
- // 0x8000_0000
- length: 2147483648
- },
- peripherals: {
- clint: {
- // 0xffff_0000
- address: 4294901760,
- // 0x0000_1000
- length: 4096
- },
- },
- // Templates.
- compute_core_template: {
- isa: "rv32imafd",
- xssr: true,
- xfrep: true,
- xdma: false,
- xf16: true,
- xf16alt: true,
- xf8: true,
- xf8alt: true,
- xfdotp: true,
- xfvec: true,
- ssr_nr_credits: 4,
- num_int_outstanding_loads: 1,
- num_int_outstanding_mem: 4,
- num_fp_outstanding_loads: 4,
- num_fp_outstanding_mem: 4,
- num_sequencer_instructions: 16,
- num_dtlb_entries: 1,
- num_itlb_entries: 1,
- // SSSR configuration below
- ssr_intersection: true,
- ssr_intersection_triple: [0, 1, 2],
- ssrs: [
- {indirection: true}, // Master 0
- {indirection: true}, // Master 1
- {}, // Slave
- ],
- },
- dma_core_template: {
- isa: "rv32imafd",
- // Xdiv_sqrt: true,
- // isa: "rv32ema",
- xdma: true,
- xssr: false,
- xfrep: false,
- xf16: false,
- xf16alt: false,
- xf8: false,
- xf8alt: false,
- xfdotp: false,
- xfvec: false,
- num_int_outstanding_loads: 1,
- num_int_outstanding_mem: 4,
- num_fp_outstanding_loads: 4,
- num_fp_outstanding_mem: 4,
- num_sequencer_instructions: 16,
- num_dtlb_entries: 1,
- num_itlb_entries: 1,
- }
-}