From 7bdd312382f5630de2b98f1dd696b7277ecfa2cf Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Fri, 22 Mar 2024 22:10:18 +0100 Subject: [PATCH 01/10] hw: Keep IO fixed regardless of configuration --- .../src/snitch_cluster_wrapper.sv.tpl | 16 ++++++---------- target/snitch_cluster/test/testharness.sv | 4 ++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index 293417ff68..c40f504406 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -210,28 +210,24 @@ ${ssr_cfg(core, "'{{{indirection:d}, {isect_master:d}, {isect_master_idx:d}, {is ${ssr_cfg(core, '{reg_idx}', '/*None*/ 0', ',')}\ }; + // Forward potentially optional configuration parameters + localparam logic [9:0] CfgBaseHartId = (${to_sv_hex(cfg['cluster_base_hartid'], 10)}); + localparam addr_t CfgClusterBaseAddr = (${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])}); + endpackage // verilog_lint: waive-stop package-filename module ${cfg['name']}_wrapper ( input logic clk_i, input logic rst_ni, -% if cfg['enable_debug']: input logic [${cfg['pkg_name']}::NrCores-1:0] debug_req_i, -% endif input logic [${cfg['pkg_name']}::NrCores-1:0] meip_i, input logic [${cfg['pkg_name']}::NrCores-1:0] mtip_i, input logic [${cfg['pkg_name']}::NrCores-1:0] msip_i, -% if cfg['cluster_base_expose']: input logic [9:0] hart_base_id_i, input logic [${cfg['addr_width']-1}:0] cluster_base_addr_i, -% endif -% if cfg['timing']['iso_crossings']: input logic clk_d2_bypass_i, -% endif -% if cfg['sram_cfg_expose']: input ${cfg['pkg_name']}::sram_cfgs_t sram_cfgs_i, -%endif input ${cfg['pkg_name']}::narrow_in_req_t narrow_in_req_i, output ${cfg['pkg_name']}::narrow_in_resp_t narrow_in_resp_o, output ${cfg['pkg_name']}::narrow_out_req_t narrow_out_req_o, @@ -354,8 +350,8 @@ module ${cfg['name']}_wrapper ( .hart_base_id_i, 
.cluster_base_addr_i, % else: - .hart_base_id_i (${to_sv_hex(cfg['cluster_base_hartid'], 10)}), - .cluster_base_addr_i (${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])}), + .hart_base_id_i (snitch_cluster_pkg::CfgBaseHartId), + .cluster_base_addr_i (snitch_cluster_pkg::CfgClusterBaseAddr), % endif % if cfg['timing']['iso_crossings']: .clk_d2_bypass_i, diff --git a/target/snitch_cluster/test/testharness.sv b/target/snitch_cluster/test/testharness.sv index afc6972ed1..dbde824efc 100644 --- a/target/snitch_cluster/test/testharness.sv +++ b/target/snitch_cluster/test/testharness.sv @@ -29,6 +29,10 @@ module testharness import snitch_cluster_pkg::*; ( .meip_i ('0), .mtip_i ('0), .msip_i (msip), + .hart_base_id_i (CfgBaseHartId), + .cluster_base_addr_i (CfgClusterBaseAddr), + .clk_d2_bypass_i (1'b0), + .sram_cfgs_i (snitch_cluster_pkg::sram_cfgs_t'('0)), .narrow_in_req_i (narrow_in_req), .narrow_in_resp_o (narrow_in_resp), .narrow_out_req_o (narrow_out_req), From ecdc4657dbc78b05657e5bee4608a6e113d3358b Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Fri, 22 Mar 2024 23:03:54 +0100 Subject: [PATCH 02/10] target/snitch_cluster: Add Occamy-like config with SSSRs --- target/snitch_cluster/cfg/sssr.hjson | 153 +++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 target/snitch_cluster/cfg/sssr.hjson diff --git a/target/snitch_cluster/cfg/sssr.hjson b/target/snitch_cluster/cfg/sssr.hjson new file mode 100644 index 0000000000..ee297960a9 --- /dev/null +++ b/target/snitch_cluster/cfg/sssr.hjson @@ -0,0 +1,153 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Occamy-like Cluster configuration (+alias) for SSSR experiments +{ + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 1, + }, + + cluster: { + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + user_width: 5, // clog2(total number of clusters) + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + alias_region_enable: true, + dma_data_width: 512, + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + dma_user_width: 1, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, + // Timing parameters + timing: { + lat_comp_fp32: 2, + lat_comp_fp64: 3, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 2, + lat_sdotp: 3, + fpu_pipe_config: "BEFORE", + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. 
+ register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. + compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + ssr_nr_credits: 4, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + // isa: "rv32ema", + xdma: true, + xssr: false, + xfrep: false, + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } +} 
From 9629cb985f5c7ea03bd2e6cf55a342cd93f5ea67 Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Fri, 22 Mar 2024 23:04:26 +0100 Subject: [PATCH 03/10] sw: Add SARIS kernels --- sw/saris/.gitignore | 3 + sw/saris/Makefile | 126 +++++ sw/saris/README.md | 1 + sw/saris/eval.json | 396 ++++++++++++++ sw/saris/runtime/crt0.S | 159 ++++++ sw/saris/runtime/dma.h | 75 +++ sw/saris/runtime/link.ld | 42 ++ sw/saris/runtime/runtime.h | 137 +++++ sw/saris/runtime/runtime.hpp | 20 + sw/saris/runtime/sssr.h | 189 +++++++ sw/saris/stencils/istc.common.hpp | 181 ++++++ sw/saris/stencils/istc.issr.hpp | 879 ++++++++++++++++++++++++++++++ sw/saris/stencils/istc.par.hpp | 239 ++++++++ sw/saris/util/eval.cpp.tpl | 55 ++ sw/saris/util/evalgen.py | 312 +++++++++++ 15 files changed, 2814 insertions(+) create mode 100644 sw/saris/.gitignore create mode 100644 sw/saris/Makefile create mode 100644 sw/saris/README.md create mode 100644 sw/saris/eval.json create mode 100644 sw/saris/runtime/crt0.S create mode 100644 sw/saris/runtime/dma.h create mode 100644 sw/saris/runtime/link.ld create mode 100644 sw/saris/runtime/runtime.h create mode 100644 sw/saris/runtime/runtime.hpp create mode 100644 sw/saris/runtime/sssr.h create mode 100644 sw/saris/stencils/istc.common.hpp create mode 100644 sw/saris/stencils/istc.issr.hpp create mode 100644 sw/saris/stencils/istc.par.hpp create mode 100644 sw/saris/util/eval.cpp.tpl create mode 100644 sw/saris/util/evalgen.py diff --git a/sw/saris/.gitignore b/sw/saris/.gitignore new file mode 100644 index 0000000000..7d0ba6408d --- /dev/null +++ b/sw/saris/.gitignore @@ -0,0 +1,3 @@ +bin +dump +gen diff --git a/sw/saris/Makefile b/sw/saris/Makefile new file mode 100644 index 0000000000..e9bfb82500 --- /dev/null +++ b/sw/saris/Makefile @@ -0,0 +1,126 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +# Paul Scheffler +# Luca Colagrande + +all: + +############### +# Environment # +############### + +# NOTE: This needs to be a specific revision of PULP RISCV LLVM 15: +# TODO: add commit link here +LLVM_BINROOT ?= /home/paulsc/dev/llvm-ssr/llvm-iis/install/bin +PYTHON3 ?= python3 + +SARISDIR ?= . +GENDIR ?= $(SARISDIR)/gen +UTILDIR ?= $(SARISDIR)/util +BINDIR ?= $(SARISDIR)/bin +DUMPDIR ?= $(SARISDIR)/dump +RTDIR ?= $(SARISDIR)/runtime + +# We depend on the printf submodule +PRINTFDIR ?= $(SARISDIR)/../deps/printf + +############################ +# Compiler (LLVM 15) Setup # +############################ + +RISCV_MARCH ?= \ +rv32imafd_zfh_xfrep_xssr_xdma_xfalthalf_xfquarter_xfaltquarter_xfvecsingle_xfvechalf_$\ +xfvecalthalf_xfvecquarter_xfvecaltquarter_xfauxhalf_xfauxalthalf_xfauxquarter_xfauxaltquarter_$\ +xfauxvecsingle_xfauxvechalf_xfauxvecalthalf_xfauxvecquarter_xfauxvecaltquarter_xfexpauxvechalf_$\ +xfexpauxvecalthalf_xfexpauxvecquarter_xfexpauxvecaltquarter + +RISCV_MABI ?= ilp32d + +RISCV_CC ?= $(LLVM_BINROOT)/clang +RISCV_CXX ?= $(LLVM_BINROOT)/clang++ +RISCV_OBJDUMP ?= $(LLVM_BINROOT)/llvm-objdump +RISCV_STRIP ?= $(LLVM_BINROOT)/llvm-strip + +RISCV_STACK ?= 2048 +RISCV_FLAGS ?= -mcpu=snitch -march=$(RISCV_MARCH) -Ofast -flto -mabi=$(RISCV_MABI) \ + -Wframe-larger-than=$(RISCV_STACK) -nostdlib -mcmodel=medany -I$(RTDIR) \ + -I$(SARISDIR)/stencils -I$(PRINTFDIR) -ffreestanding -fno-builtin \ + -ffunction-sections + +RISCV_CFLAGS ?= $(RISCV_FLAGS) +# Loop unrolling optimization +RISCV_CFLAGS += -mllvm --allow-unroll-and-jam +RISCV_CFLAGS += -mllvm --unroll-allow-partial +RISCV_CFLAGS += -mllvm --unroll-runtime +# Tree height reduction options +RISCV_CFLAGS += -mllvm --enable-fp-thr +RISCV_CFLAGS += -mllvm --thr-max-depth=5 +RISCV_CFLAGS += -mllvm --thr-se-leaves +RISCV_CFLAGS += -mllvm --thr-fuse-bias +RISCV_CFLAGS += -mllvm --thr-se-factor=2 +RISCV_CFLAGS += -mllvm --thr-re-factor=1 +# Machine scheduler and PostRA 
options +RISCV_CFLAGS += -mllvm --post-RA-scheduler +RISCV_CFLAGS += -mllvm --enable-misched +RISCV_CFLAGS += -mllvm --enable-post-misched +RISCV_CFLAGS += -mllvm --misched-postra + +RISCV_CCFLAGS ?= $(RISCV_CFLAGS) -std=gnu11 +RISCV_CXXFLAGS ?= $(RISCV_CFLAGS) -std=gnu++14 +RISCV_LDFLAGS ?= -fuse-ld=$(LLVM_BINROOT)/ld.lld -flto -static -lm $(RISCV_FLAGS) \ + -Wl,--fatal-warnings -Wl,-z,stack-size=$(RISCV_STACK) +RISCV_DMPFLAGS ?= --mcpu=snitch + +############################ +# SARIS Program Build Flow # +############################ + +.SECONDEXPANSION: +.DELETE_ON_ERROR: + +# Extracting word nr. $(1) from $(2)-separated list $(3) +pw = $(word $(1), $(subst $(2), ,$(3))) + +$(GENDIR) $(BINDIR) $(DUMPDIR): + mkdir -p $@ + +$(BINDIR)/crt0.o: $(SARISDIR)/runtime/crt0.S | $(BINDIR) + $(RISCV_CC) $(RISCV_CCFLAGS) -c $< -o $@ + +$(BINDIR)/istc.%.c.o: $(GENDIR)/$$(call pw,1,.,$$*).cpp | $(BINDIR) + $(RISCV_CXX) $(RISCV_CXXFLAGS) -c $< -o $@ + +.PRECIOUS: $(BINDIR)/%.elf +$(BINDIR)/istc.%.elf: $(BINDIR)/istc.%.c.o $(BINDIR)/crt0.o $(RTDIR)/link.ld | $(BINDIR) + $(RISCV_CC) $(RISCV_LDFLAGS) -o $@ $< $(BINDIR)/crt0.o -T$(RTDIR)/link.ld + $(RISCV_STRIP) $@ -g -S -d --strip-debug -R .comment -R .riscv.attributes + +.PRECIOUS: $(DUMPDIR)/%.dump +$(DUMPDIR)/%.dump: $(BINDIR)/%.elf | $(DUMPDIR) + @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .text -d $< >$@ + @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .misc -s $< | tail -n +3 >>$@ + @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .tcdm -s $< | tail -n +3 >>$@ + @$(RISCV_OBJDUMP) $(RISCV_DMPFLAGS) -j .tcdmc -s $< | tail -n +3 >>$@ + +# Phony for program and dump build +prog.%: $(BINDIR)/%.elf $(DUMPDIR)/%.dump + @echo -e '\x1b[44;33;1mBUILT: $*\x1b[0m' + +clean: + rm -rf $(BINDIR) $(DUMPDIR) $(GENDIR) + +############################ +# SARIS Program Generation # +############################ + +.PRECIOUS: $(GENDIR)/%.cpp +$(GENDIR)/%.cpp: $(UTILDIR)/evalgen.py $(SARISDIR)/eval.json $(UTILDIR)/eval.cpp.tpl | $(GENDIR) + $(PYTHON3) $^ $* > $@ + 
+EVAL_NAMES ?= $(shell jq -r 'keys | join(" ")' $(SARISDIR)/eval.json) +ISTC_PROGS += $(patsubst %,istc.%,$(EVAL_NAMES)) + +# Default: compile all SARIS programs in eval.json +all: $(addprefix prog.,$(ISTC_PROGS)) diff --git a/sw/saris/README.md b/sw/saris/README.md new file mode 100644 index 0000000000..464090415c --- /dev/null +++ b/sw/saris/README.md @@ -0,0 +1 @@ +# TODO diff --git a/sw/saris/eval.json b/sw/saris/eval.json new file mode 100644 index 0000000000..f1b102588b --- /dev/null +++ b/sw/saris/eval.json @@ -0,0 +1,396 @@ +{ + + "pb_jacobi_2d_ml_par": { + "radius": 1, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "kernels": [ + [1, "istcp_pb_jacobi_2d(core_id, &Ap2ml, &Bp2ml)"], + [1, "istcp_pb_jacobi_2d(core_id, &Ap2ml, &Bp2ml)"] + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "pb_jacobi_2d_ml_issr": { + "radius": 1, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "kernels": [ + [1, "istci_pb_jacobi_2d(core_id, &Ai2ml, &Bi2ml)"], + [1, "istci_pb_jacobi_2d(core_id, &Ai2ml, &Bi2ml)"] + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_j2d5pt_ml_par": { + "radius": 1, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "kernels": [ + [1, 
"istcp_an5d_j2d5pt(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_j2d5pt(core_id, &Ap22ml[0])"] + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_j2d5pt_ml_issr": { + "radius": 1, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "kernels": [ + [1, "istci_an5d_j2d5pt(core_id, &Ai22ml[0])"], + [1, "istci_an5d_j2d5pt(core_id, &Ai22ml[0])"] + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_j2d9pt_ml_par": { + "radius": 2, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "kernels": [ + [1, "istcp_an5d_j2d9pt(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_j2d9pt(core_id, &Ap22ml[0])"] + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_j2d9pt_ml_issr": { + "radius": 2, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "kernels": [ + [1, "istci_an5d_j2d9pt(core_id, &Ai22ml[0])"], + [1, "istci_an5d_j2d9pt(core_id, &Ai22ml[0])"] + + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_j2d9pt_gol_ml_par": { + "radius": 1, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": 
["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "kernels": [ + [1, "istcp_an5d_j2d9pt_gol(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_j2d9pt_gol(core_id, &Ap22ml[0])"] + + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_j2d9pt_gol_ml_issr": { + "radius": 1, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "kernels": [ + [1, "istci_an5d_j2d9pt_gol(core_id, &Ai22ml[0])"], + [1, "istci_an5d_j2d9pt_gol(core_id, &Ai22ml[0])"] + + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_j3d27pt_ml_par": { + "radius": 1, + "grids": { + "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]}, + "kernels": [ + [1, "istcp_an5d_j3d27pt(core_id, &Ap32ml[0])"], + [1, "istcp_an5d_j3d27pt(core_id, &Ap32ml[0])"] + ], + "touch": ["Ap3ml", "Bp3ml"], + "dma": ["Cp3ml", "Dp3xl"] + }, + + "an5d_j3d27pt_ml_issr": { + "radius": 1, + "grids": { + "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]}, + "kernels": [ + [1, 
"istci_an5d_j3d27pt(core_id, &Ai32ml[0])"], + [1, "istci_an5d_j3d27pt(core_id, &Ai32ml[0])"] + + ], + "touch": ["Ai3ml", "Bi3ml"], + "dma": ["Ci3ml", "Di3xl"] + }, + + + + "an5d_star2d3r_ml_par": { + "radius": 3, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "params": {"r": 3}, + "kernels": [ + [1, "istcp_an5d_star2dXr(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_star2dXr(core_id, &Ap22ml[0])"] + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_star2d3r_ml_issr": { + "radius": 3, + "grids": { + "Ai2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "params": {"r": 3}, + "kernels": [ + [1, "istci_an5d_star2dXr(core_id, &Ai22ml[0])"], + [1, "istci_an5d_star2dXr(core_id, &Ai22ml[0])"] + + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_box2d1r_ml_par": { + "radius": 1, + "grids": { + "Ap2ml": {"seed": 1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bp2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Cp2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Dp2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ap22ml": ["Ap2ml", "Bp2ml"]}, + "params": {"r": 1}, + "kernels": [ + [1, "istcp_an5d_box2dXr(core_id, &Ap22ml[0])"], + [1, "istcp_an5d_box2dXr(core_id, &Ap22ml[0])"] + + ], + "touch": ["Ap2ml", "Bp2ml"], + "dma": ["Cp2ml", "Dp2xl"] + }, + + "an5d_box2d1r_ml_issr": { + "radius": 1, + "grids": { + "Ai2ml": {"seed": 
1337, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Bi2ml": {"seed": 1338, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Ci2ml": {"seed": 1339, "dims": ["2ml", "2ml"], "attrs": "TCDMDECL"}, + "Di2xl": {"seed": 1340, "dims": ["2xl", "2xl"]} + }, + "bundles": {"Ai22ml": ["Ai2ml", "Bi2ml"]}, + "params": {"r": 1}, + "kernels": [ + [1, "istci_an5d_box2dXr(core_id, &Ai22ml[0])"], + [1, "istci_an5d_box2dXr(core_id, &Ai22ml[0])"] + ], + "touch": ["Ai2ml", "Bi2ml"], + "dma": ["Ci2ml", "Di2xl"] + }, + + + + "an5d_star3d2r_ml_par": { + "radius": 2, + "grids": { + "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]}, + "params": {"r": 2}, + "kernels": [ + [1, "istcp_an5d_star3dXr(core_id, &Ap32ml[0])"], + [1, "istcp_an5d_star3dXr(core_id, &Ap32ml[0])"] + ], + "touch": ["Ap3ml", "Bp3ml"], + "dma": ["Cp3ml", "Dp3xl"] + }, + + "an5d_star3d2r_ml_issr": { + "radius": 2, + "grids": { + "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]}, + "params": {"r": 2}, + "kernels": [ + [1, "istci_an5d_star3dXr(core_id, &Ai32ml[0])"], + [1, "istci_an5d_star3dXr(core_id, &Ai32ml[0])"] + + ], + "touch": ["Ai3ml", "Bi3ml"], + "dma": ["Ci3ml", "Di3xl"] + }, + + + + "an5d_box3d1r_ml_par": { + "radius": 1, + "grids": { + "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", 
"3ml"], "attrs": "TCDMDECL"}, + "Dp3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]}, + "params": {"r": 1}, + "kernels": [ + [1, "istcp_an5d_box3dXr(core_id, &Ap32ml[0])"], + [1, "istcp_an5d_box3dXr(core_id, &Ap32ml[0])"] + ], + "touch": ["Ap3ml", "Bp3ml"], + "dma": ["Cp3ml", "Dp3xl"] + }, + + "an5d_box3d1r_ml_issr": { + "radius": 1, + "grids": { + "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Ci3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Di3xl": {"seed": 1340, "dims": ["3xl", "3xl", "3xl"]} + }, + "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]}, + "params": {"r": 1}, + "kernels": [ + [1, "istci_an5d_box3dXr(core_id, &Ai32ml[0])"], + [1, "istci_an5d_box3dXr(core_id, &Ai32ml[0])"] + ], + "touch": ["Ai3ml", "Bi3ml"], + "dma": ["Ci3ml", "Di3xl"] + }, + + + + "minimod_acoustic_iso_cd_ml_par": { + "radius": 4, + "grids": { + "Ap3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bp3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Dp3ml": {"seed": 1339, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "F3ml": {"seed": 1338, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "G3ml": {"seed": 1340, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Hp3xl": {"seed": 1341, "dims": ["3xl", "3xl", "3xl"]}, + "I3xl": {"seed": 1342, "dims": [16, 16, 16]}, + "Ep3ml": {"seed": 1343, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Jp3xl": {"seed": 1344, "dims": [16, 16, 16]} + }, + "bundles": {"Ap32ml": ["Ap3ml", "Bp3ml"]}, + "kernels": [ + [1, "istcp_minimod_acoustic_iso_cd(core_id, &Ap32ml[0], &F3ml)"], + [1, "istcp_minimod_acoustic_iso_cd(core_id, &Ap32ml[0], &F3ml)"] + ], + "touch": ["Ap3ml", "Bp3ml", "F3ml", "Dp3ml"], + "dma": [ + ["Cp3ml", "Hp3xl", "out"], + ["G3ml", "I3xl", "in", 0], 
+ ["Cp3ml", "Hp3xl", "in"], + ["Ep3ml", "Jp3xl", "in"] + ] + }, + + "minimod_acoustic_iso_cd_ml_issr": { + "radius": 4, + "grids": { + "Ai3ml": {"seed": 1337, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Bi3ml": {"seed": 1338, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "Di3ml": {"seed": 1339, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "F3ml": {"seed": 1338, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Cp3ml": {"seed": 1339, "dims": ["3ml", "3ml", "3ml"], "attrs": "TCDMDECL"}, + "G3ml": {"seed": 1340, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Hp3xl": {"seed": 1341, "dims": ["3xl", "3xl", "3xl"]}, + "I3xl": {"seed": 1342, "dims": [16, 16, 16]}, + "Ep3ml": {"seed": 1343, "dims": [8, 8, 8], "attrs": "TCDMDECL"}, + "Jp3xl": {"seed": 1344, "dims": [16, 16, 16]} + }, + "bundles": {"Ai32ml": ["Ai3ml", "Bi3ml"]}, + "kernels": [ + [1, "istci_minimod_acoustic_iso_cd(core_id, &Ai32ml[0], &F3ml)"], + [1, "istci_minimod_acoustic_iso_cd(core_id, &Ai32ml[0], &F3ml)"] + ], + "touch": ["Ai3ml", "Bi3ml", "F3ml", "Di3ml"], + "dma": [ + ["Cp3ml", "Hp3xl", "out"], + ["G3ml", "I3xl", "in", 0], + ["Cp3ml", "Hp3xl", "in"], + ["Ep3ml", "Jp3xl", "in"] + ] + } + +} diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S new file mode 100644 index 0000000000..79efb0cbbe --- /dev/null +++ b/sw/saris/runtime/crt0.S @@ -0,0 +1,159 @@ +# HTIF sections +.pushsection .htif,"aw",@progbits; +.align 6; .global tohost; tohost: .dword 0; +.align 6; .global fromhost; fromhost: .dword 0; + +.globl _start +.section .text._start +_start: + # Set global pointer + .option push + .option norelax + la gp, __global_pointer + .option pop + + # Prepare main arguments for single cluster + csrr a0, mhartid + la a1, __const_num_cores + la a2, __const_tcdm_start + la a3, __const_tcdm_end + + # Set stack pointer; 1KiB per core + # Offset by 8B to prevent bank collisions + slli t0, a0, 10 + addi sp, a3, -8 + sub sp, sp, t0 + slli t0, a0, 3 + sub sp, sp, t0 + + # check if the core has the 
F-extension + csrr t0, misa + andi t0, t0, (1 << 5) + beqz t0, _clr_ireg + +_skip_dmcc_work: + # Skip the coming two steps unless we are the DMA core + # NOTE: this assumes the DMA core being the last in the cluster + addi t0, a1, -1 + bne a0, t0, _dmcc_work_sync + +_preload_tcdm: + # Preload thread-local storage (TCDM) using DMA + la t0, __const_tcdm_losta + la t1, __const_tcdm_loend + sub t3, t1, t0 + # Branch off if no tcdm data + beqz t3, _preload_tcdmc + # Launch copy to base of TCDM + dmsrc t0, zero + dmdst a2, zero + dmcpyi zero, t3, 0 + # Await DMA + 1: + dmstati t0, 2 + bnez t0, 1b + +_preload_tcdmc: + # Preload thread-local storage (TCDM) using DMA + la t0, __const_tcdmc_losta + la t1, __const_tcdmc_loend + sub t3, t1, t0 + # Get tcdmc base, branch off if no tcdmc data + la t2, __const_tcdm_startc + beqz t3, _dmcc_work_sync + # Launch copy to past end of TCDM + dmsrc t0, zero + dmdst t2, zero + dmcpyi zero, t3, 0 + # Await DMA + 1: + dmstati t0, 2 + bnez t0, 1b + +_dmcc_work_sync: + # Synchronize cores so data is ready + csrr x0, 0x7C2 + + # Reset float regs if present +_clr_freg: + fcvt.d.w f0, x0 + fmv.d f1, f0 + fmv.d f2, f0 + fmv.d f3, f0 + fmv.d f4, f0 + fmv.d f5, f0 + fmv.d f6, f0 + fmv.d f7, f0 + fmv.d f8, f0 + fmv.d f9, f0 + fmv.d f10, f0 + fmv.d f11, f0 + fmv.d f12, f0 + fmv.d f13, f0 + fmv.d f14, f0 + fmv.d f15, f0 + fmv.d f16, f0 + fmv.d f17, f0 + fmv.d f18, f0 + fmv.d f19, f0 + fmv.d f20, f0 + fmv.d f10, f0 + fmv.d f21, f0 + fmv.d f22, f0 + fmv.d f23, f0 + fmv.d f24, f0 + fmv.d f25, f0 + fmv.d f26, f0 + fmv.d f27, f0 + fmv.d f28, f0 + fmv.d f29, f0 + fmv.d f30, f0 + fmv.d f31, f0 + + # Reset remaining int regs +_clr_ireg: + li tp, 0 + li t0, 0 + li t1, 0 + li t2, 0 + li t3, 0 + li t4, 0 + li t5, 0 + li t6, 0 + li a6, 0 + li a7, 0 + li s0, 0 + li s1, 0 + li s2, 0 + li s3, 0 + li s4, 0 + li s5, 0 + li s6, 0 + li s7, 0 + li s8, 0 + li s9, 0 + li s10, 0 + li s11, 0 + + # Call main + call smain + +_eoc: + # Synchronize cores + csrr x0, 0x7C2 + # 
Only core 0 (of all cores) returns + csrr t0, mhartid + bnez t0, _done + # Write termination bit and return code (a0) to tohost + slli a0, a0, 1 + ori a0, a0, 1 + la t0, tohost + sw a0, 0(t0) + # Go to sleep +_done: + wfi + + +.globl _putcb +.section .data._putcb +_putcb: diff --git a/sw/saris/runtime/dma.h b/sw/saris/runtime/dma.h new file mode 100644 index 0000000000..80956b0f73 --- /dev/null +++ b/sw/saris/runtime/dma.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include + +// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers. +static inline uint32_t __rt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, + size_t size) { + register uint32_t reg_txid; // 10 + asm volatile("dmsrc %[sl], %[sh]" :: [sh]"r"(src >> 32), [sl]"r"(src)); + asm volatile("dmdst %[dl], %[dh]" :: [dh]"r"(dst >> 32), [dl]"r"(dst)); + asm volatile("dmcpyi %[id], %[sz], 0" : [id]"=r"(reg_txid) : [sz]"r"(size)); + return reg_txid; +} + +// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers. +static inline uint32_t __rt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, + size_t size, size_t dst_stride, + size_t src_stride, size_t repeat) { + register uint32_t reg_txid; // 10 + asm volatile("dmsrc %[sl], %[sh]" :: [sh]"r"(src >> 32), [sl]"r"(src)); + asm volatile("dmdst %[dl], %[dh]" :: [dh]"r"(dst >> 32), [dl]"r"(dst)); + asm volatile("dmstr %[rd], %[rs]" :: [rd]"r"(dst_stride), [rs]"r"(src_stride)); + asm volatile("dmrep %[rp]" :: [rp]"r"(repeat)); + asm volatile("dmcpyi %[id], %[sz], 2" : [id]"=r"(reg_txid) : [sz]"r"(size)); + return reg_txid; +} + +// Initiate an asynchronous 1D DMA transfer. +static inline uint32_t __rt_dma_start_1d(void *dst, const void *src, size_t size) { + return __rt_dma_start_1d_wideptr((size_t)dst, (size_t)src, size); +} + +// Initiate an asynchronous 2D DMA transfer. 
+static inline uint32_t __rt_dma_start_2d(void *dst, const void *src, size_t size, + size_t dst_stride, size_t src_stride, + size_t repeat) { + return __rt_dma_start_2d_wideptr((size_t)dst, (size_t)src, size, dst_stride, + src_stride, repeat); +} + +// Last completed ID +static inline volatile uint32_t __rt_dma_completed_id() { + register uint32_t cid; + asm volatile( + "dmstati %[cid], 0 \n " // 0=status.completed_id + : [cid]"=&r"(cid) :: "memory" + ); + // TODO: Fix off-by-one bug in DMA hardware! + return cid+1; +} + +// Block until a transfer finishes. +static inline void __rt_dma_wait(uint32_t tid) { + register uint32_t tmp; + // TODO: Fix off-by-one bug in DMA hardware! + tid++; + asm volatile( + "1: \n" + "dmstati %[tmp], 0 \n " // 0=status.completed_id + "bgt %[tid], %[tmp], 1b \n" // branch back if ID to wait for > last completed ID + : [tmp]"=&r"(tmp) : [tid]"r"(tid) + ); +} + +// Block until all operations on the DMA cease. +static inline void __rt_dma_wait_all() { + register uint32_t tmp; + asm volatile( + "1: \n" + "dmstati %[tmp], 2 \n " // 2=status.busy + "bne %[tmp], zero, 1b \n" + : [tmp]"=&r"(tmp) : + ); +} diff --git a/sw/saris/runtime/link.ld b/sw/saris/runtime/link.ld new file mode 100644 index 0000000000..5788547bdd --- /dev/null +++ b/sw/saris/runtime/link.ld @@ -0,0 +1,42 @@ +OUTPUT_ARCH( "riscv" ) +ENTRY(_start) + +MEMORY +{ + /* Reserve upper 9*1Ki = 9Ki of TCDM for stack, plus some padding. + This can be expanded to allocate the full CC TCDM as needed. + A 2 KiB RO is provided in the TCDM for small data + consts. */ + tcdm (rw) : ORIGIN = 0x10000000, LENGTH = 0x1CC00 + tcdmc (r) : ORIGIN = 0x1001CC00, LENGTH = 2K + dram (rwxa) : ORIGIN = 0x90000000, LENGTH = 1024M + dtxt (rwxa) : ORIGIN = 0x80000000, LENGTH = 1024M +} + +SECTIONS +{ + /DISCARD/ : { *(.riscv.attributes) *(.comment) *(.rela.*) *(.sym.*) } + + .text : { *(.text._start) *(.text) *(.text.*); . 
= ALIGN(16); } >dtxt + .misc : { *(.data) *(.data.*) *(.putcb) } >dram + .tcdm : { *(.tcdm) *(.l1) } >tcdm AT>dram + .tcdmc : { *(.sdata) *(.sdata.*) *(.rodata) *(.rodata.*) } >tcdmc AT>dram + + /* Global and stack pointer: in TCDM */ + __global_pointer = ADDR(.tcdmc) + SIZEOF(.tcdmc) / 2; + + /* Memory Layout Constants */ + __const_num_cores = 9; + __const_tcdm_start = ORIGIN(tcdm); + __const_tcdm_startc = ORIGIN(tcdmc); + __const_tcdm_end = ORIGIN(tcdm) + 128K; + __const_dram_start = ORIGIN(dram); + + /* TCDM Loading */ + __const_tcdm_losta = LOADADDR(.tcdm); + __const_tcdm_loend = LOADADDR(.tcdm) + SIZEOF(.tcdm); + __const_tcdmc_losta = LOADADDR(.tcdmc); + __const_tcdmc_loend = LOADADDR(.tcdmc) + SIZEOF(.tcdmc); + + /* HTIF section for FESVR */ + .htif : { } >dram +} diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h new file mode 100644 index 0000000000..883bacb2ae --- /dev/null +++ b/sw/saris/runtime/runtime.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include "dma.h" +#include "sssr.h" + +#define PRINTF_NTOA_BUFFER_SIZE 12 +#define PRINTF_DISABLE_SUPPORT_LONG_LONG 1 + +#include "printf.h" + +extern uintptr_t volatile tohost, fromhost; + +extern void *__const_tcdm_start; +extern void *__const_dram_start; + +// Use this to identify and differentiate TCDM data and pointers +#define TCDMSPC __attribute__((address_space(1))) +#define TCDMSEC __attribute__((section(".l1"))) +#define TCDM TCDMSPC +#define TCDMDECL TCDMSPC TCDMSEC + +static inline volatile uint32_t __rt_get_hartid() { + uint32_t register r; + asm volatile ("csrr %0, mhartid" : "=r"(r)); + return r; +} +// Rudimentary string buffer for putchar calls. 
+extern uint32_t _putcb;
+#define PUTC_BUFFER_LEN (1024 - sizeof(size_t))
+
+// Per-hart putchar buffer header: fill level plus scratch words used to
+// assemble the HTIF syscall descriptor on flush.
+typedef struct {
+    size_t size;
+    uint64_t syscall_mem[8];
+} putc_buffer_header_t;
+
+typedef struct {
+    putc_buffer_header_t hdr;
+    char data[PUTC_BUFFER_LEN];
+} putc_buffer_t;
+
+// Array of per-hart buffers backed by the `_putcb` symbol; indexed by mhartid
+// in _putchar below. NOTE(review): assumes the linker reserves one
+// putc_buffer_t per hart at `_putcb` -- confirm against the .putcb section.
+static volatile putc_buffer_t *const putc_buffer = (putc_buffer_t *const)(void *)&_putcb;
+
+// Provide an implementation for putchar: buffer the character, then flush the
+// buffer to the host via an HTIF sys_write syscall when it is full or on '\n'.
+void _putchar(char character) {
+    volatile putc_buffer_t *buf = &putc_buffer[__rt_get_hartid()];
+    buf->data[buf->hdr.size++] = character;
+    if (buf->hdr.size == PUTC_BUFFER_LEN || character == '\n') {
+        buf->hdr.syscall_mem[0] = 64;                    // sys_write
+        buf->hdr.syscall_mem[1] = 1;                     // file descriptor (1 = stdout)
+        buf->hdr.syscall_mem[2] = (uintptr_t)&buf->data; // buffer
+        buf->hdr.syscall_mem[3] = buf->hdr.size;         // length
+
+        // Hand the descriptor to the host and spin until it acknowledges.
+        tohost = (uintptr_t)buf->hdr.syscall_mem;
+        while (fromhost == 0)
+            ;
+        fromhost = 0;
+
+        buf->hdr.size = 0;
+    }
+}
+
+// Print a (null-terminated) string
+static inline void __rt_print(const char* buf) {
+    for (; *buf; ++buf) _putchar(*buf);
+}
+
+// Print a decimal number
+static inline void __rt_print_dec_uint(uint32_t val) {
+    const int DEC_BUF_LEN = 10;
+    char out [DEC_BUF_LEN];
+    int out_msd;  // index of the most significant digit captured below
+    int i;
+    // Capture digits from least to most significant, right-aligned in `out`.
+    for (i=DEC_BUF_LEN-2; i >= 0; --i) {
+        char digit = (val % 10);
+        out[i] = digit + '0';
+        val /= 10;
+        out_msd = i;
+        if (val == 0) break;
+    }
+    out[DEC_BUF_LEN-1] = '\0';
+    // Print digits starting at the most significant one.
+    __rt_print(out + out_msd);
+}
+
+// Cluster-local barrier
+static inline void __rt_barrier() {
+    asm volatile("csrr x0, 0x7C2" ::: "memory");
+}
+
+// Full memory fence
+static inline void __rt_fence() {
+    asm volatile("fence" ::: "memory");
+}
+
+#define __RT_FPU_FENCE "fmv.x.w zero, fa0\n"
+
+// Fence waiting for FPU to catch up to core
+static inline void __rt_fpu_fence() {
+    asm volatile(__RT_FPU_FENCE ::: "memory");
+}
+
+// Full FPU fence: round-trips fa0 through an integer register so the core
+// stalls until the FPU has caught up. NOTE(review): the original comment said
+// "Cluster-local barrier", which mismatches the asm below -- this synchronizes
+// core and FPU only, not other harts.
+static inline void __rt_fpu_fence_full() {
+    uint32_t register tmp;
+    asm volatile (
+        
"fmv.x.w %[tmp], fa0 \n" + "mv zero, %[tmp] \n" + : [tmp]"=r"(tmp) :: "memory" + ); +} + +// Memcopy using FPU +static inline void __rt_memcpy_fpu(double* dst, double* src, size_t lend) { + #pragma clang loop unroll_count(8) + for (int i = 0; i < lend; i++) + *(volatile double*)(dst + i) = *(volatile double*)(src + i); +} + +// Monotonically increasing cycle count +static inline volatile uint32_t __rt_get_timer() { + uint32_t register r; + asm volatile ("csrr %0, mcycle" : "=r"(r)); + return r; +} + +// Sleep for multiples of 10 (Deca) cycles +static inline void __rt_shortsleep(uint32_t Dcycles) { + for (int i = 0; i < Dcycles; ++i) { + asm volatile ("nop; nop; nop; nop; nop; nop; nop; nop; nop; nop" ::: "memory"); + } +} + +// Include putchar code directly (header-only implementation) +#include "printf.c" diff --git a/sw/saris/runtime/runtime.hpp b/sw/saris/runtime/runtime.hpp new file mode 100644 index 0000000000..df501ff20e --- /dev/null +++ b/sw/saris/runtime/runtime.hpp @@ -0,0 +1,20 @@ +#pragma once + +// C linkage macros +#ifdef __cplusplus +#define EXTERN_C extern "C" +#define EXTERN_C_BEGIN extern "C" { +#define EXTERN_C_END } +#else +#define EXTERN_C +#define EXTERN_C_BEGIN +#define EXTERN_C_END +#endif + +// Include C runtime, ignoring benign CXX-only warnings +EXTERN_C_BEGIN +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-register" +#include "runtime.h" +#pragma GCC diagnostic pop +EXTERN_C_END diff --git a/sw/saris/runtime/sssr.h b/sw/saris/runtime/sssr.h new file mode 100644 index 0000000000..171ccb454f --- /dev/null +++ b/sw/saris/runtime/sssr.h @@ -0,0 +1,189 @@ +#pragma once + +// Registers +#define __RT_SSSR_REG_STATUS 0 +#define __RT_SSSR_REG_REPEAT 1 + +#define __RT_SSSR_REG_BOUND_0 2 +#define __RT_SSSR_REG_BOUND_1 3 +#define __RT_SSSR_REG_BOUND_2 4 +#define __RT_SSSR_REG_BOUND_3 5 + +#define __RT_SSSR_REG_STRIDE_0 6 +#define __RT_SSSR_REG_STRIDE_1 7 +#define __RT_SSSR_REG_STRIDE_2 8 +#define __RT_SSSR_REG_STRIDE_3 9 + 
+#define __RT_SSSR_REG_IDX_CFG 10 +#define __RT_SSSR_REG_IDX_BASE 11 +#define __RT_SSSR_REG_IDX_ISECT 12 + +#define __RT_SSSR_REG_RPTR_INDIR 16 +#define __RT_SSSR_REG_RPTR_SLV 17 +#define __RT_SSSR_REG_RPTR_MST_NOSLV 18 +#define __RT_SSSR_REG_RPTR_MST_TOSLV 19 + +#define __RT_SSSR_REG_WPTR_INDIR 20 +#define __RT_SSSR_REG_WPTR_SLV 21 +#define __RT_SSSR_REG_WPTR_MST_NOSLV 22 +#define __RT_SSSR_REG_WPTR_MST_TOSLV 23 + +#define __RT_SSSR_REG_RPTR_0 24 +#define __RT_SSSR_REG_RPTR_1 25 +#define __RT_SSSR_REG_RPTR_2 26 +#define __RT_SSSR_REG_RPTR_3 27 + +#define __RT_SSSR_REG_WPTR_0 28 +#define __RT_SSSR_REG_WPTR_1 29 +#define __RT_SSSR_REG_WPTR_2 30 +#define __RT_SSSR_REG_WPTR_3 31 + +// Enable and disable +#define __RT_SSSR_ENABLE "csrsi 0x7C0, 1\n" +#define __RT_SSSR_DISABLE "csrci 0x7C0, 1\n" + +// Write configuration registers +// To write to all SSRs, use ssridx=31 +#define __RT_SSSR_IDXALL 31 +#define __RT_SSSR_SCFGWI_INT(valreg,ssridx,regidx) "scfgwi "#valreg", "#ssridx" | "#regidx"<<5\n" +#define __RT_SSSR_SCFGWI(valreg,ssridx,regname) __RT_SSSR_SCFGWI_INT(valreg,ssridx,regname) + +// Read configuration registers +#define __RT_SSSR_SCFGRI_INT(valreg,ssridx,regidx) "scfgri "#valreg", "#ssridx" | "#regidx"<<5\n" +#define __RT_SSSR_SCFGRI(valreg,ssridx,regname) __RT_SSSR_SCFGRI_INT(valreg,ssridx,regname) + +// Assemble index configuration word +#define __RT_SSSR_IDXSIZE_U8 0 +#define __RT_SSSR_IDXSIZE_U16 1 +#define __RT_SSSR_IDXSIZE_U32 2 +#define __RT_SSSR_IDXSIZE_U64 3 +#define __RT_SSSR_IDX_NOMERGE 0 +#define __RT_SSSR_IDX_MERGE 1 +#define __RT_SSSR_IDX_CFG(size,shift,flags) (((flags & 0xFFFF)<<16) | ((shift & 0xFF)<<8) | (size & 0xFF) ) + +// Block until job is done +// TODO: Replace with (shadowed) blocking read or write +#define __RT_SSSR_WAIT_DONE(tempreg, ssridx) \ + "1:" __RT_SSSR_SCFGRI(tempreg,ssridx,__RT_SSSR_REG_STATUS) \ + "srli "#tempreg", "#tempreg", 31 \n" \ + "beqz "#tempreg", 1b \n" + +// Allocates the specified registers and fakes them as +// 
outputs of an SSSR enable, enforcing an order. +#define __RT_SSSR_BLOCK_BEGIN \ + { \ + register double _rt_sssr_0 asm("ft0"); \ + register double _rt_sssr_1 asm("ft1"); \ + register double _rt_sssr_2 asm("ft2"); \ + asm volatile(__RT_SSSR_ENABLE : "+f"(_rt_sssr_0), "+f"(_rt_sssr_1), "+f"(_rt_sssr_2) :: "memory"); + +// Disables the SSSRs, taking as fake inputs the allocated +// registers for the SSRs and thus allowing reallocation. +#define __RT_SSSR_BLOCK_END \ + asm volatile(__RT_SSSR_DISABLE : "+f"(_rt_sssr_0), "+f"(_rt_sssr_1), "+f"(_rt_sssr_2) :: "memory"); \ + } + +static inline void __rt_sssr_cfg_write(uint32_t val, uint32_t ssridx, uint32_t regidx) { + asm volatile ( + __RT_SSSR_SCFGWI_INT(%[valreg],%[ssridx],%[regidx]) + :: [valreg]"r"(val), [ssridx]"i"(ssridx), [regidx]"i"(regidx) : "memory" + ); +} + +static inline void __rt_sssr_cfg_write_ptr(void* val, uint32_t ssridx, uint32_t regidx) { + __rt_sssr_cfg_write((uintptr_t)val, ssridx, regidx); +} + +static inline uint32_t __rt_sssr_cfg_read(uint32_t ssridx, uint32_t regidx) { + uint32_t ret; + asm volatile ( + __RT_SSSR_SCFGRI_INT(%[retreg],%[ssridx],%[regidx]) + : [retreg]"=r"(ret) : [ssridx]"i"(ssridx), [regidx]"i"(regidx) : "memory" + ); + return ret; +} + +static inline void __rt_sssr_enable() { + asm volatile(__RT_SSSR_ENABLE ::: "memory"); +} + +static inline void __rt_sssr_disable() { + asm volatile(__RT_SSSR_DISABLE ::: "memory"); +} + +static inline uint16_t __rt_sssr_ptoi(void* ptr) { + // We assume TCDM alignment here; TCDM address offset is ignored + // as it will be masked in the SSR at at the latest + return (uint16_t)((uintptr_t)ptr >> 3); +} + +static inline void __rt_sssr_bound_stride_1d( + uint32_t ssridx, + uint32_t b0, uint32_t s0 +) { + // argument bounds and strides are *non-inclusive* for convenience + __rt_sssr_cfg_write(--b0, ssridx, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(s0, ssridx, __RT_SSSR_REG_STRIDE_0); +} + +static inline void __rt_sssr_bound_stride_2d( + uint32_t 
ssridx, + uint32_t b0, uint32_t s0, + uint32_t b1, uint32_t s1 +) { + // argument bounds and strides are *non-inclusive* for convenience + __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1); + uint32_t a = 0; + __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0); + a += s0 * b0; + __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1); +} + +static inline void __rt_sssr_bound_stride_3d( + uint32_t ssridx, + uint32_t b0, uint32_t s0, + uint32_t b1, uint32_t s1, + uint32_t b2, uint32_t s2 +) { + // argument bounds and strides are *non-inclusive* for convenience + __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1); + __rt_sssr_cfg_write(--b2 , ssridx, __RT_SSSR_REG_BOUND_2); + uint32_t a = 0; + __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0); + a += s0 * b0; + __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1); + a += s1 * b1; + __rt_sssr_cfg_write(s2-a, ssridx, __RT_SSSR_REG_STRIDE_2); +} + +static inline void __rt_sssr_bound_stride_4d( + uint32_t ssridx, + uint32_t b0, uint32_t s0, + uint32_t b1, uint32_t s1, + uint32_t b2, uint32_t s2, + uint32_t b3, uint32_t s3 +) { + // argument bounds and strides are *non-inclusive* for convenience + __rt_sssr_cfg_write(--b0 , ssridx, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(--b1 , ssridx, __RT_SSSR_REG_BOUND_1); + __rt_sssr_cfg_write(--b2 , ssridx, __RT_SSSR_REG_BOUND_2); + __rt_sssr_cfg_write(--b3 , ssridx, __RT_SSSR_REG_BOUND_3); + uint32_t a = 0; + __rt_sssr_cfg_write(s0-a, ssridx, __RT_SSSR_REG_STRIDE_0); + a += s0 * b0; + __rt_sssr_cfg_write(s1-a, ssridx, __RT_SSSR_REG_STRIDE_1); + a += s1 * b1; + __rt_sssr_cfg_write(s2-a, ssridx, __RT_SSSR_REG_STRIDE_2); + a += s2 * b2; + __rt_sssr_cfg_write(s3-a, ssridx, __RT_SSSR_REG_STRIDE_3); +} + +static inline void __rt_sssr_wait_done(uint32_t ssridx) { + uint32_t tmp; + asm volatile ( + 
__RT_SSSR_WAIT_DONE(%[tmpreg],%[ssridx]) + : [tmpreg]"+&r"(tmp) : [ssridx]"i"(ssridx) : "memory" + ); +} diff --git a/sw/saris/stencils/istc.common.hpp b/sw/saris/stencils/istc.common.hpp new file mode 100644 index 0000000000..042005a741 --- /dev/null +++ b/sw/saris/stencils/istc.common.hpp @@ -0,0 +1,181 @@ +#include +#include +#include + +#pragma once + +// ============ +// Macros +// ============ + +// ST and S contain temporal and spatial dimension constants, SP parallelization and unroll constants, C value constants of type `d_t` +#define RCP *__restrict__ const +#define PRM static constexpr int +#define PRMD static constexpr double +#define PRMX constexpr int +#define PRMXD constexpr double +struct __istc_dstr{PRM __dummy=0;}; +PRMX __istc_dstr::__dummy; +#define KNL template \ + static __attribute__((noinline)) void +#define IDXA volatile __attribute__ ((__aligned__(8))) i_t +#define COFA volatile __attribute__ ((__aligned__(8))) d_t + +// Shorten indexing code a bit +#define I(ptr) __rt_sssr_ptoi(ptr) +// Further simplify RCP deref magic (selexp indexes into A) +#define J(A, selexp) I(&(*A) selexp) + +// Shorten unroll for loops and canonical axis loops +#define PRAGMA(X) _Pragma(#X) +#define foru(unroll) \ + PRAGMA(clang loop unroll_count(unroll)) \ + for +#define forp(unroll, i, init, pte, stride) for (int i = init; i < pte; i += stride) +#define forpu(unroll, i, init, pte, stride) foru(unroll) (int i = init; i < pte; i += stride) +// Axis assist macro: shortcut for most axes (requires KNL_IDS) +#define forpx(axis, ii, init, pte) forp(sp::u##axis, ii, i##axis+init, pte, sp::p##axis) +#define forpux(axis, ii, init, pte) forpu(sp::u##axis, ii, i##axis+init, pte, sp::p##axis) +// Same as forpux, but explicitly control unroll (e.g. 1). Helps when kernels +// get so large that register allocation suffocates and addresses stack-swap. 
+#define forpex(unroll, axis, ii, init, pte) forpu(unroll, ii, i##axis+init, pte, sp::p##axis) +// For manual unrolling: simply combines strides +#define form(i, init, pte, stride) for (int i = init; i < pte; i += stride) + +// Macro to define core constants +#define KNL_IDS(cid) \ + const uint32_t ix = cid % sp::px; \ + const uint32_t iy = (cid / sp::px) % sp::py; \ + const uint32_t iz = cid / (sp::px * sp::py); + +#define sodt sizeof(d_t) + +// Macro for core constants with *local* unroll +#define KNL_IDS_LOC(cid) \ + KNL_IDS(cid) \ + uint32_t lx = ix * sp::ux; \ + uint32_t ly = iy * sp::uy; \ + uint32_t lz = iz * sp::uz; \ + constexpr uint32_t jmpz = sp::pz*sp::uz; \ + constexpr uint32_t jmpy = sp::py*sp::uy; \ + constexpr uint32_t jmpx = sp::px*sp::ux; + +// ======================== +// Dimension defaults +// ======================== + +#define SU(name, dim) \ + struct name {PRM n=dim; PRM nx=dim; PRM ny=dim; PRM nz=dim;}; \ + PRMX name::n, name::nx, name::ny, name::nz; + +// Keep these dimensions aligned with data generation +SU(s1s, 1000) +SU(s1sm, 1728) +SU(s1m, 2744) +SU(s1ml, 4096) +SU(s1l, 5832) + +SU(s2s, 32) +SU(s2sm, 42) +SU(s2m, 52) +SU(s2ml, 64) +SU(s2l, 76) + +SU(s3s, 10) +SU(s3sm, 12) +SU(s3m, 14) +SU(s3ml, 16) +SU(s3l, 18) + +#define ST(name, steps) \ + struct name {PRM t=steps;}; \ + PRMX name::t; + +ST(st1, 1) +ST(st4, 4) +ST(st12, 12) + +#define SP(name, ncores, parz, pary, parx, unrz, unry, unrx, unru) \ + struct name {PRM nc=ncores; PRM px=parx; PRM py=pary; PRM pz=parz; PRM ux=unrx; PRM uy=unry; PRM uz=unrz; PRM uu=unru;}; \ + PRMX name::nc, name::px, name::py, name::pz, name::ux, name::uy, name::uz, name::uu; + +SP(sp1, 8, 1, 1, 8, 1, 1, 4, 8) +SP(sp2, 8, 1, 2, 4, 1, 2, 2, 8) +SP(sp3, 8, 2, 2, 2, 1, 2, 2, 8) + +// ============= +// Helpers +// ============= + +inline void __istc_barrier() { + __rt_barrier(); +} + +inline double __istc_sgnjx(double rs1, double rs2) { + double rd; + asm volatile("fsgnjx.d %[rd], %[rs1], %[rs2]" : [rd]"=f"(rd) 
: [rs1]"f"(rs1), [rs2]"f"(rs2)); + return rd; +} + +// Implements `sign(a) == sign(b) ? 0 : a` using only FP operations and no conditional logic +inline double __istc_ternclip(double a, double b) { + // If `sign(a) == sign(b)`, then ainj is +|a|, otherwise |-a| + double ainj = __istc_sgnjx(a, b); + // This gives us +|a| if the condition holds, otherwise 0 + double ainj_clip = fmax(ainj, 0.0); + // Inject original sign of a into the clipped result, yielding a or (+/-) 0 + return copysign(ainj_clip, a); +} + +// ================== +// ISSR helpers +// ================== + +inline void __istc_setup_issrs(uint32_t idxsize, uint32_t i0l, uint32_t i1l) { + __rt_sssr_cfg_write(__RT_SSSR_IDX_CFG(idxsize, 0, 0), __RT_SSSR_IDXALL, __RT_SSSR_REG_IDX_CFG); + __rt_sssr_cfg_write(i0l-1, 0, __RT_SSSR_REG_BOUND_0); + __rt_sssr_cfg_write(i1l-1, 1, __RT_SSSR_REG_BOUND_0); +} + + +inline void __istc_iter_issrs(void* base, void* i0, void* i1) { + __rt_sssr_cfg_write_ptr(base, __RT_SSSR_IDXALL, __RT_SSSR_REG_IDX_BASE); + __rt_sssr_cfg_write_ptr(i0, 0, __RT_SSSR_REG_RPTR_INDIR); + __rt_sssr_cfg_write_ptr(i1, 1, __RT_SSSR_REG_RPTR_INDIR); +} + +// ========================== +// Verification helpers +// ========================== + +inline void __istc_cmp_grids( + uint32_t core_id, uint32_t core_num, uint32_t core_stride, + TCDM double* grid1, TCDM double* grid2, uint32_t len, double rel_eps, + TCDM volatile uint32_t* err_sema +) { + __rt_barrier(); + uint32_t errors = 0; + uint32_t stride = core_num * core_stride; + #pragma clang loop unroll_count(16) + for (int i = core_id; i < len; i += stride) + errors += (fabs(grid1[i] - grid2[i]) > fabs(rel_eps * grid1[i])); + __atomic_fetch_add(err_sema, errors, __ATOMIC_RELAXED); + __rt_barrier(); +} + +volatile void __attribute__((noinline)) __istc_touch_grid( + uint32_t core_id, uint32_t core_num, uint32_t core_stride, + TCDM double* grid, uint32_t len, TCDM volatile uint32_t* ret_sema +) { + __rt_barrier(); + uint32_t ret_loc; + double sum = 
0.0; + uint32_t stride = core_num * core_stride; + #pragma clang loop unroll_count(16) + for (int i = core_id; i < len; i += stride) + sum += grid[i]; + asm volatile("fcvt.w.d t1, %1; sub %0, t1, t1" : "=r"(ret_loc) : "f"(sum) : "memory", "t1"); + __atomic_fetch_add(ret_sema, ret_loc, __ATOMIC_RELAXED); + __rt_barrier(); +} diff --git a/sw/saris/stencils/istc.issr.hpp b/sw/saris/stencils/istc.issr.hpp new file mode 100644 index 0000000000..c74d76b4dc --- /dev/null +++ b/sw/saris/stencils/istc.issr.hpp @@ -0,0 +1,879 @@ +#include "istc.common.hpp" + +// =============== +// Polybench +// =============== + +KNL istci_pb_jacobi_2d( + const int cid, + TCDM d_t (RCP A)[s::n][s::n], + TCDM d_t (RCP B)[s::n][s::n] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sx+sy; + constexpr uint32_t b = dx, l = dy, cc = dx+dy, r = cc+dx, tt = cc+dy; + // Indices include padding on axes (do not init arrays to prevent memcpy) + IDXA i0[10], i1[10]; + /*b*/ i0[ 0] = b; i0[ 1] = b + sx; i0[ 2] = b + sy; i0[ 3] = b + sb; + /*l*/ i0[ 4] = l; i0[ 5] = l + sx; i0[ 6] = l + sy; i0[ 7] = l + sb; + /*c*/ i0[ 8] = cc; i0[ 9] = cc + sy; + /*r*/ i1[ 0] = r; i1[ 1] = r + sx; i1[ 2] = r + sy; i1[ 3] = r + sb; + /*t*/ i1[ 4] = tt; i1[ 5] = tt + sx; i1[ 6] = tt + sy; i1[ 7] = tt + sb; + /*c*/ i1[ 8] = cc + sx; i1[ 9] = cc + sb; + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 10, 10); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (i, ly, s::n-2, jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (j, lx, s::n-2, jmpx) { + __istc_iter_issrs((void*)&(*A)[i][j], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*B)[i+1][lx+1], 
2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // br0..3 = b0..3 + r0..3 and lt0..3 = l0..3 + t0..3 + "frep.i %[c7], 1, 7, 0b001 \n" + "fadd.d fa0, ft0, ft1 \n" + // p0..3 = br0..3 + lt0..3 + "frep.i %[c3], 1, 3, 0b111 \n" + "fadd.d fa0, fa0, fa4 \n" + // tt0..3 = p0..3 + c0..3 + "fadd.d fa0, fa0, ft0 \n" + "fadd.d fa1, fa1, ft1 \n" + "fadd.d fa2, fa2, ft0 \n" + "fadd.d fa3, fa3, ft1 \n" + // res0..3 = 0.2 * tt0..3 + "frep.i %[c3], 1, 3, 0b100 \n" + "fmul.d ft2, %[cf], fa0 \n" + :: [c7]"r"(7), [c3]"r"(3), [cf]"f"(0.2) + : "memory", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +// ========== +// AN5D +// ========== + +KNL istci_an5d_j2d5pt( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t b = dx, l = dy, cc = dx+dy, r = cc+dx, tt = cc+dy; + // Indices include padding on axes (do not init arrays to prevent memcpy) + IDXA i0[10], i1[10]; + /*c*/ i0[ 0] = cc; i0[ 1] = cc+sy; /*b*/ i0[ 2] = b; i0[ 3] = b+sy; + /*l*/ i0[ 4] = l; i0[ 5] = l+sy; /*r*/ i0[ 6] = r; i0[ 7] = r+sy; + /*t*/ i0[ 8] = tt; i0[ 9] = tt+sy; + /*c*/ i1[ 0] = cc+sx; i1[ 1] = cc+sb; /*b*/ i1[ 2] = b+sx; i1[ 3] = b+sb; + /*l*/ i1[ 4] = l+sx; i1[ 5] = l+sb; /*r*/ i1[ 6] = r+sx; i1[ 7] = r+sb; + /*t*/ i1[ 8] = tt+sx; i1[ 9] = tt+sb; + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 10, 10); + + // Avoid constant FP division + register d_t fac asm("ft7") = 1.0 / c::c0; + // Use stacked registers for FREP + register d_t cb asm("ft3") = c::ym[0]; + register d_t cl asm("ft4") = c::xm[0]; + register d_t cr asm("ft5") = c::xp[0]; + register d_t ct asm("ft6") = 
c::yp[0]; + register d_t cc_ asm("ft8") = c::cc; + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-2, jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (x, lx, s::n-2, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+1][lx+1], 2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, %[cc], ft0 \n" + "fmul.d fa1, %[cc], ft1 \n" + "fmul.d fa2, %[cc], ft0 \n" + "fmul.d fa3, %[cc], ft1 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft3, ft0, fa0 \n" + "fmadd.d fa1, ft3, ft1, fa1 \n" + "fmadd.d fa2, ft3, ft0, fa2 \n" + "fmadd.d fa3, ft3, ft1, fa3 \n" + // Final scaling and writeback + "frep.i %[c3], 1, 3, 0b100 \n" + "fmul.d ft2, %[fc], fa0 \n" + : [cb]"+&f"(cb), [cl]"+&f"(cl), [cr]"+&f"(cr), [ct]"+&f"(ct), + [cc]"+&f"(cc_), [fc]"+&f"(fac) + : [c3]"r"(3) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_j2d9pt( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t cc = 2*dy+2*dx, + b0 = cc-dy, b1 = cc-2*dy, + l0 = cc-dx, l1 = cc-2*dx, + r0 = cc+dx, r1 = cc+2*dx, + t0 = cc+dy, t1 = cc+2*dy; + // Indices include padding on axes (do not init arrays to prevent memcpy) + IDXA i0[18], i1[18]; + /*cc*/ i0[ 0] = cc; i0[ 1] = cc+sy; + /*b0*/ i0[ 2] = b0; i0[ 3] = b0+sy; /*l0*/ i0[ 4] = l0; i0[ 5] = l0+sy; + /*r0*/ i0[ 6] = r0; i0[ 7] = r0+sy; 
/*t0*/ i0[ 8] = t0; i0[ 9] = t0+sy; + /*b1*/ i0[10] = b1; i0[11] = b1+sy; /*l1*/ i0[12] = l1; i0[13] = l1+sy; + /*r1*/ i0[14] = r1; i0[15] = r1+sy; /*t1*/ i0[16] = t1; i0[17] = t1+sy; + /*cc*/ i1[ 0] = cc+sx; i1[ 1] = cc+sb; + /*b0*/ i1[ 2] = b0+sx; i1[ 3] = b0+sb; /*l0*/ i1[ 4] = l0+sx; i1[ 5] = l0+sb; + /*r0*/ i1[ 6] = r0+sx; i1[ 7] = r0+sb; /*t0*/ i1[ 8] = t0+sx; i1[ 9] = t0+sb; + /*b1*/ i1[10] = b1+sx; i1[11] = b1+sb; /*l1*/ i1[12] = l1+sx; i1[13] = l1+sb; + /*r1*/ i1[14] = r1+sx; i1[15] = r1+sb; /*t1*/ i1[16] = t1+sx; i1[17] = t1+sb; + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 18, 18); + + // Avoid constant FP division + register d_t fac asm("fa4") = 1.0 / c::c0; + // Use stacked registers for FREP + register d_t cb0 asm("ft3") = c::ym[0]; + register d_t cl0 asm("ft4") = c::xm[0]; + register d_t cr0 asm("ft5") = c::xp[0]; + register d_t ct0 asm("ft6") = c::yp[0]; + register d_t cb1 asm("ft8") = c::ym[1]; + register d_t cl1 asm("ft9") = c::xm[1]; + register d_t cr1 asm("ft10") = c::xp[1]; + register d_t ct1 asm("ft11") = c::yp[1]; + register d_t cc_ asm("fa5") = c::cc; + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-4,jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-4+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (x, lx, s::n-4, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+2][lx+2], 2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, %[cc], ft0 \n" + "fmul.d fa1, %[cc], ft1 \n" + "fmul.d fa2, %[cc], ft0 \n" + "fmul.d fa3, %[cc], ft1 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft3, ft0, fa0 \n" + "fmadd.d fa1, ft3, ft1, fa1 \n" + "fmadd.d fa2, ft3, ft0, fa2 \n" + "fmadd.d fa3, ft3, ft1, fa3 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft8, ft0, fa0 \n" + "fmadd.d fa1, 
ft8, ft1, fa1 \n" + "fmadd.d fa2, ft8, ft0, fa2 \n" + "fmadd.d fa3, ft8, ft1, fa3 \n" + // Final scaling and writeback + "frep.i %[c3], 1, 3, 0b100 \n" + "fmul.d ft2, %[fc], fa0 \n" + : [cb0]"+&f"(cb0), [cl0]"+&f"(cl0), [cr0]"+&f"(cr0), [ct0]"+&f"(ct0), + [cb1]"+&f"(cb1), [cl1]"+&f"(cl1), [cr1]"+&f"(cr1), [ct1]"+&f"(ct1), + [cc]"+&f"(cc_), [fc]"+&f"(fac) + : [c3]"r"(3) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_j2d9pt_gol( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t + bl = 0, bc = dx, br = 2*dx, + ml = dy, mc = dx+dy, mr = 2*dx+dy, + tl = 2*dy, tc = dx+2*dy, tr = 2*dx+2*dy; + // Indices include padding on axes (do not init arrays to prevent memcpy) + IDXA i0[18], i1[18]; + /*mc*/ i0[ 0] = mc; i0[ 1] = mc + sy; + /*bl*/ i0[ 2] = bl; i0[ 3] = bl + sy; /*bc*/ i0[ 4] = bc; i0[ 5] = bc + sy; + /*br*/ i0[ 6] = br; i0[ 7] = br + sy; /*ml*/ i0[ 8] = ml; i0[ 9] = ml + sy; + /*mr*/ i0[10] = mr; i0[11] = mr + sy; /*tl*/ i0[12] = tl; i0[13] = tl + sy; + /*tc*/ i0[14] = tc; i0[15] = tc + sy; /*tr*/ i0[16] = tr; i0[17] = tr + sy; + /*mc*/ i1[ 0] = mc + sx; i1[ 1] = mc + sb; + /*bl*/ i1[ 2] = bl + sx; i1[ 3] = bl + sb; /*bc*/ i1[ 4] = bc + sx; i1[ 5] = bc + sb; + /*br*/ i1[ 6] = br + sx; i1[ 7] = br + sb; /*ml*/ i1[ 8] = ml + sx; i1[ 9] = ml + sb; + /*mr*/ i1[10] = mr + sx; i1[11] = mr + sb; /*tl*/ i1[12] = tl + sx; i1[13] = tl + sb; + /*tc*/ i1[14] = tc + sx; i1[15] = tc + sb; /*tr*/ i1[16] = tr + sx; i1[17] = tr + sb; + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, 18, 18); + + // Avoid constant FP division + register 
d_t fac asm("fa4") = 1.0 / c::c0; + // Use stacked registers for FREP + register d_t cmc asm("fa5") = c::c[1][1]; + register d_t cbl asm("ft3") = c::c[0][0]; + register d_t cbc asm("ft4") = c::c[0][1]; + register d_t cbr asm("ft5") = c::c[0][2]; + register d_t cml asm("ft6") = c::c[1][0]; + register d_t cmr asm("ft8") = c::c[1][2]; + register d_t ctl asm("ft9") = c::c[2][0]; + register d_t ctc asm("ft10") = c::c[2][1]; + register d_t ctr asm("ft11") = c::c[2][2]; + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-2,jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (x, lx, s::n-2, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*A[(t+1)%2])[y+1][lx+1], 2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, %[cmc], ft0 \n" + "fmul.d fa1, %[cmc], ft1 \n" + "fmul.d fa2, %[cmc], ft0 \n" + "fmul.d fa3, %[cmc], ft1 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft3, ft0, fa0 \n" + "fmadd.d fa1, ft3, ft1, fa1 \n" + "fmadd.d fa2, ft3, ft0, fa2 \n" + "fmadd.d fa3, ft3, ft1, fa3 \n" + // Do directionals as loop + "frep.o %[c3], 4, 3, 0b0010 \n" + "fmadd.d fa0, ft8, ft0, fa0 \n" + "fmadd.d fa1, ft8, ft1, fa1 \n" + "fmadd.d fa2, ft8, ft0, fa2 \n" + "fmadd.d fa3, ft8, ft1, fa3 \n" + // Final scaling and writeback + "frep.i %[c3], 1, 3, 0b100 \n" + "fmul.d ft2, %[fc], fa0 \n" + : [cbl]"+&f"(cbl), [cbc]"+&f"(cbc), [cbr]"+&f"(cbr), [cml]"+&f"(cml), + [cmr]"+&f"(cmr), [ctl]"+&f"(ctl), [ctc]"+&f"(ctc), [ctr]"+&f"(ctr), + [cmc]"+&f"(cmc), [fc]"+&f"(fac) + : [c3]"r"(3) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_j3d27pt( + const int cid, + TCDM d_t (RCP 
A[2])[s::nz][s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx; + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*27; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + #pragma unroll + for (int z = 0; z < 3; ++z) + #pragma unroll + for (int y = 0; y < 3; ++y) + #pragma unroll + for (int x = 0; x < 3; ++x) { + uint32_t pt = z*dz + y*dy + x*dx; + /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy; + /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Avoid constant FP division + register d_t fac asm("ft3") = 1.0 / c::c0; + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA ca[27]; + COFA* pa = ca; + #pragma unroll + for (int z = 0; z < 3; ++z) + #pragma unroll + for (int y = 0; y < 3; ++y) + #pragma unroll + for (int x = 0; x < 3; ++x) + *(pa++) = c::c3[z][y][x]; + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + + form (z, lz, s::n-2,jmpz) { + form (y, ly, s::n-2,jmpy) { + __rt_sssr_bound_stride_2d(2, 27, sodt, (s::n-2+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: bottom left + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do remaining blocks as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + 
"fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final scaling + "frep.i %[c3], 1, 3, 0b101 \n" + "fmul.d fa0, %[fc], fa0 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + : [fc]"+&f"(fac) + : [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(27-2), [c3]"r"(3), + [wb]"r"(&(*A[(t+1)%2])[z+1][y+1][x+1]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + } + lz = (lz + sp::pz) % jmpz; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_star2dXr( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + static_assert(ci::r >= 1, "Radius must be at least 1!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t cc = ci::r*dy + ci::r*dx; + constexpr uint32_t npoints = 1+4*ci::r; + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*npoints; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + /*cc0*/ *(p0++) = cc; *(p0++) = cc+sy; + /*cc1*/ *(p1++) = cc+sx; *(p1++) = cc+sb; + #pragma unroll + for (int j = 1; j <= ci::r; ++j) { + uint32_t bb = cc-j*dy, ll = cc-j*dx, rr = cc+j*dx, tt = cc+j*dy; + /*bb0*/ *(p0++) = bb; *(p0++) = bb+sy; /*ll0*/ *(p0++) = ll; *(p0++) = ll+sy; + /*rr0*/ *(p0++) = rr; *(p0++) = rr+sy; /*tt0*/ *(p0++) = tt; *(p0++) = tt+sy; + /*bb1*/ *(p1++) = bb+sx; *(p1++) = bb+sb; /*ll1*/ *(p1++) = ll+sx; *(p1++) = ll+sb; + /*rr1*/ *(p1++) = rr+sx; *(p1++) = rr+sb; /*tt1*/ *(p1++) = tt+sx; *(p1++) = tt+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA 
ca[npoints]; + COFA* pa = ca; + /*cc*/ *(pa++) = c::cc; + #pragma unroll + for (int j = 0; j < ci::r; ++j) { + /*bb*/ *(pa++) = c::ym[j]; /*ll*/ *(pa++) = c::xm[j]; + /*rr*/ *(pa++) = c::xp[j]; /*tt*/ *(pa++) = c::yp[j]; + } + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-2*ci::r,jmpy) { + __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2*ci::r, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do directionals as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + "fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2), + [wb]"r"(&(*A[(t+1)%2])[y+ci::r][x+ci::r]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_box2dXr( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + static_assert(ci::r >= 1, "Radius must be at least 1!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t npoints = (2*ci::r+1)*(2*ci::r+1); + // Indices include padding on 
axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*npoints; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + #pragma unroll + for (int y = 0; y < 2*ci::r+1; ++y) + #pragma unroll + for (int x = 0; x < 2*ci::r+1; ++x) { + uint32_t pt = y*dy + x*dx; + /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy; + /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA ca[npoints]; + COFA* pa = ca; + #pragma unroll + for (int y = 0; y < 2*ci::r+1; ++y) + #pragma unroll + for (int x = 0; x < 2*ci::r+1; ++x) + *(pa++) = c::c[y][x]; + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (y, ly, s::n-2*ci::r,jmpy) { + __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2*ci::r, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do directionals as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + "fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2), + [wb]"r"(&(*A[(t+1)%2])[y+ci::r][x+ci::r]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_star3dXr( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] 
+) { + // Assertions and IDs + static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + static_assert(ci::r >= 1, "Radius must be at least 1!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t cc = ci::r*dz + ci::r*dy + ci::r*dx; + constexpr uint32_t npoints = 1+6*ci::r; + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*npoints; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + /*cc0*/ *(p0++) = cc; *(p0++) = cc+sy; + /*cc1*/ *(p1++) = cc+sx; *(p1++) = cc+sb; + #pragma unroll + for (int j = 1; j <= ci::r; ++j) { + uint32_t bb = cc-j*dy, ll=cc-j*dx, rr = cc+j*dx, tt = cc+j*dy, aa = cc-j*dz, ff = cc+j*dz; + /*bb0*/ *(p0++) = bb; *(p0++) = bb+sy; /*ll0*/ *(p0++) = ll; *(p0++) = ll+sy; + /*rr0*/ *(p0++) = rr; *(p0++) = rr+sy; /*tt0*/ *(p0++) = tt; *(p0++) = tt+sy; + /*aa0*/ *(p0++) = aa; *(p0++) = aa+sy; /*ff0*/ *(p0++) = ff; *(p0++) = ff+sy; + /*bb1*/ *(p1++) = bb+sx; *(p1++) = bb+sb; /*ll1*/ *(p1++) = ll+sx; *(p1++) = ll+sb; + /*rr1*/ *(p1++) = rr+sx; *(p1++) = rr+sb; /*tt1*/ *(p1++) = tt+sx; *(p1++) = tt+sb; + /*aa1*/ *(p1++) = aa+sx; *(p1++) = aa+sb; /*ff1*/ *(p1++) = ff+sx; *(p1++) = ff+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA ca[npoints]; + COFA* pa = ca; + /*cc*/ *(pa++) = c::cc; + #pragma unroll + for (int j = 0; j < ci::r; ++j) { + /*bb*/ *(pa++) = c::ym[j]; /*ll*/ *(pa++) = c::xm[j]; + /*rr*/ *(pa++) = c::xp[j]; /*tt*/ *(pa++) = c::yp[j]; + /*aa*/ *(pa++) = c::zm[j]; /*ff*/ *(pa++) = c::zp[j]; + } + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (z, lz, s::n-2*ci::r,jmpz) { + form (y, ly, 
s::n-2*ci::r,jmpy) { + __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2*ci::r, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do directionals as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + "fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2), + [wb]"r"(&(*A[(t+1)%2])[z+ci::r][y+ci::r][x+ci::r]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + } + lz = (lz + sp::pz) % jmpz; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +KNL istci_an5d_box3dXr( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] +) { + // Assertions and IDs + static_assert(sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + static_assert(ci::r >= 1, "Radius must be at least 1!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t npoints = (2*ci::r+1)*(2*ci::r+1)*(2*ci::r+1); + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*npoints; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + #pragma unroll + for (int z = 0; z < 2*ci::r+1; ++z) + #pragma unroll + for (int y = 0; y < 2*ci::r+1; ++y) + #pragma unroll + for (int x = 0; x < 
2*ci::r+1; ++x) { + uint32_t pt = z*dz + y*dy + x*dx; + /*pt0*/ *(p0++) = pt; *(p0++) = pt+sy; + /*pt1*/ *(p1++) = pt+sx; *(p1++) = pt+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Buffer constants in order for SSR use (each repeated to cover unroll) + COFA ca[npoints]; + COFA* pa = ca; + #pragma unroll + for (int z = 0; z < 2*ci::r+1; ++z) + #pragma unroll + for (int y = 0; y < 2*ci::r+1; ++y) + #pragma unroll + for (int x = 0; x < 2*ci::r+1; ++x) + *(pa++) = c::c3[z][y][x]; + __rt_sssr_cfg_write(sp::uy*sp::ux-1, 2, __RT_SSSR_REG_REPEAT); + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + form (z, lz, s::n-2*ci::r,jmpz) { + form (y, ly, s::n-2*ci::r,jmpy) { + __rt_sssr_bound_stride_2d(2, npoints, sodt, (s::n-2*ci::r+jmpx-lx-sp::ux)/jmpx, 0); + bool winit = true; + form (x, lx, s::n-2*ci::r, jmpx) { + __istc_iter_issrs((void*)&(*A[t%2])[z][y][x], (void*)i0, (void*)i1); + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)ca, 2, __RT_SSSR_REG_RPTR_1);} + asm volatile ( + // Initialize accumulators: center + "fmul.d fa0, ft2, ft0 \n" + "fmul.d fa1, ft2, ft1 \n" + "fmul.d fa2, ft2, ft0 \n" + "fmul.d fa3, ft2, ft1 \n" + // Do directionals as loop + "frep.o %[cd], 4, 3, 0b0000 \n" + "fmadd.d fa0, ft2, ft0, fa0 \n" + "fmadd.d fa1, ft2, ft1, fa1 \n" + "fmadd.d fa2, ft2, ft0, fa2 \n" + "fmadd.d fa3, ft2, ft1, fa3 \n" + // Final writeback + "fsd fa0, 0 (%[wb]) \n" + "fsd fa1, %[sx](%[wb]) \n" + "fsd fa2, %[sy](%[wb]) \n" + "fsd fa3, %[sb](%[wb]) \n" + :: [sx]"i"(8*sx), [sy]"i"(8*sy), [sb]"i"(8*sb), [cd]"r"(npoints-2), + [wb]"r"(&(*A[(t+1)%2])[z+ci::r][y+ci::r][x+ci::r]) + : "memory", "fa0", "fa1", "fa2", "fa3" + ); + } + lx = (lx + sp::px) % jmpx; + } + ly = (ly + sp::py) % jmpy; + } + lz = (lz + sp::pz) % jmpz; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} + + +// ============= +// Minimod +// ============= + +KNL istci_minimod_acoustic_iso_cd( + const int cid, + TCDM d_t (RCP u[2])[s::nz][s::ny][s::nx], + TCDM d_t (RCP 
f)[s::nz-8][s::ny-8][s::nx-8] +) { + // Assertions and IDs + static_assert(sp::uz == 1 && sp::uy == 2 && sp::ux == 2, "Axes unrolls are static!"); + static_assert(s::n % 2 == 0, "Axes must be unroll-aligned!"); + KNL_IDS_LOC(cid) + + // Define points of stencil and unroll copies + constexpr uint32_t rad = 4; + constexpr uint32_t dx = 1, dy = s::n, dz = s::n*s::n, sx = dx, sy = dy, sb = sy+sx; + constexpr uint32_t cct = rad*dz + rad*dy + rad*dx; + constexpr uint32_t nhpoints = 6*rad; + // Indices include padding on axes (do not init arrays to prevent memcpy) + constexpr uint32_t ilen = 2*nhpoints+4; + IDXA i0[ilen], i1[ilen]; + IDXA *p0 = i0, *p1 = i1; + /*cc0*/ *(p0++) = cct; *(p1++) = cct+sx; + /*cc1*/ *(p0++) = cct+sy; *(p1++) = cct+sb; + #pragma unroll + for (int j = 1; j <= rad; ++j) { + uint32_t ll=cct-j*dx, rr = cct+j*dx, bb = cct-j*dy, tt = cct+j*dy, aa = cct-j*dz, ff = cct+j*dz; + /*ll0*/ *(p0++) = ll; *(p1++) = ll+sx; + /*ll1*/ *(p0++) = ll+sy; *(p1++) = ll+sb; + /*rr0*/ *(p0++) = rr; *(p1++) = rr+sx; + /*rr1*/ *(p0++) = rr+sy; *(p1++) = rr+sb; + /*bb0*/ *(p0++) = bb; *(p1++) = bb+sx; + /*bb1*/ *(p0++) = bb+sy; *(p1++) = bb+sb; + /*tt0*/ *(p0++) = tt; *(p1++) = tt+sx; + /*tt1*/ *(p0++) = tt+sy; *(p1++) = tt+sb; + /*aa0*/ *(p0++) = aa; *(p1++) = aa+sx; + /*aa1*/ *(p0++) = aa+sy; *(p1++) = aa+sb; + /*ff0*/ *(p0++) = ff; *(p1++) = ff+sx; + /*ff1*/ *(p0++) = ff+sy; *(p1++) = ff+sb; + } + __istc_setup_issrs(__RT_SSSR_IDXSIZE_U16, ilen, ilen); + + // Use registers for coefficients + register d_t cc0 asm("f3"); + register d_t cx0 asm("f4"); + register d_t cy0 asm("f5"); + register d_t cz0 asm("f6"); + register d_t cx1 asm("f7"); + register d_t cy1 asm("f8"); + register d_t cz1 asm("f9"); + register d_t cx2 asm("f10"); + register d_t cy2 asm("f11"); + register d_t cz2 asm("f12"); + register d_t cx3 asm("f13"); + register d_t cy3 asm("f14"); + register d_t cz3 asm("f15"); + + // Preload registers + asm volatile( + "fld f13, -8(%[xp]) \n" + "fld f14, -8(%[yp]) \n" + 
"fld f15, -8(%[zp]) \n" + "fadd.d f3, f13, f14 \n" + "fld f4, 0(%[xp]) \n" + "fld f5, 0(%[yp]) \n" + "fld f6, 0(%[zp]) \n" + "fadd.d f3, f3, f15 \n" + "fld f7, 8(%[xp]) \n" + "fld f8, 8(%[yp]) \n" + "fld f9, 8(%[zp]) \n" + "fmul.d f3, f3, %[cf2]\n" + "fld f10, 16(%[xp]) \n" + "fld f11, 16(%[yp]) \n" + "fld f12, 16(%[zp]) \n" + "fld f13, 24(%[xp]) \n" + "fld f14, 24(%[yp]) \n" + "fld f15, 24(%[zp]) \n" + : "+&f"(cx0), "+&f"(cy0), "+&f"(cz0), "+&f"(cx1), "+&f"(cy1), "+&f"(cz1), + "+&f"(cx2), "+&f"(cy2), "+&f"(cz2), "+&f"(cx3), "+&f"(cy3), "+&f"(cz3), + "+&f"(cc0) + : [xp]"r"(&c::xp[1]), [yp]"r"(&c::yp[1]), [zp]"r"(&c::zp[1]), [cf2]"f"(2.0) + ); + + // introduce variable for tracking impulse offsets + uint32_t lf = cid; + + __RT_SSSR_BLOCK_BEGIN + for (int t = 0; t < st::t; t++) { + // We load last grid's center piece inside the time loop as it keeps changing + int32_t ccoffs = &(*u[(t+1)%2])[rad][rad][rad] - &(*u[t%2])[0][0][0]; + /*cc0*/ i0[ilen-2] = ccoffs; i0[ilen-1] = ccoffs+sy; + /*cc1*/ i1[ilen-2] = ccoffs+sx; i1[ilen-1] = ccoffs+sb; + form (z, lz, s::n-2*rad, jmpz) { + form (y, ly, s::n-2*rad, jmpy) { + __rt_sssr_bound_stride_3d(2, 2, sodt, 2, s::n*sodt, (s::n-2*rad+jmpx-lx-sp::ux)/jmpx, jmpx*sodt); + bool winit = true; + form (x, lx, s::n-2*rad, jmpx) { + register d_t fi0 asm("f28") = c::uffac * (*f)[z][y ][x ]; + register d_t fix asm("f29") = c::uffac * (*f)[z][y ][x+1]; + // Set up SSRs + __istc_iter_issrs((void*)&(*u[t%2])[z][y][x], (void*)i0, (void*)i1); + // Load impulses + register d_t fiy asm("f30") = c::uffac * (*f)[z][y+1][x ]; + register d_t fib asm("f31") = c::uffac * (*f)[z][y+1][x+1]; + if (winit) {winit = false; __rt_sssr_cfg_write_ptr((void*)&(*u[(t+1)%2])[z+rad][y+rad][lx+rad], 2, __RT_SSSR_REG_WPTR_2);} + asm volatile ( + // First add centerpoint + "fmadd.d f28, f3, f0, f28 \n" + "fmadd.d f29, f3, f1, f29 \n" + "fmadd.d f30, f3, f0, f30 \n" + "fmadd.d f31, f3, f1, f31 \n" + // Iterate over points (stagger coeffs) + "frep.o %[c3], 8, 3, 0b010 
\n" + "fmadd.d f28, f4, f0, f28 \n" + "fmadd.d f29, f4, f1, f29 \n" + "fmadd.d f30, f4, f0, f30 \n" + "fmadd.d f31, f4, f1, f31 \n" + "fmadd.d f28, f4, f0, f28 \n" + "fmadd.d f29, f4, f1, f29 \n" + "fmadd.d f30, f4, f0, f30 \n" + "fmadd.d f31, f4, f1, f31 \n" + "frep.o %[c7], 8, 7, 0b010 \n" + "fmadd.d f28, f8, f0, f28 \n" + "fmadd.d f29, f8, f1, f29 \n" + "fmadd.d f30, f8, f0, f30 \n" + "fmadd.d f31, f8, f1, f31 \n" + "fmadd.d f28, f8, f0, f28 \n" + "fmadd.d f29, f8, f1, f29 \n" + "fmadd.d f30, f8, f0, f30 \n" + "fmadd.d f31, f8, f1, f31 \n" + // Final subtraction and writeback + "fsub.d f2, f28, f0 \n" + "fsub.d f2, f29, f1 \n" + "fsub.d f2, f30, f0 \n" + "fsub.d f2, f31, f1 \n" + : "+&f"(cx0), "+&f"(cy0), "+&f"(cz0), "+&f"(cx1), "+&f"(cy1), "+&f"(cz1), + "+&f"(cx2), "+&f"(cy2), "+&f"(cz2), "+&f"(cx3), "+&f"(cy3), "+&f"(cz3), + "+&f"(cc0), + "+&f"(fi0), "+&f"(fix), "+&f"(fiy), "+&f"(fib) + : [c7]"r"(7), [c3]"r"(3) + : "memory" + ); + } + lx = (lx + sp::ux) % jmpx; + } + ly = (ly + sp::uy) % jmpy; + } + lz = (lz + sp::uz) % jmpz; + __rt_barrier(); + } + __RT_SSSR_BLOCK_END +} diff --git a/sw/saris/stencils/istc.par.hpp b/sw/saris/stencils/istc.par.hpp new file mode 100644 index 0000000000..37ba6fd4e3 --- /dev/null +++ b/sw/saris/stencils/istc.par.hpp @@ -0,0 +1,239 @@ +#include "istc.common.hpp" + +// =============== +// Polybench +// =============== + +KNL istcp_pb_jacobi_2d( + const int cid, + TCDM d_t (RCP A)[s::n][s::n], + TCDM d_t (RCP B)[s::n][s::n] +) { + KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (y, i, 1, s::n-1) + forpex (4, x, j, 1, s::n-1) + (*B)[i][j] = 0.2 * ((*A)[i][j] + (*A)[i][j-1] + (*A)[i][1+j] + (*A)[1+i][j] + (*A)[i-1][j]); + __rt_barrier(); + } +} + + +// ========== +// AN5D +// ========== + +KNL istcp_an5d_j2d5pt( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + KNL_IDS(cid) + // Avoid constant FP division + constexpr d_t fac = 1.0 / c::c0; + for (int t = 0; t < st::t; t++) { + forpx (y, y, 1, s::ny-1) + forpex (4, 
x, x, 1, s::nx-1) + (*A[(t+1)%2])[y][x] = fac * ( + c::ym[0] * (*A[t%2])[y-1][x ] + + c::xm[0] * (*A[t%2])[y ][x-1] + + c::cc * (*A[t%2])[y ][x ] + + c::xp[0] * (*A[t%2])[y ][x+1] + + c::yp[0] * (*A[t%2])[y+1][x ] + ); + __rt_barrier(); + } +} + + +KNL istcp_an5d_j2d9pt( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + KNL_IDS(cid) + // Avoid constant FP division + constexpr d_t fac = 1.0 / c::c0; + for (int t = 0; t < st::t; t++) { + forpx (y, y, 2, s::ny-2) + forpex (2, x, x, 2, s::nx-2) + (*A[(t+1)%2])[y][x] = fac * ( + c::ym[0] * (*A[t%2])[y-1][x ] + c::ym[1] * (*A[t%2])[y-2][x ] + + c::xm[0] * (*A[t%2])[y ][x-1] + c::xm[1] * (*A[t%2])[y ][x-2] + + c::cc * (*A[t%2])[y ][x ] + + c::xp[0] * (*A[t%2])[y ][x+1] + c::xp[1] * (*A[t%2])[y ][x+2] + + c::yp[0] * (*A[t%2])[y+1][x ] + c::yp[1] * (*A[t%2])[y+2][x ] + ); + __rt_barrier(); + } +} + + +KNL istcp_an5d_j2d9pt_gol( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + KNL_IDS(cid) + // Avoid constant FP division + constexpr d_t fac = 1.0 / c::c0; + for (int t = 0; t < st::t; t++) { + forpx (y, y, 1, s::ny-1) + forpex (2, x, x, 1, s::nx-1) { + d_t acc = 0.0; + #pragma unroll + for (int dy = -1; dy <= 1; ++dy) + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) + acc += c::c[dy+1][dx+1] * (*A[t%2])[y+dy][x+dx]; + (*A[(t+1)%2])[y][x] = fac * acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_star2dXr( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (y, y, ci::r, s::ny-ci::r) + forpx (x, x, ci::r, s::nx-ci::r) { + d_t acc = c::cc * (*A[t%2])[y][x]; + #pragma unroll + for (int dr = 0; dr < ci::r; ++dr) { + acc += c::xm[dr] * (*A[t%2])[y][x-1-dr]; + acc += c::xp[dr] * (*A[t%2])[y][x+1+dr]; + acc += c::ym[dr] * (*A[t%2])[y-1-dr][x]; + acc += c::yp[dr] * (*A[t%2])[y+1+dr][x]; + } + (*A[(t+1)%2])[y][x] = acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_box2dXr( + const int cid, + TCDM d_t (RCP A[2])[s::ny][s::nx] +) { + 
KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (y, y, ci::r, s::ny-ci::r) + forpx (x, x, ci::r, s::nx-ci::r) { + d_t acc = 0.0; + #pragma unroll + for (int dy = -ci::r; dy <= ci::r; ++dy) + #pragma unroll + for (int dx = -ci::r; dx <= ci::r; ++dx) + acc += c::c[dy+ci::r][dx+ci::r] * (*A[t%2])[y+dy][x+dx]; + (*A[(t+1)%2])[y][x] = acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_star3dXr( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] +) { + KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (z, z, ci::r, s::nz-ci::r) + forpx (y, y, ci::r, s::ny-ci::r) + forpx (x, x, ci::r, s::nx-ci::r) { + d_t acc = c::cc * (*A[t%2])[z][y][x]; + #pragma unroll + for (int dr = 0; dr < ci::r; ++dr) { + acc += c::xm[dr] * (*A[t%2])[z][y][x-1-dr]; + acc += c::xp[dr] * (*A[t%2])[z][y][x+1+dr]; + acc += c::ym[dr] * (*A[t%2])[z][y-1-dr][x]; + acc += c::yp[dr] * (*A[t%2])[z][y+1+dr][x]; + acc += c::zm[dr] * (*A[t%2])[z-1-dr][y][x]; + acc += c::zp[dr] * (*A[t%2])[z+1+dr][y][x]; + } + (*A[(t+1)%2])[z][y][x] = acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_box3dXr( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] +) { + KNL_IDS(cid) + for (int t = 0; t < st::t; t++) { + forpx (z, z, ci::r, s::nz-ci::r) + forpx (y, y, ci::r, s::ny-ci::r) + forpx (x, x, ci::r, s::nx-ci::r) { + d_t acc = 0.0; + for (int dz = -ci::r; dz <= ci::r; ++dz) + #pragma unroll + for (int dy = -ci::r; dy <= ci::r; ++dy) + #pragma unroll + for (int dx = -ci::r; dx <= ci::r; ++dx) + acc += c::c3[dz+ci::r][dy+ci::r][dx+ci::r] * (*A[t%2])[z+dz][y+dy][x+dx]; + (*A[(t+1)%2])[z][y][x] = acc; + } + __rt_barrier(); + } +} + + +KNL istcp_an5d_j3d27pt( + const int cid, + TCDM d_t (RCP A[2])[s::nz][s::ny][s::nx] +) { + KNL_IDS(cid) + // Avoid constant FP division + constexpr d_t fac = 1.0 / c::c0; + for (int t = 0; t < st::t; t++) { + forpx (z, z, 1, s::nz-1) + forpx (y, y, 1, s::ny-1) + forpx (x, x, 1, s::nx-1) { + d_t acc = 0.0; + for (int dz = -1; dz <= 1; ++dz) + #pragma unroll + 
for (int dy = -1; dy <= 1; ++dy) + #pragma unroll + for (int dx = -1; dx <= 1; ++dx) + acc += c::c3[dz+1][dy+1][dx+1] * (*A[t%2])[z+dz][y+dy][x+dx]; + (*A[(t+1)%2])[z][y][x] = fac * acc; + } + __rt_barrier(); + } +} + +// ============= +// Minimod +// ============= + +KNL istcp_minimod_acoustic_iso_cd( + const int cid, + TCDM d_t (RCP u[2])[s::nz][s::ny][s::nx], + TCDM d_t (RCP f)[s::nz-8][s::ny-8][s::nx-8] +) { + KNL_IDS(cid) + constexpr uint32_t rad = 4; + // Compute coefficient of center point + constexpr float cc = 2 * (c::xp[0] + c::yp[0] + c::zp[0]); + for (int t = 0; t < st::t; t++) { + forpx (z, z, rad, s::nz-rad) + forpx (y, y, rad, s::ny-rad) + forpx (x, x, rad, s::nx-rad) { + // Initialize with incorporated impulse (has optional factor) + d_t lapl = c::uffac * (*f)[z-rad][y-rad][x-rad]; + // Compute Laplacian + lapl += cc * (*u[t%2])[z][y][x]; + for (int m = 1; m <= rad; ++m) + lapl += c::xp[m] * ((*u[t%2])[z][y][x-m] + (*u[t%2])[z][y][x+m]) + + c::yp[m] * ((*u[t%2])[z][y-m][x] + (*u[t%2])[z][y+m][x]) + + c::zp[m] * ((*u[t%2])[z-m][y][x] + (*u[t%2])[z+m][y][x]); + (*u[(t+1)%2])[z][y][x] = lapl - (*u[(t+1)%2])[z][y][x]; + } + __rt_barrier(); + } +} diff --git a/sw/saris/util/eval.cpp.tpl b/sw/saris/util/eval.cpp.tpl new file mode 100644 index 0000000000..ad22b628ba --- /dev/null +++ b/sw/saris/util/eval.cpp.tpl @@ -0,0 +1,55 @@ +#include "runtime.hpp" +#include "istc.par.hpp" +#include "istc.issr.hpp" + +${datadecls} +${bundledecls} + +${ctgrids} + +${ciparams} + +TCDMDECL volatile uint32_t err_sema = 0; + +EXTERN_C int smain(uint32_t core_id, uint32_t core_num, void* tcdm_start, void* tcdm_end) { + + // Kick DMCC + if (core_id == core_num-1) { + __rt_barrier(); + +% for i in range(nbarriers): + // Kernel ${i} +${indent(dma_transfers, " "*8)} + __rt_barrier(); +% endfor + goto past_knl; + } + + __rt_barrier(); + __rt_get_timer(); +% for k in kernels: + ${k[1]}; + __rt_get_timer(); +% endfor + +past_knl: +% for name, touch in touches.items(): + if (core_id 
== 0) printf("touching `${name}`\n"); + __istc_touch_grid( + core_id, core_num, ${touch['stride']}, + ${touch['ptr']}, ${touch['len']}, &err_sema + ); +% endfor +% for i, check in enumerate(checks): + if (core_id == 0) printf("Performing check ${i}\n"); + __istc_cmp_grids( + core_id, core_num, ${check['stride']}, + ${check['a']}, ${check['b']}, ${check['len']}, ${check['eps']}, + &err_sema + ); +% endfor + + return err_sema; +} + +${datainits} diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py new file mode 100644 index 0000000000..4af67d6e74 --- /dev/null +++ b/sw/saris/util/evalgen.py @@ -0,0 +1,312 @@ +import sys +import json +import numpy as np +from textwrap import indent +from mako.template import Template + + +CHECK_DEF_STRIDE = 17 +CHECK_DEF_EPS = 1e-7 +ELEMTYPE = 'double' +ELEMS_PER_ROW = 4 + +# Keep these dimensions aligned with code headers +GRID_DIMS = { + 1: { 's': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192 }, + 2: { 's': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128 }, + 3: { 's': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32 }, +} + +CSTRUCT_FMT = 'struct TCDMSPC {prname} {{\n{body}\n}};\n{dtype} {decls};' + +CTSTRUCT_FTYPE = 'TCDM PRMD' +CTSTRUCT_DTYPE = 'TCDM PRMXD' + +CTSTRUCT_DEFAULT_GRIDS = { + 'xm': {'seed': 1513, 'dims': [8]}, + 'xp': {'seed': 1514, 'dims': [8]}, + 'ym': {'seed': 1515, 'dims': [8]}, + 'yp': {'seed': 1516, 'dims': [8]}, + 'zm': {'seed': 1517, 'dims': [8]}, + 'zp': {'seed': 1518, 'dims': [8]}, + 'cc': {'seed': 1519}, + 'c0': {'seed': 1520}, + 'uffac': {'seed': 1521}, + 'c': {'seed': 1522, 'dims': [6, 6]}, + 'c3': {'seed': 1523, 'dims': [3, 3, 3]} +} + +CISTRUCT_FTYPE = 'TCDM PRM' +CISTRUCT_DTYPE = 'TCDM PRMX' + + +def set_seed(seed: int = None): + if seed is not None: + np.random.seed(seed) + + +def resolve_dim(dim: str) -> int: + try: + ret = int(dim) + except ValueError: + # If the string does not match our expectations, this will throw accordingly + return 
GRID_DIMS[int(dim[0])][dim[1:]] + if ret <= 0: + raise ValueError(f'Dimensions must be bigger than 1 (got {ret})') + return ret + + +def resolve_dims(grid_args: list) -> list: + return [resolve_dim(dim) for dim in grid_args] + + +def gen_subscripts(int_dims: list) -> str: + return "".join(f'[{d}]' for d in int_dims) + + +def resolve_check(check: dict, grids: dict): + # Set defaults as needed + if 'eps' not in check: + check['eps'] = CHECK_DEF_EPS + if 'stride' not in check: + check['stride'] = CHECK_DEF_STRIDE + # Resolve grids + for grid in ('a', 'b'): + # If either comparison reference is a known grid, resolve it and adopt its length + gname = check[grid] + if gname in grids: + dims = resolve_dims(grids[gname]['dims']) + check[grid] = f'&{gname}' + '[0]'*len(dims) + tgt_len = np.product(dims) + if 'len' in check: + assert check['len'] == tgt_len, \ + f'Mismatching grid check lengths: {tgt_len} ({grids[gname]}) vs {check["len"]}' + else: + check['len'] = tgt_len + # Make sure we have a length now + assert 'len' in check, f'Could not resolve length for check {check}' + + +def resolve_touches(grids: dict, stride: int = CHECK_DEF_STRIDE) -> dict: + ret = {} + for name, grid in grids.items(): + ret[name] = {'stride': stride} + # Resolve grid + dims = resolve_dims(grid['dims']) + ret[name]['ptr'] = f'&{name}' + '[0]'*len(dims) + ret[name]['len'] = np.product(dims) + return ret + + +# Handles one level of nested array initialization. 
+def generate_array_level(int_dims: list, zero, pos: int = 0) -> str: + # Handle degenerate scalar case + if (len(int_dims) == 0): + return str(np.random.normal(size=1)[0] if not zero else 0.0) + elif pos == len(int_dims)-1: + rand_doubles = np.random.normal(size=int_dims[-1]) if not zero else np.zeros(shape=int_dims[-1]) + elems = [str(d) for d in rand_doubles] + elems_fmt = ",\n".join([", ".join(elems[i:i + ELEMS_PER_ROW]) + for i in range(0, len(elems), ELEMS_PER_ROW)]) + else: + elems = [generate_array_level(int_dims, zero, pos+1) for _ in range(int_dims[pos])] + elems_fmt = ', '.join(elems) + return f'{{\n{indent(elems_fmt, " " * 4*(pos+1))}\n}}' + + +# Returns declaration and initialization separately +def generate_grids(grids: dict) -> (str, str): + decls = [] + inits = [] + for name, args in grids.items(): + # First argument provides generation seed + set_seed(args['seed']) + int_dims = resolve_dims(args['dims']) + subscripts = gen_subscripts(int_dims) + attrs = (args['attrs'] + ' ') if 'attrs' in args else '' + decls.append(f'extern __attribute__((visibility("default"))) {attrs}{ELEMTYPE} {name}{subscripts};') + inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};') + return '\n'.join(decls), '\n'.join(inits) + + +# Returns the instantiation of a parameter static class +def generate_ctstruct(grids: dict, prname = 'ct') -> str: + body = [] + decls = [] + for name, args in grids.items(): + # First argument provides generation seed + set_seed(args['seed']) + int_dims = resolve_dims(args['dims']) if 'dims' in args else [] + subscripts = gen_subscripts(int_dims) + body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};') + decls.append(f'{prname}::{name}{subscripts}') + return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CTSTRUCT_DTYPE, decls=", ".join(decls)) + + +# Returns the instantiation of a parameter static class +def 
generate_cistruct(params: dict, prname = 'ci') -> str: + body = [] + decls = [] + for lval, rval in params.items(): + body.append(f'{CISTRUCT_FTYPE} {lval} = {rval};') + decls.append(f'{prname}::{lval}') + return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CISTRUCT_DTYPE, decls=", ".join(decls)) + + +# Returns declaration and initialization separately +def generate_bundles(bundles: dict, grids: dict) -> str: + decls = [] + for name, grid_names in bundles.items(): + int_dims = resolve_dims(grids[grid_names[0]]['dims']) + if any(int_dims != resolve_dims(grids[g]['dims']) for g in grid_names[1:]): + raise ValueError(f'Bundle {name} has mismatching grid dimensions') + attrs = grids[grid_names[0]]['attrs'] + if any(attrs != grids[g]['attrs'] for g in grid_names[1:]): + raise ValueError(f'Bundle {name} has mismatching attributes') + attrs = (attrs + ' ') if attrs else '' + decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}]){gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};') + return '\n'.join(decls) + + +# Returns a code snippet performing a DMA out transfer +def generate_dma_out(dst_grid: tuple, src_grid: tuple, radius: int) -> str: + ndim = len(dst_grid['dims']) + assert ndim == 3 or ndim == 2, 'Only 2D and 3D grids supported' + + dst_dims = resolve_dims(dst_grid['dims']) + src_dims = resolve_dims(src_grid['dims']) + + args = [] + subscripts = f'[{radius}][{radius}]' + if ndim == 3: + subscripts = f'[{radius} + i]{subscripts}' + args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst + args.append(f'(void *)&({src_grid["uid"]}{subscripts})') # src + args.append(f'{src_dims[0] - radius * 2} * sizeof(double)') # size + args.append(f'{src_dims[0]} * sizeof(double)') # src_stride + args.append(f'{dst_dims[0]} * sizeof(double)') # dst_stride + args.append(f'{src_dims[1] - radius * 2}') # repeat + args = ',\n'.join(args) + + dma_call = f'__rt_dma_start_2d(\n{indent(args, " "*4)}\n);' + dma_transfer = 
f'{dma_call}\n' + + if ndim == 3: + loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' + return loop + else: + return dma_transfer + + +# Returns a code snippet performing a DMA in transfer +def generate_dma_in(dst_grid: tuple, src_grid: tuple, radius: int) -> str: + ndim = len(dst_grid['dims']) + assert ndim == 3 or ndim == 2, 'Only 2D and 3D grids supported' + + dst_dims = resolve_dims(dst_grid['dims']) + src_dims = resolve_dims(src_grid['dims']) + + args = [] + subscripts = f'[0][0]' + if ndim == 3: + subscripts = f'[i]{subscripts}' + args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst + args.append(f'(void *)&({src_grid["uid"]}{subscripts})') # src + args.append(f'{dst_dims[0]} * sizeof(double)') # size + args.append(f'{src_dims[0]} * sizeof(double)') # src_stride + args.append(f'{dst_dims[0]} * sizeof(double)') # dst_stride + args.append(f'{dst_dims[1]}') # repeat + args = ',\n'.join(args) + + dma_call = f'__rt_dma_start_2d(\n{indent(args, " "*4)}\n);' + dma_transfer = f'{dma_call}\n' + + if ndim == 3: + loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' + return loop + else: + return dma_transfer + + +# Returns a grid dictionary from the grids dictionary, +# where the key in the grids dictionary is appended to the value +# as the 'uid' field. 
+def get_grid(grids: dict, grid_uid: str) -> tuple: + grid = grids[grid_uid] + grid['uid'] = grid_uid + return grid + + +def resolve_dma_transfers(transfers: list, radius: int) -> list: + # Uniformize single transfer and multiple transfer cases + if not isinstance(transfers[0], list): + transfers = [transfers] + # Expand bidirectional transfers into unidirectional transfers + unidir_transfers = [] + for transfer in transfers: + if len(transfer) < 3: + unidir_transfers.append([*transfer, "in"]) + unidir_transfers.append([*transfer, "out"]) + else: + unidir_transfers.append(transfer) + # Add default radius if absent + for transfer in unidir_transfers: + if len(transfer) < 4: + transfer.append(radius) + return unidir_transfers + + +# Returns a code snippet performing DMA transfers +def generate_dma_transfers(grids: dict, transfers: list) -> str: + s = '' + for transfer in transfers: + l1_grid_name, l3_grid_name, direction, radius = transfer + l1_grid = get_grid(grids, l1_grid_name) + l3_grid = get_grid(grids, l3_grid_name) + if direction == 'out': + s += generate_dma_out(l3_grid, l1_grid, radius) + elif direction == 'in': + s += generate_dma_in(l1_grid, l3_grid, radius) + else: + raise ValueError() + s += '\n__rt_dma_wait_all();' + return s + + +def main(cfg_file: str, tpl_file: str, program: str): + # Load programs to generate from config + with open(cfg_file) as f: + progs = json.load(f) + # Generate code for test program according to its config entry + cfg = progs[program] + grids = cfg['grids'] + cfg['datadecls'], cfg['datainits'] = generate_grids(grids) + cfg['bundledecls'] = "" + if 'bundles' in cfg: + cfg['bundledecls'] = generate_bundles(cfg['bundles'], grids) + ctgrids = CTSTRUCT_DEFAULT_GRIDS; + if 'ctgrids' in cfg: + ctgrids.update(cfg['ctgrids']) + cfg['ctgrids'] = generate_ctstruct(ctgrids) + cfg['ciparams'] = "" + if 'params' in cfg: + cfg['ciparams'] = generate_cistruct(cfg['params']) + if 'checks' not in cfg: + cfg['checks'] = [] + for check in 
cfg['checks']: + resolve_check(check, grids) + cfg['touches'] = {} + if 'touch' in cfg: + touches = {grid_name: grids[grid_name] for grid_name in cfg['touch']} + cfg['touches'] = resolve_touches(touches) + cfg['dma_transfers'] = '' + if 'dma' in cfg: + transfers = resolve_dma_transfers(cfg['dma'], cfg['radius']) + cfg['dma_transfers'] = generate_dma_transfers(grids, transfers) + cfg["nbarriers"] = sum(k[0] for k in cfg['kernels']) + cfg['indent'] = indent + print(Template(filename=tpl_file).render(**cfg)) + + +if __name__ == '__main__': + main(*sys.argv[1:]) From e73ef430ffa8355c0d9877ec8c2cd14b7d77dbbb Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Tue, 2 Apr 2024 16:38:47 +0200 Subject: [PATCH 04/10] sw/saris: Fix license headers --- sw/saris/runtime/crt0.S | 4 ++++ sw/saris/runtime/dma.h | 4 ++++ sw/saris/runtime/link.ld | 4 ++++ sw/saris/runtime/runtime.h | 4 ++++ sw/saris/runtime/runtime.hpp | 4 ++++ sw/saris/runtime/sssr.h | 4 ++++ sw/saris/stencils/istc.common.hpp | 4 ++++ sw/saris/stencils/istc.issr.hpp | 4 ++++ sw/saris/stencils/istc.par.hpp | 4 ++++ sw/saris/util/eval.cpp.tpl | 4 ++++ sw/saris/util/evalgen.py | 5 +++++ 11 files changed, 45 insertions(+) diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S index 79efb0cbbe..96efe9b49b 100644 --- a/sw/saris/runtime/crt0.S +++ b/sw/saris/runtime/crt0.S @@ -1,3 +1,7 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + # HTIF sections .pushsection .htif,"aw",@progbits; .align 6; .global tohost; tohost: .dword 0; diff --git a/sw/saris/runtime/dma.h b/sw/saris/runtime/dma.h index 80956b0f73..5a664b0ce3 100644 --- a/sw/saris/runtime/dma.h +++ b/sw/saris/runtime/dma.h @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/sw/saris/runtime/link.ld b/sw/saris/runtime/link.ld index 5788547bdd..13fc1570f9 100644 --- a/sw/saris/runtime/link.ld +++ b/sw/saris/runtime/link.ld @@ -1,3 +1,7 @@ +/* Copyright 2024 ETH Zurich and University of Bologna. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: SHL-0.51 */ + OUTPUT_ARCH( "riscv" ) ENTRY(_start) diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h index 883bacb2ae..414fa9e394 100644 --- a/sw/saris/runtime/runtime.h +++ b/sw/saris/runtime/runtime.h @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #pragma once #include diff --git a/sw/saris/runtime/runtime.hpp b/sw/saris/runtime/runtime.hpp index df501ff20e..b9a60e564a 100644 --- a/sw/saris/runtime/runtime.hpp +++ b/sw/saris/runtime/runtime.hpp @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #pragma once // C linkage macros diff --git a/sw/saris/runtime/sssr.h b/sw/saris/runtime/sssr.h index 171ccb454f..78fec8f366 100644 --- a/sw/saris/runtime/sssr.h +++ b/sw/saris/runtime/sssr.h @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #pragma once // Registers diff --git a/sw/saris/stencils/istc.common.hpp b/sw/saris/stencils/istc.common.hpp index 042005a741..e005e39ac7 100644 --- a/sw/saris/stencils/istc.common.hpp +++ b/sw/saris/stencils/istc.common.hpp @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + #include #include #include diff --git a/sw/saris/stencils/istc.issr.hpp b/sw/saris/stencils/istc.issr.hpp index c74d76b4dc..d81614e36c 100644 --- a/sw/saris/stencils/istc.issr.hpp +++ b/sw/saris/stencils/istc.issr.hpp @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #include "istc.common.hpp" // =============== diff --git a/sw/saris/stencils/istc.par.hpp b/sw/saris/stencils/istc.par.hpp index 37ba6fd4e3..26a042d05f 100644 --- a/sw/saris/stencils/istc.par.hpp +++ b/sw/saris/stencils/istc.par.hpp @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #include "istc.common.hpp" // =============== diff --git a/sw/saris/util/eval.cpp.tpl b/sw/saris/util/eval.cpp.tpl index ad22b628ba..edd26e6c5b 100644 --- a/sw/saris/util/eval.cpp.tpl +++ b/sw/saris/util/eval.cpp.tpl @@ -1,3 +1,7 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + #include "runtime.hpp" #include "istc.par.hpp" #include "istc.issr.hpp" diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py index 4af67d6e74..df48f00f3d 100644 --- a/sw/saris/util/evalgen.py +++ b/sw/saris/util/evalgen.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + import sys import json import numpy as np From d41cd4e595450b77a4743861a523b8ae246c133b Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Tue, 2 Apr 2024 16:54:27 +0200 Subject: [PATCH 05/10] sw/saris: Fix python lint --- sw/saris/util/evalgen.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/sw/saris/util/evalgen.py b/sw/saris/util/evalgen.py index df48f00f3d..25c2c40b2b 100644 --- a/sw/saris/util/evalgen.py +++ b/sw/saris/util/evalgen.py @@ -17,9 +17,9 @@ # Keep these dimensions aligned with code headers GRID_DIMS = { - 1: { 's': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192 }, - 2: { 's': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128 }, - 3: { 's': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32 }, + 1: {'s': 1000, 'sm': 1728, 'm': 2744, 'ml': 4096, 'l': 5832, 'xl': 8192}, + 2: {'s': 32, 'sm': 42, 'm': 52, 'ml': 64, 'l': 76, 'xl': 128}, + 3: {'s': 10, 'sm': 12, 'm': 14, 'ml': 16, 'l': 18, 'xl': 32}, } CSTRUCT_FMT = 'struct TCDMSPC {prname} {{\n{body}\n}};\n{dtype} {decls};' @@ -85,7 +85,8 @@ def resolve_check(check: dict, grids: dict): tgt_len = np.product(dims) if 'len' in check: assert check['len'] == tgt_len, \ - f'Mismatching grid check lengths: {tgt_len} ({grids[gname]}) vs {check["len"]}' + 'Mismatching grid check lengths:' \ + f'{tgt_len} ({grids[gname]}) vs {check["len"]}' else: check['len'] = tgt_len # Make sure we have a length now @@ -109,7 +110,8 @@ def generate_array_level(int_dims: list, zero, pos: int = 0) -> str: if (len(int_dims) == 0): return str(np.random.normal(size=1)[0] if not zero else 0.0) elif pos == len(int_dims)-1: - rand_doubles = np.random.normal(size=int_dims[-1]) if not zero else np.zeros(shape=int_dims[-1]) + rand_doubles = np.random.normal(size=int_dims[-1]) if \ + not zero else np.zeros(shape=int_dims[-1]) elems = [str(d) for d in rand_doubles] elems_fmt = ",\n".join([", ".join(elems[i:i + 
ELEMS_PER_ROW]) for i in range(0, len(elems), ELEMS_PER_ROW)]) @@ -129,13 +131,15 @@ def generate_grids(grids: dict) -> (str, str): int_dims = resolve_dims(args['dims']) subscripts = gen_subscripts(int_dims) attrs = (args['attrs'] + ' ') if 'attrs' in args else '' - decls.append(f'extern __attribute__((visibility("default"))) {attrs}{ELEMTYPE} {name}{subscripts};') - inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};') + decls.append('extern __attribute__((visibility("default")))' + + f' {attrs}{ELEMTYPE} {name}{subscripts};') + inits.append(f'{attrs}{ELEMTYPE} {name}{subscripts} =' + + f'{generate_array_level(int_dims, args["seed"] == 0)};') return '\n'.join(decls), '\n'.join(inits) # Returns the instantiation of a parameter static class -def generate_ctstruct(grids: dict, prname = 'ct') -> str: +def generate_ctstruct(grids: dict, prname='ct') -> str: body = [] decls = [] for name, args in grids.items(): @@ -143,19 +147,22 @@ def generate_ctstruct(grids: dict, prname = 'ct') -> str: set_seed(args['seed']) int_dims = resolve_dims(args['dims']) if 'dims' in args else [] subscripts = gen_subscripts(int_dims) - body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = {generate_array_level(int_dims, args["seed"] == 0)};') + body.append(f'{CTSTRUCT_FTYPE} {name}{subscripts} = ' + + f'{generate_array_level(int_dims, args["seed"] == 0)};') decls.append(f'{prname}::{name}{subscripts}') - return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CTSTRUCT_DTYPE, decls=", ".join(decls)) + return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), + dtype=CTSTRUCT_DTYPE, decls=", ".join(decls)) # Returns the instantiation of a parameter static class -def generate_cistruct(params: dict, prname = 'ci') -> str: +def generate_cistruct(params: dict, prname='ci') -> str: body = [] decls = [] for lval, rval in params.items(): body.append(f'{CISTRUCT_FTYPE} {lval} = {rval};') 
decls.append(f'{prname}::{lval}') - return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), dtype=CISTRUCT_DTYPE, decls=", ".join(decls)) + return CSTRUCT_FMT.format(prname=prname, body=indent("\n".join(body), " "*4), + dtype=CISTRUCT_DTYPE, decls=", ".join(decls)) # Returns declaration and initialization separately @@ -169,7 +176,8 @@ def generate_bundles(bundles: dict, grids: dict) -> str: if any(attrs != grids[g]['attrs'] for g in grid_names[1:]): raise ValueError(f'Bundle {name} has mismatching attributes') attrs = (attrs + ' ') if attrs else '' - decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}]){gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};') + decls.append(f'{attrs}{ELEMTYPE} (*{name}[{len(grid_names)}])' + + f'{gen_subscripts(int_dims)} = {{{", ".join("&" + g for g in grid_names)}}};') return '\n'.join(decls) @@ -197,7 +205,8 @@ def generate_dma_out(dst_grid: tuple, src_grid: tuple, radius: int) -> str: dma_transfer = f'{dma_call}\n' if ndim == 3: - loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' + loop = '#pragma clang loop unroll(disable)\nfor (int i = 0; i < ' + \ + f'{src_dims[2] - radius * 2}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' return loop else: return dma_transfer @@ -212,7 +221,7 @@ def generate_dma_in(dst_grid: tuple, src_grid: tuple, radius: int) -> str: src_dims = resolve_dims(src_grid['dims']) args = [] - subscripts = f'[0][0]' + subscripts = '[0][0]' if ndim == 3: subscripts = f'[i]{subscripts}' args.append(f'(void *)&({dst_grid["uid"]}{subscripts})') # dst @@ -227,7 +236,8 @@ def generate_dma_in(dst_grid: tuple, src_grid: tuple, radius: int) -> str: dma_transfer = f'{dma_call}\n' if ndim == 3: - loop = f'#pragma clang loop unroll(disable)\nfor (int i = 0; i < {dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' + loop = '#pragma clang loop unroll(disable)\nfor (int i = 0; i < 
' + \ + f'{dst_dims[2]}; i++) {{\n{indent(dma_transfer, " "*4)}\n}}\n' return loop else: return dma_transfer @@ -289,7 +299,7 @@ def main(cfg_file: str, tpl_file: str, program: str): cfg['bundledecls'] = "" if 'bundles' in cfg: cfg['bundledecls'] = generate_bundles(cfg['bundles'], grids) - ctgrids = CTSTRUCT_DEFAULT_GRIDS; + ctgrids = CTSTRUCT_DEFAULT_GRIDS if 'ctgrids' in cfg: ctgrids.update(cfg['ctgrids']) cfg['ctgrids'] = generate_ctstruct(ctgrids) From a62d6e41ef2dbbcbe7950a96cd517978c87b0b08 Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Tue, 2 Apr 2024 16:59:37 +0200 Subject: [PATCH 06/10] lint: Do not C++ lint SARIS sources --- .github/workflows/lint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 6c4f91184b..65159afabd 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -129,6 +129,7 @@ jobs: - uses: actions/checkout@v3 - uses: DoozyX/clang-format-lint-action@v0.16.2 with: + exclude: './sw/saris' clangFormatVersion: 10 ###################### From 31aa679125f0ea9a5180cbae5bb1dfec7621291e Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Tue, 2 Apr 2024 18:00:11 +0200 Subject: [PATCH 07/10] sw/saris: Remove stub LLVM from makefile --- sw/saris/Makefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sw/saris/Makefile b/sw/saris/Makefile index e9bfb82500..bb1033cd55 100644 --- a/sw/saris/Makefile +++ b/sw/saris/Makefile @@ -11,9 +11,13 @@ all: # Environment # ############### -# NOTE: This needs to be a specific revision of PULP RISCV LLVM 15: -# TODO: add commit link here -LLVM_BINROOT ?= /home/paulsc/dev/llvm-ssr/llvm-iis/install/bin +# NOTE: the LLVM_BINROOT environment variable must point to a specific revision of PULP RISCV +# LLVM 15 (see README.md). After compilation, you can set LLVM_BINROOT in your environment, this +# makefile, or pass it on invocation of `make`. 
+ifndef LLVM_BINROOT
+$(error LLVM_BINROOT is not set; please compile the SARIS version of LLVM 15 (see README.md) and set LLVM_BINROOT to its binary location.)
+endif
+
 PYTHON3 ?= python3
 SARISDIR ?= .

From 2050a2aad569c8046c79be8a9aef5053b5597d69 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 18:00:32 +0200
Subject: [PATCH 08/10] sw/saris: Add README.md

---
 sw/saris/README.md | 50 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/sw/saris/README.md b/sw/saris/README.md
index 464090415c..29dd472152 100644
--- a/sw/saris/README.md
+++ b/sw/saris/README.md
@@ -1 +1,49 @@
-# TODO
+# SARIS Stencil Kernels
+
+This directory contains the baseline- and SSSR-accelerated Snitch cluster stencil kernels used in the evaluation section of the paper _"SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers"_. In our paper, we describe how indirect stream register architectures such as SSSRs can significantly accelerate stencil codes.
+
+If you use our code or compare against our work, please cite us:
+
+```
+TODO
+```
+
+> [!IMPORTANT]
+> - Unlike other software in this repository, compiling this code requires a **custom version of the LLVM 15 toolchain** with some extensions and improvements. The source code for this LLVM fork can be found [here](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0).
+> - The generated example programs are only intended to be used **in RTL simulation of an SSSR-extended cluster**, using the custom cluster configuration `cfg/sssr.hjson`.
+
+## Directory Structure
+
+* `stencils/`: Baseline (`istc.par.hpp`) and SARIS-accelerated (`istc.issr.hpp`) stencil codes.
+* `runtime/`: Additional runtime code and linking configuration needed for compilation.
+* `util/`: Evaluation program generator supporting different grid sizes and kernel calls.
+* `eval.json`: Configuration for test program generator.
+
+## Compile Evaluation Programs
+
+Before you can compile test problems, you need the [SARIS LLVM 15 toolchain](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0) along with `newlib` and `compiler-rt`. The required build steps are outlined [here](https://github.com/pulp-platform/llvm-toolchain-cd/blob/main/README.md).
+
+Then, you can build the test programs specified in `eval.json` by running:
+
+```
+make LLVM_BINROOT=<llvm_install_path>/bin all
+```
+
+By default, `eval.json` specifies RV32G and SSSR-accelerated test programs for all included stencils as specified in our paper. Binaries are generated in `bin/` and disassembled program dumps in `dump/`.
+
+
+## Run Evaluation Programs
+
+Evaluation programs can only be run in RTL simulation of a Snitch cluster using the configuration `cfg/sssr.hjson`. For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`:
+
+```
+make CFG_OVERRIDE=cfg/sssr.hjson bin/snitch_cluster.vsim
+```
+
+Then, the built evaluation programs can be run on this simulation setup as usual, for example:
+
+```
+bin/snitch_cluster.vsim ../../sw/saris/bin/istc.pb_jacobi_2d_ml_issr.elf
+```
+
+Performance metrics can be analyzed using the annotating Snitch tracer (`make traces`). In the default evaluation programs, the section of interest is section 2.
From ab4fe304366da849eca8ba1c27c332c817822913 Mon Sep 17 00:00:00 2001
From: Paul Scheffler
Date: Tue, 2 Apr 2024 19:07:23 +0200
Subject: [PATCH 09/10] sw/saris: Initialize putchar buffer, fix F extension
 skip

---
 sw/saris/runtime/crt0.S    | 18 +++++++++++++-----
 sw/saris/runtime/runtime.h |  2 +-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/sw/saris/runtime/crt0.S b/sw/saris/runtime/crt0.S
index 96efe9b49b..7b3b8644cc 100644
--- a/sw/saris/runtime/crt0.S
+++ b/sw/saris/runtime/crt0.S
@@ -30,10 +30,14 @@ _start:
 slli t0, a0, 3
 sub sp, sp, t0

- # check if the core has the F-extension
- csrr t0, misa
- andi t0, t0, (1 << 5)
- beqz t0, _clr_ireg
+.globl _putcb
+_init_putcb:
+ la t0, _putcb
+ # Initialize putchar buffer size of each core to 0
+ slli t1, a0, 10
+ add t0, t0, t1
+ sw zero, 0(t0)
+ sw zero, 4(t0)

 _skip_dmcc_work:
 # Skip the coming two steps unless we are the DMA core
@@ -78,6 +82,11 @@ _dmcc_work_sync:
 # Synchronize cores so data is ready
 csrr x0, 0x7C2

+ # check if the core has the F-extension
+ csrr t0, misa
+ andi t0, t0, (1 << 5)
+ beqz t0, _clr_ireg
+
 # Reset float regs if present
 _clr_freg:
 fcvt.d.w f0, x0
@@ -158,6 +167,5 @@ _done:
 wfi

-.globl _putcb
 .section .data._putcb
 _putcb:
diff --git a/sw/saris/runtime/runtime.h b/sw/saris/runtime/runtime.h
index 414fa9e394..072cfecbc0 100644
--- a/sw/saris/runtime/runtime.h
+++ b/sw/saris/runtime/runtime.h
@@ -32,7 +32,7 @@ static inline volatile uint32_t __rt_get_hartid() {
 }

 // Rudimentary string buffer for putchar calls.
extern uint32_t _putcb; -#define PUTC_BUFFER_LEN (1024 - sizeof(size_t)) +#define PUTC_BUFFER_LEN (1024 - sizeof(size_t) - 8*sizeof(uint64_t)) typedef struct { size_t size; From ea40640bd389721009a76fe4a19977dff68e1923 Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Fri, 5 Apr 2024 18:11:37 +0200 Subject: [PATCH 10/10] sw/saris: Switch to, adapt default config, add bib placeholders --- README.md | 18 +++ docs/publications.md | 18 +++ sw/saris/README.md | 16 ++- target/snitch_cluster/cfg/default.hjson | 41 +++++-- target/snitch_cluster/cfg/sssr.hjson | 153 ------------------------ 5 files changed, 81 insertions(+), 165 deletions(-) delete mode 100644 target/snitch_cluster/cfg/sssr.hjson diff --git a/README.md b/README.md index 1f7b6459cd..4280d47438 100644 --- a/README.md +++ b/README.md @@ -161,3 +161,21 @@ If you use the Snitch cluster or its extensions in your work, you can cite us: ```

+ +
+SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers +

+ +``` +@misc{scheffler2024saris, + title={SARIS: Accelerating Stencil Computations on Energy-Efficient + RISC-V Compute Clusters with Indirect Stream Registers}, + author={Paul Scheffler and Luca Colagrande and Luca Benini}, + year={2024}, + eprint={}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} +``` + +

diff --git a/docs/publications.md b/docs/publications.md index e4c86b4c6d..2395b70c73 100644 --- a/docs/publications.md +++ b/docs/publications.md @@ -118,4 +118,22 @@ If you use the Snitch cluster or its extensions in your work, you can cite us:

+
+SARIS: Accelerating Stencil Computations on Energy-Efficient RISC-V Compute Clusters with Indirect Stream Registers +

+ +``` +@misc{scheffler2024saris, + title={SARIS: Accelerating Stencil Computations on Energy-Efficient + RISC-V Compute Clusters with Indirect Stream Registers}, + author={Paul Scheffler and Luca Colagrande and Luca Benini}, + year={2024}, + eprint={}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} +``` + +

+ diff --git a/sw/saris/README.md b/sw/saris/README.md index 29dd472152..2da223df0d 100644 --- a/sw/saris/README.md +++ b/sw/saris/README.md @@ -5,12 +5,20 @@ This directory contains the baseline- and SSSR-accelerated Snitch cluster stenci If you use our code or compare against our work, please cite us: ``` -TODO +@misc{scheffler2024saris, + title={SARIS: Accelerating Stencil Computations on Energy-Efficient + RISC-V Compute Clusters with Indirect Stream Registers}, + author={Paul Scheffler and Luca Colagrande and Luca Benini}, + year={2024}, + eprint={}, + archivePrefix={arXiv}, + primaryClass={cs.MS} +} ``` > [!IMPORTANT] > - Unlike other software in this repository, compiling this code requires a **custom version of the LLVM 15 toolchain** with some extensions and improvements. The source code for this LLVM fork can be found [here](https://github.com/pulp-platform/llvm-project/tree/15.0.0-saris-0.1.0). -> - The generated example programs are only intended to be used **in RTL simulation of an SSSR-extended cluster**, using the custom cluster configuration `cfg/sssr.json`. +> - The generated example programs are only intended to be used **in RTL simulation of a default, SSSR-extended cluster**, using the cluster configuration `cfg/default.hjson`. ## Directory Structure @@ -34,10 +42,10 @@ By default, `eval.json` specifies RV32G and SSSR-accelerated test programs for a ## Run Evaluation Programs -Evaluation programs can only be run in RTL simulation of a Snitch cluster using the configuration `cfg/sssr.json`. For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`: +Evaluation programs can only be run in RTL simulation of a Snitch cluster using the default, SSSR-enhanced configuration `cfg/default.json`. 
For example, when building a QuestaSim RTL simulation setup from `target/snitch_cluster`: ``` -make CFG_OVERRIDE=cfg/sssr.hjson bin/snitch_cluster.vsim +make CFG_OVERRIDE=cfg/default.hjson bin/snitch_cluster.vsim ``` Then, the built evaluation programs can be run on this simulation setup as usual, for example: diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson index adfe7adf9e..2267b57525 100644 --- a/target/snitch_cluster/cfg/default.hjson +++ b/target/snitch_cluster/cfg/default.hjson @@ -16,6 +16,7 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, + user_width: 5, // clog2(total number of clusters) tcdm: { size: 128, banks: 32, @@ -24,14 +25,28 @@ zero_mem_size: 64, // kB alias_region_enable: true, dma_data_width: 512, - dma_axi_req_fifo_depth: 3, - dma_req_fifo_depth: 3, + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + dma_user_width: 1, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, // Timing parameters timing: { - lat_comp_fp32: 3, + lat_comp_fp32: 2, lat_comp_fp64: 3, - lat_comp_fp16: 2, - lat_comp_fp16_alt: 2, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, lat_comp_fp8: 1, lat_comp_fp8_alt: 1, lat_noncomp: 1, @@ -44,7 +59,10 @@ register_core_req: true, register_core_rsp: true, register_offload_req: true, - register_offload_rsp: true + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false }, hives: [ // Hive 0 @@ -94,6 +112,7 @@ xf8alt: true, xfdotp: true, xfvec: true, + ssr_nr_credits: 4, num_int_outstanding_loads: 1, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, @@ -101,8 +120,14 @@ num_sequencer_instructions: 16, num_dtlb_entries: 1, num_itlb_entries: 1, - // Enable 
division/square root unit - // Xdiv_sqrt: true, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], }, dma_core_template: { isa: "rv32imafd", diff --git a/target/snitch_cluster/cfg/sssr.hjson b/target/snitch_cluster/cfg/sssr.hjson deleted file mode 100644 index ee297960a9..0000000000 --- a/target/snitch_cluster/cfg/sssr.hjson +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2023 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Occamy-like Cluster configuration (+alias) for SSSR experiments -{ - nr_s1_quadrant: 1, - s1_quadrant: { - nr_clusters: 1, - }, - - cluster: { - boot_addr: 4096, // 0x1000 - cluster_base_addr: 268435456, // 0x1000_0000 - cluster_base_offset: 0, // 0x0 - cluster_base_hartid: 0, - addr_width: 48, - data_width: 64, - user_width: 5, // clog2(total number of clusters) - tcdm: { - size: 128, - banks: 32, - }, - cluster_periph_size: 64, // kB - zero_mem_size: 64, // kB - alias_region_enable: true, - dma_data_width: 512, - dma_axi_req_fifo_depth: 24, - dma_req_fifo_depth: 8, - narrow_trans: 4, - wide_trans: 32, - dma_user_width: 1, - // We don't need Snitch debugging in Occamy - enable_debug: false, - // We don't need Snitch (core-internal) virtual memory support - vm_support: false, - // Memory configuration inputs - sram_cfg_expose: true, - sram_cfg_fields: { - ema: 3, - emaw: 2, - emas: 1 - }, - // Timing parameters - timing: { - lat_comp_fp32: 2, - lat_comp_fp64: 3, - lat_comp_fp16: 1, - lat_comp_fp16_alt: 1, - lat_comp_fp8: 1, - lat_comp_fp8_alt: 1, - lat_noncomp: 1, - lat_conv: 2, - lat_sdotp: 3, - fpu_pipe_config: "BEFORE", - narrow_xbar_latency: "CUT_ALL_PORTS", - wide_xbar_latency: "CUT_ALL_PORTS", - // Isolate the core. 
- register_core_req: true, - register_core_rsp: true, - register_offload_req: true, - register_offload_rsp: true, - register_fpu_req: true, - register_ext_narrow: false, - register_ext_wide: false - }, - hives: [ - // Hive 0 - { - icache: { - size: 8, // total instruction cache size in kByte - sets: 2, // number of ways - cacheline: 256 // word size in bits - }, - cores: [ - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/dma_core_template" }, - ] - } - ] - }, - dram: { - // 0x8000_0000 - address: 2147483648, - // 0x8000_0000 - length: 2147483648 - }, - peripherals: { - clint: { - // 0xffff_0000 - address: 4294901760, - // 0x0000_1000 - length: 4096 - }, - }, - // Templates. - compute_core_template: { - isa: "rv32imafd", - xssr: true, - xfrep: true, - xdma: false, - xf16: true, - xf16alt: true, - xf8: true, - xf8alt: true, - xfdotp: true, - xfvec: true, - ssr_nr_credits: 4, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - // SSSR configuration below - ssr_intersection: true, - ssr_intersection_triple: [0, 1, 2], - ssrs: [ - {indirection: true}, // Master 0 - {indirection: true}, // Master 1 - {}, // Slave - ], - }, - dma_core_template: { - isa: "rv32imafd", - // Xdiv_sqrt: true, - // isa: "rv32ema", - xdma: true, - xssr: false, - xfrep: false, - xf16: false, - xf16alt: false, - xf8: false, - xf8alt: false, - xfdotp: false, - xfvec: false, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - } -}