Skip to content

Commit aa1af9e

Browse files
committed
treewide: Add narrow multicast support
1 parent 77bcb0c commit aa1af9e

File tree

16 files changed

+111
-5
lines changed

16 files changed

+111
-5
lines changed

hw/reqrsp_interface/include/reqrsp_interface/typedef.svh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
`define REQRSP_TYPEDEF_REQ_CHAN_T(__req_chan_t, __addr_t, __data_t, __strb_t) \
1212
typedef struct packed { \
1313
__addr_t addr; \
14+
__addr_t mask; \
1415
logic write; \
1516
reqrsp_pkg::amo_op_e amo; \
1617
__data_t data; \

hw/reqrsp_interface/src/axi_to_reqrsp.sv

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,8 @@ module axi_to_reqrsp #(
303303
// Silence those channels in case of a read.
304304
data: data & {DataWidth{meta.write}},
305305
strb: axi_req_i.w.strb & {StrbWidth{meta.write}},
306-
size: meta.size
306+
size: meta.size,
307+
default: '0
307308
};
308309

309310
always_comb begin

hw/snitch/src/riscv_instr.sv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,6 @@ package riscv_instr;
329329
localparam logic [31:0] DMREP = 32'b000011100000?????000000000101011;
330330
localparam logic [31:0] DMMCAST = 32'b000100000000?????000000000101011;
331331
localparam logic [31:0] FREP_O = 32'b????????????????????????10001011;
332-
localparam logic [31:0] FREP_I = 32'b????????????????????????00001011;
333332
localparam logic [31:0] IREP = 32'b?????????????????????????0111111;
334333
localparam logic [31:0] SCFGRI = 32'b????????????00000001?????0101011;
335334
localparam logic [31:0] SCFGWI = 32'b?????????????????010000000101011;
@@ -1141,6 +1140,7 @@ package riscv_instr;
11411140
localparam logic [11:0] CSR_FPMODE = 12'h7c1;
11421141
localparam logic [11:0] CSR_BARRIER = 12'h7c2;
11431142
localparam logic [11:0] CSR_SC = 12'h7c3;
1143+
localparam logic [11:0] CSR_MCAST = 12'h7c4;
11441144
localparam logic [11:0] CSR_HTIMEDELTAH = 12'h615;
11451145
localparam logic [11:0] CSR_CYCLEH = 12'hc80;
11461146
localparam logic [11:0] CSR_TIMEH = 12'hc81;

hw/snitch/src/snitch.sv

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
282282
logic [31:0] dpc_d, dpc_q;
283283
logic [31:0] dscratch_d, dscratch_q;
284284
logic debug_d, debug_q;
285+
286+
// Multicast mask
287+
logic [31:0] csr_mcast_d, csr_mcast_q;
285288

286289
`FFAR(scratch_q, scratch_d, '0, clk_i, rst_i)
287290
`FFAR(tvec_q, tvec_d, '0, clk_i, rst_i)
@@ -317,6 +320,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
317320
end
318321

319322
`FFAR(csr_stall_q, csr_stall_d, '0, clk_i, rst_i)
323+
`FFAR(csr_mcast_q, csr_mcast_d, '0, clk_i, rst_i)
320324

321325
typedef struct packed {
322326
fpnew_pkg::fmt_mode_t fmode;
@@ -2353,6 +2357,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
23532357
dcsr_d = dcsr_q;
23542358
dpc_d = dpc_q;
23552359
dscratch_d = dscratch_q;
2360+
csr_mcast_d = csr_mcast_q;
23562361

23572362
csr_stall_d = csr_stall_q;
23582363

@@ -2578,6 +2583,11 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
25782583
barrier_o = 1'b1;
25792584
csr_stall_d = 1'b1;
25802585
end
2586+
// Multicast mask
2587+
CSR_MCAST: begin
2588+
csr_rvalue = csr_mcast_q;
2589+
csr_mcast_d = alu_result[31:0];
2590+
end
25812591
default: begin
25822592
csr_rvalue = '0;
25832593
csr_dump = 1'b1;
@@ -2898,6 +2908,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
28982908
.lsu_qsize_i (ls_size),
28992909
.lsu_qamo_i (ls_amo),
29002910
.lsu_qrepd_i (1'b0),
2911+
.lsu_qmcast_i (addr_t'(csr_mcast_q)),
29012912
.lsu_qvalid_i (lsu_qvalid),
29022913
.lsu_qready_o (lsu_qready),
29032914
.lsu_pdata_o (ld_result),

hw/snitch/src/snitch_lsu.sv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ module snitch_lsu #(
5252
input logic [1:0] lsu_qsize_i,
5353
input reqrsp_pkg::amo_op_e lsu_qamo_i,
5454
input logic lsu_qrepd_i, // Whether this is a sequencer repetition
55+
input addr_t lsu_qmcast_i, // Multicast mask
5556
input logic lsu_qvalid_i,
5657
output logic lsu_qready_o,
5758
// response channel
@@ -250,6 +251,7 @@ module snitch_lsu #(
250251
assign data_req_o.q_valid = lsu_postcaq_qvalid & (lsu_qwrite_i | ~laq_full) & ~mem_full;
251252
assign data_req_o.q.write = lsu_qwrite_i;
252253
assign data_req_o.q.addr = lsu_qaddr_i;
254+
assign data_req_o.q.mask = lsu_qmcast_i;
253255
assign data_req_o.q.amo = lsu_qamo_i;
254256
assign data_req_o.q.size = lsu_qsize_i;
255257

hw/snitch_cluster/src/snitch_cluster.sv

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ module snitch_cluster
3838
parameter int unsigned NarrowUserWidth = 1,
3939
/// AXI: dma user width.
4040
parameter int unsigned WideUserWidth = 1,
41+
/// Width of the atomic ID to be used in a system.
42+
parameter int unsigned AtomicIdWidth = 1,
4143
/// Boot Address from which to fetch the first instructions.
4244
/// Used if `AliasRegionEnable` or `IntBootromEnable` is not set.
4345
parameter logic [31:0] BootAddr = 32'h0,
@@ -1149,7 +1151,7 @@ module snitch_cluster
11491151
user_t cluster_user;
11501152
// Atomic ID, needs to be unique ID of cluster
11511153
// cluster_id + HartIdOffset + 1 (because 0 is for non-atomic masters)
1152-
assign cluster_user = (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1;
1154+
assign cluster_user = (core_to_axi_req.q.mask << AtomicIdWidth) | ((hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1);
11531155

11541156
reqrsp_mux #(
11551157
.NrPorts (NrCores),

hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ package ${cfg['cluster']['name']}_pkg;
5858

5959
localparam int unsigned NarrowUserWidth = ${cfg['cluster']['user_width']};
6060
localparam int unsigned WideUserWidth = ${cfg['cluster']['dma_user_width']};
61+
localparam int unsigned AtomicIdWidth = ${cfg['cluster']['atomic_id_width']};
6162

6263
localparam int unsigned ICacheLineWidth [NrHives] = '{${icache_cfg('cacheline')}};
6364
localparam int unsigned ICacheLineCount [NrHives] = '{${icache_cfg('depth')}};

hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ module ${cfg['cluster']['name']}_wrapper (
6868
.WideIdWidthIn (${cfg['cluster']['name']}_pkg::WideIdWidthIn),
6969
.NarrowUserWidth (${cfg['cluster']['name']}_pkg::NarrowUserWidth),
7070
.WideUserWidth (${cfg['cluster']['name']}_pkg::WideUserWidth),
71+
.AtomicIdWidth (${cfg['cluster']['name']}_pkg::AtomicIdWidth),
7172
.BootAddr (${to_sv_hex(cfg['cluster']['boot_addr'], 32)}),
7273
.IntBootromEnable (${int(cfg['cluster']['int_bootrom_enable'])}),
7374
.narrow_in_req_t (${cfg['cluster']['name']}_pkg::narrow_in_req_t),

hw/snitch_cluster/src/snitch_fp_ss.sv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2709,6 +2709,7 @@ module snitch_fp_ss import snitch_pkg::*; #(
27092709
.lsu_qsize_i (ls_size),
27102710
.lsu_qamo_i (reqrsp_pkg::AMONone),
27112711
.lsu_qrepd_i (acc_req_repd_q),
2712+
.lsu_qmcast_i ('0),
27122713
.lsu_qvalid_i (lsu_qvalid),
27132714
.lsu_qready_o (lsu_qready),
27142715
.lsu_pdata_o (ld_result),

sw/snRuntime/api/sync_decls.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,7 @@ inline void snrt_global_barrier();
3030
inline uint32_t snrt_global_all_to_all_reduction(uint32_t value);
3131

3232
inline void snrt_wait_writeback(uint32_t val);
33+
34+
inline void snrt_enable_multicast(uint32_t mask);
35+
36+
inline void snrt_disable_multicast();

sw/snRuntime/src/dma.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ inline uint32_t snrt_dma_start_1d_mcast_wideptr(uint64_t dst, uint64_t src,
9898
10, OP_CUSTOM1)),
9999
"r"(reg_size));
100100

101+
// Reset dmmcast or next transfers will inherit this setting
102+
asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMMCAST_FUNCT7, 0b00000, 0,
103+
XDMA_FUNCT3, 0, OP_CUSTOM1)),
104+
"r"(reg_mcast));
105+
101106
return reg_txid;
102107
}
103108

@@ -472,6 +477,22 @@ inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src,
472477
return snrt_dma_start_1d(dst, src + tile_idx * tile_nbytes, tile_nbytes);
473478
}
474479

480+
/**
481+
* @brief Load a tile of a 1D array using a multicast DMA transfer.
482+
* @param dst Pointer to the tile destination.
483+
* @param src Pointer to the source array.
484+
* @param tile_idx Index of the tile in the 1D array.
485+
* @param tile_size Number of elements within a tile of the 1D array.
486+
* @param prec Number of bytes of each element in the 1D array.
487+
* @param mcast Multicast mask applied on the destination address.
488+
*/
489+
inline snrt_dma_txid_t snrt_dma_mcast_load_1d_tile(void *dst, void *src,
490+
size_t tile_idx, size_t tile_size,
491+
uint32_t prec, uint32_t mcast) {
492+
size_t tile_nbytes = tile_size * prec;
493+
return snrt_dma_start_1d_mcast(dst, src + tile_idx * tile_nbytes, tile_nbytes, mcast);
494+
}
495+
475496
/**
476497
* @brief Store a tile to a 1D array.
477498
* @param dst Pointer to the destination array.

sw/snRuntime/src/sync.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,7 @@ extern void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
3434
extern uint32_t snrt_global_all_to_all_reduction(uint32_t value);
3535

3636
extern void snrt_wait_writeback(uint32_t val);
37+
38+
extern void snrt_enable_multicast(uint32_t mask);
39+
40+
extern void snrt_disable_multicast();

sw/snRuntime/src/sync.h

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
#pragma once
1414

15+
#include "../../deps/riscv-opcodes/encoding.h"
16+
1517
#include <math.h>
1618

1719
//================================================================================
@@ -89,6 +91,35 @@ inline void snrt_cluster_hw_barrier() {
8991
* will stall indefinitely.
9092
*/
9193
inline void snrt_inter_cluster_barrier() {
94+
#ifdef SUPPORTS_MULTICAST
95+
// Everyone increments a shared counter
96+
uint32_t cnt =
97+
__atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
98+
99+
// All but the last cluster enter WFI, while the last cluster resets the
100+
// counter for the next barrier and multicasts an interrupt to wake up the
101+
// other clusters.
102+
if (cnt == snrt_cluster_num()) {
103+
_snrt_barrier.cnt = 0;
104+
105+
// Multicast cluster interrupt to every other cluster's core
106+
// Note: we need to address another cluster's address space
107+
// because the cluster XBAR has not been extended to support
108+
// multicast yet. We address the second cluster, if we are the
109+
// first cluster, and the first cluster otherwise.
110+
uintptr_t addr = (uintptr_t)snrt_cluster_clint_set_ptr() - SNRT_CLUSTER_OFFSET * snrt_cluster_idx();
111+
if (snrt_cluster_idx() == 0) addr += SNRT_CLUSTER_OFFSET;
112+
snrt_enable_multicast(BCAST_MASK_ALL);
113+
*((uint32_t *)addr) = 1 << snrt_cluster_core_idx();
114+
snrt_disable_multicast();
115+
// Clear interrupt for next barrier
116+
snrt_int_clr_mcip();
117+
} else {
118+
snrt_wfi();
119+
// Clear interrupt for next barrier
120+
snrt_int_clr_mcip();
121+
}
122+
#else
92123
// Remember previous iteration
93124
uint32_t prev_barrier_iteration = _snrt_barrier.iteration;
94125
uint32_t cnt =
@@ -102,6 +133,7 @@ inline void snrt_inter_cluster_barrier() {
102133
while (prev_barrier_iteration == _snrt_barrier.iteration)
103134
;
104135
}
136+
#endif
105137
}
106138

107139
/**
@@ -119,6 +151,7 @@ inline void snrt_global_barrier() {
119151
// Synchronize all DM cores in software
120152
if (snrt_is_dm_core()) {
121153
snrt_inter_cluster_barrier();
154+
122155
}
123156
// Synchronize cores in a cluster with the HW barrier
124157
snrt_cluster_hw_barrier();
@@ -264,3 +297,21 @@ inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
264297
inline void snrt_wait_writeback(uint32_t val) {
265298
asm volatile("mv %0, %0" : "+r"(val)::);
266299
}
300+
301+
//================================================================================
302+
// Multicast functions
303+
//================================================================================
304+
305+
/**
306+
* @brief Enable LSU multicast
307+
* @details All stores performed after this call will be multicast to all
308+
* addresses specified by the address and mask pair.
309+
*
310+
* @param mask Multicast mask value
311+
*/
312+
inline void snrt_enable_multicast(uint32_t mask) {
    // 0x7c4 is CSR_MCAST, the multicast mask CSR read by the core's LSU.
    // (0x7c3 is CSR_SC, a different register.)
    write_csr(0x7c4, mask);
}

/**
 * @brief Disable LSU multicast
 * @details Writes zero to the multicast mask CSR.
 */
inline void snrt_disable_multicast() {
    // Must target the same CSR as snrt_enable_multicast (CSR_MCAST, 0x7c4),
    // otherwise a previously written mask would stay in effect.
    write_csr(0x7c4, 0);
}

target/snitch_cluster/cfg/mcast.hjson

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
cluster_base_hartid: 0,
1111
addr_width: 48,
1212
data_width: 64,
13-
user_width: 5, // clog2(total number of clusters)
13+
atomic_id_width: 5, // clog2(total number of clusters)
14+
user_width: 53, // addr_width + atomic_id_width
1415
tcdm: {
1516
size: 128,
1617
banks: 32,

util/clustergen/schema/snitch_cluster.schema.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,11 @@
153153
"description": "User width of the wide AXI plug into the cluster.",
154154
"default": 1
155155
},
156+
"atomic_id_width": {
157+
"type": "number",
158+
"description": "Width of the cluster's atomics ID.",
159+
"default": 1
160+
},
156161
"enable_multicast": {
157162
"type": "boolean",
158163
"description": "Whether to enable the mutlicast feature into the cluster",

0 commit comments

Comments
 (0)