From 4df7f8213f60887737b015c59e2ca1066ee75ed9 Mon Sep 17 00:00:00 2001 From: Thomas Benz Date: Fri, 6 Dec 2024 17:52:19 +0100 Subject: [PATCH] idma: Streamline TCDM connection, enable multi-channel operation --------- Co-authored-by: Lucia Luzi --- Bender.lock | 4 +- Bender.yml | 3 +- hw/snitch/src/riscv_instr.sv | 1 + hw/snitch/src/snitch.sv | 3 +- hw/snitch/src/snitch_pkg.sv | 16 +- hw/snitch_cluster/src/snitch_cc.sv | 34 ++++- hw/snitch_cluster/src/snitch_cluster.sv | 192 +++++++++++++++++------- hw/tcdm_interface/src/obi_to_tcdm.sv | 56 +++++++ sw/deps/riscv-opcodes | 2 +- sw/snRuntime/src/dma.h | 62 +++++++- sw/snRuntime/src/start.c | 11 +- sw/tests/dma_dminit.c | 44 ++++++ util/trace/gen_trace.py | 2 +- 13 files changed, 353 insertions(+), 77 deletions(-) create mode 100644 hw/tcdm_interface/src/obi_to_tcdm.sv create mode 100644 sw/tests/dma_dminit.c diff --git a/Bender.lock b/Bender.lock index 767d2c991..1cbc6f0a8 100644 --- a/Bender.lock +++ b/Bender.lock @@ -71,8 +71,8 @@ packages: dependencies: - common_cells idma: - revision: ff5d56fffb3767814db88d6bf8f381974ea33aa5 - version: 0.6.4 + revision: d7ad14b31e82e50c2973a70400ccc460f3f617f4 + version: null source: Git: https://github.com/pulp-platform/iDMA dependencies: diff --git a/Bender.yml b/Bender.yml index 75537b8aa..5e3f422d8 100644 --- a/Bender.yml +++ b/Bender.yml @@ -27,7 +27,7 @@ dependencies: tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.13 } riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } cluster_icache: { git: https://github.com/pulp-platform/cluster_icache.git, rev: 64e21ae455bbdde850c4df13bef86ea55ac42537 } - idma: { git: https://github.com/pulp-platform/iDMA, version: 0.6.4 } + idma: { git: https://github.com/pulp-platform/iDMA, rev: d7ad14b31e82e50c2973a70400ccc460f3f617f4 } export_include_dirs: - hw/reqrsp_interface/include @@ -79,6 +79,7 @@ sources: - hw/tcdm_interface/src/tcdm_interface.sv # Level 1 - 
hw/tcdm_interface/src/axi_to_tcdm.sv + - hw/tcdm_interface/src/obi_to_tcdm.sv - hw/tcdm_interface/src/reqrsp_to_tcdm.sv - hw/tcdm_interface/src/tcdm_mux.sv - target: simulation diff --git a/hw/snitch/src/riscv_instr.sv b/hw/snitch/src/riscv_instr.sv index 02d0d7564..6e2928c87 100644 --- a/hw/snitch/src/riscv_instr.sv +++ b/hw/snitch/src/riscv_instr.sv @@ -327,6 +327,7 @@ package riscv_instr; localparam logic [31:0] DMSTAT = 32'b0000101?????00000000?????0101011; localparam logic [31:0] DMSTR = 32'b0000110??????????000000000101011; localparam logic [31:0] DMREP = 32'b000011100000?????000000000101011; + localparam logic [31:0] DMINIT = 32'b0001000??????????000?????0101011; localparam logic [31:0] FREP_O = 32'b????????????????????????10001011; localparam logic [31:0] FREP_I = 32'b????????????????????????00001011; localparam logic [31:0] IREP = 32'b?????????????????????????0111111; diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv index f0e099f43..531f84419 100644 --- a/hw/snitch/src/snitch.sv +++ b/hw/snitch/src/snitch.sv @@ -2162,7 +2162,8 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( illegal_inst = 1'b1; end end - DMCPYI: begin + DMCPYI, + DMINIT: begin if (Xdma) begin acc_qreq_o.addr = DMA_SS; opa_select = Reg; diff --git a/hw/snitch/src/snitch_pkg.sv b/hw/snitch/src/snitch_pkg.sv index 6918a0a77..677a25c7d 100644 --- a/hw/snitch/src/snitch_pkg.sv +++ b/hw/snitch/src/snitch_pkg.sv @@ -137,16 +137,18 @@ package snitch_pkg; // Slaves on Cluster DMA AXI Bus typedef enum int unsigned { - TCDMDMA = 0, - SoCDMAOut = 1, - ZeroMemory = 2, - BootRom = 3 + SoCDMAOut = 0, + BootRom = 1 } cluster_slave_dma_e; + typedef enum logic { + TCDMDMA = 0, + ToSoC = 1 + } dma_e; + typedef enum int unsigned { - SoCDMAIn = 32'd0, - SDMAMst = 32'd1, - ICache = 32'd2 + SDMAMst = 32'd0, + ICache = 32'd1 } cluster_master_dma_e; /// Possible interconnect implementations. 
diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index bdc118ceb..75112684b 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -40,6 +40,14 @@ module snitch_cc #( parameter type axi_aw_chan_t = logic, parameter type axi_req_t = logic, parameter type axi_rsp_t = logic, + parameter type init_req_chan_t = logic, + parameter type init_rsp_chan_t = logic, + parameter type init_req_t = logic, + parameter type init_rsp_t = logic, + parameter type obi_a_chan_t = logic, + parameter type obi_r_chan_t = logic, + parameter type obi_req_t = logic, + parameter type obi_rsp_t = logic, parameter type hive_req_t = logic, parameter type hive_rsp_t = logic, parameter type acc_req_t = logic, @@ -115,7 +123,8 @@ module snitch_cc #( /// Derived parameter *Do not override* parameter int unsigned TCDMPorts = (NumSsrs > 1 ? NumSsrs : 1), parameter type addr_t = logic [AddrWidth-1:0], - parameter type data_t = logic [DataWidth-1:0] + parameter type data_t = logic [DataWidth-1:0], + parameter type addr_rule_t = axi_pkg::xbar_rule_64_t ) ( input logic clk_i, input logic clk_d2_i, @@ -136,6 +145,8 @@ module snitch_cc #( // DMA ports output axi_req_t [DMANumChannels-1:0] axi_dma_req_o, input axi_rsp_t [DMANumChannels-1:0] axi_dma_res_i, + output obi_req_t [DMANumChannels-1:0] obi_dma_req_o, + input obi_rsp_t [DMANumChannels-1:0] obi_dma_res_i, output logic [DMANumChannels-1:0] axi_dma_busy_o, output dma_events_t [DMANumChannels-1:0] axi_dma_events_o, // Core event strobes @@ -143,7 +154,9 @@ module snitch_cc #( input addr_t tcdm_addr_base_i, // Cluster HW barrier output logic barrier_o, - input logic barrier_i + input logic barrier_i, + // address decode map + input addr_rule_t [TCDMAliasEnable:0] dma_addr_rule_i ); // FMA architecture is "merged" -> mulexp and macexp instructions are supported @@ -390,20 +403,32 @@ module snitch_cc #( .NumAxInFlight (DMANumAxInFlight), .DMAReqFifoDepth (DMAReqFifoDepth), .NumChannels 
(DMANumChannels), + .TCDMAliasEnable (TCDMAliasEnable), .DMATracing (1), .axi_ar_chan_t (axi_ar_chan_t), .axi_aw_chan_t (axi_aw_chan_t), .axi_req_t (axi_req_t), .axi_res_t (axi_rsp_t), + .init_req_chan_t (init_req_chan_t), + .init_rsp_chan_t (init_rsp_chan_t), + .init_req_t (init_req_t), + .init_rsp_t (init_rsp_t), + .obi_a_chan_t (obi_a_chan_t), + .obi_r_chan_t (obi_r_chan_t), + .obi_req_t (obi_req_t), + .obi_res_t (obi_rsp_t), .acc_req_t (acc_req_t), .acc_res_t (acc_resp_t), - .dma_events_t (dma_events_t) + .dma_events_t (dma_events_t), + .addr_rule_t (addr_rule_t) ) i_idma_inst64_top ( .clk_i, .rst_ni, .testmode_i ( 1'b0 ), .axi_req_o ( axi_dma_req_o ), .axi_res_i ( axi_dma_res_i ), + .obi_req_o ( obi_dma_req_o ), + .obi_res_i ( obi_dma_res_i ), .busy_o ( axi_dma_busy_o ), .acc_req_i ( acc_snitch_req ), .acc_req_valid_i ( dma_qvalid ), @@ -412,7 +437,8 @@ module snitch_cc #( .acc_res_valid_o ( dma_pvalid ), .acc_res_ready_i ( dma_pready ), .hart_id_i ( hart_id_i ), - .events_o ( axi_dma_events_o ) + .events_o ( axi_dma_events_o ), + .addr_map_i ( dma_addr_rule_i ) ); // no DMA instanciated diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 1222bb4a1..150ba0b32 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -18,6 +18,8 @@ `include "snitch_vm/typedef.svh" +`include "obi/typedef.svh" + /// Snitch many-core cluster with improved TCDM interconnect. /// Snitch Cluster Top-Level. module snitch_cluster @@ -285,13 +287,14 @@ module snitch_cluster localparam int unsigned NrRules = (1 + AliasRegionEnable) * NrRuleIdcs; // DMA X-BAR configuration - // SoC in Request, DMA Channels, `n` instruction caches. - localparam int unsigned NrWideMasters = 1 + DMANumChannels + NrHives; + // DMA Channels, `n` instruction caches. 
+ localparam int unsigned NrWideMasters = DMANumChannels + NrHives; localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn; - // TCDM, SoC out, ZeroMemory, (Bootrom) - localparam int unsigned NrWideSlaves = 3 + IntBootromEnable; + // SoC out, (Bootrom) + localparam int unsigned NrWideSlaves = 1 + IntBootromEnable; localparam int unsigned NrWideRuleIdcs = NrWideSlaves - 1; - localparam int unsigned NrWideRules = (1 + AliasRegionEnable) * NrWideRuleIdcs; + localparam int unsigned NrWideRules = (NrWideRuleIdcs > 0) ? + ((1 + AliasRegionEnable) * NrWideRuleIdcs) : (1 + AliasRegionEnable); // AXI Configuration localparam axi_pkg::xbar_cfg_t ClusterXbarCfg = '{ @@ -377,11 +380,45 @@ module snitch_cluster `MEM_TYPEDEF_ALL(mem_dma, tcdm_mem_addr_t, data_dma_t, strb_dma_t, logic) `TCDM_TYPEDEF_ALL(tcdm, tcdm_addr_t, data_t, strb_t, tcdm_user_t) - `TCDM_TYPEDEF_ALL(tcdm_dma, tcdm_addr_t, data_dma_t, strb_dma_t, logic) + `TCDM_TYPEDEF_ALL(tcdm_dma, addr_t, data_dma_t, strb_dma_t, logic) `REG_BUS_TYPEDEF_REQ(reg_req_t, addr_t, data_t, strb_t) `REG_BUS_TYPEDEF_RSP(reg_rsp_t, data_t) + // Memory Init typedefs + typedef struct packed { + logic [PhysicalAddrWidth-1:0] cfg; + logic [WideDataWidth-1:0] term; + logic [WideDataWidth/8-1:0] strb; + logic [WideIdWidthOut-1:0] id; + } init_req_chan_t; + + typedef struct packed { + init_req_chan_t req_chan; + logic req_valid; + logic rsp_ready; + } init_req_t; + + typedef struct packed { + logic [WideDataWidth-1:0] init; + } init_rsp_chan_t; + + typedef struct packed { + init_rsp_chan_t rsp_chan; + logic rsp_valid; + logic req_ready; + } init_rsp_t; + + // OBI typedefs + `OBI_TYPEDEF_MINIMAL_A_OPTIONAL(a_opt_t) + `OBI_TYPEDEF_MINIMAL_R_OPTIONAL(r_opt_t) + + `OBI_TYPEDEF_TYPE_A_CHAN_T(obi_a_chan_t, addr_t, data_dma_t, strb_dma_t, id_dma_mst_t, a_opt_t) + `OBI_TYPEDEF_TYPE_R_CHAN_T(obi_r_chan_t, data_dma_t, id_dma_slv_t, r_opt_t) + + `OBI_TYPEDEF_REQ_T(obi_dma_req_t, obi_a_chan_t) + `OBI_TYPEDEF_RSP_T(obi_dma_rsp_t, 
obi_r_chan_t) + // Event counter increments for the TCDM. typedef struct packed { /// Number requests going in @@ -502,6 +539,9 @@ module snitch_cluster axi_mst_req_t [NrNarrowMasters-1:0] narrow_axi_mst_req; axi_mst_resp_t [NrNarrowMasters-1:0] narrow_axi_mst_rsp; + axi_mst_dma_req_t soc_in_axi_req; + axi_mst_dma_resp_t soc_in_axi_rsp; + // DMA AXI buses axi_mst_dma_req_t [NrWideMasters-1:0] wide_axi_mst_req; axi_mst_dma_resp_t [NrWideMasters-1:0] wide_axi_mst_rsp; @@ -531,6 +571,9 @@ module snitch_cluster dma_events_t [DMANumChannels-1:0] dma_events; icache_l0_events_t [NrCores-1:0] icache_events; + tcdm_dma_req_t [DMANumChannels-1:0] tcdm_dma_req; + tcdm_dma_rsp_t [DMANumChannels-1:0] tcdm_dma_rsp; + // 4. Memory Subsystem (Core side). reqrsp_req_t [NrCores-1:0] core_req; reqrsp_rsp_t [NrCores-1:0] core_rsp; @@ -547,6 +590,11 @@ module snitch_cluster logic [NrCores-1:0] barrier_in; logic barrier_out; + // OBI + obi_dma_req_t [NrCores-1:0][DMANumChannels-1:0] obi_dma_req; + obi_dma_rsp_t [NrCores-1:0][DMANumChannels-1:0] obi_dma_res; + + // ------------- // DMA Subsystem // ------------- @@ -583,38 +631,47 @@ module snitch_cluster .rst_ni (rst_ni), .slv_req_i (wide_in_req_i), .slv_resp_o (wide_in_resp_o), - .mst_req_o (wide_axi_mst_req[SoCDMAIn]), - .mst_resp_i (wide_axi_mst_rsp[SoCDMAIn]) + .mst_req_o (soc_in_axi_req), + .mst_resp_i (soc_in_axi_rsp) ); logic [DmaXbarCfg.NoSlvPorts-1:0][$clog2(DmaXbarCfg.NoMstPorts)-1:0] dma_xbar_default_port; assign dma_xbar_default_port = '{default: SoCDMAOut}; - xbar_rule_t [5:0] dma_xbar_rules; + xbar_rule_t [1:0] dma_xbar_rules; xbar_rule_t [DmaXbarCfg.NoAddrRules-1:0] enabled_dma_xbar_rule; assign dma_xbar_rules = '{ '{idx: BootRom, start_addr: BootRomAliasStart, end_addr: BootRomAliasEnd}, - '{idx: ZeroMemory, start_addr: ZeroMemAliasStart, end_addr: ZeroMemAliasEnd}, + '{idx: BootRom, start_addr: bootrom_start_address, end_addr: bootrom_end_address} + }; + + always_comb begin + automatic int unsigned i = 0; + if 
(IntBootromEnable) enabled_dma_xbar_rule[i] = dma_xbar_rules[0]; i++; // Bootrom + if (AliasRegionEnable) begin + if (IntBootromEnable) enabled_dma_xbar_rule[i] = dma_xbar_rules[1]; // Bootrom Alias + end + end + + // dma address rules + xbar_rule_t [1:0] dma_addr_rule; + xbar_rule_t [AliasRegionEnable:0] enabled_dma_addr_rule; + assign dma_addr_rule = '{ '{idx: TCDMDMA, start_addr: TCDMAliasStart, end_addr: TCDMAliasEnd}, - '{idx: BootRom, start_addr: bootrom_start_address, end_addr: bootrom_end_address}, - '{idx: ZeroMemory, start_addr: zero_mem_start_address, end_addr: zero_mem_end_address}, '{idx: TCDMDMA, start_addr: tcdm_start_address, end_addr: tcdm_end_address} + }; always_comb begin automatic int unsigned i = 0; - enabled_dma_xbar_rule[i] = dma_xbar_rules[0]; i++; // TCDM - enabled_dma_xbar_rule[i] = dma_xbar_rules[1]; i++; // ZeroMemory - if (IntBootromEnable) enabled_dma_xbar_rule[i] = dma_xbar_rules[2]; i++; // Bootrom + enabled_dma_addr_rule[i] = dma_addr_rule[0]; i++; // TCDM if (AliasRegionEnable) begin - enabled_dma_xbar_rule[i] = dma_xbar_rules[3]; i++; // TCDM Alias - enabled_dma_xbar_rule[i] = dma_xbar_rules[4]; i++; // ZeroMemory Alias - if (IntBootromEnable) enabled_dma_xbar_rule[i] = dma_xbar_rules[5]; // Bootrom Alias + enabled_dma_addr_rule[i] = dma_addr_rule[1]; i++; // TCDM Alias end end - localparam bit [DmaXbarCfg.NoSlvPorts-1:0] DMAEnableDefaultMstPort = '1; + axi_xbar #( .Cfg (DmaXbarCfg), .ATOPs (0), @@ -641,43 +698,27 @@ module snitch_cluster .mst_ports_req_o (wide_axi_slv_req), .mst_ports_resp_i (wide_axi_slv_rsp), .addr_map_i (enabled_dma_xbar_rule), - .en_default_mst_port_i (DMAEnableDefaultMstPort), + .en_default_mst_port_i ('1), .default_mst_port_i (dma_xbar_default_port) ); - axi_zero_mem #( - .axi_req_t (axi_slv_dma_req_t), - .axi_resp_t (axi_slv_dma_resp_t), - .AddrWidth (PhysicalAddrWidth), - .DataWidth (WideDataWidth), - .IdWidth (WideIdWidthOut), - .NumBanks (1), - .BufDepth (1) - ) i_axi_zeromem ( - .clk_i, - .rst_ni, - 
.busy_o (), - .axi_req_i (wide_axi_slv_req[ZeroMemory]), - .axi_resp_o (wide_axi_slv_rsp[ZeroMemory]) - ); addr_t ext_dma_req_q_addr_nontrunc; axi_to_mem_interleaved #( - .axi_req_t (axi_slv_dma_req_t), - .axi_resp_t (axi_slv_dma_resp_t), + .axi_req_t (axi_mst_dma_req_t), + .axi_resp_t (axi_mst_dma_resp_t), .AddrWidth (PhysicalAddrWidth), .DataWidth (WideDataWidth), .IdWidth (WideIdWidthOut), .NumBanks (1), .BufDepth (MemoryMacroLatency + 1) - ) i_axi_to_mem_dma ( + ) i_axi_to_mem_soc_in ( .clk_i, .rst_ni, .busy_o (), - .test_i (1'b0), - .axi_req_i (wide_axi_slv_req[TCDMDMA]), - .axi_resp_o (wide_axi_slv_rsp[TCDMDMA]), + .axi_req_i ( soc_in_axi_req ), + .axi_resp_o ( soc_in_axi_rsp ), .mem_req_o (ext_dma_req.q_valid), .mem_gnt_i (ext_dma_rsp.q_ready), .mem_addr_o (ext_dma_req_q_addr_nontrunc), @@ -689,26 +730,59 @@ module snitch_cluster .mem_rdata_i (ext_dma_rsp.p.data) ); + + // ------------ + // TCDM Arbiter + // ------------ + for (genvar i = 0; i < NrCores; i++) begin : gen_core_obi_to_tcdm + // This currently assumes only one DMA core is present in the system. However this limitation + // could easily be overcome by adapting the number of inputs to the i_dma_interconnect + // according to the number of DMA cores present. 
+ if (Xdma[i]) begin : gen_dma_obi_to_tcdm + obi_to_tcdm #( + .obi_req_t (obi_dma_req_t), + .obi_rsp_t (obi_dma_rsp_t), + .tcdm_req_t (tcdm_dma_req_t), + .tcdm_rsp_t (tcdm_dma_rsp_t), + .AddrWidth (PhysicalAddrWidth), + .DataWidth (WideDataWidth), + .IdWidth (WideIdWidthOut), + .BufDepth (MemoryMacroLatency + 1), + .NumChannels (DMANumChannels) + ) i_obi_to_tcdm ( + .clk_i, + .rst_ni, + .obi_req_i (obi_dma_req[i]), + .obi_rsp_o (obi_dma_res[i]), + .tcdm_req_o (tcdm_dma_req), + .tcdm_rsp_i (tcdm_dma_rsp) + ); + end else begin : gen_dma_obi_to_tcdm_stub + assign obi_dma_res[i] = '0; + end + end + assign ext_dma_req.q.addr = tcdm_addr_t'(ext_dma_req_q_addr_nontrunc); assign ext_dma_req.q.amo = reqrsp_pkg::AMONone; assign ext_dma_req.q.user = '0; + localparam int unsigned NumDMAIcoInputs = DMANumChannels + 1; snitch_tcdm_interconnect #( - .NumInp (1), - .NumOut (NrSuperBanks), - .tcdm_req_t (tcdm_dma_req_t), - .tcdm_rsp_t (tcdm_dma_rsp_t), - .mem_req_t (mem_dma_req_t), - .mem_rsp_t (mem_dma_rsp_t), - .user_t (logic), + .NumInp (NumDMAIcoInputs), + .NumOut (NrSuperBanks), + .tcdm_req_t (tcdm_dma_req_t), + .tcdm_rsp_t (tcdm_dma_rsp_t), + .mem_req_t (mem_dma_req_t), + .mem_rsp_t (mem_dma_rsp_t), + .user_t (logic), .MemAddrWidth (TCDMMemAddrWidth), - .DataWidth (WideDataWidth), + .DataWidth (WideDataWidth), .MemoryResponseLatency (MemoryMacroLatency) ) i_dma_interconnect ( .clk_i, .rst_ni, - .req_i (ext_dma_req), - .rsp_o (ext_dma_rsp), + .req_i ({ext_dma_req, tcdm_dma_req}), + .rsp_o ({ext_dma_rsp, tcdm_dma_rsp}), .mem_req_o (sb_dma_req), .mem_rsp_i (sb_dma_rsp) ); @@ -891,6 +965,14 @@ module snitch_cluster .axi_aw_chan_t (axi_mst_dma_aw_chan_t), .axi_req_t (axi_mst_dma_req_t), .axi_rsp_t (axi_mst_dma_resp_t), + .init_req_chan_t (init_req_chan_t), + .init_rsp_chan_t (init_rsp_chan_t), + .init_req_t (init_req_t), + .init_rsp_t (init_rsp_t), + .obi_a_chan_t (obi_a_chan_t), + .obi_r_chan_t (obi_r_chan_t), + .obi_req_t (obi_dma_req_t), + .obi_rsp_t (obi_dma_rsp_t), 
.hive_req_t (hive_req_t), .hive_rsp_t (hive_rsp_t), .acc_req_t (acc_req_t), @@ -939,7 +1021,8 @@ module snitch_cluster .CaqTagWidth (CaqTagWidth), .DebugSupport (DebugSupport), .TCDMAliasEnable (AliasRegionEnable), - .TCDMAliasStart (TCDMAliasStart) + .TCDMAliasStart (TCDMAliasStart), + .addr_rule_t (xbar_rule_t) ) i_snitch_cc ( .clk_i, .clk_d2_i (clk_d2), @@ -956,12 +1039,15 @@ module snitch_cluster .tcdm_rsp_i (tcdm_rsp[TcdmPortsOffs+:TcdmPorts]), .axi_dma_req_o (axi_dma_req), .axi_dma_res_i (axi_dma_res), + .obi_dma_req_o (obi_dma_req[i]), + .obi_dma_res_i (obi_dma_res[i]), .axi_dma_busy_o (), .axi_dma_events_o (dma_core_events), .core_events_o (core_events[i]), .tcdm_addr_base_i (tcdm_start_address), .barrier_o (barrier_in[i]), - .barrier_i (barrier_out) + .barrier_i (barrier_out), + .dma_addr_rule_i (enabled_dma_addr_rule) ); for (genvar j = 0; j < TcdmPorts; j++) begin : gen_tcdm_user always_comb begin diff --git a/hw/tcdm_interface/src/obi_to_tcdm.sv b/hw/tcdm_interface/src/obi_to_tcdm.sv new file mode 100644 index 000000000..cc8044550 --- /dev/null +++ b/hw/tcdm_interface/src/obi_to_tcdm.sv @@ -0,0 +1,56 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Author: Lucia Luzi + +`include "reqrsp_interface/typedef.svh" + +/// Convert OBI to TCDM protocol. 
+module obi_to_tcdm #( + parameter type obi_req_t = logic, + parameter type obi_rsp_t = logic, + parameter type tcdm_req_t = logic, + parameter type tcdm_rsp_t = logic, + parameter int unsigned AddrWidth = 0, + parameter int unsigned DataWidth = 0, + parameter int unsigned IdWidth = 0, + parameter int unsigned BufDepth = 1, + parameter int unsigned NumChannels = 1 +) ( + input logic clk_i, + input logic rst_ni, + input obi_req_t [NumChannels-1:0] obi_req_i, + output obi_rsp_t [NumChannels-1:0] obi_rsp_o, + output tcdm_req_t [NumChannels-1:0] tcdm_req_o, + input tcdm_rsp_t [NumChannels-1:0] tcdm_rsp_i +); + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [DataWidth-1:0] data_t; + typedef logic [DataWidth/8-1:0] strb_t; + + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + + for (genvar i = 0; i < NumChannels; i++) begin : gen_tcdm_obi_adapt + assign tcdm_req_o[i].q_valid = obi_req_i[i].req; + assign tcdm_req_o[i].q = '{ + addr: obi_req_i[i].a.addr, + write: obi_req_i[i].a.we, + amo: reqrsp_pkg::AMONone, + data: obi_req_i[i].a.wdata, + strb: obi_req_i[i].a.be, + user: '0 + }; + + assign obi_rsp_o[i].r = '{ + rdata: tcdm_rsp_i[i].p.data, + rid: '0, + err: 1'b0, + r_optional: '0 + }; + assign obi_rsp_o[i].gnt = tcdm_rsp_i[i].q_ready; + assign obi_rsp_o[i].rvalid = tcdm_rsp_i[i].p_valid; + end + +endmodule diff --git a/sw/deps/riscv-opcodes b/sw/deps/riscv-opcodes index cf2ddfd74..ccea05454 160000 --- a/sw/deps/riscv-opcodes +++ b/sw/deps/riscv-opcodes @@ -1 +1 @@ -Subproject commit cf2ddfd747e0228a004a30af443d4895c6f8475f +Subproject commit ccea05454b549a0493dad7e69e66352ca0f3e0ed diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index 37fbc74f9..0c7c1d9e1 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -18,6 +18,7 @@ #define DMSTAT_FUNCT7 0b0000101 #define DMSTR_FUNCT7 0b0000110 #define DMREP_FUNCT7 0b0000111 +#define DMINIT_FUNCT7 0b0001000 /// A DMA transfer identifier. 
typedef uint32_t snrt_dma_txid_t;
@@ -309,8 +310,7 @@ inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) {
     asm volatile(
         "1: \n"
         ".word %0\n"
-        "sub t0, t0, %1 \n"
-        "blez t0, 1b \n" ::"i"(
+        "bltu t0, %1, 1b \n" ::"i"(
             R_TYPE_ENCODE(DMSTAT_FUNCT7, 6, 0, XDMA_FUNCT3, 5, OP_CUSTOM1)),
         "r"(tid), "r"(cfg)
         : "t0");
@@ -402,6 +402,64 @@ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) {
     snrt_dma_wait_all();
 }
 
+/**
+ * @brief Fast memset function performed by DMA with the dminit instruction.
+ * @param ptr Pointer to the start of the region.
+ * @param value Value to set.
+ * @param size The size of the transfer in bytes.
+ * @param channel The index of the channel; must be a compile-time constant.
+ * @return The transfer ID written by the dminit instruction.
+ */
+inline uint32_t snrt_dma_memset_init_1d(uint64_t ptr, uint8_t value,
+                                        uint32_t size, uint32_t channel) {
+    register uint32_t reg_dst_low asm("a0") = ptr >> 0;    // 10
+    register uint32_t reg_dst_high asm("a1") = ptr >> 32;  // 11
+    register uint32_t reg_value asm("a2") = value;         // 12
+    register uint32_t reg_txid asm("a3");                  // 13
+    register uint32_t reg_size asm("a4") = size;           // 14
+
+    // dmdst a0, a1
+    asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMDST_FUNCT7, 11, 10,
+                                                  XDMA_FUNCT3, 0, OP_CUSTOM1)),
+                 "r"(reg_dst_high), "r"(reg_dst_low));
+
+    if (value == 0x00) {
+        uint32_t cfg = channel << 2;
+        // dminit a3, a4, channel | 0b00
+        asm volatile(".word %1\n"
+                     : "=r"(reg_txid)
+                     : "i"(R_TYPE_ENCODE(DMINIT_FUNCT7, cfg, 14, XDMA_FUNCT3,
+                                         13, OP_CUSTOM1)),
+                       "r"(reg_size));
+
+    } else if (value == 0xff) {
+        uint32_t cfg = channel << 2 | 1;
+
+        // dminit a3, a4, channel | 0b01
+        asm volatile(".word %1\n"
+                     : "=r"(reg_txid)
+                     : "i"(R_TYPE_ENCODE(DMINIT_FUNCT7, cfg, 14, XDMA_FUNCT3,
+                                         13, OP_CUSTOM1)),
+                       "r"(reg_size));
+    } else {
+        uint32_t cfg = channel << 2 | 2;
+
+        // dmsrc value, 0
+        asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(
+                         DMSRC_FUNCT7, 0, 12, XDMA_FUNCT3, 0, OP_CUSTOM1)),
+                     "r"(reg_value));
+
+        // dminit a3, a4, channel | 0b10
+        asm volatile(".word %1\n"
+                     : "=r"(reg_txid)
+                     : "i"(R_TYPE_ENCODE(DMINIT_FUNCT7, cfg, 14, XDMA_FUNCT3,
+                                         13, OP_CUSTOM1)),
+                       "r"(reg_size));
+    }
+
+    return reg_txid;
+}
+
 /**
  * @brief Load a tile of a 1D array.
  * @param dst Pointer to the tile destination.
diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index f169ec6ff..48d3d7990 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -38,9 +38,10 @@ static inline void snrt_init_tls() {
         tls_ptr += size;
         size = (size_t)(&__tbss_end) - (size_t)(&__tbss_start);
         for (int i = 0; i < snrt_cluster_core_num(); i++) {
-            snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset),
-                              (void*)(snrt_zero_memory_ptr()), size);
+            snrt_dma_memset_init_1d((uint64_t)(tls_ptr + i * tls_offset), 0,
+                                    size, 0);
         }
+
         snrt_dma_wait_all();
     }
 
@@ -55,8 +56,7 @@ static inline void snrt_init_bss() {
     // Only one core needs to perform the initialization
     if (snrt_cluster_idx() == 0 && snrt_is_dm_core()) {
         size_t size = (size_t)(&__bss_end) - (size_t)(&__bss_start);
-        snrt_dma_start_1d_wideptr((uint64_t)(&__bss_start),
-                                  (uint64_t)(snrt_zero_memory_ptr()), size);
+        snrt_dma_memset_init_1d((uint64_t)(&__bss_start), 0, size, 0);
     }
 }
 #endif
@@ -80,7 +80,8 @@ static inline void snrt_init_cls() {
         // Clear cbss section
         ptr = (void*)((uint32_t)ptr + size);
         size = (size_t)(&__cbss_end) - (size_t)(&__cbss_start);
-        snrt_dma_start_1d(ptr, (void*)(snrt_zero_memory_ptr()), size);
+        snrt_dma_memset_init_1d((uint64_t)ptr, 0, size, 0);
+        snrt_dma_wait_all();
     }
 }
 #endif
diff --git a/sw/tests/dma_dminit.c b/sw/tests/dma_dminit.c
new file mode 100644
index 000000000..a58b6f8ff
--- /dev/null
+++ b/sw/tests/dma_dminit.c
@@ -0,0 +1,44 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <snrt.h>
+
+// Allocate a buffer in the main memory which we will use to copy data around
+// with the DMA.
+uint32_t buffer[32];
+
+int main() {
+    if (snrt_global_core_idx() != 8) return 0;  // only DMA core
+    uint32_t errors = 0;
+
+    // Populate buffers.
+    uint32_t buffer_src[32], buffer_dst[32];
+    for (uint32_t i = 0; i < 32; i++) {
+        buffer[i] = 0xAAAAAAAA;
+        buffer_dst[i] = 0x55555555;
+        buffer_src[i] = 0x55555555;
+    }
+
+    // Write data to main memory.
+    snrt_dma_txid_t id =
+        snrt_dma_memset_init_1d((uint64_t)buffer, 0x55, sizeof(buffer), 0);
+    snrt_dma_wait_all_channel(0);
+
+    // Check that the main memory buffer contains the correct data.
+    for (uint32_t i = 0; i < 32; i++) {
+        errors += (buffer[i] != buffer_src[i]);
+    }
+
+    // Write data to L1.
+    id = snrt_dma_memset_init_1d((uint64_t)buffer_dst, 0xff,
+                                 sizeof(buffer_dst), 0);
+    snrt_dma_wait_all_channel(0);
+
+    // Check that the L1 buffer contains the correct data.
+    for (uint32_t i = 0; i < 32; i++) {
+        errors += (buffer_dst[i] != 0xffffffff);
+    }
+
+    return errors;
+}
diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py
index 718b2eaac..935fbb0dc 100755
--- a/util/trace/gen_trace.py
+++ b/util/trace/gen_trace.py
@@ -562,7 +562,7 @@ def update_dma(insn, extras, dma_trans):
         pass
     elif mnemonic == 'dmrep':
         dma_trans[-1]['rep'] = extras['opa']
-    elif mnemonic in ['dmcpy', 'dmcpyi']:
+    elif mnemonic in ['dmcpy', 'dmcpyi', 'dminit']:
         # Create new placeholder transaction to inherit current DMA settings
         dma_trans.append(dma_trans[-1].copy())
         # Set size of the transaction