diff --git a/.github/workflow_metadata/pr_hash b/.github/workflow_metadata/pr_hash index 930c3c4..37d18b7 100644 --- a/.github/workflow_metadata/pr_hash +++ b/.github/workflow_metadata/pr_hash @@ -1 +1 @@ -148a6d381422de56ae26bc8c4288130b67b86f624ee2adb675b36c18e09bc5319f1cc53b9c3268c98892d594e9a28b44 \ No newline at end of file +4f57be3471046889f34027ace2f1c510ede448243a09622503b3ede22983243c960758c271e18642e9a08a28b76f30d3 \ No newline at end of file diff --git a/.github/workflow_metadata/pr_timestamp b/.github/workflow_metadata/pr_timestamp index 69e59bc..6a19786 100644 --- a/.github/workflow_metadata/pr_timestamp +++ b/.github/workflow_metadata/pr_timestamp @@ -1 +1 @@ -1733339945 \ No newline at end of file +1733959211 \ No newline at end of file diff --git a/src/abr_libs/rtl/abr_masked_N_bit_mult_two_share.sv b/src/abr_libs/rtl/abr_masked_N_bit_mult_two_share.sv index d91478a..dd105ca 100644 --- a/src/abr_libs/rtl/abr_masked_N_bit_mult_two_share.sv +++ b/src/abr_libs/rtl/abr_masked_N_bit_mult_two_share.sv @@ -25,7 +25,7 @@ // - Final output is obtained by combining the reshared and masked intermediate results. // - It requires fresh randomness. // - This design assumes that both x and y are secret, although y input from top level is usually public -// - It has one cycle latency and can accept a new input set at every clock. +// - It has two cycle latency and can accept a new input set at every clock. // //====================================================================== @@ -43,6 +43,7 @@ // Intermediate calculation logic for multiplication operations logic [WIDTH-1:0] calculation [3:0]; + logic [WIDTH-1:0] calculation_reg [1:0]; logic [WIDTH-1:0] calculation_rand [1:0]; logic [WIDTH-1:0] final_res [1:0]; logic [WIDTH-1:0] x0, x1, y0, y1; @@ -53,12 +54,30 @@ calculation[1] = WIDTH'(x[1] * y[0]); // Multiplication of the second share x and first share y calculation[2] = WIDTH'(x[0] * y[1]); // Multiplication of the first share x and second share y calculation[3] = WIDTH'(x[1] * y[1]); // Multiplication of the second share x and second share y - - calculation_rand[0] = calculation[2] + random; - calculation_rand[1] = calculation[1] - random; - - final_res[0] = calculation[0] + calculation_rand[0]; - final_res[1] = calculation[3] + calculation_rand[1]; + end + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + for (int i = 0; i < 2; i++) begin + calculation_rand[i] <= 'h0; + calculation_reg[i] <= 'h0; + end + end + else if (zeroize) begin + for (int i = 0; i < 2; i++) begin + calculation_rand[i] <= 'h0; + calculation_reg[i] <= 'h0; + end + end + else begin + calculation_rand[0] <= calculation[2] + random; + calculation_rand[1] <= calculation[1] - random; + calculation_reg[0] <= calculation[0]; + calculation_reg[1] <= calculation[3]; + end + end + always_comb begin + final_res[0] = calculation_reg[0] + calculation_rand[0]; + final_res[1] = calculation_reg[1] + calculation_rand[1]; end // Final output assignment diff --git a/src/mldsa_top/rtl/mldsa_ctrl.sv b/src/mldsa_top/rtl/mldsa_ctrl.sv index f1bb33c..217d9bc 100644 --- a/src/mldsa_top/rtl/mldsa_ctrl.sv +++ b/src/mldsa_top/rtl/mldsa_ctrl.sv @@ -1603,7 +1603,7 @@ mldsa_seq_sec mldsa_seq_sec_inst INTT_raw_signal <= 'h0; end else begin - if (seq_en) begin + if (sec_seq_en) begin unique case(sec_prog_cntr_nxt) MLDSA_SIGN_VALID_S : begin //NTT(C) NTT_raw_signal <= 'h1; diff --git a/src/ntt_top/rtl/ntt_butterfly2x2.sv b/src/ntt_top/rtl/ntt_butterfly2x2.sv index f54ef17..7023cf8 100644 --- a/src/ntt_top/rtl/ntt_butterfly2x2.sv +++ b/src/ntt_top/rtl/ntt_butterfly2x2.sv @@ -27,12 +27,7 @@ module ntt_butterfly2x2 #( parameter REG_SIZE = 23, parameter MLDSA_Q = 23'd8380417, - parameter MLDSA_Q_DIV2_ODD = (MLDSA_Q + 1) / 2, - parameter BF_LATENCY = 10, //5 cycles per butterfly * 2 instances in serial = 10 clks - parameter PWM_LATENCY = 5, //latency of modular multiplier + modular addition to perform accumulation - parameter PWA_LATENCY = 1, //latency of modular addition - parameter PWS_LATENCY = 1, //latency of modular subtraction - parameter BF_STAGE1_LATENCY = BF_LATENCY/2 + parameter MLDSA_Q_DIV2_ODD = (MLDSA_Q + 1) / 2 ) ( //Clock and reset @@ -66,9 +61,9 @@ module ntt_butterfly2x2 logic [REG_SIZE-1:0] w01; logic [REG_SIZE-1:0] w10; logic [REG_SIZE-1:0] w11; - logic [BF_STAGE1_LATENCY-1:0][REG_SIZE-1:0] w10_reg, w11_reg; //Shift w10 by 5 cycles to match 1st stage BF latency + logic [UNMASKED_BF_STAGE1_LATENCY-1:0][REG_SIZE-1:0] w10_reg, w11_reg; //Shift w10 by 5 cycles to match 1st stage BF latency logic pwo_mode; - logic [BF_LATENCY-1:0] ready_reg; + logic [UNMASKED_BF_LATENCY-1:0] ready_reg; //Each butterfly unit takes u, v, w inputs and produces //u, v outputs for the next stage to consume. Each butterfly @@ -90,8 +85,8 @@ module ntt_butterfly2x2 w11_reg <= 'h0; end else begin - w10_reg <= {uvw_i.w10_i, w10_reg[BF_STAGE1_LATENCY-1:1]}; - w11_reg <= {uvw_i.w11_i, w11_reg[BF_STAGE1_LATENCY-1:1]}; + w10_reg <= {uvw_i.w10_i, w10_reg[UNMASKED_BF_STAGE1_LATENCY-1:1]}; + w11_reg <= {uvw_i.w11_i, w11_reg[UNMASKED_BF_STAGE1_LATENCY-1:1]}; end end @@ -221,9 +216,9 @@ module ntt_butterfly2x2 ready_reg <= 'b0; else begin unique case(mode) - ct: ready_reg <= {enable, ready_reg[BF_LATENCY-1:1]}; - gs: ready_reg <= {enable, ready_reg[BF_LATENCY-1:1]}; - pwm: ready_reg <= accumulate ? {5'h0, enable, ready_reg[PWM_LATENCY-1:1]} : {6'h0, enable, ready_reg[PWM_LATENCY-2:1]}; + ct: ready_reg <= {enable, ready_reg[UNMASKED_BF_LATENCY-1:1]}; + gs: ready_reg <= {enable, ready_reg[UNMASKED_BF_LATENCY-1:1]}; + pwm: ready_reg <= accumulate ? {5'h0, enable, ready_reg[UNMASKED_PWM_LATENCY-1:1]} : {6'h0, enable, ready_reg[UNMASKED_PWM_LATENCY-2:1]}; pwa: ready_reg <= {9'h0, enable}; pws: ready_reg <= {9'h0, enable}; default: ready_reg <= 'h0; diff --git a/src/ntt_top/rtl/ntt_ctrl.sv b/src/ntt_top/rtl/ntt_ctrl.sv index 447125b..191fec6 100644 --- a/src/ntt_top/rtl/ntt_ctrl.sv +++ b/src/ntt_top/rtl/ntt_ctrl.sv @@ -33,9 +33,7 @@ module ntt_ctrl parameter MLDSA_Q_DIV2_ODD = (MLDSA_Q+1)/2, parameter MLDSA_N = 256, parameter MLDSA_LOGN = 8, - parameter MEM_ADDR_WIDTH = 15, - parameter BF_LATENCY = 10, //5 cycles per butterfly * 2 instances in serial = 10 clks - parameter NTT_BUF_LATENCY = 4 + parameter MEM_ADDR_WIDTH = 15 ) ( input wire clk, @@ -90,13 +88,9 @@ localparam INTT_READ_ADDR_STEP = 1; localparam INTT_WRITE_ADDR_STEP = 16; localparam PWO_READ_ADDR_STEP = 1; localparam PWO_WRITE_ADDR_STEP = 1; -localparam PWM_LATENCY = 5; -localparam MASKED_BF_STAGE1_LATENCY = 264; //TODO check -localparam MASKED_PWM_LATENCY = 209; //For 1 masked pwm operation localparam [MEM_ADDR_WIDTH-1:0] MEM_LAST_ADDR = 63; -localparam INTT_WRBUF_LATENCY = 13; //includes BF latency + mem latency for shuffled reads to begin -localparam MASKED_PWM_INTT_WRBUF_LATENCY = 481; //masked PWM+INTT latency + mem latency for shuffled reads to begin + //FSM states ntt_read_state_t read_fsm_state_ps, read_fsm_state_ns; ntt_write_state_t write_fsm_state_ps, write_fsm_state_ns; @@ -115,11 +109,12 @@ logic [3:0] chunk_count; logic [1:0] index_rand_offset, index_count, mem_rd_index_ofst; logic [1:0] buf_rdptr_int; logic [1:0] buf_rdptr_f; -logic [BF_LATENCY:0][1:0] buf_rdptr_reg; +logic [UNMASKED_BF_LATENCY:0][1:0] buf_rdptr_reg; //logic [INTT_WRBUF_LATENCY-1:0][1:0] buf_wrptr_reg; -logic [MASKED_PWM_INTT_WRBUF_LATENCY-1:0][1:0] buf_wrptr_reg; -logic [MASKED_BF_STAGE1_LATENCY:0][3:0] chunk_count_reg; -// logic [MASKED_PWM_INTT_WRBUF_LATENCY:0] chunk_count_reg; +logic [MASKED_INTT_WRBUF_LATENCY-1:0][1:0] buf_wrptr_reg; +// logic [MASKED_BF_STAGE1_LATENCY:0][3:0] chunk_count_reg; +logic [MASKED_INTT_WRBUF_LATENCY-3:0][3:0] chunk_count_reg; //buf latency not rqd + logic latch_chunk_rand_offset, latch_index_rand_offset; logic last_rd_addr, last_wr_addr; logic mem_wr_en_fsm, mem_wr_en_reg; @@ -366,9 +361,9 @@ always_comb begin pw_mem_rd_addr_a_nxt = pw_base_addr_a + (4*chunk_count) + (PWO_READ_ADDR_STEP*mem_rd_index_ofst); pw_mem_rd_addr_b_nxt = pw_base_addr_b + (4*chunk_count) + (PWO_READ_ADDR_STEP*mem_rd_index_ofst); pw_mem_rd_addr_c_nxt = accumulate ? pw_base_addr_c + ((4*chunk_count)+(PWO_READ_ADDR_STEP*mem_rd_index_ofst)) : 'h0; //TODO check timing - pw_mem_wr_addr_c_nxt = accumulate ? pw_base_addr_c + (4*chunk_count_reg[PWM_LATENCY-2]) + (PWO_WRITE_ADDR_STEP*buf_rdptr_reg[PWM_LATENCY-2]) + pw_mem_wr_addr_c_nxt = accumulate ? pw_base_addr_c + (4*chunk_count_reg[UNMASKED_PWM_LATENCY-2]) + (PWO_WRITE_ADDR_STEP*buf_rdptr_reg[UNMASKED_PWM_LATENCY-2]) : (pwa_mode | pws_mode) ? pw_base_addr_c + (4*chunk_count_reg[7]) + (PWO_WRITE_ADDR_STEP*buf_rdptr_reg[7]) - : pw_base_addr_c + (4*chunk_count_reg[PWM_LATENCY-1]) + (PWO_WRITE_ADDR_STEP*buf_rdptr_reg[PWM_LATENCY-1]); //2 + : pw_base_addr_c + (4*chunk_count_reg[UNMASKED_PWM_LATENCY-1]) + (PWO_WRITE_ADDR_STEP*buf_rdptr_reg[UNMASKED_PWM_LATENCY-1]); //2 end //PWO addr @@ -412,24 +407,24 @@ end //------------------------------------------ -//Twiddle addr logic - TODO: shuffling+masking (adjust latency) +//Twiddle addr logic //------------------------------------------ always_comb begin unique case(rounds_count) 'h0: begin twiddle_end_addr = ct_mode ? 'd0 : 'd63; twiddle_offset = 'h0; - twiddle_rand_offset = ct_mode ? 'h0 : pwm_intt_mode ? 7'((4*chunk_count_reg[MASKED_BF_STAGE1_LATENCY]) + buf_wrptr_reg[MASKED_PWM_INTT_WRBUF_LATENCY-1]) : 7'((4*chunk_count_reg[BF_LATENCY]) + buf_wrptr_reg[INTT_WRBUF_LATENCY-1]); + twiddle_rand_offset = ct_mode ? 'h0 : pwm_intt_mode ? 7'((4*chunk_count_reg[MASKED_INTT_WRBUF_LATENCY-MASKED_PWM_LATENCY-3]) + buf_wrptr_reg[MASKED_INTT_WRBUF_LATENCY-MASKED_PWM_LATENCY-1]) : 7'((4*chunk_count_reg[UNMASKED_BF_LATENCY]) + buf_wrptr_reg[INTT_WRBUF_LATENCY-1]); //pwm_intt mode only applies to round 0. Other rounds follow gs calc end 'h1: begin twiddle_end_addr = ct_mode ? 'd3 : 'd15; twiddle_offset = ct_mode ? 'd1 : 'd64; - twiddle_rand_offset = ct_mode ? 7'(buf_rdptr_int) : pwm_intt_mode ? 7'((chunk_count_reg[MASKED_BF_STAGE1_LATENCY] % 4)*4 + buf_wrptr_reg[MASKED_PWM_INTT_WRBUF_LATENCY-1]) : 7'((chunk_count_reg[BF_LATENCY] % 4)*4 + buf_wrptr_reg[INTT_WRBUF_LATENCY-1]); + twiddle_rand_offset = ct_mode ? 7'(buf_rdptr_int) : 7'((chunk_count_reg[UNMASKED_BF_LATENCY] % 4)*4 + buf_wrptr_reg[INTT_WRBUF_LATENCY-1]); end 'h2: begin twiddle_end_addr = ct_mode ? 'd15 : 'd3; twiddle_offset = ct_mode ? 'd5 : 'd80; - twiddle_rand_offset = ct_mode ? 7'((chunk_count % 'd4)*'d4 + buf_rdptr_int) : pwm_intt_mode ? 7'(buf_wrptr_reg[MASKED_PWM_INTT_WRBUF_LATENCY-1]) : 7'(buf_wrptr_reg[INTT_WRBUF_LATENCY-1]); + twiddle_rand_offset = ct_mode ? 7'((chunk_count % 'd4)*'d4 + buf_rdptr_int) : 7'(buf_wrptr_reg[INTT_WRBUF_LATENCY-1]); end 'h3: begin twiddle_end_addr = ct_mode ? 'd63 : 'd0; @@ -578,16 +573,19 @@ always_ff @(posedge clk or negedge reset_n) begin buf_wrptr_reg <= 'h0; end else if (ct_mode & (buf_rden_ntt | butterfly_ready)) begin - buf_rdptr_reg <= {buf_rdptr_int, buf_rdptr_reg[BF_LATENCY:1]}; + buf_rdptr_reg <= {buf_rdptr_int, buf_rdptr_reg[UNMASKED_BF_LATENCY:1]}; end else if ((gs_mode & (incr_mem_rd_addr | butterfly_ready))) begin - buf_wrptr_reg <= {{(MASKED_PWM_INTT_WRBUF_LATENCY-INTT_WRBUF_LATENCY){2'h0}}, mem_rd_index_ofst, buf_wrptr_reg[INTT_WRBUF_LATENCY-1:1]}; + buf_wrptr_reg <= {{(MASKED_INTT_WRBUF_LATENCY-INTT_WRBUF_LATENCY){2'h0}}, mem_rd_index_ofst, buf_wrptr_reg[INTT_WRBUF_LATENCY-1:1]}; end else if (pwo_mode & (incr_pw_rd_addr | butterfly_ready)) begin - buf_rdptr_reg <= {mem_rd_index_ofst, buf_rdptr_reg[BF_LATENCY:1]}; //TODO: create new reg with apt name for PWO + buf_rdptr_reg <= {mem_rd_index_ofst, buf_rdptr_reg[UNMASKED_BF_LATENCY:1]}; //TODO: create new reg with apt name for PWO + end + else if ((pwm_intt_mode)) begin + buf_wrptr_reg <= {mem_rd_index_ofst, buf_wrptr_reg[MASKED_INTT_WRBUF_LATENCY-1:1]}; end else if ((pwm_intt_mode)) begin - buf_wrptr_reg <= {mem_rd_index_ofst, buf_wrptr_reg[MASKED_PWM_INTT_WRBUF_LATENCY-1:1]}; + buf_wrptr_reg <= {mem_rd_index_ofst, buf_wrptr_reg[MASKED_INTT_WRBUF_LATENCY-1:1]}; end else begin buf_rdptr_reg <= 'h0; @@ -627,11 +625,11 @@ always_ff @(posedge clk or negedge reset_n) begin chunk_count_reg <= 'h0; end //chunk update can't use incr_mem_rd_addr in pwm_intt mode. - else if (pwm_intt_mode & incr_pw_rd_addr) begin - chunk_count_reg <= {chunk_count, chunk_count_reg[MASKED_BF_STAGE1_LATENCY:1]}; + else if (pwm_intt_mode/* & incr_pw_rd_addr*/) begin + chunk_count_reg <= {chunk_count, chunk_count_reg[MASKED_INTT_WRBUF_LATENCY-3:1]}; end else if (buf_rden_ntt | butterfly_ready | (gs_mode & incr_mem_rd_addr) | (pwo_mode & incr_pw_rd_addr)) begin //TODO: replace gs condition with an fsm generated flag perhaps? - chunk_count_reg <= {{(MASKED_BF_STAGE1_LATENCY+1-BF_LATENCY){4'h0}}, chunk_count, chunk_count_reg[BF_LATENCY:1]}; + chunk_count_reg <= {{(MASKED_BF_STAGE1_LATENCY+1-UNMASKED_BF_LATENCY){4'h0}}, chunk_count, chunk_count_reg[UNMASKED_BF_LATENCY:1]}; end end diff --git a/src/ntt_top/rtl/ntt_defines_pkg.sv b/src/ntt_top/rtl/ntt_defines_pkg.sv index 606736b..9e86be7 100644 --- a/src/ntt_top/rtl/ntt_defines_pkg.sv +++ b/src/ntt_top/rtl/ntt_defines_pkg.sv @@ -30,6 +30,23 @@ parameter NTT_REG_SIZE = REG_SIZE-1; parameter MASKED_WIDTH = 46; // parameter MEM_DEPTH = 2**MLDSA_MEM_ADDR_WIDTH; +//---------------------- +//Latency params for NTT +//---------------------- +parameter INTT_WRBUF_LATENCY = 13; +parameter UNMASKED_BF_LATENCY = 10; //5 cycles per butterfly * 2 instances in serial = 10 clks +parameter UNMASKED_PWM_LATENCY = 5; //latency of modular multiplier + modular addition to perform accumulation +parameter UNMASKED_PWA_LATENCY = 1; //latency of modular addition +parameter UNMASKED_PWS_LATENCY = 1; //latency of modular subtraction +parameter UNMASKED_BF_STAGE1_LATENCY = UNMASKED_BF_LATENCY/2; + +parameter MASKED_PWM_LATENCY = 211; //For 1 masked pwm operation +parameter MASKED_BF_STAGE1_LATENCY = 266; //For 1 masked butterfly operation +parameter MASKED_PWM_MASKED_INTT_LATENCY = MASKED_PWM_LATENCY + MASKED_BF_STAGE1_LATENCY; //PWM+stage1 INTT latency +parameter MASKED_INTT_LATENCY = MASKED_BF_STAGE1_LATENCY + UNMASKED_BF_STAGE1_LATENCY; //masked INTT latency +parameter MASKED_PWM_INTT_LATENCY = MASKED_PWM_LATENCY + MASKED_INTT_LATENCY + 1; //TODO: adjust for PWMA case. Adding 1 cyc as a placeholder for it +parameter MASKED_ADD_SUB_LATENCY = 53; //For 1 masked add/sub operation +parameter MASKED_INTT_WRBUF_LATENCY = MASKED_PWM_LATENCY + MASKED_INTT_LATENCY + 3; //masked PWM+INTT latency + mem latency for shuffled reads to begin (does not include PWMA case) // typedef enum logic [2:0] {ct, gs, pwm, pwa, pws} mode_t; //TODO: tb has issue with enums in top level ports. For now, using this workaround diff --git a/src/ntt_top/rtl/ntt_hybrid_butterfly_2x2.sv b/src/ntt_top/rtl/ntt_hybrid_butterfly_2x2.sv index 27408d8..f73356c 100644 --- a/src/ntt_top/rtl/ntt_hybrid_butterfly_2x2.sv +++ b/src/ntt_top/rtl/ntt_hybrid_butterfly_2x2.sv @@ -47,20 +47,6 @@ module ntt_hybrid_butterfly_2x2 output logic ready_o ); -//---------------------- -//Latency params -//---------------------- -localparam UNMASKED_BF_LATENCY = 10; //5 cycles per butterfly * 2 instances in serial = 10 clks -localparam UNMASKED_PWM_LATENCY = 5; //latency of modular multiplier + modular addition to perform accumulation -localparam UNMASKED_PWA_LATENCY = 1; //latency of modular addition -localparam UNMASKED_PWS_LATENCY = 1; //latency of modular subtraction -localparam UNMASKED_BF_STAGE1_LATENCY = UNMASKED_BF_LATENCY/2; -localparam MASKED_BF_STAGE1_LATENCY = 264; //For 1 masked butterfly -localparam MASKED_PWM_LATENCY = 209; //For 1 masked pwm operation -localparam MASKED_PWM_MASKED_INTT_LATENCY = MASKED_PWM_LATENCY + MASKED_BF_STAGE1_LATENCY; -localparam MASKED_INTT_LATENCY = MASKED_BF_STAGE1_LATENCY + UNMASKED_BF_STAGE1_LATENCY; -localparam MASKED_PWM_INTT_LATENCY = MASKED_PWM_LATENCY + MASKED_INTT_LATENCY + 1; //TODO: adjust for PWMA case. Adding 1 cyc as a placeholder for it - //---------------------- //Unmasked wires //---------------------- @@ -77,8 +63,8 @@ logic masking_en_reg; //Other internal wires logic [UNMASKED_BF_STAGE1_LATENCY-1:0][HALF_WIDTH-1:0] w10_reg, w11_reg; //Shift w10 by 5 cycles to match 1st stage BF latency -logic [MASKED_PWM_LATENCY-1:0][HALF_WIDTH-1:0] masked_w00_reg, masked_w01_reg; -logic [MASKED_PWM_MASKED_INTT_LATENCY-1:0][HALF_WIDTH-1:0] masked_w10_reg, masked_w11_reg; +// logic [MASKED_PWM_LATENCY-1:0][HALF_WIDTH-1:0] masked_w00_reg, masked_w01_reg; +logic [MASKED_BF_STAGE1_LATENCY-1:0][HALF_WIDTH-1:0] masked_w10_reg, masked_w11_reg; logic pwo_mode, pwm_intt_mode; // logic [UNMASKED_BF_LATENCY-1:0] ready_reg; logic [MASKED_PWM_INTT_LATENCY-1:0] masked_ready_reg; @@ -87,6 +73,7 @@ logic [MASKED_PWM_INTT_LATENCY-1:0] masked_ready_reg; logic [1:0][WIDTH-1:0] u00_share, u01_share, v00_share, v01_share, u10_share, v10_share, u11_share, v11_share; logic [1:0][WIDTH-1:0] w00_share, w01_share, w10_share, w11_share; //, w10_reg_share, w11_reg_share; logic [1:0][WIDTH-1:0] uv00_share, uv01_share, uv10_share, uv11_share; +logic [1:0][WIDTH-1:0] uv00_share_reg, uv01_share_reg, uv10_share_reg, uv11_share_reg; logic [1:0][WIDTH-1:0] twiddle_w00_share, twiddle_w01_share; bf_uvo_t masked_gs_stage1_uvo; @@ -112,25 +99,18 @@ end always_ff @(posedge clk or negedge reset_n) begin if (!reset_n) begin - masked_w00_reg <= 'h0; - masked_w01_reg <= 'h0; masked_w10_reg <= 'h0; - masked_w11_reg <= 'h0; end else if (zeroize) begin - masked_w00_reg <= 'h0; - masked_w01_reg <= 'h0; masked_w10_reg <= 'h0; - masked_w11_reg <= 'h0; end else begin - masked_w00_reg <= {hybrid_pw_uvw_i.twiddle_w0_i, masked_w00_reg[MASKED_PWM_LATENCY-1:1]}; //TODO add PWMA latency when Ay countermeasure is added - masked_w01_reg <= {hybrid_pw_uvw_i.twiddle_w1_i, masked_w01_reg[MASKED_PWM_LATENCY-1:1]}; - masked_w10_reg <= {hybrid_pw_uvw_i.twiddle_w2_i, masked_w10_reg[MASKED_PWM_MASKED_INTT_LATENCY-1:1]}; - masked_w11_reg <= {hybrid_pw_uvw_i.twiddle_w3_i, masked_w11_reg[MASKED_PWM_MASKED_INTT_LATENCY-1:1]}; + masked_w10_reg <= {hybrid_pw_uvw_i.twiddle_w2_i, masked_w10_reg[MASKED_BF_STAGE1_LATENCY-1:1]}; end end +assign masked_w11_reg = masked_w10_reg; //used only in masked INTT, both are equal, so can opt num of flops + assign pwo_mode = (mode inside {pwm, pwa, pws}); assign pwm_intt_mode = (mode == pwm_intt) & masking_en; @@ -191,105 +171,99 @@ always_comb begin end //Split into shares -always_comb begin - //TODO: check randomness with Emre - //TODO: add flops here (input side) +always_ff @(posedge clk or negedge reset_n) begin + if (!reset_n) begin + for (int i = 0; i < 2; i++) begin + u00_share[i] <= 'h0; + u01_share[i] <= 'h0; + u10_share[i] <= 'h0; + u11_share[i] <= 'h0; + + v00_share[i] <= 'h0; + v01_share[i] <= 'h0; + v10_share[i] <= 'h0; + v11_share[i] <= 'h0; + + w00_share[i] <= 'h0; + w01_share[i] <= 'h0; + w10_share[i] <= 'h0; + w11_share[i] <= 'h0; + + twiddle_w00_share[i] <= 'h0; + twiddle_w01_share[i] <= 'h0; + end + end + else if (zeroize) begin + for (int i = 0; i < 2; i++) begin + u00_share[i] <= 'h0; + u01_share[i] <= 'h0; + u10_share[i] <= 'h0; + u11_share[i] <= 'h0; + + v00_share[i] <= 'h0; + v01_share[i] <= 'h0; + v10_share[i] <= 'h0; + v11_share[i] <= 'h0; + + w00_share[i] <= 'h0; + w01_share[i] <= 'h0; + w10_share[i] <= 'h0; + w11_share[i] <= 'h0; + + twiddle_w00_share[i] <= 'h0; + twiddle_w01_share[i] <= 'h0; + end + end + else begin //Split u inputs - if (masking_en) begin - u00_share[0] = WIDTH'(u00) - rnd_i[0]; - u00_share[1] = rnd_i[0]; + u00_share[0] <= WIDTH'(u00) - rnd_i[0]; + u00_share[1] <= rnd_i[0]; - u01_share[0] = WIDTH'(u01) - rnd_i[1]; - u01_share[1] = rnd_i[1]; + u01_share[0] <= WIDTH'(u01) - rnd_i[0]; + u01_share[1] <= rnd_i[0]; - u10_share[0] = WIDTH'(u10) - rnd_i[0]; - u10_share[1] = rnd_i[0]; + u10_share[0] <= WIDTH'(u10) - rnd_i[0]; + u10_share[1] <= rnd_i[0]; - u11_share[0] = WIDTH'(u11) - rnd_i[0]; - u11_share[1] = rnd_i[0]; + u11_share[0] <= WIDTH'(u11) - rnd_i[0]; + u11_share[1] <= rnd_i[0]; //Split v inputs - v00_share[0] = WIDTH'(v00) - rnd_i[2]; - v00_share[1] = rnd_i[2]; + v00_share[0] <= WIDTH'(v00) - rnd_i[1]; + v00_share[1] <= rnd_i[1]; - v01_share[0] = WIDTH'(v01) - rnd_i[3]; - v01_share[1] = rnd_i[3]; + v01_share[0] <= WIDTH'(v01) - rnd_i[1]; + v01_share[1] <= rnd_i[1]; - v10_share[0] = WIDTH'(v10) - rnd_i[2]; - v10_share[1] = rnd_i[2]; + v10_share[0] <= WIDTH'(v10) - rnd_i[1]; + v10_share[1] <= rnd_i[1]; - v11_share[0] = WIDTH'(v11) - rnd_i[2]; - v11_share[1] = rnd_i[2]; + v11_share[0] <= WIDTH'(v11) - rnd_i[1]; + v11_share[1] <= rnd_i[1]; //Split w inputs - w00_share[0] = WIDTH'(w00) - rnd_i[4]; - w00_share[1] = rnd_i[4]; - - w01_share[0] = WIDTH'(w01) - rnd_i[0]; - w01_share[1] = rnd_i[0]; - - w10_share[0] = WIDTH'(w10) - rnd_i[1]; - w10_share[1] = rnd_i[1]; - - w11_share[0] = WIDTH'(w11) - rnd_i[2]; - w11_share[1] = rnd_i[2]; - - twiddle_w00_share[0] = WIDTH'(masked_w00_reg[0]) - rnd_i[0]; - twiddle_w00_share[1] = rnd_i[0]; - - twiddle_w01_share[0] = WIDTH'(masked_w01_reg[0]) - rnd_i[1]; - twiddle_w01_share[1] = rnd_i[1]; - - end - else begin - u00_share[0] = 'h0; - u00_share[1] = 'h0; - - u01_share[0] = 'h0; - u01_share[1] = 'h0; - - u10_share[0] = 'h0; - u10_share[1] = 'h0; - - u11_share[0] = 'h0; - u11_share[1] = 'h0; - - //Split v input - v00_share[0] = 'h0; - v00_share[1] = 'h0; - - v01_share[0] = 'h0; - v01_share[1] = 'h0; - - v10_share[0] = 'h0; - v10_share[1] = 'h0; - - v11_share[0] = 'h0; - v11_share[1] = 'h0; - - //Split w input - w00_share[0] = 'h0; - w00_share[1] = 'h0; + w00_share[0] <= WIDTH'(w00) - rnd_i[2]; + w00_share[1] <= rnd_i[2]; - w01_share[0] = 'h0; - w01_share[1] = 'h0; + w01_share[0] <= WIDTH'(w01) - rnd_i[2]; + w01_share[1] <= rnd_i[2]; - w10_share[0] = 'h0; - w10_share[1] = 'h0; + w10_share[0] <= WIDTH'(w10) - rnd_i[2]; + w10_share[1] <= rnd_i[2]; - w11_share[0] = 'h0; - w11_share[1] = 'h0; + w11_share[0] <= WIDTH'(w11) - rnd_i[2]; + w11_share[1] <= rnd_i[2]; - twiddle_w00_share[0] = 'h0; - twiddle_w00_share[1] = 'h0; + twiddle_w00_share[0] <= WIDTH'(hybrid_pw_uvw_i.twiddle_w0_i) - rnd_i[3]; + twiddle_w00_share[1] <= rnd_i[3]; - twiddle_w01_share[0] = 'h0; - twiddle_w01_share[1] = 'h0; + twiddle_w01_share[0] <= WIDTH'(hybrid_pw_uvw_i.twiddle_w1_i) - rnd_i[3]; + twiddle_w01_share[1] <= rnd_i[3]; end end //---------------------------------------------------- -//Masked PWMs - Used in masked PWM+INTT mode only - 209 clks +//Masked PWMs - Used in masked PWM+INTT mode only - 210 clks //---------------------------------------------------- ntt_masked_pwm #( .WIDTH(WIDTH) @@ -347,6 +321,39 @@ ntt_masked_pwm #( .res(uv11_share) ); +//--------------------------- +//Refresh randomness +//--------------------------- +always_ff @(posedge clk or negedge reset_n) begin + if (!reset_n) begin + for (int i = 0; i < 2; i++) begin + uv00_share_reg[i] <= 'h0; + uv01_share_reg[i] <= 'h0; + uv10_share_reg[i] <= 'h0; + uv11_share_reg[i] <= 'h0; + end + end + else if (zeroize) begin + for (int i = 0; i < 2; i++) begin + uv00_share_reg[i] <= 'h0; + uv01_share_reg[i] <= 'h0; + uv10_share_reg[i] <= 'h0; + uv11_share_reg[i] <= 'h0; + end + end + else begin + uv00_share_reg[0] <= uv00_share[0] - rnd_i[0]; + uv01_share_reg[0] <= uv01_share[0] - rnd_i[1]; + uv10_share_reg[0] <= uv10_share[0] - rnd_i[2]; + uv11_share_reg[0] <= uv11_share[0] - rnd_i[3]; + + uv00_share_reg[1] <= uv00_share[1] + rnd_i[0]; + uv01_share_reg[1] <= uv01_share[1] + rnd_i[1]; + uv10_share_reg[1] <= uv10_share[1] + rnd_i[2]; + uv11_share_reg[1] <= uv11_share[1] + rnd_i[3]; + end +end + //---------------------------------------------------- //Masked BFU stage 1 - Used in masked PWM+INTT mode only - 264 clks //PWM outputs: uv00[1:0], uv01[1:0], uv10[1:0], uv11[1:0] @@ -357,7 +364,7 @@ ntt_masked_butterfly1x2 #( .clk(clk), .reset_n(reset_n), .zeroize(zeroize), - .uvw_i({uv00_share, uv10_share, uv01_share, uv11_share, twiddle_w00_share, twiddle_w01_share}), + .uvw_i({uv00_share_reg, uv10_share_reg, uv01_share_reg, uv11_share_reg, twiddle_w00_share, twiddle_w01_share}), .rnd_i({rnd_i[4], rnd_i[3], rnd_i[2], rnd_i[1], rnd_i[0]}), .uv_o(masked_gs_stage1_uvo) ); @@ -443,12 +450,12 @@ always_ff @(posedge clk or negedge reset_n) begin masked_ready_reg <= 'b0; else begin unique case(mode) //471:0 delay flop for enable - TODO: optimize - ct: masked_ready_reg <= {462'h0, enable, masked_ready_reg[UNMASKED_BF_LATENCY-1:1]}; - gs: masked_ready_reg <= {462'h0, enable, masked_ready_reg[UNMASKED_BF_LATENCY-1:1]}; - pwm: masked_ready_reg <= accumulate ? {467'h0, enable, masked_ready_reg[UNMASKED_PWM_LATENCY-1:1]} : {6'h0, enable, masked_ready_reg[UNMASKED_PWM_LATENCY-2:1]}; + ct: masked_ready_reg <= {{(MASKED_PWM_INTT_LATENCY-UNMASKED_BF_LATENCY){1'b0}}, enable, masked_ready_reg[UNMASKED_BF_LATENCY-1:1]}; + gs: masked_ready_reg <= {{(MASKED_PWM_INTT_LATENCY-UNMASKED_BF_LATENCY){1'b0}}, enable, masked_ready_reg[UNMASKED_BF_LATENCY-1:1]}; + pwm: masked_ready_reg <= accumulate ? {{(MASKED_PWM_INTT_LATENCY-UNMASKED_PWM_LATENCY){1'b0}}, enable, masked_ready_reg[UNMASKED_PWM_LATENCY-1:1]} : {6'h0, enable, masked_ready_reg[UNMASKED_PWM_LATENCY-2:1]}; pwm_intt: masked_ready_reg <= accumulate ? {enable, masked_ready_reg[MASKED_PWM_INTT_LATENCY-1:1]} : {1'b0, enable, masked_ready_reg[MASKED_PWM_INTT_LATENCY-2:1]}; //TODO revisit - pwa: masked_ready_reg <= {471'h0, enable}; - pws: masked_ready_reg <= {471'h0, enable}; + pwa: masked_ready_reg <= {{MASKED_PWM_INTT_LATENCY-1{1'b0}}, enable}; + pws: masked_ready_reg <= {{MASKED_PWM_INTT_LATENCY-1{1'b0}}, enable}; default: masked_ready_reg <= 'h0; endcase end diff --git a/src/ntt_top/rtl/ntt_masked_BFU_mult.sv b/src/ntt_top/rtl/ntt_masked_BFU_mult.sv index 4d99f2d..c4ccbaf 100644 --- a/src/ntt_top/rtl/ntt_masked_BFU_mult.sv +++ b/src/ntt_top/rtl/ntt_masked_BFU_mult.sv @@ -15,7 +15,7 @@ //====================================================================== // // ntt_masked_BFU_mult -// Performs two share multiplication and reduction - total latency = 209 clks +// Performs two share multiplication and reduction - total latency = 210 clks //====================================================================== module ntt_masked_BFU_mult @@ -48,7 +48,7 @@ module ntt_masked_BFU_mult logic [1:0] mul_res_reduced [WIDTH-1:0]; logic [WIDTH-1:0] mul_res_bool_redux0, mul_res_bool_redux1, mul_res_redux0, mul_res_redux1; - //Perform mul on input shares - 1 clk + //Perform mul on input shares - 2 clk abr_masked_N_bit_mult_two_share #( .WIDTH(WIDTH) ) masked_two_share_mult_inst ( diff --git a/src/ntt_top/rtl/ntt_masked_butterfly1x2.sv b/src/ntt_top/rtl/ntt_masked_butterfly1x2.sv index a04de0c..b1caca8 100644 --- a/src/ntt_top/rtl/ntt_masked_butterfly1x2.sv +++ b/src/ntt_top/rtl/ntt_masked_butterfly1x2.sv @@ -58,7 +58,7 @@ module ntt_masked_butterfly1x2 w01 = uvw_i.w01_i; end - //263 + //264 ntt_masked_gs_butterfly #( .WIDTH(WIDTH) ) masked_bf_inst00 ( diff --git a/src/ntt_top/rtl/ntt_masked_gs_butterfly.sv b/src/ntt_top/rtl/ntt_masked_gs_butterfly.sv index 0a8034f..8c4964b 100644 --- a/src/ntt_top/rtl/ntt_masked_gs_butterfly.sv +++ b/src/ntt_top/rtl/ntt_masked_gs_butterfly.sv @@ -16,7 +16,7 @@ // ntt_masked_gs_butterfly.sv // -------- // Only performs gs (INTT) mode of operation. All blocks are masked -// Latency = 262 clks +// Latency = 264 clks module ntt_masked_gs_butterfly import mldsa_params_pkg::*; @@ -38,8 +38,6 @@ module ntt_masked_gs_butterfly output logic [1:0] v_o [WIDTH-1:0] ); - localparam MASKED_MULT_LATENCY = 209; - localparam MASKED_ADD_SUB_LATENCY = 53; logic [MASKED_ADD_SUB_LATENCY-1:0][1:0][WIDTH-1:0] w_reg; logic [1:0] add_res [WIDTH-1:0]; logic [1:0] sub_res [WIDTH-1:0]; @@ -70,7 +68,7 @@ module ntt_masked_gs_butterfly abr_delay_masked_shares #( .WIDTH(WIDTH), - .N(MASKED_MULT_LATENCY) + .N(MASKED_PWM_LATENCY-1) //Inputs to BF multiplier are internal to this block. There's no input flop in the path, so latency is 1 clk less than the mult latency defined in the pkg ) add_res_delay_inst ( .clk(clk), .rst_n(reset_n), @@ -125,7 +123,7 @@ module ntt_masked_gs_butterfly end end - //209 clks + //210 clks ntt_masked_BFU_mult #( .WIDTH(WIDTH) ) mult_inst_0 ( diff --git a/src/ntt_top/rtl/ntt_masked_pwm.sv b/src/ntt_top/rtl/ntt_masked_pwm.sv index a162180..ff934a9 100644 --- a/src/ntt_top/rtl/ntt_masked_pwm.sv +++ b/src/ntt_top/rtl/ntt_masked_pwm.sv @@ -19,14 +19,13 @@ // This module performs masked pwm operation with or without accumulate // on input shares. Always performs (u*v)+w (top level needs to drive 0 // to the w input if not in accumulate mode) -// 209 clks if PWM, 262 clks if PWMA +// 210 clks if PWM, 263 clks if PWMA module ntt_masked_pwm import mldsa_params_pkg::*; import ntt_defines_pkg::*; #( - parameter WIDTH = 46, - parameter MASKED_MULT_LATENCY = 209 + parameter WIDTH = 46 ) ( input wire clk, @@ -51,15 +50,15 @@ module ntt_masked_pwm w_unpacked[i][0] = w[0][i]; w_unpacked[i][1] = w[1][i]; - w_reg_packed[0][i] = w_reg[i][0]; - w_reg_packed[1][i] = w_reg[i][1]; + w_reg_packed[0][i] = 'h0; //w_reg[i][0]; + w_reg_packed[1][i] = 'h0; //w_reg[i][1]; //TODO: fix mul_res_packed[0][i] = mul_res[i][0]; mul_res_packed[1][i] = mul_res[i][1]; end end - //209 clks + //210 clks ntt_masked_BFU_mult #( .WIDTH(WIDTH) ) mult_inst0 ( @@ -76,17 +75,19 @@ module ntt_masked_pwm .res(mul_res) ); - abr_delay_masked_shares #( - .WIDTH(WIDTH), - .N(MASKED_MULT_LATENCY) - ) w_delay ( - .clk(clk), - .rst_n(reset_n), - .zeroize(zeroize), - .input_reg(w_unpacked), - .delayed_reg(w_reg) - ); + // //Delay reading addr until after PWM is done to do accumulate + // abr_delay_masked_shares #( + // .WIDTH(WIDTH), + // .N(MASKED_PWM_LATENCY-1) + // ) w_delay ( + // .clk(clk), + // .rst_n(reset_n), + // .zeroize(zeroize), + // .input_reg(w_unpacked), + // .delayed_reg(w_reg) + // ); + //53 clks (accumulate case) ntt_masked_BFU_add_sub #( .WIDTH(WIDTH) ) add_inst0 ( diff --git a/src/ntt_top/tb/ntt_top_masking_tb.sv b/src/ntt_top/tb/ntt_top_masking_tb.sv index cad3e0d..409a536 100644 --- a/src/ntt_top/tb/ntt_top_masking_tb.sv +++ b/src/ntt_top/tb/ntt_top_masking_tb.sv @@ -68,9 +68,11 @@ logic wren_tb, rden_tb; logic [1:0] wrptr_tb, rdptr_tb; logic [5:0] random_tb; bf_uvwi_t uvw_i_tb; +masked_bf_uvwi_t masked_uvw_i_tb; pwo_uvwi_t pw_uvw_i_tb; hybrid_bf_uvwi_t hybrid_uvw_i_tb; - +bf_uvo_t uv_o_tb; +logic [REG_SIZE-1:0] u10, u11, v10, v11; //---------------------------------------------------------------- // Device Under Test. //---------------------------------------------------------------- @@ -110,6 +112,7 @@ hybrid_bf_uvwi_t hybrid_uvw_i_tb; // .u(u), // .v(v), // .w(w), +// .accumulate(1'b1), // .rnd({rnd0+rnd1, rnd3, rnd2, rnd1, rnd0}), // .res() // ); @@ -118,7 +121,7 @@ hybrid_bf_uvwi_t hybrid_uvw_i_tb; // .clk(clk_tb), // .reset_n(reset_n_tb), // .zeroize(zeroize_tb), -// .uvw_i(uvw_i_tb), +// .uvw_i(masked_uvw_i_tb), // .rnd_i({rnd0+rnd1, rnd3, rnd2, rnd1, rnd0}), // .uv_o() // ); @@ -250,6 +253,8 @@ task init_sim; uvw_i_tb.v01_i = 'h0; uvw_i_tb.w00_i = 'h0; uvw_i_tb.w01_i = 'h0; + uvw_i_tb.w10_i = 'h0; + uvw_i_tb.w11_i = 'h0; pw_uvw_i_tb.u0_i = 'h0; pw_uvw_i_tb.v0_i = 'h0; @@ -287,6 +292,13 @@ task init_sim; hybrid_uvw_i_tb.twiddle_w2_i = 'h0; hybrid_uvw_i_tb.twiddle_w3_i = 'h0; + masked_uvw_i_tb = {'h0, 'h0, 'h0, 'h0, 'h0, 'h0}; + + u10 = 'h0; + u11 = 'h0; + v10 = 'h0; + v11 = 'h0; + $display("End of init\n"); end endtask @@ -447,48 +459,75 @@ task masked_pwm_test(); endtask */ -// task masked_bfu_1x2_test(); -// logic [45:0] rand0, rand1, rand2; -// for (int i = 0; i < 10; i++) begin -// @(posedge clk_tb); -// fork -// begin -// actual_u = $random()%PRIME; -// actual_v = $random()%PRIME; -// actual_w = 'h2; +task div2(logic [REG_SIZE-1:0] in, output logic [REG_SIZE-1:0] out); + if (in[0]) begin + out = (in >> 1) + ((PRIME+1)/2); + end + else begin + out = (in >> 1); + end +endtask +/* +task masked_bfu_1x2_test(); + logic [45:0] rand0, rand1, rand2; + logic [REG_SIZE:0] temp; + for (int i = 0; i < 10; i++) begin + @(posedge clk_tb); + fork + begin + actual_u = $random()%PRIME; + actual_v = $random()%PRIME; + actual_w = 'h2; -// // u_array = actual_u; -// // v_array = actual_v; -// rand0 = $random()%PRIME; -// rand1 = $random()%PRIME; -// rand2 = $random()%PRIME; + // u_array = actual_u; + // v_array = actual_v; + rand0 = $random()%PRIME; + rand1 = $random()%PRIME; + rand2 = $random()%PRIME; -// // $display("actual u = %h, actual v = %h", actual_u, actual_v); + $display("actual u = %h, actual v = %h", actual_u, actual_v); -// u[0] = actual_u-rand0; -// u[1] = rand0; -// v[0] = actual_v-rand1; -// v[1] = rand1; -// w[0] = actual_w-rand2; -// w[1] = rand2; + u[0] = actual_u-rand0; + u[1] = rand0; + v[0] = actual_v-rand1; + v[1] = rand1; + w[0] = actual_w-rand2; + w[1] = rand2; -// uvw_i_tb.u00_i = u; -// uvw_i_tb.u01_i = u; -// uvw_i_tb.v00_i = v; -// uvw_i_tb.v01_i = v; -// uvw_i_tb.w00_i = w; -// uvw_i_tb.w01_i = w; -// // $display("u0 = %h, u1 = %h, v0 = %h, v1 = %h", u[0], u[1], v[0], v[1]); -// end -// // begin -// // repeat(264) @(posedge clk_tb); -// // if ((dut.res[0] + dut.res[1]) != ((((actual_u * actual_v)%PRIME)+actual_w) % PRIME)) begin -// // $error("U = u*v+w Mismatch: exp_output = %h output shares = %h %h actual output = %h", ((((actual_u * actual_v)%PRIME)+actual_w) % PRIME), dut.res[0], dut.res[1], dut.res[0] + dut.res[1]); -// // end -// // end -// join -// end -// endtask + masked_uvw_i_tb.u00_i = u; + masked_uvw_i_tb.u01_i = u; + masked_uvw_i_tb.v00_i = v; + masked_uvw_i_tb.v01_i = v; + masked_uvw_i_tb.w00_i = w; + masked_uvw_i_tb.w01_i = w; + // $display("u0 = %h, u1 = %h, v0 = %h, v1 = %h", u[0], u[1], v[0], v[1]); + end + begin + repeat(267) @(posedge clk_tb); + $display("actual u = %h, actual v = %h", actual_u, actual_v); + u10 = (actual_u+actual_v)%PRIME; + if (actual_u < actual_v) + temp = actual_u + PRIME; + else + temp = actual_u; + v10 = (((temp-actual_v)%PRIME)*actual_w)%PRIME; + $display("u10 = %h", u10); + $display("v10 = %h", v10); + $display("----------"); + div2(u10, uv_o_tb.u20_o); + div2(v10, uv_o_tb.u21_o); + div2(u10, uv_o_tb.v20_o); //since same inputs + div2(v10, uv_o_tb.v21_o); + + if (dut.uv_o != uv_o_tb) + $error("Data mismatch. Expected = {%h, %h, %h, %h}, Observed = {%h, %h, %h, %h}", uv_o_tb.u20_o, uv_o_tb.u21_o, uv_o_tb.v20_o, uv_o_tb.v21_o, dut.uv_o.u20_o, dut.uv_o.u21_o, dut.uv_o.v20_o, dut.uv_o.v21_o); + else + $display("Success: Matched results"); + end + join + end +endtask +*/ task masked_hybrid_bf_2x2_test(); logic [45:0] rand0, rand1, rand2; @@ -569,7 +608,8 @@ initial begin // masked_BFU_mult_test(); // masked_gs_butterfly_test(); // masked_pwm_test(); - masked_hybrid_bf_2x2_test(); + masked_bfu_1x2_test(); + // masked_hybrid_bf_2x2_test(); repeat(1000) @(posedge clk_tb); $finish;