flashattention_2: Add low-precision implementations

colluca · Viviane Potocnik · commit 1df78252f6e0 · 2024-05-08T19:15:18.000+02:00
flashattention_2: Correct bug in datagen to support more sizes

flashattention_2: Add mcycle calls for benchmarking

flashattention_2: Add TCASAI experiments

flashattention_2: Correct bug reallocating V^t on every iteration
diff --git a/sw/dnn/flashattention_2/scripts/datagen.py b/sw/dnn/flashattention_2/scripts/datagen.py
@@ -23,6 +23,7 @@
                        format_struct_definition, format_array_definition, \
                        format_array_declaration  # noqa: E402
 
+np.random.seed(42)
 np.random.seed(42)
 torch.manual_seed(42)
 
@@ -92,6 +93,7 @@ def exact_golden_model(Q, K, V, B_r, B_c):
 def exact_flexfloat_golden_model(Q, K, V, B_r, B_c, desc):
     # Get layer dimensions
     N = Q.shape[0]
+    d = Q.shape[1]
     # Calculate tiling parameters
     T_r = N // B_r
     T_c = N // B_c
@@ -111,15 +113,16 @@ def exact_flexfloat_golden_model(Q, K, V, B_r, B_c, desc):
             start_col = j * B_c
             end_col = start_col + B_c
             K_t_j = K_t[:, start_col:end_col]
-            V_j = V[start_col:end_col, ]
+            V_j = V[start_col:end_col,]
             # Compute O tile update
             S_ij = ff.array(np.zeros((B_r, B_c)), desc)
             S_ij = gemm.datagen.GemmDataGen().exact_golden_model(1, Q_i, K_t_j, 0, S_ij)
             m_i_prev = m_i
             m_i = np.maximum(m_i_prev, np.max(S_ij, 1, keepdims=True))
             shifted_exp = np.exp((m_i_prev - m_i).astype(np.float32))
             P_ij = np.exp((S_ij - m_i).astype(np.float32))
-            PxV = gemm.datagen.GemmDataGen().exact_golden_model(1, P_ij, V_j, 0, S_ij)
+            PxV = ff.array(np.zeros((B_r, d)), desc)
+            PxV = gemm.datagen.GemmDataGen().exact_golden_model(1, P_ij, V_j, 0, PxV)
             row_sum = np.sum(P_ij.astype(np.float32), 1, keepdims=True)
             if j == 0:
                 l_i = row_sum
@@ -144,6 +147,7 @@ def validate_config(N, d, B_r, B_c, dtype, baseline, gemm_impl):
     assert (N % B_c) == 0, 'N is not an integer multiple of B_c'
     assert (B_r % 8) == 0, 'B_r must be an integer multiple of the number of cores in a cluster'
     assert dtype != 'FP64', 'FP64 precision is not supported yet'
+    assert dtype != 'FP64', 'FP64 precision is not supported yet'
 
     # Q*K^t
     gemm.datagen.GemmDataGen().validate_config(
@@ -224,6 +228,8 @@ def emit_header(section, params):
     data_str += [format_array_definition(ctype, v_uid, V)]
     # result_def = format_array_definition(ctype, 'golden', output)
     # data_str += [format_ifdef_wrapper('BIST', result_def)]
+    # result_def = format_array_definition(ctype, 'golden', output)
+    # data_str += [format_ifdef_wrapper('BIST', result_def)]
     data_str = '\n\n'.join(data_str)
 
     return data_str
diff --git a/sw/dnn/flashattention_2/src/flashattention_2_fp32.h b/sw/dnn/flashattention_2/src/flashattention_2_fp32.h
@@ -61,14 +61,22 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
     tcdm_ptr += m_i_prev_size;
     float *l_i = tcdm_ptr;
     tcdm_ptr += l_i_size;
+
+    // allocate space for V^t when using optimized kernels
+    float *V_t;
+    if (!baseline) {
+        V_t = tcdm_ptr;
+        tcdm_ptr += B_c * d * sizeof(float);
+    }
+
     float shifted_exp;
     float row_sum;
 
+    snrt_mcycle();
+
     // Iterate row blocks of Q
-    uint32_t start_loop_outer = snrt_mcycle();
     for (int t_r = 0; t_r < T_r; t_r++) {
         // DMA copy Q row block to TCDM
-        uint32_t start_dma = snrt_mcycle();
         if (snrt_is_dm_core()) {
             snrt_dma_load_2d_tile(Q_fa,          // dst
                                   Q_l3,          // src
@@ -81,10 +89,10 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
             );
             snrt_dma_wait_all();
         }
-        uint32_t end_dma = snrt_mcycle();
-
         snrt_cluster_hw_barrier();
 
+        snrt_mcycle();
+
         // Initialize m_i, m_i_prev, l_i, row_sum
         uint32_t rows_per_core = B_r / num_cores;
         uint32_t start_row = rows_per_core * compute_id;
@@ -99,16 +107,12 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
 
         snrt_cluster_hw_barrier();
 
-        snrt_cluster_hw_barrier();
+        snrt_mcycle();
 
         // Iterate column blocks of K (corresponding to row blocks of V)
-        uint32_t start_loop_inner = snrt_mcycle();
         for (int t_c = 0; t_c < T_c; t_c++) {
-            snrt_cluster_hw_barrier();
-
             // DMA copy K column block (B_c, d) and V row block (B_c, d) to
             // TCDM. Both K and V are stored in (N, d) form in memory
-            uint32_t start_dma = snrt_mcycle();
             if (!snrt_is_compute_core()) {
                 snrt_dma_load_2d_tile(K_fa,          // dst
                                       K_l3,          // src
@@ -130,22 +134,22 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
                 );
                 snrt_dma_wait_all();
             }
-            uint32_t end_dma = snrt_mcycle();
-
             snrt_cluster_hw_barrier();
 
+            snrt_mcycle();
+
             // Calculate O tile from Q, K and V tiles
             if (snrt_is_compute_core()) {
                 // Matrix multiplication between row block of Q and transposed
                 // column block of K to calculate a tile of S: S = Q * K^T.
                 // The S tile is of form (B_r, B_c)
-                uint32_t start_gemm = snrt_mcycle();
                 sc_st_gemm(dtype, 1, 0, 1, B_r, B_c, d, 1, Q_fa, d, K_fa, d, 0,
                            S_fa, B_c, gemm_implementation);
-                uint32_t end_gemm = snrt_mcycle();
 
                 snrt_cluster_hw_barrier();
 
+                snrt_mcycle();
+
                 // Iterate over the rows of the S row block, distributing
                 // the rows to the cores
                 for (int row_idx = start_row; row_idx < end_row; row_idx++) {
@@ -188,7 +192,7 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
 
                 snrt_cluster_hw_barrier();
 
-                snrt_cluster_hw_barrier();
+                snrt_mcycle();
 
                 // Calculate O tile (O_ij) of size (B_r, d).
                 // The P tile is of size (B_r, B_c) and V of size (B_c, d)
@@ -207,10 +211,6 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
                     // operation. We must transpose V in advance, so
                     // we can compute P*(V^t)^t with the optimized GEMM.
 
-                    // Allocate space for V^t
-                    float *V_t = tcdm_ptr;
-                    tcdm_ptr += B_c * d * sizeof(float);
-
                     // Compute V^t
                     transpose_kernel(FP32, V_fa, V_t, B_c, d, baseline);
 
@@ -225,19 +225,16 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
                     sc_st_gemm(dtype, 0, 0, 1, B_r, d, B_c, 1, P_fa, B_c, V_t,
                                B_c, beta, O_fa, d, gemm_implementation);
                 }
-
-                uint32_t end_stats = snrt_mcycle();
-
-                snrt_cluster_hw_barrier();
             } else {
                 snrt_cluster_hw_barrier();
                 snrt_cluster_hw_barrier();
-                snrt_cluster_hw_barrier();
-                snrt_cluster_hw_barrier();
+                snrt_mcycle();
+                snrt_mcycle();
             }
-        }  // end of T_c loop
+            snrt_cluster_hw_barrier();
 
-        snrt_cluster_hw_barrier();
+            snrt_mcycle();
+        }  // end of T_c loop
 
         // Rescaling for last t_c iteration
         // O_i = diag(l_i_Tc)^-1 * O_i
@@ -248,15 +245,12 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
                 }
             }
         }
-
         snrt_fpu_fence();
-
         snrt_cluster_hw_barrier();
 
-        snrt_cluster_hw_barrier();
+        snrt_mcycle();
 
         // Write back O row block (B_r, d) to DRAM
-        uint32_t start_dma_write_back = snrt_mcycle();
         if (snrt_is_dm_core()) {
             snrt_dma_store_2d_tile(O_l3,          // dst
                                    O_fa,          // src
@@ -269,10 +263,11 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) {
             );
             snrt_dma_wait_all();
         }
-        uint32_t end_dma_write_back = snrt_mcycle();
+        snrt_cluster_hw_barrier();
+
+        snrt_mcycle();
 
     }  // end of T_r loop
-    uint32_t end_loop_outer = snrt_mcycle();
 
     snrt_cluster_hw_barrier();
 }
diff --git a/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/cfg/fp32-opt-gpt-3-xl.json b/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/cfg/fp32-opt-gpt-3-xl.json
@@ -0,0 +1,12 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    N: 64,
+    d: 128,
+    B_r: 16,
+    B_c: 64,
+    dtype: "FP32",
+    baseline: false
+}
diff --git a/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/cfg/fp32-opt-gpt-j.json b/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/cfg/fp32-opt-gpt-j.json
@@ -0,0 +1,12 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    N: 64,
+    d: 256,
+    B_r: 16,
+    B_c: 64,
+    dtype: "FP32",
+    baseline: false
+}
diff --git a/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/cfg/fp32-opt-vit-base.json b/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/cfg/fp32-opt-vit-base.json
@@ -0,0 +1,12 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    N: 64,
+    d: 64,
+    B_r: 16,
+    B_c: 64,
+    dtype: "FP32",
+    baseline: false
+}
diff --git a/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/cfg/fp32-opt-vit-huge.json b/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/cfg/fp32-opt-vit-huge.json
@@ -0,0 +1,12 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    N: 64,
+    d: 80,
+    B_r: 16,
+    B_c: 64,
+    dtype: "FP32",
+    baseline: false
+}
diff --git a/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/results.py b/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/results.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from pathlib import Path
+import sys
+import subprocess
+import functools
+import json
+
+sys.path.append(str(Path(__file__).parent / '../../../../../util/'))
+
+ROI_SPEC = Path.cwd() / 'roi.json.tpl'
+MODELS = {
+    'vit-base': {'N': 192, 'd': 64},
+    'vit-large': {'N': 192, 'd': 64},
+    'vit-huge': {'N': 192, 'd': 80},
+    **{f'gpt-3-xl-forward-{N}': {'N': N, 'd': 128} for N in [128, 256, 512, 1024, 2048]},
+    **{f'gpt-j-forward-{N}': {'N': N, 'd': 256} for N in [128, 256, 512, 1024, 2048]},
+}
+
+
+class Simulation():
+    """Handle to the results stored in one simulation run directory."""
+
+    def __init__(self, sim_dir):
+        """Initializes a simulation object from the run directory."""
+        self.sim_dir = sim_dir
+
+    @functools.cached_property
+    def performance_data(self):
+        """Returns all performance data logged during simulation."""
+        roi_json = Path(self.sim_dir) / 'logs' / 'roi.json'
+        with open(roi_json, 'r') as f:
+            return json.load(f)
+
+    def get_metric(self, thread, region, metric, label_idx=0):
+        """Get a specific performance metric from a certain simulation run.
+
+        Args:
+            thread: The thread to extract the metric from.
+            region: The region to extract the metric from. Can be an integer
+                index or the label assigned to the region. In case of multiple
+                regions with the same label (as e.g. in a loop) you can get
+                the n-th occurrence by passing a value to `label_idx`.
+            metric: The name of the metric to extract.
+            label_idx: See description for `region`.
+
+        Raises:
+            ValueError: If `region` has an unsupported type or is not found.
+        """
+        # Retrieve region index if supplied `region` argument is a region label.
+        if isinstance(region, str):
+            cnt = 0
+            for reg_idx, reg in enumerate(self.performance_data[thread]):
+                if reg['label'] == region:
+                    if cnt == label_idx:
+                        break
+                    cnt += 1
+            else:
+                raise ValueError(f'no occurrence {label_idx} of region {region!r}')
+        elif isinstance(region, int):
+            reg_idx = region
+        else:
+            raise ValueError('region argument must be of type int or str')
+        return self.performance_data[thread][reg_idx]['attrs'][metric]
+
+    def build_visual_trace(self):
+        """Build the visual trace of the simulation."""
+        subprocess.run(['make', '-C', '../../../../../', 'visual-trace',
+                        f'SIM_DIR={self.sim_dir}',
+                        f'ROI_SPEC={ROI_SPEC}', '-j'], check=True)
+
+
+def load_simulation(model):
+    """Returns the `Simulation` for the model's run directory under `runs/`."""
+    return Simulation(Path.cwd() / f'runs/flashattention_2-fp32-opt-{model}')
+
+
+def get_total_runtime(sim, model):
+    """Extrapolates the total cycle count of `model` from the regions in `sim`.
+
+    The cycle counts of the individual regions are combined into the time of
+    one inner and one outer loop iteration, which are then scaled by the
+    iteration counts Tc = N / Bc and Tr = N / Br.
+    """
+    def cycles(thread, region):
+        return sim.get_metric(thread, region, 'cycles')
+
+    # Tile sizes and the iteration counts they imply for this model
+    Br, Bc = 16, 64
+    N = MODELS[model]['N']
+    Tr, Tc = N / Br, N / Bc
+
+    # Time of one T_c (inner) iteration
+    tc_iter_time = cycles('hart_8', 'copy K & V') + cycles('hart_0', 'QxKt') + \
+        cycles('hart_0', 'softmax') + cycles('hart_0', 'PxV')
+    # Time of one T_r (outer) iteration
+    # NOTE(review): 'rescale' is counted twice — confirm this is intended
+    tr_iter_time = cycles('hart_8', 'copy Q') + cycles('hart_0', 'init') + \
+        tc_iter_time * Tc + cycles('hart_0', 'rescale') + cycles('hart_0', 'rescale')
+    return tr_iter_time * Tr
+
+
+def main():
+    """Print the estimated total runtime of every model in `MODELS`."""
+    sim = load_simulation('vit-base')
+    sim.build_visual_trace()
+
+    for model in MODELS:
+        print(f'{model}:')
+        # Cycles -> seconds, assuming a 1 GHz clock (1e9 cycles/s) — TODO confirm
+        print(f'\tTotal time: {get_total_runtime(sim, model) / 1e9}s')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/roi.json.tpl b/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/roi.json.tpl
diff --git a/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/run.py b/target/snitch_cluster/sw/apps/dnn/flashattention_2/tcasai/run.py