From adcd7214d2f557014ada03d414bb4e32b7a4e951 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Fri, 30 May 2025 07:56:46 -0700
Subject: [PATCH] [ET-VK] Using push constants for buffer to image prepack
 nodes.

This diff switches the buffer-to-image prepack nodes and the staging-to-tensor node in the Vulkan runtime graph from uniform buffers to push constants. Push constants are a more efficient way than uniform buffers to pass small amounts of data to shaders.
*   The `nchw_to_*.yaml` files now declare a `USE_PUSH_CONST` parameter that defaults to `True`, so every default `nchw_to_*` shader variant reads its tensor metadata from push constants.
*   New `nchw_to_*` shader variants with `_no_pc` in their names have been added; they set `USE_PUSH_CONST: False` and keep the uniform-buffer bindings. The test utility functions select these variants explicitly through the new `push_constant_variant` argument of `get_nchw_to_tensor_shader`.
*   The `Clone.cpp`, `Convolution.cpp` and `Staging.cpp` files have been updated to pass an empty parameter-buffer list and supply the tensor sizes, strides and numel through push constants instead.

Differential Revision: [D70102398](https://our.internmc.facebook.com/intern/diff/D70102398/)

[ghstack-poisoned]
---
 .../nchw_to_bitw8_image_nobitw8buffer.glsl    |  8 ++++-
 .../nchw_to_bitw8_image_nobitw8buffer.yaml    |  3 ++
 .../graph/ops/glsl/nchw_to_buffer.glsl        | 14 ++++++--
 .../graph/ops/glsl/nchw_to_buffer.yaml        |  3 ++
 .../runtime/graph/ops/glsl/nchw_to_image.glsl | 14 ++++++--
 .../runtime/graph/ops/glsl/nchw_to_image.yaml |  9 +++++
 .../vulkan/runtime/graph/ops/impl/Clone.cpp   |  4 +--
 .../runtime/graph/ops/impl/Convolution.cpp    |  5 +--
 .../vulkan/runtime/graph/ops/impl/Staging.cpp | 33 ++++++++++---------
 .../runtime/graph/ops/utils/StagingUtils.cpp  | 12 ++++++-
 .../runtime/graph/ops/utils/StagingUtils.h    |  3 +-
 backends/vulkan/test/utils/test_utils.cpp     |  4 +--
 .../vulkan/test/vulkan_compute_api_test.cpp   |  6 ++--
 13 files changed, 83 insertions(+), 35 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
index 327c3868847..4b18abbb1c5 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl
@@ -22,7 +22,13 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "nchw_in", "int")}
-${layout_declare_ubo(B, "ivec4", "sizes")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 sizes;
+  };
+$else:
+  ${layout_declare_ubo(B, "ivec4", "sizes")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml
index 506a66c0d27..0b8bbecb7bd 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml
@@ -8,6 +8,7 @@ nchw_to_bitw8_image_nobitw8buffer:
   parameter_names_with_default_values:
     STORAGE: texture3d
     DTYPE: int8
+    USE_PUSH_CONST: True
   generate_variant_forall:
     STORAGE:
       - VALUE: texture2d
@@ -17,3 +18,5 @@ nchw_to_bitw8_image_nobitw8buffer:
       - VALUE: uint8
   shader_variants:
     - NAME: nchw_to_bitw8_image_nobitw8buffer
+    - NAME: nchw_to_bitw8_image_nobitw8buffer_no_pc
+      USE_PUSH_CONST: False
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
index 32235a9ad65..ba4e4dd9dd9 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
@@ -12,9 +12,17 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "out_strides")}
-${layout_declare_ubo(4, "int", "numel")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 out_sizes;
+    ivec4 out_strides;
+    int numel;
+  };
+$else:
+  ${layout_declare_ubo(2, "ivec4", "out_sizes")}
+  ${layout_declare_ubo(3, "ivec4", "out_strides")}
+  ${layout_declare_ubo(4, "int", "numel")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml
index 6292ef93337..ae645a21b47 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml
@@ -8,6 +8,7 @@ nchw_to_buffer:
   parameter_names_with_default_values:
     DTYPE: float
     STORAGE: buffer
+    USE_PUSH_CONST: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -16,3 +17,5 @@ nchw_to_buffer:
       - VALUE: int8
   shader_variants:
     - NAME: nchw_to_buffer
+    - NAME: nchw_to_buffer_no_pc
+      USE_PUSH_CONST: False
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
index 2f55535c82c..4674822ce6a 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
@@ -21,9 +21,17 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "buf_in", DTYPE)}
-${layout_declare_ubo(B, "ivec4", "sizes")}
-$if not FROM_STAGING:
-  ${layout_declare_ubo(B, "ivec4", "buf_strides")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 sizes;
+  $if not FROM_STAGING:
+    ivec4 buf_strides;
+  };
+$else:
+  ${layout_declare_ubo(B, "ivec4", "sizes")}
+  $if not FROM_STAGING:
+    ${layout_declare_ubo(B, "ivec4", "buf_strides")}
 
 #include "indexing_utils.h"
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml
index f44e1f74bfe..67fe0a602e1 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml
@@ -9,6 +9,7 @@ nchw_to_image:
     STORAGE: texture3d
     DTYPE: float
     FROM_STAGING: True
+    USE_PUSH_CONST: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -21,3 +22,11 @@ nchw_to_image:
       STORAGE: texture2d
     - NAME: clone_buffer_to_image
       FROM_STAGING: False
+    - NAME: nchw_to_image_no_pc_texture3d
+      USE_PUSH_CONST: False
+    - NAME: nchw_to_image_no_pc_texture2d
+      STORAGE: texture2d
+      USE_PUSH_CONST: False
+    - NAME: clone_buffer_to_image_no_pc
+      FROM_STAGING: False
+      USE_PUSH_CONST: False
diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
index b547bc3572d..d0276b1783b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
@@ -105,9 +105,9 @@ void add_buffer_to_image_node(
       // Input and Outputs
       {{image, vkapi::kWrite}, {buffer, vkapi::kRead}},
       // Parameter Buffers
-      {graph.sizes_ubo(image), graph.strides_ubo(buffer)},
-      // Push Constants
       {},
+      // Push Constants
+      {graph.sizes_pc_of(image), graph.strides_pc_of(buffer)},
       // Specialization Constants
       {graph.hashed_layout_of(image)},
       // Resize Args
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index fbe4a61befc..32f478fa5bd 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -106,9 +106,10 @@ ValueRef prepack_biases(
       graph.create_local_wg_size(v),
       vref,
       v,
-      {t->sizes_ubo()},
+      {},
       // Specialization constants
-      {t->hashed_layout()}));
+      {t->hashed_layout()},
+      {graph.sizes_pc_of(v)}));
 
   return v;
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index f39b0fc33ff..8c060a9da4b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -28,14 +28,14 @@ void add_staging_to_tensor_node(
   vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
       *graph.get_tensor(out_tensor), graph.int8_buffers_enabled());
 
-  vkapi::ParamsBindList ubos;
+  std::vector<PushConstantDataInfo> pcs;
   if (graph.is_buffer_storage(out_tensor)) {
-    ubos.append(
-        {graph.sizes_ubo(out_tensor),
-         graph.strides_ubo(out_tensor),
-         graph.numel_ubo(out_tensor)});
+    pcs = {
+        graph.sizes_pc_of(out_tensor),
+        graph.strides_pc_of(out_tensor),
+        graph.numel_pc_of(out_tensor)};
   } else {
-    ubos.append({graph.sizes_ubo(out_tensor)});
+    pcs = {graph.sizes_pc_of(out_tensor)};
   }
 
   graph.execute_nodes().emplace_back(new DispatchNode(
@@ -46,9 +46,9 @@ void add_staging_to_tensor_node(
       // Input and Outputs
       {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}},
       // Parameter Buffers
-      ubos,
-      // Push Constants
       {},
+      // Push Constants
+      pcs,
       // Specialization Constants
       {graph.hashed_layout_of(out_tensor)},
       // Resize Args
@@ -127,14 +127,14 @@ void add_prepack_standard_node(
   vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
       *graph.get_tensor(tensor), graph.int8_buffers_enabled());
 
-  vkapi::ParamsBindList ubos;
+  std::vector<PushConstantDataInfo> pcs;
   if (graph.is_buffer_storage(tensor)) {
-    ubos.append(
-        {graph.sizes_ubo(tensor),
-         graph.strides_ubo(tensor),
-         graph.numel_ubo(tensor)});
+    pcs = {
+        graph.sizes_pc_of(tensor),
+        graph.strides_pc_of(tensor),
+        graph.numel_pc_of(tensor)};
   } else {
-    ubos.append({graph.sizes_ubo(tensor)});
+    pcs = {graph.sizes_pc_of(tensor)};
   }
 
   int transpose_hw_spec = transpose_hw ? 1 : 0;
@@ -148,9 +148,10 @@ void add_prepack_standard_node(
       tensor_data,
       tensor,
       // Parameter Buffers
-      ubos,
+      {},
       // Specialization Constants
-      {graph.hashed_layout_of(tensor), transpose_hw_spec}));
+      {graph.hashed_layout_of(tensor), transpose_hw_spec},
+      pcs));
 }
 
 ValueRef prepack_standard(
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
index fd7e6b78c22..6f3660fb0fc 100644
--- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
@@ -22,13 +22,17 @@ bool is_bitw8(vkapi::ScalarType dtype) {
 
 vkapi::ShaderInfo get_nchw_to_tensor_shader(
     const api::vTensor& v_dst,
-    const bool int8_buffer_enabled) {
+    bool int8_buffer_enabled,
+    bool push_constant_variant) {
   std::string kernel_name;
   kernel_name.reserve(kShaderNameReserve);
 
   if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer &&
       !int8_buffer_enabled) {
     kernel_name = "nchw_to_bitw8_image_nobitw8buffer";
+    if (!push_constant_variant) {
+      kernel_name += "_no_pc";
+    }
     add_storage_type_suffix(kernel_name, v_dst);
     add_dtype_suffix(kernel_name, v_dst);
     return VK_KERNEL_FROM_STR(kernel_name);
@@ -36,11 +40,17 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader(
 
   if (v_dst.storage_type() == utils::kBuffer) {
     kernel_name = "nchw_to_buffer";
+    if (!push_constant_variant) {
+      kernel_name += "_no_pc";
+    }
     add_dtype_suffix(kernel_name, v_dst);
     return VK_KERNEL_FROM_STR(kernel_name);
   }
 
   kernel_name = "nchw_to_image";
+  if (!push_constant_variant) {
+    kernel_name += "_no_pc";
+  }
   add_storage_type_suffix(kernel_name, v_dst);
   add_dtype_suffix(kernel_name, v_dst);
 
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
index 8d63958a738..6abbac45823 100644
--- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
+++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
@@ -14,7 +14,8 @@ namespace vkcompute {
 
 vkapi::ShaderInfo get_nchw_to_tensor_shader(
     const api::vTensor& v_dst,
-    bool int8_buffer_enabled = true);
+    bool int8_buffer_enabled = true,
+    bool push_constant_variant = true);
 vkapi::ShaderInfo get_tensor_to_nchw_shader(
     const api::vTensor& v_src,
     bool int8_buffer_enabled = true);
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index c4acb41b7b0..6b11bcd8b80 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -28,7 +28,7 @@ void record_nchw_to_buffer_op(
   vkapi::PipelineBarrier pipeline_barrier{};
 
   context->submit_compute_job(
-      get_nchw_to_tensor_shader(v_dst),
+      get_nchw_to_tensor_shader(v_dst, true, false),
       pipeline_barrier,
       {uint32_t(v_dst.numel()), 1, 1},
       {64, 1, 1},
@@ -74,7 +74,7 @@ void record_nchw_to_image_op(
 
   context->submit_compute_job(
       get_nchw_to_tensor_shader(
-          v_dst, context->adapter_ptr()->has_full_int8_buffers_support()),
+          v_dst, context->adapter_ptr()->has_full_int8_buffers_support(), false),
       pipeline_barrier,
       v_dst.logical_limits(),
       adaptive_work_group_size(v_dst.logical_limits()),
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index a6475d95d07..9ddeba711b9 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1600,8 +1600,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
       /*shared_object_idx = */ 4);
 
-  // +2: t.sizes_ubo() for each staging shader
   // +2: staging buffer for each input tensor
-  expected_vma_allocation_count += 4;
+  expected_vma_allocation_count += 2;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   ValueRef c = graph.add_tensor(
@@ -1621,8 +1620,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
       /*shared_object_idx = */ 2);
 
-  // +1: t.sizes_ubo() uniform buffer for staging shader
   // +1: staging buffer for the input tensor
-  expected_vma_allocation_count += 2;
+  expected_vma_allocation_count += 1;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   ValueRef e = graph.add_tensor(