pytorch · trivedivivek · May 29, 2025 · May 28, 2025 · May 28, 2025 · May 28, 2025
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -14,7 +14,6 @@
 
 #define TILE_SIZE_X ${TILE_SIZE_X}
 #define TILE_SIZE_Y ${TILE_SIZE_Y}
-#define LOCAL_WG_SIZE 64
 
 #define op(X, A, B) ${OPERATOR}
 
@@ -39,10 +38,7 @@ layout(push_constant) uniform restrict Block {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// For performance improvement, reduce register usage by caching positions in shared memory.
-// Offset index by 1 every 16 points to avoid bank access conflict.
-#define offset_pos_index(index) (index + ((index) >> 4))
-shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
+#extension GL_EXT_control_flow_attributes : require
 
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
@@ -51,47 +47,55 @@ shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE
  */
 void main() {
   const ivec2 out_limits_scaled = (out_limits.xy + ivec2(TILE_SIZE_X - 1, TILE_SIZE_Y - 1)) / ivec2(TILE_SIZE_X, TILE_SIZE_Y);
-  const uint shared_mem_stride = LOCAL_WG_SIZE;
 
   const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x;
   const ivec3 gpos = ivec3(
     gl_GlobalInvocationID.x % out_limits_scaled.x,
-    div_by_x % out_limits_scaled.y,
-    div_by_x / out_limits_scaled.y);
+    div_by_x,
+    gl_GlobalInvocationID.y);
+
+  // If the top left position is out of bounds, then this invocation will have
+  // no work to do.
+  if (gpos.y >= out_limits_scaled.y || gpos.z >= out_limits.z) {
+    return;
+  }
 
   // Output position for TILE_SIZE = 2
   // +--------+--------+
   // | pos[0] | pos[1] |
   // +--------+--------+
   // | pos[2] | pos[3] |
   // +--------+--------+
-  ivec2 pos[TILE_SIZE_X * TILE_SIZE_Y];
+  int pos[TILE_SIZE_X * TILE_SIZE_Y * 2];
   for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) {
     for (int x = 0; x < TILE_SIZE_X; ++x) {
-      pos[i] = ivec2(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y);
-      pos_shared[offset_pos_index((shared_mem_stride * i) + gl_LocalInvocationIndex)] = ivec3(pos[i], gpos.z);
+      pos[i * 2] = gpos.x * TILE_SIZE_X + x;
+      pos[i * 2 + 1] = gpos.y * TILE_SIZE_Y + y;
       i++;
     }
   }
 
-  // If the top left position is out of bounds, then this invocation will have
-  // no work to do.
-  if (gpos.z >= out_limits.z) {
-    return;
-  }
-
   // Compute the index of the input texture that needs to be loaded for each
   // output position. Note that negative indices can be produced indicating that
   // the top-left element is in a region added by padding.
-  ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y];
+  int ipos[TILE_SIZE_X * TILE_SIZE_Y * 2];
   for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
-    ipos[i] = pos[i] * stride - padding;
+    ipos[i * 2] = pos[i * 2] * stride.x - padding.x;
+    ipos[i * 2 + 1] = pos[i * 2 + 1] * stride.y - padding.y;
   }
 
-  vec4 sum[TILE_SIZE_X * TILE_SIZE_Y];
-  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
-  for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
-    sum[i] = sum[0];
+  // Final output array where each element is a tensor value.
+  // Tuple of consecutive 4 elements represents a single output texel.
+  float sum[TILE_SIZE_X * TILE_SIZE_Y * 4];
+
+  const vec4 bias = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
+
+  // Initialize the output array with the bias value
+  for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) {
+    sum[i] = bias.x;
+    sum[i + 1] = bias.y;
+    sum[i + 2] = bias.z;
+    sum[i + 3] = bias.w;
   }
 
   int z4 = 0;
@@ -100,14 +104,26 @@ void main() {
     // During prepacking, the weight tensor has been permuted so that the
     // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
     // the z-axis.
-    const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0));
-    const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0));
-    const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0));
-    const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
+    float kernel_values[4 * 4]; // 4 channels, 4 elements per channel
+
+    // Load kernel values from texels to array
+    [[unroll]] for (int i = 0; i < 4; ++i) {
+      const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0);
+      kernel_values[i * 4 + 0] = k_tex.x;
+      kernel_values[i * 4 + 1] = k_tex.y;
+      kernel_values[i * 4 + 2] = k_tex.z;
+      kernel_values[i * 4 + 3] = k_tex.w;
+    }
 
-#pragma unroll
     for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
-      const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
+      const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i * 2], ipos[i * 2 + 1], z4), 0);
+      // Load the input texel into an array
+      float tex_values[4];
+      tex_values[0] = in_tex.x;
+      tex_values[1] = in_tex.y;
+      tex_values[2] = in_tex.z;
+      tex_values[3] = in_tex.w;
+
       // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
       // group of 4 texels loaded from t_kernel are shown:
@@ -141,18 +157,19 @@ void main() {
       //
       //  which is what is expressed in the following calculations. This is done
       //  for each output position.
-      sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]);
-      sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]);
-      sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]);
-      sum[i] = fma(in_tex.wwww, ktex_3, sum[i]);
+      for (int j = 0; j < 4; ++j) {
+        sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j];
+      }
     }
   }
 
   for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
-    const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex;
-    const ivec3 pos = pos_shared[offset_pos_index(index)];
-    if (all(lessThan(pos, out_limits.xyz))) {
-      imageStore(t_out, pos, op(sum[i], out_min, out_max));
+    const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], gpos.z);
+    if (all(lessThan(pos_l, out_limits.xyz))) {
+      imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max));
     }
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml
@@ -9,8 +9,8 @@ conv2d_pw:
     OPERATOR: X
     NDIM: 3
     DTYPE: float
-    TILE_SIZE_X: 2
-    TILE_SIZE_Y: 2
+    TILE_SIZE_X: 1
+    TILE_SIZE_Y: 4
   generate_variant_forall:
     DTYPE:
       - VALUE: half

@@ -305,8 +305,8 @@ utils::uvec3 create_conv2d_global_wg_size(
   if (method == Conv2dMethod::Pointwise) {
     const utils::uvec3 image_extents = graph.logical_limits_of(out);
     return {
-        utils::div_up(image_extents[0u], 2u),
-        utils::div_up(image_extents[1u], 2u),
+        utils::div_up(image_extents[0u], 1u),
+        utils::div_up(image_extents[1u], 4u),
         image_extents[2u]};
   } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) {
     const utils::uvec3 image_extents = graph.create_global_wg_size(out);
@@ -398,8 +398,25 @@ void add_conv2d_node(
   utils::uvec3 wg_size = create_conv2d_global_wg_size(
       graph, method, out, weight_data, stride_equals_dilation);
 
-  if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) {
+  if (method == Conv2dMethod::Depthwise) {
     wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
+  } else if (method == Conv2dMethod::Pointwise) {
+    wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
+  }
+
+  utils::uvec3 local_wg_size;
+  if (method == Conv2dMethod::Pointwise) {
+    uint32_t local_wg_size_y = 1;
+    if (wg_size[1] % 8 == 0) {
+      local_wg_size_y = 8;
+    } else if (wg_size[1] % 4 == 0) {
+      local_wg_size_y = 4;
+    } else if (wg_size[1] % 2 == 0) {
+      local_wg_size_y = 2;
+    }
+    local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1};
+  } else {
+    local_wg_size = graph.create_local_wg_size(wg_size);
   }
 
   vkapi::ParamsBindList param_buffers;
@@ -462,7 +479,7 @@ void add_conv2d_node(
       graph,
       shader,
       wg_size,
-      graph.create_local_wg_size(wg_size),
+      local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}},
       // Shader params buffers