optimization.

b4rtaz · b4rtaz · commit 0b82d94ba99a · 2025-03-22T11:37:51.000+01:00
diff --git a/src/nn/nn-vulkan-test.cpp b/src/nn/nn-vulkan-test.cpp
@@ -266,7 +266,7 @@ void testEmbedding_F32_F32() {
 }
 
 void testShift_F32_F32() {
-    #define SHIFT_DIM 48
+    #define SHIFT_DIM 64
     execute(
         [](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
             NnUint posPipeIndex = netBuilder->addPipe("POS", size2D(F_32, N_BATCHES, 1));
diff --git a/src/nn/nn-vulkan.cpp b/src/nn/nn-vulkan.cpp
@@ -459,8 +459,10 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
     groupCount[2] = 1;
 
     if (opConfig->code == OP_CAST ||
+        opConfig->code == OP_RMS_NORM ||
         opConfig->code == OP_MUL ||
         opConfig->code == OP_SILU ||
+        opConfig->code == OP_SHIFT ||
         opConfig->code == OP_MERGE_ADD ||
         opConfig->code == OP_MATMUL)
         groupCount[2] = 32;
diff --git a/src/nn/vulkan/rms-norm-forward-f32-f32-f32.comp b/src/nn/vulkan/rms-norm-forward-f32-f32-f32.comp
@@ -1,6 +1,6 @@
 #version 450
 
-#define N_THREADS 256
+#define N_THREADS 64
 
 layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
 
@@ -20,26 +20,38 @@ layout(binding = 4) readonly uniform configBuffer {
 };
 layout(binding = 5) readonly buffer invRmsBuffer { float invRms[]; };
 
-shared BatchInfo sharedInfo;
-shared float s;
+shared uint sharedDim;
+shared uint sharedDimOffset;
+shared uint sharedXOffset;
+shared uint sharedYOffset;
+shared float sharedS;
 
 void main() {
     const uint threadIndex = uint(gl_LocalInvocationID.x);
-    const uint batchIndex = uint(gl_GlobalInvocationID.y);
 
     if (threadIndex == 0) {
-        sharedInfo = infos[batchIndex];
-        s = invRms[batchIndex];
+        const uint nWorkGroups = gl_NumWorkGroups.z;
+        const uint batchIndex = gl_WorkGroupID.y;
+        const uint workGroupIndex = gl_WorkGroupID.z;
+
+        const BatchInfo info = infos[batchIndex];
+        sharedDim = info.inputSizeX / nWorkGroups;
+        sharedDimOffset = sharedDim * workGroupIndex;
+        sharedXOffset = info.inputOffset + sharedDimOffset;
+        sharedYOffset = info.outputOffset + sharedDimOffset;
+        sharedS = invRms[batchIndex];
     }
 
     barrier();
     memoryBarrierShared();
 
-    const uint inputSizeX = sharedInfo.inputSizeX;
-    const uint xOffset = sharedInfo.inputOffset;
-    const uint yOffset = sharedInfo.outputOffset;
+    const uint dim = sharedDim;
+    const uint dimOffset = sharedDimOffset;
+    const uint xOffset = sharedXOffset;
+    const uint yOffset = sharedYOffset;
+    const float s = sharedS;
 
-    for (uint i = threadIndex; i < inputSizeX; i += N_THREADS) {
-        y[yOffset + i] = (x[xOffset + i] * s) * weight[i];
+    for (uint i = threadIndex; i < dim; i += N_THREADS) {
+        y[yOffset + i] = (x[xOffset + i] * s) * weight[i + dimOffset];
     }
 }
diff --git a/src/nn/vulkan/shift-forward-f32-f32.comp b/src/nn/vulkan/shift-forward-f32-f32.comp
@@ -19,26 +19,34 @@ layout(binding = 3) readonly uniform configBuffer {
 };
 layout(binding = 4) readonly buffer indexBuffer { float indexes[]; };
 
-shared uint sharedIndex;
-shared BatchInfo sharedInfo;
+shared uint sharedDim;
+shared uint sharedXOffset;
+shared uint sharedYOffset;
 
 void main() {
     const uint threadIndex = gl_LocalInvocationID.x;
-    const uint batchIndex = gl_GlobalInvocationID.y;
 
     if (threadIndex == 0) {
-        sharedIndex = uint(indexes[batchIndex]);
-        sharedInfo = infos[batchIndex];
+        const uint nWorkGroups = gl_NumWorkGroups.z;
+        const uint batchIndex = gl_WorkGroupID.y;
+        const uint workGroupIndex = gl_WorkGroupID.z;
+
+        const uint index = uint(indexes[batchIndex]);
+        BatchInfo info = infos[batchIndex];
+        sharedDim = info.inputSizeX / nWorkGroups;
+        const uint dimOffset = sharedDim * workGroupIndex;
+        sharedXOffset = info.inputOffset + dimOffset;
+        sharedYOffset = index * info.inputSizeX + dimOffset;
     }
 
     barrier();
     memoryBarrierShared();
 
-    const uint inputSizeX = sharedInfo.inputSizeX;
-    const uint xOffset = sharedInfo.inputOffset;
-    const uint yOffset = sharedIndex * inputSizeX;
+    const uint dim = sharedDim;
+    const uint xOffset = sharedXOffset;
+    const uint yOffset = sharedYOffset;
 
-    for (uint i = threadIndex; i < inputSizeX; i += N_THREADS) {
+    for (uint i = threadIndex; i < dim; i += N_THREADS) {
         y[yOffset + i] = x[xOffset + i];
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -266,7 +266,7 @@ void testEmbedding_F32_F32() {`
`266`	`266`	`}`
`267`	`267`
`268`	`268`	`void testShift_F32_F32() {`
`269`		`- #define SHIFT_DIM 48`
	`269`	`+ #define SHIFT_DIM 64`
`270`	`270`	`execute(`
`271`	`271`	`[](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {`
`272`	`272`	`NnUint posPipeIndex = netBuilder->addPipe("POS", size2D(F_32, N_BATCHES, 1));`