optimization.

b4rtaz · b4rtaz · commit df5765de91bd · 2025-03-22T00:36:12.000+01:00
diff --git a/src/nn/nn-vulkan-test.cpp b/src/nn/nn-vulkan-test.cpp
@@ -309,7 +309,7 @@ void testShift_F32_F32() {
 }
 
 void testCast_F32_F32() {
-    #define CAST_DIM 48
+    #define CAST_DIM 64
     execute(
         [](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
             NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, CAST_DIM));
diff --git a/src/nn/nn-vulkan.cpp b/src/nn/nn-vulkan.cpp
@@ -458,7 +458,11 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
     groupCount[1] = batchSize;
     groupCount[2] = 1;
 
-    if (opConfig->code == OP_MATMUL)
+    if (opConfig->code == OP_CAST ||
+        opConfig->code == OP_MUL ||
+        opConfig->code == OP_SILU ||
+        opConfig->code == OP_MERGE_ADD ||
+        opConfig->code == OP_MATMUL)
         groupCount[2] = 32;
 }
 
diff --git a/src/nn/vulkan/cast-forward-f32-f32.comp b/src/nn/vulkan/cast-forward-f32-f32.comp
@@ -15,24 +15,33 @@ layout(binding = 0) readonly buffer inputBuffer { float x[]; };
 layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
 layout(binding = 2) readonly buffer batchInfosBuffer { BatchInfo infos[]; };
 
-shared BatchInfo sharedInfo;
+shared uint sharedDim;
+shared uint sharedXOffset;
+shared uint sharedYOffset;
 
 void main() {
     const uint threadIndex = gl_LocalInvocationID.x;
-    const uint batchIndex = gl_GlobalInvocationID.y;
 
     if (threadIndex == 0) {
-        sharedInfo = infos[batchIndex];
+        const uint nWorkGroups = gl_NumWorkGroups.z;
+        const uint batchIndex = gl_WorkGroupID.y;
+        const uint workGroupIndex = gl_WorkGroupID.z;
+
+        const BatchInfo info = infos[batchIndex];
+        sharedDim = info.inputSizeX / nWorkGroups;
+        const uint dimOffset = sharedDim * workGroupIndex;
+        sharedXOffset = info.inputOffset + dimOffset;
+        sharedYOffset = info.outputOffset + dimOffset;
     }
 
     barrier();
     memoryBarrierShared();
 
-    const uint inputSizeX = sharedInfo.inputSizeX;
-    const uint xOffset = sharedInfo.inputOffset;
-    const uint yOffset = sharedInfo.outputOffset;
+    const uint dim = sharedDim;
+    const uint xOffset = sharedXOffset;
+    const uint yOffset = sharedYOffset;
 
-    for (uint i = threadIndex; i < inputSizeX; i += N_THREADS) {
+    for (uint i = threadIndex; i < dim; i += N_THREADS) {
         y[yOffset + i] = x[xOffset + i];
     }
 }
diff --git a/src/nn/vulkan/matmul-forward-f32-f32-f32.comp b/src/nn/vulkan/matmul-forward-f32-f32-f32.comp
@@ -1,6 +1,5 @@
 #version 450
 
-#define N_WORK_GROUPS 32
 #define N_THREADS 256
 
 layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
@@ -18,23 +17,27 @@ layout(binding = 2) readonly buffer batchInfosBuffer { BatchInfo infos[]; };
 layout(binding = 3) readonly buffer weightBuffer { float weight[]; };
 
 shared BatchInfo sharedInfo;
+shared uint sharedDim;
 
 void main() {
     const uint threadIndex = gl_LocalInvocationID.x;
-    const uint workGroupIndex = gl_GlobalInvocationID.z;
-    const uint batchIndex = gl_GlobalInvocationID.y;
+    const uint workGroupIndex = gl_WorkGroupID.z;
 
     if (threadIndex == 0) {
+        const uint batchIndex = gl_WorkGroupID.y;
+        const uint nWorkGroups = gl_NumWorkGroups.z;
+
         sharedInfo = infos[batchIndex];
+        sharedDim = sharedInfo.outputSizeX / nWorkGroups;
     }
 
     barrier();
     memoryBarrierShared();
 
     const uint inputSizeX = sharedInfo.inputSizeX;
-    const uint dim = sharedInfo.outputSizeX / N_WORK_GROUPS;
     const uint xOffset = sharedInfo.inputOffset;
     const uint yOffset = sharedInfo.outputOffset;
+    const uint dim = sharedDim;
 
     for (uint i = threadIndex; i < dim; i += N_THREADS) {
         const uint d = (workGroupIndex * dim) + i;
diff --git a/src/nn/vulkan/merge-add-forward-f32-f32.comp b/src/nn/vulkan/merge-add-forward-f32-f32.comp
@@ -2,7 +2,7 @@
 
 #extension GL_EXT_control_flow_attributes : enable
 
-#define N_THREADS 256
+#define N_THREADS 64
 
 layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
 
@@ -17,29 +17,42 @@ layout(binding = 0) readonly buffer inputBuffer { float x[]; };
 layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
 layout(binding = 2) readonly buffer batchInfosBuffer { BatchInfo infos[]; };
 
-shared BatchInfo sharedInfo;
+shared uint sharedDim;
+shared uint sharedOutputSizeX;
+shared uint sharedParts;
+shared uint sharedXOffset;
+shared uint sharedYOffset;
 
 void main() {
     const uint threadIndex = gl_LocalInvocationID.x;
-    const uint batchIndex = gl_GlobalInvocationID.y;
 
     if (threadIndex == 0) {
-        sharedInfo = infos[batchIndex];
+        const uint nWorkGroups = gl_NumWorkGroups.z;
+        const uint batchIndex = gl_WorkGroupID.y;
+        const uint workGroupIndex = gl_WorkGroupID.z;
+
+        const BatchInfo info = infos[batchIndex];
+        sharedDim = info.outputSizeX / nWorkGroups;
+        sharedOutputSizeX = info.outputSizeX;
+        sharedParts = info.inputSizeX / info.outputSizeX;
+        sharedXOffset = info.inputOffset + sharedDim * workGroupIndex;
+        sharedYOffset = info.outputOffset + sharedDim * workGroupIndex;
     }
-    memoryBarrierShared();
+
     barrier();
+    memoryBarrierShared();
 
-    const uint inputSizeX = sharedInfo.inputSizeX;
-    const uint inputOffset = sharedInfo.inputOffset;
-    const uint outputOffset = sharedInfo.outputOffset;
-    const uint outputSizeX = sharedInfo.outputSizeX;
-    const uint nNodes = inputSizeX / outputSizeX;
+    const uint dim = sharedDim;
+    const uint outputSizeX = sharedOutputSizeX;
+    const uint parts = sharedParts;
+    const uint xOffset = sharedXOffset;
+    const uint yOffset = sharedYOffset;
 
-    for (uint i = threadIndex; i < outputSizeX; i += N_THREADS) {
+    for (uint i = threadIndex; i < dim; i += N_THREADS) {
         float sum = 0.0;
-        const uint iOffset = inputOffset + i;
-        const uint oOffset = outputOffset + i;
-        for (uint n = 0; n < nNodes; n++) {
+        const uint iOffset = xOffset + i;
+        const uint oOffset = yOffset + i;
+        for (uint n = 0; n < parts; n++) {
             sum += x[n * outputSizeX + iOffset];
         }
         y[oOffset] += sum;
diff --git a/src/nn/vulkan/mul-forward-f32-f32.comp b/src/nn/vulkan/mul-forward-f32-f32.comp
@@ -1,6 +1,6 @@
 #version 450
 
-#define N_THREADS 256
+#define N_THREADS 64
 
 layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
 
@@ -19,23 +19,32 @@ layout(binding = 3) readonly uniform configBuffer {
 };
 layout(binding = 4) readonly buffer multiplierBuffer { float m[]; };
 
-shared BatchInfo sharedInfo;
+shared uint sharedDim;
+shared uint sharedXyOffset;
+shared uint sharedMOffset;
 
 void main() {
     const uint threadIndex = gl_LocalInvocationID.x;
-    const uint batchIndex = gl_GlobalInvocationID.y;
 
     if (threadIndex == 0) {
-        sharedInfo = infos[batchIndex];
+        const uint nWorkGroups = gl_NumWorkGroups.z;
+        const uint batchIndex = gl_WorkGroupID.y;
+        const uint workGroupIndex = gl_WorkGroupID.z;
+
+        const BatchInfo info = infos[batchIndex];
+        sharedDim = info.inputSizeX / nWorkGroups;
+        sharedXyOffset = info.inputOffset + sharedDim * workGroupIndex;
+        sharedMOffset = info.inputSizeX * batchIndex + sharedDim * workGroupIndex;
     }
-    memoryBarrierShared();
+
     barrier();
+    memoryBarrierShared();
 
-    const uint inputSizeX = sharedInfo.inputSizeX;
-    const uint xyOffset = sharedInfo.inputOffset;
-    const uint mOffset = inputSizeX * batchIndex;
+    const uint dim = sharedDim;
+    const uint xyOffset = sharedXyOffset;
+    const uint mOffset = sharedMOffset;
 
-    for (uint i = threadIndex; i < inputSizeX; i += N_THREADS) {
+    for (uint i = threadIndex; i < dim; i += N_THREADS) {
         y[xyOffset + i] = x[xyOffset + i] * m[mOffset + i];
     }
 }
diff --git a/src/nn/vulkan/silu-forward-f32-f32.comp b/src/nn/vulkan/silu-forward-f32-f32.comp
@@ -15,23 +15,33 @@ layout(binding = 0) readonly buffer inputBuffer { float x[]; };
 layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
 layout(binding = 2) readonly buffer batchInfosBuffer { BatchInfo infos[]; };
 
-shared BatchInfo sharedInfo;
+shared uint sharedDim;
+shared uint sharedXOffset;
+shared uint sharedYOffset;
 
 void main() {
     const uint threadIndex = gl_LocalInvocationID.x;
     const uint batchIndex = gl_GlobalInvocationID.y;
 
     if (threadIndex == 0) {
-        sharedInfo = infos[batchIndex];
+        const uint nWorkGroups = gl_NumWorkGroups.z;
+        const uint batchIndex = gl_WorkGroupID.y;
+        const uint workGroupIndex = gl_WorkGroupID.z;
+
+        const BatchInfo info = infos[batchIndex];
+        sharedDim = info.inputSizeX / nWorkGroups;
+        sharedXOffset = info.inputOffset + sharedDim * workGroupIndex;
+        sharedYOffset = info.outputOffset + sharedDim * workGroupIndex;
     }
 
     barrier();
+    memoryBarrierShared();
 
-    const uint inputSizeX = sharedInfo.inputSizeX;
-    const uint xOffset = sharedInfo.inputOffset;
-    const uint yOffset = sharedInfo.outputOffset;
+    const uint dim = sharedDim;
+    const uint xOffset = sharedXOffset;
+    const uint yOffset = sharedYOffset;
 
-    for (uint i = threadIndex; i < inputSizeX; i += N_THREADS) {
+    for (uint i = threadIndex; i < dim; i += N_THREADS) {
         float v = x[xOffset + i];
         y[yOffset + i] = v / (1.0 + exp(-v));
     }

Original file line number	Diff line number	Diff line change
`@@ -309,7 +309,7 @@ void testShift_F32_F32() {`
`309`	`309`	`}`
`310`	`310`
`311`	`311`	`void testCast_F32_F32() {`
`312`		`- #define CAST_DIM 48`
	`312`	`+ #define CAST_DIM 64`
`313`	`313`	`execute(`
`314`	`314`	`[](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {`
`315`	`315`	`NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, CAST_DIM));`