
Commit e208f50

feat: vulkan matmul optimization. (#192)
1 parent 12df7b4 commit e208f50

File tree

1 file changed: +55 −31 lines changed

src/nn/vulkan/matmul-forward-q80-q40-f32.comp

Lines changed: 55 additions & 31 deletions
@@ -4,9 +4,11 @@
 #extension GL_EXT_shader_16bit_storage : enable
 #extension GL_EXT_shader_explicit_arithmetic_types : enable
 
-#define Q80_BLOCK_SIZE 32
-#define Q40_BLOCK_SIZE 32
-#define N_THREADS 128
+#define Q80_Q40_BLOCK_SIZE 32
+#define N_THREADS 256
+
+#define GROUP_SIZE 64
+#define N_THREADS_PER_GROUP (N_THREADS / GROUP_SIZE)
 
 layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
 
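The two identical block-size constants collapse into a single Q80_Q40_BLOCK_SIZE (both Q8_0 and Q4_0 pack 32 weights per block), the workgroup grows from 128 to 256 threads, and the new GROUP_SIZE / N_THREADS_PER_GROUP pair partitions those threads into 64 groups of 4 lanes that cooperate on one output row, instead of one thread per row as before. A minimal C illustration of the resulting index math, assuming the shader's defines (the program is a sketch, not part of the commit):

#include <stdio.h>

#define N_THREADS 256
#define GROUP_SIZE 64
#define N_THREADS_PER_GROUP (N_THREADS / GROUP_SIZE) /* = 4 */

int main(void) {
    /* Mirrors dGroup/iGroup in the shader: consecutive threads share a row. */
    for (unsigned threadIndex = 0; threadIndex < 8; threadIndex++) {
        unsigned dGroup = threadIndex / N_THREADS_PER_GROUP; /* which output row */
        unsigned iGroup = threadIndex % N_THREADS_PER_GROUP; /* lane within that row */
        printf("thread %u -> row group %u, lane %u\n", threadIndex, dGroup, iGroup);
    }
    return 0;
}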
@@ -19,12 +21,12 @@ struct BatchInfo {
 
 struct BlockQ80 {
     float16_t d;
-    int8_t qs[Q80_BLOCK_SIZE];
+    int8_t qs[Q80_Q40_BLOCK_SIZE];
 };
 
 struct BlockQ40 {
     float16_t d;
-    uint8_t qs[Q40_BLOCK_SIZE / 2];
+    uint8_t qs[Q80_Q40_BLOCK_SIZE / 2];
 };
 
 layout(binding = 0) readonly buffer inputBuffer { BlockQ80 x[]; };
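Both structs keep their layout; only the shared size constant is renamed. For reference, a BlockQ80 is a float16 scale plus 32 int8 quants, while a BlockQ40 is a scale plus 32 4-bit quants packed two per byte (low nibble first, stored with a +8 bias). A plain-C sketch of the per-block dot product that the shader's inner loop computes, assuming these layouts; block_dot is an illustrative name, and float scales stand in for float16_t:

#include <stdint.h>

#define BLOCK_SIZE 32 /* Q80_Q40_BLOCK_SIZE in the shader */

typedef struct { float d; int8_t qs[BLOCK_SIZE]; } BlockQ80;
typedef struct { float d; uint8_t qs[BLOCK_SIZE / 2]; } BlockQ40;

/* Dot product of one Q8_0 activation block with one Q4_0 weight block.
   The low nibble of qs[j] holds element j; the high nibble holds element j + 16. */
static float block_dot(const BlockQ80 *x, const BlockQ40 *w) {
    float s = 0.0f;
    for (int j = 0; j < BLOCK_SIZE / 2; j++) {
        float w0 = (float)(w->qs[j] & 0xF) - 8.0f; /* low nibble, bias removed */
        float w1 = (float)(w->qs[j] >> 4) - 8.0f;  /* high nibble, bias removed */
        s += (float)x->qs[j] * w0 + (float)x->qs[j + BLOCK_SIZE / 2] * w1;
    }
    return s * x->d * w->d; /* apply both block scales once per block */
}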
@@ -34,7 +36,11 @@ layout(binding = 3) readonly buffer weightBuffer { BlockQ40 weight[]; };
 
 shared uint sharedStart;
 shared uint sharedEnd;
-shared BatchInfo sharedInfo;
+shared uint sharedInputOffset;
+shared uint sharedInputSizeX;
+shared uint sharedOutputOffset;
+shared uint sharedInputSizeXPerGroup;
+shared float16_t sums[N_THREADS];
 
 void main() {
     const uint threadIndex = gl_LocalInvocationID.x;
@@ -44,44 +50,62 @@ void main() {
         const uint batchIndex = gl_WorkGroupID.y;
         const uint workGroupIndex = gl_WorkGroupID.z;
 
-        const BatchInfo info = infos[batchIndex];
+        sharedInputOffset = infos[batchIndex].inputOffset;
+        sharedInputSizeX = infos[batchIndex].inputSizeX;
+        sharedOutputOffset = infos[batchIndex].outputOffset;
+        sharedInputSizeXPerGroup = (sharedInputSizeX + N_THREADS_PER_GROUP - 1) / N_THREADS_PER_GROUP;
 
-        const uint ySlice = info.outputSizeX / nWorkGroups;
-        const uint yRest = info.outputSizeX % nWorkGroups;
+        const uint ySlice = infos[batchIndex].outputSizeX / nWorkGroups;
+        const uint yRest = infos[batchIndex].outputSizeX % nWorkGroups;
         sharedStart = workGroupIndex * ySlice + (workGroupIndex < yRest ? workGroupIndex : yRest);
         sharedEnd = sharedStart + ySlice + (workGroupIndex < yRest ? 1 : 0);
-        sharedInfo = info;
     }
 
     barrier();
     memoryBarrierShared();
 
-    const uint end = sharedEnd;
-    const uint inputOffset = sharedInfo.inputOffset;
-    const uint inputSizeX = sharedInfo.inputSizeX;
-    const uint outputOffset = sharedInfo.outputOffset;
+    const uint dEnd = sharedEnd;
+    const uint inputOffset = sharedInputOffset;
+    const uint inputSizeX = sharedInputSizeX;
+    const uint outputOffset = sharedOutputOffset;
+    const uint inputSizeXPerGroup = sharedInputSizeXPerGroup;
+
+    const uint dGroup = threadIndex / N_THREADS_PER_GROUP;
+    const uint iGroup = threadIndex % N_THREADS_PER_GROUP;
+    const uint iStart = inputSizeXPerGroup * iGroup;
+    const uint iEnd = min(iStart + inputSizeXPerGroup, inputSizeX);
+
+    for (uint dBatch = sharedStart; dBatch < dEnd; dBatch += GROUP_SIZE) {
+        const uint d = dBatch + dGroup;
+        if (d >= dEnd) {
+            break;
+        }
 
-    for (uint d = sharedStart + threadIndex; d < end; d += N_THREADS) {
         float16_t sum = float16_t(0.0f);
-        const uint wOffset = d * inputSizeX;
-
-        for (uint i = 0; i < inputSizeX; i++) {
-            const BlockQ80 xi = x[inputOffset + i];
-            const BlockQ40 wi = weight[wOffset + i];
+        for (uint i = iStart; i < iEnd; i++) {
+            const uint xi = inputOffset + i;
+            const uint wi = d * inputSizeX + i;
+            [[unroll]] for (uint j = 0; j < Q80_Q40_BLOCK_SIZE / 2; j++) {
+                sum += (
+                    float16_t(x[xi].qs[j]) * (float16_t(weight[wi].qs[j] & 0xF) - float16_t(8.0f)) +
+                    float16_t(x[xi].qs[j + Q80_Q40_BLOCK_SIZE / 2]) * (float16_t(weight[wi].qs[j] >> 4) - float16_t(8.0f))
+                ) * x[xi].d * weight[wi].d;
+            }
+        }
+        sums[threadIndex] = sum;
 
-            float16_t s = float16_t(0.0f);
-            [[unroll]] for (uint j = 0; j < Q40_BLOCK_SIZE / 2; j++) {
-                const float16_t x0 = float16_t(xi.qs[j]);
-                const float16_t x1 = float16_t(xi.qs[j + Q80_BLOCK_SIZE / 2]);
+        barrier();
+        memoryBarrierShared();
 
-                const uint8_t wq = wi.qs[j];
-                const float16_t w0 = float16_t(wq & 0xF) - float16_t(8.0f);
-                const float16_t w1 = float16_t(wq >> 4) - float16_t(8.0f);
-                s += x0 * w0 + x1 * w1;
-            }
-            sum += s * xi.d * wi.d;
+        [[unroll]] for (uint i = N_THREADS_PER_GROUP / 2; i > 0; i >>= 1) {
+            if (iGroup < i)
+                sums[threadIndex] += sums[threadIndex + i];
+            barrier();
+        }
+        if (iGroup == 0) {
+            y[outputOffset + d] = float(sums[threadIndex]);
        }
 
-        y[outputOffset + d] = float(sum);
+        barrier();
    }
 }
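After the inner loop, each of a group's 4 lanes holds a partial sum for row d in sums[], and the [[unroll]] loop folds them pairwise so that lane 0 (iGroup == 0) can write the final value; the trailing barrier() keeps one iteration's reads of sums[] from racing with the next iteration's writes. A scalar C sketch of that tree reduction, assuming N_THREADS_PER_GROUP = 4 (group_reduce is an illustrative name; on the GPU, barrier() separates the halving steps):

#define N_THREADS_PER_GROUP 4

/* Folds the upper half of the array into the lower half, halving the
   active range each step; sums[0] ends up holding the group total. */
static float group_reduce(float sums[N_THREADS_PER_GROUP]) {
    for (int stride = N_THREADS_PER_GROUP / 2; stride > 0; stride >>= 1) {
        for (int lane = 0; lane < stride; lane++) /* lanes < stride are active */
            sums[lane] += sums[lane + stride];
        /* barrier() here in the shader */
    }
    return sums[0];
}

With 4 lanes this takes two steps (stride 2, then stride 1), so a row's total costs two synchronizations instead of one thread walking the entire input on its own.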
