Skip to content

Commit a0f2803

Browse files
committed
testMatmul_Q80_Q40_F32.
1 parent bee26bb commit a0f2803

5 files changed

+165
-21
lines changed

src/nn/nn-cpu-ops.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ static void matmul_Q80_Q40_F32(float *output, const NnBlockQ80 *x, const NnBlock
430430
const int w0 = (wb->qs[k] & 0x0F) - 8;
431431
const int w1 = (wb->qs[k] >> 4) - 8;
432432
const int i1 = xb->qs[k];
433-
const int i2 = xb->qs[k + Q40_BLOCK_SIZE / 2];
433+
const int i2 = xb->qs[k + Q80_BLOCK_SIZE / 2];
434434
sum += (w0 * i1 + w1 * i2) * s;
435435
}
436436
}

src/nn/nn-vulkan-test.cpp

+76-19
Original file line numberDiff line numberDiff line change
@@ -483,18 +483,18 @@ void testRope_F32_F32() {
483483
});
484484
}
485485

486-
void matmul_F32_F32_F32() {
487-
#define MATMUL_N 64
488-
#define MATMUL_D 96
486+
void testMatmul_F32_F32_F32() {
487+
#define MATMUL_F32_N 64
488+
#define MATMUL_F32_D 96
489489
execute(
490490
[](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
491-
NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, MATMUL_N));
492-
NnUint yPipeIndex = netBuilder->addPipe("Y", size2D(F_32, N_BATCHES, MATMUL_D));
491+
NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, MATMUL_F32_N));
492+
NnUint yPipeIndex = netBuilder->addPipe("Y", size2D(F_32, N_BATCHES, MATMUL_F32_D));
493493
segmentBuilder->addOp(
494494
OP_MATMUL, "matmul", 0,
495495
pointerBatchConfig(SRC_PIPE, xPipeIndex),
496496
pointerBatchConfig(SRC_PIPE, yPipeIndex),
497-
size2D(F_32, MATMUL_N, MATMUL_D),
497+
size2D(F_32, MATMUL_F32_N, MATMUL_F32_D),
498498
NnMatmulOpConfig{});
499499
},
500500
[](NnExecutor *executor, NnNetExecution *execution, NnVulkanDevice *device) {
@@ -503,32 +503,88 @@ void matmul_F32_F32_F32() {
503503
float *xPipe = (float *)execution->pipes[0];
504504
float *yPipe = (float *)execution->pipes[1];
505505

506-
float weight[MATMUL_N * MATMUL_D];
507-
for (NnUint i = 0; i < N_BATCHES * MATMUL_N; i++)
506+
float weight[MATMUL_F32_N * MATMUL_F32_D];
507+
for (NnUint i = 0; i < N_BATCHES * MATMUL_F32_N; i++)
508508
xPipe[i] = i * 0.01f;
509-
for (NnUint i = 0; i < MATMUL_N * MATMUL_D; i++)
509+
for (NnUint i = 0; i < MATMUL_F32_N * MATMUL_F32_D; i++)
510510
weight[i] = i * 0.001f;
511-
executor->loadWeight("matmul", 0, MATMUL_N * MATMUL_D * sizeof(float), (NnByte *)weight);
511+
executor->loadWeight("matmul", 0, MATMUL_F32_N * MATMUL_F32_D * sizeof(float), (NnByte *)weight);
512512

513513
// act
514514
executor->forward();
515515

516516
// assert
517517
for (NnUint b = 0; b < N_BATCHES; b++) {
518-
for (NnUint d = 0; d < MATMUL_D; d++) {
518+
for (NnUint d = 0; d < MATMUL_F32_D; d++) {
519519
float sum = 0.0f;
520-
for (NnUint n = 0; n < MATMUL_N; n++)
521-
sum += xPipe[b * MATMUL_N + n] * weight[d * MATMUL_N + n];
520+
for (NnUint n = 0; n < MATMUL_F32_N; n++)
521+
sum += xPipe[b * MATMUL_F32_N + n] * weight[d * MATMUL_F32_N + n];
522522

523-
const NnUint p = b * MATMUL_D + d;
523+
const NnUint p = b * MATMUL_F32_D + d;
524524
assertFloat(p, yPipe[p], sum, 0.0002f);
525525
}
526526
}
527-
printOk("matmul_F32_F32_F32");
527+
printOk("testMatmul_F32_F32_F32");
528528
});
529529
}
530530

531-
void multiheadAtt_F32_F32() {
531+
void testMatmul_Q80_Q40_F32() {
532+
#define MATMUL_Q80_Q40_N 64
533+
#define MATMUL_Q80_Q40_D 96
534+
execute(
535+
[](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
536+
NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_Q80, N_BATCHES, MATMUL_Q80_Q40_N));
537+
NnUint yPipeIndex = netBuilder->addPipe("Y", size2D(F_32, N_BATCHES, MATMUL_Q80_Q40_D));
538+
segmentBuilder->addOp(
539+
OP_MATMUL, "matmul", 0,
540+
pointerBatchConfig(SRC_PIPE, xPipeIndex),
541+
pointerBatchConfig(SRC_PIPE, yPipeIndex),
542+
size2D(F_Q40, MATMUL_Q80_Q40_N, MATMUL_Q80_Q40_D),
543+
NnMatmulOpConfig{});
544+
},
545+
[](NnExecutor *executor, NnNetExecution *execution, NnVulkanDevice *device) {
546+
// arrange
547+
execution->setBatchSize(N_BATCHES);
548+
NnBlockQ80 *xPipe = (NnBlockQ80 *)execution->pipes[0];
549+
float *yPipe = (float *)execution->pipes[1];
550+
551+
constexpr NnUint xSize = N_BATCHES * MATMUL_Q80_Q40_N;
552+
constexpr NnUint weightSize = MATMUL_Q80_Q40_N * MATMUL_Q80_Q40_D;
553+
constexpr NnUint weightBlocks = weightSize / Q40_BLOCK_SIZE;
554+
555+
float x[xSize];
556+
float weight[weightSize];
557+
NnBlockQ40 weightQ40[weightBlocks];
558+
559+
for (NnUint i = 0; i < xSize; i++)
560+
x[i] = i * 0.01f;
561+
for (NnUint i = 0; i < weightSize; i++)
562+
weight[i] = i * 0.001f;
563+
564+
quantizeF32toQ80(x, xPipe, xSize, 1, 0);
565+
quantizeF32toQ40(weight, weightQ40, weightSize, 1, 0);
566+
567+
executor->loadWeight("matmul", 0, weightBlocks * sizeof(NnBlockQ40), (NnByte *)weightQ40);
568+
569+
// act
570+
executor->forward();
571+
572+
// assert
573+
for (NnUint b = 0; b < N_BATCHES; b++) {
574+
for (NnUint d = 0; d < MATMUL_Q80_Q40_D; d++) {
575+
float sum = 0.0f;
576+
for (NnUint n = 0; n < MATMUL_Q80_Q40_N; n++)
577+
sum += x[b * MATMUL_Q80_Q40_N + n] * weight[d * MATMUL_Q80_Q40_N + n];
578+
const NnUint p = b * MATMUL_Q80_Q40_D + d;
579+
const float change = (yPipe[p] - sum) / sum;
580+
assertFloat(p, change, 0.0, 0.04f);
581+
}
582+
}
583+
printOk("testMatmul_Q80_Q40_F32");
584+
});
585+
}
586+
587+
void testMultiheadAtt_F32_F32() {
532588
#define MULTIHEAD_ATT_DIM 128
533589
execute(
534590
[](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
@@ -560,7 +616,7 @@ void multiheadAtt_F32_F32() {
560616
// TODO: for now this is a smoke test
561617
execution->setBatchSize(N_BATCHES);
562618
executor->forward();
563-
printOk("multiheadAtt_F32_F32");
619+
printOk("testMultiheadAtt_F32_F32");
564620
});
565621
}
566622

@@ -577,7 +633,8 @@ int main() {
577633
testCast_F32_F32();
578634
testCast_F32_Q80();
579635
testRope_F32_F32();
580-
matmul_F32_F32_F32();
581-
multiheadAtt_F32_F32();
636+
testMatmul_F32_F32_F32();
637+
testMatmul_Q80_Q40_F32();
638+
testMultiheadAtt_F32_F32();
582639
return 0;
583640
}

src/nn/nn-vulkan.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ static const char *getShaderFileName(const NnOpCode opCode, const NnOpQuantType
388388
}
389389
if (opCode == OP_MATMUL) {
390390
if (quantType == F32_F32_F32) return "matmul-forward-f32-f32-f32.spv";
391+
if (quantType == Q80_Q40_F32) return "matmul-forward-q80-q40-f32.spv";
391392
}
392393
if (opCode == OP_MULTIHEAD_ATT) {
393394
if (quantType == F32_F32_F32) return "multi-head-att-forward-f32-f32.spv";
src/nn/vulkan/matmul-forward-q80-q40-f32.comp

+86

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#version 450
2+
3+
#extension GL_EXT_control_flow_attributes : enable
4+
#extension GL_EXT_shader_16bit_storage : enable
5+
#extension GL_EXT_shader_explicit_arithmetic_types : enable
6+
7+
#define Q80_BLOCK_SIZE 32
8+
#define Q40_BLOCK_SIZE 32
9+
#define N_THREADS 128
10+
11+
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
12+
13+
struct BatchInfo {
14+
uint inputOffset;
15+
uint inputSizeX;
16+
uint outputOffset;
17+
uint outputSizeX;
18+
};
19+
20+
struct BlockQ80 {
21+
float16_t d;
22+
int8_t qs[Q80_BLOCK_SIZE];
23+
};
24+
25+
struct BlockQ40 {
26+
float16_t d;
27+
uint8_t qs[Q40_BLOCK_SIZE / 2];
28+
};
29+
30+
layout(binding = 0) readonly buffer inputBuffer { BlockQ80 x[]; };
31+
layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
32+
layout(binding = 2) readonly buffer batchInfosBuffer { BatchInfo infos[]; };
33+
layout(binding = 3) readonly buffer weightBuffer { BlockQ40 weight[]; };
34+
35+
shared uint sharedStart;
36+
shared uint sharedEnd;
37+
shared BatchInfo sharedInfo;
38+
39+
void main() {
40+
const uint threadIndex = gl_LocalInvocationID.x;
41+
42+
if (threadIndex == 0) {
43+
const uint nWorkGroups = gl_NumWorkGroups.z;
44+
const uint batchIndex = gl_WorkGroupID.y;
45+
const uint workGroupIndex = gl_WorkGroupID.z;
46+
47+
const BatchInfo info = infos[batchIndex];
48+
49+
const uint ySlice = info.outputSizeX / nWorkGroups;
50+
const uint yRest = info.outputSizeX % nWorkGroups;
51+
sharedStart = workGroupIndex * ySlice + (workGroupIndex < yRest ? workGroupIndex : yRest);
52+
sharedEnd = sharedStart + ySlice + (workGroupIndex < yRest ? 1 : 0);
53+
sharedInfo = info;
54+
}
55+
56+
barrier();
57+
memoryBarrierShared();
58+
59+
const uint end = sharedEnd;
60+
const uint inputOffset = sharedInfo.inputOffset;
61+
const uint inputSizeX = sharedInfo.inputSizeX;
62+
const uint outputOffset = sharedInfo.outputOffset;
63+
64+
for (uint d = sharedStart + threadIndex; d < end; d += N_THREADS) {
65+
float16_t sum = float16_t(0.0f);
66+
67+
for (uint i = 0; i < inputSizeX; i++) {
68+
const BlockQ80 xi = x[inputOffset + i];
69+
const BlockQ40 wi = weight[d * inputSizeX + i];
70+
71+
float16_t s = float16_t(0.0f);
72+
[[unroll]] for (uint j = 0; j < Q40_BLOCK_SIZE / 2; j++) {
73+
const float16_t x0 = float16_t(xi.qs[j]);
74+
const float16_t x1 = float16_t(xi.qs[j + Q80_BLOCK_SIZE / 2]);
75+
76+
const uint8_t wq = wi.qs[j];
77+
const float16_t w0 = float16_t(wq & 0xF) - float16_t(8.0f);
78+
const float16_t w1 = float16_t(wq >> 4) - float16_t(8.0f);
79+
s += x0 * w0 + x1 * w1;
80+
}
81+
sum += s * xi.d * wi.d;
82+
}
83+
84+
y[outputOffset + d] = float(sum);
85+
}
86+
}

src/nn/vulkan/merge-add-forward-f32-f32.comp

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ struct BatchInfo {
1212
};
1313

1414
layout(binding = 0) readonly buffer inputBuffer { float x[]; };
15-
layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
15+
layout(binding = 1) buffer outputBuffer { float y[]; };
1616
layout(binding = 2) readonly buffer batchInfosBuffer { BatchInfo infos[]; };
1717

1818
shared uint sharedDim;

0 commit comments

Comments (0)