Skip to content

Commit bee26bb

Browse files
committed
testMergeAdd_Q80_F32.
1 parent da41445 commit bee26bb

6 files changed

+146
-13
lines changed

src/nn/nn-vulkan-test.cpp

+53-9
Original file line numberDiff line numberDiff line change
@@ -182,12 +182,12 @@ void testMul_F32_F32() {
182182
}
183183

184184
void testMergeAdd_F32_F32() {
185-
#define MERGE_ADD_NODES 2
186-
#define MERGE_ADD_DIM 64
185+
#define MERGE_ADD_F32_NODES 2
186+
#define MERGE_ADD_F32_DIM 64
187187
execute(
188188
[](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
189-
NnUint zPipeIndex = netBuilder->addPipe("Z", size2D(F_32, N_BATCHES, MERGE_ADD_DIM * MERGE_ADD_NODES));
190-
NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, MERGE_ADD_DIM));
189+
NnUint zPipeIndex = netBuilder->addPipe("Z", size2D(F_32, N_BATCHES, MERGE_ADD_F32_DIM * MERGE_ADD_F32_NODES));
190+
NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, MERGE_ADD_F32_DIM));
191191
segmentBuilder->addOp(OP_MERGE_ADD, "mergeAdd", 0,
192192
pointerBatchConfig(SRC_PIPE, zPipeIndex),
193193
pointerBatchConfig(SRC_PIPE, xPipeIndex),
@@ -201,9 +201,9 @@ void testMergeAdd_F32_F32() {
201201
float *zPipe = (float *)execution->pipes[0];
202202
float *xPipe = (float *)execution->pipes[1];
203203
for (NnUint b = 0; b < N_BATCHES; b++) {
204-
for (NnUint n = 0; n < MERGE_ADD_NODES; n++) {
205-
for (NnUint i = 0; i < MERGE_ADD_DIM; i++)
206-
zPipe[b * MERGE_ADD_NODES * MERGE_ADD_DIM + n * MERGE_ADD_DIM + i] = (float)(b + 1);
204+
for (NnUint n = 0; n < MERGE_ADD_F32_NODES; n++) {
205+
for (NnUint i = 0; i < MERGE_ADD_F32_DIM; i++)
206+
zPipe[b * MERGE_ADD_F32_NODES * MERGE_ADD_F32_DIM + n * MERGE_ADD_F32_DIM + i] = (float)(b + 1);
207207
}
208208
}
209209

@@ -212,15 +212,58 @@ void testMergeAdd_F32_F32() {
212212

213213
// assert
214214
for (NnUint b = 0; b < N_BATCHES; b++) {
215-
for (NnUint i = 0; i < MERGE_ADD_DIM; i++) {
216-
NnUint j = b * MERGE_ADD_DIM + i;
215+
for (NnUint i = 0; i < MERGE_ADD_F32_DIM; i++) {
216+
NnUint j = b * MERGE_ADD_F32_DIM + i;
217217
assertFloat(j, xPipe[j], (float)(2 * b + 2), 0.00001f);
218218
}
219219
}
220220
printOk("testMergeAdd_F32_F32");
221221
});
222222
}
223223

224+
static void testMergeAdd_Q80_F32() {
225+
#define MERGE_ADD_Q80_NODES 2
226+
#define MERGE_ADD_Q80_DIM 64
227+
execute(
228+
[](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
229+
const NnUint zPipeIndex = netBuilder->addPipe("Z", size2D(F_Q80, N_BATCHES, MERGE_ADD_Q80_DIM * MERGE_ADD_Q80_NODES));
230+
const NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, MERGE_ADD_Q80_DIM));
231+
segmentBuilder->addOp(OP_MERGE_ADD, "mergeAdd", 0,
232+
pointerBatchConfig(SRC_PIPE, zPipeIndex),
233+
pointerBatchConfig(SRC_PIPE, xPipeIndex),
234+
size0(),
235+
NnMergeAddOpCodeConfig{});
236+
},
237+
[](NnExecutor *executor, NnNetExecution *execution, NnVulkanDevice *device) {
238+
// arrange
239+
execution->setBatchSize(N_BATCHES);
240+
241+
float z[N_BATCHES * MERGE_ADD_Q80_DIM * MERGE_ADD_Q80_NODES];
242+
for (NnUint b = 0; b < N_BATCHES; b++) {
243+
for (NnUint n = 0; n < MERGE_ADD_Q80_NODES; n++) {
244+
for (NnUint i = 0; i < MERGE_ADD_Q80_DIM; i++)
245+
z[b * MERGE_ADD_Q80_NODES * MERGE_ADD_Q80_DIM + n * MERGE_ADD_Q80_DIM + i] = (float)(b + 1);
246+
}
247+
}
248+
249+
NnBlockQ80 *zPipe = (NnBlockQ80 *)execution->pipes[0];
250+
const float *xPipe = (float *)execution->pipes[1];
251+
quantizeF32toQ80(z, zPipe, N_BATCHES * MERGE_ADD_Q80_DIM * MERGE_ADD_Q80_NODES, 1, 0);
252+
253+
// act
254+
executor->forward();
255+
256+
// assert
257+
for (NnUint b = 0; b < N_BATCHES; b++) {
258+
for (NnUint i = 0; i < MERGE_ADD_Q80_DIM; i++) {
259+
NnUint j = b * MERGE_ADD_Q80_DIM + i;
260+
assertFloat(j, xPipe[j], (float)(2 * b + 2), 0.00001f);
261+
}
262+
}
263+
printOk("testMergeAdd_Q80_F32");
264+
});
265+
}
266+
224267
void testEmbedding_F32_F32() {
225268
#define EMBEDDING_DIM 16
226269
#define EMBEDDING_LEN 8
@@ -528,6 +571,7 @@ int main() {
528571
testSilu_F32_F32();
529572
testMul_F32_F32();
530573
testMergeAdd_F32_F32();
574+
testMergeAdd_Q80_F32();
531575
testEmbedding_F32_F32();
532576
testShift_F32_F32();
533577
testCast_F32_F32();

src/nn/nn-vulkan.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ NnDeviceSegment *NnVulkanDevice::createSegment(NnUint segmentIndex) {
375375
static const char *getShaderFileName(const NnOpCode opCode, const NnOpQuantType quantType) {
376376
if (opCode == OP_MERGE_ADD) {
377377
if (quantType == F32_F32_F32) return "merge-add-forward-f32-f32.spv";
378+
if (quantType == Q80_Q80_F32) return "merge-add-forward-q80-f32.spv";
378379
}
379380
if (opCode == OP_EMBEDDING) {
380381
if (quantType == F32_F32_F32) return "embedding-forward-f32-f32.spv";
@@ -493,6 +494,7 @@ static std::vector<uint32_t> readShader(const char *fileName) {
493494
constexpr size_t maxSize = 16384;
494495
uint32_t chunk[maxSize];
495496
size_t bytesRead = fread(chunk, 1, maxSize, file);
497+
assert(bytesRead < maxSize); // Check if the file is too large
496498
if (bytesRead > 0)
497499
code.insert(code.end(), chunk, chunk + bytesRead);
498500
if (ferror(file)) {

src/nn/vulkan/cast-forward-f32-q80.comp

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#version 450
22

3+
#extension GL_EXT_control_flow_attributes : enable
34
#extension GL_EXT_shader_16bit_storage : enable
45
#extension GL_EXT_shader_explicit_arithmetic_types : enable
56

@@ -70,7 +71,7 @@ void main() {
7071

7172
y[yiOffset].d = float16_t(d);
7273

73-
for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
74+
[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
7475
const float v = x[xiOffset + j];
7576
y[yiOffset].qs[j] = int8_t(clamp(round(v * id), -127.0, 127.0));
7677
}

src/nn/vulkan/matmul-forward-f32-f32-f32.comp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#version 450
22

3-
#define N_THREADS 256
3+
#define N_THREADS 128
44

55
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
66

src/nn/vulkan/merge-add-forward-f32-f32.comp

-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#version 450
22

3-
#extension GL_EXT_control_flow_attributes : enable
4-
53
#define N_THREADS 64
64

75
layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#version 450

#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable

#define Q80_BLOCK_SIZE 32
#define N_THREADS 64

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;

// Per-batch slice descriptor; offsets/sizes for the input are expressed in
// Q8.0 blocks, output values are plain floats.
struct BatchInfo {
    uint inputOffset; // number of Q80 blocks
    uint inputSizeX;  // number of Q80 blocks
    uint outputOffset;
    uint outputSizeX;
};

// One Q8.0 block: a shared fp16 scale plus 32 signed 8-bit quants.
struct BlockQ80 {
    float16_t d;
    int8_t qs[Q80_BLOCK_SIZE];
};

layout(binding = 0) readonly buffer inputBuffer { BlockQ80 x[]; };
layout(binding = 1) buffer outputBuffer { float y[]; };
layout(binding = 2) readonly buffer batchInfosBuffer { BatchInfo infos[]; };

// Work-group-wide parameters computed once by thread 0 and broadcast
// through shared memory to every invocation.
shared uint wgBlockBegin;  // first Q80 block index this work group handles
shared uint wgBlockEnd;    // one past the last block index
shared uint wgPartCount;   // how many input parts are merged per output block
shared uint wgPartStride;  // distance (in blocks) between consecutive parts
shared uint wgInOffset;    // batch base offset into the input buffer (blocks)
shared uint wgOutOffset;   // batch base offset into the output buffer (floats)

void main() {
    const uint localIndex = gl_LocalInvocationID.x;

    if (localIndex == 0) {
        const uint groupCount = gl_NumWorkGroups.z;
        const uint batchIndex = gl_WorkGroupID.y;
        const uint groupIndex = gl_WorkGroupID.z;

        const BatchInfo info = infos[batchIndex];
        // Output width in Q80 blocks == stride between parts in the input.
        const uint partStride = info.outputSizeX / Q80_BLOCK_SIZE;
        const uint partCount = info.inputSizeX / partStride;
        // Split the output blocks across work groups; the first `remainder`
        // groups take one extra block each.
        const uint perGroup = partStride / groupCount;
        const uint remainder = partStride % groupCount;

        wgBlockBegin = groupIndex * perGroup + (groupIndex < remainder ? groupIndex : remainder);
        wgBlockEnd = wgBlockBegin + perGroup + (groupIndex < remainder ? 1 : 0);
        wgPartCount = partCount;
        wgPartStride = partStride;
        wgInOffset = info.inputOffset;
        wgOutOffset = info.outputOffset;
    }

    barrier();
    memoryBarrierShared();

    const uint firstBlock = wgBlockBegin + localIndex;
    const uint lastBlock = wgBlockEnd;
    const uint partStride = wgPartStride;
    const uint partCount = wgPartCount;
    const uint inOffset = wgInOffset;
    const uint outOffset = wgOutOffset;
    float16_t acc[Q80_BLOCK_SIZE];

    // Each thread walks its blocks with an N_THREADS stride; for every block
    // it dequantizes and sums the corresponding block of each input part,
    // then accumulates the result into the F32 output.
    for (uint blockIndex = firstBlock; blockIndex < lastBlock; blockIndex += N_THREADS) {
        const uint inBase = inOffset + blockIndex;
        const uint outBase = outOffset + blockIndex * Q80_BLOCK_SIZE;

        [[unroll]] for (uint k = 0; k < Q80_BLOCK_SIZE; k++) {
            acc[k] = float16_t(0.0);
        }
        for (uint part = 0; part < partCount; part++) {
            const BlockQ80 blk = x[inBase + part * partStride];
            const float16_t scale = blk.d;

            [[unroll]] for (uint k = 0; k < Q80_BLOCK_SIZE; k++) {
                acc[k] += float16_t(blk.qs[k]) * scale;
            }
        }

        [[unroll]] for (uint k = 0; k < Q80_BLOCK_SIZE; k++) {
            y[outBase + k] += float(acc[k]);
        }
    }
}

0 commit comments

Comments
 (0)