
Commit da41445

testCast_F32_Q80.
1 parent 2d0bbaf commit da41445

3 files changed: +149 -22 lines changed

src/nn/nn-vulkan-test.cpp (+48 -13)
@@ -1,5 +1,6 @@
 #include <cstdio>
 #include "nn-config-builder.hpp"
+#include "nn-quants.hpp"
 #include "nn-vulkan.hpp"
 
 #define N_BATCHES 4
@@ -309,11 +310,11 @@ void testShift_F32_F32() {
 }
 
 void testCast_F32_F32() {
-    #define CAST_DIM 64
+    #define CAST_F32_DIM 64
     execute(
         [](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
-            NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, CAST_DIM));
-            NnUint yPipeIndex = netBuilder->addPipe("Y", size2D(F_32, N_BATCHES, CAST_DIM));
+            NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, CAST_F32_DIM));
+            NnUint yPipeIndex = netBuilder->addPipe("Y", size2D(F_32, N_BATCHES, CAST_F32_DIM));
             segmentBuilder->addOp(
                 OP_CAST, "cast", 0,
                 pointerBatchConfig(SRC_PIPE, xPipeIndex),
@@ -327,25 +328,56 @@ void testCast_F32_F32() {
             float *xPipe = (float *)execution->pipes[0];
             float *yPipe = (float *)execution->pipes[1];
 
-            for (NnUint b = 0; b < N_BATCHES; b++) {
-                for (NnUint i = 0; i < CAST_DIM; i++)
-                    xPipe[b * CAST_DIM + i] = (float)b;
-            }
+            for (NnUint i = 0; i < N_BATCHES * CAST_F32_DIM; i++)
+                xPipe[i] = (float)(i + 1);
 
             // act
             executor->forward();
 
             // assert
-            for (NnUint b = 0; b < N_BATCHES; b++) {
-                for (NnUint i = 0; i < CAST_DIM; i++) {
-                    NnUint j = b * CAST_DIM + i;
-                    assertFloat(j, yPipe[j], (float)b, 0.00001f);
-                }
-            }
+            for (NnUint i = 0; i < N_BATCHES * CAST_F32_DIM; i++)
+                assertFloat(i, yPipe[i], (float)(i + 1), 0.00001f);
             printOk("testCast_F32_F32");
         });
 }
 
+void testCast_F32_Q80() {
+    #define CAST_Q80_DIM 256
+    execute(
+        [](NnNetConfigBuilder *netBuilder, NnNodeConfigBuilder *nodeBuilder, NnSegmentConfigBuilder *segmentBuilder) {
+            NnUint xPipeIndex = netBuilder->addPipe("X", size2D(F_32, N_BATCHES, CAST_Q80_DIM));
+            NnUint yPipeIndex = netBuilder->addPipe("Y", size2D(F_Q80, N_BATCHES, CAST_Q80_DIM));
+            segmentBuilder->addOp(
+                OP_CAST, "cast", 0,
+                pointerBatchConfig(SRC_PIPE, xPipeIndex),
+                pointerBatchConfig(SRC_PIPE, yPipeIndex),
+                size0(),
+                NnCastOpCodeConfig{});
+        },
+        [](NnExecutor *executor, NnNetExecution *execution, NnVulkanDevice *device) {
+            // arrange
+            execution->setBatchSize(N_BATCHES);
+            float *xPipe = (float *)execution->pipes[0];
+            NnBlockQ80 *yPipe = (NnBlockQ80 *)execution->pipes[1];
+
+            for (NnUint i = 0; i < N_BATCHES * CAST_Q80_DIM; i++)
+                xPipe[i] = (float)(i + 1);
+
+            // act
+            executor->forward();
+
+            float yF32[CAST_Q80_DIM * N_BATCHES];
+            dequantizeQ80toF32(yPipe, yF32, CAST_Q80_DIM * N_BATCHES, 1, 0);
+
+            for (NnUint i = 0; i < N_BATCHES * CAST_Q80_DIM; i++) {
+                const float expectedV = (float)(i + 1);
+                const float change = (yF32[i] - expectedV) / expectedV;
+                assertFloat(i, change, 0.0, 0.009f);
+            }
+            printOk("testCast_F32_Q80");
+        });
+}
+
 void testRope_F32_F32() {
     #define ROPE_DIM 2048
     #define ROPE_KV_DIM 512
@@ -490,13 +522,16 @@ void multiheadAtt_F32_F32() {
 }
 
 int main() {
+    initQuants();
+
     testRmsNorm_F32_F32_F32();
     testSilu_F32_F32();
     testMul_F32_F32();
     testMergeAdd_F32_F32();
     testEmbedding_F32_F32();
     testShift_F32_F32();
     testCast_F32_F32();
+    testCast_F32_Q80();
     testRope_F32_F32();
     matmul_F32_F32_F32();
     multiheadAtt_F32_F32();
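
Note on the new test: an F32-to-Q80 cast is lossy, so testCast_F32_Q80 cannot compare outputs exactly. Instead it dequantizes the Q80 pipe back to F32 and asserts that the relative error of each value stays below 0.9%. A minimal CPU sketch of the Q8_0 round trip being exercised (the block layout and the amax/127 scale rule come from the shader added below; quantizeRefQ80 and dequantizeRefQ80 are illustrative helpers, not the repo's API):

#include <cmath>
#include <cstdint>

#define Q80_BLOCK_SIZE 32

// Illustrative stand-in for NnBlockQ80 (the real struct stores d as float16).
struct BlockQ80Ref {
    float d;
    int8_t qs[Q80_BLOCK_SIZE];
};

static void quantizeRefQ80(const float *x, BlockQ80Ref *b) {
    float amax = 0.0f;
    for (int j = 0; j < Q80_BLOCK_SIZE; j++)
        amax = std::fmax(amax, std::fabs(x[j]));
    b->d = amax / 127.0f;                          // same scale rule as the shader
    const float id = b->d != 0.0f ? 1.0f / b->d : 0.0f;
    for (int j = 0; j < Q80_BLOCK_SIZE; j++)
        b->qs[j] = (int8_t)std::lround(x[j] * id); // round-to-nearest, |q| <= 127
}

static void dequantizeRefQ80(const BlockQ80Ref *b, float *y) {
    for (int j = 0; j < Q80_BLOCK_SIZE; j++)
        y[j] = b->qs[j] * b->d;                    // reverses the scaling step
}

For this input pattern the worst case is the first block (values 1..32): d = 32/127, so x = 1 quantizes to q = 4 and dequantizes to about 1.008, a relative error of roughly 0.8%, which is why the tolerance sits just above that at 0.009.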

src/nn/nn-vulkan.cpp (+23 -9)
@@ -242,23 +242,36 @@ NnUint NnVulkanDeviceData::resolveBufferBatchOffset(NnPointerConfig *config, NnUint batchIndex) {
     assert(batchIndex < netConfig->nBatches);
     if (config->type == PNTR_RAW)
         return 0;
-    NnSize2D bufferSize = resolveBufferSize(config);
+
+    const NnSize2D bufferSize = resolveBufferSize(config);
+    const NnSize blockSize = getBlockSize(bufferSize.floatType);
+    assert(bufferSize.x % blockSize == 0);
+    const NnUint sizeX = bufferSize.x / blockSize;
+
     if (config->type == PNTR_BATCH)
-        return bufferSize.x * batchIndex;
-    if (config->type == PNTR_BATCHED_SLICE)
-        return bufferSize.x * batchIndex + (bufferSize.x / netConfig->nNodes) * nodeConfig->nodeIndex;
+        return sizeX * batchIndex;
+    if (config->type == PNTR_BATCHED_SLICE) {
+        assert(sizeX % netConfig->nNodes == 0);
+        return sizeX * batchIndex + (sizeX / netConfig->nNodes) * nodeConfig->nodeIndex;
+    }
     throw std::runtime_error("Cannot determine buffer offset");
 }
 
 NnUint NnVulkanDeviceData::resolveBufferBatchWidth(NnPointerConfig *config, NnUint batchIndex) {
     assert(batchIndex < netConfig->nBatches);
-    NnSize2D bufferSize = resolveBufferSize(config);
+    const NnSize2D bufferSize = resolveBufferSize(config);
+    const NnSize blockSize = getBlockSize(bufferSize.floatType);
+    assert(bufferSize.x % blockSize == 0);
+    const NnUint sizeX = bufferSize.x / blockSize;
+
     if (config->type == PNTR_RAW)
-        return bufferSize.x;
+        return sizeX;
     if (config->type == PNTR_BATCH)
-        return bufferSize.x;
-    if (config->type == PNTR_BATCHED_SLICE)
-        return bufferSize.x / netConfig->nNodes;
+        return sizeX;
+    if (config->type == PNTR_BATCHED_SLICE) {
+        assert(sizeX % netConfig->nNodes == 0);
+        return sizeX / netConfig->nNodes;
+    }
     throw std::runtime_error("Cannot determine buffer width");
 }
 
@@ -389,6 +402,7 @@ static const char *getShaderFileName(const NnOpCode opCode, const NnOpQuantType quantType) {
     }
     if (opCode == OP_CAST) {
         if (quantType == F32_F32_F32) return "cast-forward-f32-f32.spv";
+        if (quantType == F32_F32_Q80) return "cast-forward-f32-q80.spv";
     }
     if (opCode == OP_SHIFT) {
         if (quantType == F32_F32_F32) return "shift-forward-f32-f32.spv";
New file (+78): the OP_CAST F32-to-Q80 compute shader (compiled to cast-forward-f32-q80.spv)

@@ -0,0 +1,78 @@
+#version 450
+
+#extension GL_EXT_shader_16bit_storage : enable
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+
+#define Q80_BLOCK_SIZE 32
+#define N_THREADS 64
+
+layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
+
+struct BatchInfo {
+    uint inputOffset;
+    uint inputSizeX;
+    uint outputOffset; // number of Q80 blocks
+    uint outputSizeX; // number of Q80 blocks
+};
+
+struct BlockQ80 {
+    float16_t d;
+    int8_t qs[Q80_BLOCK_SIZE];
+};
+
+layout(binding = 0) readonly buffer inputBuffer { float x[]; };
+layout(binding = 1) writeonly buffer outputBuffer { BlockQ80 y[]; };
+layout(binding = 2) readonly buffer batchInfosBuffer { BatchInfo infos[]; };
+
+shared uint sharedYStart;
+shared uint sharedYEnd;
+shared uint sharedXOffset;
+shared uint sharedYOffset;
+
+void main() {
+    const uint threadIndex = gl_LocalInvocationID.x;
+
+    if (threadIndex == 0) {
+        const uint nWorkGroups = gl_NumWorkGroups.z;
+        const uint batchIndex = gl_WorkGroupID.y;
+        const uint workGroupIndex = gl_WorkGroupID.z;
+
+        const BatchInfo info = infos[batchIndex];
+
+        const uint ySlice = info.outputSizeX / nWorkGroups;
+        const uint yRest = info.outputSizeX % nWorkGroups;
+        sharedYStart = workGroupIndex * ySlice + (workGroupIndex < yRest ? workGroupIndex : yRest);
+        sharedYEnd = sharedYStart + ySlice + (workGroupIndex < yRest ? 1 : 0);
+        sharedXOffset = info.inputOffset;
+        sharedYOffset = info.outputOffset;
+    }
+
+    barrier();
+    memoryBarrierShared();
+
+    const uint yStart = sharedYStart + threadIndex;
+    const uint yEnd = sharedYEnd;
+    const uint xOffset = sharedXOffset;
+    const uint yOffset = sharedYOffset;
+
+    for (uint i = yStart; i < yEnd; i += N_THREADS) {
+        const uint xiOffset = xOffset + i * Q80_BLOCK_SIZE;
+        const uint yiOffset = yOffset + i;
+
+        float amax = 0.0;
+        for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
+            const float v = abs(x[xiOffset + j]);
+            amax = max(amax, v);
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d != 0.0 ? 1.0 / d : 0.0;
+
+        y[yiOffset].d = float16_t(d);
+
+        for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
+            const float v = x[xiOffset + j];
+            y[yiOffset].qs[j] = int8_t(clamp(round(v * id), -127.0, 127.0));
+        }
+    }
+}
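
For context on the scheduling in the shader above: thread 0 of each work group computes that group's share of the output blocks, spreading the remainder over the first outputSizeX % nWorkGroups groups, and every thread then walks its group's range with stride N_THREADS. A CPU model of the partition, with illustrative sizes (8 blocks over 3 groups; the real counts come from BatchInfo and the dispatch dimensions):

#include <cstdio>

int main() {
    const unsigned nBlocks = 8, nWorkGroups = 3;
    const unsigned ySlice = nBlocks / nWorkGroups; // 2 blocks per group, plus...
    const unsigned yRest = nBlocks % nWorkGroups;  // ...1 extra for the first 2 groups
    for (unsigned g = 0; g < nWorkGroups; g++) {
        const unsigned yStart = g * ySlice + (g < yRest ? g : yRest);
        const unsigned yEnd = yStart + ySlice + (g < yRest ? 1 : 0);
        std::printf("work group %u quantizes blocks [%u, %u)\n", g, yStart, yEnd);
    }
    return 0; // prints the ranges [0, 3), [3, 6), [6, 8)
}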
