From 4b0c97a2693319391c3f9f14ffbc1e7be26af1b3 Mon Sep 17 00:00:00 2001 From: Bart Tadych Date: Sun, 16 Feb 2025 23:28:20 +0000 Subject: [PATCH 1/3] feat: quantizeF32toQ80 avx2. --- src/nn/nn-quants.cpp | 46 ++++++++++++++++++++++++++++++++++++++++++++ src/nn/nn-quants.hpp | 46 +++++++++++++++++++++++++------------------- 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/src/nn/nn-quants.cpp b/src/nn/nn-quants.cpp index 30b468d..6a8b67b 100644 --- a/src/nn/nn-quants.cpp +++ b/src/nn/nn-quants.cpp @@ -105,6 +105,52 @@ void quantizeF32toQ80(const float *input, NnBlockQ80 *output, const NnSize n, co vst1_lane_s32((int32_t *)(y->qs + j), vreinterpret_s32_s8(vec_i8), 0); } } +#elif defined(__AVX2__) + for (NnSize i = start; i < end; ++i) { + const float *x = input + i * Q80_BLOCK_SIZE; + NnBlockQ80 *y = output + i; + + __m256 max_abs = _mm256_setzero_ps(); + for (int j = 0; j < Q80_BLOCK_SIZE; j += 8) { + __m256 vec = _mm256_loadu_ps(x + j); + __m256 abs_vec = _mm256_and_ps(vec, _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF))); + max_abs = _mm256_max_ps(max_abs, abs_vec); + } + __m128 max_hi = _mm256_extractf128_ps(max_abs, 1); + __m128 max_lo = _mm256_castps256_ps128(max_abs); + __m128 max_128 = _mm_max_ps(max_hi, max_lo); + max_128 = _mm_max_ps(max_128, _mm_movehl_ps(max_128, max_128)); + max_128 = _mm_max_ss(max_128, _mm_shuffle_ps(max_128, max_128, _MM_SHUFFLE(1, 1, 1, 1))); + float amax = _mm_cvtss_f32(max_128); + + const float d = amax / 127.0f; + const float id = (d != 0.0f) ? 1.0f / d : 0.0f; + y->d = CONVERT_F32_TO_F16(d); + + const __m256 id_vec = _mm256_set1_ps(id); + const __m128i shuffle_mask = _mm_set_epi8( + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, 12, 8, 4, 0 + ); + + for (int j = 0; j < Q80_BLOCK_SIZE; j += 8) { + __m256 vec = _mm256_loadu_ps(x + j); + __m256 scaled = _mm256_mul_ps(vec, id_vec); + __m256 rounded = _mm256_round_ps(scaled, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + __m256i integers = _mm256_cvtps_epi32(rounded); + + __m128i low = _mm256_extracti128_si256(integers, 0); + __m128i high = _mm256_extracti128_si256(integers, 1); + + __m128i low_bytes = _mm_shuffle_epi8(low, shuffle_mask); + __m128i high_bytes = _mm_shuffle_epi8(high, shuffle_mask); + + uint32_t low_part = _mm_extract_epi32(low_bytes, 0); + uint32_t high_part = _mm_extract_epi32(high_bytes, 0); + uint64_t packed = (static_cast(high_part) << 32) | low_part; + std::memcpy(y->qs + j, &packed, sizeof(packed)); + } + } #else for (NnSize i = start; i < end; i++) { const float *x = &input[i * Q80_BLOCK_SIZE]; diff --git a/src/nn/nn-quants.hpp b/src/nn/nn-quants.hpp index bbc44e3..5070e99 100644 --- a/src/nn/nn-quants.hpp +++ b/src/nn/nn-quants.hpp @@ -5,6 +5,8 @@ #include #if defined(__ARM_NEON) #include +#elif defined(__AVX2__) + #include #endif typedef std::uint8_t NnByte; @@ -15,34 +17,38 @@ float convertF16toF32Impl(const NnFp16 value); NnFp16 convertF32ToF16Impl(const float x); #if defined(__ARM_NEON) && defined(__ARM_FP16_FORMAT_IEEE) -inline float convertF16ToF32Neon(const NnFp16 value) { - __fp16 fp; - std::memcpy(&fp, &value, sizeof(fp)); - return (float)fp; -} - -inline NnFp16 convertF32ToF16Neon(const float x) { - __fp16 h = x; - return *(NnFp16 *)&h; -} - -#define CONVERT_F16_TO_F32(value) convertF16ToF32Neon(value) -#define CONVERT_F32_TO_F16(value) convertF32ToF16Neon(value) + inline float convertF16ToF32Neon(const NnFp16 value) { + __fp16 fp; + std::memcpy(&fp, &value, sizeof(fp)); + return (float)fp; + } + + inline NnFp16 convertF32ToF16Neon(const float x) { + __fp16 h = x; + return *(NnFp16 *)&h; + } + + #define CONVERT_F16_TO_F32(value) convertF16ToF32Neon(value) + #define CONVERT_F32_TO_F16(value) convertF32ToF16Neon(value) +#endif + +#if defined(__F16C__) + #define CONVERT_F32_TO_F16(v) _cvtss_sh((v), _MM_FROUND_TO_NEAREST_INT) #endif #if !defined(CONVERT_F16_TO_F32) -extern float f16ToF32Lookup[65536]; + extern float f16ToF32Lookup[65536]; -inline static float convertF16ToF32Lookup(const NnFp16 value) { - return f16ToF32Lookup[value]; -} + inline static float convertF16ToF32Lookup(const NnFp16 value) { + return f16ToF32Lookup[value]; + } -#define CONVERT_F16_TO_F32_LOOKUP -#define CONVERT_F16_TO_F32(value) convertF16ToF32Lookup(value) + #define CONVERT_F16_TO_F32_LOOKUP + #define CONVERT_F16_TO_F32(value) convertF16ToF32Lookup(value) #endif #if !defined(CONVERT_F32_TO_F16) -#define CONVERT_F32_TO_F16(value) convertF32ToF16Impl(value) + #define CONVERT_F32_TO_F16(value) convertF32ToF16Impl(value) #endif #define Q40_BLOCK_SIZE 32 From 2c5240eded31b8fd67e2f00d7b76741e99b51de5 Mon Sep 17 00:00:00 2001 From: Bart Tadych Date: Sun, 16 Feb 2025 23:38:43 +0000 Subject: [PATCH 2/3] test: testConvertF32toF16. --- src/nn/nn-cpu-ops-test.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/nn/nn-cpu-ops-test.cpp b/src/nn/nn-cpu-ops-test.cpp index 9e8575a..8af47b6 100644 --- a/src/nn/nn-cpu-ops-test.cpp +++ b/src/nn/nn-cpu-ops-test.cpp @@ -69,6 +69,15 @@ void testSplitThreads() { printf("✅ %24s passed\n", "splitThreads"); } +void testConvertF32toF16() { + float x[] = {0.0f, 0.25f, 0.3456f, 1.0f}; + for (NnSize i = 0; i < sizeof(x) / sizeof(float); i++) { + NnFp16 f16 = CONVERT_F32_TO_F16(x[i]); + float f32 = CONVERT_F16_TO_F32(f16); + compare_F32("convertF32toF16", &x[i], &f32, 1, 0.0005); + } +} + // quantization void testQuantization(const NnSize m) { std::vector a(m * Q40_BLOCK_SIZE); @@ -285,6 +294,7 @@ int main() { printCpuInstructionSet(); testSplitThreads(); + testConvertF32toF16(); testQuantization(32); testQuantization(2); testQuantization(1); From bacc38f47d4db7b7ab102595c5058343fbdc33a2 Mon Sep 17 00:00:00 2001 From: Bart Tadych Date: Sun, 16 Feb 2025 23:44:43 +0000 Subject: [PATCH 3/3] fix. --- src/nn/nn-quants.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/nn/nn-quants.hpp b/src/nn/nn-quants.hpp index 5070e99..fe3cae0 100644 --- a/src/nn/nn-quants.hpp +++ b/src/nn/nn-quants.hpp @@ -30,9 +30,7 @@ NnFp16 convertF32ToF16Impl(const float x); #define CONVERT_F16_TO_F32(value) convertF16ToF32Neon(value) #define CONVERT_F32_TO_F16(value) convertF32ToF16Neon(value) -#endif - -#if defined(__F16C__) +#elif defined(__F16C__) #define CONVERT_F32_TO_F16(v) _cvtss_sh((v), _MM_FROUND_TO_NEAREST_INT) #endif