
feat: quantizeF32toQ80 avx2. #171


Merged: 3 commits, Feb 17, 2025
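
Summary of the changes below: an AVX2 path for quantizeF32toQ80 in src/nn/nn-quants.cpp, a new round-trip test for the F32/F16 conversion macros in src/nn/nn-cpu-ops-test.cpp, and an <immintrin.h> include in src/nn/nn-quants.hpp for AVX2 builds.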
src/nn/nn-cpu-ops-test.cpp: 10 additions, 0 deletions
@@ -69,6 +69,15 @@ void testSplitThreads() {
printf("✅ %24s passed\n", "splitThreads");
}

void testConvertF32toF16() {
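// 0.0f, 0.25f, and 1.0f are exact in F16; 0.3456f rounds to about 0.3457 (error ~1e-4), within the 0.0005 tolerance below.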
float x[] = {0.0f, 0.25f, 0.3456f, 1.0f};
for (NnSize i = 0; i < sizeof(x) / sizeof(float); i++) {
NnFp16 f16 = CONVERT_F32_TO_F16(x[i]);
float f32 = CONVERT_F16_TO_F32(f16);
compare_F32("convertF32toF16", &x[i], &f32, 1, 0.0005);
}
}

// quantization
void testQuantization(const NnSize m) {
std::vector<float> a(m * Q40_BLOCK_SIZE);
@@ -285,6 +294,7 @@ int main() {

printCpuInstructionSet();
testSplitThreads();
testConvertF32toF16();
testQuantization(32);
testQuantization(2);
testQuantization(1);
src/nn/nn-quants.cpp: 46 additions, 0 deletions
@@ -105,6 +105,52 @@ void quantizeF32toQ80(const float *input, NnBlockQ80 *output, const NnSize n, co
vst1_lane_s32((int32_t *)(y->qs + j), vreinterpret_s32_s8(vec_i8), 0);
}
}
#elif defined(__AVX2__)
for (NnSize i = start; i < end; ++i) {
const float *x = input + i * Q80_BLOCK_SIZE;
NnBlockQ80 *y = output + i;

__m256 max_abs = _mm256_setzero_ps();
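// Find the block's maximum absolute value, 8 floats at a time (the AND mask clears the sign bit).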
for (int j = 0; j < Q80_BLOCK_SIZE; j += 8) {
__m256 vec = _mm256_loadu_ps(x + j);
__m256 abs_vec = _mm256_and_ps(vec, _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)));
max_abs = _mm256_max_ps(max_abs, abs_vec);
}
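// Horizontal reduction: fold the 8 lanes of max_abs down to a single scalar amax.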
__m128 max_hi = _mm256_extractf128_ps(max_abs, 1);
__m128 max_lo = _mm256_castps256_ps128(max_abs);
__m128 max_128 = _mm_max_ps(max_hi, max_lo);
max_128 = _mm_max_ps(max_128, _mm_movehl_ps(max_128, max_128));
max_128 = _mm_max_ss(max_128, _mm_shuffle_ps(max_128, max_128, _MM_SHUFFLE(1, 1, 1, 1)));
float amax = _mm_cvtss_f32(max_128);

const float d = amax / 127.0f;
const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
y->d = CONVERT_F32_TO_F16(d);

const __m256 id_vec = _mm256_set1_ps(id);
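// The shuffle mask picks byte 0 of each 32-bit lane, i.e. the low byte of each rounded int32.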
const __m128i shuffle_mask = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, 12, 8, 4, 0
);

for (int j = 0; j < Q80_BLOCK_SIZE; j += 8) {
__m256 vec = _mm256_loadu_ps(x + j);
__m256 scaled = _mm256_mul_ps(vec, id_vec);
__m256 rounded = _mm256_round_ps(scaled, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
__m256i integers = _mm256_cvtps_epi32(rounded);
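// Narrow the eight int32 results to int8: keep the low byte of each lane, then merge both halves into one 8-byte store.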

__m128i low = _mm256_extracti128_si256(integers, 0);
__m128i high = _mm256_extracti128_si256(integers, 1);

__m128i low_bytes = _mm_shuffle_epi8(low, shuffle_mask);
__m128i high_bytes = _mm_shuffle_epi8(high, shuffle_mask);

uint32_t low_part = _mm_extract_epi32(low_bytes, 0);
uint32_t high_part = _mm_extract_epi32(high_bytes, 0);
uint64_t packed = (static_cast<uint64_t>(high_part) << 32) | low_part;
std::memcpy(y->qs + j, &packed, sizeof(packed));
}
}
#else
for (NnSize i = start; i < end; i++) {
const float *x = &input[i * Q80_BLOCK_SIZE];
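
The scalar fallback after the #else above is truncated in this view, but both paths compute the same thing per block: d = max|x| / 127 and qs[j] = round(x[j] / d). A minimal standalone sketch, assuming Q80_BLOCK_SIZE is 32 and using simplified stand-in types (kBlockSize and BlockQ80Sketch are illustrative, not the project's definitions):

#include <cmath>
#include <cstdint>

constexpr int kBlockSize = 32;      // assumed Q80_BLOCK_SIZE
struct BlockQ80Sketch {
    float d;                        // per-block scale (stored as F16 in the real NnBlockQ80)
    std::int8_t qs[kBlockSize];     // quantized values
};

void quantizeBlockQ80Sketch(const float *x, BlockQ80Sketch *y) {
    // Pass 1: maximum absolute value of the block.
    float amax = 0.0f;
    for (int j = 0; j < kBlockSize; j++)
        amax = std::fmax(amax, std::fabs(x[j]));

    const float d = amax / 127.0f;
    const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
    y->d = d;

    // Pass 2: scale and round each value into an int8.
    // nearbyintf uses the default round-to-nearest-even mode, like _MM_FROUND_TO_NEAREST_INT.
    for (int j = 0; j < kBlockSize; j++)
        y->qs[j] = (std::int8_t)std::nearbyintf(x[j] * id);
}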
src/nn/nn-quants.hpp: 24 additions, 20 deletions
@@ -5,6 +5,8 @@
#include <cstring>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#elif defined(__AVX2__)
#include <immintrin.h>
#endif

typedef std::uint8_t NnByte;
@@ -15,34 +17,36 @@ float convertF16toF32Impl(const NnFp16 value);
NnFp16 convertF32ToF16Impl(const float x);

#if defined(__ARM_NEON) && defined(__ARM_FP16_FORMAT_IEEE)
inline float convertF16ToF32Neon(const NnFp16 value) {
    __fp16 fp;
    std::memcpy(&fp, &value, sizeof(fp));
    return (float)fp;
}

inline NnFp16 convertF32ToF16Neon(const float x) {
    __fp16 h = x;
    return *(NnFp16 *)&h;
}

#define CONVERT_F16_TO_F32(value) convertF16ToF32Neon(value)
#define CONVERT_F32_TO_F16(value) convertF32ToF16Neon(value)
#elif defined(__F16C__)
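// F16C supplies only the F32 -> F16 direction here; F16 -> F32 falls through to the lookup table below.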
#define CONVERT_F32_TO_F16(v) _cvtss_sh((v), _MM_FROUND_TO_NEAREST_INT)
#endif

#if !defined(CONVERT_F16_TO_F32)
extern float f16ToF32Lookup[65536];

inline static float convertF16ToF32Lookup(const NnFp16 value) {
    return f16ToF32Lookup[value];
}

#define CONVERT_F16_TO_F32_LOOKUP
#define CONVERT_F16_TO_F32(value) convertF16ToF32Lookup(value)
#endif
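
The 65536-entry table itself is defined elsewhere in the project; presumably it is filled once at startup by decoding every 16-bit pattern through convertF16toF32Impl. A sketch under that assumption (initF16Lookup is a hypothetical name, not part of this diff):

// Hypothetical one-time initializer for the lookup fallback (not in this PR).
inline void initF16Lookup() {
    for (int v = 0; v < 65536; v++)
        f16ToF32Lookup[v] = convertF16toF32Impl((NnFp16)v);
}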

#if !defined(CONVERT_F32_TO_F16)
#define CONVERT_F32_TO_F16(value) convertF32ToF16Impl(value)
#endif

#define Q40_BLOCK_SIZE 32