Skip to content

Commit 415946f

Browse files
authoredFeb 17, 2025
feat: quantizeF32toQ80 avx2. (#171)
1 parent 5b4e682 commit 415946f

File tree

3 files changed

+80
-20
lines changed

3 files changed

+80
-20
lines changed
 

‎src/nn/nn-cpu-ops-test.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,15 @@ void testSplitThreads() {
6969
printf("✅ %24s passed\n", "splitThreads");
7070
}
7171

72+
void testConvertF32toF16() {
73+
float x[] = {0.0f, 0.25f, 0.3456f, 1.0f};
74+
for (NnSize i = 0; i < sizeof(x) / sizeof(float); i++) {
75+
NnFp16 f16 = CONVERT_F32_TO_F16(x[i]);
76+
float f32 = CONVERT_F16_TO_F32(f16);
77+
compare_F32("convertF32toF16", &x[i], &f32, 1, 0.0005);
78+
}
79+
}
80+
7281
// quantization
7382
void testQuantization(const NnSize m) {
7483
std::vector<float> a(m * Q40_BLOCK_SIZE);
@@ -285,6 +294,7 @@ int main() {
285294

286295
printCpuInstructionSet();
287296
testSplitThreads();
297+
testConvertF32toF16();
288298
testQuantization(32);
289299
testQuantization(2);
290300
testQuantization(1);

‎src/nn/nn-quants.cpp

+46
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,52 @@ void quantizeF32toQ80(const float *input, NnBlockQ80 *output, const NnSize n, co
105105
vst1_lane_s32((int32_t *)(y->qs + j), vreinterpret_s32_s8(vec_i8), 0);
106106
}
107107
}
108+
#elif defined(__AVX2__)
109+
for (NnSize i = start; i < end; ++i) {
110+
const float *x = input + i * Q80_BLOCK_SIZE;
111+
NnBlockQ80 *y = output + i;
112+
113+
__m256 max_abs = _mm256_setzero_ps();
114+
for (int j = 0; j < Q80_BLOCK_SIZE; j += 8) {
115+
__m256 vec = _mm256_loadu_ps(x + j);
116+
__m256 abs_vec = _mm256_and_ps(vec, _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)));
117+
max_abs = _mm256_max_ps(max_abs, abs_vec);
118+
}
119+
__m128 max_hi = _mm256_extractf128_ps(max_abs, 1);
120+
__m128 max_lo = _mm256_castps256_ps128(max_abs);
121+
__m128 max_128 = _mm_max_ps(max_hi, max_lo);
122+
max_128 = _mm_max_ps(max_128, _mm_movehl_ps(max_128, max_128));
123+
max_128 = _mm_max_ss(max_128, _mm_shuffle_ps(max_128, max_128, _MM_SHUFFLE(1, 1, 1, 1)));
124+
float amax = _mm_cvtss_f32(max_128);
125+
126+
const float d = amax / 127.0f;
127+
const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
128+
y->d = CONVERT_F32_TO_F16(d);
129+
130+
const __m256 id_vec = _mm256_set1_ps(id);
131+
const __m128i shuffle_mask = _mm_set_epi8(
132+
-1, -1, -1, -1, -1, -1, -1, -1,
133+
-1, -1, -1, -1, 12, 8, 4, 0
134+
);
135+
136+
for (int j = 0; j < Q80_BLOCK_SIZE; j += 8) {
137+
__m256 vec = _mm256_loadu_ps(x + j);
138+
__m256 scaled = _mm256_mul_ps(vec, id_vec);
139+
__m256 rounded = _mm256_round_ps(scaled, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
140+
__m256i integers = _mm256_cvtps_epi32(rounded);
141+
142+
__m128i low = _mm256_extracti128_si256(integers, 0);
143+
__m128i high = _mm256_extracti128_si256(integers, 1);
144+
145+
__m128i low_bytes = _mm_shuffle_epi8(low, shuffle_mask);
146+
__m128i high_bytes = _mm_shuffle_epi8(high, shuffle_mask);
147+
148+
uint32_t low_part = _mm_extract_epi32(low_bytes, 0);
149+
uint32_t high_part = _mm_extract_epi32(high_bytes, 0);
150+
uint64_t packed = (static_cast<uint64_t>(high_part) << 32) | low_part;
151+
std::memcpy(y->qs + j, &packed, sizeof(packed));
152+
}
153+
}
108154
#else
109155
for (NnSize i = start; i < end; i++) {
110156
const float *x = &input[i * Q80_BLOCK_SIZE];

‎src/nn/nn-quants.hpp

+24-20
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#include <cstring>
66
#if defined(__ARM_NEON)
77
#include <arm_neon.h>
8+
#elif defined(__AVX2__)
9+
#include <immintrin.h>
810
#endif
911

1012
typedef std::uint8_t NnByte;
@@ -15,34 +17,36 @@ float convertF16toF32Impl(const NnFp16 value);
1517
NnFp16 convertF32ToF16Impl(const float x);
1618

1719
#if defined(__ARM_NEON) && defined(__ARM_FP16_FORMAT_IEEE)
18-
inline float convertF16ToF32Neon(const NnFp16 value) {
19-
__fp16 fp;
20-
std::memcpy(&fp, &value, sizeof(fp));
21-
return (float)fp;
22-
}
23-
24-
inline NnFp16 convertF32ToF16Neon(const float x) {
25-
__fp16 h = x;
26-
return *(NnFp16 *)&h;
27-
}
28-
29-
#define CONVERT_F16_TO_F32(value) convertF16ToF32Neon(value)
30-
#define CONVERT_F32_TO_F16(value) convertF32ToF16Neon(value)
20+
inline float convertF16ToF32Neon(const NnFp16 value) {
21+
__fp16 fp;
22+
std::memcpy(&fp, &value, sizeof(fp));
23+
return (float)fp;
24+
}
25+
26+
inline NnFp16 convertF32ToF16Neon(const float x) {
27+
__fp16 h = x;
28+
return *(NnFp16 *)&h;
29+
}
30+
31+
#define CONVERT_F16_TO_F32(value) convertF16ToF32Neon(value)
32+
#define CONVERT_F32_TO_F16(value) convertF32ToF16Neon(value)
33+
#elif defined(__F16C__)
34+
#define CONVERT_F32_TO_F16(v) _cvtss_sh((v), _MM_FROUND_TO_NEAREST_INT)
3135
#endif
3236

3337
#if !defined(CONVERT_F16_TO_F32)
34-
extern float f16ToF32Lookup[65536];
38+
extern float f16ToF32Lookup[65536];
3539

36-
inline static float convertF16ToF32Lookup(const NnFp16 value) {
37-
return f16ToF32Lookup[value];
38-
}
40+
inline static float convertF16ToF32Lookup(const NnFp16 value) {
41+
return f16ToF32Lookup[value];
42+
}
3943

40-
#define CONVERT_F16_TO_F32_LOOKUP
41-
#define CONVERT_F16_TO_F32(value) convertF16ToF32Lookup(value)
44+
#define CONVERT_F16_TO_F32_LOOKUP
45+
#define CONVERT_F16_TO_F32(value) convertF16ToF32Lookup(value)
4246
#endif
4347

4448
#if !defined(CONVERT_F32_TO_F16)
45-
#define CONVERT_F32_TO_F16(value) convertF32ToF16Impl(value)
49+
#define CONVERT_F32_TO_F16(value) convertF32ToF16Impl(value)
4650
#endif
4751

4852
#define Q40_BLOCK_SIZE 32

0 commit comments

Comments
 (0)
Failed to load comments.