@@ -105,6 +105,52 @@ void quantizeF32toQ80(const float *input, NnBlockQ80 *output, const NnSize n, co
105
105
vst1_lane_s32 ((int32_t *)(y->qs + j), vreinterpret_s32_s8 (vec_i8), 0 );
106
106
}
107
107
}
108
+ #elif defined(__AVX2__)
109
+ for (NnSize i = start; i < end; ++i) {
110
+ const float *x = input + i * Q80_BLOCK_SIZE;
111
+ NnBlockQ80 *y = output + i;
112
+
113
+ __m256 max_abs = _mm256_setzero_ps ();
114
+ for (int j = 0 ; j < Q80_BLOCK_SIZE; j += 8 ) {
115
+ __m256 vec = _mm256_loadu_ps (x + j);
116
+ __m256 abs_vec = _mm256_and_ps (vec, _mm256_castsi256_ps (_mm256_set1_epi32 (0x7FFFFFFF )));
117
+ max_abs = _mm256_max_ps (max_abs, abs_vec);
118
+ }
119
+ __m128 max_hi = _mm256_extractf128_ps (max_abs, 1 );
120
+ __m128 max_lo = _mm256_castps256_ps128 (max_abs);
121
+ __m128 max_128 = _mm_max_ps (max_hi, max_lo);
122
+ max_128 = _mm_max_ps (max_128, _mm_movehl_ps (max_128, max_128));
123
+ max_128 = _mm_max_ss (max_128, _mm_shuffle_ps (max_128, max_128, _MM_SHUFFLE (1 , 1 , 1 , 1 )));
124
+ float amax = _mm_cvtss_f32 (max_128);
125
+
126
+ const float d = amax / 127 .0f ;
127
+ const float id = (d != 0 .0f ) ? 1 .0f / d : 0 .0f ;
128
+ y->d = CONVERT_F32_TO_F16 (d);
129
+
130
+ const __m256 id_vec = _mm256_set1_ps (id);
131
+ const __m128i shuffle_mask = _mm_set_epi8 (
132
+ -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ,
133
+ -1 , -1 , -1 , -1 , 12 , 8 , 4 , 0
134
+ );
135
+
136
+ for (int j = 0 ; j < Q80_BLOCK_SIZE; j += 8 ) {
137
+ __m256 vec = _mm256_loadu_ps (x + j);
138
+ __m256 scaled = _mm256_mul_ps (vec, id_vec);
139
+ __m256 rounded = _mm256_round_ps (scaled, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
140
+ __m256i integers = _mm256_cvtps_epi32 (rounded);
141
+
142
+ __m128i low = _mm256_extracti128_si256 (integers, 0 );
143
+ __m128i high = _mm256_extracti128_si256 (integers, 1 );
144
+
145
+ __m128i low_bytes = _mm_shuffle_epi8 (low, shuffle_mask);
146
+ __m128i high_bytes = _mm_shuffle_epi8 (high, shuffle_mask);
147
+
148
+ uint32_t low_part = _mm_extract_epi32 (low_bytes, 0 );
149
+ uint32_t high_part = _mm_extract_epi32 (high_bytes, 0 );
150
+ uint64_t packed = (static_cast <uint64_t >(high_part) << 32 ) | low_part;
151
+ std::memcpy (y->qs + j, &packed, sizeof (packed));
152
+ }
153
+ }
108
154
#else
109
155
for (NnSize i = start; i < end; i++) {
110
156
const float *x = &input[i * Q80_BLOCK_SIZE];
0 commit comments