@@ -2169,6 +2169,17 @@ __declspec(noalias) size_t
2169
2169
2170
2170
#ifndef _M_ARM64EC
2171
2171
namespace {
2172
+ __m256i __forceinline _Bitset_to_string_1_step_avx (const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) {
2173
+ const __m128i _Vx0 = _mm_cvtsi32_si128 (_Val);
2174
+ const __m128i _Vx1 = _mm_shuffle_epi8 (_Vx0, _mm_set_epi32 (0x00000000 , 0x01010101 , 0x02020202 , 0x03030303 ));
2175
+ const __m256i _Vx2 = _mm256_castsi128_si256 (_Vx1);
2176
+ const __m256i _Vx3 = _mm256_permutevar8x32_epi32 (_Vx2, _mm256_set_epi32 (3 , 3 , 2 , 2 , 1 , 1 , 0 , 0 ));
2177
+ const __m256i _Msk = _mm256_and_si256 (_Vx3, _mm256_set1_epi64x (0x0102040810204080 ));
2178
+ const __m256i _Ex0 = _mm256_cmpeq_epi8 (_Msk, _mm256_setzero_si256 ());
2179
+ const __m256i _Ex1 = _mm256_blendv_epi8 (_Px1, _Px0, _Ex0);
2180
+ return _Ex1;
2181
+ }
2182
+
2172
2183
__m128i __forceinline _Bitset_to_string_1_step (const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) {
2173
2184
const __m128i _Vx0 = _mm_cvtsi32_si128 (_Val);
2174
2185
const __m128i _Vx1 = _mm_unpacklo_epi8 (_Vx0, _Vx0);
@@ -2180,6 +2191,18 @@ namespace {
2180
2191
return _Ex1;
2181
2192
}
2182
2193
2194
+ __m256i __forceinline _Bitset_to_string_2_step_avx (const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) {
2195
+ const __m128i _Vx0 = _mm_cvtsi32_si128 (_Val);
2196
+ const __m128i _Vx1 = _mm_shuffle_epi8 (_Vx0, _mm_set_epi32 (0x00000000 , 0x00000000 , 0x01010101 , 0x01010101 ));
2197
+ const __m256i _Vx2 = _mm256_castsi128_si256 (_Vx1);
2198
+ const __m256i _Vx3 = _mm256_permute4x64_epi64 (_Vx2, _MM_SHUFFLE (1 , 1 , 0 , 0 ));
2199
+ const __m256i _Msk = _mm256_and_si256 (
2200
+ _Vx3, _mm256_set_epi64x (0x0001000200040008 , 0x0010002000400080 , 0x0001000200040008 , 0x0010002000400080 ));
2201
+ const __m256i _Ex0 = _mm256_cmpeq_epi16 (_Msk, _mm256_setzero_si256 ());
2202
+ const __m256i _Ex1 = _mm256_blendv_epi8 (_Px1, _Px0, _Ex0);
2203
+ return _Ex1;
2204
+ }
2205
+
2183
2206
__m128i __forceinline _Bitset_to_string_2_step (const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) {
2184
2207
const __m128i _Vx = _mm_set1_epi16 (_Val);
2185
2208
const __m128i _Msk = _mm_and_si128 (_Vx, _mm_set_epi64x (0x0001000200040008 , 0x0010002000400080 ));
@@ -2195,6 +2218,38 @@ extern "C" {
2195
2218
__declspec (noalias) void __stdcall __std_bitset_to_string_1(
2196
2219
char * const _Dest, const void * _Src, size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
2197
2220
#ifndef _M_ARM64EC
2221
+ if (_Use_avx2 () && _Size_bits >= 256 ) {
2222
+ const __m256i _Px0 = _mm256_broadcastb_epi8 (_mm_cvtsi32_si128 (_Elem0));
2223
+ const __m256i _Px1 = _mm256_broadcastb_epi8 (_mm_cvtsi32_si128 (_Elem1));
2224
+ if (_Size_bits >= 32 ) {
2225
+ char * _Pos = _Dest + _Size_bits;
2226
+ _Size_bits &= 0x1F ;
2227
+ char * const _Stop_at = _Dest + _Size_bits;
2228
+ do {
2229
+ uint32_t _Val;
2230
+ memcpy (&_Val, _Src, 4 );
2231
+ const __m256i _Elems = _Bitset_to_string_1_step_avx (_Val, _Px0, _Px1);
2232
+ _Pos -= 32 ;
2233
+ _mm256_storeu_si256 (reinterpret_cast <__m256i*>(_Pos), _Elems);
2234
+ _Advance_bytes (_Src, 4 );
2235
+ } while (_Pos != _Stop_at);
2236
+ }
2237
+
2238
+ if (_Size_bits > 0 ) {
2239
+ __assume (_Size_bits < 32 );
2240
+ uint32_t _Val = 0 ;
2241
+ memcpy (&_Val, _Src, (_Size_bits + 7 ) / 8 );
2242
+ const __m256i _Elems = _Bitset_to_string_1_step_avx (_Val, _Px0, _Px1);
2243
+ char _Tmp[32 ];
2244
+ _mm256_storeu_si256 (reinterpret_cast <__m256i*>(_Tmp), _Elems);
2245
+ const char * const _Tmpd = _Tmp + (32 - _Size_bits);
2246
+ memcpy (_Dest, _Tmpd, _Size_bits);
2247
+ }
2248
+
2249
+ _mm256_zeroupper (); // TRANSITION, DevCom-10331414
2250
+ return ;
2251
+ }
2252
+
2198
2253
if (_Use_sse2 ()) {
2199
2254
const __m128i _Px0 = _mm_set1_epi8 (_Elem0 ^ _Elem1);
2200
2255
const __m128i _Px1 = _mm_set1_epi8 (_Elem1);
@@ -2241,6 +2296,43 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_1(
2241
2296
__declspec (noalias) void __stdcall __std_bitset_to_string_2(
2242
2297
wchar_t * const _Dest, const void * _Src, size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
2243
2298
#ifndef _M_ARM64EC
2299
+ if (_Use_avx2 () && _Size_bits >= 256 ) {
2300
+ const __m256i _Px0 = _mm256_broadcastw_epi16 (_mm_cvtsi32_si128 (_Elem0));
2301
+ const __m256i _Px1 = _mm256_broadcastw_epi16 (_mm_cvtsi32_si128 (_Elem1));
2302
+
2303
+ if (_Size_bits >= 16 ) {
2304
+ wchar_t * _Pos = _Dest + _Size_bits;
2305
+ _Size_bits &= 0xF ;
2306
+ wchar_t * const _Stop_at = _Dest + _Size_bits;
2307
+ do {
2308
+ uint16_t _Val;
2309
+ memcpy (&_Val, _Src, 2 );
2310
+ const __m256i _Elems = _Bitset_to_string_2_step_avx (_Val, _Px0, _Px1);
2311
+ _Pos -= 16 ;
2312
+ _mm256_storeu_si256 (reinterpret_cast <__m256i*>(_Pos), _Elems);
2313
+ _Advance_bytes (_Src, 2 );
2314
+ } while (_Pos != _Stop_at);
2315
+ }
2316
+
2317
+ if (_Size_bits > 0 ) {
2318
+ __assume (_Size_bits < 16 );
2319
+ uint16_t _Val;
2320
+ if (_Size_bits > 8 ) {
2321
+ memcpy (&_Val, _Src, 2 );
2322
+ } else {
2323
+ _Val = *reinterpret_cast <const uint8_t *>(_Src);
2324
+ }
2325
+ const __m256i _Elems = _Bitset_to_string_2_step_avx (_Val, _Px0, _Px1);
2326
+ wchar_t _Tmp[16 ];
2327
+ _mm256_storeu_si256 (reinterpret_cast <__m256i*>(_Tmp), _Elems);
2328
+ const wchar_t * const _Tmpd = _Tmp + (16 - _Size_bits);
2329
+ memcpy (_Dest, _Tmpd, _Size_bits * 2 );
2330
+ }
2331
+
2332
+ _mm256_zeroupper (); // TRANSITION, DevCom-10331414
2333
+ return ;
2334
+ }
2335
+
2244
2336
if (_Use_sse2 ()) {
2245
2337
const __m128i _Px0 = _mm_set1_epi16 (_Elem0 ^ _Elem1);
2246
2338
const __m128i _Px1 = _mm_set1_epi16 (_Elem1);
0 commit comments