|
44 | 44 | ))]
|
45 | 45 | 8 => transize(vtbl1_u8, self, idxs),
|
46 | 46 | #[cfg(target_feature = "ssse3")]
|
47 |
| - 16 => transize(x86::_mm_shuffle_epi8, self, idxs), |
| 47 | + 16 => transize(x86::_mm_shuffle_epi8, self, zeroing_idxs(idxs)), |
48 | 48 | #[cfg(target_feature = "simd128")]
|
49 | 49 | 16 => transize(wasm::i8x16_swizzle, self, idxs),
|
50 | 50 | #[cfg(all(
|
|
54 | 54 | ))]
|
55 | 55 | 16 => transize(vqtbl1q_u8, self, idxs),
|
56 | 56 | #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
|
57 |
| - 32 => transize_raw(avx2_pshufb, self, idxs), |
| 57 | + 32 => transize(avx2_pshufb, self, idxs), |
58 | 58 | #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
|
59 |
| - 32 => transize(x86::_mm256_permutexvar_epi8, self, idxs), |
| 59 | + 32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self), |
60 | 60 | // Notable absence: avx512bw shuffle
|
61 | 61 | // If avx512bw is available, odds of avx512vbmi are good
|
62 | 62 | // FIXME: initial AVX512VBMI variant didn't actually pass muster
|
@@ -129,45 +129,25 @@ unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
|
129 | 129 | #[inline(always)]
|
130 | 130 | unsafe fn transize<T, const N: usize>(
|
131 | 131 | f: unsafe fn(T, T) -> T,
|
132 |
| - bytes: Simd<u8, N>, |
133 |
| - idxs: Simd<u8, N>, |
| 132 | + a: Simd<u8, N>, |
| 133 | + b: Simd<u8, N>, |
134 | 134 | ) -> Simd<u8, N>
|
135 | 135 | where
|
136 | 136 | LaneCount<N>: SupportedLaneCount,
|
137 | 137 | {
|
138 |
| - let idxs = zeroing_idxs(idxs); |
139 | 138 | // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
|
140 |
| - unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) } |
| 139 | + unsafe { mem::transmute_copy(&f(mem::transmute_copy(&a), mem::transmute_copy(&b))) } |
141 | 140 | }
|
142 | 141 |
|
143 |
| -/// Make indices that yield 0 for this architecture |
| 142 | +/// Make indices that yield 0 for x86 |
| 143 | +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| 144 | +#[allow(unused)] |
144 | 145 | #[inline(always)]
|
145 | 146 | fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
|
146 | 147 | where
|
147 | 148 | LaneCount<N>: SupportedLaneCount,
|
148 | 149 | {
|
149 |
| - // On x86, make sure the top bit is set. |
150 |
| - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
151 |
| - let idxs = { |
152 |
| - use crate::simd::cmp::SimdPartialOrd; |
153 |
| - idxs.simd_lt(Simd::splat(N as u8)) |
154 |
| - .select(idxs, Simd::splat(u8::MAX)) |
155 |
| - }; |
156 |
| - // Simply do nothing on most architectures. |
157 |
| - idxs |
158 |
| -} |
159 |
| - |
160 |
| -/// As transize but no implicit call to `zeroing_idxs`. |
161 |
| -#[allow(dead_code)] |
162 |
| -#[inline(always)] |
163 |
| -unsafe fn transize_raw<T, const N: usize>( |
164 |
| - f: unsafe fn(T, T) -> T, |
165 |
| - bytes: Simd<u8, N>, |
166 |
| - idxs: Simd<u8, N>, |
167 |
| -) -> Simd<u8, N> |
168 |
| -where |
169 |
| - LaneCount<N>: SupportedLaneCount, |
170 |
| -{ |
171 |
| - // SAFETY: Same obligation to use this function as to use mem::transmute_copy. |
172 |
| - unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) } |
| 150 | + use crate::simd::cmp::SimdPartialOrd; |
| 151 | + idxs.simd_lt(Simd::splat(N as u8)) |
| 152 | + .select(idxs, Simd::splat(u8::MAX)) |
173 | 153 | }
|
0 commit comments