From af7e8fcc1f919d4ca6028fc2effe3f9f768a587e Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Thu, 29 Feb 2024 23:24:35 +0100 Subject: [PATCH] resolved comments Signed-off-by: Magnus Lundmark --- include/volk/volk_avx512_intrinsics.h | 21 +++-- kernels/volk/volk_32f_atan_32f.h | 102 +++++++++++------------- kernels/volk/volk_32fc_s32f_atan2_32f.h | 46 +++++------ 3 files changed, 83 insertions(+), 86 deletions(-) diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h index 0bac1c6a..6f6a05ee 100644 --- a/include/volk/volk_avx512_intrinsics.h +++ b/include/volk/volk_avx512_intrinsics.h @@ -16,6 +16,10 @@ #define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ #include +//////////////////////////////////////////////////////////////////////// +// Place real parts of two complex vectors in output +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) { const __m512i idx = @@ -23,6 +27,10 @@ static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) return _mm512_permutex2var_ps(z1, idx, z2); } +//////////////////////////////////////////////////////////////////////// +// Place imaginary parts of two complex vectors in output +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) { const __m512i idx = @@ -30,13 +38,12 @@ static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) return _mm512_permutex2var_ps(z1, idx, z2); } -/* - * Approximate arctan(x) via polynomial expansion - * on the interval [-1, 1] - * - * Maximum relative error ~6.5e-7 - * Polynomial evaluated via Horner's method - */ +//////////////////////////////////////////////////////////////////////// +// Approximate arctan(x) via polynomial expansion on the interval [-1, 1] +// Maximum relative error ~6.5e-7 +// Polynomial evaluated via Horner's method +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_arctan_poly_avx512(const __m512 x) { const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f); diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index ec078826..03afea55 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -63,8 +63,7 @@ static inline void volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; - for (; number < num_points; number++) { + for (unsigned int number = 0; number < num_points; number++) { *out++ = atanf(*in++); } } @@ -74,8 +73,7 @@ volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) static inline void volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; - for (; number < num_points; number++) { + for (unsigned int number = 0; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -85,17 +83,18 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_point #include #include static inline void -volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) +volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points) { const __m512 one = _mm512_set1_ps(1.f); const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_load_ps(in); + in += 16; __mmask16 swap_mask = _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), @@ -106,16 +105,14 @@ volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) term = _mm512_sub_ps(term, result); result = _mm512_mask_blend_ps(swap_mask, result, term); _mm512_store_ps(out, result); - in += 16; out += 16; } - number = sixteenth_points * 16; - for (; number < num_points; number++) { + for (unsigned int number = sixteenth_points * 16; number < num_points; number++) { *out++ = volk_arctan(*in++); } } -#endif /* LV_HAVE_AVX512F for aligned */ +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include @@ -128,10 +125,11 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -141,12 +139,10 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_store_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -163,10 +159,11 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -176,12 +173,10 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_store_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -198,10 +193,11 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - for (; number < quarter_points; number++) { + const unsigned int quarter_points = num_points / 4; + + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_load_ps(in); + in += 4; __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), _mm_blendv_ps(one, x, swap_mask)); @@ -211,12 +207,10 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) term = _mm_sub_ps(term, result); result = _mm_blendv_ps(result, term, swap_mask); _mm_store_ps(out, result); - in += 4; out += 4; } - number = quarter_points * 4; - for (; number < num_points; number++) { + for (unsigned int number = quarter_points * 4; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -230,17 +224,18 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) #include #include static inline void -volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) +volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points) { const __m512 one = _mm512_set1_ps(1.f); const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_loadu_ps(in); + in += 16; __mmask16 swap_mask = _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), @@ -251,16 +246,14 @@ volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) term = _mm512_sub_ps(term, result); result = _mm512_mask_blend_ps(swap_mask, result, term); _mm512_storeu_ps(out, result); - in += 16; out += 16; } - number = sixteenth_points * 16; - for (; number < num_points; number++) { + for (unsigned int number = sixteenth_points * 16; number < num_points; number++) { *out++ = volk_arctan(*in++); } } -#endif /* LV_HAVE_AVX512F for unaligned */ +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include @@ -272,10 +265,11 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -285,12 +279,10 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_storeu_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -306,10 +298,11 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -319,12 +312,10 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_storeu_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -341,10 +332,11 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - for (; number < quarter_points; number++) { + const unsigned int quarter_points = num_points / 4; + + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_loadu_ps(in); + in += 4; __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), _mm_blendv_ps(one, x, swap_mask)); @@ -354,12 +346,10 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) term = _mm_sub_ps(term, result); result = _mm_blendv_ps(result, term, swap_mask); _mm_storeu_ps(out, result); - in += 4; out += 4; } - number = quarter_points * 4; - for (; number < num_points; number++) { + for (unsigned int number = quarter_points * 4; number < num_points; number++) { *out++ = volk_arctan(*in++); } } diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 41492541..5e8be5ce 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -72,8 +72,8 @@ static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.f / normalizeFactor; - unsigned int number = 0; - for (; number < num_points; number++) { + + for (unsigned int number = 0; number < num_points; number++) { const float real = *inPtr++; const float imag = *inPtr++; *outPtr++ = atan2f(imag, real) * invNormalizeFactor; @@ -91,8 +91,8 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector, float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.f / normalizeFactor; - unsigned int number = 0; - for (; number < num_points; number++) { + + for (unsigned int number = 0; number < num_points; number++) { const float x = *inPtr++; const float y = *inPtr++; *outPtr++ = volk_atan2(y, x) * invNormalizeFactor; @@ -103,10 +103,10 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector, #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ #include #include -static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, - const lv_32fc_t* complexVector, - const float normalizeFactor, - unsigned int num_points) +static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) { const float* in = (float*)complexVector; float* out = (float*)outputVector; @@ -120,7 +120,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, const __m512 zero = _mm512_setzero_ps(); unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; + const unsigned int sixteenth_points = num_points / 16; for (; number < sixteenth_points; number++) { __m512 z1 = _mm512_load_ps(in); in += 16; @@ -156,7 +156,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, number = sixteenth_points * 16; volk_32fc_s32f_atan2_32f_polynomial( - out, (lv_32fc_t*)in, normalizeFactor, num_points - number); + out, complexVector + number, normalizeFactor, num_points - number); } #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ @@ -180,7 +180,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_load_ps(in); in += 8; @@ -240,7 +240,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_load_ps(in); in += 8; @@ -287,10 +287,10 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ #include #include -static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, - const lv_32fc_t* complexVector, - const float normalizeFactor, - unsigned int num_points) +static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) { const float* in = (float*)complexVector; float* out = (float*)outputVector; @@ -303,9 +303,9 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); const __m512 zero = _mm512_setzero_ps(); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 z1 = _mm512_loadu_ps(in); in += 16; __m512 z2 = _mm512_loadu_ps(in); @@ -338,9 +338,9 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, out += 16; } - number = sixteenth_points * 16; + unsigned int number = sixteenth_points * 16; volk_32fc_s32f_atan2_32f_polynomial( - out, (lv_32fc_t*)in, normalizeFactor, num_points - number); + out, complexVector + number, normalizeFactor, num_points - number); } #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ @@ -364,7 +364,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_loadu_ps(in); in += 8; @@ -424,7 +424,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_loadu_ps(in); in += 8;