resolved comments

Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
gnuradio · Feb 29, 2024 · af7e8fc · af7e8fc
1 parent 319387d
commit af7e8fc
Show file tree

Hide file tree

Showing 3 changed files with 83 additions and 86 deletions.
diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h
@@ -16,27 +16,34 @@
 #define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
 #include <immintrin.h>
 
+////////////////////////////////////////////////////////////////////////
+// Place real parts of two complex vectors in output
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
 static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
 {
     const __m512i idx =
         _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
     return _mm512_permutex2var_ps(z1, idx, z2);
 }
 
+////////////////////////////////////////////////////////////////////////
+// Place imaginary parts of two complex vectors in output
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
 static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
 {
     const __m512i idx =
         _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
     return _mm512_permutex2var_ps(z1, idx, z2);
 }
 
-/*
- * Approximate arctan(x) via polynomial expansion
- * on the interval [-1, 1]
- *
- * Maximum relative error ~6.5e-7
- * Polynomial evaluated via Horner's method
- */
+////////////////////////////////////////////////////////////////////////
+// Approximate arctan(x) via polynomial expansion on the interval [-1, 1]
+// Maximum relative error ~6.5e-7
+// Polynomial evaluated via Horner's method
+// Requires AVX512F
+////////////////////////////////////////////////////////////////////////
 static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
 {
     const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);

diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
@@ -63,8 +63,7 @@
 static inline void
 volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
 {
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
+    for (unsigned int number = 0; number < num_points; number++) {
         *out++ = atanf(*in++);
     }
 }
@@ -74,8 +73,7 @@ volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
 static inline void
 volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
 {
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
+    for (unsigned int number = 0; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -85,17 +83,18 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_point
 #include <immintrin.h>
 #include <volk/volk_avx512_intrinsics.h>
 static inline void
-volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points)
+volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points)
 {
     const __m512 one = _mm512_set1_ps(1.f);
     const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
     const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
     const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int sixteenth_points = num_points / 16;
-    for (; number < sixteenth_points; number++) {
+    const unsigned int sixteenth_points = num_points / 16;
+
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
         __m512 x = _mm512_load_ps(in);
+        in += 16;
         __mmask16 swap_mask =
             _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
@@ -106,16 +105,14 @@ volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points)
         term = _mm512_sub_ps(term, result);
         result = _mm512_mask_blend_ps(swap_mask, result, term);
         _mm512_store_ps(out, result);
-        in += 16;
         out += 16;
     }
 
-    number = sixteenth_points * 16;
-    for (; number < num_points; number++) {
+    for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
-#endif /* LV_HAVE_AVX512F for aligned */
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
 
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
@@ -128,10 +125,11 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
-    for (; number < eighth_points; number++) {
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
+        in += 8;
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
@@ -141,12 +139,10 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
         term = _mm256_sub_ps(term, result);
         result = _mm256_blendv_ps(result, term, swap_mask);
         _mm256_store_ps(out, result);
-        in += 8;
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -163,10 +159,11 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
-    for (; number < eighth_points; number++) {
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_load_ps(in);
+        in += 8;
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
@@ -176,12 +173,10 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
         term = _mm256_sub_ps(term, result);
         result = _mm256_blendv_ps(result, term, swap_mask);
         _mm256_store_ps(out, result);
-        in += 8;
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -198,10 +193,11 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
     const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
     const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int quarter_points = num_points / 4;
-    for (; number < quarter_points; number++) {
+    const unsigned int quarter_points = num_points / 4;
+
+    for (unsigned int number = 0; number < quarter_points; number++) {
         __m128 x = _mm_load_ps(in);
+        in += 4;
         __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
         __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                    _mm_blendv_ps(one, x, swap_mask));
@@ -211,12 +207,10 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
         term = _mm_sub_ps(term, result);
         result = _mm_blendv_ps(result, term, swap_mask);
         _mm_store_ps(out, result);
-        in += 4;
         out += 4;
     }
 
-    number = quarter_points * 4;
-    for (; number < num_points; number++) {
+    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -230,17 +224,18 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
 #include <immintrin.h>
 #include <volk/volk_avx512_intrinsics.h>
 static inline void
-volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points)
+volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points)
 {
     const __m512 one = _mm512_set1_ps(1.f);
     const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
     const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
     const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int sixteenth_points = num_points / 16;
-    for (; number < sixteenth_points; number++) {
+    const unsigned int sixteenth_points = num_points / 16;
+
+    for (unsigned int number = 0; number < sixteenth_points; number++) {
         __m512 x = _mm512_loadu_ps(in);
+        in += 16;
         __mmask16 swap_mask =
             _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
@@ -251,16 +246,14 @@ volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points)
         term = _mm512_sub_ps(term, result);
         result = _mm512_mask_blend_ps(swap_mask, result, term);
         _mm512_storeu_ps(out, result);
-        in += 16;
         out += 16;
     }
 
-    number = sixteenth_points * 16;
-    for (; number < num_points; number++) {
+    for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
-#endif /* LV_HAVE_AVX512F for unaligned */
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
 
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
@@ -272,10 +265,11 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
-    for (; number < eighth_points; number++) {
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
+        in += 8;
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
@@ -285,12 +279,10 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
         term = _mm256_sub_ps(term, result);
         result = _mm256_blendv_ps(result, term, swap_mask);
         _mm256_storeu_ps(out, result);
-        in += 8;
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -306,10 +298,11 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
     const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
     const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int eighth_points = num_points / 8;
-    for (; number < eighth_points; number++) {
+    const unsigned int eighth_points = num_points / 8;
+
+    for (unsigned int number = 0; number < eighth_points; number++) {
         __m256 x = _mm256_loadu_ps(in);
+        in += 8;
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
@@ -319,12 +312,10 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
         term = _mm256_sub_ps(term, result);
         result = _mm256_blendv_ps(result, term, swap_mask);
         _mm256_storeu_ps(out, result);
-        in += 8;
         out += 8;
     }
 
-    number = eighth_points * 8;
-    for (; number < num_points; number++) {
+    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }
@@ -341,10 +332,11 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
     const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
     const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
-    unsigned int number = 0;
-    unsigned int quarter_points = num_points / 4;
-    for (; number < quarter_points; number++) {
+    const unsigned int quarter_points = num_points / 4;
+
+    for (unsigned int number = 0; number < quarter_points; number++) {
         __m128 x = _mm_loadu_ps(in);
+        in += 4;
         __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
         __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                    _mm_blendv_ps(one, x, swap_mask));
@@ -354,12 +346,10 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
         term = _mm_sub_ps(term, result);
         result = _mm_blendv_ps(result, term, swap_mask);
         _mm_storeu_ps(out, result);
-        in += 4;
         out += 4;
     }
 
-    number = quarter_points * 4;
-    for (; number < num_points; number++) {
+    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
         *out++ = volk_arctan(*in++);
     }
 }