Skip to content

Commit

Permalink
resolved comments
Browse files Browse the repository at this point in the history
Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
  • Loading branch information
Ka-zam committed Feb 29, 2024
1 parent 319387d commit af7e8fc
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 86 deletions.
21 changes: 14 additions & 7 deletions include/volk/volk_avx512_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,34 @@
#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#include <immintrin.h>

////////////////////////////////////////////////////////////////////////
// Place real parts of two complex vectors in output
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
{
const __m512i idx =
_mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
return _mm512_permutex2var_ps(z1, idx, z2);
}

////////////////////////////////////////////////////////////////////////
// Place imaginary parts of two complex vectors in output
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
{
const __m512i idx =
_mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
return _mm512_permutex2var_ps(z1, idx, z2);
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
*
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
////////////////////////////////////////////////////////////////////////
// Approximate arctan(x) via polynomial expansion on the interval [-1, 1]
// Maximum relative error ~6.5e-7
// Polynomial evaluated via Horner's method
// Requires AVX512F
////////////////////////////////////////////////////////////////////////
static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
{
const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
Expand Down
102 changes: 46 additions & 56 deletions kernels/volk/volk_32f_atan_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@
static inline void
volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
for (; number < num_points; number++) {
for (unsigned int number = 0; number < num_points; number++) {
*out++ = atanf(*in++);
}
}
Expand All @@ -74,8 +73,7 @@ volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
static inline void
volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
{
unsigned int number = 0;
for (; number < num_points; number++) {
for (unsigned int number = 0; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
Expand All @@ -85,17 +83,18 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_point
#include <immintrin.h>
#include <volk/volk_avx512_intrinsics.h>
static inline void
volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points)
volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points)
{
const __m512 one = _mm512_set1_ps(1.f);
const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));

unsigned int number = 0;
unsigned int sixteenth_points = num_points / 16;
for (; number < sixteenth_points; number++) {
const unsigned int sixteenth_points = num_points / 16;

for (unsigned int number = 0; number < sixteenth_points; number++) {
__m512 x = _mm512_load_ps(in);
in += 16;
__mmask16 swap_mask =
_mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
Expand All @@ -106,16 +105,14 @@ volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points)
term = _mm512_sub_ps(term, result);
result = _mm512_mask_blend_ps(swap_mask, result, term);
_mm512_store_ps(out, result);
in += 16;
out += 16;
}

number = sixteenth_points * 16;
for (; number < num_points; number++) {
for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
#endif /* LV_HAVE_AVX512F for aligned */
#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
Expand All @@ -128,10 +125,11 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
const unsigned int eighth_points = num_points / 8;

for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
Expand All @@ -141,12 +139,10 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
term = _mm256_sub_ps(term, result);
result = _mm256_blendv_ps(result, term, swap_mask);
_mm256_store_ps(out, result);
in += 8;
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
for (unsigned int number = eighth_points * 8; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
Expand All @@ -163,10 +159,11 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
const unsigned int eighth_points = num_points / 8;

for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
Expand All @@ -176,12 +173,10 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
term = _mm256_sub_ps(term, result);
result = _mm256_blendv_ps(result, term, swap_mask);
_mm256_store_ps(out, result);
in += 8;
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
for (unsigned int number = eighth_points * 8; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
Expand All @@ -198,10 +193,11 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
for (; number < quarter_points; number++) {
const unsigned int quarter_points = num_points / 4;

for (unsigned int number = 0; number < quarter_points; number++) {
__m128 x = _mm_load_ps(in);
in += 4;
__m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
__m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
_mm_blendv_ps(one, x, swap_mask));
Expand All @@ -211,12 +207,10 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
term = _mm_sub_ps(term, result);
result = _mm_blendv_ps(result, term, swap_mask);
_mm_store_ps(out, result);
in += 4;
out += 4;
}

number = quarter_points * 4;
for (; number < num_points; number++) {
for (unsigned int number = quarter_points * 4; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
Expand All @@ -230,17 +224,18 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
#include <immintrin.h>
#include <volk/volk_avx512_intrinsics.h>
static inline void
volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points)
volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points)
{
const __m512 one = _mm512_set1_ps(1.f);
const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));

unsigned int number = 0;
unsigned int sixteenth_points = num_points / 16;
for (; number < sixteenth_points; number++) {
const unsigned int sixteenth_points = num_points / 16;

for (unsigned int number = 0; number < sixteenth_points; number++) {
__m512 x = _mm512_loadu_ps(in);
in += 16;
__mmask16 swap_mask =
_mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
Expand All @@ -251,16 +246,14 @@ volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points)
term = _mm512_sub_ps(term, result);
result = _mm512_mask_blend_ps(swap_mask, result, term);
_mm512_storeu_ps(out, result);
in += 16;
out += 16;
}

number = sixteenth_points * 16;
for (; number < num_points; number++) {
for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
#endif /* LV_HAVE_AVX512F for unaligned */
#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
Expand All @@ -272,10 +265,11 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
const unsigned int eighth_points = num_points / 8;

for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
Expand All @@ -285,12 +279,10 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
term = _mm256_sub_ps(term, result);
result = _mm256_blendv_ps(result, term, swap_mask);
_mm256_storeu_ps(out, result);
in += 8;
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
for (unsigned int number = eighth_points * 8; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
Expand All @@ -306,10 +298,11 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
const unsigned int eighth_points = num_points / 8;

for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
__m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
_mm256_blendv_ps(one, x, swap_mask));
Expand All @@ -319,12 +312,10 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
term = _mm256_sub_ps(term, result);
result = _mm256_blendv_ps(result, term, swap_mask);
_mm256_storeu_ps(out, result);
in += 8;
out += 8;
}

number = eighth_points * 8;
for (; number < num_points; number++) {
for (unsigned int number = eighth_points * 8; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
Expand All @@ -341,10 +332,11 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
for (; number < quarter_points; number++) {
const unsigned int quarter_points = num_points / 4;

for (unsigned int number = 0; number < quarter_points; number++) {
__m128 x = _mm_loadu_ps(in);
in += 4;
__m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
__m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
_mm_blendv_ps(one, x, swap_mask));
Expand All @@ -354,12 +346,10 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
term = _mm_sub_ps(term, result);
result = _mm_blendv_ps(result, term, swap_mask);
_mm_storeu_ps(out, result);
in += 4;
out += 4;
}

number = quarter_points * 4;
for (; number < num_points; number++) {
for (unsigned int number = quarter_points * 4; number < num_points; number++) {
*out++ = volk_arctan(*in++);
}
}
Expand Down
Loading

0 comments on commit af7e8fc

Please sign in to comment.