diff --git a/CMakeLists.txt b/CMakeLists.txt index d6401845..9a8a460e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,14 @@ # -# Copyright 2011-2020 Free Software Foundation, Inc. -# Copyright 2023 Magnus Lundmark +#Copyright 2011 - 2020 Free Software Foundation, Inc. +#Copyright 2023 Magnus Lundmark < magnuslundmark @gmail.com> # -# This file is part of VOLK +#This file is part of VOLK # -# SPDX-License-Identifier: LGPL-3.0-or-later +#SPDX - License - Identifier : LGPL - 3.0 - or -later # ######################################################################## -# Project setup +#Project setup ######################################################################## cmake_minimum_required(VERSION 3.8) set(CMAKE_BUILD_TYPE @@ -25,10 +25,10 @@ set(CMAKE_CXX_STANDARD 17) enable_testing() ######################################################################## -# Common compile flags +#Common compile flags ######################################################################## -# Disable complex math NaN/INFO range checking for performance +#Disable complex math NaN / INFO range checking for performance include(CheckCXXCompilerFlag) check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE) if(HAVE_CX_LIMITED_RANGE) @@ -46,15 +46,15 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") - # Abort compilation if kernel implementations have inconsistent function - # prototypes, i.e. if - # - # kernel_foo_sse(uint32_t *dst, lv32fc_t *src) - # kernel_foo_avx(uint16_t *dst, lv32fc_t *src) - # - # are defined. Note the different data type of the first argument). By - # default 'incompatible-pointer-types' is a warning only and 'pointer-sign' - # is a warning enabled by '-Wall'. These warnings are only applicable to C. +#Abort compilation if kernel implementations have inconsistent function +#prototypes, i.e.if +# +#kernel_foo_sse(uint32_t* dst, lv32fc_t* src) +#kernel_foo_avx(uint16_t* dst, lv32fc_t* src) +# +#are defined.Note the different data type of the first argument).By +#default 'incompatible-pointer-types' is a warning only and 'pointer-sign' +#is a warning enabled by '-Wall'.These warnings are only applicable to C. set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign") endif() @@ -77,7 +77,7 @@ set(CMAKE_BUILD_TYPE message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") ######################################################################## -# Version setup +#Version setup ######################################################################## set(VERSION_INFO_MAJOR_VERSION 3) @@ -87,13 +87,14 @@ include(VolkVersion) #setup version info math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000 + ${VERSION_INFO_MINOR_VERSION} * 100 - + ${VERSION_INFO_MAINT_VERSION}") + + ${ + VERSION_INFO_MAINT_VERSION}") configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in ${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY) ######################################################################## -# Environment setup +#Environment setup ######################################################################## if(NOT DEFINED CROSSCOMPILE_MULTILIB) set(CROSSCOMPILE_MULTILIB "") @@ -116,10 +117,10 @@ if(MSVC) endif(MSVC) ######################################################################## -# Dependencies setup +#Dependencies setup ######################################################################## -# cpu_features - sensible defaults, user settable option +#cpu_features - sensible defaults, user settable option if(CMAKE_SYSTEM_PROCESSOR MATCHES "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)|(^riscv)") option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON) @@ -158,7 +159,7 @@ else() message(STATUS "Building Volk without cpu_features") endif() -# Python +#Python include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)" PYTHON_MIN_VER_FOUND) @@ -168,12 +169,12 @@ if(NOT PYTHON_MIN_VER_FOUND) message(FATAL_ERROR "Python 3.4 or greater required to build VOLK") endif() -# Mako +#Mako if(NOT MAKO_FOUND) message(FATAL_ERROR "Mako templates required to build VOLK") endif() -# Check if we have std::filesystem +#Check if we have std::filesystem find_package( FILESYSTEM COMPONENTS Final Experimental @@ -183,9 +184,9 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) ######################################################################## -# check for aligned_alloc, since some compilers lack this C11 feature. -# For Apple-clang use `posix_memalign` -# For MSVC use `_aligned_malloc`. +#check for aligned_alloc, since some compilers lack this C11 feature. +#For Apple - clang use `posix_memalign` +#For MSVC use `_aligned_malloc`. ######################################################################## include(CheckSymbolExists) if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")) @@ -196,7 +197,7 @@ if(NOT USE_ALIGNED_ALLOC) endif() ######################################################################## -# Check if Orc is available +#Check if Orc is available ######################################################################## option(ENABLE_ORC "Enable Orc" True) if(ENABLE_ORC) @@ -206,17 +207,17 @@ else(ENABLE_ORC) endif(ENABLE_ORC) ######################################################################## -# Setup doxygen +#Setup doxygen ######################################################################## add_subdirectory(docs) ######################################################################## -# Detect /lib versus /lib64 +#Detect / lib versus / lib64 ######################################################################## include(GNUInstallDirs) ######################################################################## -# Setup the package config file +#Setup the package config file ######################################################################## #set variables found in the pc.in file set(prefix ${CMAKE_INSTALL_PREFIX}) @@ -233,7 +234,7 @@ install( COMPONENT "volk_devel") ######################################################################## -# Install all headers in the include directories +#Install all headers in the include directories ######################################################################## set(VOLK_RUNTIME_DIR bin) set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR}) @@ -255,6 +256,7 @@ install( ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h @@ -269,7 +271,7 @@ install( COMPONENT "volk_devel") ######################################################################## -# On Apple only, set install name and use rpath correctly, if not already set +#On Apple only, set install name and use rpath correctly, if not already set ######################################################################## if(APPLE) if(NOT CMAKE_INSTALL_NAME_DIR) @@ -290,21 +292,21 @@ if(APPLE) endif(APPLE) ######################################################################## -# Create uninstall target +#Create uninstall target ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY) -# Only add the target if there isn't one defined already +#Only add the target if there isn't one defined already if(NOT TARGET uninstall) add_custom_target(uninstall ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() ######################################################################## -# Install our Cmake modules into $prefix/lib/cmake/volk -# See "Package Configuration Files" on page: -# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging +#Install our Cmake modules into $prefix / lib / cmake / volk +#See "Package Configuration Files" on page: +#http: // www.cmake.org/Wiki/CMake/Tutorials/Packaging ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in @@ -314,7 +316,7 @@ configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY) ######################################################################## -# Install cmake search routine for external use +#Install cmake search routine for external use ######################################################################## if(NOT CMAKE_MODULES_DIR) @@ -334,7 +336,7 @@ install( DESTINATION ${CMAKE_MODULES_DIR}/volk) ######################################################################## -# Option to enable QA testing, on by default +#Option to enable QA testing, on by default ######################################################################## option(ENABLE_TESTING "Enable QA testing" ON) if(ENABLE_TESTING) @@ -345,7 +347,7 @@ endif() message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF") ######################################################################## -# Option to enable post-build profiling using volk_profile, off by default +#Option to enable post - build profiling using volk_profile, off by default ######################################################################## option(ENABLE_PROFILING "Launch system profiler after build" OFF) if(ENABLE_PROFILING) @@ -371,12 +373,12 @@ endif() message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF") ######################################################################## -# Setup the library +#Setup the library ######################################################################## add_subdirectory(lib) ######################################################################## -# And the utility apps +#And the utility apps ######################################################################## add_subdirectory(apps) option(ENABLE_MODTOOL "Enable volk_modtool python utility" True) @@ -385,6 +387,6 @@ if(ENABLE_MODTOOL) endif() ######################################################################## -# Print summary +#Print summary ######################################################################## message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}") diff --git a/gen/archs.xml b/gen/archs.xml index 164c7bb4..792c50d1 100644 --- a/gen/archs.xml +++ b/gen/archs.xml @@ -178,6 +178,14 @@ at the top, as a last resort. 64 + + + -mavx512dq + -mavx512dq + /arch:AVX512DQ + 64 + + diff --git a/gen/machines.xml b/gen/machines.xml index 887f9794..b76f6d07 100644 --- a/gen/machines.xml +++ b/gen/machines.xml @@ -65,4 +65,9 @@ generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc| + + +generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc| + + diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h index 03b24e6c..8a7e4d63 100644 --- a/include/volk/volk_avx2_fma_intrinsics.h +++ b/include/volk/volk_avx2_fma_intrinsics.h @@ -8,7 +8,7 @@ */ /* - * This file is intended to hold AVX2 FMA intrinsics of intrinsics. + * This file is intended to hold AVX2 FMA intrinsics. * They should be used in VOLK kernels to avoid copy-paste. */ @@ -23,7 +23,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x) +static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x) { const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h new file mode 100644 index 00000000..a6fd87ac --- /dev/null +++ b/include/volk/volk_avx512_intrinsics.h @@ -0,0 +1,67 @@ +/* -*- c++ -*- */ +/* + * Copyright 2024 Magnus Lundmark + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/* + * This file is intended to hold AVX512 intrinsics. + * They should be used in VOLK kernels to avoid copy-paste. + */ + +#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ +#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ +#include + +static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) +{ + // r = z1_0 z1_2 ... z1_6 z2_0 z2_2 ... z2_6 + const __m512i idx = + _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + const __m512 r = _mm512_permutex2var_ps(z1, idx, z2); + return r; +} + +static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) +{ + const __m512i idx = + _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + const __m512 i = _mm512_permutex2var_ps(z1, idx, z2); + return i; +} + +/* + * Approximate arctan(x) via polynomial expansion + * on the interval [-1, 1] + * + * Maximum relative error ~6.5e-7 + * Polynomial evaluated via Horner's method + */ +static inline __m512 _mm512_arctan_poly_avx512(const __m512 x) +{ + const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f); + const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f); + const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f); + const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f); + const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f); + const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f); + const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f); + + const __m512 x_times_x = _mm512_mul_ps(x, x); + __m512 arctan; + arctan = a13; + arctan = _mm512_fmadd_ps(x_times_x, arctan, a11); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a9); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a7); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a5); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a3); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a1); + arctan = _mm512_mul_ps(x, arctan); + + return arctan; +} + +#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */ diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index 2fc0f064..c6c7a2c5 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -9,7 +9,7 @@ */ /* - * This file is intended to hold AVX intrinsics of intrinsics. + * This file is intended to hold AVX intrinsics. * They should be used in VOLK kernels to avoid copy-pasta. */ @@ -24,7 +24,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m256 _m256_arctan_poly_avx(const __m256 x) +static inline __m256 _mm256_arctan_poly_avx(const __m256 x) { const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index dc5987cb..ec078826 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2014 Free Software Foundation, Inc. - * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * @@ -13,19 +13,19 @@ * * \b Overview * - * Computes arcsine of input vector and stores results in output vector. + * Computes arctan of input vector and stores results in output vector. * * Dispatcher Prototype * \code - * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points) + * void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points) * \endcode * * \b Inputs - * \li aVector: The input vector of floats. + * \li in_ptr: The input vector of floats. * \li num_points: The number of data points. * * \b Outputs - * \li bVector: The vector where results will be stored. + * \li out_ptr: The vector where results will be stored. * * \b Example * Calculate common angles around the top half of the unit circle. @@ -59,6 +59,64 @@ #ifndef INCLUDED_volk_32f_atan_32f_a_H #define INCLUDED_volk_32f_atan_32f_a_H +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) +{ + unsigned int number = 0; + for (; number < num_points; number++) { + *out++ = atanf(*in++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) +{ + unsigned int number = 0; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void +volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) +{ + const __m512 one = _mm512_set1_ps(1.f); + const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 x = _mm512_load_ps(in); + __mmask16 swap_mask = + _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), + _mm512_mask_blend_ps(swap_mask, one, x)); + __m512 result = _mm512_arctan_poly_avx512(x_star); + __m512 term = _mm512_and_ps(x_star, sign_mask); + term = _mm512_or_ps(pi_over_2, term); + term = _mm512_sub_ps(term, result); + result = _mm512_mask_blend_ps(swap_mask, result, term); + _mm512_store_ps(out, result); + in += 16; + out += 16; + } + + number = sixteenth_points * 16; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_AVX512F for aligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -77,7 +135,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx2_fma(x_star); + __m256 result = _mm256_arctan_poly_avx2_fma(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -112,7 +170,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx(x_star); + __m256 result = _mm256_arctan_poly_avx(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -168,6 +226,42 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) #ifndef INCLUDED_volk_32f_atan_32f_u_H #define INCLUDED_volk_32f_atan_32f_u_H +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void +volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) +{ + const __m512 one = _mm512_set1_ps(1.f); + const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 x = _mm512_loadu_ps(in); + __mmask16 swap_mask = + _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), + _mm512_mask_blend_ps(swap_mask, one, x)); + __m512 result = _mm512_arctan_poly_avx512(x_star); + __m512 term = _mm512_and_ps(x_star, sign_mask); + term = _mm512_or_ps(pi_over_2, term); + term = _mm512_sub_ps(term, result); + result = _mm512_mask_blend_ps(swap_mask, result, term); + _mm512_storeu_ps(out, result); + in += 16; + out += 16; + } + + number = sixteenth_points * 16; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_AVX512F for unaligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include static inline void @@ -185,7 +279,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx2_fma(x_star); + __m256 result = _mm256_arctan_poly_avx2_fma(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -219,7 +313,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx(x_star); + __m256 result = _mm256_arctan_poly_avx(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -271,26 +365,4 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) } #endif /* LV_HAVE_SSE4_1 for unaligned */ -#ifdef LV_HAVE_GENERIC -static inline void -volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; - for (; number < num_points; number++) { - *out++ = volk_arctan(*in++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_GENERIC -static inline void -volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; - for (; number < num_points; number++) { - *out++ = atanf(*in++); - } -} -#endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32f_atan_32f_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 759db24c..41492541 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2012, 2014 Free Software Foundation, Inc. - * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * @@ -100,6 +100,66 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector, } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) +{ + const float* in = (float*)complexVector; + float* out = (float*)outputVector; + + const float invNormalizeFactor = 1.f / normalizeFactor; + const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor); + const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f); + const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + const __m512 zero = _mm512_setzero_ps(); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 z1 = _mm512_load_ps(in); + in += 16; + __m512 z2 = _mm512_load_ps(in); + in += 16; + + __m512 x = _mm512_real(z1, z2); + __m512 y = _mm512_imag(z1, z2); + + __mmask16 swap_mask = _mm512_cmp_ps_mask( + _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS); + __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x), + _mm512_mask_blend_ps(swap_mask, x, y)); + __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q); + input = _mm512_mask_blend_ps(nan_mask, input, zero); + __m512 result = _mm512_arctan_poly_avx512(input); + + input = + _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result); + result = _mm512_mask_blend_ps(swap_mask, result, input); + + __m512 x_sign_mask = + _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31)); + + result = _mm512_add_ps( + _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask), + result); + result = _mm512_mul_ps(result, vinvNormalizeFactor); + + _mm512_store_ps(out, result); + out += 16; + } + + number = sixteenth_points * 16; + volk_32fc_s32f_atan2_32f_polynomial( + out, (lv_32fc_t*)in, normalizeFactor, num_points - number); +} +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -136,7 +196,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx2_fma(input); + __m256 result = _mm256_arctan_poly_avx2_fma(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -196,7 +256,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx(input); + __m256 result = _mm256_arctan_poly_avx(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -224,6 +284,66 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, #ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H #define INCLUDED_volk_32fc_s32f_atan2_32f_u_H +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) +{ + const float* in = (float*)complexVector; + float* out = (float*)outputVector; + + const float invNormalizeFactor = 1.f / normalizeFactor; + const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor); + const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f); + const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + const __m512 zero = _mm512_setzero_ps(); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 z1 = _mm512_loadu_ps(in); + in += 16; + __m512 z2 = _mm512_loadu_ps(in); + in += 16; + + __m512 x = _mm512_real(z1, z2); + __m512 y = _mm512_imag(z1, z2); + + __mmask16 swap_mask = _mm512_cmp_ps_mask( + _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS); + __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x), + _mm512_mask_blend_ps(swap_mask, x, y)); + __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q); + input = _mm512_mask_blend_ps(nan_mask, input, zero); + __m512 result = _mm512_arctan_poly_avx512(input); + + input = + _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result); + result = _mm512_mask_blend_ps(swap_mask, result, input); + + __m512 x_sign_mask = + _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31)); + + result = _mm512_add_ps( + _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask), + result); + result = _mm512_mul_ps(result, vinvNormalizeFactor); + + _mm512_storeu_ps(out, result); + out += 16; + } + + number = sixteenth_points * 16; + volk_32fc_s32f_atan2_32f_polynomial( + out, (lv_32fc_t*)in, normalizeFactor, num_points - number); +} +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -260,7 +380,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx2_fma(input); + __m256 result = _mm256_arctan_poly_avx2_fma(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -320,7 +440,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx(input); + __m256 result = _mm256_arctan_poly_avx(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);