Skip to content

Commit

Permalink
Added AVX512 kernels to arctan and atan2
Browse files Browse the repository at this point in the history
  • Loading branch information
Ka-zam committed Feb 25, 2024
1 parent ab2ab82 commit 217e42c
Show file tree
Hide file tree
Showing 8 changed files with 358 additions and 84 deletions.
90 changes: 46 additions & 44 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#
# Copyright 2011-2020 Free Software Foundation, Inc.
# Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
# Copyright 2011-2020 Free Software Foundation, Inc.
# Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
#
# This file is part of VOLK
#This file is part of VOLK
#
# SPDX-License-Identifier: LGPL-3.0-or-later
# SPDX-License-Identifier: LGPL-3.0-or-later
#

########################################################################
# Project setup
#Project setup
########################################################################
cmake_minimum_required(VERSION 3.8)
set(CMAKE_BUILD_TYPE
Expand All @@ -25,10 +25,10 @@ set(CMAKE_CXX_STANDARD 17)
enable_testing()

########################################################################
# Common compile flags
#Common compile flags
########################################################################

# Disable complex math NaN/INFO range checking for performance
# Disable complex math NaN/Inf range checking for performance
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE)
if(HAVE_CX_LIMITED_RANGE)
Expand All @@ -46,15 +46,15 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)

if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
# Abort compilation if kernel implementations have inconsistent function
# prototypes, i.e. if
#
# kernel_foo_sse(uint32_t *dst, lv32fc_t *src)
# kernel_foo_avx(uint16_t *dst, lv32fc_t *src)
#
# are defined. Note the different data type of the first argument). By
# default 'incompatible-pointer-types' is a warning only and 'pointer-sign'
# is a warning enabled by '-Wall'. These warnings are only applicable to C.
# Abort compilation if kernel implementations have inconsistent function
# prototypes, i.e. if
#
# kernel_foo_sse(uint32_t *dst, lv32fc_t *src)
# kernel_foo_avx(uint16_t *dst, lv32fc_t *src)
#
# are defined (note the different data type of the first argument). By
# default 'incompatible-pointer-types' is a warning only and 'pointer-sign'
# is a warning enabled by '-Wall'. These warnings are only applicable to C.
set(CMAKE_C_FLAGS
"${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign")
endif()
Expand All @@ -77,7 +77,7 @@ set(CMAKE_BUILD_TYPE
message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.")

########################################################################
# Version setup
#Version setup
########################################################################

set(VERSION_INFO_MAJOR_VERSION 3)
Expand All @@ -87,13 +87,14 @@ include(VolkVersion) #setup version info

math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000
+ ${VERSION_INFO_MINOR_VERSION} * 100
+ ${VERSION_INFO_MAINT_VERSION}")
+ ${VERSION_INFO_MAINT_VERSION}")

configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in
${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY)

########################################################################
# Environment setup
#Environment setup
########################################################################
if(NOT DEFINED CROSSCOMPILE_MULTILIB)
set(CROSSCOMPILE_MULTILIB "")
Expand All @@ -116,10 +117,10 @@ if(MSVC)
endif(MSVC)

########################################################################
# Dependencies setup
#Dependencies setup
########################################################################

# cpu_features - sensible defaults, user settable option
#cpu_features - sensible defaults, user settable option
if(CMAKE_SYSTEM_PROCESSOR MATCHES
"(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)|(^riscv)")
option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON)
Expand Down Expand Up @@ -158,7 +159,7 @@ else()
message(STATUS "Building Volk without cpu_features")
endif()

# Python
#Python
include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B
volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)"
PYTHON_MIN_VER_FOUND)
Expand All @@ -168,12 +169,12 @@ if(NOT PYTHON_MIN_VER_FOUND)
message(FATAL_ERROR "Python 3.4 or greater required to build VOLK")
endif()

# Mako
#Mako
if(NOT MAKO_FOUND)
message(FATAL_ERROR "Mako templates required to build VOLK")
endif()

# Check if we have std::filesystem
#Check if we have std::filesystem
find_package(
FILESYSTEM
COMPONENTS Final Experimental
Expand All @@ -183,9 +184,9 @@ set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

########################################################################
# check for aligned_alloc, since some compilers lack this C11 feature.
# For Apple-clang use `posix_memalign`
# For MSVC use `_aligned_malloc`.
#check for aligned_alloc, since some compilers lack this C11 feature.
# For Apple-clang use `posix_memalign`
#For MSVC use `_aligned_malloc`.
########################################################################
include(CheckSymbolExists)
if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin"))
Expand All @@ -196,7 +197,7 @@ if(NOT USE_ALIGNED_ALLOC)
endif()

########################################################################
# Check if Orc is available
#Check if Orc is available
########################################################################
option(ENABLE_ORC "Enable Orc" True)
if(ENABLE_ORC)
Expand All @@ -206,17 +207,17 @@ else(ENABLE_ORC)
endif(ENABLE_ORC)

########################################################################
# Setup doxygen
#Setup doxygen
########################################################################
add_subdirectory(docs)

########################################################################
# Detect /lib versus /lib64
# Detect /lib versus /lib64
########################################################################
include(GNUInstallDirs)

########################################################################
# Setup the package config file
#Setup the package config file
########################################################################
#set variables found in the pc.in file
set(prefix ${CMAKE_INSTALL_PREFIX})
Expand All @@ -233,7 +234,7 @@ install(
COMPONENT "volk_devel")

########################################################################
# Install all headers in the include directories
#Install all headers in the include directories
########################################################################
set(VOLK_RUNTIME_DIR bin)
set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR})
Expand All @@ -255,6 +256,7 @@ install(
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
Expand All @@ -269,7 +271,7 @@ install(
COMPONENT "volk_devel")

########################################################################
# On Apple only, set install name and use rpath correctly, if not already set
#On Apple only, set install name and use rpath correctly, if not already set
########################################################################
if(APPLE)
if(NOT CMAKE_INSTALL_NAME_DIR)
Expand All @@ -290,21 +292,21 @@ if(APPLE)
endif(APPLE)

########################################################################
# Create uninstall target
#Create uninstall target
########################################################################
configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY)

# Only add the target if there isn't one defined already
#Only add the target if there isn't one defined already
if(NOT TARGET uninstall)
add_custom_target(uninstall ${CMAKE_COMMAND} -P
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
endif()

########################################################################
# Install our Cmake modules into $prefix/lib/cmake/volk
# See "Package Configuration Files" on page:
# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging
# Install our CMake modules into $prefix/lib/cmake/volk
# See "Package Configuration Files" on page:
# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging
########################################################################

configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in
Expand All @@ -314,7 +316,7 @@ configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in
${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY)

########################################################################
# Install cmake search routine for external use
#Install cmake search routine for external use
########################################################################

if(NOT CMAKE_MODULES_DIR)
Expand All @@ -334,7 +336,7 @@ install(
DESTINATION ${CMAKE_MODULES_DIR}/volk)

########################################################################
# Option to enable QA testing, on by default
#Option to enable QA testing, on by default
########################################################################
option(ENABLE_TESTING "Enable QA testing" ON)
if(ENABLE_TESTING)
Expand All @@ -345,7 +347,7 @@ endif()
message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF")

########################################################################
# Option to enable post-build profiling using volk_profile, off by default
# Option to enable post-build profiling using volk_profile, off by default
########################################################################
option(ENABLE_PROFILING "Launch system profiler after build" OFF)
if(ENABLE_PROFILING)
Expand All @@ -371,12 +373,12 @@ endif()
message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF")

########################################################################
# Setup the library
#Setup the library
########################################################################
add_subdirectory(lib)

########################################################################
# And the utility apps
#And the utility apps
########################################################################
add_subdirectory(apps)
option(ENABLE_MODTOOL "Enable volk_modtool python utility" True)
Expand All @@ -385,6 +387,6 @@ if(ENABLE_MODTOOL)
endif()

########################################################################
# Print summary
#Print summary
########################################################################
message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}")
8 changes: 8 additions & 0 deletions gen/archs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ at the top, as a last resort.
<alignment>64</alignment>
</arch>

<!-- AVX-512 DQ extension.
     GCC and Clang expose a dedicated -mavx512dq switch. MSVC has no
     per-extension AVX-512 switches: /arch:AVX512 is its only AVX-512 option
     and it already covers the DQ instructions, so that flag is used here.
     NOTE(review): other AVX-512 arch entries in this file may carry the same
     kind of per-extension MSVC flag; confirm and align them if so. -->
<arch name="avx512dq">
    <check name="avx512dq"></check>
    <flag compiler="gnu">-mavx512dq</flag>
    <flag compiler="clang">-mavx512dq</flag>
    <flag compiler="msvc">/arch:AVX512</flag>
    <alignment>64</alignment>
</arch>

<arch name="riscv64">
</arch>

Expand Down
5 changes: 5 additions & 0 deletions gen/machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,9 @@
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
</machine>

<!-- Machine that enables avx512dq on top of avx512f and the older x86 SIMD
     archs listed below; avx512cd is not part of this machine's arch list. -->
<!-- trailing | bar means generate without either for MSVC -->
<machine name="avx512dq">
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
</machine>

</grammar>
4 changes: 2 additions & 2 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
*/

/*
* This file is intended to hold AVX2 FMA intrinsics of intrinsics.
* This file is intended to hold AVX2 FMA intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

Expand All @@ -23,7 +23,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
Expand Down
67 changes: 67 additions & 0 deletions include/volk/volk_avx512_intrinsics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/* -*- c++ -*- */
/*
* Copyright 2024 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold AVX512 intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#include <immintrin.h>

static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
{
// r = z1_0 z1_2 ... z1_6 z2_0 z2_2 ... z2_6
const __m512i idx =
_mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
const __m512 r = _mm512_permutex2var_ps(z1, idx, z2);
return r;
}

static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
{
const __m512i idx =
_mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
const __m512 i = _mm512_permutex2var_ps(z1, idx, z2);
return i;
}

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
*
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
{
const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f);
const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f);
const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f);
const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f);
const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f);
const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f);

const __m512 x_times_x = _mm512_mul_ps(x, x);
__m512 arctan;
arctan = a13;
arctan = _mm512_fmadd_ps(x_times_x, arctan, a11);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a9);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a7);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a5);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a3);
arctan = _mm512_fmadd_ps(x_times_x, arctan, a1);
arctan = _mm512_mul_ps(x, arctan);

return arctan;
}

#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
4 changes: 2 additions & 2 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
*/

/*
* This file is intended to hold AVX intrinsics of intrinsics.
* This file is intended to hold AVX intrinsics.
* They should be used in VOLK kernels to avoid copy-pasta.
*/

Expand All @@ -24,7 +24,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
static inline __m256 _mm256_arctan_poly_avx(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
Expand Down
Loading

0 comments on commit 217e42c

Please sign in to comment.