diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6401845..9a8a460e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,14 +1,14 @@
 #
-# Copyright 2011-2020 Free Software Foundation, Inc.
-# Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+#Copyright 2011 - 2020 Free Software Foundation, Inc.
+#Copyright 2023 Magnus Lundmark < magnuslundmark @gmail.com>
 #
-# This file is part of VOLK
+#This file is part of VOLK
 #
-# SPDX-License-Identifier: LGPL-3.0-or-later
+#SPDX - License - Identifier : LGPL - 3.0 - or -later
 #
 
 ########################################################################
-# Project setup
+#Project setup
 ########################################################################
 cmake_minimum_required(VERSION 3.8)
 set(CMAKE_BUILD_TYPE
@@ -25,10 +25,10 @@ set(CMAKE_CXX_STANDARD 17)
 enable_testing()
 
 ########################################################################
-# Common compile flags
+#Common compile flags
 ########################################################################
 
-# Disable complex math NaN/INFO range checking for performance
+#Disable complex math NaN / INFO range checking for performance
 include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE)
 if(HAVE_CX_LIMITED_RANGE)
@@ -46,15 +46,15 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
 add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
 
 if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
-    # Abort compilation if kernel implementations have inconsistent function
-    # prototypes, i.e. if
-    #
-    #     kernel_foo_sse(uint32_t *dst, lv32fc_t *src)
-    #     kernel_foo_avx(uint16_t *dst, lv32fc_t *src)
-    #
-    # are defined. Note the different data type of the first argument). By
-    # default 'incompatible-pointer-types' is a warning only and 'pointer-sign'
-    # is a warning enabled by '-Wall'. These warnings are only applicable to C.
+#Abort compilation if kernel implementations have inconsistent function
+#prototypes, i.e.if
+#
+#kernel_foo_sse(uint32_t* dst, lv32fc_t* src)
+#kernel_foo_avx(uint16_t* dst, lv32fc_t* src)
+#
+#are defined.Note the different data type of the first argument).By
+#default 'incompatible-pointer-types' is a warning only and 'pointer-sign'
+#is a warning enabled by '-Wall'.These warnings are only applicable to C.
     set(CMAKE_C_FLAGS
         "${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign")
 endif()
@@ -77,7 +77,7 @@ set(CMAKE_BUILD_TYPE
 message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.")
 
 ########################################################################
-# Version setup
+#Version setup
 ########################################################################
 
 set(VERSION_INFO_MAJOR_VERSION 3)
@@ -87,13 +87,14 @@ include(VolkVersion) #setup version info
 
 math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000
     + ${VERSION_INFO_MINOR_VERSION} * 100
-    + ${VERSION_INFO_MAINT_VERSION}")
+    + ${
+    VERSION_INFO_MAINT_VERSION}")
 
 configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in
                ${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY)
 
 ########################################################################
-# Environment setup
+#Environment setup
 ########################################################################
 if(NOT DEFINED CROSSCOMPILE_MULTILIB)
     set(CROSSCOMPILE_MULTILIB "")
@@ -116,10 +117,10 @@ if(MSVC)
 endif(MSVC)
 
 ########################################################################
-# Dependencies setup
+#Dependencies setup
 ########################################################################
 
-# cpu_features - sensible defaults, user settable option
+#cpu_features - sensible defaults, user settable option
 if(CMAKE_SYSTEM_PROCESSOR MATCHES
    "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)|(^riscv)")
     option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON)
@@ -158,7 +159,7 @@ else()
     message(STATUS "Building Volk without cpu_features")
 endif()
 
-# Python
+#Python
 include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B
 volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)"
                          PYTHON_MIN_VER_FOUND)
@@ -168,12 +169,12 @@ if(NOT PYTHON_MIN_VER_FOUND)
     message(FATAL_ERROR "Python 3.4 or greater required to build VOLK")
 endif()
 
-# Mako
+#Mako
 if(NOT MAKO_FOUND)
     message(FATAL_ERROR "Mako templates required to build VOLK")
 endif()
 
-# Check if we have std::filesystem
+#Check if we have std::filesystem
 find_package(
     FILESYSTEM
     COMPONENTS Final Experimental
@@ -183,9 +184,9 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 ########################################################################
-# check for aligned_alloc, since some compilers lack this C11 feature.
-# For Apple-clang use `posix_memalign`
-# For MSVC use `_aligned_malloc`.
+#check for aligned_alloc, since some compilers lack this C11 feature.
+#For Apple - clang use `posix_memalign`
+#For MSVC use `_aligned_malloc`.
 ########################################################################
 include(CheckSymbolExists)
 if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin"))
@@ -196,7 +197,7 @@ if(NOT USE_ALIGNED_ALLOC)
 endif()
 
 ########################################################################
-# Check if Orc is available
+#Check if Orc is available
 ########################################################################
 option(ENABLE_ORC "Enable Orc" True)
 if(ENABLE_ORC)
@@ -206,17 +207,17 @@ else(ENABLE_ORC)
 endif(ENABLE_ORC)
 
 ########################################################################
-# Setup doxygen
+#Setup doxygen
 ########################################################################
 add_subdirectory(docs)
 
 ########################################################################
-# Detect /lib versus /lib64
+#Detect / lib versus / lib64
 ########################################################################
 include(GNUInstallDirs)
 
 ########################################################################
-# Setup the package config file
+#Setup the package config file
 ########################################################################
 #set variables found in the pc.in file
 set(prefix ${CMAKE_INSTALL_PREFIX})
@@ -233,7 +234,7 @@ install(
     COMPONENT "volk_devel")
 
 ########################################################################
-# Install all headers in the include directories
+#Install all headers in the include directories
 ########################################################################
 set(VOLK_RUNTIME_DIR bin)
 set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR})
@@ -255,6 +256,7 @@ install(
           ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
+          ${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
           ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
@@ -269,7 +271,7 @@ install(
     COMPONENT "volk_devel")
 
 ########################################################################
-# On Apple only, set install name and use rpath correctly, if not already set
+#On Apple only, set install name and use rpath correctly, if not already set
 ########################################################################
 if(APPLE)
     if(NOT CMAKE_INSTALL_NAME_DIR)
@@ -290,21 +292,21 @@ if(APPLE)
 endif(APPLE)
 
 ########################################################################
-# Create uninstall target
+#Create uninstall target
 ########################################################################
 configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
                ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY)
 
-# Only add the target if there isn't one defined already
+#Only add the target if there isn't one defined already
 if(NOT TARGET uninstall)
     add_custom_target(uninstall ${CMAKE_COMMAND} -P
                                 ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
 endif()
 
 ########################################################################
-# Install our Cmake modules into $prefix/lib/cmake/volk
-# See "Package Configuration Files" on page:
-#    http://www.cmake.org/Wiki/CMake/Tutorials/Packaging
+#Install our Cmake modules into $prefix / lib / cmake / volk
+#See "Package Configuration Files" on page:
+#http: // www.cmake.org/Wiki/CMake/Tutorials/Packaging
 ########################################################################
 
 configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in
@@ -314,7 +316,7 @@ configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in
                ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY)
 
 ########################################################################
-# Install cmake search routine for external use
+#Install cmake search routine for external use
 ########################################################################
 
 if(NOT CMAKE_MODULES_DIR)
@@ -334,7 +336,7 @@ install(
     DESTINATION ${CMAKE_MODULES_DIR}/volk)
 
 ########################################################################
-# Option to enable QA testing, on by default
+#Option to enable QA testing, on by default
 ########################################################################
 option(ENABLE_TESTING "Enable QA testing" ON)
 if(ENABLE_TESTING)
@@ -345,7 +347,7 @@ endif()
 message(STATUS "  Modify using: -DENABLE_TESTING=ON/OFF")
 
 ########################################################################
-# Option to enable post-build profiling using volk_profile, off by default
+#Option to enable post - build profiling using volk_profile, off by default
 ########################################################################
 option(ENABLE_PROFILING "Launch system profiler after build" OFF)
 if(ENABLE_PROFILING)
@@ -371,12 +373,12 @@ endif()
 message(STATUS "  Modify using: -DENABLE_PROFILING=ON/OFF")
 
 ########################################################################
-# Setup the library
+#Setup the library
 ########################################################################
 add_subdirectory(lib)
 
 ########################################################################
-# And the utility apps
+#And the utility apps
 ########################################################################
 add_subdirectory(apps)
 option(ENABLE_MODTOOL "Enable volk_modtool python utility" True)
@@ -385,6 +387,6 @@ if(ENABLE_MODTOOL)
 endif()
 
 ########################################################################
-# Print summary
+#Print summary
 ########################################################################
 message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}")
diff --git a/gen/archs.xml b/gen/archs.xml
index 164c7bb4..792c50d1 100644
--- a/gen/archs.xml
+++ b/gen/archs.xml
@@ -178,6 +178,14 @@ at the top, as a last resort.
     <alignment>64</alignment>
 </arch>
 
+<arch name="avx512dq">
+    <check name="avx512dq"></check>
+    <flag compiler="gnu">-mavx512dq</flag>
+    <flag compiler="clang">-mavx512dq</flag>
+    <flag compiler="msvc">/arch:AVX512DQ</flag>
+    <alignment>64</alignment>
+</arch>
+
 <arch name="riscv64">
 </arch>
 
diff --git a/gen/machines.xml b/gen/machines.xml
index 887f9794..b76f6d07 100644
--- a/gen/machines.xml
+++ b/gen/machines.xml
@@ -65,4 +65,9 @@
 <archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
 </machine>
 
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="avx512dq">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
+</machine>
+
 </grammar>
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
index 03b24e6c..8a7e4d63 100644
--- a/include/volk/volk_avx2_fma_intrinsics.h
+++ b/include/volk/volk_avx2_fma_intrinsics.h
@@ -8,7 +8,7 @@
  */
 
 /*
- * This file is intended to hold AVX2 FMA intrinsics of intrinsics.
+ * This file is intended to hold AVX2 FMA intrinsics.
  * They should be used in VOLK kernels to avoid copy-paste.
  */
 
@@ -23,7 +23,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
+static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
 {
     const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
     const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h
new file mode 100644
index 00000000..a6fd87ac
--- /dev/null
+++ b/include/volk/volk_avx512_intrinsics.h
@@ -0,0 +1,67 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2024 Magnus Lundmark <magnuslundmark@gmail.com>
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX512 intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
+#include <immintrin.h>
+
+static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
+{
+    // r = z1_0 z1_2 ... z1_6 z2_0 z2_2 ... z2_6
+    const __m512i idx =
+        _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    const __m512 r = _mm512_permutex2var_ps(z1, idx, z2);
+    return r;
+}
+
+static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
+{
+    const __m512i idx =
+        _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    const __m512 i = _mm512_permutex2var_ps(z1, idx, z2);
+    return i;
+}
+
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
+{
+    const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
+    const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f);
+    const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f);
+    const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f);
+    const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f);
+    const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f);
+    const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f);
+
+    const __m512 x_times_x = _mm512_mul_ps(x, x);
+    __m512 arctan;
+    arctan = a13;
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a11);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a9);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a7);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a5);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a3);
+    arctan = _mm512_fmadd_ps(x_times_x, arctan, a1);
+    arctan = _mm512_mul_ps(x, arctan);
+
+    return arctan;
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
index 2fc0f064..c6c7a2c5 100644
--- a/include/volk/volk_avx_intrinsics.h
+++ b/include/volk/volk_avx_intrinsics.h
@@ -9,7 +9,7 @@
  */
 
 /*
- * This file is intended to hold AVX intrinsics of intrinsics.
+ * This file is intended to hold AVX intrinsics.
  * They should be used in VOLK kernels to avoid copy-pasta.
  */
 
@@ -24,7 +24,7 @@
  * Maximum relative error ~6.5e-7
  * Polynomial evaluated via Horner's method
  */
-static inline __m256 _m256_arctan_poly_avx(const __m256 x)
+static inline __m256 _mm256_arctan_poly_avx(const __m256 x)
 {
     const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
     const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index dc5987cb..ec078826 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+ * Copyright 2023, 2024 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -13,19 +13,19 @@
  *
  * \b Overview
  *
- * Computes arcsine of input vector and stores results in output vector.
+ * Computes arctan of input vector and stores results in output vector.
  *
  * <b>Dispatcher Prototype</b>
  * \code
- * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points)
  * \endcode
  *
  * \b Inputs
- * \li aVector: The input vector of floats.
+ * \li in_ptr: The input vector of floats.
  * \li num_points: The number of data points.
  *
  * \b Outputs
- * \li bVector: The vector where results will be stored.
+ * \li out_ptr: The vector where results will be stored.
  *
  * \b Example
  * Calculate common angles around the top half of the unit circle.
@@ -59,6 +59,64 @@
 #ifndef INCLUDED_volk_32f_atan_32f_a_H
 #define INCLUDED_volk_32f_atan_32f_a_H
 
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    for (; number < num_points; number++) {
+        *out++ = atanf(*in++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    for (; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void
+volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points)
+{
+    const __m512 one = _mm512_set1_ps(1.f);
+    const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+
+    unsigned int number = 0;
+    unsigned int sixteenth_points = num_points / 16;
+    for (; number < sixteenth_points; number++) {
+        __m512 x = _mm512_load_ps(in);
+        __mmask16 swap_mask =
+            _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
+                                      _mm512_mask_blend_ps(swap_mask, one, x));
+        __m512 result = _mm512_arctan_poly_avx512(x_star);
+        __m512 term = _mm512_and_ps(x_star, sign_mask);
+        term = _mm512_or_ps(pi_over_2, term);
+        term = _mm512_sub_ps(term, result);
+        result = _mm512_mask_blend_ps(swap_mask, result, term);
+        _mm512_store_ps(out, result);
+        in += 16;
+        out += 16;
+    }
+
+    number = sixteenth_points * 16;
+    for (; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_AVX512F for aligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
@@ -77,7 +135,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+        __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -112,7 +170,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx(x_star);
+        __m256 result = _mm256_arctan_poly_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -168,6 +226,42 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
 #ifndef INCLUDED_volk_32f_atan_32f_u_H
 #define INCLUDED_volk_32f_atan_32f_u_H
 
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void
+volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points)
+{
+    const __m512 one = _mm512_set1_ps(1.f);
+    const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+
+    unsigned int number = 0;
+    unsigned int sixteenth_points = num_points / 16;
+    for (; number < sixteenth_points; number++) {
+        __m512 x = _mm512_loadu_ps(in);
+        __mmask16 swap_mask =
+            _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one),
+                                      _mm512_mask_blend_ps(swap_mask, one, x));
+        __m512 result = _mm512_arctan_poly_avx512(x_star);
+        __m512 term = _mm512_and_ps(x_star, sign_mask);
+        term = _mm512_or_ps(pi_over_2, term);
+        term = _mm512_sub_ps(term, result);
+        result = _mm512_mask_blend_ps(swap_mask, result, term);
+        _mm512_storeu_ps(out, result);
+        in += 16;
+        out += 16;
+    }
+
+    number = sixteenth_points * 16;
+    for (; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_AVX512F for unaligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 static inline void
@@ -185,7 +279,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+        __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -219,7 +313,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
         __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
         __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                       _mm256_blendv_ps(one, x, swap_mask));
-        __m256 result = _m256_arctan_poly_avx(x_star);
+        __m256 result = _mm256_arctan_poly_avx(x_star);
         __m256 term = _mm256_and_ps(x_star, sign_mask);
         term = _mm256_or_ps(pi_over_2, term);
         term = _mm256_sub_ps(term, result);
@@ -271,26 +365,4 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
 }
 #endif /* LV_HAVE_SSE4_1 for unaligned */
 
-#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
-{
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
-        *out++ = volk_arctan(*in++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#ifdef LV_HAVE_GENERIC
-static inline void
-volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
-{
-    unsigned int number = 0;
-    for (; number < num_points; number++) {
-        *out++ = atanf(*in++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
 #endif /* INCLUDED_volk_32f_atan_32f_u_H */
diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h
index 759db24c..41492541 100644
--- a/kernels/volk/volk_32fc_s32f_atan2_32f.h
+++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2012, 2014 Free Software Foundation, Inc.
- * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
+ * Copyright 2023, 2024 Magnus Lundmark <magnuslundmark@gmail.com>
  *
  * This file is part of VOLK
  *
@@ -100,6 +100,66 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
 }
 #endif /* LV_HAVE_GENERIC */
 
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector,
+                                                     const lv_32fc_t* complexVector,
+                                                     const float normalizeFactor,
+                                                     unsigned int num_points)
+{
+    const float* in = (float*)complexVector;
+    float* out = (float*)outputVector;
+
+    const float invNormalizeFactor = 1.f / normalizeFactor;
+    const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
+    const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
+    const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+    const __m512 zero = _mm512_setzero_ps();
+
+    unsigned int number = 0;
+    unsigned int sixteenth_points = num_points / 16;
+    for (; number < sixteenth_points; number++) {
+        __m512 z1 = _mm512_load_ps(in);
+        in += 16;
+        __m512 z2 = _mm512_load_ps(in);
+        in += 16;
+
+        __m512 x = _mm512_real(z1, z2);
+        __m512 y = _mm512_imag(z1, z2);
+
+        __mmask16 swap_mask = _mm512_cmp_ps_mask(
+            _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
+        __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x),
+                                     _mm512_mask_blend_ps(swap_mask, x, y));
+        __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q);
+        input = _mm512_mask_blend_ps(nan_mask, input, zero);
+        __m512 result = _mm512_arctan_poly_avx512(input);
+
+        input =
+            _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
+        result = _mm512_mask_blend_ps(swap_mask, result, input);
+
+        __m512 x_sign_mask =
+            _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
+
+        result = _mm512_add_ps(
+            _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
+            result);
+        result = _mm512_mul_ps(result, vinvNormalizeFactor);
+
+        _mm512_store_ps(out, result);
+        out += 16;
+    }
+
+    number = sixteenth_points * 16;
+    volk_32fc_s32f_atan2_32f_polynomial(
+        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
@@ -136,7 +196,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                      _mm256_blendv_ps(x, y, swap_mask));
         __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
         input = _mm256_blendv_ps(input, zero, nan_mask);
-        __m256 result = _m256_arctan_poly_avx2_fma(input);
+        __m256 result = _mm256_arctan_poly_avx2_fma(input);
 
         input =
             _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -196,7 +256,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                      _mm256_blendv_ps(x, y, swap_mask));
         __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
         input = _mm256_blendv_ps(input, zero, nan_mask);
-        __m256 result = _m256_arctan_poly_avx(input);
+        __m256 result = _mm256_arctan_poly_avx(input);
 
         input =
             _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -224,6 +284,66 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
 #ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
 #define INCLUDED_volk_32fc_s32f_atan2_32f_u_H
 
+#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
+#include <immintrin.h>
+#include <volk/volk_avx512_intrinsics.h>
+static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector,
+                                                     const lv_32fc_t* complexVector,
+                                                     const float normalizeFactor,
+                                                     unsigned int num_points)
+{
+    const float* in = (float*)complexVector;
+    float* out = (float*)outputVector;
+
+    const float invNormalizeFactor = 1.f / normalizeFactor;
+    const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
+    const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
+    const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
+    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
+    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
+    const __m512 zero = _mm512_setzero_ps();
+
+    unsigned int number = 0;
+    unsigned int sixteenth_points = num_points / 16;
+    for (; number < sixteenth_points; number++) {
+        __m512 z1 = _mm512_loadu_ps(in);
+        in += 16;
+        __m512 z2 = _mm512_loadu_ps(in);
+        in += 16;
+
+        __m512 x = _mm512_real(z1, z2);
+        __m512 y = _mm512_imag(z1, z2);
+
+        __mmask16 swap_mask = _mm512_cmp_ps_mask(
+            _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
+        __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x),
+                                     _mm512_mask_blend_ps(swap_mask, x, y));
+        __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q);
+        input = _mm512_mask_blend_ps(nan_mask, input, zero);
+        __m512 result = _mm512_arctan_poly_avx512(input);
+
+        input =
+            _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
+        result = _mm512_mask_blend_ps(swap_mask, result, input);
+
+        __m512 x_sign_mask =
+            _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
+
+        result = _mm512_add_ps(
+            _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
+            result);
+        result = _mm512_mul_ps(result, vinvNormalizeFactor);
+
+        _mm512_storeu_ps(out, result);
+        out += 16;
+    }
+
+    number = sixteenth_points * 16;
+    volk_32fc_s32f_atan2_32f_polynomial(
+        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
+}
+#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
+
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
 #include <volk/volk_avx2_fma_intrinsics.h>
@@ -260,7 +380,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                      _mm256_blendv_ps(x, y, swap_mask));
         __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
         input = _mm256_blendv_ps(input, zero, nan_mask);
-        __m256 result = _m256_arctan_poly_avx2_fma(input);
+        __m256 result = _mm256_arctan_poly_avx2_fma(input);
 
         input =
             _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
@@ -320,7 +440,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                      _mm256_blendv_ps(x, y, swap_mask));
         __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
         input = _mm256_blendv_ps(input, zero, nan_mask);
-        __m256 result = _m256_arctan_poly_avx(input);
+        __m256 result = _mm256_arctan_poly_avx(input);
 
         input =
             _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);