From 2ce41403de53f7966e2b602e622ccd25947b1c42 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 26 Feb 2024 00:37:48 +0100 Subject: [PATCH 1/4] Added AVX512 kernels to arctan and atan2 Signed-off-by: Magnus Lundmark --- CMakeLists.txt | 90 ++++++++-------- gen/archs.xml | 8 ++ gen/machines.xml | 5 + include/volk/volk_avx2_fma_intrinsics.h | 4 +- include/volk/volk_avx512_intrinsics.h | 67 ++++++++++++ include/volk/volk_avx_intrinsics.h | 4 +- kernels/volk/volk_32f_atan_32f.h | 134 ++++++++++++++++++------ kernels/volk/volk_32fc_s32f_atan2_32f.h | 130 ++++++++++++++++++++++- 8 files changed, 358 insertions(+), 84 deletions(-) create mode 100644 include/volk/volk_avx512_intrinsics.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d6401845..9a8a460e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,14 @@ # -# Copyright 2011-2020 Free Software Foundation, Inc. -# Copyright 2023 Magnus Lundmark +#Copyright 2011 - 2020 Free Software Foundation, Inc. +#Copyright 2023 Magnus Lundmark < magnuslundmark @gmail.com> # -# This file is part of VOLK +#This file is part of VOLK # -# SPDX-License-Identifier: LGPL-3.0-or-later +#SPDX - License - Identifier : LGPL - 3.0 - or -later # ######################################################################## -# Project setup +#Project setup ######################################################################## cmake_minimum_required(VERSION 3.8) set(CMAKE_BUILD_TYPE @@ -25,10 +25,10 @@ set(CMAKE_CXX_STANDARD 17) enable_testing() ######################################################################## -# Common compile flags +#Common compile flags ######################################################################## -# Disable complex math NaN/INFO range checking for performance +#Disable complex math NaN / INFO range checking for performance include(CheckCXXCompilerFlag) check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE) if(HAVE_CX_LIMITED_RANGE) @@ -46,15 +46,15 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") - # Abort compilation if kernel implementations have inconsistent function - # prototypes, i.e. if - # - # kernel_foo_sse(uint32_t *dst, lv32fc_t *src) - # kernel_foo_avx(uint16_t *dst, lv32fc_t *src) - # - # are defined. Note the different data type of the first argument). By - # default 'incompatible-pointer-types' is a warning only and 'pointer-sign' - # is a warning enabled by '-Wall'. These warnings are only applicable to C. +#Abort compilation if kernel implementations have inconsistent function +#prototypes, i.e.if +# +#kernel_foo_sse(uint32_t* dst, lv32fc_t* src) +#kernel_foo_avx(uint16_t* dst, lv32fc_t* src) +# +#are defined.Note the different data type of the first argument).By +#default 'incompatible-pointer-types' is a warning only and 'pointer-sign' +#is a warning enabled by '-Wall'.These warnings are only applicable to C. set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign") endif() @@ -77,7 +77,7 @@ set(CMAKE_BUILD_TYPE message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") ######################################################################## -# Version setup +#Version setup ######################################################################## set(VERSION_INFO_MAJOR_VERSION 3) @@ -87,13 +87,14 @@ include(VolkVersion) #setup version info math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000 + ${VERSION_INFO_MINOR_VERSION} * 100 - + ${VERSION_INFO_MAINT_VERSION}") + + ${ + VERSION_INFO_MAINT_VERSION}") configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in ${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY) ######################################################################## -# Environment setup +#Environment setup ######################################################################## if(NOT DEFINED CROSSCOMPILE_MULTILIB) set(CROSSCOMPILE_MULTILIB "") @@ -116,10 +117,10 @@ if(MSVC) endif(MSVC) ######################################################################## -# Dependencies setup +#Dependencies setup ######################################################################## -# cpu_features - sensible defaults, user settable option +#cpu_features - sensible defaults, user settable option if(CMAKE_SYSTEM_PROCESSOR MATCHES "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)|(^riscv)") option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON) @@ -158,7 +159,7 @@ else() message(STATUS "Building Volk without cpu_features") endif() -# Python +#Python include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)" PYTHON_MIN_VER_FOUND) @@ -168,12 +169,12 @@ if(NOT PYTHON_MIN_VER_FOUND) message(FATAL_ERROR "Python 3.4 or greater required to build VOLK") endif() -# Mako +#Mako if(NOT MAKO_FOUND) message(FATAL_ERROR "Mako templates required to build VOLK") endif() -# Check if we have std::filesystem +#Check if we have std::filesystem find_package( FILESYSTEM COMPONENTS Final Experimental @@ -183,9 +184,9 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) ######################################################################## -# check for aligned_alloc, since some compilers lack this C11 feature. -# For Apple-clang use `posix_memalign` -# For MSVC use `_aligned_malloc`. +#check for aligned_alloc, since some compilers lack this C11 feature. +#For Apple - clang use `posix_memalign` +#For MSVC use `_aligned_malloc`. ######################################################################## include(CheckSymbolExists) if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")) @@ -196,7 +197,7 @@ if(NOT USE_ALIGNED_ALLOC) endif() ######################################################################## -# Check if Orc is available +#Check if Orc is available ######################################################################## option(ENABLE_ORC "Enable Orc" True) if(ENABLE_ORC) @@ -206,17 +207,17 @@ else(ENABLE_ORC) endif(ENABLE_ORC) ######################################################################## -# Setup doxygen +#Setup doxygen ######################################################################## add_subdirectory(docs) ######################################################################## -# Detect /lib versus /lib64 +#Detect / lib versus / lib64 ######################################################################## include(GNUInstallDirs) ######################################################################## -# Setup the package config file +#Setup the package config file ######################################################################## #set variables found in the pc.in file set(prefix ${CMAKE_INSTALL_PREFIX}) @@ -233,7 +234,7 @@ install( COMPONENT "volk_devel") ######################################################################## -# Install all headers in the include directories +#Install all headers in the include directories ######################################################################## set(VOLK_RUNTIME_DIR bin) set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR}) @@ -255,6 +256,7 @@ install( ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h @@ -269,7 +271,7 @@ install( COMPONENT "volk_devel") ######################################################################## -# On Apple only, set install name and use rpath correctly, if not already set +#On Apple only, set install name and use rpath correctly, if not already set ######################################################################## if(APPLE) if(NOT CMAKE_INSTALL_NAME_DIR) @@ -290,21 +292,21 @@ if(APPLE) endif(APPLE) ######################################################################## -# Create uninstall target +#Create uninstall target ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY) -# Only add the target if there isn't one defined already +#Only add the target if there isn't one defined already if(NOT TARGET uninstall) add_custom_target(uninstall ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() ######################################################################## -# Install our Cmake modules into $prefix/lib/cmake/volk -# See "Package Configuration Files" on page: -# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging +#Install our Cmake modules into $prefix / lib / cmake / volk +#See "Package Configuration Files" on page: +#http: // www.cmake.org/Wiki/CMake/Tutorials/Packaging ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in @@ -314,7 +316,7 @@ configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY) ######################################################################## -# Install cmake search routine for external use +#Install cmake search routine for external use ######################################################################## if(NOT CMAKE_MODULES_DIR) @@ -334,7 +336,7 @@ install( DESTINATION ${CMAKE_MODULES_DIR}/volk) ######################################################################## -# Option to enable QA testing, on by default +#Option to enable QA testing, on by default ######################################################################## option(ENABLE_TESTING "Enable QA testing" ON) if(ENABLE_TESTING) @@ -345,7 +347,7 @@ endif() message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF") ######################################################################## -# Option to enable post-build profiling using volk_profile, off by default +#Option to enable post - build profiling using volk_profile, off by default ######################################################################## option(ENABLE_PROFILING "Launch system profiler after build" OFF) if(ENABLE_PROFILING) @@ -371,12 +373,12 @@ endif() message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF") ######################################################################## -# Setup the library +#Setup the library ######################################################################## add_subdirectory(lib) ######################################################################## -# And the utility apps +#And the utility apps ######################################################################## add_subdirectory(apps) option(ENABLE_MODTOOL "Enable volk_modtool python utility" True) @@ -385,6 +387,6 @@ if(ENABLE_MODTOOL) endif() ######################################################################## -# Print summary +#Print summary ######################################################################## message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}") diff --git a/gen/archs.xml b/gen/archs.xml index 164c7bb4..792c50d1 100644 --- a/gen/archs.xml +++ b/gen/archs.xml @@ -178,6 +178,14 @@ at the top, as a last resort. 64 + + + -mavx512dq + -mavx512dq + /arch:AVX512DQ + 64 + + diff --git a/gen/machines.xml b/gen/machines.xml index 887f9794..b76f6d07 100644 --- a/gen/machines.xml +++ b/gen/machines.xml @@ -65,4 +65,9 @@ generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc| + + +generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc| + + diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h index 03b24e6c..8a7e4d63 100644 --- a/include/volk/volk_avx2_fma_intrinsics.h +++ b/include/volk/volk_avx2_fma_intrinsics.h @@ -8,7 +8,7 @@ */ /* - * This file is intended to hold AVX2 FMA intrinsics of intrinsics. + * This file is intended to hold AVX2 FMA intrinsics. * They should be used in VOLK kernels to avoid copy-paste. */ @@ -23,7 +23,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x) +static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x) { const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h new file mode 100644 index 00000000..a6fd87ac --- /dev/null +++ b/include/volk/volk_avx512_intrinsics.h @@ -0,0 +1,67 @@ +/* -*- c++ -*- */ +/* + * Copyright 2024 Magnus Lundmark + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/* + * This file is intended to hold AVX512 intrinsics. + * They should be used in VOLK kernels to avoid copy-paste. + */ + +#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ +#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ +#include + +static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) +{ + // r = z1_0 z1_2 ... z1_6 z2_0 z2_2 ... z2_6 + const __m512i idx = + _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + const __m512 r = _mm512_permutex2var_ps(z1, idx, z2); + return r; +} + +static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) +{ + const __m512i idx = + _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + const __m512 i = _mm512_permutex2var_ps(z1, idx, z2); + return i; +} + +/* + * Approximate arctan(x) via polynomial expansion + * on the interval [-1, 1] + * + * Maximum relative error ~6.5e-7 + * Polynomial evaluated via Horner's method + */ +static inline __m512 _mm512_arctan_poly_avx512(const __m512 x) +{ + const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f); + const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f); + const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f); + const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f); + const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f); + const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f); + const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f); + + const __m512 x_times_x = _mm512_mul_ps(x, x); + __m512 arctan; + arctan = a13; + arctan = _mm512_fmadd_ps(x_times_x, arctan, a11); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a9); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a7); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a5); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a3); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a1); + arctan = _mm512_mul_ps(x, arctan); + + return arctan; +} + +#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */ diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index 2fc0f064..c6c7a2c5 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -9,7 +9,7 @@ */ /* - * This file is intended to hold AVX intrinsics of intrinsics. + * This file is intended to hold AVX intrinsics. * They should be used in VOLK kernels to avoid copy-pasta. */ @@ -24,7 +24,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m256 _m256_arctan_poly_avx(const __m256 x) +static inline __m256 _mm256_arctan_poly_avx(const __m256 x) { const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index dc5987cb..ec078826 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2014 Free Software Foundation, Inc. - * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * @@ -13,19 +13,19 @@ * * \b Overview * - * Computes arcsine of input vector and stores results in output vector. + * Computes arctan of input vector and stores results in output vector. * * Dispatcher Prototype * \code - * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points) + * void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points) * \endcode * * \b Inputs - * \li aVector: The input vector of floats. + * \li in_ptr: The input vector of floats. * \li num_points: The number of data points. * * \b Outputs - * \li bVector: The vector where results will be stored. + * \li out_ptr: The vector where results will be stored. * * \b Example * Calculate common angles around the top half of the unit circle. @@ -59,6 +59,64 @@ #ifndef INCLUDED_volk_32f_atan_32f_a_H #define INCLUDED_volk_32f_atan_32f_a_H +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) +{ + unsigned int number = 0; + for (; number < num_points; number++) { + *out++ = atanf(*in++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) +{ + unsigned int number = 0; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void +volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) +{ + const __m512 one = _mm512_set1_ps(1.f); + const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 x = _mm512_load_ps(in); + __mmask16 swap_mask = + _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), + _mm512_mask_blend_ps(swap_mask, one, x)); + __m512 result = _mm512_arctan_poly_avx512(x_star); + __m512 term = _mm512_and_ps(x_star, sign_mask); + term = _mm512_or_ps(pi_over_2, term); + term = _mm512_sub_ps(term, result); + result = _mm512_mask_blend_ps(swap_mask, result, term); + _mm512_store_ps(out, result); + in += 16; + out += 16; + } + + number = sixteenth_points * 16; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_AVX512F for aligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -77,7 +135,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx2_fma(x_star); + __m256 result = _mm256_arctan_poly_avx2_fma(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -112,7 +170,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx(x_star); + __m256 result = _mm256_arctan_poly_avx(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -168,6 +226,42 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) #ifndef INCLUDED_volk_32f_atan_32f_u_H #define INCLUDED_volk_32f_atan_32f_u_H +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void +volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) +{ + const __m512 one = _mm512_set1_ps(1.f); + const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 x = _mm512_loadu_ps(in); + __mmask16 swap_mask = + _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), + _mm512_mask_blend_ps(swap_mask, one, x)); + __m512 result = _mm512_arctan_poly_avx512(x_star); + __m512 term = _mm512_and_ps(x_star, sign_mask); + term = _mm512_or_ps(pi_over_2, term); + term = _mm512_sub_ps(term, result); + result = _mm512_mask_blend_ps(swap_mask, result, term); + _mm512_storeu_ps(out, result); + in += 16; + out += 16; + } + + number = sixteenth_points * 16; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_AVX512F for unaligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include static inline void @@ -185,7 +279,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx2_fma(x_star); + __m256 result = _mm256_arctan_poly_avx2_fma(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -219,7 +313,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx(x_star); + __m256 result = _mm256_arctan_poly_avx(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -271,26 +365,4 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) } #endif /* LV_HAVE_SSE4_1 for unaligned */ -#ifdef LV_HAVE_GENERIC -static inline void -volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; - for (; number < num_points; number++) { - *out++ = volk_arctan(*in++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_GENERIC -static inline void -volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; - for (; number < num_points; number++) { - *out++ = atanf(*in++); - } -} -#endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32f_atan_32f_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 759db24c..41492541 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2012, 2014 Free Software Foundation, Inc. - * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * @@ -100,6 +100,66 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector, } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) +{ + const float* in = (float*)complexVector; + float* out = (float*)outputVector; + + const float invNormalizeFactor = 1.f / normalizeFactor; + const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor); + const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f); + const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + const __m512 zero = _mm512_setzero_ps(); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 z1 = _mm512_load_ps(in); + in += 16; + __m512 z2 = _mm512_load_ps(in); + in += 16; + + __m512 x = _mm512_real(z1, z2); + __m512 y = _mm512_imag(z1, z2); + + __mmask16 swap_mask = _mm512_cmp_ps_mask( + _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS); + __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x), + _mm512_mask_blend_ps(swap_mask, x, y)); + __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q); + input = _mm512_mask_blend_ps(nan_mask, input, zero); + __m512 result = _mm512_arctan_poly_avx512(input); + + input = + _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result); + result = _mm512_mask_blend_ps(swap_mask, result, input); + + __m512 x_sign_mask = + _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31)); + + result = _mm512_add_ps( + _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask), + result); + result = _mm512_mul_ps(result, vinvNormalizeFactor); + + _mm512_store_ps(out, result); + out += 16; + } + + number = sixteenth_points * 16; + volk_32fc_s32f_atan2_32f_polynomial( + out, (lv_32fc_t*)in, normalizeFactor, num_points - number); +} +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -136,7 +196,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx2_fma(input); + __m256 result = _mm256_arctan_poly_avx2_fma(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -196,7 +256,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx(input); + __m256 result = _mm256_arctan_poly_avx(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -224,6 +284,66 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, #ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H #define INCLUDED_volk_32fc_s32f_atan2_32f_u_H +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) +{ + const float* in = (float*)complexVector; + float* out = (float*)outputVector; + + const float invNormalizeFactor = 1.f / normalizeFactor; + const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor); + const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f); + const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + const __m512 zero = _mm512_setzero_ps(); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 z1 = _mm512_loadu_ps(in); + in += 16; + __m512 z2 = _mm512_loadu_ps(in); + in += 16; + + __m512 x = _mm512_real(z1, z2); + __m512 y = _mm512_imag(z1, z2); + + __mmask16 swap_mask = _mm512_cmp_ps_mask( + _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS); + __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x), + _mm512_mask_blend_ps(swap_mask, x, y)); + __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q); + input = _mm512_mask_blend_ps(nan_mask, input, zero); + __m512 result = _mm512_arctan_poly_avx512(input); + + input = + _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result); + result = _mm512_mask_blend_ps(swap_mask, result, input); + + __m512 x_sign_mask = + _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31)); + + result = _mm512_add_ps( + _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask), + result); + result = _mm512_mul_ps(result, vinvNormalizeFactor); + + _mm512_storeu_ps(out, result); + out += 16; + } + + number = sixteenth_points * 16; + volk_32fc_s32f_atan2_32f_polynomial( + out, (lv_32fc_t*)in, normalizeFactor, num_points - number); +} +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -260,7 +380,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx2_fma(input); + __m256 result = _mm256_arctan_poly_avx2_fma(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -320,7 +440,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx(input); + __m256 result = _mm256_arctan_poly_avx(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); From 4feceb5284bd08091611ce54d04bbde7df5b9934 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 26 Feb 2024 00:42:56 +0100 Subject: [PATCH 2/4] Removed comment, simplified Signed-off-by: Magnus Lundmark --- include/volk/volk_avx512_intrinsics.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h index a6fd87ac..0bac1c6a 100644 --- a/include/volk/volk_avx512_intrinsics.h +++ b/include/volk/volk_avx512_intrinsics.h @@ -18,19 +18,16 @@ static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) { - // r = z1_0 z1_2 ... z1_6 z2_0 z2_2 ... z2_6 const __m512i idx = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); - const __m512 r = _mm512_permutex2var_ps(z1, idx, z2); - return r; + return _mm512_permutex2var_ps(z1, idx, z2); } static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) { const __m512i idx = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); - const __m512 i = _mm512_permutex2var_ps(z1, idx, z2); - return i; + return _mm512_permutex2var_ps(z1, idx, z2); } /* From 319387d1a39af4ad9d8bf5edcdaa96313fe9b153 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 26 Feb 2024 00:47:54 +0100 Subject: [PATCH 3/4] Restored file Signed-off-by: Magnus Lundmark --- CMakeLists.txt | 89 +++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a8a460e..823f0942 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,14 @@ # -#Copyright 2011 - 2020 Free Software Foundation, Inc. -#Copyright 2023 Magnus Lundmark < magnuslundmark @gmail.com> +# Copyright 2011-2020 Free Software Foundation, Inc. +# Copyright 2023 Magnus Lundmark # -#This file is part of VOLK +# This file is part of VOLK # -#SPDX - License - Identifier : LGPL - 3.0 - or -later +# SPDX-License-Identifier: LGPL-3.0-or-later # ######################################################################## -#Project setup +# Project setup ######################################################################## cmake_minimum_required(VERSION 3.8) set(CMAKE_BUILD_TYPE @@ -25,10 +25,10 @@ set(CMAKE_CXX_STANDARD 17) enable_testing() ######################################################################## -#Common compile flags +# Common compile flags ######################################################################## -#Disable complex math NaN / INFO range checking for performance +# Disable complex math NaN/INFO range checking for performance include(CheckCXXCompilerFlag) check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE) if(HAVE_CX_LIMITED_RANGE) @@ -46,15 +46,15 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") -#Abort compilation if kernel implementations have inconsistent function -#prototypes, i.e.if -# -#kernel_foo_sse(uint32_t* dst, lv32fc_t* src) -#kernel_foo_avx(uint16_t* dst, lv32fc_t* src) -# -#are defined.Note the different data type of the first argument).By -#default 'incompatible-pointer-types' is a warning only and 'pointer-sign' -#is a warning enabled by '-Wall'.These warnings are only applicable to C. + # Abort compilation if kernel implementations have inconsistent function + # prototypes, i.e. if + # + # kernel_foo_sse(uint32_t *dst, lv32fc_t *src) + # kernel_foo_avx(uint16_t *dst, lv32fc_t *src) + # + # are defined. Note the different data type of the first argument). By + # default 'incompatible-pointer-types' is a warning only and 'pointer-sign' + # is a warning enabled by '-Wall'. These warnings are only applicable to C. set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign") endif() @@ -77,7 +77,7 @@ set(CMAKE_BUILD_TYPE message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") ######################################################################## -#Version setup +# Version setup ######################################################################## set(VERSION_INFO_MAJOR_VERSION 3) @@ -87,14 +87,13 @@ include(VolkVersion) #setup version info math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000 + ${VERSION_INFO_MINOR_VERSION} * 100 - + ${ - VERSION_INFO_MAINT_VERSION}") + + ${VERSION_INFO_MAINT_VERSION}") configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in ${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY) ######################################################################## -#Environment setup +# Environment setup ######################################################################## if(NOT DEFINED CROSSCOMPILE_MULTILIB) set(CROSSCOMPILE_MULTILIB "") @@ -117,10 +116,10 @@ if(MSVC) endif(MSVC) ######################################################################## -#Dependencies setup +# Dependencies setup ######################################################################## -#cpu_features - sensible defaults, user settable option +# cpu_features - sensible defaults, user settable option if(CMAKE_SYSTEM_PROCESSOR MATCHES "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)|(^riscv)") option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON) @@ -159,7 +158,7 @@ else() message(STATUS "Building Volk without cpu_features") endif() -#Python +# Python include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)" PYTHON_MIN_VER_FOUND) @@ -169,12 +168,12 @@ if(NOT PYTHON_MIN_VER_FOUND) message(FATAL_ERROR "Python 3.4 or greater required to build VOLK") endif() -#Mako +# Mako if(NOT MAKO_FOUND) message(FATAL_ERROR "Mako templates required to build VOLK") endif() -#Check if we have std::filesystem +# Check if we have std::filesystem find_package( FILESYSTEM COMPONENTS Final Experimental @@ -184,9 +183,9 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) ######################################################################## -#check for aligned_alloc, since some compilers lack this C11 feature. -#For Apple - clang use `posix_memalign` -#For MSVC use `_aligned_malloc`. +# check for aligned_alloc, since some compilers lack this C11 feature. +# For Apple-clang use `posix_memalign` +# For MSVC use `_aligned_malloc`. ######################################################################## include(CheckSymbolExists) if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")) @@ -197,7 +196,7 @@ if(NOT USE_ALIGNED_ALLOC) endif() ######################################################################## -#Check if Orc is available +# Check if Orc is available ######################################################################## option(ENABLE_ORC "Enable Orc" True) if(ENABLE_ORC) @@ -207,17 +206,17 @@ else(ENABLE_ORC) endif(ENABLE_ORC) ######################################################################## -#Setup doxygen +# Setup doxygen ######################################################################## add_subdirectory(docs) ######################################################################## -#Detect / lib versus / lib64 +# Detect /lib versus /lib64 ######################################################################## include(GNUInstallDirs) ######################################################################## -#Setup the package config file +# Setup the package config file ######################################################################## #set variables found in the pc.in file set(prefix ${CMAKE_INSTALL_PREFIX}) @@ -234,7 +233,7 @@ install( COMPONENT "volk_devel") ######################################################################## -#Install all headers in the include directories +# Install all headers in the include directories ######################################################################## set(VOLK_RUNTIME_DIR bin) set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR}) @@ -271,7 +270,7 @@ install( COMPONENT "volk_devel") ######################################################################## -#On Apple only, set install name and use rpath correctly, if not already set +# On Apple only, set install name and use rpath correctly, if not already set ######################################################################## if(APPLE) if(NOT CMAKE_INSTALL_NAME_DIR) @@ -292,21 +291,21 @@ if(APPLE) endif(APPLE) ######################################################################## -#Create uninstall target +# Create uninstall target ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY) -#Only add the target if there isn't one defined already +# Only add the target if there isn't one defined already if(NOT TARGET uninstall) add_custom_target(uninstall ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() ######################################################################## -#Install our Cmake modules into $prefix / lib / cmake / volk -#See "Package Configuration Files" on page: -#http: // www.cmake.org/Wiki/CMake/Tutorials/Packaging +# Install our Cmake modules into $prefix/lib/cmake/volk +# See "Package Configuration Files" on page: +# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in @@ -316,7 +315,7 @@ configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY) ######################################################################## -#Install cmake search routine for external use +# Install cmake search routine for external use ######################################################################## if(NOT CMAKE_MODULES_DIR) @@ -336,7 +335,7 @@ install( DESTINATION ${CMAKE_MODULES_DIR}/volk) ######################################################################## -#Option to enable QA testing, on by default +# Option to enable QA testing, on by default ######################################################################## option(ENABLE_TESTING "Enable QA testing" ON) if(ENABLE_TESTING) @@ -347,7 +346,7 @@ endif() message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF") ######################################################################## -#Option to enable post - build profiling using volk_profile, off by default +# Option to enable post-build profiling using volk_profile, off by default ######################################################################## option(ENABLE_PROFILING "Launch system profiler after build" OFF) if(ENABLE_PROFILING) @@ -373,12 +372,12 @@ endif() message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF") ######################################################################## -#Setup the library +# Setup the library ######################################################################## add_subdirectory(lib) ######################################################################## -#And the utility apps +# And the utility apps ######################################################################## add_subdirectory(apps) option(ENABLE_MODTOOL "Enable volk_modtool python utility" True) @@ -387,6 +386,6 @@ if(ENABLE_MODTOOL) endif() ######################################################################## -#Print summary +# Print summary ######################################################################## message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}") From af7e8fcc1f919d4ca6028fc2effe3f9f768a587e Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Thu, 29 Feb 2024 23:24:35 +0100 Subject: [PATCH 4/4] resolved comments Signed-off-by: Magnus Lundmark --- include/volk/volk_avx512_intrinsics.h | 21 +++-- kernels/volk/volk_32f_atan_32f.h | 102 +++++++++++------------- kernels/volk/volk_32fc_s32f_atan2_32f.h | 46 +++++------ 3 files changed, 83 insertions(+), 86 deletions(-) diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h index 0bac1c6a..6f6a05ee 100644 --- a/include/volk/volk_avx512_intrinsics.h +++ b/include/volk/volk_avx512_intrinsics.h @@ -16,6 +16,10 @@ #define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ #include +//////////////////////////////////////////////////////////////////////// +// Place real parts of two complex vectors in output +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) { const __m512i idx = @@ -23,6 +27,10 @@ static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) return _mm512_permutex2var_ps(z1, idx, z2); } +//////////////////////////////////////////////////////////////////////// +// Place imaginary parts of two complex vectors in output +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) { const __m512i idx = @@ -30,13 +38,12 @@ static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) return _mm512_permutex2var_ps(z1, idx, z2); } -/* - * Approximate arctan(x) via polynomial expansion - * on the interval [-1, 1] - * - * Maximum relative error ~6.5e-7 - * Polynomial evaluated via Horner's method - */ +//////////////////////////////////////////////////////////////////////// +// Approximate arctan(x) via polynomial expansion on the interval [-1, 1] +// Maximum relative error ~6.5e-7 +// Polynomial evaluated via Horner's method +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_arctan_poly_avx512(const __m512 x) { const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f); diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index ec078826..03afea55 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -63,8 +63,7 @@ static inline void volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; - for (; number < num_points; number++) { + for (unsigned int number = 0; number < num_points; number++) { *out++ = atanf(*in++); } } @@ -74,8 +73,7 @@ volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) static inline void volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; - for (; number < num_points; number++) { + for (unsigned int number = 0; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -85,17 +83,18 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_point #include #include static inline void -volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) +volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points) { const __m512 one = _mm512_set1_ps(1.f); const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_load_ps(in); + in += 16; __mmask16 swap_mask = _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), @@ -106,16 +105,14 @@ volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) term = _mm512_sub_ps(term, result); result = _mm512_mask_blend_ps(swap_mask, result, term); _mm512_store_ps(out, result); - in += 16; out += 16; } - number = sixteenth_points * 16; - for (; number < num_points; number++) { + for (unsigned int number = sixteenth_points * 16; number < num_points; number++) { *out++ = volk_arctan(*in++); } } -#endif /* LV_HAVE_AVX512F for aligned */ +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include @@ -128,10 +125,11 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -141,12 +139,10 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_store_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -163,10 +159,11 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -176,12 +173,10 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_store_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -198,10 +193,11 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - for (; number < quarter_points; number++) { + const unsigned int quarter_points = num_points / 4; + + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_load_ps(in); + in += 4; __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), _mm_blendv_ps(one, x, swap_mask)); @@ -211,12 +207,10 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) term = _mm_sub_ps(term, result); result = _mm_blendv_ps(result, term, swap_mask); _mm_store_ps(out, result); - in += 4; out += 4; } - number = quarter_points * 4; - for (; number < num_points; number++) { + for (unsigned int number = quarter_points * 4; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -230,17 +224,18 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) #include #include static inline void -volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) +volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points) { const __m512 one = _mm512_set1_ps(1.f); const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_loadu_ps(in); + in += 16; __mmask16 swap_mask = _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), @@ -251,16 +246,14 @@ volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) term = _mm512_sub_ps(term, result); result = _mm512_mask_blend_ps(swap_mask, result, term); _mm512_storeu_ps(out, result); - in += 16; out += 16; } - number = sixteenth_points * 16; - for (; number < num_points; number++) { + for (unsigned int number = sixteenth_points * 16; number < num_points; number++) { *out++ = volk_arctan(*in++); } } -#endif /* LV_HAVE_AVX512F for unaligned */ +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include @@ -272,10 +265,11 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -285,12 +279,10 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_storeu_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -306,10 +298,11 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -319,12 +312,10 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_storeu_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -341,10 +332,11 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - for (; number < quarter_points; number++) { + const unsigned int quarter_points = num_points / 4; + + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_loadu_ps(in); + in += 4; __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), _mm_blendv_ps(one, x, swap_mask)); @@ -354,12 +346,10 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) term = _mm_sub_ps(term, result); result = _mm_blendv_ps(result, term, swap_mask); _mm_storeu_ps(out, result); - in += 4; out += 4; } - number = quarter_points * 4; - for (; number < num_points; number++) { + for (unsigned int number = quarter_points * 4; number < num_points; number++) { *out++ = volk_arctan(*in++); } } diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 41492541..5e8be5ce 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -72,8 +72,8 @@ static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.f / normalizeFactor; - unsigned int number = 0; - for (; number < num_points; number++) { + + for (unsigned int number = 0; number < num_points; number++) { const float real = *inPtr++; const float imag = *inPtr++; *outPtr++ = atan2f(imag, real) * invNormalizeFactor; @@ -91,8 +91,8 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector, float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.f / normalizeFactor; - unsigned int number = 0; - for (; number < num_points; number++) { + + for (unsigned int number = 0; number < num_points; number++) { const float x = *inPtr++; const float y = *inPtr++; *outPtr++ = volk_atan2(y, x) * invNormalizeFactor; @@ -103,10 +103,10 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector, #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ #include #include -static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, - const lv_32fc_t* complexVector, - const float normalizeFactor, - unsigned int num_points) +static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) { const float* in = (float*)complexVector; float* out = (float*)outputVector; @@ -120,7 +120,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, const __m512 zero = _mm512_setzero_ps(); unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; + const unsigned int sixteenth_points = num_points / 16; for (; number < sixteenth_points; number++) { __m512 z1 = _mm512_load_ps(in); in += 16; @@ -156,7 +156,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, number = sixteenth_points * 16; volk_32fc_s32f_atan2_32f_polynomial( - out, (lv_32fc_t*)in, normalizeFactor, num_points - number); + out, complexVector + number, normalizeFactor, num_points - number); } #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ @@ -180,7 +180,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_load_ps(in); in += 8; @@ -240,7 +240,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_load_ps(in); in += 8; @@ -287,10 +287,10 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ #include #include -static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, - const lv_32fc_t* complexVector, - const float normalizeFactor, - unsigned int num_points) +static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) { const float* in = (float*)complexVector; float* out = (float*)outputVector; @@ -303,9 +303,9 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); const __m512 zero = _mm512_setzero_ps(); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 z1 = _mm512_loadu_ps(in); in += 16; __m512 z2 = _mm512_loadu_ps(in); @@ -338,9 +338,9 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, out += 16; } - number = sixteenth_points * 16; + unsigned int number = sixteenth_points * 16; volk_32fc_s32f_atan2_32f_polynomial( - out, (lv_32fc_t*)in, normalizeFactor, num_points - number); + out, complexVector + number, normalizeFactor, num_points - number); } #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ @@ -364,7 +364,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_loadu_ps(in); in += 8; @@ -424,7 +424,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_loadu_ps(in); in += 8;