diff --git a/CMakeLists.txt b/CMakeLists.txt
index be1dd6ebc..d9141b154 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,6 @@
 #
 # Copyright 2011-2020 Free Software Foundation, Inc.
+# Copyright 2023 Magnus Lundmark
 #
 # This file is part of VOLK
 #
@@ -249,6 +250,7 @@ install(FILES
     ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
+    ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
     ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h
new file mode 100644
index 000000000..03b24e6c0
--- /dev/null
+++ b/include/volk/volk_avx2_fma_intrinsics.h
@@ -0,0 +1,50 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2023 Magnus Lundmark
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX2 FMA intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
+#include <immintrin.h>
+
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
+{
+    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
+    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
+    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
+    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
+    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
+    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
+    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
+
+    const __m256 x_times_x = _mm256_mul_ps(x, x);
+    __m256 arctan;
+    arctan = a13;
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
+    arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
+    arctan = _mm256_mul_ps(x, arctan);
+
+    return arctan;
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
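
For context, a minimal standalone harness (hypothetical, not part of this patch) that exercises the new helper on eight values in [-1, 1] and compares against atanf(). It assumes the header is on the include path and an AVX2+FMA build (e.g. gcc -mavx2 -mfma). Note that the polynomial is only valid on [-1, 1]; the kernels further down handle |x| > 1 via range reduction.

/* hypothetical test harness, not part of the patch */
#include <immintrin.h>
#include <math.h>
#include <stdio.h>
#include <volk/volk_avx2_fma_intrinsics.h>

int main(void)
{
    float in[8] = { -1.f, -0.75f, -0.5f, -0.25f, 0.f, 0.25f, 0.5f, 1.f };
    float out[8];
    _mm256_storeu_ps(out, _m256_arctan_poly_avx2_fma(_mm256_loadu_ps(in)));
    for (int i = 0; i < 8; i++) {
        printf("%+.2f: poly=%+.8f atanf=%+.8f\n", in[i], out[i], atanf(in[i]));
    }
    return 0;
}
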
diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h
index 0de88d84d..2fc0f064e 100644
--- a/include/volk/volk_avx_intrinsics.h
+++ b/include/volk/volk_avx_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark
  *
  * This file is part of VOLK
  *
@@ -16,6 +17,43 @@
 #define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
 #include <immintrin.h>
 
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m256 _m256_arctan_poly_avx(const __m256 x)
+{
+    const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
+    const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
+    const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
+    const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
+    const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
+    const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
+    const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
+
+    const __m256 x_times_x = _mm256_mul_ps(x, x);
+    __m256 arctan;
+    arctan = a13;
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a11);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a9);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a7);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a5);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a3);
+    arctan = _mm256_mul_ps(x_times_x, arctan);
+    arctan = _mm256_add_ps(arctan, a1);
+    arctan = _mm256_mul_ps(x, arctan);
+
+    return arctan;
+}
+
 static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
 {
     __m256 yl, yh, tmp1, tmp2;
diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h
index 55a814532..dc217cc97 100644
--- a/include/volk/volk_common.h
+++ b/include/volk/volk_common.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark
  *
  * This file is part of VOLK
  *
@@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f)
 // Constant used to do log10 calculations as faster log2
 ////////////////////////////////////////////////////////////////////////
 // precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
-#define volk_log2to10factor 3.01029995663981209120
+#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120
+
+////////////////////////////////////////////////////////////////////////
+// arctan(x)
+////////////////////////////////////////////////////////////////////////
+static inline float volk_arctan_poly(const float x)
+{
+    /*
+     * arctan(x) polynomial expansion on the interval [-1, 1]
+     * Maximum relative error < 6.6e-7
+     */
+    const float a1 = +0x1.ffffeap-1f;
+    const float a3 = -0x1.55437p-2f;
+    const float a5 = +0x1.972be6p-3f;
+    const float a7 = -0x1.1436ap-3f;
+    const float a9 = +0x1.5785aap-4f;
+    const float a11 = -0x1.2f3004p-5f;
+    const float a13 = +0x1.01a37cp-7f;
+
+    const float x_times_x = x * x;
+    float arctan = a13;
+    arctan = fmaf(x_times_x, arctan, a11);
+    arctan = fmaf(x_times_x, arctan, a9);
+    arctan = fmaf(x_times_x, arctan, a7);
+    arctan = fmaf(x_times_x, arctan, a5);
+    arctan = fmaf(x_times_x, arctan, a3);
+    arctan = fmaf(x_times_x, arctan, a1);
+    arctan *= x;
+
+    return arctan;
+}
+
+static inline float volk_arctan(const float x)
+{
+    /*
+     * arctan(x) + arctan(1 / x) == sign(x) * pi / 2
+     */
+    const float pi_over_2 = 0x1.921fb6p0f;
+
+    if (fabsf(x) < 1.f) {
+        return volk_arctan_poly(x);
+    } else {
+        return copysignf(pi_over_2, x) - volk_arctan_poly(1.f / x);
+    }
+}
 
 #endif /*INCLUDED_LIBVOLK_COMMON_H*/
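
The scalar volk_arctan() above reduces |x| >= 1 into the polynomial's domain via the identity arctan(x) + arctan(1/x) == sign(x) * pi/2. A quick spot-check of the stated error bound (hypothetical, not part of this patch; assumes VOLK's headers are on the include path):

/* hypothetical accuracy spot-check, not part of the patch */
#include <math.h>
#include <stdio.h>
#include <volk/volk_common.h>

int main(void)
{
    float max_rel_err = 0.f;
    for (float x = -100.f; x <= 100.f; x += 0.001f) {
        float ref = atanf(x);
        if (ref == 0.f)
            continue;
        float rel = fabsf((volk_arctan(x) - ref) / ref);
        if (rel > max_rel_err)
            max_rel_err = rel;
    }
    /* expected to land near the documented ~6.6e-7 bound */
    printf("max relative error: %e\n", max_rel_err);
    return 0;
}
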
diff --git a/include/volk/volk_sse_intrinsics.h b/include/volk/volk_sse_intrinsics.h
index ce900651c..0ede17845 100644
--- a/include/volk/volk_sse_intrinsics.h
+++ b/include/volk/volk_sse_intrinsics.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2015 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark
  *
  * This file is part of VOLK
  *
@@ -16,6 +17,43 @@
 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
 #include <xmmintrin.h>
 
+/*
+ * Approximate arctan(x) via polynomial expansion
+ * on the interval [-1, 1]
+ *
+ * Maximum relative error ~6.5e-7
+ * Polynomial evaluated via Horner's method
+ */
+static inline __m128 _mm_arctan_poly_sse(const __m128 x)
+{
+    const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
+    const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
+    const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
+    const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
+    const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
+    const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
+    const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);
+
+    const __m128 x_times_x = _mm_mul_ps(x, x);
+    __m128 arctan;
+    arctan = a13;
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a11);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a9);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a7);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a5);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a3);
+    arctan = _mm_mul_ps(x_times_x, arctan);
+    arctan = _mm_add_ps(arctan, a1);
+    arctan = _mm_mul_ps(x, arctan);
+
+    return arctan;
+}
+
 static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
 {
     __m128 iValue, qValue;
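
The a1..a13 constants used by all three helpers are C99 hexadecimal floating-point literals: 0x1.55437p-2f means 1.55437 (hex) times 2 to the -2. A throwaway snippet (hypothetical, not part of this patch) that prints them in decimal, for anyone wanting to cross-check against their own minimax fit:

/* hypothetical helper to print the coefficients in decimal */
#include <stdio.h>

int main(void)
{
    const char* names[7] = { "a1", "a3", "a5", "a7", "a9", "a11", "a13" };
    const float coeffs[7] = { +0x1.ffffeap-1f, -0x1.55437p-2f, +0x1.972be6p-3f,
                              -0x1.1436ap-3f,  +0x1.5785aap-4f, -0x1.2f3004p-5f,
                              +0x1.01a37cp-7f };
    for (int i = 0; i < 7; i++) {
        printf("%-3s = %+.9f\n", names[i], coeffs[i]);
    }
    return 0;
}
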
diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index 96078f4a1..dc5987cb8 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark
  *
  * This file is part of VOLK
  *
@@ -53,210 +54,115 @@
  * volk_free(out);
  * \endcode
  */
-
-#include <inttypes.h>
 #include <math.h>
-#include <stdio.h>
-
-/* This is the number of terms of Taylor series to evaluate, increase this for more
- * accuracy*/
-#define TERMS 2
 
 #ifndef INCLUDED_volk_32f_atan_32f_a_H
 #define INCLUDED_volk_32f_atan_32f_a_H
 
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
-
-static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
-                                                const float* aVector,
-                                                unsigned int num_points)
+#include <volk/volk_avx2_fma_intrinsics.h>
+static inline void
+volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m256 one = _mm256_set1_ps(1.f);
+    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int eighthPoints = num_points / 8;
-    int i, j;
-
-    __m256 aVal, pio2, x, y, z, arctangent;
-    __m256 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm256_setzero_ps();
-    fones = _mm256_set1_ps(1.0);
-    ftwos = _mm256_set1_ps(2.0);
-    ffours = _mm256_set1_ps(4.0);
-
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_load_ps(aPtr);
-        z = aVal;
-        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-        x = _mm256_add_ps(
-            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
-        }
-        x = _mm256_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm256_fmadd_ps(
-                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
-        arctangent = y;
-        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-        arctangent = _mm256_sub_ps(
-            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-        _mm256_store_ps(bPtr, arctangent);
-        aPtr += 8;
-        bPtr += 8;
+    unsigned int eighth_points = num_points / 8;
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
+                                      _mm256_blendv_ps(one, x, swap_mask));
+        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+        __m256 term = _mm256_and_ps(x_star, sign_mask);
+        term = _mm256_or_ps(pi_over_2, term);
+        term = _mm256_sub_ps(term, result);
+        result = _mm256_blendv_ps(result, term, swap_mask);
+        _mm256_store_ps(out, result);
+        in += 8;
+        out += 8;
     }
 
-    number = eighthPoints * 8;
+    number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *bPtr++ = atan(*aPtr++);
+        *out++ = volk_arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
 
-
-#ifdef LV_HAVE_AVX
+#if LV_HAVE_AVX
 #include <immintrin.h>
-
+#include <volk/volk_avx_intrinsics.h>
 static inline void
-volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_a_avx(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m256 one = _mm256_set1_ps(1.f);
+    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int eighthPoints = num_points / 8;
-    int i, j;
-
-    __m256 aVal, pio2, x, y, z, arctangent;
-    __m256 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm256_setzero_ps();
-    fones = _mm256_set1_ps(1.0);
-    ftwos = _mm256_set1_ps(2.0);
-    ffours = _mm256_set1_ps(4.0);
-
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_load_ps(aPtr);
-        z = aVal;
-        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-        x = _mm256_add_ps(
-            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm256_add_ps(x,
-                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
-        }
-        x = _mm256_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
-                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-        y = _mm256_add_ps(
-            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-        arctangent = y;
-        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-        arctangent = _mm256_sub_ps(
-            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-        _mm256_store_ps(bPtr, arctangent);
-        aPtr += 8;
-        bPtr += 8;
+    unsigned int eighth_points = num_points / 8;
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_load_ps(in);
+        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
+                                      _mm256_blendv_ps(one, x, swap_mask));
+        __m256 result = _m256_arctan_poly_avx(x_star);
+        __m256 term = _mm256_and_ps(x_star, sign_mask);
+        term = _mm256_or_ps(pi_over_2, term);
+        term = _mm256_sub_ps(term, result);
+        result = _mm256_blendv_ps(result, term, swap_mask);
+        _mm256_store_ps(out, result);
+        in += 8;
+        out += 8;
    }
 
-    number = eighthPoints * 8;
+    number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *bPtr++ = atan(*aPtr++);
+        *out++ = volk_arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_AVX for aligned */
 
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
-
+#include <volk/volk_sse_intrinsics.h>
 static inline void
-volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m128 one = _mm_set1_ps(1.f);
+    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
+    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int quarterPoints = num_points / 4;
-    int i, j;
-
-    __m128 aVal, pio2, x, y, z, arctangent;
-    __m128 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm_setzero_ps();
-    fones = _mm_set1_ps(1.0);
-    ftwos = _mm_set1_ps(2.0);
-    ffours = _mm_set1_ps(4.0);
-
-    for (; number < quarterPoints; number++) {
-        aVal = _mm_load_ps(aPtr);
-        z = aVal;
-        condition = _mm_cmplt_ps(z, fzeroes);
-        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-        condition = _mm_cmplt_ps(z, fones);
-        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-        }
-        x = _mm_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
-                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-        condition = _mm_cmpgt_ps(z, fones);
-
-        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-        arctangent = y;
-        condition = _mm_cmplt_ps(aVal, fzeroes);
-        arctangent =
-            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
-        _mm_store_ps(bPtr, arctangent);
-        aPtr += 4;
-        bPtr += 4;
+    unsigned int quarter_points = num_points / 4;
+    for (; number < quarter_points; number++) {
+        __m128 x = _mm_load_ps(in);
+        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
+        __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
+                                   _mm_blendv_ps(one, x, swap_mask));
+        __m128 result = _mm_arctan_poly_sse(x_star);
+        __m128 term = _mm_and_ps(x_star, sign_mask);
+        term = _mm_or_ps(pi_over_2, term);
+        term = _mm_sub_ps(term, result);
+        result = _mm_blendv_ps(result, term, swap_mask);
+        _mm_store_ps(out, result);
+        in += 4;
+        out += 4;
     }
 
-    number = quarterPoints * 4;
+    number = quarter_points * 4;
     for (; number < num_points; number++) {
-        *bPtr++ = atanf(*aPtr++);
+        *out++ = volk_arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_SSE4_1 for aligned */
-
 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
 
 #ifndef INCLUDED_volk_32f_atan_32f_u_H
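
All the SIMD kernels in this file share one branchless structure: swap_mask flags lanes with |x| > 1; the two blends build x_star = x (unswapped) or 1/x (swapped) using a single division; the polynomial runs on x_star; and swapped lanes are folded back through sign(x_star) * pi/2 - poly(x_star). A scalar mock-up of the same data flow (hypothetical, not part of this patch):

/* hypothetical scalar mock-up of the SIMD lane logic */
#include <math.h>
#include <volk/volk_common.h>

static float atan_branchless(float x)
{
    const float pi_over_2 = 0x1.921fb6p0f;
    int swap = fabsf(x) > 1.f;                 /* swap_mask               */
    float x_star = swap ? 1.f / x : x;         /* blend of x and 1/x      */
    float result = volk_arctan_poly(x_star);   /* polynomial on [-1, 1]   */
    float term = copysignf(pi_over_2, x_star); /* (x_star & sign) | pi/2  */
    return swap ? term - result : result;      /* final blend             */
}
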
@@ -264,205 +170,125 @@ volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu
 #define INCLUDED_volk_32f_atan_32f_u_H
 
 #if LV_HAVE_AVX2 && LV_HAVE_FMA
 #include <immintrin.h>
-
-static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
-                                                const float* aVector,
-                                                unsigned int num_points)
+static inline void
+volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m256 one = _mm256_set1_ps(1.f);
+    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int eighthPoints = num_points / 8;
-    int i, j;
-
-    __m256 aVal, pio2, x, y, z, arctangent;
-    __m256 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm256_setzero_ps();
-    fones = _mm256_set1_ps(1.0);
-    ftwos = _mm256_set1_ps(2.0);
-    ffours = _mm256_set1_ps(4.0);
-
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_loadu_ps(aPtr);
-        z = aVal;
-        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-        x = _mm256_add_ps(
-            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
-        }
-        x = _mm256_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm256_fmadd_ps(
-                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
-        arctangent = y;
-        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-        arctangent = _mm256_sub_ps(
-            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-        _mm256_storeu_ps(bPtr, arctangent);
-        aPtr += 8;
-        bPtr += 8;
+    unsigned int eighth_points = num_points / 8;
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
+                                      _mm256_blendv_ps(one, x, swap_mask));
+        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
+        __m256 term = _mm256_and_ps(x_star, sign_mask);
+        term = _mm256_or_ps(pi_over_2, term);
+        term = _mm256_sub_ps(term, result);
+        result = _mm256_blendv_ps(result, term, swap_mask);
+        _mm256_storeu_ps(out, result);
+        in += 8;
+        out += 8;
     }
 
-    number = eighthPoints * 8;
+    number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *bPtr++ = atan(*aPtr++);
+        *out++ = volk_arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
 
-
-#ifdef LV_HAVE_AVX
+#if LV_HAVE_AVX
 #include <immintrin.h>
-
 static inline void
-volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_u_avx(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m256 one = _mm256_set1_ps(1.f);
+    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int eighthPoints = num_points / 8;
-    int i, j;
-
-    __m256 aVal, pio2, x, y, z, arctangent;
-    __m256 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm256_setzero_ps();
-    fones = _mm256_set1_ps(1.0);
-    ftwos = _mm256_set1_ps(2.0);
-    ffours = _mm256_set1_ps(4.0);
-
-    for (; number < eighthPoints; number++) {
-        aVal = _mm256_loadu_ps(aPtr);
-        z = aVal;
-        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
-        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
-        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
-        x = _mm256_add_ps(
-            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++) {
-            x = _mm256_add_ps(x,
-                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
-        }
-        x = _mm256_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--) {
-            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
-                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
-        }
-
-        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
-        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
-
-        y = _mm256_add_ps(
-            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
-        arctangent = y;
-        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
-        arctangent = _mm256_sub_ps(
-            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
-
-        _mm256_storeu_ps(bPtr, arctangent);
-        aPtr += 8;
-        bPtr += 8;
+    unsigned int eighth_points = num_points / 8;
+    for (; number < eighth_points; number++) {
+        __m256 x = _mm256_loadu_ps(in);
+        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
+        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
+                                      _mm256_blendv_ps(one, x, swap_mask));
+        __m256 result = _m256_arctan_poly_avx(x_star);
+        __m256 term = _mm256_and_ps(x_star, sign_mask);
+        term = _mm256_or_ps(pi_over_2, term);
+        term = _mm256_sub_ps(term, result);
+        result = _mm256_blendv_ps(result, term, swap_mask);
+        _mm256_storeu_ps(out, result);
+        in += 8;
+        out += 8;
     }
 
-    number = eighthPoints * 8;
+    number = eighth_points * 8;
     for (; number < num_points; number++) {
-        *bPtr++ = atan(*aPtr++);
+        *out++ = volk_arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_AVX for unaligned */
 
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
-
+#include <volk/volk_sse_intrinsics.h>
 static inline void
-volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
+    const __m128 one = _mm_set1_ps(1.f);
+    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
+    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 
     unsigned int number = 0;
-    unsigned int quarterPoints = num_points / 4;
-    int i, j;
-
-    __m128 aVal, pio2, x, y, z, arctangent;
-    __m128 fzeroes, fones, ftwos, ffours, condition;
-
-    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
-    fzeroes = _mm_setzero_ps();
-    fones = _mm_set1_ps(1.0);
-    ftwos = _mm_set1_ps(2.0);
-    ffours = _mm_set1_ps(4.0);
-
-    for (; number < quarterPoints; number++) {
-        aVal = _mm_loadu_ps(aPtr);
-        z = aVal;
-        condition = _mm_cmplt_ps(z, fzeroes);
-        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-        condition = _mm_cmplt_ps(z, fones);
-        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
-
-        for (i = 0; i < 2; i++)
-            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-        x = _mm_div_ps(fones, x);
-        y = fzeroes;
-        for (j = TERMS - 1; j >= 0; j--)
-            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
-                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
-
-        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-        condition = _mm_cmpgt_ps(z, fones);
-
-        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
-        arctangent = y;
-        condition = _mm_cmplt_ps(aVal, fzeroes);
-        arctangent =
-            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
-        _mm_storeu_ps(bPtr, arctangent);
-        aPtr += 4;
-        bPtr += 4;
+    unsigned int quarter_points = num_points / 4;
+    for (; number < quarter_points; number++) {
+        __m128 x = _mm_loadu_ps(in);
+        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
+        __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
+                                   _mm_blendv_ps(one, x, swap_mask));
+        __m128 result = _mm_arctan_poly_sse(x_star);
+        __m128 term = _mm_and_ps(x_star, sign_mask);
+        term = _mm_or_ps(pi_over_2, term);
+        term = _mm_sub_ps(term, result);
+        result = _mm_blendv_ps(result, term, swap_mask);
+        _mm_storeu_ps(out, result);
+        in += 4;
+        out += 4;
     }
 
-    number = quarterPoints * 4;
+    number = quarter_points * 4;
     for (; number < num_points; number++) {
-        *bPtr++ = atanf(*aPtr++);
+        *out++ = volk_arctan(*in++);
     }
 }
-
 #endif /* LV_HAVE_SSE4_1 for unaligned */
 
 #ifdef LV_HAVE_GENERIC
-
 static inline void
-volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
 {
-    float* bPtr = bVector;
-    const float* aPtr = aVector;
     unsigned int number = 0;
+    for (; number < num_points; number++) {
+        *out++ = volk_arctan(*in++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
 
-    for (number = 0; number < num_points; number++) {
-        *bPtr++ = atanf(*aPtr++);
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
+{
+    unsigned int number = 0;
+    for (; number < num_points; number++) {
+        *out++ = atanf(*in++);
     }
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index d7e4ad179..530beb1ee 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -1,6 +1,7 @@
 /* -*- c++ -*- */
 /*
  * Copyright 2014 - 2021 Free Software Foundation, Inc.
+ * Copyright 2023 Magnus Lundmark
  *
  * This file is part of VOLK
  *
@@ -85,7 +86,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
         QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
         QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
         QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
-        QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
+        QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params))
         QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
         QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
         QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power))
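
End-to-end, the new protokernels are reached through the usual VOLK dispatcher. A short usage sketch (hypothetical; it mirrors the doxygen example already in volk_32f_atan_32f.h and assumes a built and installed VOLK). volk_malloc() returns buffers aligned for the _a_ (aligned) protokernels to be selected:

/* hypothetical dispatcher usage */
#include <math.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    unsigned int N = 10;
    float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
    float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
    for (unsigned int i = 0; i < N; i++) {
        in[i] = tanf(-1.5f + 3.f * (float)i / (float)N);
    }
    volk_32f_atan_32f(out, in, N);
    for (unsigned int i = 0; i < N; i++) {
        printf("atan(%+f) = %+f\n", in[i], out[i]);
    }
    volk_free(in);
    volk_free(out);
    return 0;
}
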