Skip to content

Commit

Permalink
tidied up
Browse files Browse the repository at this point in the history
Signed-off-by: Magnus Lundmark <[email protected]>
  • Loading branch information
Ka-zam committed Sep 27, 2023
1 parent 3ce5fb0 commit 8103dcf
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 51 deletions.
14 changes: 7 additions & 7 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@
*/
static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0.999999348f);
const __m256 a3 = _mm256_set1_ps(-0.333265149f);
const __m256 a5 = _mm256_set1_ps(+0.198814825f);
const __m256 a7 = _mm256_set1_ps(-0.134871915f);
const __m256 a9 = _mm256_set1_ps(+0.08387119203f);
const __m256 a11 = _mm256_set1_ps(-0.0370130021f);
const __m256 a13 = _mm256_set1_ps(+0.00786337701f);
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);

const __m256 x_times_x = _mm256_mul_ps(x, x);
__m256 arctan;
Expand Down
14 changes: 7 additions & 7 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@
*/
static inline __m256 _m256_arctan_approximation_avx(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0.999999348f);
const __m256 a3 = _mm256_set1_ps(-0.333265149f);
const __m256 a5 = _mm256_set1_ps(+0.198814825f);
const __m256 a7 = _mm256_set1_ps(-0.134871915f);
const __m256 a9 = _mm256_set1_ps(+0.08387119203f);
const __m256 a11 = _mm256_set1_ps(-0.0370130021f);
const __m256 a13 = _mm256_set1_ps(+0.00786337701f);
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);

const __m256 x_times_x = _mm256_mul_ps(x, x);
__m256 arctan;
Expand Down
14 changes: 7 additions & 7 deletions include/volk/volk_sse_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@
*/
static inline __m128 _mm_arctan_approximation_sse(const __m128 x)
{
const __m128 a1 = _mm_set1_ps(+0.999999348f);
const __m128 a3 = _mm_set1_ps(-0.333265149f);
const __m128 a5 = _mm_set1_ps(+0.198814825f);
const __m128 a7 = _mm_set1_ps(-0.134871915f);
const __m128 a9 = _mm_set1_ps(+0.08387119203f);
const __m128 a11 = _mm_set1_ps(-0.0370130021f);
const __m128 a13 = _mm_set1_ps(+0.00786337701f);
const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);

const __m128 x_times_x = _mm_mul_ps(x, x);
__m128 arctan;
Expand Down
86 changes: 56 additions & 30 deletions kernels/volk/volk_32f_atan_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
*/
#include <math.h>

#define POLY_ORDER (13) // Use either 11, 13 or 15
#define POLY_ORDER (13) // Use either 11, 12, 13 or 15
/*
* arctan(x) polynomial expansion on the interval [-1, 1]
*/
Expand All @@ -66,12 +66,12 @@ static inline float arctan_approximation(const float x)
/*
* Max relative error < 4.4e-6
*/
const float a1 = +0.99999562960490000333f;
const float a3 = -0.332994596840094522258f;
const float a5 = +0.195635924773478353923f;
const float a7 = -0.121239070669896989002f;
const float a9 = +0.057477313643212866033f;
const float a11 = -0.0134804696144362610026f;
const float a1 = +0x1.ffff6ep-1f;
const float a3 = -0x1.54fca2p-2f;
const float a5 = +0x1.90aaa2p-3f;
const float a7 = -0x1.f09d2ep-4f;
const float a9 = +0x1.d6e42cp-5f;
const float a11 = -0x1.b9c81ep-7f;

const float x_times_x = x * x;
float arctan = a11;
Expand All @@ -84,19 +84,45 @@ static inline float arctan_approximation(const float x)

return arctan;
}
#elif (POLY_ORDER == 12) // Order 13 with a1 set to 1
static inline float arctan_approximation(const float x)
{
/*
* Max relative error < 7.5e-7
*/
// a1 == 1 implicitly
const float a3 = -0x1.5548a4p-2f;
const float a5 = +0x1.978224p-3f;
const float a7 = -0x1.156488p-3f;
const float a9 = +0x1.5b822cp-4f;
const float a11 = -0x1.35a172p-5f;
const float a13 = +0x1.09a14ep-7f;

const float x_times_x = x * x;
float arctan = a13;
arctan = fmaf(x_times_x, arctan, a11);
arctan = fmaf(x_times_x, arctan, a9);
arctan = fmaf(x_times_x, arctan, a7);
arctan = fmaf(x_times_x, arctan, a5);
arctan = fmaf(x_times_x, arctan, a3);
arctan *= x_times_x;
arctan = fmaf(x, arctan, x);

return arctan;
}
#elif (POLY_ORDER == 13)
static inline float arctan_approximation(const float x)
{
/*
* Max relative error < 6.6e-7
*/
const float a1 = +0.9999993478296087828f;
const float a3 = -0.333265149202765609898f;
const float a5 = +0.198814824627785497185f;
const float a7 = -0.134871914532288880873f;
const float a9 = +0.0838711920297372790378f;
const float a11 = -0.0370130021481201927808f;
const float a13 = +0.0078633770069189183298f;
const float a1 = +0x1.ffffeap-1f;
const float a3 = -0x1.55437p-2f;
const float a5 = +0x1.972be6p-3f;
const float a7 = -0x1.1436ap-3f;
const float a9 = +0x1.5785aap-4f;
const float a11 = -0x1.2f3004p-5f;
const float a13 = +0x1.01a37cp-7f;

const float x_times_x = x * x;
float arctan = a13;
Expand All @@ -116,14 +142,14 @@ static inline float arctan_approximation(const float x)
/*
* Max relative error < 1.0e-7
*/
const float a1 = +0.999999900990340448835f;
const float a3 = -0.333319907465225372127f;
const float a5 = +0.199697239010171371818f;
const float a7 = -0.140194809276118218192f;
const float a9 = +0.0991429287136443502316f;
const float a11 = -0.0594863936663402520391f;
const float a13 = +0.0242524034293620953292f;
const float a15 = -0.00469327610039088433147f;
const float a1 = +0x1.fffffcp-1f;
const float a3 = -0x1.55519ep-2f;
const float a5 = +0x1.98f6a8p-3f;
const float a7 = -0x1.1f0a92p-3f;
const float a9 = +0x1.95b654p-4f;
const float a11 = -0x1.e65492p-5f;
const float a13 = +0x1.8c0c36p-6f;
const float a15 = -0x1.32316ep-8f;

const float x_times_x = x * x;
float arctan = a15;
Expand All @@ -150,9 +176,9 @@ static inline float arctan(const float x)
/*
* arctan(x) + arctan(1 / x) == sign(x) * pi / 2
*/
const float pi_over_2 = M_PI_2;
const float pi_over_2 = 0x1.921fb6p0f;

if (fabs(x) <= 1.f) {
if (fabs(x) < 1.f) {
return arctan_approximation(x);
} else {
return copysignf(pi_over_2, x) - arctan_approximation(1.f / x);
Expand All @@ -166,7 +192,7 @@ static inline void
volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
const __m256 one = _mm256_set1_ps(1.f);
const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

Expand Down Expand Up @@ -201,7 +227,7 @@ static inline void
volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
{
const __m256 one = _mm256_set1_ps(1.f);
const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

Expand Down Expand Up @@ -236,7 +262,7 @@ static inline void
volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
{
const __m128 one = _mm_set1_ps(1.f);
const __m128 pi_over_2 = _mm_set1_ps(M_PI_2);
const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

Expand Down Expand Up @@ -274,7 +300,7 @@ static inline void
volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
{
const __m256 one = _mm256_set1_ps(1.f);
const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

Expand Down Expand Up @@ -308,7 +334,7 @@ static inline void
volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
{
const __m256 one = _mm256_set1_ps(1.f);
const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

Expand Down Expand Up @@ -343,7 +369,7 @@ static inline void
volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
{
const __m128 one = _mm_set1_ps(1.f);
const __m128 pi_over_2 = _mm_set1_ps(M_PI_2);
const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

Expand Down

0 comments on commit 8103dcf

Please sign in to comment.