Skip to content

Commit

Permalink
multiple fixes
Browse files Browse the repository at this point in the history
Signed-off-by: Magnus Lundmark <[email protected]>
  • Loading branch information
Ka-zam committed Sep 26, 2023
1 parent 4840e4d commit 3ce5fb0
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 67 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#
# Copyright 2011-2020 Free Software Foundation, Inc.
# Copyright 2023 Magnus Lundmark <[email protected]>
#
# This file is part of VOLK
#
Expand Down Expand Up @@ -248,6 +249,7 @@ install(FILES
${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
Expand Down
50 changes: 50 additions & 0 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/* -*- c++ -*- */
/*
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold AVX2 FMA intrinsics of intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
#include <immintrin.h>

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
*
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0.999999348f);
const __m256 a3 = _mm256_set1_ps(-0.333265149f);
const __m256 a5 = _mm256_set1_ps(+0.198814825f);
const __m256 a7 = _mm256_set1_ps(-0.134871915f);
const __m256 a9 = _mm256_set1_ps(+0.08387119203f);
const __m256 a11 = _mm256_set1_ps(-0.0370130021f);
const __m256 a13 = _mm256_set1_ps(+0.00786337701f);

const __m256 x_times_x = _mm256_mul_ps(x, x);
__m256 arctan;
arctan = a13;
arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
arctan = _mm256_mul_ps(x, arctan);

return arctan;
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
33 changes: 1 addition & 32 deletions include/volk/volk_avx2_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* -*- c++ -*- */
/*
* Copyright 2015, 2023 Free Software Foundation, Inc.
* Copyright 2015 Free Software Foundation, Inc.
*
* This file is part of VOLK
*
Expand All @@ -17,37 +17,6 @@
#include "volk/volk_avx_intrinsics.h"
#include <immintrin.h>

/*
* Approximate arctan(x) via polynomial expansion
* on the interval [-1, 1]
*
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_approximation_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0.999999348f);
const __m256 a3 = _mm256_set1_ps(-0.333265149f);
const __m256 a5 = _mm256_set1_ps(+0.198814825f);
const __m256 a7 = _mm256_set1_ps(-0.134871915f);
const __m256 a9 = _mm256_set1_ps(+0.08387119203f);
const __m256 a11 = _mm256_set1_ps(-0.0370130021f);
const __m256 a13 = _mm256_set1_ps(+0.00786337701f);

const __m256 x_times_x = _mm256_mul_ps(x, x);
__m256 arctan;
arctan = a13;
arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
arctan = _mm256_mul_ps(x, arctan);

return arctan;
}

static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
const __m128i zeros = _mm_set1_epi8(0x00);
Expand Down
3 changes: 2 additions & 1 deletion include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015, 20233 Free Software Foundation, Inc.
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand Down
3 changes: 2 additions & 1 deletion include/volk/volk_sse_intrinsics.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2015, 2023 Free Software Foundation, Inc.
* Copyright 2015 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand Down
64 changes: 31 additions & 33 deletions kernels/volk/volk_32f_atan_32f.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2014, 2023 Free Software Foundation, Inc.
* Copyright 2014 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand Down Expand Up @@ -53,7 +54,6 @@
* volk_free(out);
* \endcode
*/
#include <inttypes.h>
#include <math.h>

#define POLY_ORDER (13) // Use either 11, 13 or 15
Expand All @@ -74,8 +74,7 @@ static inline float arctan_approximation(const float x)
const float a11 = -0.0134804696144362610026f;

const float x_times_x = x * x;
float arctan;
arctan = a11;
float arctan = a11;
arctan = fmaf(x_times_x, arctan, a9);
arctan = fmaf(x_times_x, arctan, a7);
arctan = fmaf(x_times_x, arctan, a5);
Expand All @@ -100,8 +99,7 @@ static inline float arctan_approximation(const float x)
const float a13 = +0.0078633770069189183298f;

const float x_times_x = x * x;
float arctan;
arctan = a13;
float arctan = a13;
arctan = fmaf(x_times_x, arctan, a11);
arctan = fmaf(x_times_x, arctan, a9);
arctan = fmaf(x_times_x, arctan, a7);
Expand All @@ -128,8 +126,7 @@ static inline float arctan_approximation(const float x)
const float a15 = -0.00469327610039088433147f;

const float x_times_x = x * x;
float arctan;
arctan = a15;
float arctan = a15;
arctan = fmaf(x_times_x, arctan, a13);
arctan = fmaf(x_times_x, arctan, a11);
arctan = fmaf(x_times_x, arctan, a9);
Expand All @@ -151,30 +148,30 @@ static inline float arctan_approximation(const float x)
static inline float arctan(const float x)
{
/*
* arctan(x) + arctan(1 / x) = sign(x) * pi / 2
* arctan(x) + arctan(1 / x) == sign(x) * pi / 2
*/
const float pi_over_2 = M_PI_2;

if (fabs(x) <= 1.f) {
return arctan_approximation(x);
} else {
float term = (x > 0.f) ? pi_over_2 : -pi_over_2;
return term - arctan_approximation(1.f / x);
return copysignf(pi_over_2, x) - arctan_approximation(1.f / x);
}
}

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void
volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, uint32_t num_points)
volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
const __m256 one = _mm256_set1_ps(1.f);
const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

uint32_t number = 0;
uint32_t eighth_points = num_points / 8;
unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
Expand All @@ -199,16 +196,17 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, uint32_t num_points)

#if LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>
static inline void
volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
{
const __m256 one = _mm256_set1_ps(1.f);
const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

uint32_t number = 0;
uint32_t eighth_points = num_points / 8;
unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
Expand All @@ -235,15 +233,15 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, uint32_t num_points)
#include <smmintrin.h>
#include <volk/volk_sse_intrinsics.h>
static inline void
volk_32f_atan_32f_a_sse4_1(float* out, const float* in, uint32_t num_points)
volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
{
const __m128 one = _mm_set1_ps(1.f);
const __m128 pi_over_2 = _mm_set1_ps(M_PI_2);
const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

uint32_t number = 0;
uint32_t quarter_points = num_points / 4;
unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
for (; number < quarter_points; number++) {
__m128 x = _mm_load_ps(in);
__m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
Expand Down Expand Up @@ -273,15 +271,15 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, uint32_t num_points)
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
static inline void
volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, uint32_t num_points)
volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
{
const __m256 one = _mm256_set1_ps(1.f);
const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

uint32_t number = 0;
uint32_t eighth_points = num_points / 8;
unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
Expand All @@ -307,15 +305,15 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, uint32_t num_points)
#if LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
{
const __m256 one = _mm256_set1_ps(1.f);
const __m256 pi_over_2 = _mm256_set1_ps(M_PI_2);
const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

uint32_t number = 0;
uint32_t eighth_points = num_points / 8;
unsigned int number = 0;
unsigned int eighth_points = num_points / 8;
for (; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
__m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
Expand All @@ -342,15 +340,15 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, uint32_t num_points)
#include <smmintrin.h>
#include <volk/volk_sse_intrinsics.h>
static inline void
volk_32f_atan_32f_u_sse4_1(float* out, const float* in, uint32_t num_points)
volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
{
const __m128 one = _mm_set1_ps(1.f);
const __m128 pi_over_2 = _mm_set1_ps(M_PI_2);
const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

uint32_t number = 0;
uint32_t quarter_points = num_points / 4;
unsigned int number = 0;
unsigned int quarter_points = num_points / 4;
for (; number < quarter_points; number++) {
__m128 x = _mm_loadu_ps(in);
__m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
Expand All @@ -375,9 +373,9 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, uint32_t num_points)

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_polynomial(float* out, const float* in, uint32_t num_points)
volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
{
uint32_t number = 0;
unsigned int number = 0;
for (; number < num_points; number++) {
*out++ = arctan(*in++);
}
Expand All @@ -386,9 +384,9 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, uint32_t num_points)

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_generic(float* out, const float* in, uint32_t num_points)
volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
{
uint32_t number = 0;
unsigned int number = 0;
for (; number < num_points; number++) {
*out++ = atanf(*in++);
}
Expand Down
1 change: 1 addition & 0 deletions lib/kernel_tests.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2014 - 2021 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <[email protected]>
*
* This file is part of VOLK
*
Expand Down

0 comments on commit 3ce5fb0

Please sign in to comment.