diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h new file mode 100644 index 00000000..37bd16a8 --- /dev/null +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -0,0 +1,201 @@ +/* -*- c++ -*- */ +/* + * Copyright 2024 Magnus Lundmark + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! + * \page volk_32f_reciprocal_32f + * + * \b Overview + * + * Computes the reciprocal of the input vector and stores the results + * in the output vector. For the AVX512F implementation the relative + * error is < 2**(-14) = 6.1e-05 + * + * Dispatcher Prototype + * \code + * void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li in: A pointer to the input vector of floats. + * \li num_points: The number of data points. + * + * \b Outputs + * \li bVector: A pointer to the output vector of floats. + * + * \b Example + * \code + int N = 10; + unsigned int alignment = volk_get_alignment(); + float* in = (float*)volk_malloc(sizeof(float)*N, alignment); + float* out = (float*)volk_malloc(sizeof(float)*N, alignment); + + for(unsigned int ii = 1; ii < N; ++ii){ + in[ii] = (float)(ii*ii); + } + + volk_32f_reciprocal_32f(out, in, N); + + for(unsigned int ii = 0; ii < N; ++ii){ + printf("out(%i) = %f\n", ii, out[ii]); + } + + volk_free(in); + volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H +#define INCLUDED_volk_32f_reciprocal_32f_a_H + +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points) +{ + for (unsigned int i = 0; i < num_points; i++) { + out[i] = 1.f / in[i]; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE +#include +static inline void +volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_points) +{ + const __m128 ONE = _mm_set_ps1(1.f); + const unsigned int quarter_points = num_points / 4; + + for (unsigned int number = 0; number < quarter_points; number++) { + __m128 x = _mm_load_ps(in); + in += 4; + __m128 r = _mm_div_ps(ONE, x); + _mm_store_ps(out, r); + out += 4; + } + + const unsigned int done = quarter_points * 4; + + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +static inline void +volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points) +{ + const __m256 ONE = _mm256_set1_ps(1.f); + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { + __m256 x = _mm256_load_ps(in); + in += 8; + __m256 r = _mm256_div_ps(ONE, x); + _mm256_store_ps(out, r); + out += 8; + } + + const unsigned int done = eighth_points * 8; + + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_AVX512F +#include +static inline void +volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points) +{ + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { + __m512 x = _mm512_load_ps(in); + in += 16; + __m512 r = _mm512_rcp14_ps(x); + _mm512_store_ps(out, r); + out += 16; + } + + const unsigned int done = sixteenth_points * 16; + + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX512F */ + +#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */ + +#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H +#define INCLUDED_volk_32f_reciprocal_32f_u_H + +#ifdef LV_HAVE_SSE +#include +static inline void +volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_points) +{ + const __m128 ONE = _mm_set_ps1(1.f); + const unsigned int quarter_points = num_points / 4; + + for (unsigned int number = 0; number < quarter_points; number++) { + __m128 x = _mm_loadu_ps(in); + in += 4; + __m128 r = _mm_div_ps(ONE, x); + _mm_storeu_ps(out, r); + out += 4; + } + + const unsigned int done = quarter_points * 4; + + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +static inline void +volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points) +{ + const __m256 ONE = _mm256_set1_ps(1.f); + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { + __m256 x = _mm256_loadu_ps(in); + in += 8; + __m256 r = _mm256_div_ps(ONE, x); + _mm256_storeu_ps(out, r); + out += 8; + } + + const unsigned int done = eighth_points * 8; + + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_AVX512F +#include +static inline void +volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points) +{ + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { + __m512 x = _mm512_loadu_ps(in); + in += 16; + __m512 r = _mm512_rcp14_ps(x); + _mm512_storeu_ps(out, r); + out += 16; + } + + const unsigned int done = sixteenth_points * 16; + + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX512F */ + +#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */ diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index 16c79c36..57a296dc 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2014 - 2021 Free Software Foundation, Inc. - * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * @@ -141,6 +141,7 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params.make_tol(6.15e-5))) QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_absolute(1e-5)))