From faf20a47a0c37ab6c393c3fd26e594f1c168cc34 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Tue, 13 Feb 2018 09:46:00 +0100 Subject: [PATCH 01/67] Includes for GCC 7.2 compatibility --- apps/volk_option_helpers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/volk_option_helpers.h b/apps/volk_option_helpers.h index d6099b480..8a715476b 100644 --- a/apps/volk_option_helpers.h +++ b/apps/volk_option_helpers.h @@ -6,6 +6,8 @@ #define VOLK_VOLK_OPTION_HELPERS_H #include +#include +#include #include #include From 2ce41403de53f7966e2b602e622ccd25947b1c42 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 26 Feb 2024 00:37:48 +0100 Subject: [PATCH 02/67] Added AVX512 kernels to arctan and atan2 Signed-off-by: Magnus Lundmark --- CMakeLists.txt | 90 ++++++++-------- gen/archs.xml | 8 ++ gen/machines.xml | 5 + include/volk/volk_avx2_fma_intrinsics.h | 4 +- include/volk/volk_avx512_intrinsics.h | 67 ++++++++++++ include/volk/volk_avx_intrinsics.h | 4 +- kernels/volk/volk_32f_atan_32f.h | 134 ++++++++++++++++++------ kernels/volk/volk_32fc_s32f_atan2_32f.h | 130 ++++++++++++++++++++++- 8 files changed, 358 insertions(+), 84 deletions(-) create mode 100644 include/volk/volk_avx512_intrinsics.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d64018451..9a8a460e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,14 @@ # -# Copyright 2011-2020 Free Software Foundation, Inc. -# Copyright 2023 Magnus Lundmark +#Copyright 2011 - 2020 Free Software Foundation, Inc. 
+#Copyright 2023 Magnus Lundmark < magnuslundmark @gmail.com> # -# This file is part of VOLK +#This file is part of VOLK # -# SPDX-License-Identifier: LGPL-3.0-or-later +#SPDX - License - Identifier : LGPL - 3.0 - or -later # ######################################################################## -# Project setup +#Project setup ######################################################################## cmake_minimum_required(VERSION 3.8) set(CMAKE_BUILD_TYPE @@ -25,10 +25,10 @@ set(CMAKE_CXX_STANDARD 17) enable_testing() ######################################################################## -# Common compile flags +#Common compile flags ######################################################################## -# Disable complex math NaN/INFO range checking for performance +#Disable complex math NaN / INFO range checking for performance include(CheckCXXCompilerFlag) check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE) if(HAVE_CX_LIMITED_RANGE) @@ -46,15 +46,15 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") - # Abort compilation if kernel implementations have inconsistent function - # prototypes, i.e. if - # - # kernel_foo_sse(uint32_t *dst, lv32fc_t *src) - # kernel_foo_avx(uint16_t *dst, lv32fc_t *src) - # - # are defined. Note the different data type of the first argument). By - # default 'incompatible-pointer-types' is a warning only and 'pointer-sign' - # is a warning enabled by '-Wall'. These warnings are only applicable to C. +#Abort compilation if kernel implementations have inconsistent function +#prototypes, i.e.if +# +#kernel_foo_sse(uint32_t* dst, lv32fc_t* src) +#kernel_foo_avx(uint16_t* dst, lv32fc_t* src) +# +#are defined.Note the different data type of the first argument).By +#default 'incompatible-pointer-types' is a warning only and 'pointer-sign' +#is a warning enabled by '-Wall'.These warnings are only applicable to C. 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign") endif() @@ -77,7 +77,7 @@ set(CMAKE_BUILD_TYPE message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") ######################################################################## -# Version setup +#Version setup ######################################################################## set(VERSION_INFO_MAJOR_VERSION 3) @@ -87,13 +87,14 @@ include(VolkVersion) #setup version info math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000 + ${VERSION_INFO_MINOR_VERSION} * 100 - + ${VERSION_INFO_MAINT_VERSION}") + + ${ + VERSION_INFO_MAINT_VERSION}") configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in ${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY) ######################################################################## -# Environment setup +#Environment setup ######################################################################## if(NOT DEFINED CROSSCOMPILE_MULTILIB) set(CROSSCOMPILE_MULTILIB "") @@ -116,10 +117,10 @@ if(MSVC) endif(MSVC) ######################################################################## -# Dependencies setup +#Dependencies setup ######################################################################## -# cpu_features - sensible defaults, user settable option +#cpu_features - sensible defaults, user settable option if(CMAKE_SYSTEM_PROCESSOR MATCHES "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)|(^riscv)") option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON) @@ -158,7 +159,7 @@ else() message(STATUS "Building Volk without cpu_features") endif() -# Python +#Python include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)" PYTHON_MIN_VER_FOUND) @@ -168,12 +169,12 @@ if(NOT PYTHON_MIN_VER_FOUND) message(FATAL_ERROR "Python 3.4 or greater required to build VOLK") endif() -# Mako +#Mako if(NOT MAKO_FOUND) message(FATAL_ERROR 
"Mako templates required to build VOLK") endif() -# Check if we have std::filesystem +#Check if we have std::filesystem find_package( FILESYSTEM COMPONENTS Final Experimental @@ -183,9 +184,9 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) ######################################################################## -# check for aligned_alloc, since some compilers lack this C11 feature. -# For Apple-clang use `posix_memalign` -# For MSVC use `_aligned_malloc`. +#check for aligned_alloc, since some compilers lack this C11 feature. +#For Apple - clang use `posix_memalign` +#For MSVC use `_aligned_malloc`. ######################################################################## include(CheckSymbolExists) if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")) @@ -196,7 +197,7 @@ if(NOT USE_ALIGNED_ALLOC) endif() ######################################################################## -# Check if Orc is available +#Check if Orc is available ######################################################################## option(ENABLE_ORC "Enable Orc" True) if(ENABLE_ORC) @@ -206,17 +207,17 @@ else(ENABLE_ORC) endif(ENABLE_ORC) ######################################################################## -# Setup doxygen +#Setup doxygen ######################################################################## add_subdirectory(docs) ######################################################################## -# Detect /lib versus /lib64 +#Detect / lib versus / lib64 ######################################################################## include(GNUInstallDirs) ######################################################################## -# Setup the package config file +#Setup the package config file ######################################################################## #set variables found in the pc.in file set(prefix ${CMAKE_INSTALL_PREFIX}) @@ -233,7 +234,7 @@ install( COMPONENT "volk_devel") ######################################################################## -# Install all 
headers in the include directories +#Install all headers in the include directories ######################################################################## set(VOLK_RUNTIME_DIR bin) set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR}) @@ -255,6 +256,7 @@ install( ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h + ${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h @@ -269,7 +271,7 @@ install( COMPONENT "volk_devel") ######################################################################## -# On Apple only, set install name and use rpath correctly, if not already set +#On Apple only, set install name and use rpath correctly, if not already set ######################################################################## if(APPLE) if(NOT CMAKE_INSTALL_NAME_DIR) @@ -290,21 +292,21 @@ if(APPLE) endif(APPLE) ######################################################################## -# Create uninstall target +#Create uninstall target ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY) -# Only add the target if there isn't one defined already +#Only add the target if there isn't one defined already if(NOT TARGET uninstall) add_custom_target(uninstall ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() ######################################################################## -# Install our Cmake modules into $prefix/lib/cmake/volk -# See "Package Configuration Files" on page: -# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging +#Install our Cmake modules into $prefix / lib / cmake / volk +#See "Package Configuration Files" on 
page: +#http: // www.cmake.org/Wiki/CMake/Tutorials/Packaging ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in @@ -314,7 +316,7 @@ configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY) ######################################################################## -# Install cmake search routine for external use +#Install cmake search routine for external use ######################################################################## if(NOT CMAKE_MODULES_DIR) @@ -334,7 +336,7 @@ install( DESTINATION ${CMAKE_MODULES_DIR}/volk) ######################################################################## -# Option to enable QA testing, on by default +#Option to enable QA testing, on by default ######################################################################## option(ENABLE_TESTING "Enable QA testing" ON) if(ENABLE_TESTING) @@ -345,7 +347,7 @@ endif() message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF") ######################################################################## -# Option to enable post-build profiling using volk_profile, off by default +#Option to enable post - build profiling using volk_profile, off by default ######################################################################## option(ENABLE_PROFILING "Launch system profiler after build" OFF) if(ENABLE_PROFILING) @@ -371,12 +373,12 @@ endif() message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF") ######################################################################## -# Setup the library +#Setup the library ######################################################################## add_subdirectory(lib) ######################################################################## -# And the utility apps +#And the utility apps ######################################################################## add_subdirectory(apps) 
option(ENABLE_MODTOOL "Enable volk_modtool python utility" True) @@ -385,6 +387,6 @@ if(ENABLE_MODTOOL) endif() ######################################################################## -# Print summary +#Print summary ######################################################################## message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}") diff --git a/gen/archs.xml b/gen/archs.xml index 164c7bb43..792c50d10 100644 --- a/gen/archs.xml +++ b/gen/archs.xml @@ -178,6 +178,14 @@ at the top, as a last resort. 64 + + + -mavx512dq + -mavx512dq + /arch:AVX512DQ + 64 + + diff --git a/gen/machines.xml b/gen/machines.xml index 887f97949..b76f6d07f 100644 --- a/gen/machines.xml +++ b/gen/machines.xml @@ -65,4 +65,9 @@ generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc| + + +generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc| + + diff --git a/include/volk/volk_avx2_fma_intrinsics.h b/include/volk/volk_avx2_fma_intrinsics.h index 03b24e6c0..8a7e4d63e 100644 --- a/include/volk/volk_avx2_fma_intrinsics.h +++ b/include/volk/volk_avx2_fma_intrinsics.h @@ -8,7 +8,7 @@ */ /* - * This file is intended to hold AVX2 FMA intrinsics of intrinsics. + * This file is intended to hold AVX2 FMA intrinsics. * They should be used in VOLK kernels to avoid copy-paste. 
*/ @@ -23,7 +23,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x) +static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x) { const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h new file mode 100644 index 000000000..a6fd87ac3 --- /dev/null +++ b/include/volk/volk_avx512_intrinsics.h @@ -0,0 +1,67 @@ +/* -*- c++ -*- */ +/* + * Copyright 2024 Magnus Lundmark + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/* + * This file is intended to hold AVX512 intrinsics. + * They should be used in VOLK kernels to avoid copy-paste. + */ + +#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ +#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ +#include + +static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) +{ + // r = z1_0 z1_2 ... z1_6 z2_0 z2_2 ... 
z2_6 + const __m512i idx = + _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + const __m512 r = _mm512_permutex2var_ps(z1, idx, z2); + return r; +} + +static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) +{ + const __m512i idx = + _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + const __m512 i = _mm512_permutex2var_ps(z1, idx, z2); + return i; +} + +/* + * Approximate arctan(x) via polynomial expansion + * on the interval [-1, 1] + * + * Maximum relative error ~6.5e-7 + * Polynomial evaluated via Horner's method + */ +static inline __m512 _mm512_arctan_poly_avx512(const __m512 x) +{ + const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f); + const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f); + const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f); + const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f); + const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f); + const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f); + const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f); + + const __m512 x_times_x = _mm512_mul_ps(x, x); + __m512 arctan; + arctan = a13; + arctan = _mm512_fmadd_ps(x_times_x, arctan, a11); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a9); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a7); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a5); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a3); + arctan = _mm512_fmadd_ps(x_times_x, arctan, a1); + arctan = _mm512_mul_ps(x, arctan); + + return arctan; +} + +#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */ diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index 2fc0f064e..c6c7a2c57 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -9,7 +9,7 @@ */ /* - * This file is intended to hold AVX intrinsics of intrinsics. + * This file is intended to hold AVX intrinsics. * They should be used in VOLK kernels to avoid copy-pasta. 
*/ @@ -24,7 +24,7 @@ * Maximum relative error ~6.5e-7 * Polynomial evaluated via Horner's method */ -static inline __m256 _m256_arctan_poly_avx(const __m256 x) +static inline __m256 _mm256_arctan_poly_avx(const __m256 x) { const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index dc5987cb8..ec0788268 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2014 Free Software Foundation, Inc. - * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * @@ -13,19 +13,19 @@ * * \b Overview * - * Computes arcsine of input vector and stores results in output vector. + * Computes arctan of input vector and stores results in output vector. * * Dispatcher Prototype * \code - * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points) + * void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points) * \endcode * * \b Inputs - * \li aVector: The input vector of floats. + * \li in_ptr: The input vector of floats. * \li num_points: The number of data points. * * \b Outputs - * \li bVector: The vector where results will be stored. + * \li out_ptr: The vector where results will be stored. * * \b Example * Calculate common angles around the top half of the unit circle. 
@@ -59,6 +59,64 @@ #ifndef INCLUDED_volk_32f_atan_32f_a_H #define INCLUDED_volk_32f_atan_32f_a_H +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) +{ + unsigned int number = 0; + for (; number < num_points; number++) { + *out++ = atanf(*in++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) +{ + unsigned int number = 0; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void +volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) +{ + const __m512 one = _mm512_set1_ps(1.f); + const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 x = _mm512_load_ps(in); + __mmask16 swap_mask = + _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), + _mm512_mask_blend_ps(swap_mask, one, x)); + __m512 result = _mm512_arctan_poly_avx512(x_star); + __m512 term = _mm512_and_ps(x_star, sign_mask); + term = _mm512_or_ps(pi_over_2, term); + term = _mm512_sub_ps(term, result); + result = _mm512_mask_blend_ps(swap_mask, result, term); + _mm512_store_ps(out, result); + in += 16; + out += 16; + } + + number = sixteenth_points * 16; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_AVX512F for aligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -77,7 +135,7 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const 
float* in, unsigned int num_point __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx2_fma(x_star); + __m256 result = _mm256_arctan_poly_avx2_fma(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -112,7 +170,7 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx(x_star); + __m256 result = _mm256_arctan_poly_avx(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -168,6 +226,42 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) #ifndef INCLUDED_volk_32f_atan_32f_u_H #define INCLUDED_volk_32f_atan_32f_u_H +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void +volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) +{ + const __m512 one = _mm512_set1_ps(1.f); + const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 x = _mm512_loadu_ps(in); + __mmask16 swap_mask = + _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); + __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), + _mm512_mask_blend_ps(swap_mask, one, x)); + __m512 result = _mm512_arctan_poly_avx512(x_star); + __m512 term = 
_mm512_and_ps(x_star, sign_mask); + term = _mm512_or_ps(pi_over_2, term); + term = _mm512_sub_ps(term, result); + result = _mm512_mask_blend_ps(swap_mask, result, term); + _mm512_storeu_ps(out, result); + in += 16; + out += 16; + } + + number = sixteenth_points * 16; + for (; number < num_points; number++) { + *out++ = volk_arctan(*in++); + } +} +#endif /* LV_HAVE_AVX512F for unaligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include static inline void @@ -185,7 +279,7 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx2_fma(x_star); + __m256 result = _mm256_arctan_poly_avx2_fma(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -219,7 +313,7 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); - __m256 result = _m256_arctan_poly_avx(x_star); + __m256 result = _mm256_arctan_poly_avx(x_star); __m256 term = _mm256_and_ps(x_star, sign_mask); term = _mm256_or_ps(pi_over_2, term); term = _mm256_sub_ps(term, result); @@ -271,26 +365,4 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) } #endif /* LV_HAVE_SSE4_1 for unaligned */ -#ifdef LV_HAVE_GENERIC -static inline void -volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; - for (; number < num_points; number++) { - *out++ = volk_arctan(*in++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_GENERIC -static inline void -volk_32f_atan_32f_generic(float* out, const float* in, unsigned 
int num_points) -{ - unsigned int number = 0; - for (; number < num_points; number++) { - *out++ = atanf(*in++); - } -} -#endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_volk_32f_atan_32f_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 759db24cc..414925417 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2012, 2014 Free Software Foundation, Inc. - * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * @@ -100,6 +100,66 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector, } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ +#include +#include +static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) +{ + const float* in = (float*)complexVector; + float* out = (float*)outputVector; + + const float invNormalizeFactor = 1.f / normalizeFactor; + const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor); + const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f); + const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + const __m512 zero = _mm512_setzero_ps(); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 z1 = _mm512_load_ps(in); + in += 16; + __m512 z2 = _mm512_load_ps(in); + in += 16; + + __m512 x = _mm512_real(z1, z2); + __m512 y = _mm512_imag(z1, z2); + + __mmask16 swap_mask = _mm512_cmp_ps_mask( + _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS); + __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x), + 
_mm512_mask_blend_ps(swap_mask, x, y)); + __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q); + input = _mm512_mask_blend_ps(nan_mask, input, zero); + __m512 result = _mm512_arctan_poly_avx512(input); + + input = + _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result); + result = _mm512_mask_blend_ps(swap_mask, result, input); + + __m512 x_sign_mask = + _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31)); + + result = _mm512_add_ps( + _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask), + result); + result = _mm512_mul_ps(result, vinvNormalizeFactor); + + _mm512_store_ps(out, result); + out += 16; + } + + number = sixteenth_points * 16; + volk_32fc_s32f_atan2_32f_polynomial( + out, (lv_32fc_t*)in, normalizeFactor, num_points - number); +} +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -136,7 +196,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx2_fma(input); + __m256 result = _mm256_arctan_poly_avx2_fma(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -196,7 +256,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx(input); + __m256 result = _mm256_arctan_poly_avx(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -224,6 +284,66 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, #ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H #define INCLUDED_volk_32fc_s32f_atan2_32f_u_H +#if LV_HAVE_AVX512F && 
LV_HAVE_AVX512DQ +#include +#include +static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) +{ + const float* in = (float*)complexVector; + float* out = (float*)outputVector; + + const float invNormalizeFactor = 1.f / normalizeFactor; + const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor); + const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f); + const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f); + const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); + const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); + const __m512 zero = _mm512_setzero_ps(); + + unsigned int number = 0; + unsigned int sixteenth_points = num_points / 16; + for (; number < sixteenth_points; number++) { + __m512 z1 = _mm512_loadu_ps(in); + in += 16; + __m512 z2 = _mm512_loadu_ps(in); + in += 16; + + __m512 x = _mm512_real(z1, z2); + __m512 y = _mm512_imag(z1, z2); + + __mmask16 swap_mask = _mm512_cmp_ps_mask( + _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS); + __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x), + _mm512_mask_blend_ps(swap_mask, x, y)); + __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q); + input = _mm512_mask_blend_ps(nan_mask, input, zero); + __m512 result = _mm512_arctan_poly_avx512(input); + + input = + _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result); + result = _mm512_mask_blend_ps(swap_mask, result, input); + + __m512 x_sign_mask = + _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31)); + + result = _mm512_add_ps( + _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask), + result); + result = _mm512_mul_ps(result, vinvNormalizeFactor); + + _mm512_storeu_ps(out, result); + out += 16; + } + + number = sixteenth_points * 16; + volk_32fc_s32f_atan2_32f_polynomial( + out, (lv_32fc_t*)in, 
normalizeFactor, num_points - number); +} +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ + #if LV_HAVE_AVX2 && LV_HAVE_FMA #include #include @@ -260,7 +380,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx2_fma(input); + __m256 result = _mm256_arctan_poly_avx2_fma(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); @@ -320,7 +440,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, _mm256_blendv_ps(x, y, swap_mask)); __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); input = _mm256_blendv_ps(input, zero, nan_mask); - __m256 result = _m256_arctan_poly_avx(input); + __m256 result = _mm256_arctan_poly_avx(input); input = _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result); From 4feceb5284bd08091611ce54d04bbde7df5b9934 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 26 Feb 2024 00:42:56 +0100 Subject: [PATCH 03/67] Removed comment, simplified Signed-off-by: Magnus Lundmark --- include/volk/volk_avx512_intrinsics.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h index a6fd87ac3..0bac1c6a9 100644 --- a/include/volk/volk_avx512_intrinsics.h +++ b/include/volk/volk_avx512_intrinsics.h @@ -18,19 +18,16 @@ static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) { - // r = z1_0 z1_2 ... z1_6 z2_0 z2_2 ... 
z2_6 const __m512i idx = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); - const __m512 r = _mm512_permutex2var_ps(z1, idx, z2); - return r; + return _mm512_permutex2var_ps(z1, idx, z2); } static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) { const __m512i idx = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); - const __m512 i = _mm512_permutex2var_ps(z1, idx, z2); - return i; + return _mm512_permutex2var_ps(z1, idx, z2); } /* From 319387d1a39af4ad9d8bf5edcdaa96313fe9b153 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 26 Feb 2024 00:47:54 +0100 Subject: [PATCH 04/67] Restored file Signed-off-by: Magnus Lundmark --- CMakeLists.txt | 89 +++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a8a460e6..823f09425 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,14 @@ # -#Copyright 2011 - 2020 Free Software Foundation, Inc. -#Copyright 2023 Magnus Lundmark < magnuslundmark @gmail.com> +# Copyright 2011-2020 Free Software Foundation, Inc. 
+# Copyright 2023 Magnus Lundmark # -#This file is part of VOLK +# This file is part of VOLK # -#SPDX - License - Identifier : LGPL - 3.0 - or -later +# SPDX-License-Identifier: LGPL-3.0-or-later # ######################################################################## -#Project setup +# Project setup ######################################################################## cmake_minimum_required(VERSION 3.8) set(CMAKE_BUILD_TYPE @@ -25,10 +25,10 @@ set(CMAKE_CXX_STANDARD 17) enable_testing() ######################################################################## -#Common compile flags +# Common compile flags ######################################################################## -#Disable complex math NaN / INFO range checking for performance +# Disable complex math NaN/INFO range checking for performance include(CheckCXXCompilerFlag) check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE) if(HAVE_CX_LIMITED_RANGE) @@ -46,15 +46,15 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") -#Abort compilation if kernel implementations have inconsistent function -#prototypes, i.e.if -# -#kernel_foo_sse(uint32_t* dst, lv32fc_t* src) -#kernel_foo_avx(uint16_t* dst, lv32fc_t* src) -# -#are defined.Note the different data type of the first argument).By -#default 'incompatible-pointer-types' is a warning only and 'pointer-sign' -#is a warning enabled by '-Wall'.These warnings are only applicable to C. + # Abort compilation if kernel implementations have inconsistent function + # prototypes, i.e. if + # + # kernel_foo_sse(uint32_t *dst, lv32fc_t *src) + # kernel_foo_avx(uint16_t *dst, lv32fc_t *src) + # + # are defined. Note the different data type of the first argument). By + # default 'incompatible-pointer-types' is a warning only and 'pointer-sign' + # is a warning enabled by '-Wall'. These warnings are only applicable to C. 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign") endif() @@ -77,7 +77,7 @@ set(CMAKE_BUILD_TYPE message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") ######################################################################## -#Version setup +# Version setup ######################################################################## set(VERSION_INFO_MAJOR_VERSION 3) @@ -87,14 +87,13 @@ include(VolkVersion) #setup version info math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000 + ${VERSION_INFO_MINOR_VERSION} * 100 - + ${ - VERSION_INFO_MAINT_VERSION}") + + ${VERSION_INFO_MAINT_VERSION}") configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in ${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY) ######################################################################## -#Environment setup +# Environment setup ######################################################################## if(NOT DEFINED CROSSCOMPILE_MULTILIB) set(CROSSCOMPILE_MULTILIB "") @@ -117,10 +116,10 @@ if(MSVC) endif(MSVC) ######################################################################## -#Dependencies setup +# Dependencies setup ######################################################################## -#cpu_features - sensible defaults, user settable option +# cpu_features - sensible defaults, user settable option if(CMAKE_SYSTEM_PROCESSOR MATCHES "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)|(^riscv)") option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON) @@ -159,7 +158,7 @@ else() message(STATUS "Building Volk without cpu_features") endif() -#Python +# Python include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)" PYTHON_MIN_VER_FOUND) @@ -169,12 +168,12 @@ if(NOT PYTHON_MIN_VER_FOUND) message(FATAL_ERROR "Python 3.4 or greater required to build VOLK") endif() -#Mako +# Mako if(NOT MAKO_FOUND) message(FATAL_ERROR 
"Mako templates required to build VOLK") endif() -#Check if we have std::filesystem +# Check if we have std::filesystem find_package( FILESYSTEM COMPONENTS Final Experimental @@ -184,9 +183,9 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) ######################################################################## -#check for aligned_alloc, since some compilers lack this C11 feature. -#For Apple - clang use `posix_memalign` -#For MSVC use `_aligned_malloc`. +# check for aligned_alloc, since some compilers lack this C11 feature. +# For Apple-clang use `posix_memalign` +# For MSVC use `_aligned_malloc`. ######################################################################## include(CheckSymbolExists) if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")) @@ -197,7 +196,7 @@ if(NOT USE_ALIGNED_ALLOC) endif() ######################################################################## -#Check if Orc is available +# Check if Orc is available ######################################################################## option(ENABLE_ORC "Enable Orc" True) if(ENABLE_ORC) @@ -207,17 +206,17 @@ else(ENABLE_ORC) endif(ENABLE_ORC) ######################################################################## -#Setup doxygen +# Setup doxygen ######################################################################## add_subdirectory(docs) ######################################################################## -#Detect / lib versus / lib64 +# Detect /lib versus /lib64 ######################################################################## include(GNUInstallDirs) ######################################################################## -#Setup the package config file +# Setup the package config file ######################################################################## #set variables found in the pc.in file set(prefix ${CMAKE_INSTALL_PREFIX}) @@ -234,7 +233,7 @@ install( COMPONENT "volk_devel") ######################################################################## -#Install all 
headers in the include directories +# Install all headers in the include directories ######################################################################## set(VOLK_RUNTIME_DIR bin) set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR}) @@ -271,7 +270,7 @@ install( COMPONENT "volk_devel") ######################################################################## -#On Apple only, set install name and use rpath correctly, if not already set +# On Apple only, set install name and use rpath correctly, if not already set ######################################################################## if(APPLE) if(NOT CMAKE_INSTALL_NAME_DIR) @@ -292,21 +291,21 @@ if(APPLE) endif(APPLE) ######################################################################## -#Create uninstall target +# Create uninstall target ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY) -#Only add the target if there isn't one defined already +# Only add the target if there isn't one defined already if(NOT TARGET uninstall) add_custom_target(uninstall ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() ######################################################################## -#Install our Cmake modules into $prefix / lib / cmake / volk -#See "Package Configuration Files" on page: -#http: // www.cmake.org/Wiki/CMake/Tutorials/Packaging +# Install our Cmake modules into $prefix/lib/cmake/volk +# See "Package Configuration Files" on page: +# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging ######################################################################## configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in @@ -316,7 +315,7 @@ configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY) 
######################################################################## -#Install cmake search routine for external use +# Install cmake search routine for external use ######################################################################## if(NOT CMAKE_MODULES_DIR) @@ -336,7 +335,7 @@ install( DESTINATION ${CMAKE_MODULES_DIR}/volk) ######################################################################## -#Option to enable QA testing, on by default +# Option to enable QA testing, on by default ######################################################################## option(ENABLE_TESTING "Enable QA testing" ON) if(ENABLE_TESTING) @@ -347,7 +346,7 @@ endif() message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF") ######################################################################## -#Option to enable post - build profiling using volk_profile, off by default +# Option to enable post-build profiling using volk_profile, off by default ######################################################################## option(ENABLE_PROFILING "Launch system profiler after build" OFF) if(ENABLE_PROFILING) @@ -373,12 +372,12 @@ endif() message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF") ######################################################################## -#Setup the library +# Setup the library ######################################################################## add_subdirectory(lib) ######################################################################## -#And the utility apps +# And the utility apps ######################################################################## add_subdirectory(apps) option(ENABLE_MODTOOL "Enable volk_modtool python utility" True) @@ -387,6 +386,6 @@ if(ENABLE_MODTOOL) endif() ######################################################################## -#Print summary +# Print summary ######################################################################## message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}") From 
af7e8fcc1f919d4ca6028fc2effe3f9f768a587e Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Thu, 29 Feb 2024 23:24:35 +0100 Subject: [PATCH 05/67] resolved comments Signed-off-by: Magnus Lundmark --- include/volk/volk_avx512_intrinsics.h | 21 +++-- kernels/volk/volk_32f_atan_32f.h | 102 +++++++++++------------- kernels/volk/volk_32fc_s32f_atan2_32f.h | 46 +++++------ 3 files changed, 83 insertions(+), 86 deletions(-) diff --git a/include/volk/volk_avx512_intrinsics.h b/include/volk/volk_avx512_intrinsics.h index 0bac1c6a9..6f6a05ee7 100644 --- a/include/volk/volk_avx512_intrinsics.h +++ b/include/volk/volk_avx512_intrinsics.h @@ -16,6 +16,10 @@ #define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ #include +//////////////////////////////////////////////////////////////////////// +// Place real parts of two complex vectors in output +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) { const __m512i idx = @@ -23,6 +27,10 @@ static inline __m512 _mm512_real(const __m512 z1, const __m512 z2) return _mm512_permutex2var_ps(z1, idx, z2); } +//////////////////////////////////////////////////////////////////////// +// Place imaginary parts of two complex vectors in output +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) { const __m512i idx = @@ -30,13 +38,12 @@ static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2) return _mm512_permutex2var_ps(z1, idx, z2); } -/* - * Approximate arctan(x) via polynomial expansion - * on the interval [-1, 1] - * - * Maximum relative error ~6.5e-7 - * Polynomial evaluated via Horner's method - */ +//////////////////////////////////////////////////////////////////////// +// Approximate arctan(x) via polynomial expansion on the interval [-1, 1] +// Maximum relative error ~6.5e-7 +// Polynomial evaluated via Horner's 
method +// Requires AVX512F +//////////////////////////////////////////////////////////////////////// static inline __m512 _mm512_arctan_poly_avx512(const __m512 x) { const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f); diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index ec0788268..03afea55a 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -63,8 +63,7 @@ static inline void volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; - for (; number < num_points; number++) { + for (unsigned int number = 0; number < num_points; number++) { *out++ = atanf(*in++); } } @@ -74,8 +73,7 @@ volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) static inline void volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) { - unsigned int number = 0; - for (; number < num_points; number++) { + for (unsigned int number = 0; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -85,17 +83,18 @@ volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_point #include #include static inline void -volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) +volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points) { const __m512 one = _mm512_set1_ps(1.f); const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_load_ps(in); + in += 16; __mmask16 swap_mask = _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, 
_CMP_GT_OS); __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), @@ -106,16 +105,14 @@ volk_32f_atan_32f_a_avx512(float* out, const float* in, unsigned int num_points) term = _mm512_sub_ps(term, result); result = _mm512_mask_blend_ps(swap_mask, result, term); _mm512_store_ps(out, result); - in += 16; out += 16; } - number = sixteenth_points * 16; - for (; number < num_points; number++) { + for (unsigned int number = sixteenth_points * 16; number < num_points; number++) { *out++ = volk_arctan(*in++); } } -#endif /* LV_HAVE_AVX512F for aligned */ +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include @@ -128,10 +125,11 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -141,12 +139,10 @@ volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_point term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_store_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -163,10 +159,11 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) const __m256 abs_mask = 
_mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -176,12 +173,10 @@ volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points) term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_store_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -198,10 +193,11 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - for (; number < quarter_points; number++) { + const unsigned int quarter_points = num_points / 4; + + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_load_ps(in); + in += 4; __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), _mm_blendv_ps(one, x, swap_mask)); @@ -211,12 +207,10 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) term = _mm_sub_ps(term, result); result = _mm_blendv_ps(result, term, swap_mask); _mm_store_ps(out, result); - in += 4; out += 4; } - number = quarter_points * 4; - for (; number < 
num_points; number++) { + for (unsigned int number = quarter_points * 4; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -230,17 +224,18 @@ volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points) #include #include static inline void -volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) +volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points) { const __m512 one = _mm512_set1_ps(1.f); const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f); const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_loadu_ps(in); + in += 16; __mmask16 swap_mask = _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), @@ -251,16 +246,14 @@ volk_32f_atan_32f_u_avx512(float* out, const float* in, unsigned int num_points) term = _mm512_sub_ps(term, result); result = _mm512_mask_blend_ps(swap_mask, result, term); _mm512_storeu_ps(out, result); - in += 16; out += 16; } - number = sixteenth_points * 16; - for (; number < num_points; number++) { + for (unsigned int number = sixteenth_points * 16; number < num_points; number++) { *out++ = volk_arctan(*in++); } } -#endif /* LV_HAVE_AVX512F for unaligned */ +#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ #if LV_HAVE_AVX2 && LV_HAVE_FMA #include @@ -272,10 +265,11 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = 
_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -285,12 +279,10 @@ volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_point term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_storeu_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -306,10 +298,11 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int eighth_points = num_points / 8; - for (; number < eighth_points; number++) { + const unsigned int eighth_points = num_points / 8; + + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); + in += 8; __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS); __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask), _mm256_blendv_ps(one, x, swap_mask)); @@ -319,12 +312,10 @@ volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points) term = _mm256_sub_ps(term, result); result = _mm256_blendv_ps(result, term, swap_mask); _mm256_storeu_ps(out, result); - in += 8; out += 8; } - number = eighth_points * 8; - for (; number < num_points; number++) { + for 
(unsigned int number = eighth_points * 8; number < num_points; number++) { *out++ = volk_arctan(*in++); } } @@ -341,10 +332,11 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - unsigned int number = 0; - unsigned int quarter_points = num_points / 4; - for (; number < quarter_points; number++) { + const unsigned int quarter_points = num_points / 4; + + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_loadu_ps(in); + in += 4; __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one); __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask), _mm_blendv_ps(one, x, swap_mask)); @@ -354,12 +346,10 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) term = _mm_sub_ps(term, result); result = _mm_blendv_ps(result, term, swap_mask); _mm_storeu_ps(out, result); - in += 4; out += 4; } - number = quarter_points * 4; - for (; number < num_points; number++) { + for (unsigned int number = quarter_points * 4; number < num_points; number++) { *out++ = volk_arctan(*in++); } } diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 414925417..5e8be5ce1 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -72,8 +72,8 @@ static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector, float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.f / normalizeFactor; - unsigned int number = 0; - for (; number < num_points; number++) { + + for (unsigned int number = 0; number < num_points; number++) { const float real = *inPtr++; const float imag = *inPtr++; *outPtr++ = atan2f(imag, real) * invNormalizeFactor; @@ -91,8 +91,8 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* 
outputVector, float* outPtr = outputVector; const float* inPtr = (float*)inputVector; const float invNormalizeFactor = 1.f / normalizeFactor; - unsigned int number = 0; - for (; number < num_points; number++) { + + for (unsigned int number = 0; number < num_points; number++) { const float x = *inPtr++; const float y = *inPtr++; *outPtr++ = volk_atan2(y, x) * invNormalizeFactor; @@ -103,10 +103,10 @@ static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector, #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ #include #include -static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, - const lv_32fc_t* complexVector, - const float normalizeFactor, - unsigned int num_points) +static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) { const float* in = (float*)complexVector; float* out = (float*)outputVector; @@ -120,7 +120,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, const __m512 zero = _mm512_setzero_ps(); unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; + const unsigned int sixteenth_points = num_points / 16; for (; number < sixteenth_points; number++) { __m512 z1 = _mm512_load_ps(in); in += 16; @@ -156,7 +156,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512(float* outputVector, number = sixteenth_points * 16; volk_32fc_s32f_atan2_32f_polynomial( - out, (lv_32fc_t*)in, normalizeFactor, num_points - number); + out, complexVector + number, normalizeFactor, num_points - number); } #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */ @@ -180,7 +180,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = 
_mm256_load_ps(in); in += 8; @@ -240,7 +240,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_load_ps(in); in += 8; @@ -287,10 +287,10 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, #if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ #include #include -static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, - const lv_32fc_t* complexVector, - const float normalizeFactor, - unsigned int num_points) +static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector, + const lv_32fc_t* complexVector, + const float normalizeFactor, + unsigned int num_points) { const float* in = (float*)complexVector; float* out = (float*)outputVector; @@ -303,9 +303,9 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); const __m512 zero = _mm512_setzero_ps(); - unsigned int number = 0; - unsigned int sixteenth_points = num_points / 16; - for (; number < sixteenth_points; number++) { + const unsigned int sixteenth_points = num_points / 16; + + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 z1 = _mm512_loadu_ps(in); in += 16; __m512 z2 = _mm512_loadu_ps(in); @@ -338,9 +338,9 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512(float* outputVector, out += 16; } - number = sixteenth_points * 16; + unsigned int number = sixteenth_points * 16; volk_32fc_s32f_atan2_32f_polynomial( - out, (lv_32fc_t*)in, normalizeFactor, num_points - number); + out, complexVector + number, normalizeFactor, num_points - number); } #endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */ @@ -364,7 +364,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, 
const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_loadu_ps(in); in += 8; @@ -424,7 +424,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; - unsigned int eighth_points = num_points / 8; + const unsigned int eighth_points = num_points / 8; for (; number < eighth_points; number++) { __m256 z1 = _mm256_loadu_ps(in); in += 8; From d365c41a500b49bb79d3330bd62c4c7c94920084 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sat, 24 Feb 2024 14:12:46 +0100 Subject: [PATCH 06/67] cpu_features: Update hints in README We should prefer a cpu_features version that is provided thru package managers. The README clearly states this preference now. Signed-off-by: Johannes Demel --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 16643761f..9f1f0d040 100644 --- a/README.md +++ b/README.md @@ -48,8 +48,15 @@ $ volk_profile ``` #### Missing submodule -We use [cpu_features](https://github.com/google/cpu_features) as a git submodule to detect CPU features, e.g. AVX. -There are two options to get the required code: +We use [cpu_features](https://github.com/google/cpu_features) to detect CPU features, e.g. AVX. +Some platforms require a very recent version that is not available through the appropriate package manager. +In this case you must use `cpu_features` as a submodule. + +**NOTE**: Most package managers provide recent enough `cpu_features` versions by now. +Please default to the provided `cpu_features` version first, and only use the submodule in cases where this fails. +Please open an issue if this is the case. 
+ +There are two options to get the required code in a submodule: ```bash git clone --recursive https://github.com/gnuradio/volk.git ``` From b71a27011b8b0539dd5f803cdf36e7fc6872976d Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sat, 30 Mar 2024 23:32:56 +0100 Subject: [PATCH 07/67] macos: Fix CI dependency error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following error popped up during our CI builds on MacOS: ```shell $ pip3 install mako shell: /bin/bash -e {0} error: externally-managed-environment × This environment is externally managed ╰─> To install Python packages system-wide, try brew install xyz, where xyz is the package you are trying to install. If you wish to install a non-brew-packaged Python package, create a virtual environment using python3 -m venv path/to/venv. Then use path/to/venv/bin/python and path/to/venv/bin/pip. If you wish to install a non-brew packaged Python application, it may be easiest to use pipx install xyz, which will manage a virtual environment for you. Make sure you have pipx installed. note: If you believe this is a mistake, please contact your Python installation or OS distribution provider. You can override this, at the risk of breaking your Python installation or OS, by passing --break-system-packages. hint: See PEP 668 for the detailed specification. ``` [PEP668](https://peps.python.org/pep-0668/) tries to help people to prevent dependency issues on their systems. We can be pretty sure that our controlled CI environment is not affected by this issue. Thus, we can use `--break-system-packages` and install system-wide. 
Signed-off-by: Johannes Demel --- .github/workflows/run-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 244e95d47..8fe1daa0b 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -254,7 +254,7 @@ jobs: with: submodules: 'recursive' - name: dependencies - run: pip3 install mako + run: pip3 install --break-system-packages mako && brew install orc - name: configure run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON .. - name: build From 4c693c5cfc908f20da8534d64cbfd8337cef80f5 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Tue, 27 Feb 2024 22:15:55 +0100 Subject: [PATCH 08/67] New AVX512F implementation Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_reciprocal_32f.h | 189 +++++++++++++++++++++++++ lib/kernel_tests.h | 1 + 2 files changed, 190 insertions(+) create mode 100644 kernels/volk/volk_32f_reciprocal_32f.h diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h new file mode 100644 index 000000000..5fba6926e --- /dev/null +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -0,0 +1,189 @@ +/* -*- c++ -*- */ +/* + * Copyright 2023 Magnus Lundmark + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! + * \page volk_32f_reciprocal_32f + * + * \b Overview + * + * Computes the reciprocal of the input vector and stores the results + * in the output vector. For the AVX512F implementation the relative + * error is < 2**(-14) = 6.1e-05 + * + * Dispatcher Prototype + * \code + * void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li in: A pointer to the input vector of floats. + * \li num_points: The number of data points. + * + * \b Outputs + * \li bVector: A pointer to the output vector of floats. 
+ * + * \b Example + * \code + int N = 10; + unsigned int alignment = volk_get_alignment(); + float* in = (float*)volk_malloc(sizeof(float)*N, alignment); + float* out = (float*)volk_malloc(sizeof(float)*N, alignment); + + for(unsigned int ii = 1; ii < N; ++ii){ + in[ii] = (float)(ii*ii); + } + + volk_32f_reciprocal_32f(out, in, N); + + for(unsigned int ii = 0; ii < N; ++ii){ + printf("out(%i) = %f\n", ii, out[ii]); + } + + volk_free(in); + volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H +#define INCLUDED_volk_32f_reciprocal_32f_a_H + +#ifdef LV_HAVE_GENERIC +static inline void +volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points) +{ + for (unsigned int i = 0; i < num_points; i++) { + out[i] = 1.f / in[i]; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE +#include +static inline void +volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_points) +{ + const __m128 ONE = _mm_set_ps1(1.f); + const unsigned int quarter_points = num_points / 4; + for (unsigned int number = 0; number < quarter_points; number++) { + __m128 x = _mm_load_ps(in); + in += 4; + __m128 r = _mm_div_ps(ONE, x); + _mm_store_ps(out, r); + out += 4; + } + + const unsigned int done = quarter_points * 4; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +static inline void +volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points) +{ + const __m256 ONE = _mm256_set1_ps(1.f); + const unsigned int eighth_points = num_points / 8; + for (unsigned int number = 0; number < eighth_points; number++) { + __m256 x = _mm256_load_ps(in); + in += 8; + __m256 r = _mm256_div_ps(ONE, x); + _mm256_store_ps(out, r); + out += 8; + } + + const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_AVX512F +#include +static inline 
void +volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points) +{ + const unsigned int sixteenth_points = num_points / 16; + for (unsigned int number = 0; number < sixteenth_points; number++) { + __m512 x = _mm512_load_ps(in); + in += 16; + __m512 r = _mm512_rcp14_ps(x); + _mm512_store_ps(out, r); + out += 16; + } + + const unsigned int done = sixteenth_points * 16; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX512F */ + +#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */ + +#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H +#define INCLUDED_volk_32f_reciprocal_32f_u_H + +#ifdef LV_HAVE_SSE +#include +static inline void +volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_points) +{ + const __m128 ONE = _mm_set_ps1(1.f); + const unsigned int quarter_points = num_points / 4; + for (unsigned int number = 0; number < quarter_points; number++) { + __m128 x = _mm_loadu_ps(in); + in += 4; + __m128 r = _mm_div_ps(ONE, x); + _mm_storeu_ps(out, r); + out += 4; + } + + const unsigned int done = quarter_points * 4; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +static inline void +volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points) +{ + const __m256 ONE = _mm256_set1_ps(1.f); + const unsigned int eighth_points = num_points / 8; + for (unsigned int number = 0; number < eighth_points; number++) { + __m256 x = _mm256_loadu_ps(in); + in += 8; + __m256 r = _mm256_div_ps(ONE, x); + _mm256_storeu_ps(out, r); + out += 8; + } + + const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_AVX512F +#include +static inline void +volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points) +{ + const unsigned int sixteenth_points = num_points / 16; + for (unsigned int 
number = 0; number < sixteenth_points; number++) { + __m512 x = _mm512_loadu_ps(in); + in += 16; + __m512 r = _mm512_rcp14_ps(x); + _mm512_storeu_ps(out, r); + out += 16; + } + + const unsigned int done = sixteenth_points * 16; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); +} +#endif /* LV_HAVE_AVX512F */ + +#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */ diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index 16c79c365..a22a40273 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -141,6 +141,7 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params.make_tol(6.15e-5))) QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_absolute(1e-5))) From faa2f3b02a044fa3927759dafe32a24431bd649e Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Tue, 27 Feb 2024 22:38:26 +0100 Subject: [PATCH 09/67] Updated copyright year Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_reciprocal_32f.h | 2 +- lib/kernel_tests.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h index 5fba6926e..42363d3fa 100644 --- a/kernels/volk/volk_32f_reciprocal_32f.h +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -1,6 +1,6 @@ /* -*- c++ -*- */ /* - * Copyright 2023 Magnus Lundmark + * Copyright 2024 Magnus Lundmark * * This file is part of VOLK * diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index a22a40273..57a296dcb 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -1,7 +1,7 @@ /* -*- c++ -*- */ /* * Copyright 2014 - 2021 Free Software Foundation, Inc. 
- * Copyright 2023 Magnus Lundmark + * Copyright 2023, 2024 Magnus Lundmark * * This file is part of VOLK * From e8ea1f900a2117677fddf6e2cbd33e7bc46b23cd Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Fri, 1 Mar 2024 13:37:29 +0100 Subject: [PATCH 10/67] formatting Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_reciprocal_32f.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h index 42363d3fa..37bd16a80 100644 --- a/kernels/volk/volk_32f_reciprocal_32f.h +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -70,6 +70,7 @@ volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_poin { const __m128 ONE = _mm_set_ps1(1.f); const unsigned int quarter_points = num_points / 4; + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_load_ps(in); in += 4; @@ -79,6 +80,7 @@ volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_poin } const unsigned int done = quarter_points * 4; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_SSE */ @@ -90,6 +92,7 @@ volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_poin { const __m256 ONE = _mm256_set1_ps(1.f); const unsigned int eighth_points = num_points / 8; + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_load_ps(in); in += 8; @@ -99,6 +102,7 @@ volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_poin } const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX */ @@ -109,6 +113,7 @@ static inline void volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points) { const unsigned int sixteenth_points = num_points / 16; + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_load_ps(in); in += 16; @@ -118,6 +123,7 @@ 
volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_p } const unsigned int done = sixteenth_points * 16; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX512F */ @@ -134,6 +140,7 @@ volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_poin { const __m128 ONE = _mm_set_ps1(1.f); const unsigned int quarter_points = num_points / 4; + for (unsigned int number = 0; number < quarter_points; number++) { __m128 x = _mm_loadu_ps(in); in += 4; @@ -143,6 +150,7 @@ volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_poin } const unsigned int done = quarter_points * 4; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_SSE */ @@ -154,6 +162,7 @@ volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_poin { const __m256 ONE = _mm256_set1_ps(1.f); const unsigned int eighth_points = num_points / 8; + for (unsigned int number = 0; number < eighth_points; number++) { __m256 x = _mm256_loadu_ps(in); in += 8; @@ -163,6 +172,7 @@ volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_poin } const unsigned int done = eighth_points * 8; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX */ @@ -173,6 +183,7 @@ static inline void volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points) { const unsigned int sixteenth_points = num_points / 16; + for (unsigned int number = 0; number < sixteenth_points; number++) { __m512 x = _mm512_loadu_ps(in); in += 16; @@ -182,6 +193,7 @@ volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_p } const unsigned int done = sixteenth_points * 16; + volk_32f_reciprocal_32f_generic(out, in, num_points - done); } #endif /* LV_HAVE_AVX512F */ From 0643178fda6264d128c401b178dc00fe01b48d60 Mon Sep 17 00:00:00 2001 From: Ron Economos Date: Mon, 24 Jun 2024 17:35:48 -0700 Subject: [PATCH 11/67] 
cmake: Suppress invalid escape sequence warnings with Python 3.12 Signed-off-by: Ron Economos --- cmake/Modules/VolkPython.cmake | 2 +- lib/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/VolkPython.cmake b/cmake/Modules/VolkPython.cmake index 73bc2914a..7beff74b9 100644 --- a/cmake/Modules/VolkPython.cmake +++ b/cmake/Modules/VolkPython.cmake @@ -144,7 +144,7 @@ function(VOLK_UNIQUE_TARGET desc) execute_process( COMMAND ${PYTHON_EXECUTABLE} -c "import re, hashlib unique = hashlib.sha256(b'${reldir}${ARGN}').hexdigest()[:5] -print(re.sub('\\W', '_', '${desc} ${reldir} ' + unique))" +print(re.sub(r'\\W', '_', '${desc} ${reldir} ' + unique))" OUTPUT_VARIABLE _target OUTPUT_STRIP_TRAILING_WHITESPACE) add_custom_target(${_target} ALL DEPENDS ${ARGN}) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 074f46f84..0464c7724 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -105,7 +105,7 @@ macro(check_arch arch_name) #make the have_flag have nice alphanum chars (just for looks/not necessary) execute_process( COMMAND ${PYTHON_EXECUTABLE} -c - "import re; print(re.sub('\\W', '_', '${have_flag}'))" + "import re; print(re.sub(r'\\W', '_', '${have_flag}'))" OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE) if(VOLK_FLAG_CHECK_FLAGS) From 22772665f705f70da7a173c7dbcb70beecdf6a51 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sat, 2 Mar 2024 22:53:58 +0100 Subject: [PATCH 12/67] cmake: Fix 64bit host CPU detection In cases where we don't cross-compile, we might want to detect if a CPU is 32bit or 64bit. CMake provides functionality for this case starting in CMake 3.10. Let's use it. Ubuntu 20.04 uses CMake 3.16. From the top of my head, this is the oldest supported distribution. Debian buster ships with CMake 3.13. 
Signed-off-by: Johannes Demel --- CMakeLists.txt | 4 +++- lib/CMakeLists.txt | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d64018451..8d55fa0a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,9 @@ ######################################################################## # Project setup ######################################################################## -cmake_minimum_required(VERSION 3.8) +# We use `IS_64BIT now: https://cmake.org/cmake/help/latest/command/cmake_host_system_information.html +cmake_minimum_required(VERSION 3.10) + set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "Choose build type: None Debug Release RelWithDebInfo MinSizeRel") diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 0464c7724..2c160b2f2 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -255,17 +255,16 @@ endif() ######################################################################## if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86) include(CheckTypeSize) - check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY) - if(${SIZEOF_CPU} EQUAL 64) + cmake_host_system_information(RESULT ASSUME_64BIT_HOST QUERY IS_64BIT) + if(ASSUME_64BIT_HOST) overrule_arch(32 "CPU width is 64 bits") - endif() - if(${SIZEOF_CPU} EQUAL 32) + else() overrule_arch(64 "CPU width is 32 bits") endif() #MSVC 64 bit does not have MMX, overrule it if(MSVC) - if(${SIZEOF_CPU} EQUAL 64) + if(ASSUME_64BIT_HOST) overrule_arch(mmx "No MMX for Win64") endif() force_arch(sse "Built-in for MSVC > 2013") From 34c3ff6ae4bec438c172867ee6adade9282d2f45 Mon Sep 17 00:00:00 2001 From: Doron Behar Date: Sat, 27 Jul 2024 20:50:30 +0300 Subject: [PATCH 13/67] cmake/pkgconfig: use CMAKE_INSTALL_FULL_* variables See: https://cmake.org/cmake/help/latest/module/GNUInstallDirs.html Signed-off-by: Doron Behar --- CMakeLists.txt | 6 ------ tmpl/volk.pc.in | 8 ++++---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git 
a/CMakeLists.txt b/CMakeLists.txt index 8d55fa0a7..01f0acb31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -220,12 +220,6 @@ include(GNUInstallDirs) ######################################################################## # Setup the package config file ######################################################################## -#set variables found in the pc.in file -set(prefix ${CMAKE_INSTALL_PREFIX}) -set(exec_prefix "\${prefix}") -set(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}") -set(includedir "\${prefix}/include") - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/tmpl/volk.pc.in ${CMAKE_CURRENT_BINARY_DIR}/volk.pc @ONLY) diff --git a/tmpl/volk.pc.in b/tmpl/volk.pc.in index 60dc8b0e2..5d408b05c 100644 --- a/tmpl/volk.pc.in +++ b/tmpl/volk.pc.in @@ -1,7 +1,7 @@ -prefix=@prefix@ -exec_prefix=@exec_prefix@ -libdir=@libdir@ -includedir=@includedir@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=@CMAKE_INSTALL_PREFIX@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ LV_CXXFLAGS=@LV_CXXFLAGS@ From 87959c5eb4f532857ffda6e6ab6a935a7a5a0fa8 Mon Sep 17 00:00:00 2001 From: Olaf Bernstein Date: Tue, 22 Oct 2024 01:54:18 +0200 Subject: [PATCH 14/67] add RISC-V Vector extension (RVV) kernels Signed-off-by: Olaf Bernstein --- .github/workflows/run-tests-rvv.yml | 54 +++++ cmake/Checks/check-rvv-intrinsics.c | 5 + cmake/Toolchains/rv64gcv-linux-gnu.cmake | 34 +++ gen/archs.xml | 44 ++++ gen/machines.xml | 12 + include/volk/volk_rvv_intrinsics.h | 77 +++++++ kernels/volk/volk_16i_32fc_dot_prod_32fc.h | 61 ++++++ kernels/volk/volk_16i_branch_4_state_8.h | 4 + kernels/volk/volk_16i_convert_8i.h | 15 ++ kernels/volk/volk_16i_max_star_16i.h | 4 + .../volk/volk_16i_max_star_horizontal_16i.h | 4 + .../volk/volk_16i_permute_and_scalar_add.h | 4 + kernels/volk/volk_16i_s32f_convert_32f.h | 17 ++ kernels/volk/volk_16i_x4_quad_max_star_16i.h | 4 + kernels/volk/volk_16i_x5_add_quad_16i_x4.h | 4 + kernels/volk/volk_16ic_convert_32fc.h | 19 ++ 
kernels/volk/volk_16ic_deinterleave_16i_x2.h | 41 ++++ .../volk/volk_16ic_deinterleave_real_16i.h | 17 ++ kernels/volk/volk_16ic_deinterleave_real_8i.h | 20 ++ kernels/volk/volk_16ic_magnitude_16i.h | 46 ++++ .../volk/volk_16ic_s32f_deinterleave_32f_x2.h | 47 ++++ .../volk_16ic_s32f_deinterleave_real_32f.h | 20 ++ kernels/volk/volk_16ic_s32f_magnitude_32f.h | 44 ++++ kernels/volk/volk_16ic_x2_dot_prod_16ic.h | 64 ++++++ kernels/volk/volk_16ic_x2_multiply_16ic.h | 48 ++++ kernels/volk/volk_16u_byteswap.h | 49 +++++ kernels/volk/volk_16u_byteswappuppet_16u.h | 22 ++ kernels/volk/volk_32f_64f_add_64f.h | 18 ++ kernels/volk/volk_32f_64f_multiply_64f.h | 17 ++ kernels/volk/volk_32f_8u_polarbutterfly_32f.h | 170 +++++++++++++++ .../volk_32f_8u_polarbutterflypuppet_32f.h | 57 +++++ kernels/volk/volk_32f_accumulator_s32f.h | 22 ++ kernels/volk/volk_32f_acos_32f.h | 68 ++++++ kernels/volk/volk_32f_asin_32f.h | 66 ++++++ kernels/volk/volk_32f_atan_32f.h | 42 ++++ kernels/volk/volk_32f_binary_slicer_32i.h | 16 ++ kernels/volk/volk_32f_binary_slicer_8i.h | 17 ++ kernels/volk/volk_32f_convert_64f.h | 15 ++ kernels/volk/volk_32f_cos_32f.h | 60 +++++ kernels/volk/volk_32f_exp_32f.h | 54 +++++ kernels/volk/volk_32f_expfast_32f.h | 21 ++ kernels/volk/volk_32f_index_max_16u.h | 28 +++ kernels/volk/volk_32f_index_max_32u.h | 28 +++ kernels/volk/volk_32f_index_min_16u.h | 28 +++ kernels/volk/volk_32f_index_min_32u.h | 30 ++- kernels/volk/volk_32f_invsqrt_32f.h | 15 ++ kernels/volk/volk_32f_log2_32f.h | 68 ++++++ kernels/volk/volk_32f_reciprocal_32f.h | 15 ++ .../volk/volk_32f_s32f_32f_fm_detect_32f.h | 37 ++++ kernels/volk/volk_32f_s32f_add_32f.h | 17 ++ ...k_32f_s32f_calc_spectral_noise_floor_32f.h | 35 +++ kernels/volk/volk_32f_s32f_clamppuppet_32f.h | 10 + kernels/volk/volk_32f_s32f_convert_16i.h | 17 ++ kernels/volk/volk_32f_s32f_convert_32i.h | 17 ++ kernels/volk/volk_32f_s32f_convert_8i.h | 17 ++ kernels/volk/volk_32f_s32f_convertpuppet_8u.h | 11 + 
.../volk/volk_32f_s32f_mod_rangepuppet_32f.h | 10 + kernels/volk/volk_32f_s32f_multiply_32f.h | 17 ++ kernels/volk/volk_32f_s32f_normalize.h | 14 ++ .../volk/volk_32f_s32f_s32f_mod_range_32f.h | 32 +++ kernels/volk/volk_32f_s32f_stddev_32f.h | 28 +++ kernels/volk/volk_32f_s32f_x2_clamp_32f.h | 21 ++ kernels/volk/volk_32f_s32f_x2_convert_8u.h | 19 ++ kernels/volk/volk_32f_sin_32f.h | 62 ++++++ kernels/volk/volk_32f_sqrt_32f.h | 16 ++ .../volk/volk_32f_stddev_and_mean_32f_x2.h | 71 ++++++ kernels/volk/volk_32f_tan_32f.h | 67 ++++++ kernels/volk/volk_32f_tanh_32f.h | 34 +++ kernels/volk/volk_32f_x2_add_32f.h | 17 ++ kernels/volk/volk_32f_x2_divide_32f.h | 18 ++ kernels/volk/volk_32f_x2_dot_prod_16i.h | 15 ++ kernels/volk/volk_32f_x2_dot_prod_32f.h | 24 ++ .../volk/volk_32f_x2_fm_detectpuppet_32f.h | 13 ++ kernels/volk/volk_32f_x2_interleave_32fc.h | 39 ++++ kernels/volk/volk_32f_x2_max_32f.h | 18 ++ kernels/volk/volk_32f_x2_min_32f.h | 18 ++ kernels/volk/volk_32f_x2_multiply_32f.h | 17 ++ kernels/volk/volk_32f_x2_pow_32f.h | 123 +++++++++++ kernels/volk/volk_32f_x2_powpuppet_32f.h | 12 + .../volk/volk_32f_x2_s32f_interleave_16ic.h | 46 ++++ kernels/volk/volk_32f_x2_subtract_32f.h | 18 ++ kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 41 ++++ kernels/volk/volk_32fc_32f_add_32fc.h | 19 ++ kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 58 +++++ kernels/volk/volk_32fc_32f_multiply_32fc.h | 19 ++ kernels/volk/volk_32fc_accumulator_s32fc.h | 29 +++ kernels/volk/volk_32fc_conjugate_32fc.h | 17 ++ kernels/volk/volk_32fc_convert_16ic.h | 19 ++ kernels/volk/volk_32fc_deinterleave_32f_x2.h | 42 ++++ kernels/volk/volk_32fc_deinterleave_64f_x2.h | 40 ++++ .../volk/volk_32fc_deinterleave_imag_32f.h | 18 ++ .../volk/volk_32fc_deinterleave_real_32f.h | 17 ++ .../volk/volk_32fc_deinterleave_real_64f.h | 17 ++ kernels/volk/volk_32fc_index_max_16u.h | 63 +++++- kernels/volk/volk_32fc_index_max_32u.h | 63 +++++- kernels/volk/volk_32fc_index_min_16u.h | 63 ++++++ 
kernels/volk/volk_32fc_index_min_32u.h | 63 ++++++ kernels/volk/volk_32fc_magnitude_32f.h | 37 ++++ .../volk/volk_32fc_magnitude_squared_32f.h | 37 ++++ kernels/volk/volk_32fc_s32f_atan2_32f.h | 109 +++++++++ .../volk_32fc_s32f_deinterleave_real_16i.h | 20 ++ kernels/volk/volk_32fc_s32f_magnitude_16i.h | 42 ++++ .../volk/volk_32fc_s32f_power_spectrum_32f.h | 163 ++++++++++++++ .../volk_32fc_s32fc_rotator2puppet_32fc.h | 30 +++ .../volk/volk_32fc_s32fc_x2_rotator2_32fc.h | 156 +++++++++++++ kernels/volk/volk_32fc_x2_add_32fc.h | 20 ++ .../volk_32fc_x2_conjugate_dot_prod_32fc.h | 67 ++++++ kernels/volk/volk_32fc_x2_divide_32fc.h | 61 ++++++ kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 65 ++++++ kernels/volk/volk_32fc_x2_multiply_32fc.h | 51 +++++ .../volk_32fc_x2_multiply_conjugate_32fc.h | 51 +++++ ...32fc_x2_s32f_square_dist_scalar_mult_32f.h | 58 +++++ ...fc_x2_s32fc_multiply_conjugate_add2_32fc.h | 65 ++++++ kernels/volk/volk_32fc_x2_square_dist_32f.h | 54 ++++- kernels/volk/volk_32i_s32f_convert_32f.h | 16 ++ kernels/volk/volk_32i_x2_and_32i.h | 17 ++ kernels/volk/volk_32i_x2_or_32i.h | 17 ++ kernels/volk/volk_32u_byteswap.h | 48 ++++ kernels/volk/volk_32u_byteswappuppet_32u.h | 22 ++ kernels/volk/volk_32u_popcnt.h | 18 ++ kernels/volk/volk_32u_popcntpuppet_32u.h | 32 ++- kernels/volk/volk_32u_reverse_32u.h | 53 +++++ kernels/volk/volk_64f_convert_32f.h | 15 ++ kernels/volk/volk_64f_x2_add_64f.h | 18 ++ kernels/volk/volk_64f_x2_max_64f.h | 17 ++ kernels/volk/volk_64f_x2_min_64f.h | 17 ++ kernels/volk/volk_64f_x2_multiply_64f.h | 18 ++ kernels/volk/volk_64u_byteswap.h | 49 +++++ kernels/volk/volk_64u_byteswappuppet_64u.h | 22 ++ kernels/volk/volk_64u_popcnt.h | 17 ++ kernels/volk/volk_64u_popcntpuppet_64u.h | 41 ++-- kernels/volk/volk_8i_convert_16i.h | 15 ++ kernels/volk/volk_8i_s32f_convert_32f.h | 17 ++ kernels/volk/volk_8ic_deinterleave_16i_x2.h | 22 ++ kernels/volk/volk_8ic_deinterleave_real_16i.h | 18 ++ kernels/volk/volk_8ic_deinterleave_real_8i.h | 17 ++ 
.../volk/volk_8ic_s32f_deinterleave_32f_x2.h | 24 ++ .../volk_8ic_s32f_deinterleave_real_32f.h | 19 ++ .../volk_8ic_x2_multiply_conjugate_16ic.h | 51 +++++ ...volk_8ic_x2_s32f_multiply_conjugate_32fc.h | 59 +++++ kernels/volk/volk_8u_conv_k7_r2puppet_8u.h | 134 ++++++------ kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 79 +++++++ kernels/volk/volk_8u_x3_encodepolar_8u_x2.h | 29 +++ .../volk/volk_8u_x3_encodepolarpuppet_8u.h | 42 ++++ kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 206 ++++++++++++++++++ lib/CMakeLists.txt | 16 ++ tmpl/volk_cpu.tmpl.c | 2 +- 147 files changed, 5334 insertions(+), 92 deletions(-) create mode 100644 .github/workflows/run-tests-rvv.yml create mode 100644 cmake/Checks/check-rvv-intrinsics.c create mode 100644 cmake/Toolchains/rv64gcv-linux-gnu.cmake create mode 100644 include/volk/volk_rvv_intrinsics.h diff --git a/.github/workflows/run-tests-rvv.yml b/.github/workflows/run-tests-rvv.yml new file mode 100644 index 000000000..b8184bb7b --- /dev/null +++ b/.github/workflows/run-tests-rvv.yml @@ -0,0 +1,54 @@ +# +# Copyright 2020 - 2022 Free Software Foundation, Inc. +# +# This file is part of VOLK +# +# SPDX-License-Identifier: LGPL-3.0-or-later +# + +name: Run VOLK tests on different RVV configurations + +on: [push, pull_request] + +jobs: + Tests: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Install packages + run: | + git submodule update --init --recursive + sudo apt-get update -q -y + sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18 + mkdir build + cd build + - name: Test gcc-14 VLEN=128 + run: | + cd build; rm -rf * + CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \ + cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. 
+ make -j$(nproc) + ARGS=-j$(nproc) make test + - name: Test gcc-14 VLEN=256 + run: | + cd build; rm -rf * + CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \ + cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release + make -j$(nproc) + ARGS=-j$(nproc) make test + - name: Test clang-18 VLEN=512 + run: | + cd build; rm -rf * + CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \ + cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. + make -j$(nproc) + ARGS=-j$(nproc) make test + - name: Test clang-18 VLEN=1024 + run: | + cd build; rm -rf * + CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \ + cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release + make -j$(nproc) + ARGS=-j$(nproc) make test + + diff --git a/cmake/Checks/check-rvv-intrinsics.c b/cmake/Checks/check-rvv-intrinsics.c new file mode 100644 index 000000000..48d874ded --- /dev/null +++ b/cmake/Checks/check-rvv-intrinsics.c @@ -0,0 +1,5 @@ +#if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14) +int main() { return 0; } +#else +#error "rvv intrinsics aren't supported" +#endif diff --git a/cmake/Toolchains/rv64gcv-linux-gnu.cmake b/cmake/Toolchains/rv64gcv-linux-gnu.cmake new file mode 100644 index 000000000..f6edd741c --- /dev/null +++ b/cmake/Toolchains/rv64gcv-linux-gnu.cmake @@ -0,0 +1,34 @@ +# +# Copyright 2024 Free Software Foundation, Inc. 
+# +# This file is part of VOLK +# +# SPDX-License-Identifier: LGPL-3.0-or-later +# + +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +set(CMAKE_C_COMPILER $ENV{CC}) +set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +set(CMAKE_CXX_COMPILER $ENV{CXX}) + +set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE) +set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "" FORCE) +set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE) + +set(CMAKE_OBJCOPY + ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy + CACHE INTERNAL "objcopy tool") +set(CMAKE_SIZE_UTIL + ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size + CACHE INTERNAL "size tool") + +set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH}) + +set(QEMU_VLEN $ENV{VLEN}) +if(NOT QEMU_VLEN) + set(QEMU_VLEN "128") +endif() + +set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on") diff --git a/gen/archs.xml b/gen/archs.xml index 164c7bb43..7f9713692 100644 --- a/gen/archs.xml +++ b/gen/archs.xml @@ -181,4 +181,48 @@ at the top, as a last resort. + + tmpl/ currently assumes that every arch.name starting with "rv" requires + RVV intrinsics + + + There is currently no mechanism in RISC-V to append extensions, + so each arch needs to specify all of them, and the order needs in the + machine definition needs to be from the fewest to the most extensions. + Fortunately, this maps quite well to the profiles concept. + + + + -march=rv64gcv + -march=rv64gcv + + + + + -march=rv64gcv + -march=rv64gcv + + It's unclear how performance portable segmented load/stores are, so the + default rvv implementations avoid using them. + This is a pseudo arch for separate segmented load/store implementations, + and is expected to never be used standalone without "rvv". + + + + + google/cpu_features currently doesn't support these extensions and profiles. 
+ + + + + diff --git a/gen/machines.xml b/gen/machines.xml index 887f97949..64e1bbd81 100644 --- a/gen/machines.xml +++ b/gen/machines.xml @@ -33,6 +33,18 @@ generic riscv64 orc| + +generic riscv64 rvv rvvseg orc| + + + + + + generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc| diff --git a/include/volk/volk_rvv_intrinsics.h b/include/volk/volk_rvv_intrinsics.h new file mode 100644 index 000000000..85e21d436 --- /dev/null +++ b/include/volk/volk_rvv_intrinsics.h @@ -0,0 +1,77 @@ +/* -*- c++ -*- */ +/* + * Copyright 2024 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/* + * This file is intended to hold RVV intrinsics of intrinsics. + * They should be used in VOLK kernels to avoid copy-paste. + */ + +#ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ +#define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ +#include + +#define RISCV_SHRINK2(op, T, S, v) \ + __riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ + __riscv_vget_##T##S##m1(v, 1), \ + __riscv_vsetvlmax_e##S##m1()) + +#define RISCV_SHRINK4(op, T, S, v) \ + __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ + __riscv_vget_##T##S##m1(v, 1), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \ + __riscv_vget_##T##S##m1(v, 3), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_vsetvlmax_e##S##m1()) + +#define RISCV_SHRINK8(op, T, S, v) \ + __riscv_##op(__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ + __riscv_vget_##T##S##m1(v, 1), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \ + __riscv_vget_##T##S##m1(v, 3), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \ + __riscv_vget_##T##S##m1(v, 5), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_##op(__riscv_vget_##T##S##m1(v, 6), \ + __riscv_vget_##T##S##m1(v, 7), \ + __riscv_vsetvlmax_e##S##m1()), \ + __riscv_vsetvlmax_e##S##m1()), \ + 
__riscv_vsetvlmax_e##S##m1()) + +#define RISCV_PERM4(f, v, vidx) \ + __riscv_vcreate_v_u8m1_u8m4( \ + f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1())) + +#define RISCV_LUT4(f, vtbl, v) \ + __riscv_vcreate_v_u8m1_u8m4( \ + f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \ + f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \ + f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \ + f(vtbl, __riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1())) + +#define RISCV_PERM8(f, v, vidx) \ + __riscv_vcreate_v_u8m1_u8m8( \ + f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \ + f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1())) + +#define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl) + +#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */ diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h index 8949785fc..8d772ba8c 100644 --- a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h +++ b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h @@ -668,5 +668,66 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result, #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = 
__riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = + __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl); + vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl); + vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result, + const short* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = + __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl); + vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl); + vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = 
lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/ diff --git a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h index b0f4d3b6e..775b15236 100644 --- a/kernels/volk/volk_16i_branch_4_state_8.h +++ b/kernels/volk/volk_16i_branch_4_state_8.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_branch_4_state_8 * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h index cb7168ef8..648712af1 100644 --- a/kernels/volk/volk_16i_convert_8i.h +++ b/kernels/volk/volk_16i_convert_8i.h @@ -275,5 +275,20 @@ static inline void volk_16i_convert_8i_neon(int8_t* outputVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16i_convert_8i_rvv(int8_t* outputVector, + const int16_t* inputVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e16m8(n); + vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl); + __riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_16i_convert_8i_a_H */ diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h index fba73da10..ab0a4bcf8 100644 --- a/kernels/volk/volk_16i_max_star_16i.h +++ b/kernels/volk/volk_16i_max_star_16i.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_max_star_16i * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h index 2b0b65c30..ee08ba439 100644 --- a/kernels/volk/volk_16i_max_star_horizontal_16i.h +++ b/kernels/volk/volk_16i_max_star_horizontal_16i.h @@ -11,6 +11,10 @@ /*! 
* \page volk_16i_max_star_horizontal_16i * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h index 077c37b00..f57603db7 100644 --- a/kernels/volk/volk_16i_permute_and_scalar_add.h +++ b/kernels/volk/volk_16i_permute_and_scalar_add.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_permute_and_scalar_add * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h index 817ecd22f..1f9660cea 100644 --- a/kernels/volk/volk_16i_s32f_convert_32f.h +++ b/kernels/volk/volk_16i_s32f_convert_32f.h @@ -483,4 +483,21 @@ static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16i_s32f_convert_32f_rvv(float* outputVector, + const int16_t* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t v = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(inputVector, vl), vl); + __riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */ diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h index a8337cc37..94e264fe8 100644 --- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h +++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_x4_quad_max_star_16i * + * \b Deprecation + * + * This kernel is deprecated. 
+ * * \b Overview * * diff --git a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h index 53fa8de58..ba14c59db 100644 --- a/kernels/volk/volk_16i_x5_add_quad_16i_x4.h +++ b/kernels/volk/volk_16i_x5_add_quad_16i_x4.h @@ -10,6 +10,10 @@ /*! * \page volk_16i_x5_add_quad_16i_x4 * + * \b Deprecation + * + * This kernel is deprecated. + * * \b Overview * * diff --git a/kernels/volk/volk_16ic_convert_32fc.h b/kernels/volk/volk_16ic_convert_32fc.h index 7a779bf8b..99fe7cb2c 100644 --- a/kernels/volk/volk_16ic_convert_32fc.h +++ b/kernels/volk/volk_16ic_convert_32fc.h @@ -315,4 +315,23 @@ static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector, } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_convert_32fc_rvv(lv_32fc_t* outputVector, + const lv_16sc_t* inputVector, + unsigned int num_points) +{ + const int16_t* in = (const int16_t*)inputVector; + float* out = (float*)outputVector; + size_t n = num_points * 2; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4_t v = __riscv_vle16_v_i16m4(in, vl); + __riscv_vse32(out, __riscv_vfwcvt_f(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ diff --git a/kernels/volk/volk_16ic_deinterleave_16i_x2.h b/kernels/volk/volk_16ic_deinterleave_16i_x2.h index 37fb41e16..9f4ad7f7b 100644 --- a/kernels/volk/volk_16ic_deinterleave_16i_x2.h +++ b/kernels/volk/volk_16ic_deinterleave_16i_x2.h @@ -375,4 +375,45 @@ static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_deinterleave_16i_x2_rvv(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vuint32m8_t vc = 
__riscv_vle32_v_u32m8((const uint32_t*)complexVector, vl); + vuint16m4_t vr = __riscv_vnsrl(vc, 0, vl); + vuint16m4_t vi = __riscv_vnsrl(vc, 16, vl); + __riscv_vse16((uint16_t*)iBuffer, vr, vl); + __riscv_vse16((uint16_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_16ic_deinterleave_16i_x2_rvvseg(int16_t* iBuffer, + int16_t* qBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vuint16m4x2_t vc = + __riscv_vlseg2e16_v_u16m4x2((const uint16_t*)complexVector, vl); + vuint16m4_t vr = __riscv_vget_u16m4(vc, 0); + vuint16m4_t vi = __riscv_vget_u16m4(vc, 1); + __riscv_vse16((uint16_t*)iBuffer, vr, vl); + __riscv_vse16((uint16_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */ diff --git a/kernels/volk/volk_16ic_deinterleave_real_16i.h b/kernels/volk/volk_16ic_deinterleave_real_16i.h index 92110a3a4..f5a9696fa 100644 --- a/kernels/volk/volk_16ic_deinterleave_real_16i.h +++ b/kernels/volk/volk_16ic_deinterleave_real_16i.h @@ -377,4 +377,21 @@ static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_deinterleave_real_16i_rvv(int16_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + const uint32_t* in = (const uint32_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl); + __riscv_vse16((uint16_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */ diff --git a/kernels/volk/volk_16ic_deinterleave_real_8i.h 
b/kernels/volk/volk_16ic_deinterleave_real_8i.h index 231be4171..257ea5195 100644 --- a/kernels/volk/volk_16ic_deinterleave_real_8i.h +++ b/kernels/volk/volk_16ic_deinterleave_real_8i.h @@ -415,4 +415,24 @@ static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, } } #endif /* LV_HAVE_AVX2 */ + + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_deinterleave_real_8i_rvv(int8_t* iBuffer, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + const uint32_t* in = (const uint32_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl); + __riscv_vse8( + (uint8_t*)iBuffer, __riscv_vnsrl(__riscv_vnsrl(vc, 0, vl), 8, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */ diff --git a/kernels/volk/volk_16ic_magnitude_16i.h b/kernels/volk/volk_16ic_magnitude_16i.h index 76472540e..79553d652 100644 --- a/kernels/volk/volk_16ic_magnitude_16i.h +++ b/kernels/volk/volk_16ic_magnitude_16i.h @@ -411,4 +411,50 @@ static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, } #endif /* LV_HAVE_NEONV7 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_magnitude_16i_rvv(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + const float scale = SHRT_MAX, iscale = 1.0f / scale; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl); + vint16m4_t vr = __riscv_vnsra(vc, 0, vl); + vint16m4_t vi = __riscv_vnsra(vc, 16, vl); + vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), iscale, vl); + vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), iscale, vl); + vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl); + vf = 
__riscv_vfmul(__riscv_vfsqrt(vf, vl), scale, vl); + __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(vf, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_16ic_magnitude_16i_rvvseg(int16_t* magnitudeVector, + const lv_16sc_t* complexVector, + unsigned int num_points) +{ + const float scale = SHRT_MAX, iscale = 1.0f / scale; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl); + vint16m4_t vr = __riscv_vget_i16m4(vc, 0); + vint16m4_t vi = __riscv_vget_i16m4(vc, 1); + vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), iscale, vl); + vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), iscale, vl); + vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl); + vf = __riscv_vfmul(__riscv_vfsqrt(vf, vl), scale, vl); + __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(vf, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_16ic_magnitude_16i_u_H */ diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h index 219e977c7..7f9b8ad68 100644 --- a/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h +++ b/kernels/volk/volk_16ic_s32f_deinterleave_32f_x2.h @@ -327,4 +327,51 @@ volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_s32f_deinterleave_32f_x2_rvv(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl); + vint16m4_t vr = __riscv_vnsra(vc, 0, vl); + vint16m4_t vi = 
__riscv_vnsra(vc, 16, vl); + vfloat32m8_t vrf = __riscv_vfwcvt_f(vr, vl); + vfloat32m8_t vif = __riscv_vfwcvt_f(vi, vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl); + __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_16ic_s32f_deinterleave_32f_x2_rvvseg(float* iBuffer, + float* qBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl); + vint16m4_t vr = __riscv_vget_i16m4(vc, 0); + vint16m4_t vi = __riscv_vget_i16m4(vc, 1); + vfloat32m8_t vrf = __riscv_vfwcvt_f(vr, vl); + vfloat32m8_t vif = __riscv_vfwcvt_f(vi, vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl); + __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */ diff --git a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h index 556883299..e8a0d1a0b 100644 --- a/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h +++ b/kernels/volk/volk_16ic_s32f_deinterleave_real_32f.h @@ -334,4 +334,24 @@ volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_16ic_s32f_deinterleave_real_32f_rvv(float* iBuffer, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + const int32_t* in = (const int32_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vint32m8_t vc = __riscv_vle32_v_i32m8(in, vl); + vfloat32m8_t vr = 
__riscv_vfwcvt_f(__riscv_vncvt_x(vc, vl), vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vr, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H */ diff --git a/kernels/volk/volk_16ic_s32f_magnitude_32f.h b/kernels/volk/volk_16ic_s32f_magnitude_32f.h index 89600632f..8b193ee22 100644 --- a/kernels/volk/volk_16ic_s32f_magnitude_32f.h +++ b/kernels/volk/volk_16ic_s32f_magnitude_32f.h @@ -329,4 +329,48 @@ static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_s32f_magnitude_32f_rvv(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl); + vint16m4_t vr = __riscv_vnsra(vc, 0, vl); + vint16m4_t vi = __riscv_vnsra(vc, 16, vl); + vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl); + vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl); + vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl); + __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_16ic_s32f_magnitude_32f_rvvseg(float* magnitudeVector, + const lv_16sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl); + vint16m4_t vr = __riscv_vget_i16m4(vc, 0); + vint16m4_t vi = __riscv_vget_i16m4(vc, 1); + vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, 
vl); + vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl); + vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl); + __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */ diff --git a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h index 48e33abf7..a12350a0d 100644 --- a/kernels/volk/volk_16ic_x2_dot_prod_16ic.h +++ b/kernels/volk/volk_16ic_x2_dot_prod_16ic.h @@ -690,4 +690,68 @@ static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, #endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_RVV +#include "volk_32fc_x2_dot_prod_32fc.h" + +static inline void volk_16ic_x2_dot_prod_16ic_rvv(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) +{ + vint16m4_t vsumr = __riscv_vmv_v_x_i16m4(0, __riscv_vsetvlmax_e16m4()); + vint16m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t va = __riscv_vle32_v_i32m8((const int32_t*)in_a, vl); + vint32m8_t vb = __riscv_vle32_v_i32m8((const int32_t*)in_b, vl); + vint16m4_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 16, vl); + vint16m4_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 16, vl); + vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e16m1(); + vint16m1_t vr = RISCV_SHRINK4(vadd, i, 16, vsumr); + vint16m1_t vi = RISCV_SHRINK4(vadd, i, 16, vsumi); + vint16m1_t z = __riscv_vmv_s_x_i16m1(0, vl); + *result = lv_cmake(__riscv_vmv_x(__riscv_vredsum(vr, z, vl)), + __riscv_vmv_x(__riscv_vredsum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef 
LV_HAVE_RVVSEG +#include "volk_32fc_x2_dot_prod_32fc.h" + + +static inline void volk_16ic_x2_dot_prod_16ic_rvvseg(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) +{ + vint16m4_t vsumr = __riscv_vmv_v_x_i16m4(0, __riscv_vsetvlmax_e16m4()); + vint16m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t va = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_a, vl); + vint16m4x2_t vb = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_b, vl); + vint16m4_t var = __riscv_vget_i16m4(va, 0), vai = __riscv_vget_i16m4(va, 1); + vint16m4_t vbr = __riscv_vget_i16m4(vb, 0), vbi = __riscv_vget_i16m4(vb, 1); + vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e16m1(); + vint16m1_t vr = RISCV_SHRINK4(vadd, i, 16, vsumr); + vint16m1_t vi = RISCV_SHRINK4(vadd, i, 16, vsumi); + vint16m1_t z = __riscv_vmv_s_x_i16m1(0, vl); + *result = lv_cmake(__riscv_vmv_x(__riscv_vredsum(vr, z, vl)), + __riscv_vmv_x(__riscv_vredsum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/ diff --git a/kernels/volk/volk_16ic_x2_multiply_16ic.h b/kernels/volk/volk_16ic_x2_multiply_16ic.h index 03ee145c9..37f0fb66a 100644 --- a/kernels/volk/volk_16ic_x2_multiply_16ic.h +++ b/kernels/volk/volk_16ic_x2_multiply_16ic.h @@ -462,4 +462,52 @@ static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_16ic_x2_multiply_16ic_rvv(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl, result += 
vl) { + vl = __riscv_vsetvl_e16m4(n); + vint32m8_t va = __riscv_vle32_v_i32m8((const int32_t*)in_a, vl); + vint32m8_t vb = __riscv_vle32_v_i32m8((const int32_t*)in_b, vl); + vint16m4_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 16, vl); + vint16m4_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 16, vl); + vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl); + vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr); + vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi); + vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl); + __riscv_vse32((uint32_t*)result, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_16ic_x2_multiply_16ic_rvvseg(lv_16sc_t* result, + const lv_16sc_t* in_a, + const lv_16sc_t* in_b, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl, result += vl) { + vl = __riscv_vsetvl_e16m4(n); + vint16m4x2_t va = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_a, vl); + vint16m4x2_t vb = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_b, vl); + vint16m4_t var = __riscv_vget_i16m4(va, 0), vai = __riscv_vget_i16m4(va, 1); + vint16m4_t vbr = __riscv_vget_i16m4(vb, 0), vbi = __riscv_vget_i16m4(vb, 1); + vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl); + __riscv_vsseg2e16_v_i16m4x2( + (int16_t*)result, __riscv_vcreate_v_i16m4x2(vr, vi), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/ diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h index 8b1b8c032..50e59906b 100644 --- a/kernels/volk/volk_16u_byteswap.h +++ b/kernels/volk/volk_16u_byteswap.h @@ -280,5 +280,54 @@ static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int nu } 
#endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_16u_byteswap_rvv(uint16_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + size_t vlmax = __riscv_vsetvlmax_e8m1(); + if (vlmax <= 256) { + vuint8m1_t vidx = __riscv_vreinterpret_u8m1( + __riscv_vsub(__riscv_vreinterpret_u16m1(__riscv_vid_v_u8m1(vlmax)), + 0x100 - 0x1, + vlmax / 2)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgather, v, vidx); + __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl); + } + } else { + vuint16m2_t vidx = __riscv_vreinterpret_u16m2( + __riscv_vsub(__riscv_vreinterpret_u32m2(__riscv_vid_v_u16m2(vlmax)), + 0x10000 - 0x1, + vlmax / 2)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx); + __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl); + } + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA23 +#include + +static inline void volk_16u_byteswap_rva23(uint16_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint16m8_t v = __riscv_vle16_v_u16m8(intsToSwap, vl); + __riscv_vse16(intsToSwap, __riscv_vrev8(v, vl), vl); + } +} +#endif /* LV_HAVE_RVA23 */ #endif /* INCLUDED_volk_16u_byteswap_a_H */ diff --git a/kernels/volk/volk_16u_byteswappuppet_16u.h b/kernels/volk/volk_16u_byteswappuppet_16u.h index 16e75d91d..f01129eb6 100644 --- a/kernels/volk/volk_16u_byteswappuppet_16u.h +++ b/kernels/volk/volk_16u_byteswappuppet_16u.h @@ -102,4 +102,26 @@ static inline void volk_16u_byteswappuppet_16u_u_orc(uint16_t* output, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +static inline void 
volk_16u_byteswappuppet_16u_rvv(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ + + volk_16u_byteswap_rvv((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); +} +#endif + +#ifdef LV_HAVE_RVA23 +static inline void volk_16u_byteswappuppet_16u_rva23(uint16_t* output, + uint16_t* intsToSwap, + unsigned int num_points) +{ + + volk_16u_byteswap_rva23((uint16_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t)); +} +#endif + #endif diff --git a/kernels/volk/volk_32f_64f_add_64f.h b/kernels/volk/volk_32f_64f_add_64f.h index 06b568199..54d890e33 100644 --- a/kernels/volk/volk_32f_64f_add_64f.h +++ b/kernels/volk/volk_32f_64f_add_64f.h @@ -230,4 +230,22 @@ static inline void volk_32f_64f_add_64f_a_avx(double* cVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_64f_add_64f_rvv(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_64f_add_64f_u_H */ diff --git a/kernels/volk/volk_32f_64f_multiply_64f.h b/kernels/volk/volk_32f_64f_multiply_64f.h index 069cd73e3..5ff815789 100644 --- a/kernels/volk/volk_32f_64f_multiply_64f.h +++ b/kernels/volk/volk_32f_64f_multiply_64f.h @@ -188,5 +188,22 @@ static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_64f_multiply_64f_rvv(double* cVector, + const float* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; 
+ for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_64f_multiply_64f_u_H */ diff --git a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h index b3683a967..41e98a804 100644 --- a/kernels/volk/volk_32f_8u_polarbutterfly_32f.h +++ b/kernels/volk/volk_32f_8u_polarbutterfly_32f.h @@ -383,4 +383,174 @@ static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_8u_polarbutterfly_32f_rvv(float* llrs, + unsigned char* u, + const int frame_exp, + const int stage, + const int u_num, + const int row) +{ + const int frame_size = 0x01 << frame_exp; + if (row % 2) { // for odd rows just do the only necessary calculation and return. + const float* next_llrs = llrs + frame_size + row; + *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); + return; + } + + const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); + if (max_stage_depth < 3) { // vectorized version needs larger vectors. + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); + return; + } + + int loop_stage = max_stage_depth; + int stage_size = 0x01 << loop_stage; + + float* src_llr_ptr; + float* dst_llr_ptr; + + if (row) { // not necessary for ZERO row. == first bit to be decoded. + // first do bit combination for all stages + // effectively encode some decoded bits again. 
+ unsigned char* u_target = u + frame_size; + unsigned char* u_temp = u + 2 * frame_size; + memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); + + volk_8u_x2_encodeframepolar_8u_rvv(u_target, u_temp, stage_size); + + src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; + dst_llr_ptr = llrs + max_stage_depth * frame_size + row; + + size_t n = stage_size; + for (size_t vl; n > 0; + n -= vl, u_target += vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) { + vl = __riscv_vsetvl_e32m1(n); + vint8mf4_t v = __riscv_vle8_v_i8mf4((int8_t*)u_target, vl); + vuint64m2_t llr = __riscv_vle64_v_u64m2((const uint64_t*)src_llr_ptr, vl); + vfloat32m1_t llr0 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 0, vl)); + vfloat32m1_t llr1 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 32, vl)); + llr0 = __riscv_vfneg_mu(__riscv_vmslt(v, 0, vl), llr0, llr0, vl); + llr0 = __riscv_vfadd(llr0, llr1, vl); + __riscv_vse32(dst_llr_ptr, llr0, vl); + } + + --loop_stage; + stage_size >>= 1; + } + + const int min_stage = stage > 2 ? stage : 2; + + while (min_stage < loop_stage) { + dst_llr_ptr = llrs + loop_stage * frame_size + row; + src_llr_ptr = dst_llr_ptr + frame_size; + + size_t n = stage_size; + for (size_t vl; n > 0; n -= vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) { + vl = __riscv_vsetvl_e32m1(n); + vuint64m2_t llr = __riscv_vle64_v_u64m2((const uint64_t*)src_llr_ptr, vl); + vfloat32m1_t llr0 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 0, vl)); + vfloat32m1_t llr1 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 32, vl)); + vfloat32m1_t v = + __riscv_vfmin(__riscv_vfabs(llr0, vl), __riscv_vfabs(llr1, vl), vl); + v = __riscv_vfsgnjx(__riscv_vfsgnj(v, llr0, vl), llr1, vl); + __riscv_vse32(dst_llr_ptr, v, vl); + } + + --loop_stage; + stage_size >>= 1; + } + + // for stages < 3 vectors are too small!. 
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32f_8u_polarbutterfly_32f_rvvseg(float* llrs, + unsigned char* u, + const int frame_exp, + const int stage, + const int u_num, + const int row) +{ + const int frame_size = 0x01 << frame_exp; + if (row % 2) { // for odd rows just do the only necessary calculation and return. + const float* next_llrs = llrs + frame_size + row; + *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]); + return; + } + + const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row); + if (max_stage_depth < 3) { // vectorized version needs larger vectors. + volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row); + return; + } + + int loop_stage = max_stage_depth; + int stage_size = 0x01 << loop_stage; + + float* src_llr_ptr; + float* dst_llr_ptr; + + if (row) { // not necessary for ZERO row. == first bit to be decoded. + // first do bit combination for all stages + // effectively encode some decoded bits again. 
+ unsigned char* u_target = u + frame_size; + unsigned char* u_temp = u + 2 * frame_size; + memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size); + + volk_8u_x2_encodeframepolar_8u_rvv(u_target, u_temp, stage_size); + + src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size; + dst_llr_ptr = llrs + max_stage_depth * frame_size + row; + + size_t n = stage_size; + for (size_t vl; n > 0; + n -= vl, u_target += vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) { + vl = __riscv_vsetvl_e32m1(n); + vint8mf4_t v = __riscv_vle8_v_i8mf4((int8_t*)u_target, vl); + vfloat32m1x2_t llr = __riscv_vlseg2e32_v_f32m1x2(src_llr_ptr, vl); + vfloat32m1_t llr0 = __riscv_vget_f32m1(llr, 0); + vfloat32m1_t llr1 = __riscv_vget_f32m1(llr, 1); + llr0 = __riscv_vfneg_mu(__riscv_vmslt(v, 0, vl), llr0, llr0, vl); + llr0 = __riscv_vfadd(llr0, llr1, vl); + __riscv_vse32(dst_llr_ptr, llr0, vl); + } + + --loop_stage; + stage_size >>= 1; + } + + const int min_stage = stage > 2 ? stage : 2; + + while (min_stage < loop_stage) { + dst_llr_ptr = llrs + loop_stage * frame_size + row; + src_llr_ptr = dst_llr_ptr + frame_size; + + size_t n = stage_size; + for (size_t vl; n > 0; n -= vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) { + vl = __riscv_vsetvl_e32m1(n); + vfloat32m1x2_t llr = __riscv_vlseg2e32_v_f32m1x2(src_llr_ptr, vl); + vfloat32m1_t llr0 = __riscv_vget_f32m1(llr, 0); + vfloat32m1_t llr1 = __riscv_vget_f32m1(llr, 1); + vfloat32m1_t v = + __riscv_vfmin(__riscv_vfabs(llr0, vl), __riscv_vfabs(llr1, vl), vl); + v = __riscv_vfsgnjx(__riscv_vfsgnj(v, llr0, vl), llr1, vl); + __riscv_vse32(dst_llr_ptr, v, vl); + } + + --loop_stage; + stage_size >>= 1; + } + + // for stages < 3 vectors are too small!. 
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row); +} +#endif /* LV_HAVE_RVVSEG */ + #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */ diff --git a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h index c97da33d0..6ebcd22e4 100644 --- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h @@ -162,5 +162,62 @@ static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +static inline void volk_32f_8u_polarbutterflypuppet_32f_rvv(float* llrs, + const float* input, + unsigned char* u, + const int elements) +{ + (void)input; // suppress unused parameter warning + + if (elements < 2) { + return; + } + + unsigned int frame_size = maximum_frame_size(elements); + unsigned int frame_exp = log2_of_power_of_2(frame_size); + + sanitize_bytes(u, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); + generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); + + unsigned int u_num = 0; + for (; u_num < frame_size; u_num++) { + volk_32f_8u_polarbutterfly_32f_rvv(llrs, u, frame_exp, 0, u_num, u_num); + u[u_num] = llrs[u_num] > 0 ? 
0 : 1; + } + + clean_up_intermediate_values(llrs, u, frame_size, elements); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVVSEG +static inline void volk_32f_8u_polarbutterflypuppet_32f_rvvseg(float* llrs, + const float* input, + unsigned char* u, + const int elements) +{ + (void)input; // suppress unused parameter warning + + if (elements < 2) { + return; + } + + unsigned int frame_size = maximum_frame_size(elements); + unsigned int frame_exp = log2_of_power_of_2(frame_size); + + sanitize_bytes(u, elements); + clean_up_intermediate_values(llrs, u, frame_size, elements); + generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size); + + unsigned int u_num = 0; + for (; u_num < frame_size; u_num++) { + volk_32f_8u_polarbutterfly_32f_rvvseg(llrs, u, frame_exp, 0, u_num, u_num); + u[u_num] = llrs[u_num] > 0 ? 0 : 1; + } + + clean_up_intermediate_values(llrs, u, frame_size, elements); +} +#endif /* LV_HAVE_RVVSEG */ #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */ diff --git a/kernels/volk/volk_32f_accumulator_s32f.h b/kernels/volk/volk_32f_accumulator_s32f.h index 1cd8568e4..7e9a81f76 100644 --- a/kernels/volk/volk_32f_accumulator_s32f.h +++ b/kernels/volk/volk_32f_accumulator_s32f.h @@ -232,4 +232,26 @@ static inline void volk_32f_accumulator_s32f_generic(float* result, } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32f_accumulator_s32f_rvv(float* result, + const float* inputBuffer, + unsigned int num_points) +{ + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl); + vsum = __riscv_vfadd_tu(vsum, vsum, v, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = 
__riscv_vfmv_f(__riscv_vfredusum(v, z, vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */ diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h index 5cf0d693a..dd4813ac6 100644 --- a/kernels/volk/volk_32f_acos_32f.h +++ b/kernels/volk/volk_32f_acos_32f.h @@ -501,4 +501,72 @@ volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_acos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax); + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax); + const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax); + +#if ACOS_TERMS == 2 + const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax); +#elif ACOS_TERMS == 3 + const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax); +#elif ACOS_TERMS == 4 + const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax); +#endif + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t a = + __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfmsac(cf1, v, v, vl), vl), v, vl); + vfloat32m2_t z = __riscv_vfabs(a, vl); + vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl); + x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); + x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); + x = __riscv_vfdiv(cf1, x, vl); + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + +#if ACOS_TERMS < 1 + vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl); +#elif ACOS_TERMS == 1 
+        vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
+        y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ACOS_TERMS == 2
+        vfloat32m2_t y = cfm1o3;
+        y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ACOS_TERMS == 3
+        vfloat32m2_t y = cf1o5;
+        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
+        y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ACOS_TERMS == 4
+        vfloat32m2_t y = cfm1o7;
+        y = __riscv_vfmadd(y, xx, cf1o5, vl);
+        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
+        y = __riscv_vfmadd(y, xx, cf1, vl);
+#else
+#error "ACOS_TERMS > 4 not supported by volk_32f_acos_32f_rvv"
+#endif
+        y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl);
+        y = __riscv_vfadd_mu(
+            __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl);
+
+        vfloat32m2_t acosine;
+        acosine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl);
+        acosine = __riscv_vfadd_mu(RISCV_VMFLTZ(32m2, v, vl), acosine, acosine, cpi, vl);
+
+        __riscv_vse32(bVector, acosine, vl);
+    }
+}
+#endif /*LV_HAVE_RVV*/
+
 #endif /* INCLUDED_volk_32f_acos_32f_u_H */

diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h
index 093771639..1914c39ea 100644
--- a/kernels/volk/volk_32f_asin_32f.h
+++ b/kernels/volk/volk_32f_asin_32f.h
@@ -486,4 +486,70 @@ volk_32f_asin_32f_generic(float* bVector, const float* aVector, unsigned int num
 }
 #endif /* LV_HAVE_GENERIC */

+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_32f_asin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
+{
+    size_t vlmax = __riscv_vsetvlmax_e32m2();
+
+    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
+    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
+    const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax);
+    const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
+
+#if ASIN_TERMS == 2
+    const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax);
+#elif ASIN_TERMS == 3
+    const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax);
+#elif ASIN_TERMS == 4
+    const vfloat32m2_t cfm1o7 =
__riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax);
+#endif
+
+    size_t n = num_points;
+    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
+        vl = __riscv_vsetvl_e32m2(n);
+        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
+        vfloat32m2_t a =
+            __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfmsac(cf1, v, v, vl), vl), v, vl);
+        vfloat32m2_t z = __riscv_vfabs(a, vl);
+        vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl);
+        x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
+        x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
+        x = __riscv_vfdiv(cf1, x, vl);
+        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
+
+#if ASIN_TERMS < 1
+    vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
+#elif ASIN_TERMS == 1
+        vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
+        y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ASIN_TERMS == 2
+        vfloat32m2_t y = cfm1o3;
+        y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ASIN_TERMS == 3
+        vfloat32m2_t y = cf1o5;
+        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
+        y = __riscv_vfmadd(y, xx, cf1, vl);
+#elif ASIN_TERMS == 4
+        vfloat32m2_t y = cfm1o7;
+        y = __riscv_vfmadd(y, xx, cf1o5, vl);
+        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
+        y = __riscv_vfmadd(y, xx, cf1, vl);
+#else
+#error "ASIN_TERMS > 4 not supported by volk_32f_asin_32f_rvv"
+#endif
+        y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl);
+        y = __riscv_vfadd_mu(
+            __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl);
+
+        vfloat32m2_t asine;
+        asine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl);
+
+        __riscv_vse32(bVector, asine, vl);
+    }
+}
+#endif /*LV_HAVE_RVV*/
+
 #endif /* INCLUDED_volk_32f_asin_32f_u_H */

diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h
index dc5987cb8..300f46caf 100644
--- a/kernels/volk/volk_32f_atan_32f.h
+++ b/kernels/volk/volk_32f_atan_32f.h
@@ -293,4 +293,46 @@ volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
 }
 #endif /* LV_HAVE_GENERIC */

+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void +volk_32f_atan_32f_rvv(float* out, const float* in, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax); + const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax); + const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax); + const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax); + const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(in, vl); + vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(v, vl), cf1, vl); + vfloat32m2_t x = __riscv_vfdiv_mu(mswap, v, cf1, v, vl); + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + vfloat32m2_t p = c13; + p = __riscv_vfmadd(p, xx, c11, vl); + p = __riscv_vfmadd(p, xx, c9, vl); + p = __riscv_vfmadd(p, xx, c7, vl); + p = __riscv_vfmadd(p, xx, c5, vl); + p = __riscv_vfmadd(p, xx, c3, vl); + p = __riscv_vfmadd(p, xx, c1, vl); + p = __riscv_vfmul(p, x, vl); + + vfloat32m2_t t = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl); + p = __riscv_vmerge(p, t, mswap, vl); + + __riscv_vse32(out, p, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_atan_32f_u_H */ diff --git a/kernels/volk/volk_32f_binary_slicer_32i.h b/kernels/volk/volk_32f_binary_slicer_32i.h index 7606145ba..861ef478c 100644 --- a/kernels/volk/volk_32f_binary_slicer_32i.h +++ b/kernels/volk/volk_32f_binary_slicer_32i.h @@ -261,5 +261,21 @@ static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV 
+#include + +static inline void volk_32f_binary_slicer_32i_rvv(int* cVector, + const float* aVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)aVector, vl); + v = __riscv_vsrl(__riscv_vnot(v, vl), 31, vl); + __riscv_vse32((uint32_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_binary_slicer_32i_H */ diff --git a/kernels/volk/volk_32f_binary_slicer_8i.h b/kernels/volk/volk_32f_binary_slicer_8i.h index c6929db42..9623ae90f 100644 --- a/kernels/volk/volk_32f_binary_slicer_8i.h +++ b/kernels/volk/volk_32f_binary_slicer_8i.h @@ -500,5 +500,22 @@ static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_binary_slicer_8i_rvv(int8_t* cVector, + const float* aVector, + unsigned int num_points) +{ + size_t n = num_points; + vint8m2_t v0 = __riscv_vmv_v_x_i8m2(1, __riscv_vsetvlmax_e8m2()); + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + vint8m2_t vn = __riscv_vmerge(v0, 0, __riscv_vmflt(v, 0, vl), vl); + __riscv_vse8(cVector, vn, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */ diff --git a/kernels/volk/volk_32f_convert_64f.h b/kernels/volk/volk_32f_convert_64f.h index 93d1c6110..5e907d392 100644 --- a/kernels/volk/volk_32f_convert_64f.h +++ b/kernels/volk/volk_32f_convert_64f.h @@ -230,5 +230,20 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_convert_64f_rvv(double* outputVector, + const float* inputVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + 
vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t v = __riscv_vle32_v_f32m4(inputVector, vl); + __riscv_vse64(outputVector, __riscv_vfwcvt_f(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_convert_64f_a_H */ diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h index 37785df06..aa264c07d 100644 --- a/kernels/volk/volk_32f_cos_32f.h +++ b/kernels/volk/volk_32f_cos_32f.h @@ -995,5 +995,65 @@ volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_poi #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_cos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax); + const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax); + const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax); + const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax); + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax); + + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t s = __riscv_vfabs(v, vl); + vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl); + vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl); + + s = __riscv_vfnmsac(s, cPio4a, r, vl); + s = __riscv_vfnmsac(s, cPio4b, r, vl); + s = __riscv_vfnmsac(s, cPio4c, r, vl); + + s = __riscv_vfmul(s, 1 / 8.0f, vl); + s = 
__riscv_vfmul(s, s, vl); + vfloat32m2_t t = s; + s = __riscv_vfmsub(s, c5, c4, vl); + s = __riscv_vfmadd(s, t, c3, vl); + s = __riscv_vfmsub(s, t, c2, vl); + s = __riscv_vfmadd(s, t, cf1, vl); + s = __riscv_vfmul(s, t, vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl); + s = __riscv_vfmul(s, 1 / 2.0f, vl); + + vfloat32m2_t sine = + __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl); + vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl); + + vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl); + vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl); + + cosine = __riscv_vmerge(cosine, sine, m1, vl); + cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl); + + __riscv_vse32(bVector, cosine, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_cos_32f_u_H */ diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h index 13d21201f..85571dbc9 100644 --- a/kernels/volk/volk_32f_exp_32f.h +++ b/kernels/volk/volk_32f_exp_32f.h @@ -266,4 +266,58 @@ volk_32f_exp_32f_generic(float* bVector, const float* aVector, unsigned int num_ #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_exp_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t exp_hi = __riscv_vfmv_v_f_f32m2(88.376259f, vlmax); + const vfloat32m2_t exp_lo = __riscv_vfmv_v_f_f32m2(-88.376259f, vlmax); + const vfloat32m2_t log2EF = __riscv_vfmv_v_f_f32m2(1.442695f, vlmax); + const vfloat32m2_t exp_C1 = __riscv_vfmv_v_f_f32m2(-0.6933594f, vlmax); + const vfloat32m2_t exp_C2 = __riscv_vfmv_v_f_f32m2(0.000212194f, vlmax); + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vfloat32m2_t cf1o2 = __riscv_vfmv_v_f_f32m2(0.5f, vlmax); + + const vfloat32m2_t c0 = 
__riscv_vfmv_v_f_f32m2(1.9875691500e-4, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(1.3981999507e-3, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(8.3334519073e-3, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(4.1665795894e-2, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(1.6666665459e-1, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.0000001201e-1, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + v = __riscv_vfmin(v, exp_hi, vl); + v = __riscv_vfmax(v, exp_lo, vl); + vfloat32m2_t fx = __riscv_vfmadd(v, log2EF, cf1o2, vl); + + vfloat32m2_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl); + fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl); + v = __riscv_vfmacc(v, fx, exp_C1, vl); + v = __riscv_vfmacc(v, fx, exp_C2, vl); + vfloat32m2_t vv = __riscv_vfmul(v, v, vl); + + vfloat32m2_t y = c0; + y = __riscv_vfmadd(y, v, c1, vl); + y = __riscv_vfmadd(y, v, c2, vl); + y = __riscv_vfmadd(y, v, c3, vl); + y = __riscv_vfmadd(y, v, c4, vl); + y = __riscv_vfmadd(y, v, c5, vl); + y = __riscv_vfmadd(y, vv, v, vl); + y = __riscv_vfadd(y, cf1, vl); + + vfloat32m2_t pow2n = __riscv_vreinterpret_f32m2( + __riscv_vsll(__riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), 0x7f, vl), 23, vl)); + + __riscv_vse32(bVector, __riscv_vfmul(y, pow2n, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_exp_32f_u_H */ diff --git a/kernels/volk/volk_32f_expfast_32f.h b/kernels/volk/volk_32f_expfast_32f.h index 7dfbaacb0..3b65968af 100644 --- a/kernels/volk/volk_32f_expfast_32f.h +++ b/kernels/volk/volk_32f_expfast_32f.h @@ -301,4 +301,25 @@ static inline void volk_32f_expfast_32f_generic(float* bVector, } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_expfast_32f_rvv(float* bVector, const float* aVector, unsigned int 
num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m8(); + const vfloat32m8_t ca = __riscv_vfmv_v_f_f32m8(A / Mln2, vlmax); + const vfloat32m8_t cb = __riscv_vfmv_v_f_f32m8(B - C, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + v = __riscv_vfmadd(v, ca, cb, vl); + v = __riscv_vreinterpret_f32m8(__riscv_vfcvt_x(v, vl)); + __riscv_vse32(bVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_expfast_32f_u_H */ diff --git a/kernels/volk/volk_32f_index_max_16u.h b/kernels/volk/volk_32f_index_max_16u.h index 2aad087e1..3e7c0fb98 100644 --- a/kernels/volk/volk_32f_index_max_16u.h +++ b/kernels/volk/volk_32f_index_max_16u.h @@ -359,4 +359,32 @@ volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p #endif /*LV_HAVE_AVX*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_index_max_16u_rvv(uint16_t* target, const float* src0, uint32_t num_points) +{ + vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(-FLT_MAX, __riscv_vsetvlmax_e32m8()); + vuint16m4_t vmaxi = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4()); + vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4()); + size_t n = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl); + vbool4_t m = __riscv_vmfgt(v, vmax, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m8(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK8(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool4_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/ diff --git a/kernels/volk/volk_32f_index_max_32u.h b/kernels/volk/volk_32f_index_max_32u.h index 86dad0d19..0bf071fcb 100644 --- a/kernels/volk/volk_32f_index_max_32u.h +++ b/kernels/volk/volk_32f_index_max_32u.h @@ -542,4 +542,32 @@ volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p #endif /*LV_HAVE_SSE*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_index_max_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(-FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl); + vbool8_t m = __riscv_vmfgt(v, vmax, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(-FLT_MAX, 
1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_index_max_32u_u_H*/ diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h index 000ecafc4..5e1f0aa13 100644 --- a/kernels/volk/volk_32f_index_min_16u.h +++ b/kernels/volk/volk_32f_index_min_16u.h @@ -346,4 +346,32 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num #endif /*LV_HAVE_AVX*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32f_index_min_16u_rvv(uint16_t* target, const float* src0, uint32_t num_points) +{ + vfloat32m8_t vmin = __riscv_vfmv_v_f_f32m8(FLT_MAX, __riscv_vsetvlmax_e32m8()); + vuint16m4_t vmini = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4()); + vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4()); + size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl); + vbool4_t m = __riscv_vmflt(v, vmin, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m8(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK8(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool4_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_index_min_16u_u_H*/ diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h index 0c8bf8c0a..44e4c85d5 100644 --- a/kernels/volk/volk_32f_index_min_32u.h +++ b/kernels/volk/volk_32f_index_min_32u.h @@ -42,7 +42,7 @@ * * 
volk_32f_index_min_32u(out, in, N);
 *
- * printf("minimum is %1.2f at index %u\n", in[*out], *out);
+ * printf("minimum is %1.2f at index %u\n", in[*out], *out);
 *
 * volk_free(in);
 * volk_free(out);
@@ -508,4 +508,32 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num

 #endif /*LV_HAVE_SSE*/

+#ifdef LV_HAVE_RVV
+#include <float.h>
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_index_min_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points)
+{
+    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
+    vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
+    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
+    size_t n = num_points;
+    for (size_t vl; n > 0; n -= vl, src0 += vl) {
+        vl = __riscv_vsetvl_e32m4(n);
+        vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
+        vbool8_t m = __riscv_vmflt(v, vmin, vl);
+        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
+        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
+        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
+    }
+    size_t vl = __riscv_vsetvlmax_e32m4();
+    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
+                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
+                                                __riscv_vsetvlmax_e32m1()));
+    vbool8_t m = __riscv_vmfeq(vmin, min, vl);
+    *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
+}
+#endif /*LV_HAVE_RVV*/
+
 #endif /*INCLUDED_volk_32f_index_min_32u_u_H*/

diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h
index e91b6c7c9..b5a7c8f84 100644
--- a/kernels/volk/volk_32f_invsqrt_32f.h
+++ b/kernels/volk/volk_32f_invsqrt_32f.h
@@ -203,4 +203,19 @@ volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int nu
 }
 #endif /* LV_HAVE_AVX */

+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_32f_invsqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
+{
+    size_t n = num_points;
+    for (size_t vl; n > 0; n -= vl, aVector
+= vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + __riscv_vse32(cVector, __riscv_vfrsqrt7(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */ diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index 0443e56e4..fc1d744c2 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -718,5 +718,73 @@ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_ #endif /* LV_HAVE_AVX2 for unaligned */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + +#if LOG_POLY_DEGREE == 6 + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax); +#elif LOG_POLY_DEGREE == 5 + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax); +#elif LOG_POLY_DEGREE == 4 + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax); + const vfloat32m2_t c0 = 
__riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax); +#elif LOG_POLY_DEGREE == 3 + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax); +#else +#error +#endif + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1); + const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax); + const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + vfloat32m2_t a = __riscv_vfabs(v, vl); + vfloat32m2_t exp = __riscv_vfcvt_f( + __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl), + vl); + vfloat32m2_t frac = __riscv_vreinterpret_f32m2( + __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl)); + + vfloat32m2_t mant = c0; + mant = __riscv_vfmadd(mant, frac, c1, vl); + mant = __riscv_vfmadd(mant, frac, c2, vl); +#if LOG_POLY_DEGREE >= 4 + mant = __riscv_vfmadd(mant, frac, c3, vl); +#if LOG_POLY_DEGREE >= 5 + mant = __riscv_vfmadd(mant, frac, c4, vl); +#if LOG_POLY_DEGREE >= 6 + mant = __riscv_vfmadd(mant, frac, c5, vl); +#endif +#endif +#endif + exp = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl); + + __riscv_vse32(bVector, exp, vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32f_log2_32f_u_H */ diff --git a/kernels/volk/volk_32f_reciprocal_32f.h b/kernels/volk/volk_32f_reciprocal_32f.h index 37bd16a80..f44a9885c 100644 --- a/kernels/volk/volk_32f_reciprocal_32f.h +++ b/kernels/volk/volk_32f_reciprocal_32f.h @@ -198,4 +198,19 @@ volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_p } #endif /* LV_HAVE_AVX512F */ +#ifdef LV_HAVE_RVV +#include + 
+static inline void +volk_32f_reciprocal_32f_rvv(float* out, const float* in, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl); + __riscv_vse32(out, __riscv_vfrdiv(v, 1.0f, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h index a6eb37c2e..607bd6d88 100644 --- a/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h +++ b/kernels/volk/volk_32f_s32f_32f_fm_detect_32f.h @@ -335,4 +335,41 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_32f_fm_detect_32f_rvv(float* outputVector, + const float* inputVector, + const float bound, + float* saveValue, + unsigned int num_points) +{ + if (num_points < 1) + return; + + *outputVector = *inputVector - *saveValue; + if (*outputVector > bound) + *outputVector -= 2 * bound; + if (*outputVector < -bound) + *outputVector += 2 * bound; + ++inputVector; + ++outputVector; + + vfloat32m8_t v2bound = __riscv_vfmv_v_f_f32m8(bound * 2, __riscv_vsetvlmax_e32m8()); + + size_t n = num_points - 1; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(inputVector, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(inputVector - 1, vl); + vfloat32m8_t v = __riscv_vfsub(va, vb, vl); + v = __riscv_vfsub_mu(__riscv_vmfgt(v, bound, vl), v, v, v2bound, vl); + v = __riscv_vfadd_mu(__riscv_vmflt(v, -bound, vl), v, v, v2bound, vl); + __riscv_vse32(outputVector, v, vl); + } + + *saveValue = inputVector[-1]; +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_add_32f.h 
b/kernels/volk/volk_32f_s32f_add_32f.h index d7ae2aa16..e3301a7a7 100644 --- a/kernels/volk/volk_32f_s32f_add_32f.h +++ b/kernels/volk/volk_32f_s32f_add_32f.h @@ -258,4 +258,21 @@ static inline void volk_32f_s32f_add_32f_u_orc(float* cVector, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_add_32f_rvv(float* cVector, + const float* aVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl); + __riscv_vse32(cVector, __riscv_vfadd(v, scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */ diff --git a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h index 816f60928..368a987ab 100644 --- a/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h +++ b/kernels/volk/volk_32f_s32f_calc_spectral_noise_floor_32f.h @@ -52,6 +52,8 @@ #include #include +#include + #ifdef LV_HAVE_AVX #include @@ -458,4 +460,37 @@ volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude, *noiseFloorAmplitude = localNoiseFloorAmplitude; } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32f_s32f_calc_spectral_noise_floor_32f_rvv(float* noiseFloorAmplitude, + const float* realDataPoints, + const float spectralExclusionValue, + const unsigned int num_points) +{ + float sum; + volk_32f_accumulator_s32f_rvv(&sum, realDataPoints, num_points); + float meanAmplitude = sum / num_points + spectralExclusionValue; + + vfloat32m8_t vbin = __riscv_vfmv_v_f_f32m8(meanAmplitude, __riscv_vsetvlmax_e32m8()); + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8()); + size_t n = num_points, binCount = 0; + for (size_t vl; n > 0; n -= vl, realDataPoints += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t 
#ifdef LV_HAVE_RVV
/* Puppet wrapper: clamps the input to the symmetric range [min, -min]
 * by forwarding to the two-bound clamp kernel. */
static inline void volk_32f_s32f_clamppuppet_32f_rvv(float* out,
                                                     const float* in,
                                                     const float min,
                                                     unsigned int num_points)
{
    volk_32f_s32f_x2_clamp_32f_rvv(out, in, min, -min, num_points);
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* Multiply every float by `scalar` and narrow-convert to int16 (RVV, LMUL=8).
 * vfncvt_x rounds to integer and saturates on overflow per the RVV spec. */
static inline void volk_32f_s32f_convert_16i_rvv(int16_t* outputVector,
                                                 const float* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining); /* lanes for this pass */
        vfloat32m8_t vec = __riscv_vle32_v_f32m8(inputVector, vl);
        vec = __riscv_vfmul(vec, scalar, vl);
        __riscv_vse16(outputVector, __riscv_vfncvt_x(vec, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* Multiply every float by `scalar` and convert to int32 (RVV, LMUL=8). */
static inline void volk_32f_s32f_convert_32i_rvv(int32_t* outputVector,
                                                 const float* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t vec = __riscv_vle32_v_f32m8(inputVector, vl);
        vec = __riscv_vfmul(vec, scalar, vl);
        __riscv_vse32(outputVector, __riscv_vfcvt_x(vec, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* Multiply every float by `scalar` and narrow to int8 (RVV, LMUL=8).
 * Narrowing is done in two saturating steps: f32 -> i16 via vfncvt_x,
 * then i16 -> i8 via vnclip with a zero shift amount. */
static inline void volk_32f_s32f_convert_8i_rvv(int8_t* outputVector,
                                                const float* inputVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t vec = __riscv_vle32_v_f32m8(inputVector, vl);
        vint16m4_t half = __riscv_vfncvt_x(__riscv_vfmul(vec, scalar, vl), vl);
        __riscv_vse8(outputVector, __riscv_vnclip(half, 0, 0, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */
#ifdef LV_HAVE_RVV
/* Puppet wrapper: drives the scale/bias 8u conversion with a fixed 128.0 bias. */
static inline void volk_32f_s32f_convertpuppet_8u_rvv(uint8_t* output,
                                                      const float* input,
                                                      float scale,
                                                      unsigned int num_points)
{
    volk_32f_s32f_x2_convert_8u_rvv(output, input, scale, 128.0, num_points);
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
/* Puppet wrapper: drives the mod-range kernel with a window of width 3.131
 * ending at `bound` (mirrors the other SIMD puppets in this header). */
static inline void volk_32f_s32f_mod_rangepuppet_32f_rvv(float* output,
                                                         const float* input,
                                                         float bound,
                                                         unsigned int num_points)
{
    volk_32f_s32f_s32f_mod_range_32f_rvv(
        output, input, bound - 3.131f, bound, num_points);
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* cVector[i] = aVector[i] * scalar (RVV, LMUL=8). */
static inline void volk_32f_s32f_multiply_32f_rvv(float* cVector,
                                                  const float* aVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0; remaining -= vl, aVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t vec = __riscv_vle32_v_f32m8(aVector, vl);
        __riscv_vse32(cVector, __riscv_vfmul(vec, scalar, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* In-place normalization: divides the buffer by `scalar`, implemented as a
 * multiply by the (scalar-computed) reciprocal. */
static inline void
volk_32f_s32f_normalize_rvv(float* vecBuffer, const float scalar, unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0; remaining -= vl, vecBuffer += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t vec = __riscv_vle32_v_f32m8(vecBuffer, vl);
        __riscv_vse32(vecBuffer, __riscv_vfmul(vec, 1.0f / scalar, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* Wrap each input into [lower_bound, upper_bound] by adding or subtracting an
 * integer number of range widths (RVV, LMUL=4).
 * Out-of-range lanes compute how many widths they are outside the window
 * (truncating float->int round trip, plus one), then a masked fused
 * multiply-add applies +width or -width times that count; in-range lanes
 * pass through untouched via the mask. */
static inline void volk_32f_s32f_s32f_mod_range_32f_rvv(float* outputVector,
                                                        const float* inputVector,
                                                        const float lower_bound,
                                                        const float upper_bound,
                                                        unsigned int num_points)
{
    const float dist = upper_bound - lower_bound; /* width of the window */
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    /* Broadcast loop-invariant constants once. */
    vfloat32m4_t vdist = __riscv_vfmv_v_f_f32m4(dist, vlmax);
    vfloat32m4_t vmdist = __riscv_vfmv_v_f_f32m4(-dist, vlmax);
    vfloat32m4_t vupper = __riscv_vfmv_v_f_f32m4(upper_bound, vlmax);
    vfloat32m4_t vlower = __riscv_vfmv_v_f_f32m4(lower_bound, vlmax);
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, outputVector += vl, inputVector += vl) {
        vl = __riscv_vsetvl_e32m4(remaining);
        vfloat32m4_t v = __riscv_vle32_v_f32m4(inputVector, vl);
        vfloat32m4_t vlt = __riscv_vfsub(vlower, v, vl);  /* distance below window */
        vfloat32m4_t vgt = __riscv_vfsub(v, vupper, vl);  /* distance above window */
        vbool8_t mlt = __riscv_vmflt(v, vlower, vl);      /* lanes below lower_bound */
        /* Step direction: +dist for lanes below, -dist for lanes above. */
        vfloat32m4_t vmul = __riscv_vmerge(vmdist, vdist, mlt, vl);
        /* Number of whole widths to move: trunc(excess / dist) + 1. */
        vfloat32m4_t vcnt = __riscv_vfdiv(__riscv_vmerge(vgt, vlt, mlt, vl), vdist, vl);
        vcnt = __riscv_vfcvt_f(__riscv_vadd(__riscv_vfcvt_rtz_x(vcnt, vl), 1, vl), vl);
        vbool8_t mgt = __riscv_vmfgt(v, vupper, vl);      /* lanes above upper_bound */
        /* Apply the correction only where the lane is outside the window. */
        v = __riscv_vfmacc_mu(__riscv_vmor(mlt, mgt, vl), v, vcnt, vmul, vl);

        __riscv_vse32(outputVector, v, vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <math.h>
#include <volk/volk_rvv_intrinsics.h>

/* Standard deviation given a precomputed mean: accumulates the sum of squares
 * per lane, tree-reduces LMUL8 -> LMUL1, then uses Var[X] = E[X^2] - mean^2.
 * Writes 0 for an empty input to avoid a 0/0. */
static inline void volk_32f_s32f_stddev_32f_rvv(float* stddev,
                                                const float* inputBuffer,
                                                const float mean,
                                                unsigned int num_points)
{
    if (num_points == 0) {
        *stddev = 0;
        return;
    }
    vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputBuffer += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
        /* Tail-undisturbed accumulate so partial final vectors keep prior sums. */
        vsum = __riscv_vfmacc_tu(vsum, v, v, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum); /* fold LMUL8 down to LMUL1 */
    v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl);
    float sum = __riscv_vfmv_f(v);
    *stddev = sqrtf((sum / num_points) - (mean * mean));
}
#endif /* LV_HAVE_RVV */
b/kernels/volk/volk_32f_s32f_x2_clamp_32f.h index 19d51795f..2b194eaa7 100644 --- a/kernels/volk/volk_32f_s32f_x2_clamp_32f.h +++ b/kernels/volk/volk_32f_s32f_x2_clamp_32f.h @@ -187,4 +187,25 @@ static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out, } #endif /* LV_HAVE_SSE4_1 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_x2_clamp_32f_rvv(float* out, + const float* in, + const float min, + const float max, + unsigned int num_points) +{ + vfloat32m8_t vmin = __riscv_vfmv_v_f_f32m8(min, __riscv_vsetvlmax_e32m8()); + vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(max, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl); + v = __riscv_vfmin(__riscv_vfmax(v, vmin, vl), vmax, vl); + __riscv_vse32(out, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H */ diff --git a/kernels/volk/volk_32f_s32f_x2_convert_8u.h b/kernels/volk/volk_32f_s32f_x2_convert_8u.h index a52cdf283..1ad2b1ac0 100644 --- a/kernels/volk/volk_32f_s32f_x2_convert_8u.h +++ b/kernels/volk/volk_32f_s32f_x2_convert_8u.h @@ -612,5 +612,24 @@ static inline void volk_32f_s32f_x2_convert_8u_a_sse(uint8_t* outputVector, #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_s32f_x2_convert_8u_rvv(uint8_t* outputVector, + const float* inputVector, + const float scale, + const float bias, + unsigned int num_points) +{ + vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(bias, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl); + vuint16m4_t vi = __riscv_vfncvt_xu(__riscv_vfmadd_vf_f32m8(v, scale, vb, vl), vl); + __riscv_vse8(outputVector, __riscv_vnclipu(vi, 0, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* 
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* Computes sin(x) element-wise (RVV, LMUL=2).
 * Algorithm (mirrors the SSE/AVX variants of this kernel):
 *  1. Range-reduce |x| by multiples of pi/4, with the constant split into
 *     three parts (cPio4a/b/c) for extended precision.
 *  2. Evaluate a polynomial on the eighth-angle, then expand back with three
 *     s <- s*(4 - s) steps.
 *  3. Select sine vs. cosine per octant (mask m1) and fix the sign from the
 *     input sign bit XOR the octant (mask m2). */
static inline void
volk_32f_sin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    /* 4/pi and the three-way split of pi/4 for accurate reduction. */
    const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
    const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
    const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
    const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);

    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);

    /* Polynomial coefficients. */
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        vfloat32m2_t s = __riscv_vfabs(v, vl);
        /* Octant index; rounded so that q + (q & 1) is an even multiple. */
        vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
        vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);

        /* Subtract r * pi/4 in three extended-precision steps. */
        s = __riscv_vfnmsac(s, cPio4a, r, vl);
        s = __riscv_vfnmsac(s, cPio4b, r, vl);
        s = __riscv_vfnmsac(s, cPio4c, r, vl);

        /* Polynomial on the squared eighth-angle. */
        s = __riscv_vfmul(s, 1 / 8.0f, vl);
        s = __riscv_vfmul(s, s, vl);
        vfloat32m2_t t = s;
        s = __riscv_vfmsub(s, c5, c4, vl);
        s = __riscv_vfmadd(s, t, c3, vl);
        s = __riscv_vfmsub(s, t, c2, vl);
        s = __riscv_vfmadd(s, t, cf1, vl);
        s = __riscv_vfmul(s, t, vl);
        /* Three angle-doubling expansions back to the full reduced angle. */
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, 1 / 2.0f, vl);

        vfloat32m2_t sine =
            __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
        vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);

        /* m1: octants where cosine supplies the result; m2: result sign. */
        vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
        vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
                                     __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
                                     vl);

        sine = __riscv_vmerge(sine, cosine, m1, vl);
        sine = __riscv_vfneg_mu(m2, sine, sine, vl);

        __riscv_vse32(bVector, sine, vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* cVector[i] = sqrt(aVector[i]) (RVV, LMUL=8). */
static inline void
volk_32f_sqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0; remaining -= vl, aVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t vec = __riscv_vle32_v_f32m8(aVector, vl);
        __riscv_vse32(cVector, __riscv_vfsqrt(vec, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* One-pass mean and standard deviation (RVV, LMUL=4).
 * Each lane keeps a running sum and a running sum of squared deviations
 * (Youngs/Cramer style pairwise update, like the AVX variants in this file);
 * the lane-wise partials are then merged by halving the register group and
 * adding the squared cross-lane correction at each step. Inputs smaller than
 * one vector fall back to the generic kernel. */
static inline void volk_32f_stddev_and_mean_32f_x2_rvv(float* stddev,
                                                       float* mean,
                                                       const float* inputBuffer,
                                                       unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    if (num_points < vlmax) {
        /* Not enough data to fill one vector: use the scalar implementation. */
        volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
        return;
    }

    /* Seed the running sum with the first full vector. */
    vfloat32m4_t vsum = __riscv_vle32_v_f32m4(inputBuffer, vlmax);
    inputBuffer += vlmax;
    vfloat32m4_t vsumsq = __riscv_vfmv_v_f_f32m4(0, vlmax);
    size_t partLen = num_points / vlmax;

    for (size_t i = 1; i < partLen; ++i, inputBuffer += vlmax) {
        vfloat32m4_t v = __riscv_vle32_v_f32m4(inputBuffer, vlmax);
        vsum = __riscv_vfadd(vsum, v, vlmax);
        /* Incremental update of the squared-deviation accumulator:
         * ((i+1)*x - sum)^2 / (i*(i+1)). */
        vfloat32m4_t vaux = __riscv_vfmsub(v, i + 1.0f, vsum, vlmax);
        vaux = __riscv_vfmul(vaux, vaux, vlmax);
        vaux = __riscv_vfmul(vaux, 1.0f / (i * (i + 1.0f)), vlmax);
        vsumsq = __riscv_vfadd(vsumsq, vaux, vlmax);
    }

    /* Merge LMUL=4 halves into LMUL=2, correcting vsumsq with the squared
     * difference of the two partial sums. */
    size_t vl = __riscv_vsetvlmax_e32m2();
    vfloat32m2_t vsum2 =
        __riscv_vfadd(__riscv_vget_f32m2(vsum, 0), __riscv_vget_f32m2(vsum, 1), vl);
    vfloat32m2_t vfix2 =
        __riscv_vfsub(__riscv_vget_f32m2(vsum, 0), __riscv_vget_f32m2(vsum, 1), vl);
    vfix2 = __riscv_vfmul(vfix2, vfix2, vl);
    vfloat32m2_t vsumsq2 =
        __riscv_vfadd(__riscv_vget_f32m2(vsumsq, 0), __riscv_vget_f32m2(vsumsq, 1), vl);
    vsumsq2 = __riscv_vfmacc(vsumsq2, 0.5f / (num_points / vlmax), vfix2, vl);

    /* Same merge, LMUL=2 -> LMUL=1. */
    vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vsum1 =
        __riscv_vfadd(__riscv_vget_f32m1(vsum2, 0), __riscv_vget_f32m1(vsum2, 1), vl);
    vfloat32m1_t vfix1 =
        __riscv_vfsub(__riscv_vget_f32m1(vsum2, 0), __riscv_vget_f32m1(vsum2, 1), vl);
    vfix1 = __riscv_vfmul(vfix1, vfix1, vl);
    vfloat32m1_t vsumsq1 =
        __riscv_vfadd(__riscv_vget_f32m1(vsumsq2, 0), __riscv_vget_f32m1(vsumsq2, 1), vl);
    vsumsq1 = __riscv_vfmacc(vsumsq1, 0.5f / (num_points / vlmax * 2), vfix1, vl);

    /* Finish the reduction inside the single LMUL=1 register via slides. */
    for (size_t n = num_points / vlmax * 4, vl = vlmax >> 2; vl >>= 1; n *= 2) {
        vfloat32m1_t vsumdown = __riscv_vslidedown(vsum1, vl, vl);
        vfix1 = __riscv_vfsub(vsum1, vsumdown, vl);
        vfix1 = __riscv_vfmul(vfix1, vfix1, vl);
        vsum1 = __riscv_vfadd(vsum1, vsumdown, vl);
        vsumsq1 = __riscv_vfadd(vsumsq1, __riscv_vslidedown(vsumsq1, vl, vl), vl);
        vsumsq1 = __riscv_vfmacc(vsumsq1, 0.5f / n, vfix1, vl);
    }

    float sum = __riscv_vfmv_f(vsum1);
    float sumsq = __riscv_vfmv_f(vsumsq1);

    /* Scalar tail: fold in the elements that did not fill a vector. */
    for (size_t i = partLen * vlmax; i < num_points; ++i) {
        float in = *inputBuffer++;
        sum += in;
        sumsq = update_square_sum_1_val(sumsq, sum, i, in);
    }

    *stddev = sqrtf(sumsq / num_points);
    *mean = sum / num_points;
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* Computes tan(x) element-wise (RVV, LMUL=2): same reduction and
 * sine/cosine construction as volk_32f_sin_32f_rvv, then tan = sin/cos.
 * m1 swaps sine/cosine per octant; m2 and m3 fix the respective signs. */
static inline void
volk_32f_tan_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    /* 4/pi and the three-way split of pi/4 for accurate range reduction. */
    const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
    const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
    const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
    const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);

    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);

    /* Polynomial coefficients. */
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        vfloat32m2_t s = __riscv_vfabs(v, vl);
        vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
        vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);

        /* Subtract r * pi/4 in three extended-precision steps. */
        s = __riscv_vfnmsac(s, cPio4a, r, vl);
        s = __riscv_vfnmsac(s, cPio4b, r, vl);
        s = __riscv_vfnmsac(s, cPio4c, r, vl);

        /* Polynomial on the squared eighth-angle, then expand back. */
        s = __riscv_vfmul(s, 1 / 8.0f, vl);
        s = __riscv_vfmul(s, s, vl);
        vfloat32m2_t t = s;
        s = __riscv_vfmsub(s, c5, c4, vl);
        s = __riscv_vfmadd(s, t, c3, vl);
        s = __riscv_vfmsub(s, t, c2, vl);
        s = __riscv_vfmadd(s, t, cf1, vl);
        s = __riscv_vfmul(s, t, vl);
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
        s = __riscv_vfmul(s, 1 / 2.0f, vl);

        vfloat32m2_t sine =
            __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
        vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);

        vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
        vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
        vbool16_t m3 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
                                     __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
                                     vl);

        /* Keep the pre-merge sine so cosine can take the swapped value. */
        vfloat32m2_t sine0 = sine;
        sine = __riscv_vmerge(sine, cosine, m1, vl);
        sine = __riscv_vfneg_mu(m3, sine, sine, vl);

        cosine = __riscv_vmerge(cosine, sine0, m1, vl);
        cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);

        __riscv_vse32(bVector, __riscv_vfdiv(sine, cosine, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* Computes tanh(x) element-wise (RVV, LMUL=2) as the rational polynomial
 *   x*(x^2*(x^2 + 378) * ... ) / ( ... )  with the 135135/17325/... coefficient
 * set used by the other SIMD variants of this kernel: numerator and
 * denominator are evaluated by Horner's rule in x^2, then divided. */
static inline void
volk_32f_tanh_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(135135.0f, vlmax);
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(17325.0f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(378.0f, vlmax);
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(62370.0f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3150.0f, vlmax);
    const vfloat32m2_t c6 = __riscv_vfmv_v_f_f32m2(28.0f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t x = __riscv_vle32_v_f32m2(aVector, vl);
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        vfloat32m2_t a, b;
        /* Numerator: x * (((xx + c3)*xx + c2)*xx + c1). */
        a = __riscv_vfadd(xx, c3, vl);
        a = __riscv_vfmadd(a, xx, c2, vl);
        a = __riscv_vfmadd(a, xx, c1, vl);
        a = __riscv_vfmul(a, x, vl);
        /* Denominator: ((c6*xx + c5)*xx + c4)*xx + c1. */
        b = c6;
        b = __riscv_vfmadd(b, xx, c5, vl);
        b = __riscv_vfmadd(b, xx, c4, vl);
        b = __riscv_vfmadd(b, xx, c1, vl);
        __riscv_vse32(bVector, __riscv_vfdiv(a, b, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* cVector[i] = aVector[i] + bVector[i] (RVV, LMUL=8). */
static inline void volk_32f_x2_add_32f_rvv(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vfadd(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* cVector[i] = aVector[i] / bVector[i] (RVV, LMUL=8). */
static inline void volk_32f_x2_divide_32f_rvv(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vfdiv(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <math.h>

#include "volk_32f_x2_dot_prod_32f.h"

/* int16 dot product implemented on top of the float RVV dot product, with
 * the result rounded to the nearest integer via rintf. */
static inline void volk_32f_x2_dot_prod_16i_rvv(int16_t* result,
                                                const float* input,
                                                const float* taps,
                                                unsigned int num_points)
{
    float fresult = 0;
    volk_32f_x2_dot_prod_32f_rvv(&fresult, input, taps, num_points);
    *result = (int16_t)rintf(fresult);
}
#endif /* LV_HAVE_RVV */
-949,4 +949,28 @@ extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, unsigned int num_points); #endif /* LV_HAVE_NEONV7 */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32f_x2_dot_prod_32f_rvv(float* result, + const float* input, + const float* taps, + unsigned int num_points) +{ + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v0 = __riscv_vle32_v_f32m8(input, vl); + vfloat32m8_t v1 = __riscv_vle32_v_f32m8(taps, vl); + vsum = __riscv_vfmacc_tu(vsum, v0, v1, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum); + v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl); + *result = __riscv_vfmv_f(v); +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h index b49015433..62e30ad8d 100644 --- a/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h +++ b/kernels/volk/volk_32f_x2_fm_detectpuppet_32f.h @@ -79,4 +79,17 @@ static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector, outputVector, inputVector, bound, saveValue, num_points); } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +static inline void volk_32f_x2_fm_detectpuppet_32f_rvv(float* outputVector, + const float* inputVector, + float* saveValue, + unsigned int num_points) +{ + const float bound = 2.0f; + volk_32f_s32f_32f_fm_detect_32f_rvv( + outputVector, inputVector, bound, saveValue, num_points); +} +#endif /* LV_HAVE_RVV */ + #endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */ diff --git a/kernels/volk/volk_32f_x2_interleave_32fc.h b/kernels/volk/volk_32f_x2_interleave_32fc.h index 140fa9ff1..2190f1a46 100644 --- a/kernels/volk/volk_32f_x2_interleave_32fc.h +++ b/kernels/volk/volk_32f_x2_interleave_32fc.h @@ 
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* Interleave separate I and Q buffers into complex output (RVV, LMUL=4).
 * Each complex sample is built as a 64-bit word: vwaddu gives (vr + vi)
 * widened to 64 bits, and vwmaccu adds vi * 0xFFFFFFFF = (vi << 32) - vi,
 * leaving vr in the low half and vi in the high half. */
static inline void volk_32f_x2_interleave_32fc_rvv(lv_32fc_t* complexVector,
                                                   const float* iBuffer,
                                                   const float* qBuffer,
                                                   unsigned int num_points)
{
    uint64_t* out = (uint64_t*)complexVector;
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, out += vl, iBuffer += vl, qBuffer += vl) {
        vl = __riscv_vsetvl_e32m4(remaining);
        vuint32m4_t vr = __riscv_vle32_v_u32m4((const uint32_t*)iBuffer, vl);
        vuint32m4_t vi = __riscv_vle32_v_u32m4((const uint32_t*)qBuffer, vl);
        vuint64m8_t vc =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFFFFFF, vi, vl);
        __riscv_vse64(out, vc, vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

/* Same interleave using the segmented-store extension: a two-field
 * segment store writes I and Q alternately in one instruction. */
static inline void volk_32f_x2_interleave_32fc_rvvseg(lv_32fc_t* complexVector,
                                                      const float* iBuffer,
                                                      const float* qBuffer,
                                                      unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
        vl = __riscv_vsetvl_e32m4(remaining);
        vfloat32m4_t vr = __riscv_vle32_v_f32m4(iBuffer, vl);
        vfloat32m4_t vi = __riscv_vle32_v_f32m4(qBuffer, vl);
        __riscv_vsseg2e32((float*)complexVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
    }
}
#endif /* LV_HAVE_RVVSEG */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* cVector[i] = max(aVector[i], bVector[i]) (RVV, LMUL=8). */
static inline void volk_32f_x2_max_32f_rvv(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vfmax(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* cVector[i] = min(aVector[i], bVector[i]) (RVV, LMUL=8). */
static inline void volk_32f_x2_min_32f_rvv(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vfmin(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* cVector[i] = aVector[i] * bVector[i] (RVV, LMUL=8). */
static inline void volk_32f_x2_multiply_32f_rvv(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    size_t remaining = num_points;
    for (size_t vl; remaining > 0;
         remaining -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vfmul(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* cVector[i] = pow(aVector[i], bVector[i]) computed as exp(b * log(a))
 * (RVV, LMUL=1).
 * log(a): exponent extracted from the IEEE-754 bits, mantissa forced into
 * [1, 2) and approximated by a polynomial whose degree is selected at compile
 * time via POW_POLY_DEGREE.
 * exp(t): Cephes-style — clamp, split t = n*ln2 + r with round-to-nearest,
 * polynomial in r, then scale by 2^n built directly in the exponent bits.
 * NOTE(review): sign/NaN handling for negative bases follows the other SIMD
 * variants of this kernel (|a| is used for the exponent extraction). */
static inline void volk_32f_x2_pow_32f_rvv(float* cVector,
                                           const float* bVector,
                                           const float* aVector,
                                           unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m1();

    /* Mantissa-log polynomial coefficients by degree. */
#if POW_POLY_DEGREE == 6
    const vfloat32m1_t cl5 = __riscv_vfmv_v_f_f32m1(3.1157899f, vlmax);
    const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(-3.3241990f, vlmax);
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.5988452f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.2315303f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(3.1821337e-1f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-3.4436006e-2f, vlmax);
#elif POW_POLY_DEGREE == 5
    const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(2.8882704548164776201f, vlmax);
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(-2.52074962577807006663f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(1.48116647521213171641f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-0.465725644288844778798f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.0596515482674574969533f, vlmax);
#elif POW_POLY_DEGREE == 4
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.61761038894603480148f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.75647175389045657003f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(0.688243882994381274313f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-0.107254423828329604454f, vlmax);
#elif POW_POLY_DEGREE == 3
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(2.28330284476918490682f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-1.04913055217340124191f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.204446009836232697516f, vlmax);
#else
#error POW_POLY_DEGREE must be 3, 4, 5 or 6
#endif

    /* exp() constants. */
    const vfloat32m1_t exp_hi = __riscv_vfmv_v_f_f32m1(88.376259f, vlmax);
    const vfloat32m1_t exp_lo = __riscv_vfmv_v_f_f32m1(-88.376259f, vlmax);
    const vfloat32m1_t log2EF = __riscv_vfmv_v_f_f32m1(1.442695f, vlmax);
    const vfloat32m1_t exp_C1 = __riscv_vfmv_v_f_f32m1(-0.6933594f, vlmax);
    const vfloat32m1_t exp_C2 = __riscv_vfmv_v_f_f32m1(0.000212194f, vlmax);
    const vfloat32m1_t cf1 = __riscv_vfmv_v_f_f32m1(1.0f, vlmax);
    const vfloat32m1_t cf1o2 = __riscv_vfmv_v_f_f32m1(0.5f, vlmax);
    const vfloat32m1_t ln2 = __riscv_vfmv_v_f_f32m1(0.6931471805f, vlmax);

    const vfloat32m1_t ce0 = __riscv_vfmv_v_f_f32m1(1.9875691500e-4, vlmax);
    const vfloat32m1_t ce1 = __riscv_vfmv_v_f_f32m1(1.3981999507e-3, vlmax);
    const vfloat32m1_t ce2 = __riscv_vfmv_v_f_f32m1(8.3334519073e-3, vlmax);
    const vfloat32m1_t ce3 = __riscv_vfmv_v_f_f32m1(4.1665795894e-2, vlmax);
    const vfloat32m1_t ce4 = __riscv_vfmv_v_f_f32m1(1.6666665459e-1, vlmax);
    const vfloat32m1_t ce5 = __riscv_vfmv_v_f_f32m1(5.0000001201e-1, vlmax);

    /* Bit masks: m1 = bits of 1.0f, m2 = mantissa mask, c127 = exponent bias. */
    const vint32m1_t m1 = __riscv_vreinterpret_i32m1(cf1);
    const vint32m1_t m2 = __riscv_vmv_v_x_i32m1(0x7FFFFF, vlmax);
    const vint32m1_t c127 = __riscv_vmv_v_x_i32m1(127, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m1(n);
        vfloat32m1_t va = __riscv_vle32_v_f32m1(aVector, vl);
        vfloat32m1_t log;

        { /* log(a) */
            vfloat32m1_t a = __riscv_vfabs(va, vl);
            /* Unbiased exponent, as a float. */
            vfloat32m1_t exp = __riscv_vfcvt_f(
                __riscv_vsub(
                    __riscv_vsra(__riscv_vreinterpret_i32m1(a), 23, vl), c127, vl),
                vl);
            /* Mantissa forced into [1, 2) by splicing in the exponent of 1.0f. */
            vfloat32m1_t frac = __riscv_vreinterpret_f32m1(__riscv_vor(
                __riscv_vand(__riscv_vreinterpret_i32m1(va), m2, vl), m1, vl));

            /* Horner evaluation of the mantissa polynomial. */
            vfloat32m1_t mant = cl0;
            mant = __riscv_vfmadd(mant, frac, cl1, vl);
            mant = __riscv_vfmadd(mant, frac, cl2, vl);
#if POW_POLY_DEGREE >= 4
            mant = __riscv_vfmadd(mant, frac, cl3, vl);
#if POW_POLY_DEGREE >= 5
            mant = __riscv_vfmadd(mant, frac, cl4, vl);
#if POW_POLY_DEGREE >= 6
            mant = __riscv_vfmadd(mant, frac, cl5, vl);
#endif
#endif
#endif
            /* log2(a) = exponent + mant*(frac - 1); scale by ln2 for ln. */
            log = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
            log = __riscv_vfmul(log, ln2, vl);
        }

        vfloat32m1_t vb = __riscv_vle32_v_f32m1(bVector, vl);
        vb = __riscv_vfmul(vb, log, vl); /* b*log(a) */
        vfloat32m1_t exp;

        { /* exp(b*log(a)) */
            vb = __riscv_vfmin(vb, exp_hi, vl);
            vb = __riscv_vfmax(vb, exp_lo, vl);
            /* fx = round(vb * log2(e)) via the +0.5-then-truncate trick. */
            vfloat32m1_t fx = __riscv_vfmadd(vb, log2EF, cf1o2, vl);

            vfloat32m1_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
            /* Correct lanes where truncation overshot. */
            fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
            /* Remove fx*ln2 from vb in two extended-precision steps. */
            vb = __riscv_vfmacc(vb, exp_C1, fx, vl);
            vb = __riscv_vfmacc(vb, exp_C2, fx, vl);
            vfloat32m1_t vv = __riscv_vfmul(vb, vb, vl);

            /* Polynomial in the reduced argument. */
            vfloat32m1_t y = ce0;
            y = __riscv_vfmadd(y, vb, ce1, vl);
            y = __riscv_vfmadd(y, vb, ce2, vl);
            y = __riscv_vfmadd(y, vb, ce3, vl);
            y = __riscv_vfmadd(y, vb, ce4, vl);
            y = __riscv_vfmadd(y, vb, ce5, vl);
            y = __riscv_vfmadd(y, vv, vb, vl);
            y = __riscv_vfadd(y, cf1, vl);

            /* 2^fx assembled directly in the exponent field. */
            vfloat32m1_t pow2n = __riscv_vreinterpret_f32m1(__riscv_vsll(
                __riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), c127, vl), 23, vl));

            exp = __riscv_vfmul(y, pow2n, vl);
        }

        __riscv_vse32(cVector, exp, vl);
    }
}

#endif /* LV_HAVE_RVV */
+ const float* aVector, + unsigned int num_points) +{ + float* aVectorPos = make_positive(aVector, num_points); + volk_32f_x2_pow_32f_rvv(cVector, bVector, aVectorPos, num_points); + volk_free(aVectorPos); +} +#endif /* LV_HAVE_RVV */ + #endif /* INCLUDED_volk_32f_x2_powpuppet_32f_H */ diff --git a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h index 2ddfb0fd5..9a78a01a7 100644 --- a/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h +++ b/kernels/volk/volk_32f_x2_s32f_interleave_16ic.h @@ -326,5 +326,51 @@ static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVec } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_s32f_interleave_16ic_rvv(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) +{ + uint32_t* out = (uint32_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl); + vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl); + vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl); + vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl); + vuint16m4_t vr = __riscv_vreinterpret_u16m4(vri); + vuint16m4_t vi = __riscv_vreinterpret_u16m4(vii); + vuint32m8_t vc = __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFF, vi, vl); + __riscv_vse32(out, vc, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32f_x2_s32f_interleave_16ic_rvvseg(lv_16sc_t* complexVector, + const float* iBuffer, + const float* qBuffer, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl); + 
vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl); + vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl); + vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl); + __riscv_vsseg2e16( + (int16_t*)complexVector, __riscv_vcreate_v_i16m4x2(vri, vii), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */ diff --git a/kernels/volk/volk_32f_x2_subtract_32f.h b/kernels/volk/volk_32f_x2_subtract_32f.h index 631b72f84..e3d563fc1 100644 --- a/kernels/volk/volk_32f_x2_subtract_32f.h +++ b/kernels/volk/volk_32f_x2_subtract_32f.h @@ -272,4 +272,22 @@ static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32f_x2_subtract_32f_rvv(float* cVector, + const float* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vfsub(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */ diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index 6afd262a2..53a8a1bf3 100644 --- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -654,4 +654,45 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, } #endif // LV_HAVE_AVX +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32f_x3_sum_of_poly_32f_rvv(float* target, + float* src0, + float* center_point_array, + float* cutoff, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t vsum = __riscv_vfmv_v_f_f32m4(0, vlmax); + float mul1 = center_point_array[0]; // scalar to avoid register 
spills + float mul2 = center_point_array[1]; + vfloat32m4_t vmul3 = __riscv_vfmv_v_f_f32m4(center_point_array[2], vlmax); + vfloat32m4_t vmul4 = __riscv_vfmv_v_f_f32m4(center_point_array[3], vlmax); + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(*cutoff, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl); + vfloat32m4_t v1 = __riscv_vfmax(v, vmax, vl); + vfloat32m4_t v2 = __riscv_vfmul(v1, v1, vl); + vfloat32m4_t v3 = __riscv_vfmul(v1, v2, vl); + vfloat32m4_t v4 = __riscv_vfmul(v2, v2, vl); + v2 = __riscv_vfmul(v2, mul2, vl); + v4 = __riscv_vfmul(v4, vmul4, vl); + v1 = __riscv_vfmadd(v1, mul1, v2, vl); + v3 = __riscv_vfmadd(v3, vmul3, v4, vl); + v1 = __riscv_vfadd(v1, v3, vl); + vsum = __riscv_vfadd_tu(vsum, vsum, v1, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t v = RISCV_SHRINK4(vfadd, f, 32, vsum); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + float sum = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl)); + *target = sum + num_points * center_point_array[4]; +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/ diff --git a/kernels/volk/volk_32fc_32f_add_32fc.h b/kernels/volk/volk_32fc_32f_add_32fc.h index b820ed5dc..24eff2b44 100644 --- a/kernels/volk/volk_32fc_32f_add_32fc.h +++ b/kernels/volk/volk_32fc_32f_add_32fc.h @@ -230,5 +230,24 @@ static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_32f_add_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, cVector += vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m8_t vc = __riscv_vle32_v_f32m8((const float*)aVector, vl * 2); + vuint32m4_t v = __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl); + vfloat32m8_t vf 
= __riscv_vreinterpret_f32m8( + __riscv_vreinterpret_u32m8(__riscv_vzext_vf2_u64m8(v, vl))); + __riscv_vse32((float*)cVector, __riscv_vfadd(vc, vf, vl * 2), vl * 2); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32fc_32f_add_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h index 363bf6577..472d405a5 100644 --- a/kernels/volk/volk_32fc_32f_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_32f_dot_prod_32fc.h @@ -743,5 +743,63 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result, #endif /*LV_HAVE_SSE*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_32f_dot_prod_32fc_rvv(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)input, vl); + vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr; + vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl)); + vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl)); + vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl); + vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_32f_dot_prod_32fc_rvvseg(lv_32fc_t* result, + const lv_32fc_t* input, + const float* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, 
__riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr; + vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl); + vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/ diff --git a/kernels/volk/volk_32fc_32f_multiply_32fc.h b/kernels/volk/volk_32fc_32f_multiply_32fc.h index 76ed1af76..b731414cc 100644 --- a/kernels/volk/volk_32fc_32f_multiply_32fc.h +++ b/kernels/volk/volk_32fc_32f_multiply_32fc.h @@ -224,5 +224,24 @@ static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_32f_multiply_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const float* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, cVector += vl, aVector += vl, bVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m8_t vc = __riscv_vle32_v_f32m8((const float*)aVector, vl * 2); + vuint32m4_t v = __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl); + vfloat32m8_t vf = __riscv_vreinterpret_f32m8(__riscv_vreinterpret_u32m8( + __riscv_vwmaccu(__riscv_vwaddu_vv(v, v, vl), 0xFFFFFFFF, v, vl))); + __riscv_vse32((float*)cVector, __riscv_vfmul(vc, vf, vl * 2), vl * 2); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* 
INCLUDED_volk_32fc_32f_multiply_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_accumulator_s32fc.h b/kernels/volk/volk_32fc_accumulator_s32fc.h index d7267ea64..72266bd53 100644 --- a/kernels/volk/volk_32fc_accumulator_s32fc.h +++ b/kernels/volk/volk_32fc_accumulator_s32fc.h @@ -276,4 +276,33 @@ static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_accumulator_s32fc_rvv(lv_32fc_t* result, + const lv_32fc_t* inputBuffer, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m8(); + vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, vlmax); + const float* in = (const float*)inputBuffer; + size_t n = num_points * 2; + for (size_t vl; n > 0; n -= vl, in += vl) { + vl = __riscv_vsetvl_e32m8(n < vlmax ? n : vlmax); /* force exact vl */ + vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl); + vsum = __riscv_vfadd_tu(vsum, vsum, v, vl); + } + vuint64m8_t vsumu = __riscv_vreinterpret_u64m8(__riscv_vreinterpret_u32m8(vsum)); + vfloat32m4_t vsum1 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 0, vlmax)); + vfloat32m4_t vsum2 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 32, vlmax)); + vlmax = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsum1); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsum2); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vlmax); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vlmax)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vlmax))); +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */ diff --git a/kernels/volk/volk_32fc_conjugate_32fc.h b/kernels/volk/volk_32fc_conjugate_32fc.h index aa1134abd..2edff1191 100644 --- a/kernels/volk/volk_32fc_conjugate_32fc.h +++ b/kernels/volk/volk_32fc_conjugate_32fc.h @@ -260,4 +260,21 @@ static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + 
+static inline void volk_32fc_conjugate_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + unsigned int num_points) +{ + size_t n = num_points; + vuint64m8_t m = __riscv_vmv_v_x_u64m8(1ull << 63, __riscv_vsetvlmax_e64m8()); + for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint64m8_t v = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl); + __riscv_vse64((uint64_t*)cVector, __riscv_vxor(v, m, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_convert_16ic.h b/kernels/volk/volk_32fc_convert_16ic.h index a38cce64a..55768ab03 100644 --- a/kernels/volk/volk_32fc_convert_16ic.h +++ b/kernels/volk/volk_32fc_convert_16ic.h @@ -416,4 +416,23 @@ static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, } } #endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_convert_16ic_rvv(lv_16sc_t* outputVector, + const lv_32fc_t* inputVector, + unsigned int num_points) +{ + int16_t* out = (int16_t*)outputVector; + float* in = (float*)inputVector; + size_t n = num_points * 2; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl); + __riscv_vse16(out, __riscv_vfncvt_x(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_convert_16ic_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_32f_x2.h b/kernels/volk/volk_32fc_deinterleave_32f_x2.h index f269d6616..569942fe0 100644 --- a/kernels/volk/volk_32fc_deinterleave_32f_x2.h +++ b/kernels/volk/volk_32fc_deinterleave_32f_x2.h @@ -254,4 +254,46 @@ static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer, } } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_32f_x2_rvv(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = 
num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vuint32m4_t vr = __riscv_vnsrl(vc, 0, vl); + vuint32m4_t vi = __riscv_vnsrl(vc, 32, vl); + __riscv_vse32((uint32_t*)iBuffer, vr, vl); + __riscv_vse32((uint32_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_deinterleave_32f_x2_rvvseg(float* iBuffer, + float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint32m4x2_t vc = + __riscv_vlseg2e32_v_u32m4x2((const uint32_t*)complexVector, vl); + vuint32m4_t vr = __riscv_vget_u32m4(vc, 0); + vuint32m4_t vi = __riscv_vget_u32m4(vc, 1); + __riscv_vse32((uint32_t*)iBuffer, vr, vl); + __riscv_vse32((uint32_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_64f_x2.h b/kernels/volk/volk_32fc_deinterleave_64f_x2.h index 1af5098f7..6599780bc 100644 --- a/kernels/volk/volk_32fc_deinterleave_64f_x2.h +++ b/kernels/volk/volk_32fc_deinterleave_64f_x2.h @@ -314,4 +314,44 @@ static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer, } #endif /* LV_HAVE_NEONV8 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_64f_x2_rvv(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = 
__riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + __riscv_vse64(iBuffer, __riscv_vfwcvt_f(vr, vl), vl); + __riscv_vse64(qBuffer, __riscv_vfwcvt_f(vi, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_deinterleave_64f_x2_rvvseg(double* iBuffer, + double* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + __riscv_vse64(iBuffer, __riscv_vfwcvt_f(vr, vl), vl); + __riscv_vse64(qBuffer, __riscv_vfwcvt_f(vi, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_imag_32f.h b/kernels/volk/volk_32fc_deinterleave_imag_32f.h index 9e330d33c..bb54411bd 100644 --- a/kernels/volk/volk_32fc_deinterleave_imag_32f.h +++ b/kernels/volk/volk_32fc_deinterleave_imag_32f.h @@ -229,4 +229,22 @@ static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer, } } #endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_imag_32f_rvv(float* qBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + const uint64_t* in = (const uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8(in, vl); + __riscv_vse32((uint32_t*)qBuffer, __riscv_vnsrl(vc, 32, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_real_32f.h b/kernels/volk/volk_32fc_deinterleave_real_32f.h index 6fc0679dc..f75cdd034 100644 --- 
a/kernels/volk/volk_32fc_deinterleave_real_32f.h +++ b/kernels/volk/volk_32fc_deinterleave_real_32f.h @@ -234,4 +234,21 @@ static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_real_32f_rvv(float* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + const uint64_t* in = (const uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8(in, vl); + __riscv_vse32((uint32_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_deinterleave_real_32f_u_H */ diff --git a/kernels/volk/volk_32fc_deinterleave_real_64f.h b/kernels/volk/volk_32fc_deinterleave_real_64f.h index 31d8f3ecc..5c6b0c959 100644 --- a/kernels/volk/volk_32fc_deinterleave_real_64f.h +++ b/kernels/volk/volk_32fc_deinterleave_real_64f.h @@ -240,4 +240,21 @@ static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_deinterleave_real_64f_rvv(double* iBuffer, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + const uint64_t* in = (const uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint32m4_t vi = __riscv_vnsrl(__riscv_vle64_v_u64m8(in, vl), 0, vl); + __riscv_vse64(iBuffer, __riscv_vfwcvt_f(__riscv_vreinterpret_f32m4(vi), vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_deinterleave_real_64f_u_H */ diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h index 28b517668..781876d10 100644 --- a/kernels/volk/volk_32fc_index_max_16u.h +++ b/kernels/volk/volk_32fc_index_max_16u.h @@ -321,7 +321,7 @@ 
volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ uint32_t i = 0; - for (; i> 3; ++i) { + for (; i < (num_bytes >> 3); ++i) { sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); @@ -466,4 +466,65 @@ static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target, #endif /*LV_HAVE_AVX2*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32fc_index_max_16u_rvv(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); + vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2()); + size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmflt(vmax, v, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(0, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void +volk_32fc_index_max_16u_rvvseg(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vuint16m2_t vmaxi = 
__riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); + vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2()); + size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmflt(vmax, v, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(0, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/ diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h index fafff48c2..993187ca5 100644 --- a/kernels/volk/volk_32fc_index_max_32u.h +++ b/kernels/volk/volk_32fc_index_max_32u.h @@ -307,7 +307,7 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ uint32_t i = 0; - for (; i> 3; ++i) { + for (; i < (num_bytes >> 3); ++i) { sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); @@ -509,4 +509,65 @@ volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi #endif /*LV_HAVE_NEON*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void +volk_32fc_index_max_32u_rvv(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, 
__riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmflt(vmax, v, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(0, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void +volk_32fc_index_max_32u_rvvseg(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +{ + vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, src0 += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmflt(vmax, v, vl); + vmax = __riscv_vfmax_tu(vmax, vmax, v, vl); + vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = 
__riscv_vsetvlmax_e32m4(); + float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax), + __riscv_vfmv_v_f_f32m1(0, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmax, max, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/ diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h index 6cf6d8441..706db915b 100644 --- a/kernels/volk/volk_32fc_index_min_16u.h +++ b/kernels/volk/volk_32fc_index_min_16u.h @@ -462,4 +462,67 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, #endif /*LV_HAVE_AVX2*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_index_min_16u_rvv(uint16_t* target, + const lv_32fc_t* source, + uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); + vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2()); + size_t n = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, source += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmfgt(vmin, v, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_index_min_16u_rvvseg(uint16_t* target, + const lv_32fc_t* source, + uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); + vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2()); + size_t n = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; + for (size_t vl; n > 0; n -= vl, source += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmfgt(vmin, v, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/ diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h index 5e409b99e..807a3bb51 100644 --- a/kernels/volk/volk_32fc_index_min_32u.h +++ b/kernels/volk/volk_32fc_index_min_32u.h @@ -504,4 +504,67 @@ static inline void volk_32fc_index_min_32u_neon(uint32_t* target, #endif /*LV_HAVE_NEON*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_index_min_32u_rvv(uint32_t* target, + const lv_32fc_t* source, + uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, source += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t 
v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmfgt(vmin, v, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_index_min_32u_rvvseg(uint32_t* target, + const lv_32fc_t* source, + uint32_t num_points) +{ + vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); + vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, source += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl); + vbool8_t m = __riscv_vmfgt(vmin, v, vl); + vmin = __riscv_vfmin_tu(vmin, vmin, v, vl); + vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl); + vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4()); + } + size_t vl = __riscv_vsetvlmax_e32m4(); + float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin), + __riscv_vfmv_v_f_f32m1(FLT_MAX, 1), + __riscv_vsetvlmax_e32m1())); + vbool8_t m = __riscv_vmfeq(vmin, min, vl); + *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl)); +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/ diff --git 
a/kernels/volk/volk_32fc_magnitude_32f.h b/kernels/volk/volk_32fc_magnitude_32f.h index eca00e246..7b4e44a5f 100644 --- a/kernels/volk/volk_32fc_magnitude_32f.h +++ b/kernels/volk/volk_32fc_magnitude_32f.h @@ -420,5 +420,42 @@ static inline void volk_32fc_magnitude_32f_neon_fancy_sweet( } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_magnitude_32f_rvv(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_magnitude_32f_rvvseg(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/kernels/volk/volk_32fc_magnitude_squared_32f.h b/kernels/volk/volk_32fc_magnitude_squared_32f.h index e7b11ae96..24fa3a9a0 100644 --- a/kernels/volk/volk_32fc_magnitude_squared_32f.h +++ b/kernels/volk/volk_32fc_magnitude_squared_32f.h @@ -350,5 +350,42 @@ 
static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_magnitude_squared_32f_rvv(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(magnitudeVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_magnitude_squared_32f_rvvseg(float* magnitudeVector, + const lv_32fc_t* complexVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(magnitudeVector, v, vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 759db24cc..7d98b7c2b 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -344,4 +344,113 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, } #endif /* LV_HAVE_AVX2 for unaligned */ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, + const lv_32fc_t* inputVector, + 
const float normalizeFactor, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax); + const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax); + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax); + const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax); + const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax); + const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax); + const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t v = __riscv_vle64_v_u64m4((const uint64_t*)inputVector, vl); + vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl)); + vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl)); + vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl); + vfloat32m2_t x = __riscv_vfdiv( + __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl); + vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl); + x = __riscv_vreinterpret_f32m2( + __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl)); + + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + vfloat32m2_t p = c13; + p = __riscv_vfmadd(p, xx, c11, vl); + p = __riscv_vfmadd(p, xx, c9, vl); + p = __riscv_vfmadd(p, xx, c7, vl); + p = __riscv_vfmadd(p, xx, c5, vl); + p = __riscv_vfmadd(p, xx, c3, vl); + p = __riscv_vfmadd(p, xx, c1, vl); + p = __riscv_vfmul(p, x, vl); + + x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl); + p = __riscv_vmerge(p, x, mswap, vl); + p = 
__riscv_vfadd_mu( + RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl); + + __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector, + const lv_32fc_t* inputVector, + const float normalizeFactor, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + + const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax); + const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax); + const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax); + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax); + const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax); + const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax); + const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax); + const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((const float*)inputVector, vl); + vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1); + vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl); + vfloat32m2_t x = __riscv_vfdiv( + __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl); + vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl); + x = __riscv_vreinterpret_f32m2( + __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl)); + + vfloat32m2_t xx = __riscv_vfmul(x, x, vl); + vfloat32m2_t p = c13; + p = __riscv_vfmadd(p, xx, c11, vl); + p = __riscv_vfmadd(p, xx, c9, vl); + p = 
__riscv_vfmadd(p, xx, c7, vl); + p = __riscv_vfmadd(p, xx, c5, vl); + p = __riscv_vfmadd(p, xx, c3, vl); + p = __riscv_vfmadd(p, xx, c1, vl); + p = __riscv_vfmul(p, x, vl); + + x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl); + p = __riscv_vmerge(p, x, mswap, vl); + p = __riscv_vfadd_mu( + RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl); + + __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h index c4bfc28e2..51840e3b5 100644 --- a/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h +++ b/kernels/volk/volk_32fc_s32f_deinterleave_real_16i.h @@ -253,4 +253,24 @@ volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer, #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32fc_s32f_deinterleave_real_16i_rvv(int16_t* iBuffer, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + const uint64_t* in = (const uint64_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint32m4_t vi = __riscv_vnsrl(__riscv_vle64_v_u64m8(in, vl), 0, vl); + vfloat32m4_t vif = __riscv_vfmul(__riscv_vreinterpret_f32m4(vi), scalar, vl); + __riscv_vse16(iBuffer, __riscv_vncvt_x(__riscv_vfcvt_x(vif, vl), vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_magnitude_16i.h b/kernels/volk/volk_32fc_s32f_magnitude_16i.h index 21e12e2d2..f699ed727 100644 --- a/kernels/volk/volk_32fc_s32f_magnitude_16i.h +++ b/kernels/volk/volk_32fc_s32f_magnitude_16i.h @@ -302,4 +302,46 @@ static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static 
inline void volk_32fc_s32f_magnitude_16i_rvv(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl); + vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl)); + vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl)); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl); + __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_s32f_magnitude_16i_rvvseg(int16_t* magnitudeVector, + const lv_32fc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl); + vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0); + vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl); + __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */ diff --git a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h index be9aa88a4..f676758eb 100644 --- a/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h +++ b/kernels/volk/volk_32fc_s32f_power_spectrum_32f.h @@ -142,4 +142,167 @@ volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput, #endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_RVV +#include + +static inline void 
volk_32fc_s32f_power_spectrum_32f_rvv(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + +#if LOG_POLY_DEGREE == 6 + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax); +#elif LOG_POLY_DEGREE == 5 + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax); +#elif LOG_POLY_DEGREE == 4 + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax); +#elif LOG_POLY_DEGREE == 3 + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax); +#else +#error +#endif + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1); + const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax); + const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, 
vlmax); + + const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexFFTInput += vl, logPowerOutput += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)complexFFTInput, vl); + vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl)); + vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl)); + vfloat32m2_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + v = __riscv_vfmul(v, normFactSq, vl); + + vfloat32m2_t a = __riscv_vfabs(v, vl); + vfloat32m2_t exp = __riscv_vfcvt_f( + __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl), + vl); + vfloat32m2_t frac = __riscv_vreinterpret_f32m2( + __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl)); + + vfloat32m2_t mant = c0; + mant = __riscv_vfmadd(mant, frac, c1, vl); + mant = __riscv_vfmadd(mant, frac, c2, vl); +#if LOG_POLY_DEGREE >= 4 + mant = __riscv_vfmadd(mant, frac, c3, vl); +#if LOG_POLY_DEGREE >= 5 + mant = __riscv_vfmadd(mant, frac, c4, vl); +#if LOG_POLY_DEGREE >= 6 + mant = __riscv_vfmadd(mant, frac, c5, vl); +#endif +#endif +#endif + v = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl); + v = __riscv_vfmul(v, volk_log2to10factor, vl); + + __riscv_vse32(logPowerOutput, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_32fc_s32f_power_spectrum_32f_rvvseg(float* logPowerOutput, + const lv_32fc_t* complexFFTInput, + const float normalizationFactor, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + +#if LOG_POLY_DEGREE == 6 + const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax); + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax); + const 
vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax); +#elif LOG_POLY_DEGREE == 5 + const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax); + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax); +#elif LOG_POLY_DEGREE == 4 + const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax); + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax); +#elif LOG_POLY_DEGREE == 3 + const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax); + const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax); + const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax); +#else +#error +#endif + + const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax); + const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1); + const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax); + const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax); + + const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, complexFFTInput += vl, logPowerOutput += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2x2_t vc = + __riscv_vlseg2e32_v_f32m2x2((const float*)complexFFTInput, vl); + vfloat32m2_t vr = __riscv_vget_f32m2(vc, 0); + vfloat32m2_t vi = __riscv_vget_f32m2(vc, 1); + vfloat32m2_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + v = __riscv_vfmul(v, normFactSq, 
vl); + + vfloat32m2_t a = __riscv_vfabs(v, vl); + vfloat32m2_t exp = __riscv_vfcvt_f( + __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl), + vl); + vfloat32m2_t frac = __riscv_vreinterpret_f32m2( + __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl)); + + vfloat32m2_t mant = c0; + mant = __riscv_vfmadd(mant, frac, c1, vl); + mant = __riscv_vfmadd(mant, frac, c2, vl); +#if LOG_POLY_DEGREE >= 4 + mant = __riscv_vfmadd(mant, frac, c3, vl); +#if LOG_POLY_DEGREE >= 5 + mant = __riscv_vfmadd(mant, frac, c4, vl); +#if LOG_POLY_DEGREE >= 6 + mant = __riscv_vfmadd(mant, frac, c5, vl); +#endif +#endif +#endif + v = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl); + v = __riscv_vfmul(v, volk_log2to10factor, vl); + + __riscv_vse32(logPowerOutput, v, vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h index 3ce071ca4..1ae8ad928 100644 --- a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h @@ -170,4 +170,34 @@ volk_32fc_s32fc_rotator2puppet_32fc_u_avx_fma(lv_32fc_t* outVector, #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ +#ifdef LV_HAVE_RVV +static inline void volk_32fc_s32fc_rotator2puppet_32fc_rvv(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_rvv( + outVector, inVector, &phase_inc_n, phase, num_points); +} +#endif /*LV_HAVE_RVV*/ + + +#ifdef LV_HAVE_RVVSEG +static inline void volk_32fc_s32fc_rotator2puppet_32fc_rvvseg(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + 
unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_rvv( + outVector, inVector, &phase_inc_n, phase, num_points); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h index bee1f0682..6a9018f43 100644 --- a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h @@ -779,4 +779,160 @@ static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVect #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ +/* Note on the RVV implementation: + * The complex multiply was expanded, because we don't care about the corner cases. + * Otherwise, without -ffast-math, the compiler would inserts function calls, + * which invalidates all vector registers and spills them on each loop iteration. */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvv(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + vlmax = vlmax < ROTATOR_RELOAD ? 
vlmax : ROTATOR_RELOAD; + + lv_32fc_t inc = 1.0f; + vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr; + for (size_t i = 0; i < vlmax; ++i) { + lv_32fc_t ph = + lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc), + lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc)); + phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax); + phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax); + inc = lv_cmake( + lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc), + lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc)); + } + vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax); + vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax); + + size_t vl = 0; + if (num_points > 0) + while (1) { + size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD; + num_points -= n; + + for (; n > 0; n -= vl, inVector += vl, outVector += vl) { + // vl + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m2(); + vlmax = vlmax < ROTATOR_RELOAD ? 
vlmax : ROTATOR_RELOAD; + + lv_32fc_t inc = 1.0f; + vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr; + for (size_t i = 0; i < vlmax; ++i) { + lv_32fc_t ph = + lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc), + lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc)); + phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax); + phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax); + inc = lv_cmake( + lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc), + lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc)); + } + vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax); + vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax); + + size_t vl = 0; + if (num_points > 0) + while (1) { + size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD; + num_points -= n; + + for (; n > 0; n -= vl, inVector += vl, outVector += vl) { + // vl + +static inline void volk_32fc_x2_add_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + const float* ina = (const float*)aVector; + const float* inb = (const float*)bVector; + float* out = (float*)cVector; + size_t n = num_points * 2; + for (size_t vl; n > 0; n -= vl, ina += vl, inb += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t va = __riscv_vle32_v_f32m8(ina, vl); + vfloat32m8_t vb = __riscv_vle32_v_f32m8(inb, vl); + __riscv_vse32(out, __riscv_vfadd(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32fc_x2_add_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h index 7b9aae3ae..a5a4a9df3 100644 --- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h @@ -421,5 +421,72 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result #endif 
/*LV_HAVE_SSE3*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_rvv(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2()); + vfloat32m2_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl); + vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl); + vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl)); + vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl)); + vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl)); + vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl)); + vbi = __riscv_vfneg(vbi, vl); + vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_rvvseg(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2()); + vfloat32m2_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m2(n); + vfloat32m2x2_t va = 
__riscv_vlseg2e32_v_f32m2x2((const float*)input, vl); + vfloat32m2x2_t vb = __riscv_vlseg2e32_v_f32m2x2((const float*)taps, vl); + vfloat32m2_t var = __riscv_vget_f32m2(va, 0), vai = __riscv_vget_f32m2(va, 1); + vfloat32m2_t vbr = __riscv_vget_f32m2(vb, 0), vbi = __riscv_vget_f32m2(vb, 1); + vbi = __riscv_vfneg(vbi, vl); + vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/ diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h index 3a013cb0a..ceee6559d 100644 --- a/kernels/volk/volk_32fc_x2_divide_32fc.h +++ b/kernels/volk/volk_32fc_x2_divide_32fc.h @@ -414,5 +414,66 @@ static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + + +static inline void volk_32fc_x2_divide_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + uint64_t* out = (uint64_t*)cVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, out += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl); + vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl)); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + 
vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t mul = __riscv_vfrdiv( + __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl); + vfloat32m4_t vr = __riscv_vfmul( + __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl); + vfloat32m4_t vi = __riscv_vfmul( + __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl); + vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr); + vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi); + vuint64m8_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64(out, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_x2_divide_32fc_rvvseg(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t mul = __riscv_vfrdiv( + __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl); + vfloat32m4_t vr = __riscv_vfmul( + __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl); + vfloat32m4_t vi = __riscv_vfmul( + __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index 
47d6f6974..d4acab3a3 100644 --- a/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -730,5 +730,70 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result, #endif /*LV_HAVE_AVX && LV_HAVE_FMA*/ +#ifdef LV_HAVE_RVV +#include +#include + +static inline void volk_32fc_x2_dot_prod_32fc_rvv(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2()); + vfloat32m2_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl); + vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl); + vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl)); + vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl)); + vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl)); + vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl)); + vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include +#include + +static inline void volk_32fc_x2_dot_prod_32fc_rvvseg(lv_32fc_t* result, + const lv_32fc_t* input, + const lv_32fc_t* taps, + unsigned int num_points) +{ + vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, 
__riscv_vsetvlmax_e32m4()); + vfloat32m4_t vsumi = vsumr; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl); + vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl); + } + size_t vl = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr); + vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi); + vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl); + *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)), + __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl))); +} +#endif /*LV_HAVE_RVVSEG*/ #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/ diff --git a/kernels/volk/volk_32fc_x2_multiply_32fc.h b/kernels/volk/volk_32fc_x2_multiply_32fc.h index 96cefed52..2db2929bc 100644 --- a/kernels/volk/volk_32fc_x2_multiply_32fc.h +++ b/kernels/volk/volk_32fc_x2_multiply_32fc.h @@ -460,4 +460,55 @@ static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_x2_multiply_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl); + vfloat32m4_t var = 
__riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl)); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr); + vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi); + vuint64m8_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64((uint64_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_x2_multiply_32fc_rvvseg(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h index 12e4948a0..ce01d6d67 100644 --- a/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++ b/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -287,5 +287,56 @@ static 
inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_x2_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl); + vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl)); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl); + vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr); + vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi); + vuint64m8_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64((uint64_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_x2_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = 
__riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h index 54ffbf0fd..0b956c205 100644 --- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h @@ -535,4 +535,62 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, } #endif // LV_HAVE_SSE +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax); + vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax); + vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, target += vl, points += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl); + vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + float scalar, + unsigned int 
num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax); + vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax); + vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, target += vl, points += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0); + vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl); + vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/ diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h index b35bed5eb..b27f7b7b2 100644 --- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h +++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h @@ -342,4 +342,69 @@ volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector, } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvv(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + vfloat32m2_t vbr = + __riscv_vfmv_v_f_f32m2(lv_creal(*scalar), __riscv_vsetvlmax_e32m2()); + vfloat32m2_t vbi = + __riscv_vfmv_v_f_f32m2(lv_cimag(*scalar), __riscv_vsetvlmax_e32m2()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, bVector += vl, aVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m2(n); + vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)bVector, vl); + vuint64m4_t vc = 
__riscv_vle64_v_u64m4((const uint64_t*)aVector, vl); + vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl)); + vfloat32m2_t vcr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl)); + vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl)); + vfloat32m2_t vci = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl)); + vfloat32m2_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m2_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vuint32m2_t vru = __riscv_vreinterpret_u32m2(__riscv_vfadd(vr, vcr, vl)); + vuint32m2_t viu = __riscv_vreinterpret_u32m2(__riscv_vfadd(vi, vci, vl)); + vuint64m4_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64((uint64_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvvseg(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + vfloat32m4_t vbr = + __riscv_vfmv_v_f_f32m4(lv_creal(*scalar), __riscv_vsetvlmax_e32m4()); + vfloat32m4_t vbi = + __riscv_vfmv_v_f_f32m4(lv_cimag(*scalar), __riscv_vsetvlmax_e32m4()); + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl); + vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl); + vfloat32m4_t vcr = __riscv_vget_f32m4(vc, 0), vci = __riscv_vget_f32m4(vc, 1); + vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1); + vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl); + vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl); + vr = __riscv_vfadd(vr, vcr, vl); + vi = __riscv_vfadd(vi, vci, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, 
__riscv_vcreate_v_f32m4x2(vr, vi), vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */ diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h index 4a93d5bf9..b711bcf10 100644 --- a/kernels/volk/volk_32fc_x2_square_dist_32f.h +++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -277,7 +277,7 @@ static inline void volk_32fc_x2_square_dist_32f_generic(float* target, float sq_dist; unsigned int i = 0; - for (; i> 3; ++i) { + for (; i < (num_bytes >> 3); ++i) { diff = src0[0] - points[i]; sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); @@ -374,4 +374,56 @@ static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, #endif /*LV_HAVE_AVX2*/ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32fc_x2_square_dist_32f_rvv(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax); + vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, target += vl, points += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl); + vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl)); + vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl)); + vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl); + vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(target, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_32fc_x2_square_dist_32f_rvvseg(float* target, + lv_32fc_t* src0, + lv_32fc_t* points, + unsigned int num_points) +{ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vfloat32m4_t var = 
__riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax); + vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax); + + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, target += vl, points += vl) { + vl = __riscv_vsetvl_e32m4(n); + vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl); + vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0); + vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1); + vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl); + vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl); + vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl); + __riscv_vse32(target, v, vl); + } +} +#endif /*LV_HAVE_RVVSEG*/ + #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/ diff --git a/kernels/volk/volk_32i_s32f_convert_32f.h b/kernels/volk/volk_32i_s32f_convert_32f.h index 678290fc8..749cb1af7 100644 --- a/kernels/volk/volk_32i_s32f_convert_32f.h +++ b/kernels/volk/volk_32i_s32f_convert_32f.h @@ -313,5 +313,21 @@ static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32i_s32f_convert_32f_rvv(float* outputVector, + const int32_t* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t v = __riscv_vfcvt_f(__riscv_vle32_v_i32m8(inputVector, vl), vl); + __riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */ diff --git a/kernels/volk/volk_32i_x2_and_32i.h b/kernels/volk/volk_32i_x2_and_32i.h index d2bcf6b84..79e4f2211 100644 --- a/kernels/volk/volk_32i_x2_and_32i.h +++ b/kernels/volk/volk_32i_x2_and_32i.h @@ -337,5 +337,22 @@ static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void 
volk_32i_x2_and_32i_rvv(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl); + vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vand(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32i_x2_and_32i_u_H */ diff --git a/kernels/volk/volk_32i_x2_or_32i.h b/kernels/volk/volk_32i_x2_or_32i.h index f3e4b769d..3642f13d8 100644 --- a/kernels/volk/volk_32i_x2_or_32i.h +++ b/kernels/volk/volk_32i_x2_or_32i.h @@ -336,5 +336,22 @@ static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32i_x2_or_32i_rvv(int32_t* cVector, + const int32_t* aVector, + const int32_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e32m8(n); + vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl); + vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl); + __riscv_vse32(cVector, __riscv_vor(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_32i_x2_or_32i_u_H */ diff --git a/kernels/volk/volk_32u_byteswap.h b/kernels/volk/volk_32u_byteswap.h index a6ec86f80..d5d0613ec 100644 --- a/kernels/volk/volk_32u_byteswap.h +++ b/kernels/volk/volk_32u_byteswap.h @@ -343,5 +343,53 @@ static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int n } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32u_byteswap_rvv(uint32_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + size_t vlmax = __riscv_vsetvlmax_e8m1(); + if (vlmax <= 256) { + vuint8m1_t vidx = __riscv_vreinterpret_u8m1( + 
__riscv_vsub(__riscv_vreinterpret_u32m1(__riscv_vid_v_u8m1(vlmax)), + 0x3020100 - 0x10203, + vlmax / 4)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle32_v_u32m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgather, v, vidx); + __riscv_vse32(intsToSwap, __riscv_vreinterpret_u32m8(v), vl); + } + } else { + vuint16m2_t vidx = __riscv_vreinterpret_u16m2( + __riscv_vsub(__riscv_vreinterpret_u64m2(__riscv_vid_v_u16m2(vlmax)), + 0x3000200010000 - 0x100020003, + vlmax / 4)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle32_v_u32m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx); + __riscv_vse32(intsToSwap, __riscv_vreinterpret_u32m8(v), vl); + } + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA23 +#include + +static inline void volk_32u_byteswap_rva23(uint32_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t v = __riscv_vle32_v_u32m8(intsToSwap, vl); + __riscv_vse32(intsToSwap, __riscv_vrev8(v, vl), vl); + } +} +#endif /* LV_HAVE_RVA23 */ #endif /* INCLUDED_volk_32u_byteswap_a_H */ diff --git a/kernels/volk/volk_32u_byteswappuppet_32u.h b/kernels/volk/volk_32u_byteswappuppet_32u.h index a6ef921ff..4ad3deac0 100644 --- a/kernels/volk/volk_32u_byteswappuppet_32u.h +++ b/kernels/volk/volk_32u_byteswappuppet_32u.h @@ -91,4 +91,26 @@ static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output, } #endif +#ifdef LV_HAVE_RVV +static inline void volk_32u_byteswappuppet_32u_rvv(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ + + volk_32u_byteswap_rvv((uint32_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); +} +#endif + +#ifdef LV_HAVE_RVA23 +static inline void 
volk_32u_byteswappuppet_32u_rva23(uint32_t* output, + uint32_t* intsToSwap, + unsigned int num_points) +{ + + volk_32u_byteswap_rva23((uint32_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t)); +} +#endif + #endif diff --git a/kernels/volk/volk_32u_popcnt.h b/kernels/volk/volk_32u_popcnt.h index b8c371fbc..3ad2f0aac 100644 --- a/kernels/volk/volk_32u_popcnt.h +++ b/kernels/volk/volk_32u_popcnt.h @@ -76,4 +76,22 @@ static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value) #endif /*LV_HAVE_SSE4_2*/ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_32u_popcnt_rvv(uint32_t* ret, const uint32_t value) +{ + *ret = __riscv_vcpop(__riscv_vreinterpret_b4(__riscv_vmv_s_x_u64m1(value, 1)), 32); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVA22V +#include + +static inline void volk_32u_popcnt_rva22(uint32_t* ret, const uint32_t value) +{ + *ret = __riscv_cpop_32(value); +} +#endif /*LV_HAVE_RVA22V*/ + #endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h index 19a17f561..aa0c4ca08 100644 --- a/kernels/volk/volk_32u_popcntpuppet_32u.h +++ b/kernels/volk/volk_32u_popcntpuppet_32u.h @@ -18,10 +18,8 @@ static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_32u_popcnt_generic(outVector + ii, *(inVector + ii)); - } + for (size_t i = 0; i < num_points; ++i) + volk_32u_popcnt_generic(outVector + i, inVector[i]); } #endif /* LV_HAVE_GENERIC */ @@ -30,11 +28,29 @@ static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii)); - } + for (size_t i = 0; i < num_points; ++i) + 
volk_32u_popcnt_a_sse4_2(outVector + i, inVector[i]); } #endif /* LV_HAVE_SSE4_2 */ +#ifdef LV_HAVE_RVV +static inline void volk_32u_popcntpuppet_32u_rvv(uint32_t* outVector, + const uint32_t* inVector, + unsigned int num_points) +{ + for (size_t i = 0; i < num_points; ++i) + volk_32u_popcnt_rvv(outVector + i, inVector[i]); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA22V +static inline void volk_32u_popcntpuppet_32u_rva22(uint32_t* outVector, + const uint32_t* inVector, + unsigned int num_points) +{ + for (size_t i = 0; i < num_points; ++i) + volk_32u_popcnt_rva22(outVector + i, inVector[i]); +} +#endif /* LV_HAVE_RVA22V */ + #endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h index 62150ac64..ece8f48b1 100644 --- a/kernels/volk/volk_32u_reverse_32u.h +++ b/kernels/volk/volk_32u_reverse_32u.h @@ -337,4 +337,57 @@ volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_poi #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +#include + +static inline void +volk_32u_reverse_32u_rvv(uint32_t* out, const uint32_t* in, unsigned int num_points) +{ + size_t n = num_points; + + static const uint64_t tblLo[] = { + 0xE060A020C0408000, + 0xF070B030D0509010, + }; + static const uint64_t tblHi[] = { + 0x0E060A020C040800, + 0x0F070B030D050901, + }; + vuint8m1_t vtblLo = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(tblLo, 2)); + vuint8m1_t vtblHi = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(tblHi, 2)); + + size_t vlmax = __riscv_vsetvlmax_e8m1(); + vuint16m2_t vidx = __riscv_vreinterpret_u16m2( + __riscv_vsub(__riscv_vreinterpret_u64m2(__riscv_vid_v_u16m2(vlmax)), + 0x3000200010000 - 0x100020003, + vlmax / 4)); + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m4(n); + vuint8m4_t v = __riscv_vreinterpret_u8m4(__riscv_vle32_v_u32m4(in, vl)); + v = RISCV_PERM4(__riscv_vrgatherei16, v, vidx); + vuint8m4_t lo = __riscv_vand(v, 
0xF, vl * 4); + lo = RISCV_LUT4(__riscv_vrgather, vtblLo, lo); + vuint8m4_t hi = __riscv_vsrl(v, 4, vl * 4); + hi = RISCV_LUT4(__riscv_vrgather, vtblHi, hi); + v = __riscv_vor(hi, lo, vl * 4); + __riscv_vse32(out, __riscv_vreinterpret_u32m4(v), vl); + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA23 +#include + +static inline void +volk_32u_reverse_32u_rva23(uint32_t* out, const uint32_t* in, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, out += vl) { + vl = __riscv_vsetvl_e32m8(n); + vuint32m8_t v = __riscv_vle32_v_u32m8(in, vl); + __riscv_vse32(out, __riscv_vbrev(v, vl), vl); + } +} +#endif /* LV_HAVE_RVA23 */ + #endif /* INCLUDED_volk_32u_reverse_32u_u_H */ diff --git a/kernels/volk/volk_64f_convert_32f.h b/kernels/volk/volk_64f_convert_32f.h index b5f9b5070..67f6ae487 100644 --- a/kernels/volk/volk_64f_convert_32f.h +++ b/kernels/volk/volk_64f_convert_32f.h @@ -315,5 +315,20 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_convert_32f_rvv(float* outputVector, + const double* inputVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t v = __riscv_vle64_v_f64m8(inputVector, vl); + __riscv_vse32(outputVector, __riscv_vfncvt_f(v, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_64f_convert_32f_a_H */ diff --git a/kernels/volk/volk_64f_x2_add_64f.h b/kernels/volk/volk_64f_x2_add_64f.h index 867a5d3bc..bf9024e8c 100644 --- a/kernels/volk/volk_64f_x2_add_64f.h +++ b/kernels/volk/volk_64f_x2_add_64f.h @@ -244,4 +244,22 @@ static inline void volk_64f_x2_add_64f_a_avx(double* cVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_x2_add_64f_rvv(double* cVector, + const double* aVector, + const double* bVector, + unsigned int 
num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_64f_x2_add_64f_u_H */ diff --git a/kernels/volk/volk_64f_x2_max_64f.h b/kernels/volk/volk_64f_x2_max_64f.h index 973605c73..e9ca3ef6e 100644 --- a/kernels/volk/volk_64f_x2_max_64f.h +++ b/kernels/volk/volk_64f_x2_max_64f.h @@ -290,5 +290,22 @@ static inline void volk_64f_x2_max_64f_u_avx(double* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_x2_max_64f_rvv(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfmax(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_64f_x2_max_64f_u_H */ diff --git a/kernels/volk/volk_64f_x2_min_64f.h b/kernels/volk/volk_64f_x2_min_64f.h index 970b843f5..7652ef72c 100644 --- a/kernels/volk/volk_64f_x2_min_64f.h +++ b/kernels/volk/volk_64f_x2_min_64f.h @@ -290,5 +290,22 @@ static inline void volk_64f_x2_min_64f_u_avx(double* cVector, } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_x2_min_64f_rvv(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, 
vl); + __riscv_vse64(cVector, __riscv_vfmin(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_64f_x2_min_64f_u_H */ diff --git a/kernels/volk/volk_64f_x2_multiply_64f.h b/kernels/volk/volk_64f_x2_multiply_64f.h index caab3aaa2..57eb468a2 100644 --- a/kernels/volk/volk_64f_x2_multiply_64f.h +++ b/kernels/volk/volk_64f_x2_multiply_64f.h @@ -244,4 +244,22 @@ static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector, #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64f_x2_multiply_64f_rvv(double* cVector, + const double* aVector, + const double* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e64m8(n); + vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl); + vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl); + __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_64f_x2_multiply_64f_u_H */ diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h index 2fbf3cce5..a8da031c6 100644 --- a/kernels/volk/volk_64u_byteswap.h +++ b/kernels/volk/volk_64u_byteswap.h @@ -383,4 +383,53 @@ static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, #endif /* LV_HAVE_SSSE3 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64u_byteswap_rvv(uint64_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + size_t vlmax = __riscv_vsetvlmax_e8m1(); + if (vlmax <= 256) { + vuint8m1_t vidx = __riscv_vreinterpret_u8m1( + __riscv_vsub(__riscv_vreinterpret_u64m1(__riscv_vid_v_u8m1(vlmax)), + 0x0706050403020100 - 0x1020304050607, + vlmax / 8)); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgather, v, vidx); + __riscv_vse64(intsToSwap, 
__riscv_vreinterpret_u64m8(v), vl); + } + } else { + vuint16m2_t vid = __riscv_vid_v_u16m2(vlmax); + vuint16m2_t voff1 = __riscv_vand(vid, 0x7, vlmax); + vuint16m2_t voff2 = __riscv_vrsub(voff1, 0x7, vlmax); + vuint16m2_t vidx = __riscv_vadd(__riscv_vsub(vid, voff1, vlmax), voff2, vlmax); + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint8m8_t v = + __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl)); + v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx); + __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl); + } + } +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA23 +#include + +static inline void volk_64u_byteswap_rva23(uint64_t* intsToSwap, unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, intsToSwap += vl) { + vl = __riscv_vsetvl_e64m8(n); + vuint64m8_t v = __riscv_vle64_v_u64m8(intsToSwap, vl); + __riscv_vse64(intsToSwap, __riscv_vrev8(v, vl), vl); + } +} +#endif /* LV_HAVE_RVA23 */ + #endif /* INCLUDED_volk_64u_byteswap_a_H */ diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h index c2b55bf4d..2be3b0b75 100644 --- a/kernels/volk/volk_64u_byteswappuppet_64u.h +++ b/kernels/volk/volk_64u_byteswappuppet_64u.h @@ -92,4 +92,26 @@ static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output, } #endif +#ifdef LV_HAVE_RVV +static inline void volk_64u_byteswappuppet_64u_rvv(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ + + volk_64u_byteswap_rvv((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); +} +#endif + +#ifdef LV_HAVE_RVA23 +static inline void volk_64u_byteswappuppet_64u_rva23(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ + + volk_64u_byteswap_rva23((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); +} +#endif + #endif diff --git 
a/kernels/volk/volk_64u_popcnt.h b/kernels/volk/volk_64u_popcnt.h index 5c9b2a3a3..fb12bbe14 100644 --- a/kernels/volk/volk_64u_popcnt.h +++ b/kernels/volk/volk_64u_popcnt.h @@ -116,5 +116,22 @@ static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value) } #endif /*LV_HAVE_NEON*/ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_64u_popcnt_rvv(uint64_t* ret, const uint64_t value) +{ + *ret = __riscv_vcpop(__riscv_vreinterpret_b2(__riscv_vmv_s_x_u64m1(value, 1)), 64); +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVA22V +#include + +static inline void volk_64u_popcnt_rva22(uint64_t* ret, const uint64_t value) +{ + *ret = __riscv_cpop_64(value); +} +#endif /*LV_HAVE_RVA22V*/ #endif /*INCLUDED_volk_64u_popcnt_a_H*/ diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h index 300d4fd19..a1ecc487b 100644 --- a/kernels/volk/volk_64u_popcntpuppet_64u.h +++ b/kernels/volk/volk_64u_popcntpuppet_64u.h @@ -19,11 +19,8 @@ static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_64u_popcnt_generic(outVector + ii, num_points); - } - memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); + for (size_t i = 0; i < num_points; ++i) + volk_64u_popcnt_generic(outVector + i, inVector[i]); } #endif /* LV_HAVE_GENERIC */ @@ -32,11 +29,8 @@ static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_64u_popcnt_a_sse4_2(outVector + ii, num_points); - } - memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); + for (size_t i = 0; i < num_points; ++i) + volk_64u_popcnt_a_sse4_2(outVector + i, inVector[i]); } #endif /* LV_HAVE_SSE4_2 */ @@ -45,12 +39,29 @@ static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, 
const uint64_t* inVector, unsigned int num_points) { - unsigned int ii; - for (ii = 0; ii < num_points; ++ii) { - volk_64u_popcnt_neon(outVector + ii, num_points); - } - memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t)); + for (size_t i = 0; i < num_points; ++i) + volk_64u_popcnt_neon(outVector + i, inVector[i]); } #endif /* LV_HAVE_NEON */ +#ifdef LV_HAVE_RVV +static inline void volk_64u_popcntpuppet_64u_rvv(uint64_t* outVector, + const uint64_t* inVector, + unsigned int num_points) +{ + for (size_t i = 0; i < num_points; ++i) + volk_64u_popcnt_rvv(outVector + i, inVector[i]); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVA22V +static inline void volk_64u_popcntpuppet_64u_rva22(uint64_t* outVector, + const uint64_t* inVector, + unsigned int num_points) +{ + for (size_t i = 0; i < num_points; ++i) + volk_64u_popcnt_rva22(outVector + i, inVector[i]); +} +#endif /* LV_HAVE_RVA22V */ + #endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/kernels/volk/volk_8i_convert_16i.h b/kernels/volk/volk_8i_convert_16i.h index 36e929bbc..0800f7c57 100644 --- a/kernels/volk/volk_8i_convert_16i.h +++ b/kernels/volk/volk_8i_convert_16i.h @@ -266,5 +266,20 @@ static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8i_convert_16i_rvv(int16_t* outputVector, + const int8_t* inputVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e8m4(n); + vint16m8_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m4(inputVector, vl), vl); + __riscv_vse16(outputVector, __riscv_vsll(v, 8, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/kernels/volk/volk_8i_s32f_convert_32f.h b/kernels/volk/volk_8i_s32f_convert_32f.h index d904d25d2..cd2c325e4 100644 --- a/kernels/volk/volk_8i_s32f_convert_32f.h +++ 
b/kernels/volk/volk_8i_s32f_convert_32f.h @@ -350,5 +350,22 @@ static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector, } #endif /* LV_HAVE_ORC */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8i_s32f_convert_32f_rvv(float* outputVector, + const int8_t* inputVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { + vl = __riscv_vsetvl_e8m2(n); + vint16m4_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m2(inputVector, vl), vl); + __riscv_vse32( + outputVector, __riscv_vfmul(__riscv_vfwcvt_f(v, vl), 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ #endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/kernels/volk/volk_8ic_deinterleave_16i_x2.h b/kernels/volk/volk_8ic_deinterleave_16i_x2.h index 46b2e2e42..87d745b8d 100644 --- a/kernels/volk/volk_8ic_deinterleave_16i_x2.h +++ b/kernels/volk/volk_8ic_deinterleave_16i_x2.h @@ -392,4 +392,26 @@ static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer, } } #endif /* LV_HAVE_AVX2 */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_deinterleave_16i_x2_rvv(int16_t* iBuffer, + int16_t* qBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) +{ + const uint16_t* in = (const uint16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint16m8_t vc = __riscv_vle16_v_u16m8(in, vl); + vuint16m8_t vr = __riscv_vsll(vc, 8, vl); + vuint16m8_t vi = __riscv_vand(vc, 0xFF00, vl); + __riscv_vse16((uint16_t*)iBuffer, vr, vl); + __riscv_vse16((uint16_t*)qBuffer, vi, vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */ diff --git a/kernels/volk/volk_8ic_deinterleave_real_16i.h b/kernels/volk/volk_8ic_deinterleave_real_16i.h index bef475921..8814e5e1e 100644 --- a/kernels/volk/volk_8ic_deinterleave_real_16i.h +++ 
b/kernels/volk/volk_8ic_deinterleave_real_16i.h @@ -300,4 +300,22 @@ static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer, } } #endif /* LV_HAVE_AVX2 */ + +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_deinterleave_real_16i_rvv(int16_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) +{ + const int16_t* in = (const int16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e16m8(n); + vint16m8_t v = __riscv_vle16_v_i16m8(in, vl); + __riscv_vse16(iBuffer, __riscv_vsra(__riscv_vsll(v, 8, vl), 1, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */ diff --git a/kernels/volk/volk_8ic_deinterleave_real_8i.h b/kernels/volk/volk_8ic_deinterleave_real_8i.h index 116b1afb9..2c409c691 100644 --- a/kernels/volk/volk_8ic_deinterleave_real_8i.h +++ b/kernels/volk/volk_8ic_deinterleave_real_8i.h @@ -402,4 +402,21 @@ static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_deinterleave_real_8i_rvv(int8_t* iBuffer, + const lv_8sc_t* complexVector, + unsigned int num_points) +{ + const uint16_t* in = (const uint16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e16m8(n); + vuint16m8_t vc = __riscv_vle16_v_u16m8(in, vl); + __riscv_vse8((uint8_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */ diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h index 8936a1699..e0234b163 100644 --- a/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h +++ b/kernels/volk/volk_8ic_s32f_deinterleave_32f_x2.h @@ -441,4 +441,28 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer, } 
#endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_s32f_deinterleave_32f_x2_rvv(float* iBuffer, + float* qBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + const uint16_t* in = (const uint16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl, qBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vuint16m4_t vc = __riscv_vle16_v_u16m4(in, vl); + vint8m2_t vr = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 0, vl)); + vint8m2_t vi = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 8, vl)); + vfloat32m8_t vrf = __riscv_vfwcvt_f(__riscv_vsext_vf2(vr, vl), vl); + vfloat32m8_t vif = __riscv_vfwcvt_f(__riscv_vsext_vf2(vi, vl), vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl); + __riscv_vse32(qBuffer, __riscv_vfmul(vif, 1.0f / scalar, vl), vl); + } +} +#endif /*LV_HAVE_RVV*/ + #endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */ diff --git a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h index 37cb25556..7ec8958d3 100644 --- a/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h +++ b/kernels/volk/volk_8ic_s32f_deinterleave_real_32f.h @@ -349,5 +349,24 @@ volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_s32f_deinterleave_real_32f_rvv(float* iBuffer, + const lv_8sc_t* complexVector, + const float scalar, + unsigned int num_points) +{ + const uint16_t* in = (const uint16_t*)complexVector; + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) { + vl = __riscv_vsetvl_e16m4(n); + vuint16m4_t vc = __riscv_vle16_v_u16m4(in, vl); + vint8m2_t vr = __riscv_vreinterpret_i8m2(__riscv_vnsrl(vc, 0, vl)); + vfloat32m8_t vrf = __riscv_vfwcvt_f(__riscv_vsext_vf2(vr, vl), vl); + __riscv_vse32(iBuffer, __riscv_vfmul(vrf, 1.0f / scalar, vl), vl); + } +} +#endif 
/*LV_HAVE_RVV*/ #endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H */ diff --git a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h index 5462ea673..5de0e3125 100644 --- a/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h +++ b/kernels/volk/volk_8ic_x2_multiply_conjugate_16ic.h @@ -274,4 +274,55 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_x2_multiply_conjugate_16ic_rvv(lv_16sc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e8m2(n); + vint16m4_t va = __riscv_vle16_v_i16m4((const int16_t*)aVector, vl); + vint16m4_t vb = __riscv_vle16_v_i16m4((const int16_t*)bVector, vl); + vint8m2_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl); + vint8m2_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl); + vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = + __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl); + vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr); + vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi); + vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl); + __riscv_vse32((uint32_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_8ic_x2_multiply_conjugate_16ic_rvvseg(lv_16sc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e8m2(n); + vint8m2x2_t va = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)aVector, vl); + vint8m2x2_t vb = 
__riscv_vlseg2e8_v_i8m2x2((const int8_t*)bVector, vl); + vint8m2_t var = __riscv_vget_i8m2(va, 0), vai = __riscv_vget_i8m2(va, 1); + vint8m2_t vbr = __riscv_vget_i8m2(vb, 0), vbi = __riscv_vget_i8m2(vb, 1); + vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl); + vint16m4_t vi = + __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl); + __riscv_vsseg2e16_v_i16m4x2( + (int16_t*)cVector, __riscv_vcreate_v_i16m4x2(vr, vi), vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */ diff --git a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h index 318a78192..5316ada06 100644 --- a/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h +++ b/kernels/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc.h @@ -341,4 +341,63 @@ volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector, #endif /* LV_HAVE_AVX2*/ +#ifdef LV_HAVE_RVV +#include + +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e8m1(n); + vint16m2_t va = __riscv_vle16_v_i16m2((const int16_t*)aVector, vl); + vint16m2_t vb = __riscv_vle16_v_i16m2((const int16_t*)bVector, vl); + vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl); + vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl); + vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl); + vint16m2_t vi = + __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl); + vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl); + vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl); + vuint32m4_t vru = 
__riscv_vreinterpret_u32m4(vrf); + vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif); + vuint64m8_t v = + __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); + __riscv_vse64((uint64_t*)cVector, v, vl); + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void +volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector, + const lv_8sc_t* aVector, + const lv_8sc_t* bVector, + const float scalar, + unsigned int num_points) +{ + size_t n = num_points; + for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) { + vl = __riscv_vsetvl_e8m1(n); + vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)aVector, vl); + vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)bVector, vl); + vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1); + vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1); + vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl); + vint16m2_t vi = + __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl); + vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl); + vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl); + __riscv_vsseg2e32_v_f32m4x2( + (float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl); + } +} + +#endif /*LV_HAVE_RVVSEG*/ + #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */ diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h index 51963efd5..5314622b3 100644 --- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h +++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h @@ -20,11 +20,14 @@ typedef union { unsigned int* w; } p_decision_t; -static inline int parity(int x, unsigned char* Partab) +static inline int parity(int x) { - x ^= (x >> 16); - x ^= (x >> 8); - return Partab[x]; + x ^= x >> 16; + x ^= x >> 8; + x ^= x >> 4; + x ^= x >> 2; + x ^= x >> 1; + return x & 1; } static 
inline int chainback_viterbi(unsigned char* data, @@ -113,7 +116,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec, static unsigned char* X; static unsigned int excess = 6; static unsigned char* Branchtab; - static unsigned char Partab[256]; int d_polys[2] = { 79, 109 }; @@ -127,24 +129,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec, D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), volk_get_alignment()); int state, i; - int cnt, ti; - - /* Initialize parity lookup table */ - for (i = 0; i < 256; i++) { - cnt = 0; - ti = i; - while (ti) { - if (ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } + /* Initialize the branch table */ for (state = 0; state < d_numstates / 2; state++) { for (i = 0; i < rate; i++) { Branchtab[i * d_numstates / 2 + state] = - parity((2 * state) & d_polys[i], Partab) ? 255 : 0; + parity((2 * state) & d_polys[i]) ? 255 : 0; } } @@ -195,7 +185,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec, static unsigned char* X; static unsigned int excess = 6; static unsigned char* Branchtab; - static unsigned char Partab[256]; int d_polys[2] = { 79, 109 }; @@ -209,24 +198,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec, D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), volk_get_alignment()); int state, i; - int cnt, ti; - - /* Initialize parity lookup table */ - for (i = 0; i < 256; i++) { - cnt = 0; - ti = i; - while (ti) { - if (ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } + /* Initialize the branch table */ for (state = 0; state < d_numstates / 2; state++) { for (i = 0; i < rate; i++) { Branchtab[i * d_numstates / 2 + state] = - parity((2 * state) & d_polys[i], Partab) ? 255 : 0; + parity((2 * state) & d_polys[i]) ? 
255 : 0; } } @@ -280,7 +257,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec, static unsigned char* X; static unsigned int excess = 6; static unsigned char* Branchtab; - static unsigned char Partab[256]; int d_polys[2] = { 79, 109 }; @@ -294,24 +270,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec, D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), volk_get_alignment()); int state, i; - int cnt, ti; - - /* Initialize parity lookup table */ - for (i = 0; i < 256; i++) { - cnt = 0; - ti = i; - while (ti) { - if (ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } + /* Initialize the branch table */ for (state = 0; state < d_numstates / 2; state++) { for (i = 0; i < rate; i++) { Branchtab[i * d_numstates / 2 + state] = - parity((2 * state) & d_polys[i], Partab) ? 255 : 0; + parity((2 * state) & d_polys[i]) ? 255 : 0; } } @@ -363,7 +327,6 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec, static unsigned char* D; static unsigned int excess = 6; static unsigned char* Branchtab; - static unsigned char Partab[256]; int d_polys[2] = { 79, 109 }; @@ -378,24 +341,12 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec, volk_get_alignment()); int state, i; - int cnt, ti; - - /* Initialize parity lookup table */ - for (i = 0; i < 256; i++) { - cnt = 0; - ti = i; - while (ti) { - if (ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } + /* Initialize the branch table */ for (state = 0; state < d_numstates / 2; state++) { for (i = 0; i < rate; i++) { Branchtab[i * d_numstates / 2 + state] = - parity((2 * state) & d_polys[i], Partab) ? 255 : 0; + parity((2 * state) & d_polys[i]) ? 
255 : 0; } } @@ -427,4 +378,59 @@ static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec, #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_RVV +#include + +static inline void volk_8u_conv_k7_r2puppet_8u_rvv(unsigned char* dec, + unsigned char* syms, + unsigned int framebits) +{ + if (framebits < 12) + return; + + int d_numstates = (1 << 6); + static unsigned char* D; + static unsigned char* Y; + static unsigned char* X; + static unsigned int excess = 6; + static unsigned char* Branchtab; + + static int once = 1; + if (once) { + once = 0; + + X = (unsigned char*)volk_malloc(3 * d_numstates, volk_get_alignment()); + Y = X + d_numstates; + Branchtab = Y + d_numstates; + D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6), + volk_get_alignment()); + + /* Initialize the branch table */ + for (size_t state = 0; state < d_numstates / 2; state++) { + Branchtab[state] = parity(state & 39) * 255; + Branchtab[state + d_numstates / 2] = parity(state & 54) * 255; + } + } + + memset(X, 31, d_numstates); // unbias the old_metrics + memset(D, 0, (d_numstates / 8) * (framebits + 6)); // initialize decisions + + volk_8u_x4_conv_k7_r2_8u_rvv( + Y, X, syms, D, framebits / 2 - excess, excess, Branchtab); + + unsigned int min = X[0]; + int i = 0, state = 0; + for (i = 0; i < d_numstates; ++i) { + if (X[i] < min) { + min = X[i]; + state = i; + } + } + + chainback_viterbi(dec, framebits / 2 - excess, state, excess, D); + + return; +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_8u_conv_k7_r2puppet_8u_H*/ diff --git a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h index 1464218a4..5d03f03d2 100644 --- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h +++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h @@ -1153,5 +1153,84 @@ static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +#include + +static inline void 
volk_8u_x2_encodeframepolar_8u_rvv(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) +{ + unsigned int stage = log2_of_power_of_2(frame_size); + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + + while (stage) { + // encode stage + if (frame_half < 8) { + encodepolar_single_stage(frame, temp, num_branches, frame_half); + } else { + unsigned char *in = temp, *out = frame; + for (size_t branch = 0; branch < num_branches; ++branch) { + size_t n = frame_half; + for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) { + vl = __riscv_vsetvl_e8m1(n); + vuint16m2_t vc = __riscv_vle16_v_u16m2((uint16_t*)in, vl); + vuint8m1_t v1 = __riscv_vnsrl(vc, 0, vl); + vuint8m1_t v2 = __riscv_vnsrl(vc, 8, vl); + __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl); + __riscv_vse8(out + frame_half, v2, vl); + } + out += frame_half; + } + } + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + // update all the parameters. + num_branches = num_branches << 1; + frame_half = frame_half >> 1; + --stage; + } +} +#endif /*LV_HAVE_RVV*/ + +#ifdef LV_HAVE_RVVSEG +#include + +static inline void volk_8u_x2_encodeframepolar_8u_rvvseg(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) +{ + unsigned int stage = log2_of_power_of_2(frame_size); + unsigned int frame_half = frame_size >> 1; + unsigned int num_branches = 1; + + while (stage) { + // encode stage + if (frame_half < 8) { + encodepolar_single_stage(frame, temp, num_branches, frame_half); + } else { + unsigned char *in = temp, *out = frame; + for (size_t branch = 0; branch < num_branches; ++branch) { + size_t n = frame_half; + for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) { + vl = __riscv_vsetvl_e8m1(n); + vuint8m1x2_t vc = __riscv_vlseg2e8_v_u8m1x2(in, vl); + vuint8m1_t v1 = __riscv_vget_u8m1(vc, 0); + vuint8m1_t v2 = __riscv_vget_u8m1(vc, 1); + __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl); + __riscv_vse8(out + frame_half, v2, vl); + } + out += 
frame_half; + } + } + memcpy(temp, frame, sizeof(unsigned char) * frame_size); + + // update all the parameters. + num_branches = num_branches << 1; + frame_half = frame_half >> 1; + --stage; + } +} +#endif /*LV_HAVE_RVVSEG*/ #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */ diff --git a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h index 4c45f7573..e54befa4a 100644 --- a/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h +++ b/kernels/volk/volk_8u_x3_encodepolar_8u_x2.h @@ -169,4 +169,33 @@ volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +static inline void volk_8u_x3_encodepolar_8u_x2_rvv(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) +{ + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_rvv(frame, temp, frame_size); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVVSEG +static inline void +volk_8u_x3_encodepolar_8u_x2_rvvseg(unsigned char* frame, + unsigned char* temp, + const unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) +{ + interleave_frozen_and_info_bits( + temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_8u_x2_encodeframepolar_8u_rvvseg(frame, temp, frame_size); +} +#endif /* LV_HAVE_RVVSEG */ + #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_ */ diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h index 496ca2e58..792168e0d 100644 --- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h +++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h @@ -156,5 +156,47 @@ volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame, } #endif /* LV_HAVE_AVX2 */ +#ifdef LV_HAVE_RVV +static 
inline void volk_8u_x3_encodepolarpuppet_8u_rvv(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) +{ + if (frame_size < 1) { + return; + } + + frame_size = next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_rvv( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); +} +#endif /* LV_HAVE_RVV */ + +#ifdef LV_HAVE_RVVSEG +static inline void +volk_8u_x3_encodepolarpuppet_8u_rvvseg(unsigned char* frame, + unsigned char* frozen_bit_mask, + const unsigned char* frozen_bits, + const unsigned char* info_bits, + unsigned int frame_size) +{ + if (frame_size < 1) { + return; + } + + frame_size = next_lower_power_of_two(frame_size); + unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size, + volk_get_alignment()); + adjust_frozen_mask(frozen_bit_mask, frame_size); + volk_8u_x3_encodepolar_8u_x2_rvvseg( + frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size); + volk_free(temp); +} +#endif /* LV_HAVE_RVVSEG */ #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_ */ diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h index 9750b665a..aa7fc3e01 100644 --- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h @@ -465,4 +465,210 @@ static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_RVV +#include + +static inline void volk_8u_x4_conv_k7_r2_8u_rvv(unsigned char* Y, + unsigned char* X, + unsigned char* syms, + unsigned char* dec, + unsigned int framebits, + unsigned int excess, + unsigned char* Branchtab) +{ + size_t vl = 256 / 8; + + size_t n = framebits + excess; + + if 
(__riscv_vlenb() == 128 / 8) { + vuint8m2_t vX0 = __riscv_vle8_v_u8m2(X, vl), + vX1 = __riscv_vle8_v_u8m2(X + vl, vl); + vuint8m2_t vY0 = __riscv_vle8_v_u8m2(Y, vl), + vY1 = __riscv_vle8_v_u8m2(Y + vl, vl); + vuint8m2_t vB0 = __riscv_vle8_v_u8m2(Branchtab, vl); + vuint8m2_t vB1 = __riscv_vle8_v_u8m2(Branchtab + vl, vl); + vuint8m2_t v63 = __riscv_vmv_v_x_u8m2(63, vl); + + for (size_t i = 0; i < n; ++i) { + // Butterfly + vuint8m2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl); + vuint8m2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl); + vuint8m2_t va = __riscv_vaaddu(va0, va1, 0, vl); + va = __riscv_vreinterpret_u8m2( + __riscv_vsrl(__riscv_vreinterpret_u16m2(va), 2, vl / 2)); + va = __riscv_vand(va, v63, vl); + vuint8m2_t vb = __riscv_vssubu(v63, va, vl); + vuint8m2_t vX0a = __riscv_vsaddu(vX0, va, vl); + vuint8m2_t vX1b = __riscv_vsaddu(vX1, vb, vl); + vuint8m2_t vX0b = __riscv_vsaddu(vX0, vb, vl); + vuint8m2_t vX1a = __riscv_vsaddu(vX1, va, vl); + vY0 = __riscv_vminu(vX1b, vX0a, vl); + vY1 = __riscv_vminu(vX1a, vX0b, vl); + + vuint16m4_t vX1ba = + __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl); + vX1b = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 0); + vX1a = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 1); + + vuint16m4_t vm = + __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl); + vY0 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 0); + vY1 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 1); + + __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl); + __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl); + + // Renormalize + vuint8m2_t vmin = __riscv_vminu(vY0, vY1, vl); + vmin = __riscv_vlmul_ext_u8m2( + __riscv_vredminu(vmin, __riscv_vlmul_trunc_u8m1(vmin), vl)); + vmin = __riscv_vrgather(vmin, 0, vl); + vY0 = __riscv_vsub(vY0, vmin, vl); + vY1 = __riscv_vsub(vY1, vmin, vl); + + vuint8m2_t tmp; // Swap pointers to old and new metrics + tmp = vX0; + vX0 = vY0; + vY0 = tmp; + tmp = vX1; 
+ vX1 = vY1; + vY1 = tmp; + } + if (n & 1) { + __riscv_vse8(X, vY0, vl); + __riscv_vse8(X + vl, vY1, vl); + __riscv_vse8(Y, vX0, vl); + __riscv_vse8(Y + vl, vX1, vl); + } else { + __riscv_vse8(X, vX0, vl); + __riscv_vse8(X + vl, vX1, vl); + __riscv_vse8(Y, vY0, vl); + __riscv_vse8(Y + vl, vY1, vl); + } + } else if (__riscv_vlenb() == 256 / 8) { + vuint8m1_t vX0 = __riscv_vle8_v_u8m1(X, vl), + vX1 = __riscv_vle8_v_u8m1(X + vl, vl); + vuint8m1_t vY0 = __riscv_vle8_v_u8m1(Y, vl), + vY1 = __riscv_vle8_v_u8m1(Y + vl, vl); + vuint8m1_t vB0 = __riscv_vle8_v_u8m1(Branchtab, vl); + vuint8m1_t vB1 = __riscv_vle8_v_u8m1(Branchtab + vl, vl); + vuint8m1_t v63 = __riscv_vmv_v_x_u8m1(63, vl); + + for (size_t i = 0; i < n; ++i) { + // Butterfly + vuint8m1_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl); + vuint8m1_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl); + vuint8m1_t va = __riscv_vaaddu(va0, va1, 0, vl); + va = __riscv_vreinterpret_u8m1( + __riscv_vsrl(__riscv_vreinterpret_u16m1(va), 2, vl / 2)); + va = __riscv_vand(va, v63, vl); + vuint8m1_t vb = __riscv_vssubu(v63, va, vl); + vuint8m1_t vX0a = __riscv_vsaddu(vX0, va, vl); + vuint8m1_t vX1b = __riscv_vsaddu(vX1, vb, vl); + vuint8m1_t vX0b = __riscv_vsaddu(vX0, vb, vl); + vuint8m1_t vX1a = __riscv_vsaddu(vX1, va, vl); + vY0 = __riscv_vminu(vX1b, vX0a, vl); + vY1 = __riscv_vminu(vX1a, vX0b, vl); + + vuint16m2_t vX1ba = + __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl); + vX1b = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 0); + vX1a = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 1); + + vuint16m2_t vm = + __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl); + vY0 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 0); + vY1 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 1); + + __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl); + __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl); + + // Renormalize + vuint8m1_t vmin = __riscv_vminu(vY0, vY1, vl); + vmin 
= __riscv_vrgather(__riscv_vredminu(vmin, vmin, vl), 0, vl); + vY0 = __riscv_vsub(vY0, vmin, vl); + vY1 = __riscv_vsub(vY1, vmin, vl); + + vuint8m1_t tmp; // Swap pointers to old and new metrics + tmp = vX0; + vX0 = vY0; + vY0 = tmp; + tmp = vX1; + vX1 = vY1; + vY1 = tmp; + } + if (n & 1) { + __riscv_vse8(X, vY0, vl); + __riscv_vse8(X + vl, vY1, vl); + __riscv_vse8(Y, vX0, vl); + __riscv_vse8(Y + vl, vX1, vl); + } else { + __riscv_vse8(X, vX0, vl); + __riscv_vse8(X + vl, vX1, vl); + __riscv_vse8(Y, vY0, vl); + __riscv_vse8(Y + vl, vY1, vl); + } + } else { + vuint8mf2_t vX0 = __riscv_vle8_v_u8mf2(X, vl), + vX1 = __riscv_vle8_v_u8mf2(X + vl, vl); + vuint8mf2_t vY0 = __riscv_vle8_v_u8mf2(Y, vl), + vY1 = __riscv_vle8_v_u8mf2(Y + vl, vl); + vuint8mf2_t vB0 = __riscv_vle8_v_u8mf2(Branchtab, vl); + vuint8mf2_t vB1 = __riscv_vle8_v_u8mf2(Branchtab + vl, vl); + vuint8mf2_t v63 = __riscv_vmv_v_x_u8mf2(63, vl); + + for (size_t i = 0; i < n; ++i) { + // Butterfly + vuint8mf2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl); + vuint8mf2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl); + vuint8mf2_t va = __riscv_vaaddu(va0, va1, 0, vl); + va = __riscv_vreinterpret_u8mf2( + __riscv_vsrl(__riscv_vreinterpret_u16mf2(va), 2, vl / 2)); + va = __riscv_vand(va, v63, vl); + vuint8mf2_t vb = __riscv_vssubu(v63, va, vl); + vuint8mf2_t vX0a = __riscv_vsaddu(vX0, va, vl); + vuint8mf2_t vX1b = __riscv_vsaddu(vX1, vb, vl); + vuint8mf2_t vX0b = __riscv_vsaddu(vX0, vb, vl); + vuint8mf2_t vX1a = __riscv_vsaddu(vX1, va, vl); + vY0 = __riscv_vminu(vX1b, vX0a, vl); + vY1 = __riscv_vminu(vX1a, vX0b, vl); + + vuint8m1_t vX1ba = __riscv_vreinterpret_u8m1( + __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl)); + vuint8m1_t vY01 = __riscv_vreinterpret_u8m1( + __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl)); + + __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY01, vX1ba, vl * 2), vl * 2); + + // Renormalize + vuint8m1_t vmin = + __riscv_vrgather(__riscv_vredminu(vY01, vY01, vl * 
2), 0, vl * 2); + vY01 = __riscv_vsub(vY01, vmin, vl * 2); + + vY0 = __riscv_vlmul_trunc_u8mf2(vY01); + vY1 = __riscv_vlmul_trunc_u8mf2(__riscv_vslidedown(vY01, vl, vl)); + + vuint8mf2_t tmp; // Swap pointers to old and new metrics + tmp = vX0; + vX0 = vY0; + vY0 = tmp; + tmp = vX1; + vX1 = vY1; + vY1 = tmp; + } + if (n & 1) { + __riscv_vse8(X, vY0, vl); + __riscv_vse8(X + vl, vY1, vl); + __riscv_vse8(Y, vX0, vl); + __riscv_vse8(Y + vl, vX1, vl); + } else { + __riscv_vse8(X, vX0, vl); + __riscv_vse8(X + vl, vX1, vl); + __riscv_vse8(Y, vY0, vl); + __riscv_vse8(Y + vl, vY1, vl); + } + } +} +#endif /*LV_HAVE_RVV*/ + #endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/ diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 2c160b2f2..588db44f1 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -93,12 +93,28 @@ execute_process( OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE) +try_compile( + HAVE_RVV_INTRINSICS + ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/cmake/Checks/check-rvv-intrinsics.c +) +if(HAVE_RVV_INTRINSICS) + message(STATUS "Checking RVV intrinsics - found") +else() + message(STATUS "Checking RVV intrinsics - not found") +endif() + macro(check_arch arch_name) set(flags ${ARGN}) set(have_${arch_name} TRUE) + + string(SUBSTRING "${arch_name}" 0 2 arch_prefix) foreach(flag ${flags}) if(MSVC AND (${flag} STREQUAL "/arch:SSE2" OR ${flag} STREQUAL "/arch:SSE")) # SSE/SSE2 is supported in MSVC since VS 2005 but flag not available when compiling 64-bit so do not check + elseif("${arch_prefix}" STREQUAL "rv" AND NOT HAVE_RVV_INTRINSICS) + message(STATUS "Skipping ${arch_name} due to missing RVV intrinsics support") + set(have_${arch_name} FALSE) else() include(CheckCXXCompilerFlag) set(have_flag have${flag}) diff --git a/tmpl/volk_cpu.tmpl.c b/tmpl/volk_cpu.tmpl.c index a4a06b0f2..2cf2fa34d 100644 --- a/tmpl/volk_cpu.tmpl.c +++ b/tmpl/volk_cpu.tmpl.c @@ -49,7 +49,7 @@ static int i_can_has_${arch.name} (void) { #if defined(CPU_FEATURES_ARCH_MIPS) 
if (GetMipsInfo().features.${check} == 0){ return 0; } #endif - %elif "riscv" in arch.name: + %elif "riscv" in arch.name or arch.name[:2] == "rv": #if defined(CPU_FEATURES_ARCH_RISCV) if (GetRiscvInfo().features.${check} == 0){ return 0; } #endif From 3e202a7d29c133188acbc193e43bc9af37a2d3c1 Mon Sep 17 00:00:00 2001 From: Olaf Bernstein Date: Mon, 28 Oct 2024 23:19:24 +0100 Subject: [PATCH 15/67] add braces around one-liners for loops Signed-off-by: Olaf Bernstein --- kernels/volk/volk_32f_acos_32f.h | 36 ++++++++++++------- kernels/volk/volk_32f_cos_32f.h | 21 +++++++---- kernels/volk/volk_32f_index_min_32u.h | 2 +- kernels/volk/volk_32f_invsqrt_32f.h | 9 +++-- kernels/volk/volk_32f_log2_32f.h | 3 +- kernels/volk/volk_32f_sin_32f.h | 6 ++-- kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 3 +- .../volk/volk_32fc_s32fc_x2_rotator2_32fc.h | 6 ++-- kernels/volk/volk_32u_popcntpuppet_32u.h | 12 ++++--- kernels/volk/volk_64u_popcntpuppet_64u.h | 15 +++++--- kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 12 ++++--- 11 files changed, 83 insertions(+), 42 deletions(-) diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h index dd4813ac6..4331987cf 100644 --- a/kernels/volk/volk_32f_acos_32f.h +++ b/kernels/volk/volk_32f_acos_32f.h @@ -102,13 +102,15 @@ static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector, x = _mm256_add_ps( z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + } x = _mm256_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm256_fmadd_ps( y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); @@ -171,14 +173,16 @@ volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_p x = 
_mm256_add_ps( z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + } x = _mm256_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); @@ -240,13 +244,15 @@ volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu condition = _mm_cmplt_ps(z, fones); x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + } x = _mm_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); condition = _mm_cmpgt_ps(z, fones); @@ -315,13 +321,15 @@ static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector, x = _mm256_add_ps( z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones))); + } x = _mm256_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm256_fmadd_ps( y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); @@ -384,14 +392,16 @@ volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_p x = _mm256_add_ps( z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition)); 
- for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x)))); + } x = _mm256_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours)); condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS); @@ -453,14 +463,16 @@ volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu condition = _mm_cmplt_ps(z, fones); x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); - for (i = 0; i < 2; i++) + for (i = 0; i < 2; i++) { x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x)))); + } x = _mm_div_ps(fones, x); y = fzeroes; - for (j = ACOS_TERMS - 1; j >= 0; j--) + for (j = ACOS_TERMS - 1; j >= 0; j--) { y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1, j) / (2 * j + 1))); + } y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); condition = _mm_cmpgt_ps(z, fones); diff --git a/kernels/volk/volk_32f_cos_32f.h b/kernels/volk/volk_32f_cos_32f.h index aa264c07d..854dd00e5 100644 --- a/kernels/volk/volk_32f_cos_32f.h +++ b/kernels/volk/volk_32f_cos_32f.h @@ -127,8 +127,9 @@ static inline void volk_32f_cos_32f_a_avx512f(float* cosVector, cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); + } s = _mm512_div_ps(s, ftwos); sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); @@ -224,8 +225,9 @@ volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int n cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } s = _mm256_div_ps(s, ftwos); sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); @@ -335,8 +337,9 @@ volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_p 
cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } s = _mm256_div_ps(s, ftwos); sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); @@ -442,8 +445,9 @@ volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); + } s = _mm_div_ps(s, ftwos); sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); @@ -546,8 +550,9 @@ static inline void volk_32f_cos_32f_u_avx512f(float* cosVector, cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); + } s = _mm512_div_ps(s, ftwos); sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); @@ -644,8 +649,9 @@ volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int n cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } s = _mm256_div_ps(s, ftwos); sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); @@ -755,8 +761,9 @@ volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_p cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s)); + } s = _mm256_div_ps(s, ftwos); sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s)); diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h index 44e4c85d5..7d01fbb48 100644 --- a/kernels/volk/volk_32f_index_min_32u.h +++ b/kernels/volk/volk_32f_index_min_32u.h @@ -42,7 +42,7 @@ * * volk_32f_index_min_32u(out, in, N); * - * ("minimum is %1.2f at index %u\n", in[*out], *out); + * printf("minimum is %1.2f at index %u\n", in[*out], *out); * * volk_free(in); * volk_free(out); diff --git a/kernels/volk/volk_32f_invsqrt_32f.h b/kernels/volk/volk_32f_invsqrt_32f.h index b5a7c8f84..838c99274 100644 --- 
a/kernels/volk/volk_32f_invsqrt_32f.h +++ b/kernels/volk/volk_32f_invsqrt_32f.h @@ -97,8 +97,9 @@ volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int nu } number = eighthPoints * 8; - for (; number < num_points; number++) + for (; number < num_points; number++) { *cPtr++ = Q_rsqrt(*aPtr++); + } } #endif /* LV_HAVE_AVX */ @@ -156,8 +157,9 @@ volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num cPtr += 4; } - for (number = quarter_points * 4; number < num_points; number++) + for (number = quarter_points * 4; number < num_points; number++) { *cPtr++ = Q_rsqrt(*aPtr++); + } } #endif /* LV_HAVE_NEON */ @@ -198,8 +200,9 @@ volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int nu } number = eighthPoints * 8; - for (; number < num_points; number++) + for (; number < num_points; number++) { *cPtr++ = Q_rsqrt(*aPtr++); + } } #endif /* LV_HAVE_AVX */ diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index fc1d744c2..47a7cbe38 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -95,8 +95,9 @@ volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num const float* aPtr = aVector; unsigned int number = 0; - for (number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *bPtr++ = log2f_non_ieee(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/kernels/volk/volk_32f_sin_32f.h b/kernels/volk/volk_32f_sin_32f.h index d03ab51db..a02f22601 100644 --- a/kernels/volk/volk_32f_sin_32f.h +++ b/kernels/volk/volk_32f_sin_32f.h @@ -127,8 +127,9 @@ static inline void volk_32f_sin_32f_a_avx512f(float* sinVector, cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); + } s = _mm512_div_ps(s, ftwos); sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); @@ -520,8 +521,9 @@ static inline void 
volk_32f_sin_32f_u_avx512f(float* sinVector, cp1), s); - for (i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s)); + } s = _mm512_div_ps(s, ftwos); sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s)); diff --git a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index 53a8a1bf3..b9a837146 100644 --- a/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++ b/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -341,8 +341,9 @@ static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, result[k] += center_point_array[2] * thrd + center_point_array[3] * frth; } } - for (k = 0; k < 8; k += 2) + for (k = 0; k < 8; k += 2) { result[k] = result[k] + result[k + 1]; + } *target = result[0] + result[2] + result[4] + result[6]; diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h index 6a9018f43..d94f55f7d 100644 --- a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h @@ -853,8 +853,9 @@ static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvv(lv_32fc_t* outVector, } lv_32fc_t ph = lv_cmake(__riscv_vfmv_f(phr), __riscv_vfmv_f(phi)); - for (size_t i = 0; i < vlmax - vl; ++i) + for (size_t i = 0; i < vlmax - vl; ++i) { ph /= *phase_inc; // we're going backwards + } *phase = ph * 1.0f / hypotf(lv_creal(ph), lv_cimag(ph)); } #endif /*LV_HAVE_RVV*/ @@ -929,8 +930,9 @@ static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(lv_32fc_t* outVector, } lv_32fc_t ph = lv_cmake(__riscv_vfmv_f(phr), __riscv_vfmv_f(phi)); - for (size_t i = 0; i < vlmax - vl; ++i) + for (size_t i = 0; i < vlmax - vl; ++i) { ph /= *phase_inc; // we're going backwards + } *phase = ph * 1.0f / hypotf(lv_creal(ph), lv_cimag(ph)); } #endif /*LV_HAVE_RVVSEG*/ diff --git a/kernels/volk/volk_32u_popcntpuppet_32u.h b/kernels/volk/volk_32u_popcntpuppet_32u.h index aa0c4ca08..b808eb00c 100644 --- 
a/kernels/volk/volk_32u_popcntpuppet_32u.h +++ b/kernels/volk/volk_32u_popcntpuppet_32u.h @@ -18,8 +18,9 @@ static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_32u_popcnt_generic(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_GENERIC */ @@ -28,8 +29,9 @@ static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_32u_popcnt_a_sse4_2(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_SSE4_2 */ @@ -38,8 +40,9 @@ static inline void volk_32u_popcntpuppet_32u_rvv(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_32u_popcnt_rvv(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_RVV */ @@ -48,8 +51,9 @@ static inline void volk_32u_popcntpuppet_32u_rva22(uint32_t* outVector, const uint32_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_32u_popcnt_rva22(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_RVA22V */ diff --git a/kernels/volk/volk_64u_popcntpuppet_64u.h b/kernels/volk/volk_64u_popcntpuppet_64u.h index a1ecc487b..245aeba19 100644 --- a/kernels/volk/volk_64u_popcntpuppet_64u.h +++ b/kernels/volk/volk_64u_popcntpuppet_64u.h @@ -19,8 +19,9 @@ static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_64u_popcnt_generic(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_GENERIC */ @@ -29,8 +30,9 @@ static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, const 
uint64_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_64u_popcnt_a_sse4_2(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_SSE4_2 */ @@ -39,8 +41,9 @@ static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_64u_popcnt_neon(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_NEON */ @@ -49,8 +52,9 @@ static inline void volk_64u_popcntpuppet_64u_rvv(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_64u_popcnt_rvv(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_RVV */ @@ -59,8 +63,9 @@ static inline void volk_64u_popcntpuppet_64u_rva22(uint64_t* outVector, const uint64_t* inVector, unsigned int num_points) { - for (size_t i = 0; i < num_points; ++i) + for (size_t i = 0; i < num_points; ++i) { volk_64u_popcnt_rva22(outVector + i, inVector[i]); + } } #endif /* LV_HAVE_RVA22V */ diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h index aa7fc3e01..cb2db11ac 100644 --- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h @@ -63,11 +63,14 @@ static inline void renormalize(unsigned char* X) int i; unsigned char min = X[0]; - for (i = 0; i < NUMSTATES; i++) - if (min > X[i]) + for (i = 0; i < NUMSTATES; i++) { + if (min > X[i]) { min = X[i]; - for (i = 0; i < NUMSTATES; i++) + } + } + for (i = 0; i < NUMSTATES; i++) { X[i] -= min; + } } @@ -91,8 +94,9 @@ static inline void BFLY(int i, int PRECISIONSHIFT = 2; metricsum = 1; - for (j = 0; j < RATE; j++) + for (j = 0; j < RATE; j++) { metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]); + } metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT; unsigned char max = ((RATE * 
((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT); From 0d3df6f1193233fef003e82729e3dd6a600ffd7a Mon Sep 17 00:00:00 2001 From: Olaf Bernstein Date: Mon, 28 Oct 2024 23:20:15 +0100 Subject: [PATCH 16/67] use with submodules recursive in RVV CI Signed-off-by: Olaf Bernstein --- .github/workflows/run-tests-rvv.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-tests-rvv.yml b/.github/workflows/run-tests-rvv.yml index b8184bb7b..10ff857d7 100644 --- a/.github/workflows/run-tests-rvv.yml +++ b/.github/workflows/run-tests-rvv.yml @@ -15,9 +15,10 @@ jobs: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 + with: + submodules: "recursive" - name: Install packages run: | - git submodule update --init --recursive sudo apt-get update -q -y sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18 mkdir build From f1cd0dae16eb618b73e324c6cc43fbdf310c7033 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Wed, 22 Dec 2021 13:23:46 +0100 Subject: [PATCH 17/67] gtest: Start work on new test infrastructure Use GTest to run tests. This should help us in quite a few places. 
Signed-off-by: Johannes Demel --- CMakeLists.txt | 5 ++ tests/CMakeLists.txt | 54 +++++++++++++++++++ tests/test_volk_32fc_x2_multiply_32fc.cc | 66 ++++++++++++++++++++++++ 3 files changed, 125 insertions(+) create mode 100644 tests/CMakeLists.txt create mode 100644 tests/test_volk_32fc_x2_multiply_32fc.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 01f0acb31..f91f88af3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -371,6 +371,11 @@ message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF") ######################################################################## add_subdirectory(lib) +######################################################################## +# Add tests +######################################################################## +add_subdirectory(tests) + ######################################################################## # And the utility apps ######################################################################## diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 000000000..da0adbc40 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,54 @@ +# +# Copyright 2021 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +add_subdirectory(googletest) + +add_executable( + volk_tests + test_volk_32fc_x2_multiply_32fc.cc +) +target_link_libraries( + volk_tests + gtest_main + volk +) + +include(GoogleTest) +gtest_discover_tests(volk_tests) + + +target_include_directories(volk_tests + PRIVATE $ + PRIVATE $ + PRIVATE $ + PRIVATE $ + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} +) + +# if(ENABLE_STATIC_LIBS) +# target_link_libraries(volk_tests PRIVATE volk_static) +# set_target_properties(volk_tests PROPERTIES LINK_FLAGS "-static") +# else() +# target_link_libraries(volk_tests PRIVATE volk) +# endif() + +# install( +# TARGETS volk_tests +# DESTINATION bin +# COMPONENT "volk" +# ) \ No newline at end of file diff --git a/tests/test_volk_32fc_x2_multiply_32fc.cc b/tests/test_volk_32fc_x2_multiply_32fc.cc new file mode 100644 index 000000000..fab99310f --- /dev/null +++ b/tests/test_volk_32fc_x2_multiply_32fc.cc @@ -0,0 +1,66 @@ +#include +#include +#include + + +template +::testing::AssertionResult AreComplexFloatingPointArraysAlmostEqual(const T& expected, + const T& actual) +{ + ::testing::AssertionResult result = ::testing::AssertionFailure(); + if (expected.size() != actual.size()) { + return result << "expected result size=" << expected.size() + << " differs from actual size=" << actual.size(); + } + const unsigned long length = expected.size(); + + int errorsFound = 0; + const char* separator = " "; + for (unsigned long index = 0; index < length; index++) { + auto expected_real = ::testing::internal::FloatingPoint(expected[index].real()); + auto expected_imag = ::testing::internal::FloatingPoint(expected[index].imag()); + auto actual_real = ::testing::internal::FloatingPoint(actual[index].real()); + auto actual_imag = ::testing::internal::FloatingPoint(actual[index].imag()); + if (not expected_real.AlmostEquals(actual_real) or + not expected_imag.AlmostEquals(actual_imag)) + + { + if (errorsFound == 0) { + result << "Differences found:"; + } + if 
(errorsFound < 3) { + result << separator << expected[index] << " != " << actual[index] << " @ " + << index; + separator = ",\n"; + } + errorsFound++; + } + } + if (errorsFound > 0) { + result << separator << errorsFound << " differences in total"; + return result; + } + return ::testing::AssertionSuccess(); +} + + +TEST(Multiply, AVX) +{ + const size_t vector_length = 32; + auto vec0 = volk::vector(vector_length); + auto vec1 = volk::vector(vector_length); + auto result = volk::vector(vector_length); + for (size_t i = 0; i < vector_length; ++i) { + vec0[i] = std::complex(i * 3.14, i * 0.45); + vec1[i] = std::complex(i * -2.78, i * 5.44); + } + + auto expected = volk::vector(vector_length); + for (size_t i = 0; i < vector_length; ++i) { + expected[i] = vec0[i] * vec1[i]; + } + + volk_32fc_x2_multiply_32fc_manual(result.data(), vec0.data(), vec1.data(), vector_length); + // EXPECT_ITERABLE_COMPLEX_FLOAT_EQ(volk::vector, expected, result); + EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, result)); +} From ed26357a7236ba75b616ab67375e6806fe9da641 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sat, 19 Nov 2022 19:52:39 +0100 Subject: [PATCH 18/67] ci: Add a second test example This is an ongoing effort to introduce some more sophisticated QA tests. We use googletest and write custom test suits. This approach requires more manual work but yields a more accurate result as well. 
Signed-off-by: Johannes Demel --- tests/CMakeLists.txt | 24 ++-- tests/test_volk_32f_x3_sum_of_poly_32f.cc | 141 ++++++++++++++++++++++ tests/test_volk_32fc_x2_multiply_32fc.cc | 141 ++++++++++++++-------- tests/volk_test.cc | 80 ++++++++++++ tests/volk_test.h | 76 ++++++++++++ tmpl/volk.tmpl.c | 9 +- 6 files changed, 407 insertions(+), 64 deletions(-) create mode 100644 tests/test_volk_32f_x3_sum_of_poly_32f.cc create mode 100644 tests/volk_test.cc create mode 100644 tests/volk_test.h diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index da0adbc40..860adb74f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,30 +1,28 @@ # -# Copyright 2021 Free Software Foundation, Inc. +# Copyright 2022, 2024 Johannes Demel # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# This file is part of VOLK. # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
+# SPDX-License-Identifier: LGPL-3.0-or-later # add_subdirectory(googletest) +find_package(fmt REQUIRED) + +file(GLOB volk_test_files "test_*.cc") + add_executable( volk_tests - test_volk_32fc_x2_multiply_32fc.cc + volk_test.cc + ${volk_test_files} ) + target_link_libraries( volk_tests gtest_main volk + fmt::fmt ) include(GoogleTest) diff --git a/tests/test_volk_32f_x3_sum_of_poly_32f.cc b/tests/test_volk_32f_x3_sum_of_poly_32f.cc new file mode 100644 index 000000000..59c50ebfb --- /dev/null +++ b/tests/test_volk_32f_x3_sum_of_poly_32f.cc @@ -0,0 +1,141 @@ +/* -*- c++ -*- */ +/* + * Copyright 2022 Johannes Demel + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +#include "volk_test.h" +#include +#include +#include +#include +#include +#include +#include + + +class volk_32f_x3_sum_of_poly_32f_test : public VolkTest +{ +protected: + void SetUp() override + { + initialize_implementation_names(volk_32f_x3_sum_of_poly_32f_get_func_desc()); + initialize_data(GetParam()); + } + + void initialize_data(const size_t length) + { + vector_length = length; + + vec0 = volk::vector(length); + for (size_t i = 0; i < length; ++i) { + vec0[i] = float(2.8f + i * 0.14f); + } + + ua_vec0 = volk::vector({ 0.0f }); + for (auto v : vec0) { + ua_vec0.push_back(v); + } + + center_points = volk::vector({ 4.4, 2.1, 0.3, 0.05, 4.1 }); + ua_center_points = volk::vector({ + 0.0, + }); + for (auto v : center_points) { + ua_center_points.push_back(v); + } + + cutoff = volk::vector({ -1.5 }); + ua_cutoff = cutoff; + ua_cutoff.push_back(cutoff.at(0)); + + expected = 0.0f; + for (auto value : vec0) { + value = std::max(value, cutoff.at(0)); + auto sq = value * value; + auto cube = sq * value; + auto quartic = value * cube; + expected += value * center_points[0] + sq * center_points[1] + + cube * center_points[2] + quartic * center_points[3]; + } + expected += center_points[4] * float(length); + + result = volk::vector(1, 0.0); + 
ua_result.push_back(result.at(0)); + ua_result.push_back(result.at(0)); + } + + void execute_aligned(const std::string impl_name) + { + volk_32f_x3_sum_of_poly_32f_manual(result.data(), + vec0.data(), + center_points.data(), + cutoff.data(), + vector_length, + impl_name.c_str()); + } + + void execute_unaligned(const std::string impl_name) + { + volk_32f_x3_sum_of_poly_32f_manual(ua_result.data() + 1, + ua_vec0.data() + 1, + ua_center_points.data() + 1, + ua_cutoff.data() + 1, + vector_length, + impl_name.c_str()); + } + + // void TearDown() override {} + size_t vector_length; + volk::vector vec0; + volk::vector ua_vec0; + volk::vector center_points; + volk::vector ua_center_points; + volk::vector cutoff; + volk::vector ua_cutoff; + volk::vector result; + volk::vector ua_result; + float expected = 0.0f; +}; + + +TEST_P(volk_32f_x3_sum_of_poly_32f_test, aligned) +{ + for (auto name : implementation_names) { + auto tol = std::max(expected * 1e-5, 1e-5); + fmt::print( + "test aligned implementation: {:>12}, size={} ...", name, vector_length); + auto start = std::chrono::steady_clock::now(); + + execute_aligned(name); + + std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; + fmt::print("\tduration={}\n", elapsed); + EXPECT_NEAR(result.at(0), expected, tol); + } +} + +TEST_P(volk_32f_x3_sum_of_poly_32f_test, unaligned) +{ + for (auto name : unaligned_impl_names) { + auto tol = std::max(expected * 1e-5, 1e-5); + fmt::print( + "test unaligned implementation: {:>12}, size={} ...", name, vector_length); + auto start = std::chrono::steady_clock::now(); + + execute_unaligned(name); + + std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; + fmt::print("\tduration={}\n", elapsed); + EXPECT_NEAR(ua_result.at(1), expected, tol); + } +} + + +INSTANTIATE_TEST_SUITE_P(volk_32f_x3_sum_of_poly_32f, + volk_32f_x3_sum_of_poly_32f_test, + // testing::Values(8, 32) + testing::Values(7, 32, 128, 1023, 65535, 131071)); diff --git 
a/tests/test_volk_32fc_x2_multiply_32fc.cc b/tests/test_volk_32fc_x2_multiply_32fc.cc index fab99310f..77daebebb 100644 --- a/tests/test_volk_32fc_x2_multiply_32fc.cc +++ b/tests/test_volk_32fc_x2_multiply_32fc.cc @@ -1,66 +1,109 @@ +/* -*- c++ -*- */ +/* + * Copyright 2022 Johannes Demel + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +#include "volk_test.h" +#include +#include +#include #include #include #include +#include -template -::testing::AssertionResult AreComplexFloatingPointArraysAlmostEqual(const T& expected, - const T& actual) +class volk_32fc_x2_multiply_32fc_test : public VolkTest { - ::testing::AssertionResult result = ::testing::AssertionFailure(); - if (expected.size() != actual.size()) { - return result << "expected result size=" << expected.size() - << " differs from actual size=" << actual.size(); +protected: + void SetUp() override + { + initialize_implementation_names(volk_32fc_x2_multiply_32fc_get_func_desc()); + initialize_data(GetParam()); } - const unsigned long length = expected.size(); - - int errorsFound = 0; - const char* separator = " "; - for (unsigned long index = 0; index < length; index++) { - auto expected_real = ::testing::internal::FloatingPoint(expected[index].real()); - auto expected_imag = ::testing::internal::FloatingPoint(expected[index].imag()); - auto actual_real = ::testing::internal::FloatingPoint(actual[index].real()); - auto actual_imag = ::testing::internal::FloatingPoint(actual[index].imag()); - if (not expected_real.AlmostEquals(actual_real) or - not expected_imag.AlmostEquals(actual_imag)) - - { - if (errorsFound == 0) { - result << "Differences found:"; - } - if (errorsFound < 3) { - result << separator << expected[index] << " != " << actual[index] << " @ " - << index; - separator = ",\n"; - } - errorsFound++; + + void initialize_data(const size_t length) + { + vector_length = length; + vec0 = volk::vector(length); + vec1 = volk::vector(length); + result = 
volk::vector(length); + for (size_t i = 0; i < length; ++i) { + vec0[i] = std::complex(i * 3.14, i * 0.45); + vec1[i] = std::complex(i * -2.78, i * 5.44); } + + expected = volk::vector(length); + for (size_t i = 0; i < length; ++i) { + expected[i] = vec0[i] * vec1[i]; + } + + // This is a hacky solution to have unaligned tests. + ua_result = result; + ua_result.at(0) = expected.at(0); } - if (errorsFound > 0) { - result << separator << errorsFound << " differences in total"; - return result; + + void execute_aligned(const std::string impl_name) + { + volk_32fc_x2_multiply_32fc_manual( + result.data(), vec0.data(), vec1.data(), vector_length, impl_name.c_str()); } - return ::testing::AssertionSuccess(); -} + void execute_unaligned(const std::string impl_name) + { + volk_32fc_x2_multiply_32fc_manual(ua_result.data() + 1, + vec0.data() + 1, + vec1.data() + 1, + vector_length - 1, + impl_name.c_str()); + } + + // void TearDown() override {} + size_t vector_length; + volk::vector vec0; + volk::vector vec1; + volk::vector result; + volk::vector expected; + + volk::vector ua_result; +}; -TEST(Multiply, AVX) + +TEST_P(volk_32fc_x2_multiply_32fc_test, aligned) { - const size_t vector_length = 32; - auto vec0 = volk::vector(vector_length); - auto vec1 = volk::vector(vector_length); - auto result = volk::vector(vector_length); - for (size_t i = 0; i < vector_length; ++i) { - vec0[i] = std::complex(i * 3.14, i * 0.45); - vec1[i] = std::complex(i * -2.78, i * 5.44); - } + for (auto name : implementation_names) { + fmt::print( + "test aligned implementation: {:>12}, size={} ...", name, vector_length); + auto start = std::chrono::steady_clock::now(); + + execute_aligned(name); - auto expected = volk::vector(vector_length); - for (size_t i = 0; i < vector_length; ++i) { - expected[i] = vec0[i] * vec1[i]; + std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; + fmt::print("\tduration={}\n", elapsed); + 
EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, result)); } +} + +TEST_P(volk_32fc_x2_multiply_32fc_test, unaligned) +{ + for (auto name : unaligned_impl_names) { + fmt::print( + "test unaligned implementation: {:>12}, size={} ...", name, vector_length); + auto start = std::chrono::steady_clock::now(); + + execute_unaligned(name); - volk_32fc_x2_multiply_32fc_manual(result.data(), vec0.data(), vec1.data(), vector_length); - // EXPECT_ITERABLE_COMPLEX_FLOAT_EQ(volk::vector, expected, result); - EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, result)); + std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; + fmt::print("\tduration={}\n", elapsed); + EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, ua_result)); + } } + + +INSTANTIATE_TEST_SUITE_P(volk_32fc_x2_multiply_32fc, + volk_32fc_x2_multiply_32fc_test, + testing::Values(7, 32, 128, 1023, 131071)); diff --git a/tests/volk_test.cc b/tests/volk_test.cc new file mode 100644 index 000000000..ac580b40d --- /dev/null +++ b/tests/volk_test.cc @@ -0,0 +1,80 @@ +/* -*- c++ -*- */ +/* + * Copyright 2022 Johannes Demel + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +#include +#include +#include +#include +#include + + +template +::testing::AssertionResult AreComplexFloatingPointArraysAlmostEqual(const T& expected, + const T& actual) +{ + ::testing::AssertionResult result = ::testing::AssertionFailure(); + if (expected.size() != actual.size()) { + return result << "expected result size=" << expected.size() + << " differs from actual size=" << actual.size(); + } + const unsigned long length = expected.size(); + + int errorsFound = 0; + const char* separator = " "; + for (unsigned long index = 0; index < length; index++) { + auto expected_real = ::testing::internal::FloatingPoint(expected[index].real()); + auto expected_imag = ::testing::internal::FloatingPoint(expected[index].imag()); + auto actual_real = 
::testing::internal::FloatingPoint(actual[index].real()); + auto actual_imag = ::testing::internal::FloatingPoint(actual[index].imag()); + if (not expected_real.AlmostEquals(actual_real) or + not expected_imag.AlmostEquals(actual_imag)) + + { + if (errorsFound == 0) { + result << "Differences found:"; + } + if (errorsFound < 3) { + result << separator << expected[index] << " != " << actual[index] << " @ " + << index; + separator = ",\n"; + } + errorsFound++; + } + } + if (errorsFound > 0) { + result << separator << errorsFound << " differences in total"; + return result; + } + return ::testing::AssertionSuccess(); +} + +std::vector get_kernel_implementation_name_list(volk_func_desc_t desc) +{ + std::vector names; + for (size_t i = 0; i < desc.n_impls; i++) { + names.push_back(std::string(desc.impl_names[i])); + } + std::sort(names.begin(), names.end()); + return names; +} + +std::tuple, std::vector> +separate_implementations_by_alignment(std::vector names) +{ + std::vector aligned; + std::vector unaligned; + for (auto name : names) { + if (name.rfind("a_", 0) == 0) { + aligned.push_back(name); + } else { + unaligned.push_back(name); + } + } + return { aligned, unaligned }; +} diff --git a/tests/volk_test.h b/tests/volk_test.h new file mode 100644 index 000000000..20f8780bb --- /dev/null +++ b/tests/volk_test.h @@ -0,0 +1,76 @@ +/* -*- c++ -*- */ +/* + * Copyright 2022 Johannes Demel + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +#include +#include +#include +#include +#include + + +std::vector get_kernel_implementation_name_list(volk_func_desc_t desc); + +std::tuple, std::vector> +separate_implementations_by_alignment(std::vector names); + +class VolkTest : public ::testing::TestWithParam +{ +protected: + void initialize_implementation_names(volk_func_desc_t desc) + { + implementation_names = get_kernel_implementation_name_list(desc); + std::tie(aligned_impl_names, unaligned_impl_names) = + 
separate_implementations_by_alignment(implementation_names); + } + + std::vector implementation_names; + std::vector aligned_impl_names; + std::vector unaligned_impl_names; +}; + + +template +::testing::AssertionResult AreComplexFloatingPointArraysAlmostEqual(const T& expected, + const T& actual) +{ + ::testing::AssertionResult result = ::testing::AssertionFailure(); + if (expected.size() != actual.size()) { + return result << "expected result size=" << expected.size() + << " differs from actual size=" << actual.size(); + } + const unsigned long length = expected.size(); + + int errorsFound = 0; + const char* separator = " "; + for (unsigned long index = 0; index < length; index++) { + auto expected_real = ::testing::internal::FloatingPoint(expected[index].real()); + auto expected_imag = ::testing::internal::FloatingPoint(expected[index].imag()); + auto actual_real = ::testing::internal::FloatingPoint(actual[index].real()); + auto actual_imag = ::testing::internal::FloatingPoint(actual[index].imag()); + if (not expected_real.AlmostEquals(actual_real) or + not expected_imag.AlmostEquals(actual_imag)) + + { + if (errorsFound == 0) { + result << "Differences found:"; + } + if (errorsFound < 3) { + result << separator << expected[index] << " != " << actual[index] << " @ " + << index; + separator = ",\n"; + } + errorsFound++; + } + } + if (errorsFound > 0) { + result << separator << errorsFound << " differences in total"; + return result; + } + return ::testing::AssertionSuccess(); +} diff --git a/tmpl/volk.tmpl.c b/tmpl/volk.tmpl.c index 62a9400ad..63c52b92c 100644 --- a/tmpl/volk.tmpl.c +++ b/tmpl/volk.tmpl.c @@ -169,14 +169,19 @@ static inline void __${kern.name}(${kern.arglist_full}) ${kern.pname} ${kern.name}_u = &__${kern.name}_u; ${kern.pname} ${kern.name} = &__${kern.name}; -void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name) +${kern.pname} ${kern.name}_get_impl(const char* impl_name) { const int index = volk_get_index( 
get_machine()->${kern.name}_impl_names, get_machine()->${kern.name}_n_impls, impl_name ); - get_machine()->${kern.name}_impls[index]( + return get_machine()->${kern.name}_impls[index]; +} + +void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name) +{ + ${kern.name}_get_impl(impl_name)( ${kern.arglist_names} ); } From a06853236d3625d95385e5e88dbd17ec72b935b1 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sun, 3 Dec 2023 14:04:10 +0100 Subject: [PATCH 19/67] tests: Add a log info print test Potentially, we want to be more verbose. This is a first effort to improve in this area. Signed-off-by: Johannes Demel --- tests/googletest | 1 + tests/test_volk_32fc_x2_multiply_32fc.cc | 11 ++++++++++- tests/volk_test.cc | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) create mode 160000 tests/googletest diff --git a/tests/googletest b/tests/googletest new file mode 160000 index 000000000..9a32aee22 --- /dev/null +++ b/tests/googletest @@ -0,0 +1 @@ +Subproject commit 9a32aee22d771387c494be2d8519fbdf46a713b2 diff --git a/tests/test_volk_32fc_x2_multiply_32fc.cc b/tests/test_volk_32fc_x2_multiply_32fc.cc index 77daebebb..aa527afce 100644 --- a/tests/test_volk_32fc_x2_multiply_32fc.cc +++ b/tests/test_volk_32fc_x2_multiply_32fc.cc @@ -51,6 +51,7 @@ class volk_32fc_x2_multiply_32fc_test : public VolkTest { volk_32fc_x2_multiply_32fc_manual( result.data(), vec0.data(), vec1.data(), vector_length, impl_name.c_str()); + EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, result)); } void execute_unaligned(const std::string impl_name) @@ -91,6 +92,9 @@ TEST_P(volk_32fc_x2_multiply_32fc_test, aligned) TEST_P(volk_32fc_x2_multiply_32fc_test, unaligned) { for (auto name : unaligned_impl_names) { + RecordProperty("aligned", false); + RecordProperty("implementation", name); + RecordProperty("size", vector_length); fmt::print( "test unaligned implementation: {:>12}, size={} ...", name, vector_length); auto start = std::chrono::steady_clock::now(); @@ -106,4 
+110,9 @@ TEST_P(volk_32fc_x2_multiply_32fc_test, unaligned) INSTANTIATE_TEST_SUITE_P(volk_32fc_x2_multiply_32fc, volk_32fc_x2_multiply_32fc_test, - testing::Values(7, 32, 128, 1023, 131071)); + testing::Values(7, 32, 128, 1023, 131071), + testing::PrintToStringParamName() + // [](const testing::TestParamInfo& info) { + // return fmt::format("{}", info.param); + // } +); diff --git a/tests/volk_test.cc b/tests/volk_test.cc index ac580b40d..6c0589a06 100644 --- a/tests/volk_test.cc +++ b/tests/volk_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include From 8f43c21238e95bbf3e3f493adcbce9e5fc592ddf Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Mon, 19 Aug 2024 22:12:28 +0200 Subject: [PATCH 20/67] gtest: Make gtest an install dependency It's probably a good thing to have googletest as an install dependency. Signed-off-by: Johannes Demel --- tests/CMakeLists.txt | 35 ++++++++++------------- tests/googletest | 1 - tests/test_volk_32f_x3_sum_of_poly_32f.cc | 1 - 3 files changed, 15 insertions(+), 22 deletions(-) delete mode 160000 tests/googletest diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 860adb74f..fd9d8b8c4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,9 +6,17 @@ # SPDX-License-Identifier: LGPL-3.0-or-later # -add_subdirectory(googletest) +if(NOT ENABLE_TESTING) + return() +endif(NOT ENABLE_TESTING) -find_package(fmt REQUIRED) +find_package(fmt) +find_package(GTest) + +if(NOT fmt_FOUND OR NOT GTest_FOUND) + message(warning "Missing fmtlib and/or googletest for this test suite") + return() +endif(NOT fmt_FOUND OR NOT GTest_FOUND) file(GLOB volk_test_files "test_*.cc") @@ -18,11 +26,11 @@ add_executable( ${volk_test_files} ) -target_link_libraries( - volk_tests - gtest_main - volk - fmt::fmt +target_link_libraries(volk_tests + PRIVATE + GTest::gtest_main + volk + fmt::fmt ) include(GoogleTest) @@ -37,16 +45,3 @@ target_include_directories(volk_tests PRIVATE ${CMAKE_CURRENT_BINARY_DIR} PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR} ) - -# if(ENABLE_STATIC_LIBS) -# target_link_libraries(volk_tests PRIVATE volk_static) -# set_target_properties(volk_tests PROPERTIES LINK_FLAGS "-static") -# else() -# target_link_libraries(volk_tests PRIVATE volk) -# endif() - -# install( -# TARGETS volk_tests -# DESTINATION bin -# COMPONENT "volk" -# ) \ No newline at end of file diff --git a/tests/googletest b/tests/googletest deleted file mode 160000 index 9a32aee22..000000000 --- a/tests/googletest +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9a32aee22d771387c494be2d8519fbdf46a713b2 diff --git a/tests/test_volk_32f_x3_sum_of_poly_32f.cc b/tests/test_volk_32f_x3_sum_of_poly_32f.cc index 59c50ebfb..27d972dcf 100644 --- a/tests/test_volk_32f_x3_sum_of_poly_32f.cc +++ b/tests/test_volk_32f_x3_sum_of_poly_32f.cc @@ -137,5 +137,4 @@ TEST_P(volk_32f_x3_sum_of_poly_32f_test, unaligned) INSTANTIATE_TEST_SUITE_P(volk_32f_x3_sum_of_poly_32f, volk_32f_x3_sum_of_poly_32f_test, - // testing::Values(8, 32) testing::Values(7, 32, 128, 1023, 65535, 131071)); From 5b0aac655cf2243ff68ca102fef47aab9ca7a343 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Mon, 19 Aug 2024 22:30:42 +0200 Subject: [PATCH 21/67] gtest: Enable GTests in CI workflows We want to gradually migrate to a more stable test framework. This requires that our CI runs the new framework. Here we go! 
Signed-off-by: Johannes Demel --- .github/workflows/publish_docs.yml | 52 ++++---- .github/workflows/run-tests.yml | 196 ++++++++++++++--------------- 2 files changed, 122 insertions(+), 126 deletions(-) diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml index 0990727f5..9bbf5bbdb 100644 --- a/.github/workflows/publish_docs.yml +++ b/.github/workflows/publish_docs.yml @@ -7,29 +7,29 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3.1.0 - with: - submodules: 'recursive' - - name: Install dependencies - run: sudo apt install python3-mako liborc-dev doxygen - - name: Configure - run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. - - name: Build - run: cmake --build build --target volk_doc - - name: Setup SSH Keys - env: - SSH_AUTH_SOCK: /tmp/ssh_agent.sock - run: | - ssh-agent -a $SSH_AUTH_SOCK > /dev/null - ssh-add - <<< "${{ secrets.SSH_KEY }}" - mkdir $HOME/.ssh - echo -n "${{ secrets.SSH_KNOWN_HOST }}" > $HOME/.ssh/known_hosts - - name: Upload via SSH - env: - SSH_AUTH_SOCK: /tmp/ssh_agent.sock - TARGET_DIR: "${{ github.ref_type }}/${{ github.ref_name }}" - run: 'tar -cz build/html/ | ssh ${{ secrets.SSH_USER }}@${{ secrets.SSH_SERVER }} "mkdir -p /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); cd /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); tar --strip-components=2 -xz; rm /www/${{ env.TARGET_DIR }}/live; cd /www/${{ env.TARGET_DIR }}; ln -sf $(date +%Y.%m.%d) live;"' - - uses: actions/upload-artifact@v4 - with: - name: volk_docs - path: build/html/ + - uses: actions/checkout@v4 + with: + submodules: "recursive" + - name: Install dependencies + run: sudo apt install python3-mako liborc-dev doxygen + - name: Configure + run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. 
+ - name: Build + run: cmake --build build --target volk_doc + - name: Setup SSH Keys + env: + SSH_AUTH_SOCK: /tmp/ssh_agent.sock + run: | + ssh-agent -a $SSH_AUTH_SOCK > /dev/null + ssh-add - <<< "${{ secrets.SSH_KEY }}" + mkdir $HOME/.ssh + echo -n "${{ secrets.SSH_KNOWN_HOST }}" > $HOME/.ssh/known_hosts + - name: Upload via SSH + env: + SSH_AUTH_SOCK: /tmp/ssh_agent.sock + TARGET_DIR: "${{ github.ref_type }}/${{ github.ref_name }}" + run: 'tar -cz build/html/ | ssh ${{ secrets.SSH_USER }}@${{ secrets.SSH_SERVER }} "mkdir -p /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); cd /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); tar --strip-components=2 -xz; rm /www/${{ env.TARGET_DIR }}/live; cd /www/${{ env.TARGET_DIR }}; ln -sf $(date +%Y.%m.%d) live;"' + - uses: actions/upload-artifact@v4 + with: + name: volk_docs + path: build/html/ diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 8fe1daa0b..fb0d0a080 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -18,49 +18,51 @@ jobs: fail-fast: false matrix: compiler: - - { name: g++-7, cc: gcc-7, cxx: g++-7, distro: ubuntu-20.04 } - - { name: g++-8, cc: gcc-8, cxx: g++-8, distro: ubuntu-20.04 } - - { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-latest } - - { name: g++-10, cc: gcc-10, cxx: g++-10, distro: ubuntu-latest } - - { name: clang-7, cc: clang-7, cxx: clang++-7, distro: ubuntu-20.04 } - - { name: clang-8, cc: clang-8, cxx: clang++-8, distro: ubuntu-20.04 } - - { name: clang-9, cc: clang-9, cxx: clang++-9, distro: ubuntu-20.04 } + - { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-20.04 } + - { name: g++-10, cc: gcc-10, cxx: g++-10, distro: ubuntu-20.04 } + - { name: g++-11, cc: gcc-11, cxx: g++-11, distro: ubuntu-22.04 } + - { name: g++-12, cc: gcc-12, cxx: g++-12, distro: ubuntu-22.04 } + - { name: g++-13, cc: gcc-13, cxx: g++-13, distro: ubuntu-24.04 } + - { name: g++-14, cc: gcc-14, cxx: g++-14, distro: ubuntu-24.04 } - { name: clang-10, 
cc: clang-10, cxx: clang++-10, distro: ubuntu-20.04 } - { name: clang-11, cc: clang-11, cxx: clang++-11, distro: ubuntu-20.04 } - - { name: clang-12, cc: clang-12, cxx: clang++-12, distro: ubuntu-latest } - - { name: clang-13, cc: clang-13, cxx: clang++-13, distro: ubuntu-latest } - - { name: clang-14, cc: clang-14, cxx: clang++-14, distro: ubuntu-latest } - # - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-latest } + - { name: clang-12, cc: clang-12, cxx: clang++-12, distro: ubuntu-22.04 } + - { name: clang-13, cc: clang-13, cxx: clang++-13, distro: ubuntu-22.04 } + - { name: clang-14, cc: clang-14, cxx: clang++-14, distro: ubuntu-22.04 } + - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-22.04 } + - { name: clang-16, cc: clang-16, cxx: clang++-16, distro: ubuntu-24.04 } + - { name: clang-17, cc: clang-17, cxx: clang++-17, distro: ubuntu-24.04 } + - { name: clang-18, cc: clang-18, cxx: clang++-18, distro: ubuntu-24.04 } runs-on: ${{ matrix.compiler.distro }} steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: Install dependencies - run: sudo apt install python3-mako liborc-dev ${{ matrix.compiler.name }} - - name: Configure - env: - CC: ${{ matrix.compiler.cc }} - CXX: ${{ matrix.compiler.cxx }} - run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. 
- - name: Build - run: | - echo "Build with $(nproc) thread(s)" - cmake --build build -j$(nproc) - - name: Print info - run: | - ./build/cpu_features/list_cpu_features - ./build/apps/volk-config-info --alignment - ./build/apps/volk-config-info --avail-machines - ./build/apps/volk-config-info --all-machines - ./build/apps/volk-config-info --malloc - ./build/apps/volk-config-info --cc - - name: Test - run: | - cd build - ctest -V + - uses: actions/checkout@v4 + with: + submodules: "recursive" + - name: Install dependencies + run: sudo apt install python3-mako liborc-dev libgtest-dev libfmt-dev ${{ matrix.compiler.name }} + - name: Configure + env: + CC: ${{ matrix.compiler.cc }} + CXX: ${{ matrix.compiler.cxx }} + run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. + - name: Build + run: | + echo "Build with $(nproc) thread(s)" + cmake --build build -j$(nproc) + - name: Print info + run: | + ./build/cpu_features/list_cpu_features + ./build/apps/volk-config-info --alignment + ./build/apps/volk-config-info --avail-machines + ./build/apps/volk-config-info --all-machines + ./build/apps/volk-config-info --malloc + ./build/apps/volk-config-info --cc + - name: Test + run: | + cd build + ctest -V build-ubuntu-arm: # The host should always be linux @@ -73,9 +75,6 @@ jobs: fail-fast: false matrix: include: - - arch: aarch64 - distro: ubuntu20.04 - compiler: { name: g++-8, cc: gcc-8, cxx: g++-8 } - arch: aarch64 distro: ubuntu20.04 compiler: { name: g++-9, cc: gcc-9, cxx: g++-9 } @@ -94,6 +93,9 @@ jobs: - arch: aarch64 distro: ubuntu22.04 compiler: { name: clang-14, cc: clang-14, cxx: clang++-14 } + - arch: aarch64 + distro: ubuntu22.04 + compiler: { name: clang-15, cc: clang-15, cxx: clang++-15 } - arch: armv7 distro: ubuntu22.04 compiler: { name: g++, cc: gcc, cxx: g++ } @@ -112,8 +114,8 @@ jobs: steps: - uses: actions/checkout@v4 with: - submodules: 'recursive' - - uses: uraimo/run-on-arch-action@v2.5.0 + submodules: "recursive" + - 
uses: uraimo/run-on-arch-action@v2.7.2 name: Build in non-x86 container id: build with: @@ -139,7 +141,7 @@ jobs: case "${{ matrix.distro }}" in ubuntu*|jessie|stretch|buster) apt-get update -q -y - apt-get install -q -y git cmake python3-mako liborc-dev ${{ matrix.compiler.name }} + apt-get install -q -y git cmake python3-mako liborc-dev libgtest-dev libfmt-dev ${{ matrix.compiler.name }} ;; fedora*) dnf -y update @@ -169,44 +171,41 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: dependencies - run: sudo apt install python3-mako liborc-dev - - name: configure - run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DBUILD_EXECUTABLE=ON .. - - name: build - run: cmake --build build -j$(nproc) - - name: Print info - run: | - ./build/cpu_features/list_cpu_features - ./build/apps/volk-config-info --alignment - ./build/apps/volk-config-info --avail-machines - ./build/apps/volk-config-info --all-machines - ./build/apps/volk-config-info --malloc - ./build/apps/volk-config-info --cc - - name: test - run: cd build && ctest -V - - + - uses: actions/checkout@v4 + with: + submodules: "recursive" + - name: dependencies + run: sudo apt install python3-mako liborc-dev libgtest-dev libfmt-dev + - name: configure + run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DBUILD_EXECUTABLE=ON .. + - name: build + run: cmake --build build -j$(nproc) + - name: Print info + run: | + ./build/cpu_features/list_cpu_features + ./build/apps/volk-config-info --alignment + ./build/apps/volk-config-info --avail-machines + ./build/apps/volk-config-info --all-machines + ./build/apps/volk-config-info --malloc + ./build/apps/volk-config-info --cc + - name: test + run: cd build && ctest -V build-windows: - runs-on: windows-latest steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: dependencies - run: pip install mako - - name: configure - run: mkdir build && cd build && cmake .. 
- - name: build - run: cmake --build build --config Release --target INSTALL -j2 - - name: test - run: cd build && ctest -V -C Release + - uses: actions/checkout@v4 + with: + submodules: "recursive" + - name: dependencies + run: pip install mako + - name: configure + run: mkdir build && cd build && cmake .. + - name: build + run: cmake --build build --config Release --target INSTALL -j4 + - name: test + run: cd build && ctest -V -C Release # build-windows-msys2: # name: Build on windows-latest using MinGW and MSYS2 @@ -226,7 +225,7 @@ jobs: # python-six # mingw-w64-x86_64-gcc # mingw-w64-x86_64-cmake - # - uses: actions/checkout@v2 + # - uses: actions/checkout@v4 # - name: Checkout submodules # run: git submodule update --init --recursive # - name: Configure @@ -242,32 +241,29 @@ jobs: # ctest -V build-macos: - strategy: matrix: - os: [macos-latest, flyci-macos-large-latest-m2] + os: [macos-13, macos-latest] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 - with: - submodules: 'recursive' - - name: dependencies - run: pip3 install --break-system-packages mako && brew install orc - - name: configure - run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON .. - - name: build - run: cmake --build build --config Debug -j3 - - name: Print info - run: | - ./build/cpu_features/list_cpu_features - # ./build/apps/volk-config-info --alignment - # ./build/apps/volk-config-info --avail-machines - # ./build/apps/volk-config-info --all-machines - # ./build/apps/volk-config-info --malloc - # ./build/apps/volk-config-info --cc - - name: test - run: cd build && ctest -V - - + - uses: actions/checkout@v4 + with: + submodules: "recursive" + - name: dependencies + run: pip3 install --break-system-packages mako && brew install orc + - name: configure + run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON .. 
+ - name: build + run: cmake --build build --config Debug -j4 + - name: Print info + run: | + ./build/cpu_features/list_cpu_features + # ./build/apps/volk-config-info --alignment + # ./build/apps/volk-config-info --avail-machines + # ./build/apps/volk-config-info --all-machines + # ./build/apps/volk-config-info --malloc + # ./build/apps/volk-config-info --cc + - name: test + run: cd build && ctest -V From 0d2e306b95c6593e4825f12de43f9a7a28943661 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Tue, 20 Aug 2024 23:50:28 +0200 Subject: [PATCH 22/67] tests: Beautify test output We can run an individual test for each implementation now. Further, the test names are easier to recognize. Signed-off-by: Johannes Demel --- tests/CMakeLists.txt | 2 + tests/test_volk_32f_x3_sum_of_poly_32f.cc | 61 +++++++++------------ tests/test_volk_32fc_x2_multiply_32fc.cc | 65 +++++++++-------------- tests/volk_test.cc | 27 ++++++++-- tests/volk_test.h | 36 +++++++++---- 5 files changed, 102 insertions(+), 89 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fd9d8b8c4..42dbeca0f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -26,6 +26,8 @@ add_executable( ${volk_test_files} ) +target_compile_features(volk_tests PUBLIC cxx_std_20) + target_link_libraries(volk_tests PRIVATE GTest::gtest_main diff --git a/tests/test_volk_32f_x3_sum_of_poly_32f.cc b/tests/test_volk_32f_x3_sum_of_poly_32f.cc index 27d972dcf..746623b9f 100644 --- a/tests/test_volk_32f_x3_sum_of_poly_32f.cc +++ b/tests/test_volk_32f_x3_sum_of_poly_32f.cc @@ -22,8 +22,8 @@ class volk_32f_x3_sum_of_poly_32f_test : public VolkTest protected: void SetUp() override { - initialize_implementation_names(volk_32f_x3_sum_of_poly_32f_get_func_desc()); - initialize_data(GetParam()); + initialize_test(GetParam()); + initialize_data(vector_length); } void initialize_data(const size_t length) @@ -66,6 +66,7 @@ class volk_32f_x3_sum_of_poly_32f_test : public VolkTest result = volk::vector(1, 0.0); 
ua_result.push_back(result.at(0)); ua_result.push_back(result.at(0)); + tolerance = std::max(expected * 1e-5, 1e-5); } void execute_aligned(const std::string impl_name) @@ -76,6 +77,7 @@ class volk_32f_x3_sum_of_poly_32f_test : public VolkTest cutoff.data(), vector_length, impl_name.c_str()); + EXPECT_NEAR(result.at(0), expected, tolerance); } void execute_unaligned(const std::string impl_name) @@ -86,6 +88,7 @@ class volk_32f_x3_sum_of_poly_32f_test : public VolkTest ua_cutoff.data() + 1, vector_length, impl_name.c_str()); + EXPECT_NEAR(ua_result.at(1), expected, tolerance); } // void TearDown() override {} @@ -99,42 +102,30 @@ class volk_32f_x3_sum_of_poly_32f_test : public VolkTest volk::vector result; volk::vector ua_result; float expected = 0.0f; + float tolerance = 1.0e-5; }; - -TEST_P(volk_32f_x3_sum_of_poly_32f_test, aligned) -{ - for (auto name : implementation_names) { - auto tol = std::max(expected * 1e-5, 1e-5); - fmt::print( - "test aligned implementation: {:>12}, size={} ...", name, vector_length); - auto start = std::chrono::steady_clock::now(); - - execute_aligned(name); - - std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; - fmt::print("\tduration={}\n", elapsed); - EXPECT_NEAR(result.at(0), expected, tol); - } -} - -TEST_P(volk_32f_x3_sum_of_poly_32f_test, unaligned) +TEST_P(volk_32f_x3_sum_of_poly_32f_test, run) { - for (auto name : unaligned_impl_names) { - auto tol = std::max(expected * 1e-5, 1e-5); - fmt::print( - "test unaligned implementation: {:>12}, size={} ...", name, vector_length); - auto start = std::chrono::steady_clock::now(); - - execute_unaligned(name); - - std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; - fmt::print("\tduration={}\n", elapsed); - EXPECT_NEAR(ua_result.at(1), expected, tol); + fmt::print("test {} implementation: {:>12}, size={} ...", + is_aligned_implementation ? 
"aligned" : "unaligned", + implementation_name, + vector_length); + auto start = std::chrono::steady_clock::now(); + + if (is_aligned_implementation) { + execute_aligned(implementation_name); + } else { + execute_unaligned(implementation_name); } + std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; + fmt::print("\tduration={}\n", elapsed); } - -INSTANTIATE_TEST_SUITE_P(volk_32f_x3_sum_of_poly_32f, - volk_32f_x3_sum_of_poly_32f_test, - testing::Values(7, 32, 128, 1023, 65535, 131071)); +INSTANTIATE_TEST_SUITE_P( + volk_32f_x3_sum_of_poly_32f, + volk_32f_x3_sum_of_poly_32f_test, + testing::Combine(testing::ValuesIn(get_kernel_implementation_name_list( + volk_32f_x3_sum_of_poly_32f_get_func_desc())), + testing::ValuesIn(default_vector_sizes)), + generate_volk_test_name()); diff --git a/tests/test_volk_32fc_x2_multiply_32fc.cc b/tests/test_volk_32fc_x2_multiply_32fc.cc index aa527afce..e70031448 100644 --- a/tests/test_volk_32fc_x2_multiply_32fc.cc +++ b/tests/test_volk_32fc_x2_multiply_32fc.cc @@ -11,19 +11,19 @@ #include #include #include +#include #include #include #include #include - class volk_32fc_x2_multiply_32fc_test : public VolkTest { protected: void SetUp() override { - initialize_implementation_names(volk_32fc_x2_multiply_32fc_get_func_desc()); - initialize_data(GetParam()); + initialize_test(GetParam()); + initialize_data(vector_length); } void initialize_data(const size_t length) @@ -61,10 +61,9 @@ class volk_32fc_x2_multiply_32fc_test : public VolkTest vec1.data() + 1, vector_length - 1, impl_name.c_str()); + EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, ua_result)); } - // void TearDown() override {} - size_t vector_length; volk::vector vec0; volk::vector vec1; volk::vector result; @@ -73,46 +72,30 @@ class volk_32fc_x2_multiply_32fc_test : public VolkTest volk::vector ua_result; }; - -TEST_P(volk_32fc_x2_multiply_32fc_test, aligned) +TEST_P(volk_32fc_x2_multiply_32fc_test, run) { - for (auto name : 
implementation_names) { - fmt::print( - "test aligned implementation: {:>12}, size={} ...", name, vector_length); - auto start = std::chrono::steady_clock::now(); - - execute_aligned(name); - - std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; - fmt::print("\tduration={}\n", elapsed); - EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, result)); + fmt::print("test {} implementation: {:>12}, size={} ...", + is_aligned_implementation ? "aligned" : "unaligned", + implementation_name, + vector_length); + auto start = std::chrono::steady_clock::now(); + + if (is_aligned_implementation) { + execute_aligned(implementation_name); + } else { + execute_unaligned(implementation_name); } -} - -TEST_P(volk_32fc_x2_multiply_32fc_test, unaligned) -{ - for (auto name : unaligned_impl_names) { - RecordProperty("aligned", false); - RecordProperty("implementation", name); - RecordProperty("size", vector_length); - fmt::print( - "test unaligned implementation: {:>12}, size={} ...", name, vector_length); - auto start = std::chrono::steady_clock::now(); - execute_unaligned(name); - std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; - fmt::print("\tduration={}\n", elapsed); - EXPECT_TRUE(AreComplexFloatingPointArraysAlmostEqual(expected, ua_result)); - } + std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; + fmt::print("\tduration={}\n", elapsed); } -INSTANTIATE_TEST_SUITE_P(volk_32fc_x2_multiply_32fc, - volk_32fc_x2_multiply_32fc_test, - testing::Values(7, 32, 128, 1023, 131071), - testing::PrintToStringParamName() - // [](const testing::TestParamInfo& info) { - // return fmt::format("{}", info.param); - // } -); +INSTANTIATE_TEST_SUITE_P( + volk_32fc_x2_multiply_32fc, + volk_32fc_x2_multiply_32fc_test, + testing::Combine(testing::ValuesIn(get_kernel_implementation_name_list( + volk_32fc_x2_multiply_32fc_get_func_desc())), + testing::ValuesIn(default_vector_sizes)), + generate_volk_test_name()); diff 
--git a/tests/volk_test.cc b/tests/volk_test.cc index 6c0589a06..bf9a99151 100644 --- a/tests/volk_test.cc +++ b/tests/volk_test.cc @@ -55,7 +55,7 @@ ::testing::AssertionResult AreComplexFloatingPointArraysAlmostEqual(const T& exp return ::testing::AssertionSuccess(); } -std::vector get_kernel_implementation_name_list(volk_func_desc_t desc) +std::vector get_kernel_implementation_name_list(const volk_func_desc_t desc) { std::vector names; for (size_t i = 0; i < desc.n_impls; i++) { @@ -65,13 +65,18 @@ std::vector get_kernel_implementation_name_list(volk_func_desc_t de return names; } +bool is_aligned_implementation_name(const std::string& name) +{ + return name.rfind("a_", 0) == 0; +} + std::tuple, std::vector> -separate_implementations_by_alignment(std::vector names) +separate_implementations_by_alignment(const std::vector& names) { std::vector aligned; std::vector unaligned; for (auto name : names) { - if (name.rfind("a_", 0) == 0) { + if (is_aligned_implementation_name(name)) { aligned.push_back(name); } else { unaligned.push_back(name); @@ -79,3 +84,19 @@ separate_implementations_by_alignment(std::vector names) } return { aligned, unaligned }; } + +std::vector +get_aligned_kernel_implementation_names(const volk_func_desc_t desc) +{ + auto impls = get_kernel_implementation_name_list(desc); + auto [aligned, unaligned] = separate_implementations_by_alignment(impls); + return aligned; +} + +std::vector +get_unaligned_kernel_implementation_names(const volk_func_desc_t desc) +{ + auto impls = get_kernel_implementation_name_list(desc); + auto [aligned, unaligned] = separate_implementations_by_alignment(impls); + return unaligned; +} diff --git a/tests/volk_test.h b/tests/volk_test.h index 20f8780bb..ebc2e3237 100644 --- a/tests/volk_test.h +++ b/tests/volk_test.h @@ -11,27 +11,43 @@ #include #include #include +#include #include +static constexpr std::array default_vector_sizes{ 7, 32, 128, 1023, 131071 }; -std::vector 
get_kernel_implementation_name_list(volk_func_desc_t desc); +std::vector get_kernel_implementation_name_list(const volk_func_desc_t desc); + +bool is_aligned_implementation_name(const std::string& name); std::tuple, std::vector> -separate_implementations_by_alignment(std::vector names); +separate_implementations_by_alignment(const std::vector& names); + +std::vector +get_aligned_kernel_implementation_names(const volk_func_desc_t desc); +std::vector +get_unaligned_kernel_implementation_names(const volk_func_desc_t desc); + +struct generate_volk_test_name { + template + std::string operator()(const ::testing::TestParamInfo& info) const + { + return fmt::format("{}_{}", std::get<0>(info.param), std::get<1>(info.param)); + } +}; -class VolkTest : public ::testing::TestWithParam +class VolkTest : public ::testing::TestWithParam> { protected: - void initialize_implementation_names(volk_func_desc_t desc) + void initialize_test(const std::tuple& param) { - implementation_names = get_kernel_implementation_name_list(desc); - std::tie(aligned_impl_names, unaligned_impl_names) = - separate_implementations_by_alignment(implementation_names); + std::tie(implementation_name, vector_length) = param; + is_aligned_implementation = is_aligned_implementation_name(implementation_name); } - std::vector implementation_names; - std::vector aligned_impl_names; - std::vector unaligned_impl_names; + std::string implementation_name; + bool is_aligned_implementation; + size_t vector_length; }; From 0caeb884b556f333f1b869b55f7c3ba584792a9a Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Mon, 28 Oct 2024 22:13:55 +0100 Subject: [PATCH 23/67] ci: Fix run-on-arch hiccup We probably need to load a newer version than what we pinned before. Let's just update and hopefully enjoy the benefits. 
Signed-off-by: Johannes Demel --- .github/workflows/run-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index fb0d0a080..392fa4d90 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -115,7 +115,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: "recursive" - - uses: uraimo/run-on-arch-action@v2.7.2 + - uses: uraimo/run-on-arch-action@v2 name: Build in non-x86 container id: build with: From 222558c8960f029c1c9a5726f31a2ed100fabd72 Mon Sep 17 00:00:00 2001 From: Olaf Bernstein Date: Tue, 29 Oct 2024 00:06:46 +0100 Subject: [PATCH 24/67] RVV CI: build sequentially with verbose Signed-off-by: Olaf Bernstein --- .github/workflows/run-tests-rvv.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run-tests-rvv.yml b/.github/workflows/run-tests-rvv.yml index 10ff857d7..e97825f34 100644 --- a/.github/workflows/run-tests-rvv.yml +++ b/.github/workflows/run-tests-rvv.yml @@ -29,27 +29,27 @@ jobs: CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. make -j$(nproc) - ARGS=-j$(nproc) make test + ARGS=-V make test - name: Test gcc-14 VLEN=256 run: | cd build; rm -rf * CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release make -j$(nproc) - ARGS=-j$(nproc) make test + ARGS=-V make test - name: Test clang-18 VLEN=512 run: | cd build; rm -rf * CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. 
make -j$(nproc) - ARGS=-j$(nproc) make test + ARGS=-V make test - name: Test clang-18 VLEN=1024 run: | cd build; rm -rf * CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release make -j$(nproc) - ARGS=-j$(nproc) make test + ARGS=-V make test From e006e0f61fbbea142f80a1f8e3887397b9d72746 Mon Sep 17 00:00:00 2001 From: Olaf Bernstein Date: Thu, 31 Oct 2024 15:01:34 +0100 Subject: [PATCH 25/67] use segmented store in rotator2 rvvseg implementation Signed-off-by: Olaf Bernstein --- kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h index d94f55f7d..e668e3c54 100644 --- a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h @@ -906,12 +906,8 @@ static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(lv_32fc_t* outVector, __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl); vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl); - - vuint32m2_t vru = __riscv_vreinterpret_u32m2(vr); - vuint32m2_t viu = __riscv_vreinterpret_u32m2(vi); - vuint64m4_t res = - __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl); - __riscv_vse64((uint64_t*)outVector, res, vl); + vfloat32m2x2_t vc = __riscv_vcreate_v_f32m2x2(vr, vi); + __riscv_vsseg2e32_v_f32m2x2((float*)outVector, vc, vl); vfloat32m2_t tmp = phr; phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl); From 93cf869df1e19878eea51e34326ca3231fed1ea2 Mon Sep 17 00:00:00 2001 From: Suleyman Poyraz Date: Fri, 1 Nov 2024 19:24:13 +0300 Subject: [PATCH 26/67] Changes that suggested on #770 Writing to the python helper script moved to `python/volk_modtool/CMakeLists.txt` Revert "Changes that suggested on #770" and Added way given on pull req. 
This reverts commit d4afa9ee1edbab2b4c737db288359e339d5a00a9 partially. as @jdemel stated: > CMAKE_CURRENT_BINARY_DIR would possibly be the preferred way to use. I'd suspect changing this variable in the original location would already fix your issue. That'd be great. That way we can keep the code in a more generic place that makes it easier to re-use. Also, thanks for your patience. I suspected about is there any upper config deletes CMAKE_CURRENT_SOURCE_DIR and CMAKE_CURRENT_BUILD_DIR and it overriding on 'CMakeLists.txt'. Signed-off-by: Suleyman Poyraz --- CMakeLists.txt | 54 ++++++++++++++++------------------ cmake/Modules/VolkPython.cmake | 2 +- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f91f88af3..971b48d7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,8 +61,6 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") "${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign") endif() -set(CMAKE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) #allows this to be a sub-project -set(CMAKE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) #allows this to be a sub-project list(INSERT CMAKE_MODULE_PATH 0 ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules )#location for custom "Modules" @@ -91,8 +89,8 @@ math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000 + ${VERSION_INFO_MINOR_VERSION} * 100 + ${VERSION_INFO_MAINT_VERSION}") -configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in - ${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_version.h.in + ${CMAKE_CURRENT_BINARY_DIR}/include/volk/volk_version.h @ONLY) ######################################################################## # Environment setup @@ -236,31 +234,31 @@ set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR}) set(VOLK_INCLUDE_DIR include) install( - DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/kernels/volk DESTINATION include 
COMPONENT "volk_devel" FILES_MATCHING PATTERN "*.h") install( - FILES ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_alloc.hh - ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h - ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h - ${CMAKE_BINARY_DIR}/include/volk/volk.h - ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h - ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h - ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h - ${CMAKE_SOURCE_DIR}/include/volk/volk_malloc.h - ${CMAKE_BINARY_DIR}/include/volk/volk_version.h - ${CMAKE_SOURCE_DIR}/include/volk/constants.h + FILES ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_prefs.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_alloc.hh + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_complex.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_common.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/saturation_arithmetic.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h + ${CMAKE_CURRENT_BINARY_DIR}/include/volk/volk.h + ${CMAKE_CURRENT_BINARY_DIR}/include/volk/volk_cpu.h + ${CMAKE_CURRENT_BINARY_DIR}/include/volk/volk_config_fixed.h + ${CMAKE_CURRENT_BINARY_DIR}/include/volk/volk_typedefs.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/volk_malloc.h + 
${CMAKE_CURRENT_BINARY_DIR}/include/volk/volk_version.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/volk/constants.h DESTINATION include/volk COMPONENT "volk_devel") @@ -288,7 +286,7 @@ endif(APPLE) ######################################################################## # Create uninstall target ######################################################################## -configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY) # Only add the target if there isn't one defined already @@ -303,11 +301,11 @@ endif() # http://www.cmake.org/Wiki/CMake/Tutorials/Packaging ######################################################################## -configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in - ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfig.cmake @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfig.cmake @ONLY) -configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in - ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY) ######################################################################## # Install cmake search routine for external use diff --git a/cmake/Modules/VolkPython.cmake b/cmake/Modules/VolkPython.cmake index 7beff74b9..629d6d795 100644 --- a/cmake/Modules/VolkPython.cmake +++ b/cmake/Modules/VolkPython.cmake @@ -140,7 +140,7 @@ file(TO_CMAKE_PATH ${VOLK_PYTHON_DIR} VOLK_PYTHON_DIR) # Usage: VOLK_UNIQUE_TARGET( ) ######################################################################## function(VOLK_UNIQUE_TARGET desc) - file(RELATIVE_PATH reldir ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + file(RELATIVE_PATH reldir 
${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) execute_process( COMMAND ${PYTHON_EXECUTABLE} -c "import re, hashlib unique = hashlib.sha256(b'${reldir}${ARGN}').hexdigest()[:5] From a7a296eb5766df4b1ec1ada94cb127007ac3ee29 Mon Sep 17 00:00:00 2001 From: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> Date: Tue, 12 Nov 2024 20:04:27 -0400 Subject: [PATCH 27/67] appveyor: Update to VS 2022/Python 3.12 Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> --- appveyor.yml | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index ae4825644..a8a6b7b20 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -5,16 +5,16 @@ # # SPDX-License-Identifier: LGPL-3.0-or-later # -image: Visual Studio 2019 +image: Visual Studio 2022 cache: - packages -> appveyor.yml environment: environment: matrix: - - job_name: VS 16 2019 / python 3.8 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019 - CMAKE_GENERATOR: Visual Studio 16 2019 - PYTHON: "C:\\Python38-x64" + - job_name: VS 17 2022 / python 3.12 + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2022 + CMAKE_GENERATOR: Visual Studio 17 2022 + PYTHON: "C:\Python312-x64" install: # Prepend the selected Python to the PATH of this build @@ -27,18 +27,16 @@ install: - pip install mako before_build: - git submodule update --init --recursive - - cmake -G "%CMAKE_GENERATOR%" -A x64 \ - -DCMAKE_BUILD_TYPE:STRING=Release -DENABLE_ORC:BOOL=OFF -DENABLE_TESTING:BOOL=ON \ - . + - cmake -G "%CMAKE_GENERATOR%" -A x64 -DCMAKE_BUILD_TYPE:STRING=Release -DENABLE_ORC:BOOL=OFF -DENABLE_TESTING:BOOL=ON . build_script: - cmake --build . 
--config Release --target INSTALL test_script: - ctest -V --output-on-failure -C Release after_test: - - cd "c:\Program Files" - - 7z a "c:\libvolk-x64-%VC_VERSION%.zip" volk + - cd "C:\Program Files" + - 7z a "C:\libvolk-x64-%VC_VERSION%.zip" volk - mkdir dlls - cd dlls - - 7z a "c:\libvolk-x64-deps-%VC_VERSION%.zip" * - - appveyor PushArtifact c:\libvolk-x64-%VC_VERSION%.zip - - appveyor PushArtifact c:\libvolk-x64-deps-%VC_VERSION%.zip + - 7z a "C:\libvolk-x64-deps-%VC_VERSION%.zip" * + - appveyor PushArtifact C:\libvolk-x64-%VC_VERSION%.zip + - appveyor PushArtifact C:\libvolk-x64-deps-%VC_VERSION%.zip From 7126af9bb4ea2ce45ed7d1c1267afb2730625b5f Mon Sep 17 00:00:00 2001 From: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> Date: Tue, 12 Nov 2024 21:26:14 -0400 Subject: [PATCH 28/67] Update android_build.yml Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> --- .github/workflows/android_build.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/android_build.yml b/.github/workflows/android_build.yml index 4036aef68..3e272b0b9 100644 --- a/.github/workflows/android_build.yml +++ b/.github/workflows/android_build.yml @@ -35,20 +35,26 @@ jobs: # All dependencies - name: Install dependencies - run: sudo apt install -y cmake openjdk-11-jre-headless wget unzip make python3-mako + run: sudo apt install -y cmake python3-mako + + # Setup Java + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' # Setup Android SDK, and auto-accept licenses - name: Install Android SDK - run: wget --quiet --output-document=android-sdk.zip https://dl.google.com/android/repository/commandlinetools-linux-8512546_latest.zip && mkdir android-sdk-linux && unzip -qq android-sdk.zip -d android-sdk-linux && export ANDROID_HOME=./android-sdk-linux && echo y | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --update && (echo y; echo y; echo y; echo y; echo y; 
echo y; echo y; echo y) | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --licenses + run: wget --quiet --output-document=android-sdk.zip https://dl.google.com/android/repository/commandlinetools-linux-11076708_latest.zip && mkdir android-sdk-linux && unzip -qq android-sdk.zip -d android-sdk-linux && export ANDROID_HOME=./android-sdk-linux && echo y | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --update && (echo y; echo y; echo y; echo y; echo y; echo y; echo y; echo y) | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --licenses # Call SDKManager to install the Android NDK - name: Install Android NDK - run: $GITHUB_WORKSPACE/android-sdk-linux/cmdline-tools/bin/sdkmanager --sdk_root=$GITHUB_WORKSPACE/android-sdk-linux --install "ndk;24.0.8215888" --channel=3 + run: $GITHUB_WORKSPACE/android-sdk-linux/cmdline-tools/bin/sdkmanager --sdk_root=$GITHUB_WORKSPACE/android-sdk-linux --install "ndk;27.2.12479018" --channel=3 # Setup build directory - name: Setup ${{ matrix.arch.name }} shell: bash - run: cd $GITHUB_WORKSPACE/ && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-sdk-linux/ndk/24.0.8215888/build/cmake/android.toolchain.cmake -DANDROID_ABI=${{ matrix.arch.name }} -DANDROID_PLATFORM=android-23 .. + run: cd $GITHUB_WORKSPACE/ && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-sdk-linux/ndk/27.2.12479018/build/cmake/android.toolchain.cmake -DANDROID_ABI=${{ matrix.arch.name }} -DANDROID_PLATFORM=android-34 .. 
# Build - name: Build ${{ matrix.arch.name }} From f3de844d799c5cd5aba3c9a3727d57ac4438927c Mon Sep 17 00:00:00 2001 From: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> Date: Sat, 16 Nov 2024 17:59:19 -0400 Subject: [PATCH 29/67] Adds toolchain file for Raspberry Pi 5 Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> --- README.md | 5 +++-- .../arm_cortex_a76_hardfp_native.cmake | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 cmake/Toolchains/arm_cortex_a76_hardfp_native.cmake diff --git a/README.md b/README.md index 9f1f0d040..799b670e9 100644 --- a/README.md +++ b/README.md @@ -74,12 +74,13 @@ To build for these boards you need specify the correct cmake toolchain file for _Note: There is no need for adding extra options to the compiler for 64 bit._ +* Raspberry Pi 5 `arm_cortex_a76_hardfp_native.cmake` * Raspberry Pi 4 `arm_cortex_a72_hardfp_native.cmake` * Raspberry Pi 3 `arm_cortex_a53_hardfp_native.cmake` ```bash $ mkdir build && cd build -$ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/arm_cortex_a72_hardfp_native.cmake .. +$ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/arm_cortex_a76_hardfp_native.cmake .. # make -j4 might be faster $ make $ make test @@ -136,4 +137,4 @@ notices at the top of source files list which years changes have been made. For some files, changes have occurred in many consecutive years. These files may often have the format of a year range (e.g., "2006 - 2011"), which indicates that these files have had copyrightable changes made -during each year in the range, inclusive. \ No newline at end of file +during each year in the range, inclusive. 
diff --git a/cmake/Toolchains/arm_cortex_a76_hardfp_native.cmake b/cmake/Toolchains/arm_cortex_a76_hardfp_native.cmake new file mode 100644 index 000000000..a540c9335 --- /dev/null +++ b/cmake/Toolchains/arm_cortex_a76_hardfp_native.cmake @@ -0,0 +1,21 @@ +# +# This file is part of VOLK +# +# SPDX-License-Identifier: LGPL-3.0-or-later +# + +######################################################################## +# Toolchain file for building native on a ARM Cortex A76 w/ NEON +# Usage: cmake -DCMAKE_TOOLCHAIN_FILE= +######################################################################## +set(CMAKE_CXX_COMPILER g++) +set(CMAKE_C_COMPILER gcc) +set(CMAKE_CXX_FLAGS + "-march=armv8.2-a -mtune=cortex-a76 -mfpu=neon-fp-armv8 -mfloat-abi=hard" + CACHE STRING "" FORCE) +set(CMAKE_C_FLAGS + ${CMAKE_CXX_FLAGS} + CACHE STRING "" FORCE) #same flags for C sources +set(CMAKE_ASM_FLAGS + "${CMAKE_CXX_FLAGS} -mthumb -g" + CACHE STRING "" FORCE) #same flags for asm sources From 430f2ef2b84f3807eba807183e1166f5b4acbdb9 Mon Sep 17 00:00:00 2001 From: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:35:59 -0400 Subject: [PATCH 30/67] Update links Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> --- CMakeLists.txt | 2 +- README.md | 4 ++-- cmake/Modules/VolkAddTest.cmake | 2 +- cmake/cmake_uninstall.cmake.in | 2 +- cmake/msvc/sys/time.h | 2 +- docs/CHANGELOG.md | 4 ++-- docs/Doxyfile.in | 20 ++++++++++--------- docs/versioning.md | 6 +++--- gen/volk_kernel_defs.py | 2 +- include/volk/volk_common.h | 2 +- include/volk/volk_complex.h | 2 +- kernels/volk/volk_32f_log2_32f.h | 2 +- kernels/volk/volk_32u_reverse_32u.h | 6 +++--- kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 2 +- .../volk/volk_8u_x3_encodepolarpuppet_8u.h | 2 +- lib/testqa.cc | 2 +- 16 files changed, 32 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 971b48d7e..6d7291a72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -298,7 
+298,7 @@ endif() ######################################################################## # Install our Cmake modules into $prefix/lib/cmake/volk # See "Package Configuration Files" on page: -# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging +# https://gitlab.kitware.com/cmake/community/-/wikis/doc/tutorials/Packaging ######################################################################## configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in diff --git a/README.md b/README.md index 799b670e9..6ee3ce3e3 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ # Welcome to VOLK! -VOLK is a sub-project of GNU Radio. Please see http://libvolk.org for bug +VOLK is a sub-project of GNU Radio. Please see https://www.libvolk.org/ for bug tracking, documentation, source code, and contact information about VOLK. See https://www.gnuradio.org/ for information about GNU Radio. @@ -20,7 +20,7 @@ https://www.gnuradio.org/git/volk.git/. ## How to use VOLK: -For detailed instructions see http://libvolk.org/doxygen/using_volk.html +For detailed instructions see https://www.libvolk.org/doxygen/using_volk.html See these steps for a quick build guide. diff --git a/cmake/Modules/VolkAddTest.cmake b/cmake/Modules/VolkAddTest.cmake index 653a9e5c6..810491983 100644 --- a/cmake/Modules/VolkAddTest.cmake +++ b/cmake/Modules/VolkAddTest.cmake @@ -57,7 +57,7 @@ function(VOLK_ADD_TEST test_name executable_name) file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR} srcdir) list(APPEND environs "srcdir=\"${srcdir}\"") - #http://www.cmake.org/pipermail/cmake/2009-May/029464.html + #https://cmake.org/pipermail/cmake/2009-May/029464.html #Replaced this add test + set environs code with the shell script generation. #Its nicer to be able to manually run the shell script to diagnose problems. 
if(UNIX) diff --git a/cmake/cmake_uninstall.cmake.in b/cmake/cmake_uninstall.cmake.in index 1c11ba746..43631fac4 100644 --- a/cmake/cmake_uninstall.cmake.in +++ b/cmake/cmake_uninstall.cmake.in @@ -5,7 +5,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later # -# http://www.vtk.org/Wiki/CMake_FAQ#Can_I_do_.22make_uninstall.22_with_CMake.3F +# https://gitlab.kitware.com/cmake/community/-/wikis/FAQ#can-i-do-make-uninstall-with-cmake if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") message( diff --git a/cmake/msvc/sys/time.h b/cmake/msvc/sys/time.h index d1f7ed282..027d5d96f 100644 --- a/cmake/msvc/sys/time.h +++ b/cmake/msvc/sys/time.h @@ -18,7 +18,7 @@ #define NOMINMAX #endif -// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 +// https://learn.microsoft.com/en-us/archive/msdn-technet-forums/430449b3-f6dd-4e18-84de-eebd26a8d668 #include < time.h > #include //I've omitted this line. #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 33fe3818e..4e7fed12b 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,8 +1,8 @@ # Changelog All notable changes to VOLK will be documented in this file. -The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) -and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html), starting with version 2.0.0. +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html), starting with version 2.0.0. 
## [2.0.0] - 2019-08-06 diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index f6fa80f31..1f3e2882a 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -1152,7 +1152,7 @@ VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: -# http://clang.llvm.org/) for more accurate parsing at the cost of reduced +# https://clang.llvm.org/) for more accurate parsing at the cost of reduced # performance. This can be particularly helpful with template rich C++ code for # which doxygen's built-in parser lacks the necessary type information. # Note: The availability of this option depends on whether or not doxygen was @@ -1178,7 +1178,7 @@ CLANG_OPTIONS = # If clang assisted parsing is enabled you can provide the clang parser with the # path to the directory containing a file called compile_commands.json. This # file is the compilation database (see: -# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the +# https://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the # options used when the source files were built. This is equivalent to # specifying the -p option to a clang tool, such as clang-check. These options # will then be passed to the parser. Any options specified with CLANG_OPTIONS @@ -1659,7 +1659,7 @@ USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. +# https://docs.mathjax.org/en/v2.7-latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. @@ -1675,10 +1675,12 @@ MATHJAX_FORMAT = HTML-CSS # Content Delivery Network so you can quickly see the result without installing # MathJax. 
However, it is strongly recommended to install a local copy of # MathJax from https://www.mathjax.org before deployment. -# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. +# The default value is: +# in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 +# in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest +MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2 # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example @@ -1690,7 +1692,7 @@ MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: -# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an +# https://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -2153,7 +2155,7 @@ DOCBOOK_PROGRAMLISTING = NO #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures # the structure of the code including all documentation. Note that this feature # is still experimental and incomplete at the moment. # The default value is: NO. @@ -2357,7 +2359,7 @@ HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. 
This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent +# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO # The default value is: YES. @@ -2534,7 +2536,7 @@ DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). +# https://www.graphviz.org/)). # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). diff --git a/docs/versioning.md b/docs/versioning.md index 470966c42..d8d749dd8 100644 --- a/docs/versioning.md +++ b/docs/versioning.md @@ -70,7 +70,7 @@ Semantic Versioning Specification (SemVer) The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be -interpreted as described in [RFC 2119](http://tools.ietf.org/html/rfc2119). +interpreted as described in [RFC 2119](https://datatracker.ietf.org/doc/html/rfc2119). 1. Software using Semantic Versioning MUST declare a public API. This API could be declared in the code itself or exist strictly in documentation. @@ -265,7 +265,7 @@ About ----- The Semantic Versioning specification is authored by [Tom -Preston-Werner](http://tom.preston-werner.com), inventor of Gravatars and +Preston-Werner](https://tom.preston-werner.com/), inventor of Gravatars and cofounder of GitHub. 
If you'd like to leave feedback, please [open an issue on @@ -276,4 +276,4 @@ License ------- Creative Commons - CC BY 3.0 -http://creativecommons.org/licenses/by/3.0/ +https://creativecommons.org/licenses/by/3.0/ diff --git a/gen/volk_kernel_defs.py b/gen/volk_kernel_defs.py index 36e36132e..95c48a81e 100644 --- a/gen/volk_kernel_defs.py +++ b/gen/volk_kernel_defs.py @@ -14,7 +14,7 @@ ######################################################################## # Strip comments from a c/cpp file. # Input is code string, output is code string without comments. -# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments +# https://stackoverflow.com/questions/241327/remove-c-and-c-comments-using-python ######################################################################## def comment_remover(text): def replacer(match): diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h index 1785dbdae..f14affcdd 100644 --- a/include/volk/volk_common.h +++ b/include/volk/volk_common.h @@ -92,7 +92,7 @@ //////////////////////////////////////////////////////////////////////// // Define VOLK_API for library symbols -// http://gcc.gnu.org/wiki/Visibility +// https://gcc.gnu.org/wiki/Visibility //////////////////////////////////////////////////////////////////////// #ifdef volk_EXPORTS #define VOLK_API __VOLK_ATTR_EXPORT diff --git a/include/volk/volk_complex.h b/include/volk/volk_complex.h index c7dd83d07..e444d0226 100644 --- a/include/volk/volk_complex.h +++ b/include/volk/volk_complex.h @@ -78,7 +78,7 @@ typedef double complex lv_64fc_t; // When GNUC is available, use the complex extensions. // The extensions always return the correct value type. 
-// http://gcc.gnu.org/onlinedocs/gcc/Complex.html +// https://gcc.gnu.org/onlinedocs/gcc/Complex.html #ifdef __GNUC__ #define lv_creal(x) (__real__(x)) diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index 47a7cbe38..23382749f 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -18,7 +18,7 @@ * +-Inf outputs are mapped to +-127.0f and +-NaN input values are not supported. * * This kernel was adapted from Jose Fonseca's Fast SSE2 log implementation - * http://jrfonseca.blogspot.in/2008/09/fast-sse2-pow-tables-or-polynomials.htm + * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the diff --git a/kernels/volk/volk_32u_reverse_32u.h b/kernels/volk/volk_32u_reverse_32u.h index ece8f48b1..3be939165 100644 --- a/kernels/volk/volk_32u_reverse_32u.h +++ b/kernels/volk/volk_32u_reverse_32u.h @@ -30,7 +30,7 @@ #ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H // Idea from "Bit Twiddling Hacks", which dedicates this method to public domain -// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +// https://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable static const unsigned char BitReverseTable256[] = { 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, @@ -120,7 +120,7 @@ static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out, #endif /* LV_HAVE_GENERIC */ // Idea from "Bit Twiddling Hacks", which dedicates this method to public domain -// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable +// https://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable #ifdef LV_HAVE_GENERIC static inline void volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int 
num_points) @@ -140,7 +140,7 @@ volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_poi #endif /* LV_HAVE_GENERIC */ // Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public -// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits +// domain https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits #ifdef LV_HAVE_GENERIC static inline void volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points) diff --git a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h index 5d03f03d2..daa867ddf 100644 --- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h +++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h @@ -17,7 +17,7 @@ static inline unsigned int log2_of_power_of_2(unsigned int val) { - // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog + // algorithm from: https://graphics.stanford.edu/~seander/bithacks.html#IntegerLog static const unsigned int b[] = { 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000 }; diff --git a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h index 792168e0d..133b497e8 100644 --- a/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h +++ b/kernels/volk/volk_8u_x3_encodepolarpuppet_8u.h @@ -19,7 +19,7 @@ static inline unsigned int next_lower_power_of_two(const unsigned int val) { // algorithm found and adopted from: - // http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html + // https://acius2.blogspot.com/2007/11/calculating-next-power-of-2.html unsigned int res = val; res = (res >> 1) | res; res = (res >> 2) | res; diff --git a/lib/testqa.cc b/lib/testqa.cc index 36e0ef187..342c7ca5f 100644 --- a/lib/testqa.cc +++ b/lib/testqa.cc @@ -104,7 +104,7 @@ int main(int argc, char* argv[]) /* * This function prints qa results as XML output similar to output - * from Junit. 
For reference output see http://llg.cubic.org/docs/junit/ + * from Junit. For reference output see https://llg.cubic.org/docs/junit/ */ void print_qa_xml(std::vector results, unsigned int nfails) { From 54c27e842077e669052e6043594a812fbec8c321 Mon Sep 17 00:00:00 2001 From: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> Date: Sat, 16 Nov 2024 21:01:12 -0400 Subject: [PATCH 31/67] Update links Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> --- include/volk/volk_avx_intrinsics.h | 2 +- include/volk/volk_malloc.h | 4 ++-- kernels/volk/volk_32fc_x2_divide_32fc.h | 4 ++-- lib/volk_malloc.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index 2fc0f064e..affdd0fe4 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -174,7 +174,7 @@ static inline __m256 _mm256_polar_sign_mask(__m128i fbits) _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); // // This is the desired function call. Though it seems to be missing in GCC. - // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/# + // // Compare: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), // _mm_castsi128_ps(sign_bits0)); } diff --git a/include/volk/volk_malloc.h b/include/volk/volk_malloc.h index a26f5f682..2af133ee3 100644 --- a/include/volk/volk_malloc.h +++ b/include/volk/volk_malloc.h @@ -28,7 +28,7 @@ __VOLK_DECL_BEGIN * see: https://linux.die.net/man/3/aligned_alloc * For MSVC, we fall back to `_aligned_malloc`. 
* see: - * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019 + * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=msvc-170 * * Because of the ways in which volk_malloc may allocate memory, it is * important to always free volk_malloc pointers using volk_free. @@ -51,7 +51,7 @@ VOLK_API void* volk_malloc(size_t size, size_t alignment); * see: https://en.cppreference.com/w/c/memory/free * In case `_aligned_malloc` was used, we call `_aligned_free`. * see: - * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019 + * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=msvc-170 * * \param aptr The aligned pointer allocated by volk_malloc. */ diff --git a/kernels/volk/volk_32fc_x2_divide_32fc.h b/kernels/volk/volk_32fc_x2_divide_32fc.h index ceee6559d..a5b1b5496 100644 --- a/kernels/volk/volk_32fc_x2_divide_32fc.h +++ b/kernels/volk/volk_32fc_x2_divide_32fc.h @@ -213,7 +213,7 @@ static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, sq, sq); // obtain the actual squared magnitude, although out of order mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them // best guide I found on using these functions: - // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=2738,2059,2738,2738,3875,3874,3875,2738,3870 div = _mm256_div_ps(mul_conj, mag_sq); _mm256_storeu_ps((float*)c, div); // Store the results back into the C container @@ -313,7 +313,7 @@ static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, { /* * Guide to AVX intrisics: - * https://software.intel.com/sites/landingpage/IntrinsicsGuide/# + * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html * * we'll do the "classical" * a a b* diff --git a/lib/volk_malloc.c 
b/lib/volk_malloc.c index 61ad162d6..ce3978f36 100644 --- a/lib/volk_malloc.c +++ b/lib/volk_malloc.c @@ -19,7 +19,7 @@ * * MSVC is broken * see: - * https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019 + * https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170 * This section: * C11 The Universal CRT implemented the parts of the * C11 Standard Library that are required by C++17, From 4d87ed000487a8901da0f41d131ba7847fee63ea Mon Sep 17 00:00:00 2001 From: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> Date: Sat, 16 Nov 2024 21:20:43 -0400 Subject: [PATCH 32/67] Fix clang-format issue Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> --- include/volk/volk_avx_intrinsics.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/volk/volk_avx_intrinsics.h b/include/volk/volk_avx_intrinsics.h index affdd0fe4..fd63f681f 100644 --- a/include/volk/volk_avx_intrinsics.h +++ b/include/volk/volk_avx_intrinsics.h @@ -173,10 +173,10 @@ static inline __m256 _mm256_polar_sign_mask(__m128i fbits) __m256 sign_mask = _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0); return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1); - // // This is the desired function call. Though it seems to be missing in GCC. - // // Compare: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html - // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), - // _mm_castsi128_ps(sign_bits0)); + // This is the desired function call. Though it seems to be missing in GCC. 
+ // Compare: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html + // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1), + // _mm_castsi128_ps(sign_bits0)); } static inline void From 892fafd0a055c7a71cfa2af6efb27501af848d1f Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Wed, 22 Jan 2025 22:37:05 +0100 Subject: [PATCH 33/67] ci: Add first native Linux ARM runners Since there are native ARM runners available now, we should use them. Let's test this. This is the source: https://docs.github.com/en/actions/using-github-hosted-runners/using-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories As indicated, this is still in beta. Signed-off-by: Johannes Demel --- .github/workflows/run-tests.yml | 40 ++++++++++++++------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 392fa4d90..8c2b5e411 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -19,20 +19,34 @@ jobs: matrix: compiler: - { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-20.04 } + - { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-22.04-arm } - { name: g++-10, cc: gcc-10, cxx: g++-10, distro: ubuntu-20.04 } + - { name: g++-10, cc: gcc-10, cxx: g++-10, distro: ubuntu-22.04-arm } - { name: g++-11, cc: gcc-11, cxx: g++-11, distro: ubuntu-22.04 } + - { name: g++-11, cc: gcc-11, cxx: g++-11, distro: ubuntu-22.04-arm } - { name: g++-12, cc: gcc-12, cxx: g++-12, distro: ubuntu-22.04 } + - { name: g++-12, cc: gcc-12, cxx: g++-12, distro: ubuntu-22.04-arm } - { name: g++-13, cc: gcc-13, cxx: g++-13, distro: ubuntu-24.04 } + - { name: g++-13, cc: gcc-13, cxx: g++-13, distro: ubuntu-24.04-arm } - { name: g++-14, cc: gcc-14, cxx: g++-14, distro: ubuntu-24.04 } + - { name: g++-14, cc: gcc-14, cxx: g++-14, distro: ubuntu-24.04-arm } - { name: clang-10, cc: clang-10, cxx: clang++-10, distro: ubuntu-20.04 } - { 
name: clang-11, cc: clang-11, cxx: clang++-11, distro: ubuntu-20.04 } + - { name: clang-11, cc: clang-11, cxx: clang++-11, distro: ubuntu-22.04-arm } - { name: clang-12, cc: clang-12, cxx: clang++-12, distro: ubuntu-22.04 } + - { name: clang-12, cc: clang-12, cxx: clang++-12, distro: ubuntu-22.04-arm } - { name: clang-13, cc: clang-13, cxx: clang++-13, distro: ubuntu-22.04 } + - { name: clang-13, cc: clang-13, cxx: clang++-13, distro: ubuntu-22.04-arm } - { name: clang-14, cc: clang-14, cxx: clang++-14, distro: ubuntu-22.04 } + - { name: clang-14, cc: clang-14, cxx: clang++-14, distro: ubuntu-22.04-arm } - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-22.04 } + - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-22.04-arm } - { name: clang-16, cc: clang-16, cxx: clang++-16, distro: ubuntu-24.04 } + - { name: clang-16, cc: clang-16, cxx: clang++-16, distro: ubuntu-24.04-arm } - { name: clang-17, cc: clang-17, cxx: clang++-17, distro: ubuntu-24.04 } + - { name: clang-17, cc: clang-17, cxx: clang++-17, distro: ubuntu-24.04-arm } - { name: clang-18, cc: clang-18, cxx: clang++-18, distro: ubuntu-24.04 } + - { name: clang-18, cc: clang-18, cxx: clang++-18, distro: ubuntu-24.04-arm } runs-on: ${{ matrix.compiler.distro }} @@ -67,7 +81,7 @@ jobs: build-ubuntu-arm: # The host should always be linux # see: https://github.com/uraimo/run-on-arch-action - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 name: Build on ${{ matrix.distro }} ${{ matrix.arch }} ${{ matrix.compiler.name }} # Run steps on a matrix of compilers and possibly archs. 
@@ -75,27 +89,9 @@ jobs: fail-fast: false matrix: include: - - arch: aarch64 - distro: ubuntu20.04 - compiler: { name: g++-9, cc: gcc-9, cxx: g++-9 } - - arch: aarch64 - distro: ubuntu20.04 - compiler: { name: g++-10, cc: gcc-10, cxx: g++-10 } - - arch: aarch64 - distro: ubuntu22.04 - compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 } - arch: aarch64 distro: ubuntu20.04 compiler: { name: clang-9, cc: clang-9, cxx: clang++-9 } - - arch: aarch64 - distro: ubuntu20.04 - compiler: { name: clang-10, cc: clang-10, cxx: clang++-10 } - - arch: aarch64 - distro: ubuntu22.04 - compiler: { name: clang-14, cc: clang-14, cxx: clang++-14 } - - arch: aarch64 - distro: ubuntu22.04 - compiler: { name: clang-15, cc: clang-15, cxx: clang++-15 } - arch: armv7 distro: ubuntu22.04 compiler: { name: g++, cc: gcc, cxx: g++ } @@ -106,7 +102,6 @@ jobs: - arch: s390x distro: ubuntu22.04 compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 } - # It would be really nice to test on Risc-V but that'll take time. - arch: riscv64 distro: ubuntu22.04 compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 } @@ -166,9 +161,8 @@ jobs: ctest -V build-ubuntu-static: - name: Build static on ubuntu-latest - - runs-on: ubuntu-latest + name: Build static on ubuntu-22.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 From cbc8dc484cdb9d0deeafb82ec751ce177d5caca9 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Wed, 22 Jan 2025 22:16:05 +0100 Subject: [PATCH 34/67] cleanup: Remove unused includes clangd reports multiple unused includes. These should be removed to potentially reduce build times and also to minimize includes. Only including necessary includes helps to better understand the code. 
Signed-off-by: Johannes Demel --- kernels/volk/volk_16i_x4_quad_max_star_16i.h | 1 - kernels/volk/volk_32fc_index_max_32u.h | 2 -- lib/qa_utils.cc | 1 - 3 files changed, 4 deletions(-) diff --git a/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/kernels/volk/volk_16i_x4_quad_max_star_16i.h index 94e264fe8..be2de2fc0 100644 --- a/kernels/volk/volk_16i_x4_quad_max_star_16i.h +++ b/kernels/volk/volk_16i_x4_quad_max_star_16i.h @@ -47,7 +47,6 @@ #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H #include -#include #ifdef LV_HAVE_SSE2 diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h index 993187ca5..c9fd4e560 100644 --- a/kernels/volk/volk_32fc_index_max_32u.h +++ b/kernels/volk/volk_32fc_index_max_32u.h @@ -58,7 +58,6 @@ #define INCLUDED_volk_32fc_index_max_32u_a_H #include -#include #include #include @@ -327,7 +326,6 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ #define INCLUDED_volk_32fc_index_max_32u_u_H #include -#include #include #include diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc index 603993258..f50609495 100644 --- a/lib/qa_utils.cc +++ b/lib/qa_utils.cc @@ -21,7 +21,6 @@ #include // for sqrt, fabs, abs #include // for memcpy, memset #include // for clock -#include // for operator<<, basic... #include // for cout, cerr #include // for numeric_limits #include // for map, map<>::mappe... 
From a091230c16184d98719e9d7c3ee3e91d707ff5fd Mon Sep 17 00:00:00 2001 From: Sam Lane Date: Thu, 23 Jan 2025 12:48:32 +0000 Subject: [PATCH 35/67] feature: add env variable kernel override Signed-off-by: Sam Lane --- lib/volk_rank_archs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/volk_rank_archs.c b/lib/volk_rank_archs.c index 750fe54b7..9f7dbb4b1 100644 --- a/lib/volk_rank_archs.c +++ b/lib/volk_rank_archs.c @@ -56,6 +56,13 @@ int volk_rank_archs(const char* kern_name, // name of the kernel to rank return volk_get_index(impl_names, n_impls, "generic"); } + // If we've defined the kernel name as an environment variable, always return + // the 'overridden' kernel. Used for manually overring config kernels at runtime. + char *override_env = getenv(kern_name); + if (override_env) { + return volk_get_index(impl_names, n_impls, override_env); + } + // now look for the function name in the prefs list for (i = 0; i < n_arch_prefs; i++) { if (!strncmp(kern_name, From 8af035c05b6897b82d89599165b0ece36ab93735 Mon Sep 17 00:00:00 2001 From: Sam Lane <9569766+SJ-Innovation@users.noreply.github.com> Date: Mon, 27 Jan 2025 09:10:08 +0000 Subject: [PATCH 36/67] fix: formatting Signed-off-by: Sam Lane --- lib/volk_rank_archs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/volk_rank_archs.c b/lib/volk_rank_archs.c index 9f7dbb4b1..8431125be 100644 --- a/lib/volk_rank_archs.c +++ b/lib/volk_rank_archs.c @@ -58,7 +58,7 @@ int volk_rank_archs(const char* kern_name, // name of the kernel to rank // If we've defined the kernel name as an environment variable, always return // the 'overridden' kernel. Used for manually overring config kernels at runtime. 
- char *override_env = getenv(kern_name); + char* override_env = getenv(kern_name); if (override_env) { return volk_get_index(impl_names, n_impls, override_env); } From dfd46b2c3c194a93cf08182e4ec8d17157ac654a Mon Sep 17 00:00:00 2001 From: John Sallay Date: Fri, 24 Jan 2025 21:32:28 -0500 Subject: [PATCH 37/67] Add const to several args Signed-off-by: John Sallay --- kernels/volk/volk_32fc_index_max_16u.h | 18 +++---- kernels/volk/volk_32fc_index_max_32u.h | 20 ++++---- kernels/volk/volk_32fc_index_min_16u.h | 2 +- kernels/volk/volk_32fc_index_min_32u.h | 2 +- ...32fc_x2_s32f_square_dist_scalar_mult_32f.h | 48 +++++++++---------- kernels/volk/volk_32fc_x2_square_dist_32f.h | 30 ++++++------ 6 files changed, 60 insertions(+), 60 deletions(-) diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h index 781876d10..25a3f1ac6 100644 --- a/kernels/volk/volk_32fc_index_max_16u.h +++ b/kernels/volk/volk_32fc_index_max_16u.h @@ -23,7 +23,7 @@ * * Dispatcher Prototype * \code - * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t + * void volk_32fc_index_max_16u(uint16_t* target, const lv_32fc_t* src0, uint32_t * num_points) \endcode * * \b Inputs @@ -74,7 +74,7 @@ #include static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target, - lv_32fc_t* src0, + const lv_32fc_t* src0, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; @@ -134,7 +134,7 @@ static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target, #include static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target, - lv_32fc_t* src0, + const lv_32fc_t* src0, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; @@ -194,7 +194,7 @@ static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target, #include static inline void -volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_16u_a_sse3(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; const uint32_t num_bytes = num_points * 8; @@ -309,7 +309,7 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_16u_generic(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; @@ -351,7 +351,7 @@ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ #include static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target, - lv_32fc_t* src0, + const lv_32fc_t* src0, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; @@ -411,7 +411,7 @@ static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target, #include static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target, - lv_32fc_t* src0, + const lv_32fc_t* src0, uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; @@ -471,7 +471,7 @@ static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target, #include static inline void -volk_32fc_index_max_16u_rvv(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_16u_rvv(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points) { vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); @@ -502,7 +502,7 @@ volk_32fc_index_max_16u_rvv(uint16_t* target, lv_32fc_t* src0, uint32_t num_poin #include static inline void -volk_32fc_index_max_16u_rvvseg(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_16u_rvvseg(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points) { vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h index c9fd4e560..500aa7608 100644 --- a/kernels/volk/volk_32fc_index_max_32u.h +++ b/kernels/volk/volk_32fc_index_max_32u.h @@ -17,7 +17,7 @@ * * Dispatcher Prototype * \code - * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t + * void volk_32fc_index_max_32u(uint32_t* target, const lv_32fc_t* src0, uint32_t * num_points) \endcode * * \b Inputs @@ -66,7 +66,7 @@ #include static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target, - lv_32fc_t* src0, + const lv_32fc_t* src0, uint32_t num_points) { const __m256i indices_increment = _mm256_set1_epi32(8); @@ -124,7 +124,7 @@ static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target, #include static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target, - lv_32fc_t* src0, + const lv_32fc_t* src0, uint32_t num_points) { const __m256i indices_increment = _mm256_set1_epi32(8); @@ -182,7 +182,7 @@ static inline void 
volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target, #include static inline void -volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_32u_a_sse3(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points) { const uint32_t num_bytes = num_points * 8; @@ -296,7 +296,7 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p #ifdef LV_HAVE_GENERIC static inline void -volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_32u_generic(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points) { const uint32_t num_bytes = num_points * 8; @@ -334,7 +334,7 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ #include static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target, - lv_32fc_t* src0, + const lv_32fc_t* src0, uint32_t num_points) { const __m256i indices_increment = _mm256_set1_epi32(8); @@ -392,7 +392,7 @@ static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target, #include static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target, - lv_32fc_t* src0, + const lv_32fc_t* src0, uint32_t num_points) { const __m256i indices_increment = _mm256_set1_epi32(8); @@ -450,7 +450,7 @@ static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target, #include static inline void -volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_32u_neon(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points) { unsigned int number = 0; const uint32_t quarter_points = num_points / 4; @@ -512,7 +512,7 @@ volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi #include static inline void -volk_32fc_index_max_32u_rvv(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_32u_rvv(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points) { 
vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); @@ -543,7 +543,7 @@ volk_32fc_index_max_32u_rvv(uint32_t* target, lv_32fc_t* src0, uint32_t num_poin #include static inline void -volk_32fc_index_max_32u_rvvseg(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +volk_32fc_index_max_32u_rvvseg(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points) { vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h index 706db915b..b86e56ce3 100644 --- a/kernels/volk/volk_32fc_index_min_16u.h +++ b/kernels/volk/volk_32fc_index_min_16u.h @@ -23,7 +23,7 @@ * * Dispatcher Prototype * \code - * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t + * void volk_32fc_index_min_16u(uint16_t* target, const lv_32fc_t* source, uint32_t * num_points) \endcode * * \b Inputs diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h index 807a3bb51..cd7e9a7b4 100644 --- a/kernels/volk/volk_32fc_index_min_32u.h +++ b/kernels/volk/volk_32fc_index_min_32u.h @@ -17,7 +17,7 @@ * * Dispatcher Prototype * \code - * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* source, uint32_t + * void volk_32fc_index_min_32u(uint32_t* target, const lv_32fc_t* source, uint32_t * num_points) \endcode * * \b Inputs diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h index 0b956c205..781ce0683 100644 --- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h @@ -17,8 +17,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0, - * 
lv_32fc_t* points, float scalar, unsigned int num_points) \endcode + * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, const lv_32fc_t* src0, + * const lv_32fc_t* points, float scalar, unsigned int num_points) \endcode * * \b Inputs * \li src0: The complex input. Only the first point is used. @@ -94,8 +94,8 @@ static inline void calculate_scaled_distances(float* target, static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -178,8 +178,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -219,8 +219,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target, static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -278,8 +278,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, #include static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -303,8 +303,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target, #ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -329,8 +329,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, static inline void 
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -413,8 +413,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -454,8 +454,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target, static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -513,8 +513,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target, #include static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -540,8 +540,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target, static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { @@ -569,8 +569,8 @@ volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target, static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, float scalar, unsigned int num_points) { diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h index b711bcf10..bfd44b587 100644 --- a/kernels/volk/volk_32fc_x2_square_dist_32f.h +++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -17,7 +17,7 @@ * * 
Dispatcher Prototype * \code - * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points, + * void volk_32fc_x2_square_dist_32f(float* target, const lv_32fc_t* src0, lv_32fc_t* points, * unsigned int num_points) { \endcode * * \b Inputs @@ -73,8 +73,8 @@ #include static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, unsigned int num_points) { const unsigned int num_bytes = num_points * 8; @@ -166,8 +166,8 @@ static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target, #include static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, unsigned int num_points) { const unsigned int num_bytes = num_points * 8; @@ -233,8 +233,8 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, #ifdef LV_HAVE_NEON #include static inline void volk_32fc_x2_square_dist_32f_neon(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, unsigned int num_points) { const unsigned int quarter_points = num_points / 4; @@ -267,8 +267,8 @@ static inline void volk_32fc_x2_square_dist_32f_neon(float* target, #ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_square_dist_32f_generic(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, unsigned int num_points) { const unsigned int num_bytes = num_points * 8; @@ -302,8 +302,8 @@ static inline void volk_32fc_x2_square_dist_32f_generic(float* target, #include static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, unsigned int num_points) { const unsigned int num_bytes = num_points * 8; @@ -378,8 +378,8 @@ static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, 
#include static inline void volk_32fc_x2_square_dist_32f_rvv(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, unsigned int num_points) { size_t vlmax = __riscv_vsetvlmax_e32m4(); @@ -404,8 +404,8 @@ static inline void volk_32fc_x2_square_dist_32f_rvv(float* target, #include static inline void volk_32fc_x2_square_dist_32f_rvvseg(float* target, - lv_32fc_t* src0, - lv_32fc_t* points, + const lv_32fc_t* src0, + const lv_32fc_t* points, unsigned int num_points) { size_t vlmax = __riscv_vsetvlmax_e32m4(); From 692fecb8521b94306f9f6de012fac43efca2ec04 Mon Sep 17 00:00:00 2001 From: John Sallay Date: Fri, 24 Jan 2025 21:48:27 -0500 Subject: [PATCH 38/67] Run clang-format Signed-off-by: John Sallay --- kernels/volk/volk_32fc_index_max_16u.h | 15 +++++++++------ kernels/volk/volk_32fc_index_max_32u.h | 15 +++++++++------ ...olk_32fc_x2_s32f_square_dist_scalar_mult_32f.h | 4 ++-- kernels/volk/volk_32fc_x2_square_dist_32f.h | 4 ++-- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h index 25a3f1ac6..f0aebb2bf 100644 --- a/kernels/volk/volk_32fc_index_max_16u.h +++ b/kernels/volk/volk_32fc_index_max_16u.h @@ -193,8 +193,9 @@ static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target, #include #include -static inline void -volk_32fc_index_max_16u_a_sse3(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_16u_a_sse3(uint16_t* target, + const lv_32fc_t* src0, + uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? 
USHRT_MAX : num_points; const uint32_t num_bytes = num_points * 8; @@ -308,8 +309,9 @@ volk_32fc_index_max_16u_a_sse3(uint16_t* target, const lv_32fc_t* src0, uint32_t #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_index_max_16u_generic(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_16u_generic(uint16_t* target, + const lv_32fc_t* src0, + uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; @@ -501,8 +503,9 @@ volk_32fc_index_max_16u_rvv(uint16_t* target, const lv_32fc_t* src0, uint32_t nu #include #include -static inline void -volk_32fc_index_max_16u_rvvseg(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_16u_rvvseg(uint16_t* target, + const lv_32fc_t* src0, + uint32_t num_points) { vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2()); diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h index 500aa7608..7f824cdc5 100644 --- a/kernels/volk/volk_32fc_index_max_32u.h +++ b/kernels/volk/volk_32fc_index_max_32u.h @@ -181,8 +181,9 @@ static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target, #include #include -static inline void -volk_32fc_index_max_32u_a_sse3(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_32u_a_sse3(uint32_t* target, + const lv_32fc_t* src0, + uint32_t num_points) { const uint32_t num_bytes = num_points * 8; @@ -295,8 +296,9 @@ volk_32fc_index_max_32u_a_sse3(uint32_t* target, const lv_32fc_t* src0, uint32_t #endif /*LV_HAVE_SSE3*/ #ifdef LV_HAVE_GENERIC -static inline void -volk_32fc_index_max_32u_generic(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_32u_generic(uint32_t* target, + const lv_32fc_t* src0, + uint32_t 
num_points) { const uint32_t num_bytes = num_points * 8; @@ -542,8 +544,9 @@ volk_32fc_index_max_32u_rvv(uint32_t* target, const lv_32fc_t* src0, uint32_t nu #include #include -static inline void -volk_32fc_index_max_32u_rvvseg(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_32u_rvvseg(uint32_t* target, + const lv_32fc_t* src0, + uint32_t num_points) { vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4()); vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4()); diff --git a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h index 781ce0683..955ea4b07 100644 --- a/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h +++ b/kernels/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h @@ -17,8 +17,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, const lv_32fc_t* src0, - * const lv_32fc_t* points, float scalar, unsigned int num_points) \endcode + * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, const lv_32fc_t* + * src0, const lv_32fc_t* points, float scalar, unsigned int num_points) \endcode * * \b Inputs * \li src0: The complex input. Only the first point is used. diff --git a/kernels/volk/volk_32fc_x2_square_dist_32f.h b/kernels/volk/volk_32fc_x2_square_dist_32f.h index bfd44b587..7b3a5d471 100644 --- a/kernels/volk/volk_32fc_x2_square_dist_32f.h +++ b/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -17,8 +17,8 @@ * * Dispatcher Prototype * \code - * void volk_32fc_x2_square_dist_32f(float* target, const lv_32fc_t* src0, lv_32fc_t* points, - * unsigned int num_points) { \endcode + * void volk_32fc_x2_square_dist_32f(float* target, const lv_32fc_t* src0, lv_32fc_t* + * points, unsigned int num_points) { \endcode * * \b Inputs * \li src0: The complex input. Only the first point is used. 
From 5ca7171bae98359c0e5c39d81e9b46f5f7b310d1 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Mon, 3 Feb 2025 22:22:51 +0100 Subject: [PATCH 39/67] zenodo: Update citations With new contributors come new entries in the Zenodo file. Further, git supports .mailmap to canonicalize all the different names and email addresses that people use over the years. Signed-off-by: Johannes Demel --- .mailmap | 56 +++++++++++++++++++++++++++ .zenodo.json | 15 +++++++ scripts/tools/run_citations_update.py | 2 + 3 files changed, 73 insertions(+) create mode 100644 .mailmap diff --git a/.mailmap b/.mailmap new file mode 100644 index 000000000..544be983a --- /dev/null +++ b/.mailmap @@ -0,0 +1,56 @@ +Alexandre Rouma AlexandreRouma +Alexey Slokva alesha72003 +Andrey Rodionov dernasherbrezon + + + +Christoph Mayer cmayer +Christoph Mayer hcab14 + + +Douglas Geiger Doug +Douglas Geiger Doug Geiger +Douglas Geiger Douglas Geiger + +Federico Larroca git-artes +Geof Nieboer gnieboer +Jam M. Hernandez Quiceno Jam Quiceno + +Johannes Demel jdemel +Johannes Demel jdemel +Johannes Demel jdemel +Johannes Demel jdemel +Johannes Demel Johannes Demel +Johannes Demel Johannes Demel +Johannes Demel Johannes Demel +John Sallay jsallay <31416796+jsallay@users.noreply.github.com> + +Marc Lichtman Marc L + + +Marcus Müller Marcus Mueller + <157892+michaelld@users.noreply.github.com> + +Michael Dickens Michael L Dickens +Michael Dickens Micheal Dickens +Mike Piscopo ghostop14 + + + + +Nathan West Nathan West + +Nicholas McCarthy namccart +Nicholas McCarthy Nick McCarthy +Nick Foster Nick Foster +Nick Foster Nick Foster +Olaf Bernstein +Pascal Giard Pascal Giard + +Philip Balister root +Philip Balister root +Rick Farina Rick Farina (Zero_Chaos) + <32478819+fritterhoff@users.noreply.github.com> +Ryan Volz Ryan Volz +Sam Lane Sam Lane <9569766+SJ-Innovation@users.noreply.github.com> + diff --git a/.zenodo.json b/.zenodo.json index a6f5498e7..0ae1bb100 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ 
-39,6 +39,9 @@ { "name": "Bekhit, Amr" }, + { + "name": "Bernstein, Olaf" + }, { "affiliation": "Carnegie Mellon University, IIT Bombay", "name": "Bhowmick, Abhishek" @@ -84,6 +87,9 @@ { "name": "Est\u00e9vez, Daniel" }, + { + "name": "Farina, Rick" + }, { "affiliation": "Centre Tecnol\u00f2gic de Telecomunicacions de Catalunya (CTTC)", "name": "Fernandez, Carles" @@ -118,6 +124,9 @@ { "name": "Kaesberger, Martin" }, + { + "name": "Lane, Sam" + }, { "name": "Lichtman, Marc" }, @@ -169,6 +178,9 @@ { "name": "Piscopo, Mike" }, + { + "name": "Poyraz, Suleyman" + }, { "name": "Quiceno, Jam M. Hernandez" }, @@ -273,6 +285,9 @@ }, { "name": "rear1019" + }, + { + "name": "tinyboxvk" } ] } \ No newline at end of file diff --git a/scripts/tools/run_citations_update.py b/scripts/tools/run_citations_update.py index 6aca7407b..e2e619f30 100644 --- a/scripts/tools/run_citations_update.py +++ b/scripts/tools/run_citations_update.py @@ -34,6 +34,7 @@ def parse_contributors(contributors): name_aliases = { 'alesha72003': "Alexey Slokva", + 'Camel Coder': 'Olaf Bernstein', 'dernasherbrezon': "Andrey Rodionov", 'Doug': "Douglas Geiger", 'Doug Geiger': "Douglas Geiger", @@ -50,6 +51,7 @@ def parse_contributors(contributors): 'namccart': "Nicholas McCarthy", 'hcab14': "Christoph Mayer", 'cmayer': "Christoph Mayer", + 'Rick Farina (Zero_Chaos)': 'Rick Farina', 'root': "Philip Balister", 'jsallay': "John Sallay"} From 055b29dabaf1691f6e08e353034f7a6eb909472e Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Mon, 3 Feb 2025 23:12:10 +0100 Subject: [PATCH 40/67] Release 3.2.0 --- .lastrelease | 2 +- CMakeLists.txt | 4 ++-- docs/CHANGELOG.md | 58 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/.lastrelease b/.lastrelease index d95827c3d..6d260c3af 100644 --- a/.lastrelease +++ b/.lastrelease @@ -1 +1 @@ -v3.1.2 +v3.2.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d7291a72..5afb2dc30 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -81,8 +81,8 @@ message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.") ######################################################################## set(VERSION_INFO_MAJOR_VERSION 3) -set(VERSION_INFO_MINOR_VERSION 1) -set(VERSION_INFO_MAINT_VERSION 2) +set(VERSION_INFO_MINOR_VERSION 2) +set(VERSION_INFO_MAINT_VERSION 0) include(VolkVersion) #setup version info math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000 diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 4e7fed12b..52714d771 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -882,3 +882,61 @@ automatically now. - bash negative exit codes are not portable, let's be positive + + +## [3.2.0] - 2025-02-03 + +Hi everyone! + +This is the VOLK v3.2.0 release! We want to thank all contributors. +This release wouldn't have been possible without them. + +Thanks to Olaf Bernstein, VOLK received well optimized RiscV implementations for almost every kernel. +Together with the appropriate CI, this contribution makes VOLK way more powerful on a whole new architecture. + +We started to use gtest as an additional test framework. The current "one kinda test fits all" approach is often insufficient to test kernels where they really should not fail. +Now, this approach should allow us to implement more powerful tests more easily. + +Besides the x86 platform, we see more and more ARM activity. The corresponding kernels can now be tested natively on Linux and MacOS. +This approach is way faster than before with QEMU. A single job runs in ~1min instead of ~12min now. 
+ +### Contributors + +- Doron Behar +- Johannes Demel +- John Sallay +- Magnus Lundmark +- Olaf Bernstein +- Ron Economos +- Sam Lane +- Suleyman Poyraz +- tinyboxvk <13696594+tinyboxvk@users.noreply.github.com> + +### Changes + +- New and improved kernels + - add RISC-V Vector extension (RVV) kernels + - New AVX512F implementation +- Improved and modernized CI + - ci: Add first native Linux ARM runners + - macos: Fix CI dependency error + - appveyor: Update to VS 2022/Python 3.12 + - Update android_build.yml +- Improved builds + - cmake: Fix 64bit host CPU detection + - cmake: Suppress invalid escape sequence warnings with Python 3.12 + - cmake/pkgconfig: use CMAKE_INSTALL_FULL_* variables + - cmake: Fix VOLK as a submodule build issue + - Adds toolchain file for Raspberry Pi 5 +- New and improved tests + - gtest: Start work on new test infrastructure + - tests: Add a log info print test + - gtest: Make gtest an install dependency + - gtest: Enable GTests in CI workflows + - tests: Beautify test output +- Documentation + - cpu_features: Update hints in README +- Code quality + - Add const to several args +- Usability features + - feature: add env variable kernel override From 83a6c66ed12503411d62bbc807b4279c86639bb6 Mon Sep 17 00:00:00 2001 From: Anil Gurses Date: Tue, 25 Feb 2025 20:53:03 -0500 Subject: [PATCH 41/67] Fix github actions for publishing docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Anıl Gürses --- .github/workflows/publish_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml index 9bbf5bbdb..1e7190fd7 100644 --- a/.github/workflows/publish_docs.yml +++ b/.github/workflows/publish_docs.yml @@ -28,7 +28,7 @@ jobs: env: SSH_AUTH_SOCK: /tmp/ssh_agent.sock TARGET_DIR: "${{ github.ref_type }}/${{ github.ref_name }}" - run: 'tar -cz build/html/ | ssh ${{ secrets.SSH_USER }}@${{ secrets.SSH_SERVER }} 
"mkdir -p /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); cd /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); tar --strip-components=2 -xz; rm /www/${{ env.TARGET_DIR }}/live; cd /www/${{ env.TARGET_DIR }}; ln -sf $(date +%Y.%m.%d) live;"' + run: 'tar -cz build/html/ | ssh ${{ secrets.SSH_USER }}@${{ secrets.SSH_SERVER }} "mkdir -p /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); cd /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); tar --strip-components=2 -xzf -; rm -f /www/${{ env.TARGET_DIR }}/live; cd /www/${{ env.TARGET_DIR }}; ln -sf $(date +%Y.%m.%d) live;"' - uses: actions/upload-artifact@v4 with: name: volk_docs From 8bf00210771dc5141c6c5dc1c19820bbfd0915b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20M=C3=BCller?= Date: Tue, 3 Jun 2025 11:18:43 +0200 Subject: [PATCH 42/67] Add tool to compare profiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcus Müller --- scripts/tools/compare_volk_profiles | 69 +++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100755 scripts/tools/compare_volk_profiles diff --git a/scripts/tools/compare_volk_profiles b/scripts/tools/compare_volk_profiles new file mode 100755 index 000000000..6862d9bc7 --- /dev/null +++ b/scripts/tools/compare_volk_profiles @@ -0,0 +1,69 @@ +#!/bin/env python3 +# Copyright 2022, 2025 Marcus Müller +# SPDX-License-Identifier: GPL-3.0 +# Takes in a list of volk profiles, ignores empty and identical lines, prints a table of differences. 
+# Hacky as hell + +from sys import argv + + +def keepline(line: str) -> bool: + if not line: + return False + line = line.strip() + if line.startswith("#"): + return False + return True + + +def kernel(line: str) -> str: + return line.split(" ")[0] + + +def impls(line: str) -> tuple[str, str]: + return tuple(line.strip().split(" ")[1:]) + + +machines = [ + {kernel(line): impls(line) for line in open(f_name) if keepline(line)} + for f_name in argv[1:] +] +kernels = [set(d.keys()) for d in machines] +common_kernels = [ + kernel for kernel in kernels[0] if all((kernel in ks for ks in kernels[1:])) +] + +differing_kernels = dict() +for kernel in common_kernels: + first_impl = machines[0][kernel] + if all(machine[kernel] == first_impl for machine in machines[1:]): + continue + differing_kernels[kernel] = { + argv[idx + 1]: machine[kernel] for idx, machine in enumerate(machines) + } + +max_kernel_len = max(len(kernel) for kernel in common_kernels) +max_impl_len = max( + max(max(len(alignment) for alignment in impl) for impl in kernel.values()) + for kernel in differing_kernels.values() +) + +print( + f"|{'Kernel':<{max_kernel_len}}|" + + "|".join( + f"{fname + ' a':<{max_impl_len}}|{fname + ' u':<{max_impl_len}}" + for fname in argv[1:] + ) + + "|" +) +for kernel, impls in differing_kernels.items(): + print( + f"|{kernel:<{max_kernel_len}}|" + + "|".join( + "|".join( + f"{impl:<{max_impl_len}}" for impl in differing_kernels[kernel][fname] + ) + for fname in argv[1:] + ) + + "|" + ) From 9eae391bdabbdb9e3aabcbe90a7bd46a2907ffb5 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Fri, 4 Jul 2025 16:03:36 +0200 Subject: [PATCH 43/67] readme: Add paragraph to outline dependency policy This should help users and contributors to better understand the different needs/desires. A written statement can be discussed. 
Signed-off-by: Johannes Demel --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6ee3ce3e3..f3ecb60f1 100644 --- a/README.md +++ b/README.md @@ -52,8 +52,8 @@ We use [cpu_features](https://github.com/google/cpu_features) to detect CPU feat Some platforms require a very recent version that is not available through the appropriate package manager. In this case you must use `cpu_features` as a submodule. -**NOTE**: Most package managers provide recent enough `cpu_features` versions by now. -Please default to the provided `cpu_features` version first, and only use the submodule in cases where this fails. +**NOTE**: Most package managers provide recent enough `cpu_features` versions by now. +Please default to the provided `cpu_features` version first, and only use the submodule in cases where this fails. Please open an issue if this is the case. There are two options to get the required code in a submodule: @@ -116,6 +116,14 @@ The same goal applies to different OSes. Although this does only rarely happen, We want to make sure VOLK works with C/C++ standard compliant compilers. Of course, as an open source project we focus on open source compilers, most notably GCC and Clang. We want to make sure VOLK compiles on a wide variety of compilers. Thus, we target AppleClang and MSVC as well. Mind that MSVC lacks `aligned_alloc` support for aligned arrays. We use MSVC specific instructions in this case which cannot be `free`'d with `free`. +### Dependency version policy +Finding the correct way to handle the minimum (and potentially maximum) supported dependency version is a difficult task. +For VOLK, we want to ensure that all widely used, and openly maintained, distributions (Ubuntu, Debian, Fedora, etc.) are supported. +The default version of dependencies in these distributions are considered to be the baseline, or oldest supported versions. 
+While older dependencies might work, we do not want to maintain workarounds etc. for these dependencies. +Also, we want to signal to contributors that they can rely on certain minimum versions for contributions. +If you want to use VOLK on an obsolete distribution, we assume you know what you are doing and you can make the necessary changes to compile VOLK on such a platform, e.g., decrease the minimum version checks and fix corresponding errors. +This approach aims to strike a balance between the desire to use VOLK on every possible platform and the desire to be able to use the latest features. ## License From 0fd2820960921c2e2864c96c2ba24663a7225bdb Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Fri, 4 Jul 2025 16:17:57 +0200 Subject: [PATCH 44/67] ci: Remove obsolete CI, add new CI This PR switches to later Ubuntu versions, and runs the CI with later compilers. Signed-off-by: Johannes Demel --- .github/workflows/run-tests.yml | 20 ++++---------------- CMakeLists.txt | 15 +++++++++++---- apps/CMakeLists.txt | 2 ++ cmake/Modules/VolkAddTest.cmake | 1 + lib/CMakeLists.txt | 1 + 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 8c2b5e411..7e87e357b 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -18,10 +18,6 @@ jobs: fail-fast: false matrix: compiler: - - { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-20.04 } - - { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-22.04-arm } - - { name: g++-10, cc: gcc-10, cxx: g++-10, distro: ubuntu-20.04 } - - { name: g++-10, cc: gcc-10, cxx: g++-10, distro: ubuntu-22.04-arm } - { name: g++-11, cc: gcc-11, cxx: g++-11, distro: ubuntu-22.04 } - { name: g++-11, cc: gcc-11, cxx: g++-11, distro: ubuntu-22.04-arm } - { name: g++-12, cc: gcc-12, cxx: g++-12, distro: ubuntu-22.04 } @@ -30,15 +26,8 @@ jobs: - { name: g++-13, cc: gcc-13, cxx: g++-13, distro: ubuntu-24.04-arm } - { name: g++-14, cc: 
gcc-14, cxx: g++-14, distro: ubuntu-24.04 } - { name: g++-14, cc: gcc-14, cxx: g++-14, distro: ubuntu-24.04-arm } - - { name: clang-10, cc: clang-10, cxx: clang++-10, distro: ubuntu-20.04 } - - { name: clang-11, cc: clang-11, cxx: clang++-11, distro: ubuntu-20.04 } - - { name: clang-11, cc: clang-11, cxx: clang++-11, distro: ubuntu-22.04-arm } - - { name: clang-12, cc: clang-12, cxx: clang++-12, distro: ubuntu-22.04 } - - { name: clang-12, cc: clang-12, cxx: clang++-12, distro: ubuntu-22.04-arm } - - { name: clang-13, cc: clang-13, cxx: clang++-13, distro: ubuntu-22.04 } - - { name: clang-13, cc: clang-13, cxx: clang++-13, distro: ubuntu-22.04-arm } - { name: clang-14, cc: clang-14, cxx: clang++-14, distro: ubuntu-22.04 } - - { name: clang-14, cc: clang-14, cxx: clang++-14, distro: ubuntu-22.04-arm } + # - { name: clang-14, cc: clang-14, cxx: clang++-14, distro: ubuntu-22.04-arm } # possibly broken runner: https://github.com/actions/runner-images/issues/8659 - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-22.04 } - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-22.04-arm } - { name: clang-16, cc: clang-16, cxx: clang++-16, distro: ubuntu-24.04 } @@ -47,6 +36,8 @@ jobs: - { name: clang-17, cc: clang-17, cxx: clang++-17, distro: ubuntu-24.04-arm } - { name: clang-18, cc: clang-18, cxx: clang++-18, distro: ubuntu-24.04 } - { name: clang-18, cc: clang-18, cxx: clang++-18, distro: ubuntu-24.04-arm } + - { name: clang-19, cc: clang-19, cxx: clang++-19, distro: ubuntu-24.04 } + - { name: clang-19, cc: clang-19, cxx: clang++-19, distro: ubuntu-24.04-arm } runs-on: ${{ matrix.compiler.distro }} @@ -89,9 +80,6 @@ jobs: fail-fast: false matrix: include: - - arch: aarch64 - distro: ubuntu20.04 - compiler: { name: clang-9, cc: clang-9, cxx: clang++-9 } - arch: armv7 distro: ubuntu22.04 compiler: { name: g++, cc: gcc, cxx: g++ } @@ -110,7 +98,7 @@ jobs: - uses: actions/checkout@v4 with: submodules: "recursive" - - uses: 
uraimo/run-on-arch-action@v2 + - uses: uraimo/run-on-arch-action@v3 name: Build in non-x86 container id: build with: diff --git a/CMakeLists.txt b/CMakeLists.txt index 5afb2dc30..dffd8efd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ # Project setup ######################################################################## # We use `IS_64BIT now: https://cmake.org/cmake/help/latest/command/cmake_host_system_information.html -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.22) set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} @@ -21,9 +21,6 @@ project(volk) enable_language(CXX) enable_language(C) -set(CMAKE_C_STANDARD 11) -set(CMAKE_CXX_STANDARD 17) - enable_testing() ######################################################################## @@ -35,12 +32,22 @@ include(CheckCXXCompilerFlag) check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE) if(HAVE_CX_LIMITED_RANGE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcx-limited-range") + check_cxx_compiler_flag(-Wno-unused-command-line-argument + HAVE_WNO_UNUSED_CMD_LINE_ARG) + if(HAVE_WNO_UNUSED_CMD_LINE_ARG) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument") + endif(HAVE_WNO_UNUSED_CMD_LINE_ARG) endif(HAVE_CX_LIMITED_RANGE) include(CheckCCompilerFlag) check_c_compiler_flag(-fcx-limited-range HAVE_C_LIMITED_RANGE) if(HAVE_C_LIMITED_RANGE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fcx-limited-range") + check_cxx_compiler_flag(-Wno-unused-command-line-argument + HAVE_WNO_UNUSED_CMD_LINE_ARG) + if(HAVE_WNO_UNUSED_CMD_LINE_ARG) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument") + endif(HAVE_WNO_UNUSED_CMD_LINE_ARG) endif(HAVE_C_LIMITED_RANGE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 98d002af3..34d6e798a 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -21,6 +21,7 @@ add_executable( volk_profile ${CMAKE_CURRENT_SOURCE_DIR}/volk_profile.cc 
${PROJECT_SOURCE_DIR}/lib/qa_utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc) +target_compile_features(volk_profile PUBLIC cxx_std_17) if(MSVC) target_include_directories( @@ -57,6 +58,7 @@ install( # MAKE volk-config-info add_executable(volk-config-info volk-config-info.cc ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc) +target_compile_features(volk-config-info PUBLIC cxx_std_17) if(ENABLE_STATIC_LIBS) target_link_libraries(volk-config-info volk_static) diff --git a/cmake/Modules/VolkAddTest.cmake b/cmake/Modules/VolkAddTest.cmake index 810491983..f4aef6ea3 100644 --- a/cmake/Modules/VolkAddTest.cmake +++ b/cmake/Modules/VolkAddTest.cmake @@ -24,6 +24,7 @@ function(VOLK_GEN_TEST executable_name) "SOURCES;TARGET_DEPS;EXTRA_LIB_DIRS;ENVIRONS;ARGS" ${ARGN}) add_executable(${executable_name} ${VOLK_TEST_SOURCES}) target_link_libraries(${executable_name} ${VOLK_TEST_TARGET_DEPS}) + target_compile_features(${executable_name} PUBLIC cxx_std_17) endfunction() ######################################################################## diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 588db44f1..db3e78d88 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -564,6 +564,7 @@ if(VOLK_CPU_FEATURES) PRIVATE $ ) endif() +target_compile_features(volk_obj PUBLIC c_std_17) #Configure object target properties if(NOT MSVC) From 94e4078029d59e81e91e2f58c3ecae9bce0c2a92 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Fri, 4 Jul 2025 17:05:59 +0200 Subject: [PATCH 45/67] filesystem: Fully rely on std::filesystem In the past, we tried to detect different options for std::filesystem, or even used Boost. Now is a good time to remove all the old code and just rely on the fact that current compilers provide std::filesystem with C++17. 
Signed-off-by: Johannes Demel --- CMakeLists.txt | 6 - apps/CMakeLists.txt | 6 - apps/volk_profile.cc | 20 +-- cmake/Modules/FindFILESYSTEM.cmake | 276 ----------------------------- 4 files changed, 6 insertions(+), 302 deletions(-) delete mode 100644 cmake/Modules/FindFILESYSTEM.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index dffd8efd9..718c9fa6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,12 +180,6 @@ if(NOT MAKO_FOUND) message(FATAL_ERROR "Mako templates required to build VOLK") endif() -# Check if we have std::filesystem -find_package( - FILESYSTEM - COMPONENTS Final Experimental - REQUIRED) - set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 34d6e798a..b1dd48994 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -37,12 +37,6 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_BINARY_DIR} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -add_definitions(-DHAS_STD_FILESYSTEM=1) -if(${find_experimental}) - add_definitions(-DHAS_STD_FILESYSTEM_EXPERIMENTAL=1) -endif() -target_link_libraries(volk_profile PRIVATE std::filesystem) - if(ENABLE_STATIC_LIBS) target_link_libraries(volk_profile PRIVATE volk_static) set_target_properties(volk_profile PROPERTIES LINK_FLAGS "-static") diff --git a/apps/volk_profile.cc b/apps/volk_profile.cc index 6392aa790..306e9a9e2 100644 --- a/apps/volk_profile.cc +++ b/apps/volk_profile.cc @@ -7,19 +7,15 @@ * SPDX-License-Identifier: LGPL-3.0-or-later */ -#if HAS_STD_FILESYSTEM_EXPERIMENTAL -#include -#else -#include -#endif #include // for size_t #include // for stat #include // for volk_get_config_path -#include // IWYU pragma: keep -#include // for operator<<, basic_ostream -#include // for map, map<>::iterator -#include // for pair -#include // for vector, vector<>::const_... 
+#include +#include // IWYU pragma: keep +#include // for operator<<, basic_ostream +#include // for map, map<>::iterator +#include // for pair +#include // for vector, vector<>::const_... #include "kernel_tests.h" // for init_test_list #include "qa_utils.h" // for volk_test_results_t, vol... @@ -27,11 +23,7 @@ #include "volk_option_helpers.h" // for option_list, option_t #include "volk_profile.h" -#if HAS_STD_FILESYSTEM_EXPERIMENTAL -namespace fs = std::experimental::filesystem; -#else namespace fs = std::filesystem; -#endif volk_test_params_t test_params(1e-6f, 327.f, 131071, 1987, false, ""); diff --git a/cmake/Modules/FindFILESYSTEM.cmake b/cmake/Modules/FindFILESYSTEM.cmake deleted file mode 100644 index 874f6bc28..000000000 --- a/cmake/Modules/FindFILESYSTEM.cmake +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright 2019 Free Software Foundation, Inc. -# -# This file is part of VOLK. -# -# SPDX-License-Identifier: LGPL-3.0-or-later -# - -# Original code from https://github.com/vector-of-bool/CMakeCM and modified -# by C. Fernandez. The original code is distributed under the OSI-approved -# BSD 3-Clause License. See https://cmake.org/licensing for details. - -#[=======================================================================[.rst: - -FindFILESYSTEM -############## - -This module supports the C++17 standard library's filesystem utilities. Use the -:imp-target:`std::filesystem` imported target to - -Options -******* - -The ``COMPONENTS`` argument to this module supports the following values: - -.. find-component:: Experimental - :name: fs.Experimental - - Allows the module to find the "experimental" Filesystem TS version of the - Filesystem library. This is the library that should be used with the - ``std::experimental::filesystem`` namespace. - -.. find-component:: Final - :name: fs.Final - - Finds the final C++17 standard version of the filesystem library. - -If no components are provided, behaves as if the -:find-component:`fs.Final` component was specified. 
- -If both :find-component:`fs.Experimental` and :find-component:`fs.Final` are -provided, first looks for ``Final``, and falls back to ``Experimental`` in case -of failure. If ``Final`` is found, :imp-target:`std::filesystem` and all -:ref:`variables ` will refer to the ``Final`` version. - - -Imported Targets -**************** - -.. imp-target:: std::filesystem - - The ``std::filesystem`` imported target is defined when any requested - version of the C++ filesystem library has been found, whether it is - *Experimental* or *Final*. - - If no version of the filesystem library is available, this target will not - be defined. - - .. note:: - This target has ``cxx_std_17`` as an ``INTERFACE`` - :ref:`compile language standard feature `. Linking - to this target will automatically enable C++17 if no later standard - version is already required on the linking target. - - -.. _fs.variables: - -Variables -********* - -.. variable:: CXX_FILESYSTEM_IS_EXPERIMENTAL - - Set to ``TRUE`` when the :find-component:`fs.Experimental` version of C++ - filesystem library was found, otherwise ``FALSE``. - -.. variable:: CXX_FILESYSTEM_HAVE_FS - - Set to ``TRUE`` when a filesystem header was found. - -.. variable:: CXX_FILESYSTEM_HEADER - - Set to either ``filesystem`` or ``experimental/filesystem`` depending on - whether :find-component:`fs.Final` or :find-component:`fs.Experimental` was - found. - -.. variable:: CXX_FILESYSTEM_NAMESPACE - - Set to either ``std::filesystem`` or ``std::experimental::filesystem`` - depending on whether :find-component:`fs.Final` or - :find-component:`fs.Experimental` was found. - - -Examples -******** - -Using `find_package(FILESYSTEM)` with no component arguments: - -.. 
code-block:: cmake - - find_package(FILESYSTEM REQUIRED) - - add_executable(my-program main.cpp) - target_link_libraries(my-program PRIVATE std::filesystem) - - -#]=======================================================================] - -if(TARGET std::filesystem) - # This module has already been processed. Don't do it again. - return() -endif() - -include(CMakePushCheckState) -include(CheckIncludeFileCXX) -include(CheckCXXSourceCompiles) - -cmake_push_check_state() - -set(CMAKE_REQUIRED_QUIET ${FILESYSTEM_FIND_QUIETLY}) - -# All of our tests require C++17 or later -set(OLD_CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -set(CMAKE_CXX_STANDARD 17) -if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER - "8.0.0")) - if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.99") - set(UNDEFINED_BEHAVIOR_WITHOUT_LINKING TRUE) - endif() - set(CMAKE_REQUIRED_FLAGS "-std=c++17") -endif() -if((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") AND NOT (CMAKE_CXX_COMPILER_VERSION - VERSION_LESS "8.99")) - set(CMAKE_REQUIRED_FLAGS "-std=c++17") -endif() -if((CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") AND NOT (CMAKE_CXX_COMPILER_VERSION - VERSION_LESS "11")) - set(CMAKE_REQUIRED_FLAGS "-std=c++17") -endif() -if(MSVC AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "18")) - set(CMAKE_REQUIRED_FLAGS "/std:c++17") -endif() - -# Normalize and check the component list we were given -set(want_components ${FILESYSTEM_FIND_COMPONENTS}) -if(FILESYSTEM_FIND_COMPONENTS STREQUAL "") - set(want_components Final) -endif() - -# Warn on any unrecognized components -set(extra_components ${want_components}) -list(REMOVE_ITEM extra_components Final Experimental) -foreach(component IN LISTS extra_components) - message(WARNING "Extraneous find_package component for FILESYSTEM: ${component}") -endforeach() - -# Detect which of Experimental and Final we should look for -set(find_experimental TRUE) -set(find_final TRUE) -if(NOT "Final" IN_LIST want_components) - set(find_final FALSE) 
-endif() -if(NOT "Experimental" IN_LIST want_components) - set(find_experimental FALSE) -endif() - -if(find_final) - check_include_file_cxx("filesystem" _CXX_FILESYSTEM_HAVE_HEADER) - mark_as_advanced(_CXX_FILESYSTEM_HAVE_HEADER) - if(_CXX_FILESYSTEM_HAVE_HEADER) - # We found the non-experimental header. Don't bother looking for the - # experimental one. - set(find_experimental FALSE) - endif() -else() - set(_CXX_FILESYSTEM_HAVE_HEADER FALSE) -endif() - -if(find_experimental) - check_include_file_cxx("experimental/filesystem" - _CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER) - mark_as_advanced(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER) -else() - set(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER FALSE) -endif() - -if(_CXX_FILESYSTEM_HAVE_HEADER) - set(_have_fs TRUE) - set(_fs_header filesystem) - set(_fs_namespace std::filesystem) -elseif(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER) - set(_have_fs TRUE) - set(_fs_header experimental/filesystem) - set(_fs_namespace std::experimental::filesystem) -else() - set(_have_fs FALSE) -endif() - -set(CXX_FILESYSTEM_HAVE_FS - ${_have_fs} - CACHE BOOL "TRUE if we have the C++ filesystem headers") -set(CXX_FILESYSTEM_HEADER - ${_fs_header} - CACHE STRING "The header that should be included to obtain the filesystem APIs") -set(CXX_FILESYSTEM_NAMESPACE - ${_fs_namespace} - CACHE STRING "The C++ namespace that contains the filesystem APIs") - -set(_found FALSE) - -if(CXX_FILESYSTEM_HAVE_FS) - # We have some filesystem library available. 
Do link checks - string( - CONFIGURE - [[ - #include <@CXX_FILESYSTEM_HEADER@> - - int main() { - auto cwd = @CXX_FILESYSTEM_NAMESPACE@::current_path(); - return static_cast(cwd.string().size()); - } - ]] - code - @ONLY) - - # Try to compile a simple filesystem program without any linker flags - if(NOT UNDEFINED_BEHAVIOR_WITHOUT_LINKING) - check_cxx_source_compiles("${code}" CXX_FILESYSTEM_NO_LINK_NEEDED) - set(can_link ${CXX_FILESYSTEM_NO_LINK_NEEDED}) - endif() - - if(NOT CXX_FILESYSTEM_NO_LINK_NEEDED) - set(prev_libraries ${CMAKE_REQUIRED_LIBRARIES}) - set(CMAKE_REQUIRED_FLAGS "-std=c++17") - # Add the libstdc++ flag - set(CMAKE_REQUIRED_LIBRARIES ${prev_libraries} -lstdc++fs) - check_cxx_source_compiles("${code}" CXX_FILESYSTEM_STDCPPFS_NEEDED) - set(can_link ${CXX_FILESYSTEM_STDCPPFS_NEEDED}) - if(NOT CXX_FILESYSTEM_STDCPPFS_NEEDED) - # Try the libc++ flag - set(CMAKE_REQUIRED_LIBRARIES ${prev_libraries} -lc++fs) - check_cxx_source_compiles("${code}" CXX_FILESYSTEM_CPPFS_NEEDED) - set(can_link ${CXX_FILESYSTEM_CPPFS_NEEDED}) - endif() - endif() - - if(can_link) - if(CMAKE_VERSION VERSION_LESS 3.12) - add_library(std::filesystem INTERFACE IMPORTED GLOBAL) - else() - add_library(std::filesystem INTERFACE IMPORTED) - target_compile_features(std::filesystem INTERFACE cxx_std_17) - endif() - set(_found TRUE) - - if(CXX_FILESYSTEM_NO_LINK_NEEDED) - # Nothing to add... 
- elseif(CXX_FILESYSTEM_STDCPPFS_NEEDED) - target_link_libraries(std::filesystem INTERFACE -lstdc++fs) - elseif(CXX_FILESYSTEM_CPPFS_NEEDED) - target_link_libraries(std::filesystem INTERFACE -lc++fs) - endif() - endif() -endif() - -if(NOT ${_found}) - set(CMAKE_CXX_STANDARD ${OLD_CMAKE_CXX_STANDARD}) -endif() - -cmake_pop_check_state() - -set(FILESYSTEM_FOUND - ${_found} - CACHE BOOL "TRUE if we can compile and link a program using std::filesystem" FORCE) - -if(FILESYSTEM_FIND_REQUIRED AND NOT FILESYSTEM_FOUND) - message(FATAL_ERROR "Cannot compile a simple program using std::filesystem") -endif() From bd34c7cffefbf02ea3a18e10b984ebc68ea934a7 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sat, 5 Jul 2025 16:37:36 +0200 Subject: [PATCH 46/67] ci: Add Werror flag to CI for C compilation We only used the Werror flag for C++. If we use the Werror flag, we should mostly use it for our core. Signed-off-by: Johannes Demel --- .github/workflows/run-tests.yml | 8 ++++---- CMakeLists.txt | 13 +++++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 7e87e357b..c4496a3c7 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -51,7 +51,7 @@ jobs: env: CC: ${{ matrix.compiler.cc }} CXX: ${{ matrix.compiler.cxx }} - run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. + run: mkdir build && cd build && cmake -DCMAKE_C_FLAGS="-Werror" -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. - name: Build run: | echo "Build with $(nproc) thread(s)" @@ -135,7 +135,7 @@ jobs: run: | cd /volk cd build - cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ${{ matrix.cmakeargs }} .. + cmake -DCMAKE_C_FLAGS="-Werror" -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON ${{ matrix.cmakeargs }} .. 
echo "Build with $(nproc) thread(s)" make -j$(nproc) if [ -f ./cpu_features/list_cpu_features ]; then @@ -159,7 +159,7 @@ jobs: - name: dependencies run: sudo apt install python3-mako liborc-dev libgtest-dev libfmt-dev - name: configure - run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DBUILD_EXECUTABLE=ON .. + run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True -DCMAKE_C_FLAGS="-Werror" -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. - name: build run: cmake --build build -j$(nproc) - name: Print info @@ -236,7 +236,7 @@ jobs: - name: dependencies run: pip3 install --break-system-packages mako && brew install orc - name: configure - run: mkdir build && cd build && cmake -DBUILD_EXECUTABLE=ON .. + run: mkdir build && cd build && cmake -DCMAKE_C_FLAGS="-Werror" -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. - name: build run: cmake --build build --config Debug -j4 - name: Print info diff --git a/CMakeLists.txt b/CMakeLists.txt index 718c9fa6f..b4d5f21ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") endif() list(INSERT CMAKE_MODULE_PATH 0 ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules -)#location for custom "Modules" +) #location for custom "Modules" include(VolkBuildTypes) #select the release build type by default to get optimization flags @@ -114,7 +114,7 @@ set(CROSSCOMPILE_MULTILIB if(MSVC) add_definitions(-D_USE_MATH_DEFINES - )#enables math constants on all supported versions of MSVC + ) #enables math constants on all supported versions of MSVC add_compile_options(/W1) #reduce warnings add_compile_options(/wo4309) add_compile_options(/wd4752) @@ -155,11 +155,20 @@ if(VOLK_CPU_FEATURES) set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "Build cpu_features with Position Independent Code (PIC)." 
FORCE) + include(CheckCCompilerFlag) + set(CMAKE_C_FLAGS_SAVED "${CMAKE_C_FLAGS}") + check_c_compiler_flag(-Wno-unused-function HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) + if(HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") + endif(HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) + + set(BUILD_SHARED_LIBS OFF) set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}") set(BUILD_SHARED_LIBS OFF) set(ENABLE_INSTALL OFF) add_subdirectory(cpu_features) set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS_SAVED}") endif() else() message(STATUS "Building Volk without cpu_features") From 7aa0e3ddf0c0043f2207f499b284160847f358d1 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sat, 5 Jul 2025 16:40:48 +0200 Subject: [PATCH 47/67] cx-limited-range: Scope compile feature We only want to use this flag in necessary compilation units. All other units should build without this option. Further, this fixes issues with the latest Clang compiler versions. 
Signed-off-by: Johannes Demel --- CMakeLists.txt | 21 --------------------- lib/CMakeLists.txt | 19 ++++++++++++++----- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b4d5f21ba..9253694ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,27 +28,6 @@ enable_testing() ######################################################################## # Disable complex math NaN/INFO range checking for performance -include(CheckCXXCompilerFlag) -check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE) -if(HAVE_CX_LIMITED_RANGE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcx-limited-range") - check_cxx_compiler_flag(-Wno-unused-command-line-argument - HAVE_WNO_UNUSED_CMD_LINE_ARG) - if(HAVE_WNO_UNUSED_CMD_LINE_ARG) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument") - endif(HAVE_WNO_UNUSED_CMD_LINE_ARG) -endif(HAVE_CX_LIMITED_RANGE) - -include(CheckCCompilerFlag) -check_c_compiler_flag(-fcx-limited-range HAVE_C_LIMITED_RANGE) -if(HAVE_C_LIMITED_RANGE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fcx-limited-range") - check_cxx_compiler_flag(-Wno-unused-command-line-argument - HAVE_WNO_UNUSED_CMD_LINE_ARG) - if(HAVE_WNO_UNUSED_CMD_LINE_ARG) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument") - endif(HAVE_WNO_UNUSED_CMD_LINE_ARG) -endif(HAVE_C_LIMITED_RANGE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index db3e78d88..ababae933 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -93,11 +93,8 @@ execute_process( OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE) -try_compile( - HAVE_RVV_INTRINSICS - ${CMAKE_BINARY_DIR} - ${CMAKE_SOURCE_DIR}/cmake/Checks/check-rvv-intrinsics.c -) +try_compile(HAVE_RVV_INTRINSICS ${CMAKE_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/cmake/Checks/check-rvv-intrinsics.c) if(HAVE_RVV_INTRINSICS) message(STATUS "Checking RVV 
intrinsics - found") else() @@ -571,6 +568,18 @@ if(NOT MSVC) set_target_properties(volk_obj PROPERTIES COMPILE_FLAGS "-fPIC") endif() +# Disable complex math NaN/INFO range checking for performance +include(CheckCCompilerFlag) +check_c_compiler_flag(-fcx-limited-range HAVE_C_LIMITED_RANGE) +if(HAVE_C_LIMITED_RANGE) + set_target_properties(volk_obj PROPERTIES COMPILE_FLAGS "-fcx-limited-range") + check_c_compiler_flag(-Wno-unused-command-line-argument HAVE_WNO_UNUSED_CMD_LINE_ARG) + if(HAVE_WNO_UNUSED_CMD_LINE_ARG) + set_target_properties(volk_obj PROPERTIES COMPILE_FLAGS + "-Wno-unused-command-line-argument") + endif(HAVE_WNO_UNUSED_CMD_LINE_ARG) +endif(HAVE_C_LIMITED_RANGE) + #Add dynamic library # #NOTE: The PUBLIC include directories and library linkage will be From 6e8c86cd9da0bd1e04a0bed80634604b105bb847 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sun, 6 Jul 2025 19:06:37 +0200 Subject: [PATCH 48/67] cpu_features: Start using packaged version in CI Most distros provide a cpu_features package now. On most platforms all the required features are available. Let's use these features. 
Signed-off-by: Johannes Demel --- .github/workflows/run-tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index c4496a3c7..10cf81a55 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -58,7 +58,9 @@ jobs: cmake --build build -j$(nproc) - name: Print info run: | - ./build/cpu_features/list_cpu_features + if [ -f ./build/cpu_features/list_cpu_features ]; then + ./build/cpu_features/list_cpu_features + fi ./build/apps/volk-config-info --alignment ./build/apps/volk-config-info --avail-machines ./build/apps/volk-config-info --all-machines From fc5b7aae915902dea7d0965194777d4a7d09aee1 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sun, 6 Jul 2025 20:15:48 +0200 Subject: [PATCH 49/67] ppc64le: Suppress unused variable warning The cpu_features build does not use some variables for the ppc64le build. This errors with the Werror flag. This commit suppresses this error. Signed-off-by: Johannes Demel --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9253694ce..3281a9c06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,7 +138,7 @@ if(VOLK_CPU_FEATURES) set(CMAKE_C_FLAGS_SAVED "${CMAKE_C_FLAGS}") check_c_compiler_flag(-Wno-unused-function HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) if(HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable") endif(HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) set(BUILD_SHARED_LIBS OFF) From f314ca91adbd9e71a61b4a2c6087bba5b7a5b060 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sun, 6 Jul 2025 21:21:32 +0200 Subject: [PATCH 50/67] cmake: Fix auto-format We use cmake-format to have consistent CMake files. 
Signed-off-by: Johannes Demel --- CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3281a9c06..91a6571f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") endif() list(INSERT CMAKE_MODULE_PATH 0 ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules -) #location for custom "Modules" +)#location for custom "Modules" include(VolkBuildTypes) #select the release build type by default to get optimization flags @@ -93,7 +93,7 @@ set(CROSSCOMPILE_MULTILIB if(MSVC) add_definitions(-D_USE_MATH_DEFINES - ) #enables math constants on all supported versions of MSVC + )#enables math constants on all supported versions of MSVC add_compile_options(/W1) #reduce warnings add_compile_options(/wo4309) add_compile_options(/wd4752) @@ -138,7 +138,8 @@ if(VOLK_CPU_FEATURES) set(CMAKE_C_FLAGS_SAVED "${CMAKE_C_FLAGS}") check_c_compiler_flag(-Wno-unused-function HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) if(HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable") + set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS} -Wno-unused-function -Wno-unused-variable") endif(HAVE_WNO_UNUSED_FUNCTION_CMD_LINE_ARG) set(BUILD_SHARED_LIBS OFF) From 8d04148dccd412a2044c2da945f83fd00147e193 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sun, 6 Jul 2025 21:22:25 +0200 Subject: [PATCH 51/67] neon: Fix compile check The `check_c_source_compiles` code to test if NEON is supported was broken. Let's add flags and formatting to make it work again. 
Signed-off-by: Johannes Demel --- lib/CMakeLists.txt | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index ababae933..7af373424 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -228,18 +228,17 @@ endif(NOT CPU_IS_x86) include(CheckCSourceCompiles) -check_c_source_compiles( - "#include \nint main(){ uint8_t *dest; uint8x8_t res; vst1_u8(dest, res); }" - neon_compile_result) +set(CMAKE_C_FLAGS_SAVED "${CMAKE_C_FLAGS}") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-uninitialized") +check_c_source_compiles("#include +int main(){ uint8_t *dest; uint8x8_t res; vst1_u8(dest, res); }" neon_compile_result) if(neon_compile_result) set(CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/include) - check_c_source_compiles( - "#include \n int main(){__VOLK_ASM(\"vrev32.8 q0, q0\");}" - have_neonv7_result) - check_c_source_compiles( - "#include \n int main(){__VOLK_ASM(\"sub v1.4s,v1.4s,v1.4s\");}" - have_neonv8_result) + check_c_source_compiles("#include + int main(){__VOLK_ASM(\"vrev32.8 q0, q0\");}" have_neonv7_result) + check_c_source_compiles("#include + int main(){__VOLK_ASM(\"sub v1.4s,v1.4s,v1.4s\");}" have_neonv8_result) if(NOT have_neonv7_result) overrule_arch(neonv7 "Compiler doesn't support neonv7") @@ -253,7 +252,7 @@ else(neon_compile_result) overrule_arch(neonv7 "Compiler doesn't support NEON") overrule_arch(neonv8 "Compiler doesn't support NEON") endif(neon_compile_result) - +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS_SAVED}") ######################################################################## # implement overruling in the ORC case, # since ORC always passes flag detection From 45963f907edfa1a88cf2713f0e9ff9b259d02fc6 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sun, 6 Jul 2025 22:02:35 +0200 Subject: [PATCH 52/67] 16u_bytswap: Update code style Move loop variable declaration into loop. Fix `maybe-uninitialized` warning by initializing the corresponding variable. 
Signed-off-by: Johannes Demel --- kernels/volk/volk_16u_byteswap.h | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/kernels/volk/volk_16u_byteswap.h b/kernels/volk/volk_16u_byteswap.h index 50e59906b..7519b782d 100644 --- a/kernels/volk/volk_16u_byteswap.h +++ b/kernels/volk/volk_16u_byteswap.h @@ -41,7 +41,6 @@ #define INCLUDED_volk_16u_byteswap_u_H #include -#include #ifdef LV_HAVE_GENERIC @@ -63,8 +62,6 @@ static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap, #include static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points) { - unsigned int number; - const unsigned int nPerSet = 16; const uint64_t nSets = num_points / nPerSet; @@ -76,7 +73,7 @@ static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int n const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); - for (number = 0; number < nSets; number++) { + for (unsigned int number = 0; number < nSets; number++) { // Load the 32t values, increment inputPtr later since we're doing it in-place. 
const __m256i input = _mm256_load_si256((__m256i*)inputPtr); const __m256i output = _mm256_shuffle_epi8(input, myShuffle); @@ -87,7 +84,7 @@ static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int n } // Byteswap any remaining points: - for (number = nPerSet * nSets; number < num_points; number++) { + for (unsigned int number = nPerSet * nSets; number < num_points; number++) { uint16_t outputVal = *inputPtr; outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); *inputPtr = outputVal; @@ -101,8 +98,6 @@ static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int n #include static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points) { - unsigned int number; - const unsigned int nPerSet = 16; const uint64_t nSets = num_points / nPerSet; @@ -114,7 +109,7 @@ static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int n const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]); - for (number = 0; number < nSets; number++) { + for (unsigned int number = 0; number < nSets; number++) { // Load the 32t values, increment inputPtr later since we're doing it in-place. 
const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr); const __m256i output = _mm256_shuffle_epi8(input, myShuffle); @@ -125,7 +120,7 @@ static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int n } // Byteswap any remaining points: - for (number = nPerSet * nSets; number < num_points; number++) { + for (unsigned int number = nPerSet * nSets; number < num_points; number++) { uint16_t outputVal = *inputPtr; outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); *inputPtr = outputVal; @@ -140,12 +135,11 @@ static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int n static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points) { - unsigned int number = 0; uint16_t* inputPtr = intsToSwap; __m128i input, left, right, output; const unsigned int eighthPoints = num_points / 8; - for (; number < eighthPoints; number++) { + for (unsigned int number = 0; number < eighthPoints; number++) { // Load the 16t values, increment inputPtr later since we're doing it in-place. 
input = _mm_loadu_si128((__m128i*)inputPtr); // Do the two shifts @@ -159,8 +153,7 @@ static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int n } // Byteswap any remaining points: - number = eighthPoints * 8; - for (; number < num_points; number++) { + for (unsigned int number = eighthPoints * 8; number < num_points; number++) { uint16_t outputVal = *inputPtr; outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); *inputPtr = outputVal; @@ -175,7 +168,6 @@ static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int n #define INCLUDED_volk_16u_byteswap_a_H #include -#include #ifdef LV_HAVE_SSE2 #include @@ -209,12 +201,12 @@ static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int n static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points) { - unsigned int number; unsigned int eighth_points = num_points / 8; - uint16x8_t input, output; + uint16x8_t input; + uint16x8_t output = { 0, 0, 0, 0, 0, 0, 0, 0 }; uint16_t* inputPtr = intsToSwap; - for (number = 0; number < eighth_points; number++) { + for (unsigned int number = 0; number < eighth_points; number++) { input = vld1q_u16(inputPtr); output = vsriq_n_u16(output, input, 8); output = vsliq_n_u16(output, input, 8); @@ -233,7 +225,6 @@ static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, unsigned int num_points) { uint16_t* inputPtr = intsToSwap; - unsigned int number = 0; unsigned int n16points = num_points / 16; uint8x8x4_t input_table; @@ -253,7 +244,7 @@ static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap, int_lookup45 = vcreate_u8(1521377802851189772); int_lookup67 = vcreate_u8(1666058148527343118); - for (number = 0; number < n16points; ++number) { + for (unsigned int number = 0; number < n16points; ++number) { input_table = vld4_u8((uint8_t*)inputPtr); swapped_int01 = vtbl4_u8(input_table, int_lookup01); swapped_int23 = vtbl4_u8(input_table, int_lookup23); From 
5d1e0f3b40ba5035a51700f28d70c558c558a1e0 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sat, 9 Aug 2025 16:27:33 +0200 Subject: [PATCH 53/67] cmake: Update to modern PIC enablement Instead of adding a `-fPIC` flag, we can use more modern CMake features to enable `POSITION INDEPENDENT CODE` (PIC). Signed-off-by: Johannes Demel --- lib/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 7af373424..1975ed85b 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -563,9 +563,7 @@ endif() target_compile_features(volk_obj PUBLIC c_std_17) #Configure object target properties -if(NOT MSVC) - set_target_properties(volk_obj PROPERTIES COMPILE_FLAGS "-fPIC") -endif() +set_target_properties(volk_obj PROPERTIES POSITION_INDEPENDENT_CODE ON) # Disable complex math NaN/INFO range checking for performance include(CheckCCompilerFlag) From 461bc23c57103bc9bd8257fb601e18857f998870 Mon Sep 17 00:00:00 2001 From: Johannes Demel Date: Sat, 9 Aug 2025 18:11:17 +0200 Subject: [PATCH 54/67] rotator: Add specialized gtest suite This test suite should enable us to test the rotator more specifically. 
Signed-off-by: Johannes Demel --- .../test_volk_32fc_s32fc_x2_rotator2_32fc.cc | 151 ++++++++++++++++++ tests/volk_test.h | 78 ++++++++- 2 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 tests/test_volk_32fc_s32fc_x2_rotator2_32fc.cc diff --git a/tests/test_volk_32fc_s32fc_x2_rotator2_32fc.cc b/tests/test_volk_32fc_s32fc_x2_rotator2_32fc.cc new file mode 100644 index 000000000..f259d425f --- /dev/null +++ b/tests/test_volk_32fc_s32fc_x2_rotator2_32fc.cc @@ -0,0 +1,151 @@ +/* -*- c++ -*- */ +/* + * Copyright 2025 Johannes Demel + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +#include "volk_test.h" +#include +#include +#include +#include +#include +#include +#include + +class volk_32fc_s32fc_x2_rotator2_32fc_test : public VolkTest +{ +protected: + void SetUp() override + { + initialize_test(GetParam()); + initialize_data(vector_length); + } + + void initialize_data(const size_t length) + { + // Be stricter for smaller vectors. Error accumulate slowly! 
+ if (length < 16) { + absolute_error = 10.e-7; + } else if (length < 128) { + absolute_error = 10.e-6; + } else if (length < 65536) { + absolute_error = 10.e-5; + } else { + absolute_error = 10.e-3; + } + + vector_length = length; + input = volk::vector(length); + result = volk::vector(length); + result_magnitude = volk::vector(length); + + const float initial_phase = initial_phase_steps * increment; + phase_increment = std::polar(1.0f, increment); + phase = std::polar(1.0f, initial_phase); + + for (size_t i = 0; i < length; ++i) { + input[i] = + std::complex(2.0f * std::cos(2.0f * M_PI * i / length), + 2.0f * std::sin(0.3f + 2.0f * M_PI * i / length)); + } + + // Calculate expected results + expected = volk::vector(length); + for (size_t i = 0; i < length; ++i) { + expected[i] = + input[i] * + std::polar(1.0f, initial_phase + static_cast(i) * increment); + } + + expected_magnitude = volk::vector(length); + for (size_t i = 0; i < length; ++i) { + expected_magnitude[i] = std::abs(input[i]); + } + + // This is a hacky solution to have unaligned tests. 
+ ua_result = result; + ua_result.at(0) = expected.at(0); + } + + void execute_aligned(const std::string impl_name) + { + volk_32fc_s32fc_x2_rotator2_32fc_manual(result.data(), + input.data(), + &phase_increment, + &phase, + vector_length, + impl_name.c_str()); + + for (size_t i = 0; i < vector_length; ++i) { + result_magnitude[i] = std::abs(result[i]); + } + EXPECT_TRUE(AreFloatingPointArraysEqualWithAbsoluteError( + expected_magnitude, result_magnitude, absolute_magnitue_error)); + EXPECT_TRUE(AreComplexFloatingPointArraysEqualWithAbsoluteError( + expected, result, absolute_error)); + } + + void execute_unaligned(const std::string impl_name) + { + lv_32fc_t unaligned_phase = + std::polar(1.0f, (initial_phase_steps + 1.0f) * increment); + volk_32fc_s32fc_x2_rotator2_32fc_manual(ua_result.data() + 1, + input.data() + 1, + &phase_increment, + &unaligned_phase, + vector_length - 1, + impl_name.c_str()); + for (size_t i = 0; i < vector_length; ++i) { + result_magnitude[i] = std::abs(ua_result[i]); + } + result_magnitude[0] = expected_magnitude[0]; + + EXPECT_TRUE(AreFloatingPointArraysEqualWithAbsoluteError( + expected_magnitude, result_magnitude, absolute_magnitue_error)); + EXPECT_TRUE(AreComplexFloatingPointArraysEqualWithAbsoluteError( + expected, ua_result, absolute_error)); + } + + static constexpr float increment = 0.07f; + static constexpr float initial_phase_steps = 0.0f; + static constexpr float absolute_magnitue_error = 1.0e-4; + float absolute_error{}; + volk::vector input; + volk::vector result; + lv_32fc_t phase_increment; + lv_32fc_t phase; + volk::vector expected; + volk::vector expected_magnitude; + volk::vector ua_result; + volk::vector result_magnitude; +}; + +TEST_P(volk_32fc_s32fc_x2_rotator2_32fc_test, run) +{ + fmt::print("test {} implementation: {:>12}, size={} ...", + is_aligned_implementation ? 
"aligned" : "unaligned", + implementation_name, + vector_length); + auto start = std::chrono::steady_clock::now(); + + if (is_aligned_implementation) { + execute_aligned(implementation_name); + } else { + execute_unaligned(implementation_name); + } + + std::chrono::duration elapsed = std::chrono::steady_clock::now() - start; + fmt::print("\tduration={}\n", elapsed); +} + +INSTANTIATE_TEST_SUITE_P( + volk_32fc_s32fc_x2_rotator2_32fc, + volk_32fc_s32fc_x2_rotator2_32fc_test, + testing::Combine(testing::ValuesIn(get_kernel_implementation_name_list( + volk_32fc_s32fc_x2_rotator2_32fc_get_func_desc())), + testing::ValuesIn(default_vector_sizes)), + generate_volk_test_name()); diff --git a/tests/volk_test.h b/tests/volk_test.h index ebc2e3237..1298bef3f 100644 --- a/tests/volk_test.h +++ b/tests/volk_test.h @@ -70,9 +70,83 @@ ::testing::AssertionResult AreComplexFloatingPointArraysAlmostEqual(const T& exp auto actual_real = ::testing::internal::FloatingPoint(actual[index].real()); auto actual_imag = ::testing::internal::FloatingPoint(actual[index].imag()); if (not expected_real.AlmostEquals(actual_real) or - not expected_imag.AlmostEquals(actual_imag)) + not expected_imag.AlmostEquals(actual_imag)) { + if (errorsFound == 0) { + result << "Differences found:"; + } + if (errorsFound < 3) { + result << separator << expected[index] << " != " << actual[index] << " @ " + << index; + separator = ",\n"; + } + errorsFound++; + } + } + if (errorsFound > 0) { + result << separator << errorsFound << " differences in total"; + return result; + } + return ::testing::AssertionSuccess(); +} + +template +::testing::AssertionResult AreComplexFloatingPointArraysEqualWithAbsoluteError( + const T& expected, const T& actual, const float absolute_error = 1.0e-7) +{ + ::testing::AssertionResult result = ::testing::AssertionFailure(); + if (expected.size() != actual.size()) { + return result << "expected result size=" << expected.size() + << " differs from actual size=" << actual.size(); + } + 
const unsigned long length = expected.size(); - { + int errorsFound = 0; + const char* separator = " "; + for (unsigned long index = 0; index < length; index++) { + auto expected_real = ::testing::internal::FloatingPoint(expected[index].real()); + auto expected_imag = ::testing::internal::FloatingPoint(expected[index].imag()); + auto actual_real = ::testing::internal::FloatingPoint(actual[index].real()); + auto actual_imag = ::testing::internal::FloatingPoint(actual[index].imag()); + if (expected_real.is_nan() or actual_real.is_nan() or expected_imag.is_nan() or + actual_imag.is_nan() or + std::abs(expected[index].real() - actual[index].real()) > absolute_error or + std::abs(expected[index].imag() - actual[index].imag()) > absolute_error) { + if (errorsFound == 0) { + result << "Differences found:"; + } + if (errorsFound < 3) { + result << separator << expected[index] << " != " << actual[index] << " @ " + << index; + separator = ",\n"; + } + errorsFound++; + } + } + if (errorsFound > 0) { + result << separator << errorsFound << " differences in total"; + return result; + } + return ::testing::AssertionSuccess(); +} + +template +::testing::AssertionResult AreFloatingPointArraysEqualWithAbsoluteError( + const T& expected, const T& actual, const float absolute_error = 1.0e-7) +{ + ::testing::AssertionResult result = ::testing::AssertionFailure(); + if (expected.size() != actual.size()) { + return result << "expected result size=" << expected.size() + << " differs from actual size=" << actual.size(); + } + const unsigned long length = expected.size(); + + int errorsFound = 0; + const char* separator = " "; + for (unsigned long index = 0; index < length; index++) { + auto expected_value = ::testing::internal::FloatingPoint(expected[index]); + auto actual_value = ::testing::internal::FloatingPoint(actual[index]); + if (expected_value.is_nan() or actual_value.is_nan() or + std::abs(expected[index] - actual[index]) > absolute_error) { if (errorsFound == 0) { result << 
"Differences found:"; } From 584fc49a64bfa09e8cf7eb5c1c144d8ad31809cf Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 16:47:34 +0100 Subject: [PATCH 55/67] Add edge cases and tests to atan2/atan Signed-off-by: Magnus Lundmark --- include/volk/volk_common.h | 40 ++- kernels/volk/volk_32f_atan_32f.h | 4 + kernels/volk/volk_32fc_s32f_atan2_32f.h | 336 +++++++++++++++++++++--- lib/kernel_tests.h | 45 +++- lib/qa_utils.cc | 121 +++++++-- lib/qa_utils.h | 47 +++- 6 files changed, 532 insertions(+), 61 deletions(-) diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h index 1785dbdae..3f8c09b89 100644 --- a/include/volk/volk_common.h +++ b/include/volk/volk_common.h @@ -203,6 +203,16 @@ static inline float volk_arctan(const float x) */ const float pi_2 = 0x1.921fb6p0f; + // Propagate NaN + if (isnan(x)) { + return x; + } + + // arctan(±∞) = ±π/2 + if (isinf(x)) { + return copysignf(pi_2, x); + } + if (fabs(x) < 1.f) { return volk_arctan_poly(x); } else { @@ -226,11 +236,39 @@ static inline float volk_atan2(const float y, const float x) const float pi = 0x1.921fb6p1f; const float pi_2 = 0x1.921fb6p0f; + // Propagate NaN from inputs + if (isnan(x) || isnan(y)) { + return x + y; + } + + // Handle infinity cases per IEEE 754 + if (isinf(y)) { + if (isinf(x)) { + // Both infinite: atan2(±∞, ±∞) = ±π/4 or ±3π/4 + const float angle = (x > 0.f) ? (pi_2 / 2.f) : (3.f * pi_2 / 2.f); + return copysignf(angle, y); + } else { + // y infinite, x finite: atan2(±∞, x) = ±π/2 + return copysignf(pi_2, y); + } + } + if (isinf(x)) { + // x infinite, y finite: atan2(y, +∞) = ±0, atan2(y, -∞) = ±π + return (x > 0.f) ? copysignf(0.f, y) : copysignf(pi, y); + } + if (fabs(x) == 0.f) { return (fabs(y) == 0.f) ? copysignf(0.f, y) : copysignf(pi_2, y); } const int swap = fabs(x) < fabs(y); - const float input = swap ? (x / y) : (y / x); + const float numerator = swap ? x : y; + const float denominator = swap ? 
y : x; + float input = numerator / denominator; + + if (isnan(input)) { + input = numerator; + } + float result = volk_arctan_poly(input); result = swap ? (input >= 0.f ? pi_2 : -pi_2) - result : result; if (x < 0.f) { diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index 03afea55a..82a5e1ce8 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -99,6 +99,8 @@ volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_point _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), _mm512_mask_blend_ps(swap_mask, one, x)); + __mmask16 nan_mask = _mm512_cmp_ps_mask(x_star, x_star, _CMP_UNORD_Q); + x_star = _mm512_mask_blend_ps(nan_mask, x_star, x); __m512 result = _mm512_arctan_poly_avx512(x_star); __m512 term = _mm512_and_ps(x_star, sign_mask); term = _mm512_or_ps(pi_over_2, term); @@ -240,6 +242,8 @@ volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_point _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS); __m512 x_star = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, x, one), _mm512_mask_blend_ps(swap_mask, one, x)); + __mmask16 nan_mask = _mm512_cmp_ps_mask(x_star, x_star, _CMP_UNORD_Q); + x_star = _mm512_mask_blend_ps(nan_mask, x_star, x); __m512 result = _mm512_arctan_poly_avx512(x_star); __m512 term = _mm512_and_ps(x_star, sign_mask); term = _mm512_or_ps(pi_over_2, term); diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 5e8be5ce1..90d1db56c 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -117,7 +117,6 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector, const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f); const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); const __m512 sign_mask = 
_mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - const __m512 zero = _mm512_setzero_ps(); unsigned int number = 0; const unsigned int sixteenth_points = num_points / 16; @@ -130,12 +129,53 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector, __m512 x = _mm512_real(z1, z2); __m512 y = _mm512_imag(z1, z2); + // Detect NaN in original inputs before division + __mmask16 input_nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) | + _mm512_cmp_ps_mask(y, y, _CMP_UNORD_Q); + + // Handle infinity cases per IEEE 754 + const __m512 zero = _mm512_setzero_ps(); + const __m512 pi_4 = _mm512_set1_ps(0x1.921fb6p-1f); // π/4 + const __m512 three_pi_4 = _mm512_set1_ps(0x1.2d97c8p1f); // 3π/4 + + __mmask16 y_inf_mask = _mm512_fpclass_ps_mask(y, 0x18); // ±inf + __mmask16 x_inf_mask = _mm512_fpclass_ps_mask(x, 0x18); // ±inf + __mmask16 x_pos_mask = _mm512_cmp_ps_mask(x, zero, _CMP_GT_OS); + + // Build infinity result + __m512 inf_result = zero; + // Both infinite: ±π/4 or ±3π/4 + __mmask16 both_inf = y_inf_mask & x_inf_mask; + __m512 both_inf_result = _mm512_mask_blend_ps(x_pos_mask, three_pi_4, pi_4); + both_inf_result = _mm512_or_ps(both_inf_result, _mm512_and_ps(y, sign_mask)); + inf_result = _mm512_mask_blend_ps(both_inf, inf_result, both_inf_result); + + // y infinite, x finite: ±π/2 + __mmask16 y_inf_only = y_inf_mask & ~x_inf_mask; + __m512 y_inf_result = _mm512_or_ps(pi_2, _mm512_and_ps(y, sign_mask)); + inf_result = _mm512_mask_blend_ps(y_inf_only, inf_result, y_inf_result); + + // x infinite, y finite: 0 or ±π + __mmask16 x_inf_only = x_inf_mask & ~y_inf_mask; + __m512 x_inf_result = + _mm512_mask_blend_ps(x_pos_mask, + _mm512_or_ps(pi, _mm512_and_ps(y, sign_mask)), + _mm512_or_ps(zero, _mm512_and_ps(y, sign_mask))); + inf_result = _mm512_mask_blend_ps(x_inf_only, inf_result, x_inf_result); + + __mmask16 any_inf_mask = y_inf_mask | x_inf_mask; + __mmask16 swap_mask = _mm512_cmp_ps_mask( _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), 
_CMP_GT_OS); - __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x), - _mm512_mask_blend_ps(swap_mask, x, y)); - __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q); - input = _mm512_mask_blend_ps(nan_mask, input, zero); + __m512 numerator = _mm512_mask_blend_ps(swap_mask, y, x); + __m512 denominator = _mm512_mask_blend_ps(swap_mask, x, y); + __m512 input = _mm512_div_ps(numerator, denominator); + + // Only handle NaN from division (0/0, inf/inf), not from NaN inputs + // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) + __mmask16 div_nan_mask = + _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q) & ~input_nan_mask; + input = _mm512_mask_blend_ps(div_nan_mask, input, numerator); __m512 result = _mm512_arctan_poly_avx512(input); input = @@ -148,6 +188,10 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector, result = _mm512_add_ps( _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask), result); + + // Select infinity result or normal result + result = _mm512_mask_blend_ps(any_inf_mask, result, inf_result); + result = _mm512_mul_ps(result, vinvNormalizeFactor); _mm512_store_ps(out, result); @@ -177,7 +221,6 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f); const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; const unsigned int eighth_points = num_points / 8; @@ -190,12 +233,56 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, __m256 x = _mm256_real(z1, z2); __m256 y = _mm256_imag(z1, z2); + // Detect NaN in original inputs before division + __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q), + _mm256_cmp_ps(y, y, _CMP_UNORD_Q)); + + // Handle infinity cases per IEEE 754 + const __m256 
zero = _mm256_setzero_ps(); + const __m256 inf = _mm256_set1_ps(__builtin_inff()); + const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4 + const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4 + + __m256 y_abs = _mm256_and_ps(y, abs_mask); + __m256 x_abs = _mm256_and_ps(x, abs_mask); + __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf + __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf + __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS); + + // Build infinity result + __m256 inf_result = zero; + // Both infinite: ±π/4 or ±3π/4 + __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask); + __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask); + both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask)); + inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf); + + // y infinite, x finite: ±π/2 + __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask); + __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask)); + inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only); + + // x infinite, y finite: 0 or ±π + __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask); + __m256 x_inf_result = + _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)), + _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)), + x_pos_mask); + inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only); + + __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask); + __m256 swap_mask = _mm256_cmp_ps( _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS); - __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask), - _mm256_blendv_ps(x, y, swap_mask)); - __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); - input = _mm256_blendv_ps(input, zero, nan_mask); + __m256 numerator = _mm256_blendv_ps(y, x, swap_mask); + __m256 denominator = _mm256_blendv_ps(x, y, swap_mask); + __m256 input = 
_mm256_div_ps(numerator, denominator); + + // Only handle NaN from division (0/0, inf/inf), not from NaN inputs + // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) + __m256 div_nan_mask = + _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q)); + input = _mm256_blendv_ps(input, numerator, div_nan_mask); __m256 result = _mm256_arctan_poly_avx2_fma(input); input = @@ -208,6 +295,10 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, result = _mm256_add_ps( _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask), result); + + // Select infinity result or normal result + result = _mm256_blendv_ps(result, inf_result, any_inf_mask); + result = _mm256_mul_ps(result, vinvNormalizeFactor); _mm256_store_ps(out, result); @@ -237,7 +328,6 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f); const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; const unsigned int eighth_points = num_points / 8; @@ -250,12 +340,56 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, __m256 x = _mm256_real(z1, z2); __m256 y = _mm256_imag(z1, z2); + // Detect NaN in original inputs before division + __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q), + _mm256_cmp_ps(y, y, _CMP_UNORD_Q)); + + // Handle infinity cases per IEEE 754 + const __m256 zero = _mm256_setzero_ps(); + const __m256 inf = _mm256_set1_ps(__builtin_inff()); + const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4 + const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4 + + __m256 y_abs = _mm256_and_ps(y, abs_mask); + __m256 x_abs = _mm256_and_ps(x, abs_mask); + __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf + __m256 x_inf_mask = 
_mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf + __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS); + + // Build infinity result + __m256 inf_result = zero; + // Both infinite: ±π/4 or ±3π/4 + __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask); + __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask); + both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask)); + inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf); + + // y infinite, x finite: ±π/2 + __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask); + __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask)); + inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only); + + // x infinite, y finite: 0 or ±π + __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask); + __m256 x_inf_result = + _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)), + _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)), + x_pos_mask); + inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only); + + __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask); + __m256 swap_mask = _mm256_cmp_ps( _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS); - __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask), - _mm256_blendv_ps(x, y, swap_mask)); - __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); - input = _mm256_blendv_ps(input, zero, nan_mask); + __m256 numerator = _mm256_blendv_ps(y, x, swap_mask); + __m256 denominator = _mm256_blendv_ps(x, y, swap_mask); + __m256 input = _mm256_div_ps(numerator, denominator); + + // Only handle NaN from division (0/0, inf/inf), not from NaN inputs + // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) + __m256 div_nan_mask = + _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q)); + input = _mm256_blendv_ps(input, numerator, div_nan_mask); __m256 result = _mm256_arctan_poly_avx(input); input = @@ -268,6 +402,10 @@ 
static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, result = _mm256_add_ps( _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask), result); + + // Select infinity result or normal result + result = _mm256_blendv_ps(result, inf_result, any_inf_mask); + result = _mm256_mul_ps(result, vinvNormalizeFactor); _mm256_store_ps(out, result); @@ -301,7 +439,6 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector, const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f); const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)); const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)); - const __m512 zero = _mm512_setzero_ps(); const unsigned int sixteenth_points = num_points / 16; @@ -314,12 +451,53 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector, __m512 x = _mm512_real(z1, z2); __m512 y = _mm512_imag(z1, z2); + // Detect NaN in original inputs before division + __mmask16 input_nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) | + _mm512_cmp_ps_mask(y, y, _CMP_UNORD_Q); + + // Handle infinity cases per IEEE 754 + const __m512 zero = _mm512_setzero_ps(); + const __m512 pi_4 = _mm512_set1_ps(0x1.921fb6p-1f); // π/4 + const __m512 three_pi_4 = _mm512_set1_ps(0x1.2d97c8p1f); // 3π/4 + + __mmask16 y_inf_mask = _mm512_fpclass_ps_mask(y, 0x18); // ±inf + __mmask16 x_inf_mask = _mm512_fpclass_ps_mask(x, 0x18); // ±inf + __mmask16 x_pos_mask = _mm512_cmp_ps_mask(x, zero, _CMP_GT_OS); + + // Build infinity result + __m512 inf_result = zero; + // Both infinite: ±π/4 or ±3π/4 + __mmask16 both_inf = y_inf_mask & x_inf_mask; + __m512 both_inf_result = _mm512_mask_blend_ps(x_pos_mask, three_pi_4, pi_4); + both_inf_result = _mm512_or_ps(both_inf_result, _mm512_and_ps(y, sign_mask)); + inf_result = _mm512_mask_blend_ps(both_inf, inf_result, both_inf_result); + + // y infinite, x finite: ±π/2 + __mmask16 y_inf_only = y_inf_mask & ~x_inf_mask; + __m512 y_inf_result = 
_mm512_or_ps(pi_2, _mm512_and_ps(y, sign_mask)); + inf_result = _mm512_mask_blend_ps(y_inf_only, inf_result, y_inf_result); + + // x infinite, y finite: 0 or ±π + __mmask16 x_inf_only = x_inf_mask & ~y_inf_mask; + __m512 x_inf_result = + _mm512_mask_blend_ps(x_pos_mask, + _mm512_or_ps(pi, _mm512_and_ps(y, sign_mask)), + _mm512_or_ps(zero, _mm512_and_ps(y, sign_mask))); + inf_result = _mm512_mask_blend_ps(x_inf_only, inf_result, x_inf_result); + + __mmask16 any_inf_mask = y_inf_mask | x_inf_mask; + __mmask16 swap_mask = _mm512_cmp_ps_mask( _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS); - __m512 input = _mm512_div_ps(_mm512_mask_blend_ps(swap_mask, y, x), - _mm512_mask_blend_ps(swap_mask, x, y)); - __mmask16 nan_mask = _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q); - input = _mm512_mask_blend_ps(nan_mask, input, zero); + __m512 numerator = _mm512_mask_blend_ps(swap_mask, y, x); + __m512 denominator = _mm512_mask_blend_ps(swap_mask, x, y); + __m512 input = _mm512_div_ps(numerator, denominator); + + // Only handle NaN from division (0/0, inf/inf), not from NaN inputs + // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) + __mmask16 div_nan_mask = + _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q) & ~input_nan_mask; + input = _mm512_mask_blend_ps(div_nan_mask, input, numerator); __m512 result = _mm512_arctan_poly_avx512(input); input = @@ -332,6 +510,10 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector, result = _mm512_add_ps( _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask), result); + + // Select infinity result or normal result + result = _mm512_mask_blend_ps(any_inf_mask, result, inf_result); + result = _mm512_mul_ps(result, vinvNormalizeFactor); _mm512_storeu_ps(out, result); @@ -361,7 +543,6 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f); const __m256 abs_mask = 
_mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; const unsigned int eighth_points = num_points / 8; @@ -374,12 +555,56 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, __m256 x = _mm256_real(z1, z2); __m256 y = _mm256_imag(z1, z2); + // Detect NaN in original inputs before division + __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q), + _mm256_cmp_ps(y, y, _CMP_UNORD_Q)); + + // Handle infinity cases per IEEE 754 + const __m256 zero = _mm256_setzero_ps(); + const __m256 inf = _mm256_set1_ps(__builtin_inff()); + const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4 + const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4 + + __m256 y_abs = _mm256_and_ps(y, abs_mask); + __m256 x_abs = _mm256_and_ps(x, abs_mask); + __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf + __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf + __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS); + + // Build infinity result + __m256 inf_result = zero; + // Both infinite: ±π/4 or ±3π/4 + __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask); + __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask); + both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask)); + inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf); + + // y infinite, x finite: ±π/2 + __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask); + __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask)); + inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only); + + // x infinite, y finite: 0 or ±π + __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask); + __m256 x_inf_result = + _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)), + _mm256_or_ps(zero, _mm256_and_ps(y, 
sign_mask)), + x_pos_mask); + inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only); + + __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask); + __m256 swap_mask = _mm256_cmp_ps( _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS); - __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask), - _mm256_blendv_ps(x, y, swap_mask)); - __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q); - input = _mm256_blendv_ps(input, zero, nan_mask); + __m256 numerator = _mm256_blendv_ps(y, x, swap_mask); + __m256 denominator = _mm256_blendv_ps(x, y, swap_mask); + __m256 input = _mm256_div_ps(numerator, denominator); + + // Only handle NaN from division (0/0, inf/inf), not from NaN inputs + // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) + __m256 div_nan_mask = + _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q)); + input = _mm256_blendv_ps(input, numerator, div_nan_mask); __m256 result = _mm256_arctan_poly_avx2_fma(input); input = @@ -392,6 +617,10 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, result = _mm256_add_ps( _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask), result); + + // Select infinity result or normal result + result = _mm256_blendv_ps(result, inf_result, any_inf_mask); + result = _mm256_mul_ps(result, vinvNormalizeFactor); _mm256_storeu_ps(out, result); @@ -421,7 +650,6 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f); const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); - const __m256 zero = _mm256_setzero_ps(); unsigned int number = 0; const unsigned int eighth_points = num_points / 8; @@ -434,12 +662,56 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, __m256 x = _mm256_real(z1, z2); __m256 y = _mm256_imag(z1, z2); + 
// Detect NaN in original inputs before division + __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q), + _mm256_cmp_ps(y, y, _CMP_UNORD_Q)); + + // Handle infinity cases per IEEE 754 + const __m256 zero = _mm256_setzero_ps(); + const __m256 inf = _mm256_set1_ps(__builtin_inff()); + const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4 + const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4 + + __m256 y_abs = _mm256_and_ps(y, abs_mask); + __m256 x_abs = _mm256_and_ps(x, abs_mask); + __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf + __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf + __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS); + + // Build infinity result + __m256 inf_result = zero; + // Both infinite: ±π/4 or ±3π/4 + __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask); + __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask); + both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask)); + inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf); + + // y infinite, x finite: ±π/2 + __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask); + __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask)); + inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only); + + // x infinite, y finite: 0 or ±π + __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask); + __m256 x_inf_result = + _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)), + _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)), + x_pos_mask); + inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only); + + __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask); + __m256 swap_mask = _mm256_cmp_ps( _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS); - __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask), - _mm256_blendv_ps(x, y, swap_mask)); - __m256 nan_mask = _mm256_cmp_ps(input, input, 
_CMP_UNORD_Q); - input = _mm256_blendv_ps(input, zero, nan_mask); + __m256 numerator = _mm256_blendv_ps(y, x, swap_mask); + __m256 denominator = _mm256_blendv_ps(x, y, swap_mask); + __m256 input = _mm256_div_ps(numerator, denominator); + + // Only handle NaN from division (0/0, inf/inf), not from NaN inputs + // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) + __m256 div_nan_mask = + _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q)); + input = _mm256_blendv_ps(input, numerator, div_nan_mask); __m256 result = _mm256_arctan_poly_avx(input); input = @@ -452,6 +724,10 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, result = _mm256_add_ps( _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask), result); + + // Select infinity result or normal result + result = _mm256_blendv_ps(result, inf_result, any_inf_mask); + result = _mm256_mul_ps(result, vinvNormalizeFactor); _mm256_storeu_ps(out, result); diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index 16c79c365..5c5d5c4bf 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -11,9 +11,11 @@ #include "qa_utils.h" #include +#include +#include #include -// macros for initializing volk_test_case_t. Maccros are needed to generate +// macros for initializing volk_test_case_t. 
Macros are needed to generate // function names of the pattern kernel_name_* // for puppets we need to get all the func_variants for the puppet and just @@ -93,12 +95,49 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params)) + + volk_test_params_t test_params_atan(test_params); + test_params_atan.add_float_edge_cases({ std::nanf(""), + std::numeric_limits::infinity(), + -std::numeric_limits::infinity(), + 0.0f, + -0.0f, + 1e10f, + -1e10f, + 1.0f, + -1.0f }); + QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_atan)) + QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc)) QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power)) QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_snf)) - QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params)) + + volk_test_params_t test_params_atan2(test_params); + const float inf = std::numeric_limits::infinity(); + const float nan = std::nanf(""); + test_params_atan2.add_complex_edge_cases( + { lv_cmake(0.0f, 0.0f), // atan2(0, 0) = 0 + lv_cmake(0.0f, -0.0f), // atan2(-0, 0) = -0 (preserve sign) + lv_cmake(0.0f, 1.0f), // atan2(1, 0) = π/2 + lv_cmake(0.0f, -1.0f), // atan2(-1, 0) = -π/2 + lv_cmake(1.0f, 0.0f), // atan2(0, 1) = 0 + lv_cmake(-1.0f, 0.0f), // atan2(0, -1) = π + lv_cmake(1.0f, 1.0f), // atan2(1, 1) = π/4 + lv_cmake(-1.0f, 1.0f), // atan2(1, -1) = 3π/4 + lv_cmake(-1.0f, -1.0f), // atan2(-1, -1) = -3π/4 + lv_cmake(1.0f, -1.0f), // atan2(-1, 1) = -π/4 + lv_cmake(inf, inf), // atan2(inf, inf) = π/4 + lv_cmake(inf, -inf), // atan2(-inf, inf) = -π/4 + lv_cmake(-inf, inf), // atan2(inf, -inf) = 3π/4 + lv_cmake(-inf, -inf), // atan2(-inf, -inf) = -3π/4 + lv_cmake(inf, 0.0f), // atan2(0, inf) = 0 + lv_cmake(-inf, 
0.0f), // atan2(0, -inf) = π + lv_cmake(1.0f, inf), // atan2(inf, 1) = π/2 + lv_cmake(1.0f, -inf), // atan2(-inf, 1) = -π/2 + lv_cmake(nan, 1.0f), // atan2(1, nan) = nan (propagate) + lv_cmake(1.0f, nan) }); // atan2(nan, 1) = nan (propagate) + QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params_atan2)) QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params.make_absolute(2e-2))) QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params)) diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc index 603993258..d15aca2df 100644 --- a/lib/qa_utils.cc +++ b/lib/qa_utils.cc @@ -38,19 +38,68 @@ void random_floats(void* buf, unsigned int n, std::default_random_engine& rnd_en } } -void load_random_data(void* data, volk_type_t type, unsigned int n) +void load_random_data(void* data, + volk_type_t type, + unsigned int n, + const std::vector& float_edge_cases, + const std::vector& complex_edge_cases) { std::random_device rnd_device; std::default_random_engine rnd_engine(rnd_device()); + + unsigned int edge_case_count = 0; + + // Inject complex edge cases for complex float types + if (type.is_float && type.is_complex && !complex_edge_cases.empty()) { + edge_case_count = std::min((unsigned int)complex_edge_cases.size(), n); + if (type.size == 8) { + lv_64fc_t* array = static_cast(data); + for (unsigned int i = 0; i < edge_case_count; i++) { + array[i] = lv_cmake((double)lv_creal(complex_edge_cases[i]), + (double)lv_cimag(complex_edge_cases[i])); + } + } else { + lv_32fc_t* array = static_cast(data); + for (unsigned int i = 0; i < edge_case_count; i++) { + array[i] = complex_edge_cases[i]; + } + } + } + // Inject float edge cases for non-complex float types + else if (type.is_float && !type.is_complex && !float_edge_cases.empty()) { + edge_case_count = std::min((unsigned int)float_edge_cases.size(), n); + if (type.size == 8) { + double* array = static_cast(data); + for (unsigned int i = 0; i < edge_case_count; i++) { + array[i] = static_cast(float_edge_cases[i]); + } 
+ } else { + float* array = static_cast(data); + for (unsigned int i = 0; i < edge_case_count; i++) { + array[i] = float_edge_cases[i]; + } + } + } + + unsigned int remaining_n = n - edge_case_count; if (type.is_complex) - n *= 2; + remaining_n *= 2; + if (type.is_float) { if (type.size == 8) { - random_floats(data, n, rnd_engine); + double* array = static_cast(data); + random_floats(array + edge_case_count * (type.is_complex ? 2 : 1), + remaining_n, + rnd_engine); } else { - random_floats(data, n, rnd_engine); + float* array = static_cast(data); + random_floats(array + edge_case_count * (type.is_complex ? 2 : 1), + remaining_n, + rnd_engine); } } else { + if (type.is_complex) + n *= 2; switch (type.size) { case 8: if (type.is_signed) { @@ -386,6 +435,27 @@ bool fcompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode) bool fail = false; int print_max_errs = 10; for (unsigned int i = 0; i < vlen; i++) { + // Check for special values (NaN, inf) + bool in1_special = std::isnan(((t*)(in1))[i]) || std::isinf(((t*)(in1))[i]); + bool in2_special = std::isnan(((t*)(in2))[i]) || std::isinf(((t*)(in2))[i]); + + if (in1_special || in2_special) { + // For NaN: both must be NaN (NaN != NaN, so use isnan) + // For inf: both must be same signed infinity + bool values_match = + (std::isnan(((t*)(in1))[i]) && std::isnan(((t*)(in2))[i])) || + (((t*)(in1))[i] == ((t*)(in2))[i]); + if (!values_match) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i]) + << " in2: " << t(((t*)(in2))[i]); + std::cout << " tolerance was: " << tol << std::endl; + } + } + continue; // Skip normal comparison for special values + } + if (absolute_mode) { if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) > tol) { fail = true; @@ -429,16 +499,30 @@ bool ccompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode) bool fail = false; int print_max_errs = 10; for (unsigned int i = 0; i < 2 * vlen; i += 2) { - if (std::isnan(in1[i]) 
|| std::isnan(in1[i + 1]) || std::isnan(in2[i]) || - std::isnan(in2[i + 1]) || std::isinf(in1[i]) || std::isinf(in1[i + 1]) || - std::isinf(in2[i]) || std::isinf(in2[i + 1])) { - fail = true; - if (print_max_errs-- > 0) { - std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " - << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] - << "j"; - std::cout << " tolerance was: " << tol << std::endl; + // Check for special values (NaN, inf) and verify they match + bool in1_has_special = std::isnan(in1[i]) || std::isnan(in1[i + 1]) || + std::isinf(in1[i]) || std::isinf(in1[i + 1]); + bool in2_has_special = std::isnan(in2[i]) || std::isnan(in2[i + 1]) || + std::isinf(in2[i]) || std::isinf(in2[i + 1]); + + if (in1_has_special || in2_has_special) { + // For NaN: both must be NaN (NaN != NaN, so use isnan) + // For inf: both must be same signed infinity + bool real_match = + (std::isnan(in1[i]) && std::isnan(in2[i])) || (in1[i] == in2[i]); + bool imag_match = (std::isnan(in1[i + 1]) && std::isnan(in2[i + 1])) || + (in1[i + 1] == in2[i + 1]); + + if (!real_match || !imag_match) { + fail = true; + if (print_max_errs-- > 0) { + std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " + << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] + << "j"; + std::cout << " tolerance was: " << tol << std::endl; + } } + continue; // Skip normal comparison for special values } t diff[2] = { in1[i] - in2[i], in1[i + 1] - in2[i + 1] }; t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); @@ -543,7 +627,9 @@ bool run_volk_tests(volk_func_desc_t desc, results, puppet_master_name, test_params.absolute_mode(), - test_params.benchmark_mode()); + test_params.benchmark_mode(), + test_params.float_edge_cases(), + test_params.complex_edge_cases()); } bool run_volk_tests(volk_func_desc_t desc, @@ -556,7 +642,9 @@ bool run_volk_tests(volk_func_desc_t desc, std::vector* results, std::string puppet_master_name, bool absolute_mode, - bool benchmark_mode) + bool 
benchmark_mode, + const std::vector& float_edge_cases, + const std::vector& complex_edge_cases) { // Initialize this entry in results vector results->push_back(volk_test_results_t()); @@ -615,7 +703,8 @@ bool run_volk_tests(volk_func_desc_t desc, mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1))); } for (size_t i = 0; i < inbuffs.size(); i++) { - load_random_data(inbuffs[i], inputsig[i], vlen); + load_random_data( + inbuffs[i], inputsig[i], vlen, float_edge_cases, complex_edge_cases); } // ok let's make a vector of vector of void buffers, which holds the input/output diff --git a/lib/qa_utils.h b/lib/qa_utils.h index a65677203..c2a00206b 100644 --- a/lib/qa_utils.h +++ b/lib/qa_utils.h @@ -62,6 +62,8 @@ class volk_test_params_t bool _benchmark_mode; bool _absolute_mode; std::string _kernel_regex; + std::vector _float_edge_cases; + std::vector _complex_edge_cases; public: // ctor @@ -85,6 +87,14 @@ class volk_test_params_t void set_iter(unsigned int iter) { _iter = iter; }; void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; }; void set_regex(std::string regex) { _kernel_regex = regex; }; + void add_float_edge_cases(const std::vector& edge_cases) + { + _float_edge_cases = edge_cases; + }; + void add_complex_edge_cases(const std::vector& edge_cases) + { + _complex_edge_cases = edge_cases; + }; // getters float tol() { return _tol; }; lv_32fc_t scalar() { return _scalar; }; @@ -93,6 +103,11 @@ class volk_test_params_t bool benchmark_mode() { return _benchmark_mode; }; bool absolute_mode() { return _absolute_mode; }; std::string kernel_regex() { return _kernel_regex; }; + const std::vector& float_edge_cases() const { return _float_edge_cases; }; + const std::vector& complex_edge_cases() const + { + return _complex_edge_cases; + }; volk_test_params_t make_absolute(float tol) { volk_test_params_t t(*this); @@ -154,6 +169,13 @@ volk_type_t volk_type_from_string(std::string); float uniform(void); void random_floats(float* buf, unsigned n); +void 
load_random_data( + void* data, + volk_type_t type, + unsigned int n, + const std::vector& float_edge_cases = std::vector(), + const std::vector& complex_edge_cases = std::vector()); + bool run_volk_tests(volk_func_desc_t, void (*)(), std::string, @@ -161,17 +183,20 @@ bool run_volk_tests(volk_func_desc_t, std::vector* results = NULL, std::string puppet_master_name = "NULL"); -bool run_volk_tests(volk_func_desc_t, - void (*)(), - std::string, - float, - lv_32fc_t, - unsigned int, - unsigned int, - std::vector* results = NULL, - std::string puppet_master_name = "NULL", - bool absolute_mode = false, - bool benchmark_mode = false); +bool run_volk_tests( + volk_func_desc_t, + void (*)(), + std::string, + float, + lv_32fc_t, + unsigned int, + unsigned int, + std::vector* results = NULL, + std::string puppet_master_name = "NULL", + bool absolute_mode = false, + bool benchmark_mode = false, + const std::vector& float_edge_cases = std::vector(), + const std::vector& complex_edge_cases = std::vector()); #define VOLK_PROFILE(func, test_params, results) \ run_volk_tests(func##_get_func_desc(), \ From 5a4bf3190b611770ac2d911b29c4b627a786dc69 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 18:35:41 +0100 Subject: [PATCH 56/67] removed dupes Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_atan_32f.h | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/kernels/volk/volk_32f_atan_32f.h b/kernels/volk/volk_32f_atan_32f.h index 839f4840b..6b602c742 100644 --- a/kernels/volk/volk_32f_atan_32f.h +++ b/kernels/volk/volk_32f_atan_32f.h @@ -359,28 +359,6 @@ volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points) } #endif /* LV_HAVE_SSE4_1 for unaligned */ -#ifdef LV_HAVE_GENERIC -static inline void -volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; - for (; number < num_points; number++) { - *out++ = volk_arctan(*in++); - } -} -#endif /* 
LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_GENERIC -static inline void -volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points) -{ - unsigned int number = 0; - for (; number < num_points; number++) { - *out++ = atanf(*in++); - } -} -#endif /* LV_HAVE_GENERIC */ - #ifdef LV_HAVE_RVV #include From 98f98ce22dceb1be61e2e3800c3c2a2a42505c69 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 19:10:35 +0100 Subject: [PATCH 57/67] Fix log2 edge case handling for NaN and negative inputs Signed-off-by: Magnus Lundmark --- include/volk/volk_common.h | 5 +++ kernels/volk/volk_32f_log2_32f.h | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h index eefb0833b..6b7a89cb3 100644 --- a/include/volk/volk_common.h +++ b/include/volk/volk_common.h @@ -152,9 +152,14 @@ union bit256 { //////////////////////////////////////////////////////////////////////// #include // +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels +// NaN -> NaN (preserved for consistency) static inline float log2f_non_ieee(float f) { float const result = log2f(f); + // Return NaN for NaN inputs or negative values (preserves IEEE behavior for invalid inputs) + if (isnan(result)) + return result; + // Map ±Inf to ±127.0f to match SIMD kernel behavior return isinf(result) ? 
copysignf(127.0f, result) : result; } diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index 23382749f..2afffa143 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -132,6 +132,12 @@ static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector, for (; number < eighthPoints; number++) { aVal = _mm256_load_ps(aPtr); + + // Check for NaN or negative/zero (invalid inputs for log2) + __m256 invalid_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 + invalid_mask = _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN + __m256 nan_value = _mm256_set1_ps(NAN); + bias = _mm256_set1_epi32(127); leadingOne = _mm256_set1_ps(1.0f); exp = _mm256_sub_epi32( @@ -177,6 +183,10 @@ static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector, #endif bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); + + // Replace invalid results with NaN + bVal = _mm256_blendv_ps(bVal, nan_value, invalid_mask); + _mm256_store_ps(bPtr, bVal); aPtr += 8; @@ -219,6 +229,12 @@ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_ for (; number < eighthPoints; number++) { aVal = _mm256_load_ps(aPtr); + + // Check for NaN or negative/zero (invalid inputs for log2) + __m256 invalid_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 + invalid_mask = _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN + __m256 nan_value = _mm256_set1_ps(NAN); + bias = _mm256_set1_epi32(127); leadingOne = _mm256_set1_ps(1.0f); exp = _mm256_sub_epi32( @@ -265,6 +281,10 @@ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_ bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); + + // Replace invalid results with NaN + bVal = _mm256_blendv_ps(bVal, nan_value, invalid_mask); + _mm256_store_ps(bPtr, bVal); aPtr += 8; @@ -305,6 +325,12 @@ 
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu for (; number < quarterPoints; number++) { aVal = _mm_load_ps(aPtr); + + // Check for NaN or negative/zero (invalid inputs for log2) + __m128 invalid_mask = _mm_cmple_ps(aVal, _mm_setzero_ps()); // aVal <= 0 + invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpunord_ps(aVal, aVal)); // Or NaN + __m128 nan_value = _mm_set1_ps(NAN); + bias = _mm_set1_epi32(127); leadingOne = _mm_set1_ps(1.0f); exp = _mm_sub_epi32( @@ -348,6 +374,10 @@ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu #endif bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + + // Replace invalid results with NaN + bVal = _mm_blendv_ps(bVal, nan_value, invalid_mask); + _mm_store_ps(bPtr, bVal); aPtr += 4; @@ -488,6 +518,12 @@ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu for (; number < quarterPoints; number++) { aVal = _mm_loadu_ps(aPtr); + + // Check for NaN or negative/zero (invalid inputs for log2) + __m128 invalid_mask = _mm_cmple_ps(aVal, _mm_setzero_ps()); // aVal <= 0 + invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpunord_ps(aVal, aVal)); // Or NaN + __m128 nan_value = _mm_set1_ps(NAN); + bias = _mm_set1_epi32(127); leadingOne = _mm_set1_ps(1.0f); exp = _mm_sub_epi32( @@ -531,6 +567,10 @@ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu #endif bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne))); + + // Replace invalid results with NaN + bVal = _mm_blendv_ps(bVal, nan_value, invalid_mask); + _mm_storeu_ps(bPtr, bVal); aPtr += 4; @@ -574,6 +614,12 @@ static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector, for (; number < eighthPoints; number++) { aVal = _mm256_loadu_ps(aPtr); + + // Check for NaN or negative/zero (invalid inputs for log2) + __m256 invalid_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 + invalid_mask = 
_mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN + __m256 nan_value = _mm256_set1_ps(NAN); + bias = _mm256_set1_epi32(127); leadingOne = _mm256_set1_ps(1.0f); exp = _mm256_sub_epi32( @@ -619,6 +665,10 @@ static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector, #endif bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal); + + // Replace invalid results with NaN + bVal = _mm256_blendv_ps(bVal, nan_value, invalid_mask); + _mm256_storeu_ps(bPtr, bVal); aPtr += 8; @@ -661,6 +711,12 @@ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_ for (; number < eighthPoints; number++) { aVal = _mm256_loadu_ps(aPtr); + + // Check for NaN or negative/zero (invalid inputs for log2) + __m256 invalid_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 + invalid_mask = _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN + __m256 nan_value = _mm256_set1_ps(NAN); + bias = _mm256_set1_epi32(127); leadingOne = _mm256_set1_ps(1.0f); exp = _mm256_sub_epi32( @@ -707,6 +763,10 @@ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_ bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal); + + // Replace invalid results with NaN + bVal = _mm256_blendv_ps(bVal, nan_value, invalid_mask); + _mm256_storeu_ps(bPtr, bVal); aPtr += 8; From a04c7dcff624e393c85e8cebbd38fd890584a7c8 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 19:12:11 +0100 Subject: [PATCH 58/67] Fix log2 edge case handling for NaN and negative inputs Signed-off-by: Magnus Lundmark --- include/volk/volk_common.h | 3 ++- kernels/volk/volk_32f_log2_32f.h | 32 ++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/include/volk/volk_common.h b/include/volk/volk_common.h index 6b7a89cb3..3ff6540ff 100644 --- a/include/volk/volk_common.h +++ b/include/volk/volk_common.h @@ -156,7 +156,8 @@ 
union bit256 { static inline float log2f_non_ieee(float f) { float const result = log2f(f); - // Return NaN for NaN inputs or negative values (preserves IEEE behavior for invalid inputs) + // Return NaN for NaN inputs or negative values (preserves IEEE behavior for invalid + // inputs) if (isnan(result)) return result; // Map ±Inf to ±127.0f to match SIMD kernel behavior diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index 2afffa143..9f148e902 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -134,8 +134,10 @@ static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector, aVal = _mm256_load_ps(aPtr); // Check for NaN or negative/zero (invalid inputs for log2) - __m256 invalid_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 - invalid_mask = _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN + __m256 invalid_mask = + _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 + invalid_mask = + _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN __m256 nan_value = _mm256_set1_ps(NAN); bias = _mm256_set1_epi32(127); @@ -231,8 +233,10 @@ volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_ aVal = _mm256_load_ps(aPtr); // Check for NaN or negative/zero (invalid inputs for log2) - __m256 invalid_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 - invalid_mask = _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN + __m256 invalid_mask = + _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 + invalid_mask = + _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN __m256 nan_value = _mm256_set1_ps(NAN); bias = _mm256_set1_epi32(127); @@ -327,8 +331,8 @@ volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int nu aVal = _mm_load_ps(aPtr); // Check for NaN or negative/zero (invalid 
inputs for log2) - __m128 invalid_mask = _mm_cmple_ps(aVal, _mm_setzero_ps()); // aVal <= 0 - invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpunord_ps(aVal, aVal)); // Or NaN + __m128 invalid_mask = _mm_cmple_ps(aVal, _mm_setzero_ps()); // aVal <= 0 + invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpunord_ps(aVal, aVal)); // Or NaN __m128 nan_value = _mm_set1_ps(NAN); bias = _mm_set1_epi32(127); @@ -520,8 +524,8 @@ volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int nu aVal = _mm_loadu_ps(aPtr); // Check for NaN or negative/zero (invalid inputs for log2) - __m128 invalid_mask = _mm_cmple_ps(aVal, _mm_setzero_ps()); // aVal <= 0 - invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpunord_ps(aVal, aVal)); // Or NaN + __m128 invalid_mask = _mm_cmple_ps(aVal, _mm_setzero_ps()); // aVal <= 0 + invalid_mask = _mm_or_ps(invalid_mask, _mm_cmpunord_ps(aVal, aVal)); // Or NaN __m128 nan_value = _mm_set1_ps(NAN); bias = _mm_set1_epi32(127); @@ -616,8 +620,10 @@ static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector, aVal = _mm256_loadu_ps(aPtr); // Check for NaN or negative/zero (invalid inputs for log2) - __m256 invalid_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 - invalid_mask = _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN + __m256 invalid_mask = + _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 + invalid_mask = + _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN __m256 nan_value = _mm256_set1_ps(NAN); bias = _mm256_set1_epi32(127); @@ -713,8 +719,10 @@ volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_ aVal = _mm256_loadu_ps(aPtr); // Check for NaN or negative/zero (invalid inputs for log2) - __m256 invalid_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 - invalid_mask = _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN + __m256 invalid_mask = + 
_mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LE_OQ); // aVal <= 0 + invalid_mask = + _mm256_or_ps(invalid_mask, _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q)); // Or NaN __m256 nan_value = _mm256_set1_ps(NAN); bias = _mm256_set1_epi32(127); From dbcf374757c012803eecfd9366992f45575e4a5a Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 19:47:21 +0100 Subject: [PATCH 59/67] added riscv kernels Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_log2_32f.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index 9f148e902..e2673d900 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -470,11 +470,22 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po // (-1)^sign * 2^exp * 1.significand, so the log2 is // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23) for (number = 0; number < quarterPoints; ++number) { + // Check for NaN or negative/zero (invalid inputs for log2) + float32x4_t aval_f = vld1q_f32(aPtr); + uint32x4_t invalid_mask = vcleq_f32(aval_f, vdupq_n_f32(0.0f)); // aVal <= 0 + // Check for NaN: NaN comparison with itself returns false + uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aval_f, aval_f)); // NOT(aVal == aVal) + invalid_mask = vorrq_u32(invalid_mask, nan_mask); // Combine masks + float32x4_t nan_value = vdupq_n_f32(NAN); + // load float in to an int register without conversion aval = vld1q_s32((int*)aPtr); VLOG2Q_NEON_F32(log2_approx, aval) + // Replace invalid results with NaN + log2_approx = vbslq_f32(invalid_mask, nan_value, log2_approx); + vst1q_f32(bPtr, log2_approx); aPtr += 4; @@ -826,10 +837,19 @@ volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_poi const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax); const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax); + const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); 
+ const vfloat32m2_t nan_val = __riscv_vfmv_v_f_f32m2(NAN, vlmax); + size_t n = num_points; for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { vl = __riscv_vsetvl_e32m2(n); vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); + + // Check for invalid inputs (NaN, negative, or zero) + vbool16_t invalid_mask = __riscv_vmfle(v, zero, vl); // v <= 0 + vbool16_t nan_mask = __riscv_vmfne(v, v, vl); // NaN check: v != v + invalid_mask = __riscv_vmor(invalid_mask, nan_mask, vl); + vfloat32m2_t a = __riscv_vfabs(v, vl); vfloat32m2_t exp = __riscv_vfcvt_f( __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl), @@ -851,6 +871,9 @@ volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_poi #endif exp = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl); + // Replace invalid results with NaN + exp = __riscv_vmerge(exp, nan_val, invalid_mask, vl); + __riscv_vse32(bVector, exp, vl); } } From a392c00369f94a0c73a53a6c2ff3612853568f17 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 20:10:10 +0100 Subject: [PATCH 60/67] fix RVV atan2 nan propagation Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_log2_32f.h | 10 +++---- kernels/volk/volk_32fc_s32f_atan2_32f.h | 38 ++++++++++++++++++------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/kernels/volk/volk_32f_log2_32f.h b/kernels/volk/volk_32f_log2_32f.h index e2673d900..e2a195658 100644 --- a/kernels/volk/volk_32f_log2_32f.h +++ b/kernels/volk/volk_32f_log2_32f.h @@ -472,10 +472,10 @@ volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_po for (number = 0; number < quarterPoints; ++number) { // Check for NaN or negative/zero (invalid inputs for log2) float32x4_t aval_f = vld1q_f32(aPtr); - uint32x4_t invalid_mask = vcleq_f32(aval_f, vdupq_n_f32(0.0f)); // aVal <= 0 + uint32x4_t invalid_mask = vcleq_f32(aval_f, vdupq_n_f32(0.0f)); // aVal <= 0 // Check for NaN: NaN comparison with itself returns 
false - uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aval_f, aval_f)); // NOT(aVal == aVal) - invalid_mask = vorrq_u32(invalid_mask, nan_mask); // Combine masks + uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aval_f, aval_f)); // NOT(aVal == aVal) + invalid_mask = vorrq_u32(invalid_mask, nan_mask); // Combine masks float32x4_t nan_value = vdupq_n_f32(NAN); // load float in to an int register without conversion @@ -846,8 +846,8 @@ volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_poi vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); // Check for invalid inputs (NaN, negative, or zero) - vbool16_t invalid_mask = __riscv_vmfle(v, zero, vl); // v <= 0 - vbool16_t nan_mask = __riscv_vmfne(v, v, vl); // NaN check: v != v + vbool16_t invalid_mask = __riscv_vmfle(v, zero, vl); // v <= 0 + vbool16_t nan_mask = __riscv_vmfne(v, v, vl); // NaN check: v != v invalid_mask = __riscv_vmor(invalid_mask, nan_mask, vl); vfloat32m2_t a = __riscv_vfabs(v, vl); diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 8d70f1f27..b68c508ac 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -768,12 +768,21 @@ static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, vuint64m4_t v = __riscv_vle64_v_u64m4((const uint64_t*)inputVector, vl); vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl)); vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl)); + + // Detect NaN in original inputs before division + vbool16_t input_nan_mask = + __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl); + vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl); - vfloat32m2_t x = __riscv_vfdiv( - __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl); - vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl); - x = __riscv_vreinterpret_f32m2( - 
__riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl)); + vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl); + vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl); + vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl); + + // Only handle NaN from division (0/0, inf/inf), not from NaN inputs + // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) + vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl); + vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl); + x = __riscv_vmerge(x, numerator, div_nan_mask, vl); vfloat32m2_t xx = __riscv_vfmul(x, x, vl); vfloat32m2_t p = c13; @@ -822,12 +831,21 @@ static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector, vl = __riscv_vsetvl_e32m2(n); vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((const float*)inputVector, vl); vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1); + + // Detect NaN in original inputs before division + vbool16_t input_nan_mask = + __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl); + vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl); - vfloat32m2_t x = __riscv_vfdiv( - __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl); - vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl); - x = __riscv_vreinterpret_f32m2( - __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl)); + vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl); + vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl); + vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl); + + // Only handle NaN from division (0/0, inf/inf), not from NaN inputs + // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) + vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl); + vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl); + x = __riscv_vmerge(x, numerator, div_nan_mask, vl); vfloat32m2_t xx = __riscv_vfmul(x, x, vl); vfloat32m2_t 
p = c13; From a5ba9dc96107f41e135e17131c666acbf268d458 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 20:52:39 +0100 Subject: [PATCH 61/67] use vmandnot instead of vmandn Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32fc_s32f_atan2_32f.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index b68c508ac..9d26b4b91 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -781,7 +781,8 @@ static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, // Only handle NaN from division (0/0, inf/inf), not from NaN inputs // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl); - vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl); + // div_nan_mask = x_nan_mask & ~input_nan_mask (AND-NOT operation) + vbool16_t div_nan_mask = __riscv_vmandnot(x_nan_mask, input_nan_mask, vl); x = __riscv_vmerge(x, numerator, div_nan_mask, vl); vfloat32m2_t xx = __riscv_vfmul(x, x, vl); @@ -844,7 +845,8 @@ static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector, // Only handle NaN from division (0/0, inf/inf), not from NaN inputs // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl); - vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl); + // div_nan_mask = x_nan_mask & ~input_nan_mask (AND-NOT operation) + vbool16_t div_nan_mask = __riscv_vmandnot(x_nan_mask, input_nan_mask, vl); x = __riscv_vmerge(x, numerator, div_nan_mask, vl); vfloat32m2_t xx = __riscv_vfmul(x, x, vl); From 17e083ada4d41c8b24a8311d6aea76a2bd477809 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 20:58:18 +0100 Subject: [PATCH 62/67] revert... 
Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32fc_s32f_atan2_32f.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 9d26b4b91..1d8079025 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -781,8 +781,8 @@ static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, // Only handle NaN from division (0/0, inf/inf), not from NaN inputs // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl); - // div_nan_mask = x_nan_mask & ~input_nan_mask (AND-NOT operation) - vbool16_t div_nan_mask = __riscv_vmandnot(x_nan_mask, input_nan_mask, vl); + // div_nan_mask = x_nan_mask & ~input_nan_mask (vmandn computes vs2 & ~vs1) + vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl); x = __riscv_vmerge(x, numerator, div_nan_mask, vl); vfloat32m2_t xx = __riscv_vfmul(x, x, vl); @@ -845,8 +845,8 @@ static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector, // Only handle NaN from division (0/0, inf/inf), not from NaN inputs // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0) vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl); - // div_nan_mask = x_nan_mask & ~input_nan_mask (AND-NOT operation) - vbool16_t div_nan_mask = __riscv_vmandnot(x_nan_mask, input_nan_mask, vl); + // div_nan_mask = x_nan_mask & ~input_nan_mask (vmandn computes vs2 & ~vs1) + vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl); x = __riscv_vmerge(x, numerator, div_nan_mask, vl); vfloat32m2_t xx = __riscv_vfmul(x, x, vl); From 4a7cfa7b8172b4da7e664bc47ebfc82e9715c2f2 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Sun, 9 Nov 2025 23:01:46 +0100 Subject: [PATCH 63/67] Add infinity handling to RVV atan2 implementation Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32fc_s32f_atan2_32f.h | 82 
++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index 1d8079025..ccfdefd64 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -762,6 +762,11 @@ static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax); const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); + const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); + const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(__builtin_inff(), vlmax); + const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax); // π/4 + const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax); // 3π/4 + size_t n = num_points; for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { vl = __riscv_vsetvl_e32m2(n); @@ -773,7 +778,38 @@ static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, vbool16_t input_nan_mask = __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl); - vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl); + // Handle infinity cases per IEEE 754 + vfloat32m2_t vr_abs = __riscv_vfabs(vr, vl); + vfloat32m2_t vi_abs = __riscv_vfabs(vi, vl); + vbool16_t vr_inf_mask = __riscv_vmfeq(vr_abs, inf, vl); // |vr| == inf + vbool16_t vi_inf_mask = __riscv_vmfeq(vi_abs, inf, vl); // |vi| == inf + vbool16_t vr_pos_mask = __riscv_vmfgt(vr, zero, vl); + + // Build infinity result + vfloat32m2_t inf_result = zero; + // Both infinite: ±π/4 or ±3π/4 + vbool16_t both_inf = __riscv_vmand(vi_inf_mask, vr_inf_mask, vl); + vfloat32m2_t both_inf_result = __riscv_vmerge(three_pi_4, pi_4, vr_pos_mask, vl); + both_inf_result = __riscv_vfsgnj(both_inf_result, vi, vl); // Copy sign from vi + inf_result = __riscv_vmerge(inf_result, both_inf_result, both_inf, vl); + + // vi 
infinite, vr finite: ±π/2 + vbool16_t vi_inf_only = __riscv_vmandn(vi_inf_mask, vr_inf_mask, vl); + vfloat32m2_t vi_inf_result = __riscv_vfsgnj(cpio2, vi, vl); // π/2 with sign of vi + inf_result = __riscv_vmerge(inf_result, vi_inf_result, vi_inf_only, vl); + + // vr infinite, vi finite: 0 or ±π + vbool16_t vr_inf_only = __riscv_vmandn(vr_inf_mask, vi_inf_mask, vl); + vfloat32m2_t vr_inf_result = + __riscv_vmerge(__riscv_vfsgnj(cpi, vi, vl), // π with sign of vi + __riscv_vfsgnj(zero, vi, vl), // 0 with sign of vi + vr_pos_mask, + vl); + inf_result = __riscv_vmerge(inf_result, vr_inf_result, vr_inf_only, vl); + + vbool16_t any_inf_mask = __riscv_vmor(vi_inf_mask, vr_inf_mask, vl); + + vbool16_t mswap = __riscv_vmfgt(vi_abs, vr_abs, vl); vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl); vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl); vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl); @@ -800,6 +836,9 @@ static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, p = __riscv_vfadd_mu( RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl); + // Select infinity result or normal result + p = __riscv_vmerge(p, inf_result, any_inf_mask, vl); + __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl); } } @@ -827,6 +866,11 @@ static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector, const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax); const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); + const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); + const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(__builtin_inff(), vlmax); + const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax); // π/4 + const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax); // 3π/4 + size_t n = num_points; for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) { vl = __riscv_vsetvl_e32m2(n); @@ -837,7 +881,38 @@ static inline void 
volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector, vbool16_t input_nan_mask = __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl); - vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl); + // Handle infinity cases per IEEE 754 + vfloat32m2_t vr_abs = __riscv_vfabs(vr, vl); + vfloat32m2_t vi_abs = __riscv_vfabs(vi, vl); + vbool16_t vr_inf_mask = __riscv_vmfeq(vr_abs, inf, vl); // |vr| == inf + vbool16_t vi_inf_mask = __riscv_vmfeq(vi_abs, inf, vl); // |vi| == inf + vbool16_t vr_pos_mask = __riscv_vmfgt(vr, zero, vl); + + // Build infinity result + vfloat32m2_t inf_result = zero; + // Both infinite: ±π/4 or ±3π/4 + vbool16_t both_inf = __riscv_vmand(vi_inf_mask, vr_inf_mask, vl); + vfloat32m2_t both_inf_result = __riscv_vmerge(three_pi_4, pi_4, vr_pos_mask, vl); + both_inf_result = __riscv_vfsgnj(both_inf_result, vi, vl); // Copy sign from vi + inf_result = __riscv_vmerge(inf_result, both_inf_result, both_inf, vl); + + // vi infinite, vr finite: ±π/2 + vbool16_t vi_inf_only = __riscv_vmandn(vi_inf_mask, vr_inf_mask, vl); + vfloat32m2_t vi_inf_result = __riscv_vfsgnj(cpio2, vi, vl); // π/2 with sign of vi + inf_result = __riscv_vmerge(inf_result, vi_inf_result, vi_inf_only, vl); + + // vr infinite, vi finite: 0 or ±π + vbool16_t vr_inf_only = __riscv_vmandn(vr_inf_mask, vi_inf_mask, vl); + vfloat32m2_t vr_inf_result = + __riscv_vmerge(__riscv_vfsgnj(cpi, vi, vl), // π with sign of vi + __riscv_vfsgnj(zero, vi, vl), // 0 with sign of vi + vr_pos_mask, + vl); + inf_result = __riscv_vmerge(inf_result, vr_inf_result, vr_inf_only, vl); + + vbool16_t any_inf_mask = __riscv_vmor(vi_inf_mask, vr_inf_mask, vl); + + vbool16_t mswap = __riscv_vmfgt(vi_abs, vr_abs, vl); vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl); vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl); vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl); @@ -864,6 +939,9 @@ static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* 
outputVector, p = __riscv_vfadd_mu( RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl); + // Select infinity result or normal result + p = __riscv_vmerge(p, inf_result, any_inf_mask, vl); + __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl); } } From bb1c9630db1ea6f2052434a5b7a6f3a9bd948037 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 10 Nov 2025 00:00:01 +0100 Subject: [PATCH 64/67] Fix RVV acos/asin: one_minus_v_sq intrinsic Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_acos_32f.h | 6 ++++-- kernels/volk/volk_32f_asin_32f.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernels/volk/volk_32f_acos_32f.h b/kernels/volk/volk_32f_acos_32f.h index 4331987cf..02ef5896f 100644 --- a/kernels/volk/volk_32f_acos_32f.h +++ b/kernels/volk/volk_32f_acos_32f.h @@ -540,8 +540,10 @@ volk_32f_acos_32f_rvv(float* bVector, const float* aVector, unsigned int num_poi for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { vl = __riscv_vsetvl_e32m2(n); vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); - vfloat32m2_t a = - __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfmsac(cf1, v, v, vl), vl), v, vl); + // Compute 1 - v^2 = (1+v)*(1-v) for better numerical stability + vfloat32m2_t one_minus_v_sq = + __riscv_vfmul(__riscv_vfadd(cf1, v, vl), __riscv_vfsub(cf1, v, vl), vl); + vfloat32m2_t a = __riscv_vfdiv(__riscv_vfsqrt(one_minus_v_sq, vl), v, vl); vfloat32m2_t z = __riscv_vfabs(a, vl); vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl); x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h index 1914c39ea..404e218c1 100644 --- a/kernels/volk/volk_32f_asin_32f.h +++ b/kernels/volk/volk_32f_asin_32f.h @@ -512,8 +512,10 @@ volk_32f_asin_32f_rvv(float* bVector, const float* aVector, unsigned int num_poi for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) { vl = 
__riscv_vsetvl_e32m2(n); vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); - vfloat32m2_t a = - __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfmsac(cf1, v, v, vl), vl), v, vl); + // Compute 1 - v^2 = (1+v)*(1-v) for better numerical stability + vfloat32m2_t one_minus_v_sq = + __riscv_vfmul(__riscv_vfadd(cf1, v, vl), __riscv_vfsub(cf1, v, vl), vl); + vfloat32m2_t a = __riscv_vfdiv(__riscv_vfsqrt(one_minus_v_sq, vl), v, vl); vfloat32m2_t z = __riscv_vfabs(a, vl); vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl); x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); From 3fdcbac2b10cc9609d5919198d1beb21d4f47322 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 10 Nov 2025 00:27:48 +0100 Subject: [PATCH 65/67] asin: use correct formula (inverse of acos) Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32f_asin_32f.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernels/volk/volk_32f_asin_32f.h b/kernels/volk/volk_32f_asin_32f.h index 404e218c1..dd7e25265 100644 --- a/kernels/volk/volk_32f_asin_32f.h +++ b/kernels/volk/volk_32f_asin_32f.h @@ -513,9 +513,10 @@ volk_32f_asin_32f_rvv(float* bVector, const float* aVector, unsigned int num_poi vl = __riscv_vsetvl_e32m2(n); vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl); // Compute 1 - v^2 = (1+v)*(1-v) for better numerical stability + // For asin: a = v / sqrt(1 - v^2) (inverse of acos) vfloat32m2_t one_minus_v_sq = __riscv_vfmul(__riscv_vfadd(cf1, v, vl), __riscv_vfsub(cf1, v, vl), vl); - vfloat32m2_t a = __riscv_vfdiv(__riscv_vfsqrt(one_minus_v_sq, vl), v, vl); + vfloat32m2_t a = __riscv_vfdiv(v, __riscv_vfsqrt(one_minus_v_sq, vl), vl); vfloat32m2_t z = __riscv_vfabs(a, vl); vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl); x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl); From d763b1f7889454466fe707e63a8ec0110e002e94 Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 10 Nov 2025 
00:57:33 +0100 Subject: [PATCH 66/67] use HUGE_VALF instead of __builtin_inff for portability Signed-off-by: Magnus Lundmark --- kernels/volk/volk_32fc_s32f_atan2_32f.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernels/volk/volk_32fc_s32f_atan2_32f.h b/kernels/volk/volk_32fc_s32f_atan2_32f.h index ccfdefd64..af9ac6e2b 100644 --- a/kernels/volk/volk_32fc_s32f_atan2_32f.h +++ b/kernels/volk/volk_32fc_s32f_atan2_32f.h @@ -239,7 +239,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector, // Handle infinity cases per IEEE 754 const __m256 zero = _mm256_setzero_ps(); - const __m256 inf = _mm256_set1_ps(__builtin_inff()); + const __m256 inf = _mm256_set1_ps(HUGE_VALF); const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4 const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4 @@ -346,7 +346,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector, // Handle infinity cases per IEEE 754 const __m256 zero = _mm256_setzero_ps(); - const __m256 inf = _mm256_set1_ps(__builtin_inff()); + const __m256 inf = _mm256_set1_ps(HUGE_VALF); const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4 const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4 @@ -561,7 +561,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector, // Handle infinity cases per IEEE 754 const __m256 zero = _mm256_setzero_ps(); - const __m256 inf = _mm256_set1_ps(__builtin_inff()); + const __m256 inf = _mm256_set1_ps(HUGE_VALF); const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4 const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4 @@ -668,7 +668,7 @@ static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector, // Handle infinity cases per IEEE 754 const __m256 zero = _mm256_setzero_ps(); - const __m256 inf = _mm256_set1_ps(__builtin_inff()); + const __m256 inf = _mm256_set1_ps(HUGE_VALF); const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f); // π/4 const 
__m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4 @@ -763,7 +763,7 @@ static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector, const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(__builtin_inff(), vlmax); + const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(HUGE_VALF, vlmax); const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax); // π/4 const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax); // 3π/4 @@ -867,7 +867,7 @@ static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector, const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax); const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax); - const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(__builtin_inff(), vlmax); + const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(HUGE_VALF, vlmax); const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax); // π/4 const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax); // 3π/4 From 08cf9cf7053fc5f74090d765814c8190fc6fa2fa Mon Sep 17 00:00:00 2001 From: Magnus Lundmark Date: Mon, 10 Nov 2025 03:06:06 +0100 Subject: [PATCH 67/67] Add DCO signoff for commit faf20a4 I hereby add my Signed-off-by to this commit: faf20a4 Includes for GCC 7.2 compatibility This commit was made before DCO was required for this project. I certify that I have the rights to submit this work under the project's license. Signed-off-by: Magnus Lundmark