add RISC-V Vector extension (RVV) kernels

Signed-off-by: Olaf Bernstein <[email protected]>
gnuradio · Oct 22, 2024 · cbb9101 · cbb9101
1 parent bd4694d
commit cbb9101
Show file tree

Hide file tree

Showing 147 changed files with 5,321 additions and 89 deletions.
diff --git a/.github/workflows/run-tests-rvv.yml b/.github/workflows/run-tests-rvv.yml
@@ -0,0 +1,54 @@
+#
+# Copyright 2020 - 2022 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+name: Run VOLK tests on different RVV configurations
+
+on: [push, pull_request]
+
+jobs:
+  Tests:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install packages
+      run: |
+        git submodule update --init --recursive
+        sudo apt-get update -q -y
+        sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18
+        mkdir build
+        cd build
+    - name: Test gcc-14 VLEN=128
+      run: |
+        cd build; rm -rf *
+        CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \
+        cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
+        make -j$(nproc)
+        ARGS=-j$(nproc) make test
+    - name: Test clang-18 VLEN=256
+      run: |
+        cd build; rm -rf *
+        CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=256 \
+        cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
+        make -j$(nproc)
+        ARGS=-j$(nproc) make test
+    - name: Test clang-18 VLEN=512
+      run: |
+        cd build; rm -rf *
+        CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \
+        cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
+        make -j$(nproc)
+        ARGS=-j$(nproc) make test
+    - name: Test clang-18 VLEN=1024
+      run: |
+        cd build; rm -rf *
+        CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \
+        cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake ..
+        make -j$(nproc)
+        ARGS=-j$(nproc) make test
+
+
diff --git a/cmake/Checks/check-rvv-intrinsics.c b/cmake/Checks/check-rvv-intrinsics.c
@@ -0,0 +1,6 @@
+#if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14)
+int main() { return 0; }
+#else
+#error "rvv intrinsics aren't supported"
+#endif
+
diff --git a/cmake/Toolchains/rv64gcv-linux-gnu.cmake b/cmake/Toolchains/rv64gcv-linux-gnu.cmake
@@ -0,0 +1,34 @@
+#
+# Copyright 2024 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+set(CMAKE_C_COMPILER $ENV{CC})
+set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
+set(CMAKE_CXX_COMPILER $ENV{CXX})
+
+set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "" FORCE)
+set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE)
+
+set(CMAKE_OBJCOPY
+    ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy
+    CACHE INTERNAL "objcopy tool")
+set(CMAKE_SIZE_UTIL
+    ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size
+    CACHE INTERNAL "size tool")
+
+set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})
+
+set(QEMU_VLEN $ENV{VLEN})
+if(NOT QEMU_VLEN)
+    set(QEMU_VLEN "128")
+endif()
+
+set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on")
diff --git a/gen/archs.xml b/gen/archs.xml
@@ -181,4 +181,48 @@ at the top, as a last resort.
 <arch name="riscv64">
 </arch>
 
+<!-->
+    tmpl/ currently assumes that every arch.name starting with "rv" requires
+    RVV intrinsics
+</-->
+<!-->
+    There is currently no mechanism in RISC-V to append extensions,
+    so each arch needs to specify all of them, and the order needs in the
+    machine definition needs to be from the fewest to the most extensions.
+    Fortunately, this maps quite well to the profiles concept.
+</-->
+<arch name="rvv">
+    <check name="V"></check>
+    <flag compiler="gnu">-march=rv64gcv</flag>
+    <flag compiler="clang">-march=rv64gcv</flag>
+</arch>
+
+<arch name="rvvseg">
+    <check name="V"></check>
+    <flag compiler="gnu">-march=rv64gcv</flag>
+    <flag compiler="clang">-march=rv64gcv</flag>
+    <!-->
+        It's unclear how performance portable segmented load/stores are, so the
+        default rvv implementations avoid using them.
+        This is a pseudo arch for separate segmented load/store implementations,
+        and is expected to never be used standalone without "rvv".
+    </-->
+</arch>
+
+<!-->
+    google/cpu_features currently doesn't support these extensions and profiles.
+</-->
+<!--arch name="rva22v">
+    <check name="V"></check>
+    <check name="B"></check>
+    <flag compiler="gnu">-march=rv64gcv_zba_zbb_zbs</flag>
+    <flag compiler="clang">-march=rv64gcv_zba_zbb_zbs</flag>
+</arch-->
+
+<!--arch name="rva23">
+    <check name="rva23"></check>
+    <flag compiler="gnu">-march=rva23u64</flag>
+    <flag compiler="clang">-march=rva23u64</flag>
+</arch-->
+
 </grammar>
diff --git a/gen/machines.xml b/gen/machines.xml
@@ -33,6 +33,18 @@
 <archs>generic riscv64 orc|</archs>
 </machine>
 
+<machine name="rv64gcv">
+<archs>generic riscv64 rvv rvvseg orc|</archs>
+</machine>
+
+<!--machine name="rva22v">
+<archs>generic riscv64 rvv rvvseg rva22v orc|</archs>
+</machine-->
+
+<!--machine name="rva23">
+<archs>generic riscv64 rvv rvvseg rva22v rva23 orc|</archs>
+</machine-->
+
 <machine name="sse4_a">
 <archs>generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|</archs>
 </machine>

diff --git a/include/volk/volk_rvv_intrinsics.h b/include/volk/volk_rvv_intrinsics.h
@@ -0,0 +1,73 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2024 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold RVV intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_
+#include <riscv_vector.h>
+
+#define RISCV_SHRINK2(op,T,S,v) __riscv_##op( \
+    __riscv_vget_##T##S##m1(v, 0), \
+    __riscv_vget_##T##S##m1(v, 1), \
+    __riscv_vsetvlmax_e##S##m1())
+
+#define RISCV_SHRINK4(op,T,S,v) __riscv_##op( \
+    __riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
+                 __riscv_vget_##T##S##m1(v, 1), \
+                 __riscv_vsetvlmax_e##S##m1()), \
+    __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
+                 __riscv_vget_##T##S##m1(v, 3), \
+                 __riscv_vsetvlmax_e##S##m1()), \
+    __riscv_vsetvlmax_e##S##m1())
+
+#define RISCV_SHRINK8(op,T,S,v) __riscv_##op( \
+    __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \
+                              __riscv_vget_##T##S##m1(v, 1), \
+                              __riscv_vsetvlmax_e##S##m1()), \
+                 __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \
+                              __riscv_vget_##T##S##m1(v, 3), \
+                              __riscv_vsetvlmax_e##S##m1()), \
+                  __riscv_vsetvlmax_e##S##m1()), \
+    __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \
+                              __riscv_vget_##T##S##m1(v, 5), \
+                              __riscv_vsetvlmax_e##S##m1()), \
+                 __riscv_##op(__riscv_vget_##T##S##m1(v, 6), \
+                              __riscv_vget_##T##S##m1(v, 7), \
+                              __riscv_vsetvlmax_e##S##m1()), \
+                  __riscv_vsetvlmax_e##S##m1()), \
+    __riscv_vsetvlmax_e##S##m1())
+
+#define RISCV_PERM8(f,v,vidx) __riscv_vcreate_v_u8m1_u8m8( \
+    f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \
+    f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \
+    f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \
+    f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \
+    f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \
+    f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \
+    f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \
+    f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1()))
+
+#define RISCV_LUT8(f,vtbl,v) __riscv_vcreate_v_u8m1_u8m8( \
+    f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \
+    f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \
+    f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \
+    f(vtbl, __riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1()), \
+    f(vtbl, __riscv_vget_u8m1(v, 4), __riscv_vsetvlmax_e8m1()), \
+    f(vtbl, __riscv_vget_u8m1(v, 5), __riscv_vsetvlmax_e8m1()), \
+    f(vtbl, __riscv_vget_u8m1(v, 6), __riscv_vsetvlmax_e8m1()), \
+    f(vtbl, __riscv_vget_u8m1(v, 7), __riscv_vsetvlmax_e8m1()))
+
+
+#define RISCV_VMFLTZ(T,v,vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl)
+
+#endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */
diff --git a/kernels/volk/volk_16i_32fc_dot_prod_32fc.h b/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -668,5 +668,68 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
 
 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
 
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
+                                const short* input,
+                                const lv_32fc_t* taps,
+                                unsigned int num_points)
+{
+    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+    vfloat32m4_t vsumi = vsumr;
+    size_t n = num_points;
+    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+        vl = __riscv_vsetvl_e32m4(n);
+        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
+        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc,  0, vl));
+        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
+        vfloat32m4_t v = __riscv_vfwcvt_f(
+                __riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
+        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
+        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
+    }
+    size_t vl =__riscv_vsetvlmax_e32m1();
+    vfloat32m1_t vr = RISCV_SHRINK4(vfadd,f,32,vsumr);
+    vfloat32m1_t vi = RISCV_SHRINK4(vfadd,f,32,vsumi);
+    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVV*/
+
+#ifdef LV_HAVE_RVVSEG
+#include <riscv_vector.h>
+#include <volk/volk_rvv_intrinsics.h>
+
+static inline void
+volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
+                                   const short* input,
+                                   const lv_32fc_t* taps,
+                                   unsigned int num_points)
+{
+    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
+    vfloat32m4_t vsumi = vsumr;
+    size_t n = num_points;
+    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
+        vl = __riscv_vsetvl_e32m4(n);
+        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
+        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
+        vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
+        vfloat32m4_t v = __riscv_vfwcvt_f(
+                __riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
+        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
+        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
+    }
+    size_t vl =__riscv_vsetvlmax_e32m1();
+    vfloat32m1_t vr = RISCV_SHRINK4(vfadd,f,32,vsumr);
+    vfloat32m1_t vi = RISCV_SHRINK4(vfadd,f,32,vsumi);
+    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
+    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
+                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
+}
+#endif /*LV_HAVE_RVVSEG*/
 
 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
diff --git a/kernels/volk/volk_16i_branch_4_state_8.h b/kernels/volk/volk_16i_branch_4_state_8.h
@@ -10,6 +10,10 @@
 /*!
  * \page volk_16i_branch_4_state_8
  *
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
  * \b Overview
  *
  * <FIXME>

diff --git a/kernels/volk/volk_16i_convert_8i.h b/kernels/volk/volk_16i_convert_8i.h
@@ -275,5 +275,21 @@ static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
 }
 #endif /* LV_HAVE_NEON */
 
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_16i_convert_8i_rvv(int8_t* outputVector,
+                        const int16_t* inputVector,
+                        unsigned int num_points)
+{
+    size_t n = num_points;
+    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+        vl = __riscv_vsetvl_e16m8(n);
+        vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl);
+        __riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl);
+    }
+}
+#endif /*LV_HAVE_RVV*/
 
 #endif /* INCLUDED_volk_16i_convert_8i_a_H */
diff --git a/kernels/volk/volk_16i_max_star_16i.h b/kernels/volk/volk_16i_max_star_16i.h
@@ -10,6 +10,10 @@
 /*!
  * \page volk_16i_max_star_16i
  *
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
  * \b Overview
  *
  * <FIXME>

diff --git a/kernels/volk/volk_16i_max_star_horizontal_16i.h b/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -11,6 +11,10 @@
 /*!
  * \page volk_16i_max_star_horizontal_16i
  *
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
  * \b Overview
  *
  * <FIXME>

diff --git a/kernels/volk/volk_16i_permute_and_scalar_add.h b/kernels/volk/volk_16i_permute_and_scalar_add.h
@@ -10,6 +10,10 @@
 /*!
  * \page volk_16i_permute_and_scalar_add
  *
+ * \b Deprecation
+ *
+ * This kernel is deprecated.
+ *
  * \b Overview
  *
  * <FIXME>

diff --git a/kernels/volk/volk_16i_s32f_convert_32f.h b/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -483,4 +483,22 @@ static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
 }
 #endif /* LV_HAVE_SSE */
 
+#ifdef LV_HAVE_RVV
+#include <riscv_vector.h>
+
+static inline void
+volk_16i_s32f_convert_32f_rvv(float* outputVector,
+                              const int16_t* inputVector,
+                              const float scalar,
+                              unsigned int num_points)
+{
+    size_t n = num_points;
+    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
+        vl = __riscv_vsetvl_e16m4(n);
+        vfloat32m8_t v = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(inputVector, vl), vl);
+        __riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f/scalar, vl), vl);
+    }
+}
+#endif /*LV_HAVE_RVV*/
+
 #endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */