diff --git a/kernels/volk/volk_8i_x2_saturated_sum_8i.h b/kernels/volk/volk_8i_x2_saturated_sum_8i.h new file mode 100644 index 000000000..848b80033 --- /dev/null +++ b/kernels/volk/volk_8i_x2_saturated_sum_8i.h @@ -0,0 +1,277 @@ +/* -*- c++ -*- */ +/* + * Copyright 2015 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/*! + * \page volk_8i_x2_saturated_sum_8i + * + * \b Overview + * + * Carry out a saturated addition on a vectors of signed 8-bit integers. + * + * Dispatcher Prototype + * \code + * void volk_8i_x2_saturated_sum_8i(int8_t* outVector, const int8_t* inVectorA, const int8_t* inVectorB, const unsigned int num_vals) + * \endcode + * + * \b Inputs + * \li inVectorA: The first input vector of 8-bit integers. + * \li inVectorB: The second input vector of 8-bit integers. + * \li num_vals: The number of data points. + * + * \b Outputs + * \li outVector: The resulting output of 8-bit integers. + * + * \b Example + * + * The follow example adds some vectors such that the result will eventually saturate + * + * \code + * int N = 100; + * unsigned int alignment = volk_get_alignment(); + * int8_t* vecA = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment); + * int8_t* vecB = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment); + * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * vecA[ii] = ii; + * vecB[ii] = 100+ii; + * } + * + * volk_8i_x2_saturated_sum_8i(out, vecA, vecB, N); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %d\n", ii, out[ii]); + * } + * + * volk_free(vecA); + * volk_free(vecB); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_8i_x2_saturated_sum_8i_u_H +#define INCLUDED_volk_8i_x2_saturated_sum_8i_u_H + +#include +#include + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_8i_x2_saturated_sum_8i_generic(int8_t* outVector, + const int8_t* inVectorA, const int8_t* inVectorB, + const unsigned int num_vals) +{ + unsigned int i = 0; + int16_t tmpPlus; // Need to move to more bits to workaround possible overflow + + for(; i < num_vals; i++){ + tmpPlus = inVectorB[i] + inVectorA[i]; + // Saturate to avoid overflow + tmpPlus = (tmpPlus > 127) ? 127 : tmpPlus; + tmpPlus = (tmpPlus < -128) ? -128 : tmpPlus; + + outVector[i] = ((int8_t)tmpPlus); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSE2 + +#include + +static inline void +volk_8i_x2_saturated_sum_8i_u_sse2(int8_t* outVector, + const int8_t* inVectorA, const int8_t* inVectorB, + const unsigned int num_vals) +{ + const unsigned int VEC_SIZE = 16; + const unsigned int vecNum = num_vals / VEC_SIZE; + unsigned int i = 0; + int16_t tmpPlus; // Need to move to more bits to workaround possible overflow + + for(; i < vecNum; i++) { + __m128i a = _mm_loadu_si128((const __m128i*)(inVectorA + VEC_SIZE*i)); + __m128i b = _mm_loadu_si128((const __m128i*)(inVectorB + VEC_SIZE*i)); + __m128i result = _mm_adds_epi8(b, a); + _mm_storeu_si128((__m128i*)(outVector + VEC_SIZE*i), result); + } + + i = VEC_SIZE*vecNum; + for(; i < num_vals; i++) { + tmpPlus = inVectorB[i] + inVectorA[i]; + // Saturate to avoid overflow + tmpPlus = (tmpPlus > 127) ? 127 : tmpPlus; + tmpPlus = (tmpPlus < -128) ? -128 : tmpPlus; + + outVector[i] = ((int8_t)tmpPlus); + } +} + +#endif /* LV_HAVE_SSE2 for unaligned */ + + +#ifdef LV_HAVE_AVX2 + +#include + +static inline void +volk_8i_x2_saturated_sum_8i_u_avx2(int8_t* outVector, + const int8_t* inVectorA, const int8_t* inVectorB, + const unsigned int num_vals) +{ + const unsigned int VEC_SIZE = 32; + const unsigned int vecNum = num_vals / VEC_SIZE; + unsigned int i = 0; + int16_t tmpPlus; // Need to move to more bits to workaround possible overflow + + for(; i < vecNum; i++) { + __m256i a = _mm256_loadu_si256((const __m256i*)(inVectorA + VEC_SIZE*i)); + __m256i b = _mm256_loadu_si256((const __m256i*)(inVectorB + VEC_SIZE*i)); + __m256i result = _mm256_adds_epi8(b, a); + _mm256_storeu_si256((__m256i*)(outVector + VEC_SIZE*i), result); + } + + i = VEC_SIZE*vecNum; + for(; i < num_vals; i++) { + tmpPlus = inVectorB[i] + inVectorA[i]; + // Saturate to avoid overflow + tmpPlus = (tmpPlus > 127) ? 127 : tmpPlus; + tmpPlus = (tmpPlus < -128) ? -128 : tmpPlus; + + outVector[i] = ((int8_t)tmpPlus); + } +} + +#endif /* LV_HAVE_AVX2 for unaligned */ + +#endif /* INCLUDED_VOLK_8s_SATURATED_SUM_8s_UNALIGNED8_H */ + + +#ifndef INCLUDED_volk_8i_x2_saturated_sum_8i_a_H +#define INCLUDED_volk_8i_x2_saturated_sum_8i_a_H + +#ifdef LV_HAVE_SSE2 + +#include + +static inline void +volk_8i_x2_saturated_sum_8i_a_sse2(int8_t* outVector, + const int8_t* inVectorA, const int8_t* inVectorB, + const unsigned int num_vals) +{ + const unsigned int VEC_SIZE = 16; + const unsigned int vecNum = num_vals / VEC_SIZE; + unsigned int i = 0; + int16_t tmpPlus; // Need to move to more bits to workaround possible overflow + + for(; i < vecNum; i++) { + __m128i a = _mm_load_si128((const __m128i*)(inVectorA + VEC_SIZE*i)); + __m128i b = _mm_load_si128((const __m128i*)(inVectorB + VEC_SIZE*i)); + __m128i result = _mm_adds_epi8(b, a); + _mm_store_si128((__m128i*)(outVector + VEC_SIZE*i), result); + } + + i = VEC_SIZE*vecNum; + for(; i < num_vals; i++) { + tmpPlus = inVectorB[i] + inVectorA[i]; + // Saturate to avoid overflow + tmpPlus = (tmpPlus > 127) ? 127 : tmpPlus; + tmpPlus = (tmpPlus < -128) ? -128 : tmpPlus; + + outVector[i] = ((int8_t)tmpPlus); + } +} + +#endif /* LV_HAVE_SSE2 for aligned */ + + +#ifdef LV_HAVE_AVX2 + +#include + +static inline void +volk_8i_x2_saturated_sum_8i_a_avx2(int8_t* outVector, + const int8_t* inVectorA, const int8_t* inVectorB, + const unsigned int num_vals) +{ + const unsigned int VEC_SIZE = 32; + const unsigned int vecNum = num_vals / VEC_SIZE; + unsigned int i = 0; + int16_t tmpPlus; // Need to move to more bits to workaround possible overflow + + for(; i < vecNum; i++) { + __m256i a = _mm256_load_si256((const __m256i*)(inVectorA + VEC_SIZE*i)); + __m256i b = _mm256_load_si256((const __m256i*)(inVectorB + VEC_SIZE*i)); + __m256i result = _mm256_adds_epi8(b, a); + _mm256_store_si256((__m256i*)(outVector + VEC_SIZE*i), result); + } + + i = VEC_SIZE*vecNum; + for(; i < num_vals; i++) { + tmpPlus = inVectorB[i] + inVectorA[i]; + // Saturate to avoid overflow + tmpPlus = (tmpPlus > 127) ? 127 : tmpPlus; + tmpPlus = (tmpPlus < -128) ? -128 : tmpPlus; + + outVector[i] = ((int8_t)tmpPlus); + } +} + +#endif /* LV_HAVE_AVX2 for aligned */ + + +#ifdef LV_HAVE_NEON + +#include + +static inline void +volk_8i_x2_saturated_sum_8i_a_neon(int8_t* outVector, + const int8_t* inVectorA, const int8_t* inVectorB, + const unsigned int num_vals) +{ + const unsigned int VEC_SIZE = 16; + const unsigned int vecNum = num_vals / VEC_SIZE; + unsigned int i = 0; + int16_t tmpPlus; // Need to move to more bits to workaround possible overflow + + for(; i < vecNum; i++) { + int8x16_t a = vld1q_s8((const int8x16_t*)(inVectorA + VEC_SIZE*i)); + int8x16_t b = vld1q_s8((const int8x16_t*)(inVectorB + VEC_SIZE*i)); + vst1q_s8((const int8x16_t*)(outVector + VEC_SIZE*i), vqaddq_s8(b, a)); + } + + i = VEC_SIZE*vecNum; + for(; i < num_vals; i++) { + tmpPlus = inVectorB[i] + inVectorA[i]; + // Saturate to avoid overflow + tmpPlus = (tmpPlus > 127) ? 127 : tmpPlus; + tmpPlus = (tmpPlus < -128) ? -128 : tmpPlus; + + outVector[i] = ((int8_t)tmpPlus); + } +} + +#endif /* LV_HAVE_NEON for aligned */ + +#endif /* INCLUDED_VOLK_8s_SATURATED_SUM_8s_ALIGNED8_H */ diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index 23c0c45a4..76125dfad 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -115,6 +115,7 @@ std::vector init_test_list(volk_test_params_t test_params) (VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) (VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) (VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) + (VOLK_INIT_TEST(volk_8i_x2_saturated_sum_8i, test_params)) (VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) (VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) (VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))