Skip to content

Commit

Permalink
Merge pull request #671 from jdemel/refactor_scalar_multiply
Browse files Browse the repository at this point in the history
kernel: Refactor 32f_s32f_multiply_32f kernel
  • Loading branch information
jdemel authored Nov 4, 2023
2 parents e853e9b + 204ff55 commit 4292073
Showing 1 changed file with 41 additions and 78 deletions.
119 changes: 41 additions & 78 deletions kernels/volk/volk_32f_s32f_multiply_32f.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,18 @@
#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_GENERIC
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
const float* aVector,
const float scalar,
unsigned int num_points)
{
for (unsigned int number = 0; number < num_points; number++) {
*cVector++ = (*aVector++) * scalar;
}
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

Expand All @@ -67,27 +79,24 @@ static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
const float scalar,
unsigned int num_points)
{
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;

float* cPtr = cVector;
const float* aPtr = aVector;

__m128 aVal, bVal, cVal;
bVal = _mm_set_ps1(scalar);
for (; number < quarterPoints; number++) {
aVal = _mm_loadu_ps(aPtr);
const __m128 bVal = _mm_set_ps1(scalar);
for (unsigned int number = 0; number < quarterPoints; number++) {
__m128 aVal = _mm_loadu_ps(aPtr);

cVal = _mm_mul_ps(aVal, bVal);
__m128 cVal = _mm_mul_ps(aVal, bVal);

_mm_storeu_ps(cPtr, cVal); // Store the results back into the C container

aPtr += 4;
cPtr += 4;
}

number = quarterPoints * 4;
for (; number < num_points; number++) {
for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
*cPtr++ = (*aPtr++) * scalar;
}
}
Expand All @@ -101,28 +110,24 @@ static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
const float scalar,
unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;

float* cPtr = cVector;
const float* aPtr = aVector;

__m256 aVal, bVal, cVal;
bVal = _mm256_set1_ps(scalar);
for (; number < eighthPoints; number++) {
const __m256 bVal = _mm256_set1_ps(scalar);
for (unsigned int number = 0; number < eighthPoints; number++) {
__m256 aVal = _mm256_loadu_ps(aPtr);

aVal = _mm256_loadu_ps(aPtr);

cVal = _mm256_mul_ps(aVal, bVal);
__m256 cVal = _mm256_mul_ps(aVal, bVal);

_mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

aPtr += 8;
cPtr += 8;
}

number = eighthPoints * 8;
for (; number < num_points; number++) {
for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
*cPtr++ = (*aPtr++) * scalar;
}
}
Expand All @@ -135,22 +140,6 @@ extern void volk_32f_s32f_multiply_32f_sifive_u74(float* cVector,
unsigned int num_points);
#endif /* LV_HAVE_RISCV64 */

#ifdef LV_HAVE_GENERIC
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
const float* aVector,
const float scalar,
unsigned int num_points)
{
unsigned int number = 0;
const float* inputPtr = aVector;
float* outputPtr = cVector;
for (number = 0; number < num_points; number++) {
*outputPtr = (*inputPtr) * scalar;
inputPtr++;
outputPtr++;
}
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */

Expand All @@ -169,27 +158,24 @@ static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
const float scalar,
unsigned int num_points)
{
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;

float* cPtr = cVector;
const float* aPtr = aVector;

__m128 aVal, bVal, cVal;
bVal = _mm_set_ps1(scalar);
for (; number < quarterPoints; number++) {
aVal = _mm_load_ps(aPtr);
const __m128 bVal = _mm_set_ps1(scalar);
for (unsigned int number = 0; number < quarterPoints; number++) {
__m128 aVal = _mm_load_ps(aPtr);

cVal = _mm_mul_ps(aVal, bVal);
__m128 cVal = _mm_mul_ps(aVal, bVal);

_mm_store_ps(cPtr, cVal); // Store the results back into the C container

aPtr += 4;
cPtr += 4;
}

number = quarterPoints * 4;
for (; number < num_points; number++) {
for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
*cPtr++ = (*aPtr++) * scalar;
}
}
Expand All @@ -203,27 +189,24 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
const float scalar,
unsigned int num_points)
{
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;

float* cPtr = cVector;
const float* aPtr = aVector;

__m256 aVal, bVal, cVal;
bVal = _mm256_set1_ps(scalar);
for (; number < eighthPoints; number++) {
aVal = _mm256_load_ps(aPtr);
const __m256 bVal = _mm256_set1_ps(scalar);
for (unsigned int number = 0; number < eighthPoints; number++) {
__m256 aVal = _mm256_load_ps(aPtr);

cVal = _mm256_mul_ps(aVal, bVal);
__m256 cVal = _mm256_mul_ps(aVal, bVal);

_mm256_store_ps(cPtr, cVal); // Store the results back into the C container

aPtr += 8;
cPtr += 8;
}

number = eighthPoints * 8;
for (; number < num_points; number++) {
for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
*cPtr++ = (*aPtr++) * scalar;
}
}
Expand All @@ -237,46 +220,26 @@ static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
const float scalar,
unsigned int num_points)
{
unsigned int number = 0;
const float* inputPtr = aVector;
float* outputPtr = cVector;
const unsigned int quarterPoints = num_points / 4;

float32x4_t aVal, cVal;
const float* inputPtr = aVector;
float* outputPtr = cVector;

for (number = 0; number < quarterPoints; number++) {
aVal = vld1q_f32(inputPtr); // Load into NEON regs
cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
vst1q_f32(outputPtr, cVal); // Store results back to output
for (unsigned int number = 0; number < quarterPoints; number++) {
float32x4_t aVal = vld1q_f32(inputPtr); // Load into NEON regs
float32x4_t cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
vst1q_f32(outputPtr, cVal); // Store results back to output
inputPtr += 4;
outputPtr += 4;
}
for (number = quarterPoints * 4; number < num_points; number++) {

for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
*outputPtr++ = (*inputPtr++) * scalar;
}
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
const float* aVector,
const float scalar,
unsigned int num_points)
{
unsigned int number = 0;
const float* inputPtr = aVector;
float* outputPtr = cVector;
for (number = 0; number < num_points; number++) {
*outputPtr = (*inputPtr) * scalar;
inputPtr++;
outputPtr++;
}
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_ORC

extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
Expand All @@ -292,6 +255,6 @@ static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}

#endif /* LV_HAVE_GENERIC */
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */

0 comments on commit 4292073

Please sign in to comment.