Skip to content

Commit 651abe0

Browse files
committed
granular parallel generic kernel for byteswap
Signed-off-by: Marcus Müller <[email protected]>
1 parent e853e9b commit 651abe0

File tree

2 files changed

+37
-26
lines changed

2 files changed

+37
-26
lines changed

kernels/volk/volk_64u_byteswap.h

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,30 @@
5656
#include <inttypes.h>
5757
#include <stdio.h>
5858

59+
#ifdef LV_HAVE_GENERIC
60+
/* Adapted from https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
61+
* Where they reverse the bits in an N-bit word. But who's stopping me from doing the same
62+
* on byte level?
63+
* Idea is simple: swap the elementary units with half of them "selected" each step, in a
64+
* Hadamard kind of selection.
65+
*/
66+
67+
/*
 * Reverses the byte order of every 64-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 64-bit words; overwritten with the result.
 * num_points: number of 64-bit words to process; 0 leaves the buffer untouched.
 *
 * Each word is reversed in three passes that exchange progressively larger
 * neighbouring units: bytes, then 16-bit halves, then the two 32-bit halves.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        uint64_t word = intsToSwap[idx];

        /* pass 1: exchange the two bytes inside every 16-bit unit */
        word = ((word >> 8) & 0x00FF00FF00FF00FF) | ((word << 8) & 0xFF00FF00FF00FF00);

        /* pass 2: exchange the two 16-bit units inside every 32-bit unit */
        word = ((word >> 16) & 0x0000FFFF0000FFFF) | ((word << 16) & 0xFFFF0000FFFF0000);

        /* pass 3: exchange the upper and lower 32-bit halves */
        word = (word >> 32) | (word << 32);

        intsToSwap[idx] = word;
    }
}
81+
#endif
82+
5983
#ifdef LV_HAVE_SSE2
6084
#include <emmintrin.h>
6185

@@ -109,30 +133,6 @@ static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int n
109133
}
110134
#endif /* LV_HAVE_SSE2 */
111135

112-
113-
#ifdef LV_HAVE_GENERIC
114-
115-
/*
 * Reverses the byte order of every 64-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 64-bit words; overwritten with the result.
 * num_points: number of 64-bit words to process; 0 leaves the buffer untouched.
 *
 * Each word is split into its two 32-bit halves, each half is byte-reversed,
 * and the halves are recombined in exchanged positions. The halves are
 * obtained by shifting the 64-bit value rather than by viewing the buffer
 * through a uint32_t pointer, so the uint64_t objects are never accessed
 * through an incompatible lvalue type (strict aliasing), and the result does
 * not depend on the host byte order.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    for (unsigned int point = 0; point < num_points; point++) {
        const uint64_t in = intsToSwap[point];
        uint32_t low = (uint32_t)in;          /* least significant 32 bits */
        uint32_t high = (uint32_t)(in >> 32); /* most significant 32 bits */

        low = (((low >> 24) & 0xff) | ((low >> 8) & 0x0000ff00) |
               ((low << 8) & 0x00ff0000) | ((low << 24) & 0xff000000));

        high = (((high >> 24) & 0xff) | ((high >> 8) & 0x0000ff00) |
                ((high << 8) & 0x00ff0000) | ((high << 24) & 0xff000000));

        /* the reversed low half becomes the new high half, and vice versa */
        intsToSwap[point] = ((uint64_t)low << 32) | (uint64_t)high;
    }
}
134-
#endif /* LV_HAVE_GENERIC */
135-
136136
#if LV_HAVE_AVX2
137137
#include <immintrin.h>
138138
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
@@ -476,8 +476,8 @@ static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
476476

477477
#ifdef LV_HAVE_GENERIC
478478

479-
static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
480-
unsigned int num_points)
479+
static inline void volk_64u_byteswap_generic_decompose(uint64_t* intsToSwap,
480+
unsigned int num_points)
481481
{
482482
uint32_t* inputPtr = (uint32_t*)intsToSwap;
483483
unsigned int point;

kernels/volk/volk_64u_byteswappuppet_64u.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@
1515
#include <string.h>
1616
#include <volk/volk_64u_byteswap.h>
1717

18+
#ifdef LV_HAVE_GENERIC
19+
/*
 * Two-buffer wrapper around volk_64u_byteswap_generic_decompose.
 *
 * output:     receives num_points 64-bit words copied from intsToSwap after
 *             the kernel has run.
 * intsToSwap: input buffer; the kernel is applied to it in place, so it is
 *             modified by this call.
 * num_points: number of 64-bit words to process.
 */
static inline void volk_64u_byteswappuppet_64u_generic_decompose(uint64_t* output,
                                                                 uint64_t* intsToSwap,
                                                                 unsigned int num_points)
{
    const size_t num_bytes = num_points * sizeof(uint64_t);

    /* run the in-place kernel first, then hand its result to the caller */
    volk_64u_byteswap_generic_decompose(intsToSwap, num_points);
    memcpy(output, intsToSwap, num_bytes);
}
27+
#endif
28+
1829
#ifdef LV_HAVE_GENERIC
1930
static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output,
2031
uint64_t* intsToSwap,

0 commit comments

Comments
 (0)