diff --git a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h index 118b5972..f4919e2d 100644 --- a/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h +++ b/kernels/volk/volk_8u_conv_k7_r2puppet_8u.h @@ -97,8 +97,8 @@ static inline int chainback_viterbi(unsigned char* data, #include #include -static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, - unsigned char* dec, +static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec, + unsigned char* syms, unsigned int framebits) { if (framebits < 12) { @@ -181,8 +181,8 @@ static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms, #include "volk/sse2neon.h" -static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* syms, - unsigned char* dec, +static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec, + unsigned char* syms, unsigned int framebits) { if (framebits < 12) { @@ -266,8 +266,8 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* syms, //#include //#include // -// static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms, -// unsigned char* dec, +// static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec, +// unsigned char* syms, // unsigned int framebits) //{ // if (framebits < 12) { @@ -349,8 +349,8 @@ static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* syms, #if LV_HAVE_GENERIC -static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms, - unsigned char* dec, +static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec, + unsigned char* syms, unsigned int framebits) { if (framebits < 12) { diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h index 239cdd1b..3e1ba6b5 100644 --- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h @@ -57,19 +57,17 @@ typedef union { #endif -static inline void renormalize(unsigned char* X, unsigned char threshold) +static inline void renormalize(unsigned char* X) { int NUMSTATES = 64; int i; unsigned char min = X[0]; - // if(min > threshold) { for (i = 0; i < NUMSTATES; i++) if (min > X[i]) min = X[i]; for (i = 0; i < NUMSTATES; i++) X[i] -= min; - //} } @@ -85,16 +83,17 @@ static inline void BFLY(int i, int j; unsigned int decision0, decision1; unsigned char metric, m0, m1, m2, m3; + unsigned short metricsum; int NUMSTATES = 64; int RATE = 2; - int METRICSHIFT = 2; + int METRICSHIFT = 1; int PRECISIONSHIFT = 2; - metric = 0; + metricsum = 1; for (j = 0; j < RATE; j++) - metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT; - metric = metric >> PRECISIONSHIFT; + metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]); + metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT; unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT); @@ -103,8 +102,8 @@ static inline void BFLY(int i, m2 = X[i] + (max - metric); m3 = X[i + NUMSTATES / 2] + metric; - decision0 = (signed int)(m0 - m1) > 0; - decision1 = (signed int)(m2 - m3) > 0; + decision0 = (signed int)(m0 - m1) >= 0; + decision1 = (signed int)(m2 - m3) >= 0; Y[2 * i] = decision0 ? m1 : m0; Y[2 * i + 1] = decision1 ? m3 : m2; @@ -297,7 +296,7 @@ static inline void BFLY(int i, // } // } // -// renormalize(X, 210); +// renormalize(X); // // unsigned int j; // for (j = 0; j < (framebits + excess) % 2; ++j) { @@ -312,7 +311,7 @@ static inline void BFLY(int i, // Branchtab); // } // -// renormalize(Y, 210); +// renormalize(Y); // } // /*skip*/ //} @@ -438,27 +437,25 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, *(a112) = s28; a113 = (a95 + 3); *(a113) = s29; - if ((((unsigned char*)Y)[0] > 210)) { - __m128i m5, m6; - m5 = ((__m128i*)Y)[0]; - m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); - m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); - m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); - __m128i m7; - m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); - m7 = - ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); - m7 = - ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); - m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); - m7 = _mm_unpacklo_epi8(m7, m7); - m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); - m6 = _mm_unpacklo_epi64(m7, m7); - ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); - ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); - ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); - ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); - } + + __m128i m5, m6; + m5 = ((__m128i*)Y)[0]; + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); + __m128i m7; + m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); + m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); + m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); + m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); + m7 = _mm_unpacklo_epi8(m7, m7); + m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); + m6 = _mm_unpacklo_epi64(m7, m7); + ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); + ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); + ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); + ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); + unsigned char a188, a194; int a186, a205; short int s48, s49, s54, s55; @@ -561,31 +558,27 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, *(a225) = s56; a226 = (a208 + 3); *(a226) = s57; - if ((((unsigned char*)X)[0] > 210)) { - __m128i m12, m13; - m12 = ((__m128i*)X)[0]; - m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); - m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); - m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); - __m128i m14; - m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); - m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), - ((__m128i)m14))); - m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), - ((__m128i)m14))); - m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), - ((__m128i)m14))); - m14 = _mm_unpacklo_epi8(m14, m14); - m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); - m13 = _mm_unpacklo_epi64(m14, m14); - ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); - ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); - ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); - ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); - } + + __m128i m12, m13; + m12 = ((__m128i*)X)[0]; + m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); + m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); + m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); + __m128i m14; + m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14))); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14))); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14))); + m14 = _mm_unpacklo_epi8(m14, m14); + m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); + m13 = _mm_unpacklo_epi64(m14, m14); + ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); + ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); + ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); + ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); } - renormalize(X, 210); + renormalize(X); /*int ch; for(ch = 0; ch < 64; ch++) { @@ -607,7 +600,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, } - renormalize(Y, 210); + renormalize(Y); /*printf("\n"); for(ch = 0; ch < 64; ch++) { @@ -734,27 +727,25 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, *(a112) = s28; a113 = (a95 + 3); *(a113) = s29; - if ((((unsigned char*)Y)[0] > 210)) { - __m128i m5, m6; - m5 = ((__m128i*)Y)[0]; - m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); - m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); - m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); - __m128i m7; - m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); - m7 = - ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); - m7 = - ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); - m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); - m7 = _mm_unpacklo_epi8(m7, m7); - m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); - m6 = _mm_unpacklo_epi64(m7, m7); - ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); - ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); - ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); - ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); - } + + __m128i m5, m6; + m5 = ((__m128i*)Y)[0]; + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]); + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]); + m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]); + __m128i m7; + m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5); + m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7))); + m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7))); + m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7))); + m7 = _mm_unpacklo_epi8(m7, m7); + m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0)); + m6 = _mm_unpacklo_epi64(m7, m7); + ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6); + ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6); + ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6); + ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6); + unsigned char a188, a194; int a186, a205; short int s48, s49, s54, s55; @@ -857,31 +848,27 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, *(a225) = s56; a226 = (a208 + 3); *(a226) = s57; - if ((((unsigned char*)X)[0] > 210)) { - __m128i m12, m13; - m12 = ((__m128i*)X)[0]; - m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); - m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); - m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); - __m128i m14; - m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); - m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), - ((__m128i)m14))); - m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), - ((__m128i)m14))); - m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), - ((__m128i)m14))); - m14 = _mm_unpacklo_epi8(m14, m14); - m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); - m13 = _mm_unpacklo_epi64(m14, m14); - ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); - ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); - ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); - ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); - } + + __m128i m12, m13; + m12 = ((__m128i*)X)[0]; + m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]); + m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]); + m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]); + __m128i m14; + m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14))); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14))); + m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14))); + m14 = _mm_unpacklo_epi8(m14, m14); + m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0)); + m13 = _mm_unpacklo_epi64(m14, m14); + ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13); + ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13); + ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13); + ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13); } - renormalize(X, 210); + renormalize(X); /*int ch; for(ch = 0; ch < 64; ch++) { @@ -903,7 +890,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, } - renormalize(Y, 210); + renormalize(Y); /*printf("\n"); for(ch = 0; ch < 64; ch++) { @@ -928,7 +915,6 @@ static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, { int nbits = framebits + excess; int NUMSTATES = 64; - int RENORMALIZE_THRESHOLD = 210; int s, i; for (s = 0; s < nbits; s++) { @@ -937,7 +923,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab); } - renormalize(Y, RENORMALIZE_THRESHOLD); + renormalize(Y); /// Swap pointers to old and new metrics tmp = (void*)X;