Skip to content

Commit 74d872b

Browse files
committed
+add AVX2 optimizations of function Yuv444pToRgbaV2.
1 parent c6f29b2 commit 74d872b

File tree

5 files changed

+101
-12
lines changed

5 files changed

+101
-12
lines changed

docs/2024.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ <h3 id="R144">December X, 2024 (version X.X.144)</h3>
3939
<h4>Algorithms</h4>
4040
<h5>New features</h5>
4141
<ul>
42-
<li>SSE4.1 optimizations of function Yuv444pToRgbaV2.</li>
42+
<li>SSE4.1, AVX2 optimizations of function Yuv444pToRgbaV2.</li>
4343
<li>SSE4.1 optimizations of class ImageJpegLoader.</li>
4444
</ul>
4545

src/Simd/SimdAvx2.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,9 @@ namespace Simd
653653
void Yuv444pToRgbV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
654654
size_t width, size_t height, uint8_t* rgb, size_t rgbStride, SimdYuvType yuvType);
655655

656+
void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
657+
size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType);
658+
656659
void Yuv420pToUyvy422(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
657660
size_t width, size_t height, uint8_t* uyvy, size_t uyvyStride);
658661
}

src/Simd/SimdAvx2YuvToBgraV2.cpp

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -459,8 +459,94 @@ namespace Simd
459459
Yuv420pToBgraV2<true>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
460460
else
461461
Yuv420pToBgraV2<false>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
462+
#endif
463+
}
464+
465+
//-------------------------------------------------------------------------------------------------
466+
467+
// Converts one register of 16-bit Y, U and V lanes to interleaved RGBA bytes
// and writes the result to two consecutive vectors at rgba[0] and rgba[1].
//   align - selects the Store<align> variant used for both writes.
//   T     - conversion traits type forwarded to the YuvTo*16 helpers.
//   a_0   - alpha term OR-ed into the blue lanes; the caller in this file
//           builds it with the alpha value in the upper byte of every
//           16-bit lane.
// NOTE(review): the packing below assumes the YuvTo*16 helpers return values
// that fit in the low byte of each 16-bit lane - confirm against their definitions.
template <bool align, class T> SIMD_INLINE void YuvToRgba16(__m256i y16, __m256i u16, __m256i v16, const __m256i& a_0, __m256i* rgba)
{
    const __m256i red = YuvToRed16<T>(y16, v16);
    const __m256i green = YuvToGreen16<T>(y16, u16, v16);
    const __m256i blue = YuvToBlue16<T>(y16, u16);

    // Shifting green up by one byte (within each 128-bit lane) places it in the
    // upper byte of every 16-bit lane, so each lane becomes an (R, G) byte pair.
    const __m256i redGreen = _mm256_or_si256(red, _mm256_slli_si256(green, 1));
    // Likewise every 16-bit lane here becomes a (B, A) byte pair.
    const __m256i blueAlpha = _mm256_or_si256(blue, a_0);

    // Interleave the 16-bit pairs into 32-bit R, G, B, A pixels.
    __m256i pixelsLo = _mm256_unpacklo_epi16(redGreen, blueAlpha);
    __m256i pixelsHi = _mm256_unpackhi_epi16(redGreen, blueAlpha);
    // The unpacks work per 128-bit lane; Permute2x128 presumably restores the
    // sequential pixel order across the two registers - see its definition.
    Permute2x128(pixelsLo, pixelsHi);

    Store<align>(rgba + 0, pixelsLo);
    Store<align>(rgba + 1, pixelsHi);
}
480+
481+
// Converts one register of 8-bit Y, U and V samples to RGBA.
// The samples are widened in two parts (index 0 and index 1 of UnpackY /
// UnpackUV); each part is converted by YuvToRgba16 and produces two output
// vectors, so four vectors are written in total: rgba[0..1] and rgba[2..3].
template <bool align, class T> SIMD_INLINE void YuvToRgba(__m256i y8, __m256i u8, __m256i v8, const __m256i& a_0, __m256i* rgba)
{
    // First part of the widened samples -> output vectors 0 and 1.
    const __m256i y16Part0 = UnpackY<T, 0>(y8);
    const __m256i u16Part0 = UnpackUV<T, 0>(u8);
    const __m256i v16Part0 = UnpackUV<T, 0>(v8);
    YuvToRgba16<align, T>(y16Part0, u16Part0, v16Part0, a_0, rgba + 0);

    // Second part of the widened samples -> output vectors 2 and 3.
    const __m256i y16Part1 = UnpackY<T, 1>(y8);
    const __m256i u16Part1 = UnpackUV<T, 1>(u8);
    const __m256i v16Part1 = UnpackUV<T, 1>(v8);
    YuvToRgba16<align, T>(y16Part1, u16Part1, v16Part1, a_0, rgba + 2);
}
486+
487+
// Converts one vector's worth of YUV444 planar samples to RGBA.
// Loads one __m256i from each of the y, u and v pointers with
// LoadPermuted<align> and hands the three registers to YuvToRgba, which
// writes the converted pixels starting at rgba.
template <bool align, class T> SIMD_INLINE void Yuv444pToRgbaV2(const uint8_t* y, const uint8_t* u, const uint8_t* v, const __m256i& a_0, uint8_t* rgba)
{
    const __m256i y8 = LoadPermuted<align>((__m256i*)y);
    const __m256i u8 = LoadPermuted<align>((__m256i*)u);
    const __m256i v8 = LoadPermuted<align>((__m256i*)v);
    YuvToRgba<align, T>(y8, u8, v8, a_0, (__m256i*)rgba);
}
491+
492+
template <bool align, class T> void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
493+
size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha)
494+
{
495+
assert(width >= A);
496+
if (align)
497+
{
498+
assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
499+
assert(Aligned(v) && Aligned(vStride) && Aligned(rgba) && Aligned(rgbaStride));
500+
}
501+
502+
__m256i a_0 = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1);
503+
size_t bodyWidth = AlignLo(width, A);
504+
size_t tail = width - bodyWidth;
505+
for (size_t row = 0; row < height; ++row)
506+
{
507+
for (size_t colYuv = 0, colRgba = 0; colYuv < bodyWidth; colYuv += A, colRgba += QA)
508+
{
509+
Yuv444pToRgbaV2<align, T>(y + colYuv, u + colYuv, v + colYuv, a_0, rgba + colRgba);
510+
}
511+
if (tail)
512+
{
513+
size_t col = width - A;
514+
Yuv444pToRgbaV2<false, T>(y + col, u + col, v + col, a_0, rgba + 4 * col);
515+
}
516+
y += yStride;
517+
u += uStride;
518+
v += vStride;
519+
rgba += rgbaStride;
520+
}
521+
}
522+
523+
template <bool align> void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
524+
size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType)
525+
{
526+
switch (yuvType)
527+
{
528+
case SimdYuvBt601: Yuv444pToRgbaV2<align, Base::Bt601>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break;
529+
case SimdYuvBt709: Yuv444pToRgbaV2<align, Base::Bt709>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break;
530+
case SimdYuvBt2020: Yuv444pToRgbaV2<align, Base::Bt2020>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break;
531+
case SimdYuvTrect871: Yuv444pToRgbaV2<align, Base::Trect871>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break;
532+
default:
533+
assert(0);
534+
}
535+
}
536+
537+
void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
538+
size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType)
539+
{
540+
#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900
541+
Sse41::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
542+
#else
543+
if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
544+
&& Aligned(v) && Aligned(vStride) && Aligned(rgba) && Aligned(rgbaStride))
545+
Yuv444pToRgbaV2<true>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
546+
else
547+
Yuv444pToRgbaV2<false>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
462548
#endif
463549
}
464550
}
465-
#endif// SIMD_AVX2_ENABLE
551+
#endif
466552
}

src/Simd/SimdLib.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6952,11 +6952,11 @@ SIMD_API void SimdYuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_
69526952
// Avx512bw::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
69536953
// else
69546954
//#endif
6955-
//#ifdef SIMD_AVX2_ENABLE
6956-
// if (Avx2::Enable && width >= Avx2::A)
6957-
// Avx2::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
6958-
// else
6959-
//#endif
6955+
#ifdef SIMD_AVX2_ENABLE
6956+
if (Avx2::Enable && width >= Avx2::A)
6957+
Avx2::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
6958+
else
6959+
#endif
69606960
#ifdef SIMD_SSE41_ENABLE
69616961
if (Sse41::Enable && width >= Sse41::A)
69626962
Sse41::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);

src/Test/TestYuvToBgra.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -346,11 +346,11 @@ namespace Test
346346
result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Sse41::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
347347
#endif
348348

349-
//#ifdef SIMD_AVX2_ENABLE
350-
// if (Simd::Avx2::Enable && TestAvx2())
351-
// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
352-
//#endif
353-
//
349+
#ifdef SIMD_AVX2_ENABLE
350+
if (Simd::Avx2::Enable && TestAvx2())
351+
result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
352+
#endif
353+
354354
//#ifdef SIMD_AVX512BW_ENABLE
355355
// if (Simd::Avx512bw::Enable && TestAvx512bw())
356356
// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);

0 commit comments

Comments
 (0)