Skip to content

Commit

Permalink
+add AVX2 optimizations of function Yuv444pToRgbaV2.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Nov 15, 2024
1 parent c6f29b2 commit 74d872b
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 12 deletions.
2 changes: 1 addition & 1 deletion docs/2024.html
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ <h3 id="R144">December X, 2024 (version X.X.144)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>SSE4.1 optimizations of function Yuv444pToRgbaV2.</li>
<li>SSE4.1, AVX2 optimizations of function Yuv444pToRgbaV2.</li>
<li>SSE4.1 optimizations of class ImageJpegLoader.</li>
</ul>

Expand Down
3 changes: 3 additions & 0 deletions src/Simd/SimdAvx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,9 @@ namespace Simd
void Yuv444pToRgbV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* rgb, size_t rgbStride, SimdYuvType yuvType);

// AVX2 conversion of a planar YUV image (separate y, u and v planes, one sample of each plane per pixel)
// to an interleaved 4-byte-per-pixel R, G, B, A image.
//   y, u, v / yStride, uStride, vStride - input planes and their row strides in bytes.
//   width, height - image size in pixels; the implementation asserts width >= A.
//   rgba, rgbaStride - output image and its row stride in bytes.
//   alpha - constant value written to the alpha channel of every output pixel.
//   yuvType - selects the conversion (SimdYuvBt601, SimdYuvBt709, SimdYuvBt2020 or SimdYuvTrect871).
// Pointers and strides may have any alignment: the implementation checks alignment and picks
// the aligned or unaligned code path itself.
void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType);

void Yuv420pToUyvy422(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* uyvy, size_t uyvyStride);
}
Expand Down
88 changes: 87 additions & 1 deletion src/Simd/SimdAvx2YuvToBgraV2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,8 +459,94 @@ namespace Simd
Yuv420pToBgraV2<true>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
else
Yuv420pToBgraV2<false>(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
#endif
}

//-------------------------------------------------------------------------------------------------

// Converts one vector of 16-bit Y, U and V values into interleaved 8-bit R, G, B, A pixels
// and stores the result as two 256-bit vectors at rgba[0] and rgba[1].
//   align - selects the aligned or unaligned Store.
//   T     - conversion traits type forwarded to YuvToRed16 / YuvToGreen16 / YuvToBlue16.
//   a_0   - vector that is OR-ed into the blue vector; it is expected to carry the alpha value
//           in the high byte of every 16-bit element and zero in the low byte.
// NOTE(review): the byte-wise OR below assumes each 16-bit channel value is already limited
// to [0, 255] by the YuvTo*16 helpers - confirm against their definitions.
template <bool align, class T> SIMD_INLINE void YuvToRgba16(__m256i y16, __m256i u16, __m256i v16, const __m256i& a_0, __m256i* rgba)
{
    const __m256i red = YuvToRed16<T>(y16, v16);
    const __m256i green = YuvToGreen16<T>(y16, u16, v16);
    const __m256i blue = YuvToBlue16<T>(y16, u16);

    // Shift by one byte inside each 128-bit lane so that green lands in the high byte of every 16-bit element.
    const __m256i greenHi = _mm256_slli_si256(green, 1);
    const __m256i redGreen = _mm256_or_si256(red, greenHi);   // bytes: R, G, R, G, ...
    const __m256i blueAlpha = _mm256_or_si256(blue, a_0);     // bytes: B, A, B, A, ...

    // Interleave 16-bit (R,G) pairs with 16-bit (B,A) pairs to form 4-byte pixels (per 128-bit lane).
    __m256i pixelsLo = _mm256_unpacklo_epi16(redGreen, blueAlpha);
    __m256i pixelsHi = _mm256_unpackhi_epi16(redGreen, blueAlpha);

    // Presumably restores linear pixel order across the two 128-bit lanes after the in-lane unpacks - confirm.
    Permute2x128(pixelsLo, pixelsHi);

    Store<align>(rgba, pixelsLo);
    Store<align>(rgba + 1, pixelsHi);
}

// Converts one vector of 8-bit Y, U and V samples into four 256-bit vectors of R, G, B, A pixels.
// The samples are widened in two halves (part 0 and part 1 of the Unpack helpers); each half
// produces two output vectors, so part 0 goes to rgba[0..1] and part 1 to rgba[2..3].
template <bool align, class T> SIMD_INLINE void YuvToRgba(__m256i y8, __m256i u8, __m256i v8, const __m256i& a_0, __m256i* rgba)
{
    // First half of the samples.
    const __m256i yPart0 = UnpackY<T, 0>(y8);
    const __m256i uPart0 = UnpackUV<T, 0>(u8);
    const __m256i vPart0 = UnpackUV<T, 0>(v8);
    YuvToRgba16<align, T>(yPart0, uPart0, vPart0, a_0, rgba);

    // Second half of the samples.
    const __m256i yPart1 = UnpackY<T, 1>(y8);
    const __m256i uPart1 = UnpackUV<T, 1>(u8);
    const __m256i vPart1 = UnpackUV<T, 1>(v8);
    YuvToRgba16<align, T>(yPart1, uPart1, vPart1, a_0, rgba + 2);
}

// Loads one 256-bit vector from each of the y, u and v pointers (via LoadPermuted) and converts
// them to R, G, B, A pixels written starting at rgba.
//   align - selects aligned or unaligned loads and stores.
//   a_0   - alpha vector forwarded unchanged to YuvToRgba.
template <bool align, class T> SIMD_INLINE void Yuv444pToRgbaV2(const uint8_t* y, const uint8_t* u, const uint8_t* v, const __m256i& a_0, uint8_t* rgba)
{
    const __m256i y8 = LoadPermuted<align>((__m256i*)y);
    const __m256i u8 = LoadPermuted<align>((__m256i*)u);
    const __m256i v8 = LoadPermuted<align>((__m256i*)v);
    YuvToRgba<align, T>(y8, u8, v8, a_0, (__m256i*)rgba);
}

// Converts a whole planar YUV image (one y, u and v sample per pixel) to interleaved R, G, B, A
// using the conversion traits T and a constant alpha.
//   align - when true, all pointers and strides must be aligned (checked by assert).
//   width - must be at least A pixels (checked by assert).
// Each row is processed in steps of A pixels; when width is not a multiple of A, the last A pixels
// of the row are converted once more with unaligned access, overlapping the already converted part.
template <bool align, class T> void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha)
{
    assert(width >= A);
    if (align)
    {
        assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
        assert(Aligned(v) && Aligned(vStride) && Aligned(rgba) && Aligned(rgbaStride));
    }

    // Alpha in the high byte of every 16-bit element, zero in the low byte.
    __m256i alphaHi = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1);
    const size_t alignedWidth = AlignLo(width, A);
    const bool hasTail = (width - alignedWidth) != 0;

    for (size_t row = 0; row < height; ++row, y += yStride, u += uStride, v += vStride, rgba += rgbaStride)
    {
        size_t src = 0, dst = 0;
        while (src < alignedWidth)
        {
            Yuv444pToRgbaV2<align, T>(y + src, u + src, v + src, alphaHi, rgba + dst);
            src += A;
            dst += QA;
        }
        if (hasTail)
        {
            // Start of the last full vector in the row; output uses 4 bytes per pixel.
            const size_t last = width - A;
            Yuv444pToRgbaV2<false, T>(y + last, u + last, v + last, alphaHi, rgba + 4 * last);
        }
    }
}

// Selects the conversion traits matching yuvType and runs the corresponding
// Yuv444pToRgbaV2<align, T> instantiation. An unknown yuvType triggers assert(0)
// and performs no conversion.
template <bool align> void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType)
{
    switch (yuvType)
    {
    case SimdYuvBt601:
        Yuv444pToRgbaV2<align, Base::Bt601>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha);
        return;
    case SimdYuvBt709:
        Yuv444pToRgbaV2<align, Base::Bt709>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha);
        return;
    case SimdYuvBt2020:
        Yuv444pToRgbaV2<align, Base::Bt2020>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha);
        return;
    case SimdYuvTrect871:
        Yuv444pToRgbaV2<align, Base::Trect871>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha);
        return;
    default:
        break;
    }
    // Reached only for an unsupported yuvType value.
    assert(0);
}

// AVX2 entry point: converts a planar YUV image (one y, u and v sample per pixel) to interleaved
// R, G, B, A with a constant alpha, using the conversion selected by yuvType.
// Chooses the aligned code path only when every pointer and stride is aligned.
void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
    size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType)
{
#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900
    // 32-bit x86 release builds with MSVC 2015 or older use the SSE4.1 version instead.
    // NOTE(review): presumably a workaround for a compiler problem with the AVX2 code - confirm.
    Sse41::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
#else
    const bool inputAligned = Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
        && Aligned(v) && Aligned(vStride);
    const bool allAligned = inputAligned && Aligned(rgba) && Aligned(rgbaStride);
    if (allAligned)
    {
        Yuv444pToRgbaV2<true>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
    }
    else
    {
        Yuv444pToRgbaV2<false>(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
    }
#endif
}
}
#endif// SIMD_AVX2_ENABLE
#endif
}
10 changes: 5 additions & 5 deletions src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6952,11 +6952,11 @@ SIMD_API void SimdYuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_
// Avx512bw::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
// else
//#endif
//#ifdef SIMD_AVX2_ENABLE
// if (Avx2::Enable && width >= Avx2::A)
// Avx2::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
// else
//#endif
#ifdef SIMD_AVX2_ENABLE
if (Avx2::Enable && width >= Avx2::A)
Avx2::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
else
#endif
#ifdef SIMD_SSE41_ENABLE
if (Sse41::Enable && width >= Sse41::A)
Sse41::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
Expand Down
10 changes: 5 additions & 5 deletions src/Test/TestYuvToBgra.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,11 +346,11 @@ namespace Test
result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Sse41::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
#endif

//#ifdef SIMD_AVX2_ENABLE
// if (Simd::Avx2::Enable && TestAvx2())
// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
//#endif
//
#ifdef SIMD_AVX2_ENABLE
if (Simd::Avx2::Enable && TestAvx2())
result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
#endif

//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable && TestAvx512bw())
// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
Expand Down

0 comments on commit 74d872b

Please sign in to comment.