diff --git a/docs/2024.html b/docs/2024.html
index 602dce1e09..ab71a19843 100644
--- a/docs/2024.html
+++ b/docs/2024.html
@@ -39,7 +39,7 @@
December X, 2024 (version X.X.144)
Algorithms
New features
- - SSE4.1 optimizations of function Yuv444pToRgbaV2.
+ - SSE4.1, AVX2 optimizations of function Yuv444pToRgbaV2.
- SSE4.1 optimizations of class ImageJpegLoader.
diff --git a/src/Simd/SimdAvx2.h b/src/Simd/SimdAvx2.h
index ad49329ff8..e330f93763 100644
--- a/src/Simd/SimdAvx2.h
+++ b/src/Simd/SimdAvx2.h
@@ -653,6 +653,9 @@ namespace Simd
void Yuv444pToRgbV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* rgb, size_t rgbStride, SimdYuvType yuvType);
+ void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
+ size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType); // AVX2 variant; callers are expected to pass width >= A (the implementation asserts it)
+
void Yuv420pToUyvy422(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
size_t width, size_t height, uint8_t* uyvy, size_t uyvyStride);
}
diff --git a/src/Simd/SimdAvx2YuvToBgraV2.cpp b/src/Simd/SimdAvx2YuvToBgraV2.cpp
index 2742ce56f0..2d9a0c0928 100644
--- a/src/Simd/SimdAvx2YuvToBgraV2.cpp
+++ b/src/Simd/SimdAvx2YuvToBgraV2.cpp
@@ -459,8 +459,94 @@ namespace Simd
Yuv420pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
else
Yuv420pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType);
+#endif
+ }
+
+ //-------------------------------------------------------------------------------------------------
+
+ template SIMD_INLINE void YuvToRgba16(__m256i y16, __m256i u16, __m256i v16, const __m256i& a_0, __m256i* rgba)
+ { // Converts one vector of 16-bit Y, U, V lanes to interleaved R,G,B,A bytes and stores the result in rgba[0] and rgba[1].
+ const __m256i b16 = YuvToBlue16(y16, u16);
+ const __m256i g16 = YuvToGreen16(y16, u16, v16);
+ const __m256i r16 = YuvToRed16(y16, v16);
+ const __m256i rg8 = _mm256_or_si256(r16, _mm256_slli_si256(g16, 1)); // G moved up one byte and OR'ed with R: R in the low byte, G in the high byte of each 16-bit lane (assumes the helpers return values that fit in 8 bits - TODO confirm)
+ const __m256i ba8 = _mm256_or_si256(b16, a_0); // B in the low byte; a_0 is expected to carry alpha in the high byte of each 16-bit lane
+ __m256i rgba0 = _mm256_unpacklo_epi16(rg8, ba8); // interleave RG and BA pairs taken from the low half of each 128-bit lane
+ __m256i rgba1 = _mm256_unpackhi_epi16(rg8, ba8); // same for the high half of each 128-bit lane
+ Permute2x128(rgba0, rgba1); // presumably regroups the 128-bit halves so the two stores are in pixel order - helper not visible here
+ Store(rgba + 0, rgba0);
+ Store(rgba + 1, rgba1);
+ }
+
+ template SIMD_INLINE void YuvToRgba(__m256i y8, __m256i u8, __m256i v8, const __m256i& a_0, __m256i* rgba)
+ { // Unpacks the 8-bit Y, U, V vectors and converts them in two YuvToRgba16 steps, filling rgba[0..3].
+ YuvToRgba16(UnpackY(y8), UnpackUV(u8), UnpackUV(v8), a_0, rgba + 0); // writes rgba[0] and rgba[1]
+ YuvToRgba16(UnpackY(y8), UnpackUV(u8), UnpackUV(v8), a_0, rgba + 2); // writes rgba[2] and rgba[3]; NOTE(review): as listed the arguments equal the previous call - presumably template arguments (not visible in this listing) select the other half of the input; confirm
+ }
+
+ template SIMD_INLINE void Yuv444pToRgbaV2(const uint8_t* y, const uint8_t* u, const uint8_t* v, const __m256i& a_0, uint8_t* rgba)
+ { // Loads one 256-bit vector from each of y, u and v and writes the converted pixels to rgba.
+ YuvToRgba(LoadPermuted((__m256i*)y), LoadPermuted((__m256i*)u), LoadPermuted((__m256i*)v), a_0, (__m256i*)rgba); // LoadPermuted presumably pre-shuffles the load so the later per-lane unpacks give pixels in order - helper not visible here
+ }
+
+ template void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
+ size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha)
+ { // Row loop: converts a width x height image with separate, equally sampled Y, U, V planes to RGBA using a constant alpha.
+ assert(width >= A); // the tail step below re-processes the last A pixels of a row, so a row must hold at least one full vector
+ if (align) // NOTE(review): 'align' is not declared in the visible code - presumably a template parameter stripped from this listing
+ {
+ assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
+ assert(Aligned(v) && Aligned(vStride) && Aligned(rgba) && Aligned(rgbaStride));
+ }
+
+ __m256i a_0 = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1); // alpha in the high byte of every 16-bit lane, low byte zero
+ size_t bodyWidth = AlignLo(width, A); // presumably width rounded down to a multiple of A - helper not visible here
+ size_t tail = width - bodyWidth; // pixels left over after the full-vector steps
+ for (size_t row = 0; row < height; ++row)
+ {
+ for (size_t colYuv = 0, colRgba = 0; colYuv < bodyWidth; colYuv += A, colRgba += QA) // A source pixels per step; output offset advances by QA (presumably 4 * A bytes)
+ {
+ Yuv444pToRgbaV2(y + colYuv, u + colYuv, v + colYuv, a_0, rgba + colRgba);
+ }
+ if (tail) // width is not a multiple of A: convert the last A pixels of the row, overlapping the body
+ {
+ size_t col = width - A;
+ Yuv444pToRgbaV2(y + col, u + col, v + col, a_0, rgba + 4 * col); // 4 output bytes per pixel
+ }
+ y += yStride; // step every plane and the output to the next row
+ u += uStride;
+ v += vStride;
+ rgba += rgbaStride;
+ }
+ }
+
+ template void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
+ size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType)
+ { // Selects the conversion by yuvType and forwards all other arguments unchanged.
+ switch (yuvType)
+ { // NOTE(review): the four calls below are identical as listed - presumably each carries a different template argument that was stripped from this listing; confirm
+ case SimdYuvBt601: Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break;
+ case SimdYuvBt709: Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break;
+ case SimdYuvBt2020: Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break;
+ case SimdYuvTrect871: Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break;
+ default:
+ assert(0); // unknown yuvType: caught only when asserts are enabled; otherwise nothing is written to rgba
+ }
+ }
+
+ void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride,
+ size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType)
+ { // Non-template entry point declared in SimdAvx2.h: chooses a code path and forwards the arguments.
+#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900
+ Sse41::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); // NDEBUG builds with _MSC_VER <= 1900 delegate to the SSE4.1 version; the reason is not stated here
+#else
+ if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)
+ && Aligned(v) && Aligned(vStride) && Aligned(rgba) && Aligned(rgbaStride)) // every pointer and stride must be aligned to take the first branch
+ Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); // NOTE(review): identical to the else-branch as listed - presumably differs by a stripped aligned/unaligned template argument; confirm
+ else
+ Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
 #endif
 }
}
-#endif// SIMD_AVX2_ENABLE
+#endif // SIMD_AVX2_ENABLE
}
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
index 60936d1640..208f9281c6 100644
--- a/src/Simd/SimdLib.cpp
+++ b/src/Simd/SimdLib.cpp
@@ -6952,11 +6952,11 @@ SIMD_API void SimdYuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_
// Avx512bw::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
// else
//#endif
-//#ifdef SIMD_AVX2_ENABLE
-// if (Avx2::Enable && width >= Avx2::A)
-// Avx2::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
-// else
-//#endif
+#ifdef SIMD_AVX2_ENABLE
+ if (Avx2::Enable && width >= Avx2::A)
+ Avx2::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
+ else
+#endif
#ifdef SIMD_SSE41_ENABLE
if (Sse41::Enable && width >= Sse41::A)
Sse41::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType);
diff --git a/src/Test/TestYuvToBgra.cpp b/src/Test/TestYuvToBgra.cpp
index 95db7e3e00..9e58b7236e 100644
--- a/src/Test/TestYuvToBgra.cpp
+++ b/src/Test/TestYuvToBgra.cpp
@@ -346,11 +346,11 @@ namespace Test
result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Sse41::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
#endif
-//#ifdef SIMD_AVX2_ENABLE
-// if (Simd::Avx2::Enable && TestAvx2())
-// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
-//#endif
-//
+#ifdef SIMD_AVX2_ENABLE
+ if (Simd::Avx2::Enable && TestAvx2())
+ result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);
+#endif
+
//#ifdef SIMD_AVX512BW_ENABLE
// if (Simd::Avx512bw::Enable && TestAvx512bw())
// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);