diff --git a/docs/2024.html b/docs/2024.html index 602dce1e09..ab71a19843 100644 --- a/docs/2024.html +++ b/docs/2024.html @@ -39,7 +39,7 @@

December X, 2024 (version X.X.144)

Algorithms

New features
diff --git a/src/Simd/SimdAvx2.h b/src/Simd/SimdAvx2.h index ad49329ff8..e330f93763 100644 --- a/src/Simd/SimdAvx2.h +++ b/src/Simd/SimdAvx2.h @@ -653,6 +653,9 @@ namespace Simd void Yuv444pToRgbV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, size_t width, size_t height, uint8_t* rgb, size_t rgbStride, SimdYuvType yuvType); + void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType); + void Yuv420pToUyvy422(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, size_t width, size_t height, uint8_t* uyvy, size_t uyvyStride); } diff --git a/src/Simd/SimdAvx2YuvToBgraV2.cpp b/src/Simd/SimdAvx2YuvToBgraV2.cpp index 2742ce56f0..2d9a0c0928 100644 --- a/src/Simd/SimdAvx2YuvToBgraV2.cpp +++ b/src/Simd/SimdAvx2YuvToBgraV2.cpp @@ -459,8 +459,94 @@ namespace Simd Yuv420pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType); else Yuv420pToBgraV2(y, yStride, u, uStride, v, vStride, width, height, bgra, bgraStride, alpha, yuvType); +#endif + } + + //------------------------------------------------------------------------------------------------- + + template SIMD_INLINE void YuvToRgba16(__m256i y16, __m256i u16, __m256i v16, const __m256i& a_0, __m256i* rgba) + { + const __m256i b16 = YuvToBlue16(y16, u16); + const __m256i g16 = YuvToGreen16(y16, u16, v16); + const __m256i r16 = YuvToRed16(y16, v16); + const __m256i rg8 = _mm256_or_si256(r16, _mm256_slli_si256(g16, 1)); + const __m256i ba8 = _mm256_or_si256(b16, a_0); + __m256i rgba0 = _mm256_unpacklo_epi16(rg8, ba8); + __m256i rgba1 = _mm256_unpackhi_epi16(rg8, ba8); + Permute2x128(rgba0, rgba1); + Store(rgba + 0, rgba0); + Store(rgba + 1, rgba1); + } + + template SIMD_INLINE void YuvToRgba(__m256i y8, __m256i u8, __m256i v8, const __m256i& a_0, __m256i* rgba) + { + YuvToRgba16(UnpackY(y8), UnpackUV(u8), UnpackUV(v8), a_0, rgba + 0); + YuvToRgba16(UnpackY(y8), UnpackUV(u8), UnpackUV(v8), a_0, rgba + 2); + } + + template SIMD_INLINE void Yuv444pToRgbaV2(const uint8_t* y, const uint8_t* u, const uint8_t* v, const __m256i& a_0, uint8_t* rgba) + { + YuvToRgba(LoadPermuted((__m256i*)y), LoadPermuted((__m256i*)u), LoadPermuted((__m256i*)v), a_0, (__m256i*)rgba); + } + + template void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha) + { + assert(width >= A); + if (align) + { + assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); + assert(Aligned(v) && Aligned(vStride) && Aligned(rgba) && Aligned(rgbaStride)); + } + + __m256i a_0 = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1); + size_t bodyWidth = AlignLo(width, A); + size_t tail = width - bodyWidth; + for (size_t row = 0; row < height; ++row) + { + for (size_t colYuv = 0, colRgba = 0; colYuv < bodyWidth; colYuv += A, colRgba += QA) + { + Yuv444pToRgbaV2(y + colYuv, u + colYuv, v + colYuv, a_0, rgba + colRgba); + } + if (tail) + { + size_t col = width - A; + Yuv444pToRgbaV2(y + col, u + col, v + col, a_0, rgba + 4 * col); + } + y += yStride; + u += uStride; + v += vStride; + rgba += rgbaStride; + } + } + + template void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType) + { + switch (yuvType) + { + case SimdYuvBt601: Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break; + case SimdYuvBt709: Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break; + case SimdYuvBt2020: Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break; + case SimdYuvTrect871: Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha); break; + default: + assert(0); + } + } + + void Yuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, const uint8_t* v, size_t vStride, + size_t width, size_t height, uint8_t* rgba, size_t rgbaStride, uint8_t alpha, SimdYuvType yuvType) + { +#if defined(SIMD_X86_ENABLE) && defined(NDEBUG) && defined(_MSC_VER) && _MSC_VER <= 1900 + Sse41::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); +#else + if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) + && Aligned(v) && Aligned(vStride) && Aligned(rgba) && Aligned(rgbaStride)) + Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); + else + Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); #endif } } -#endif// SIMD_AVX2_ENABLE +#endif } diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp index 60936d1640..208f9281c6 100644 --- a/src/Simd/SimdLib.cpp +++ b/src/Simd/SimdLib.cpp @@ -6952,11 +6952,11 @@ SIMD_API void SimdYuv444pToRgbaV2(const uint8_t* y, size_t yStride, const uint8_ // Avx512bw::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); // else //#endif -//#ifdef SIMD_AVX2_ENABLE -// if (Avx2::Enable && width >= Avx2::A) -// Avx2::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); -// else -//#endif +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable && width >= Avx2::A) + Avx2::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); + else +#endif #ifdef SIMD_SSE41_ENABLE if (Sse41::Enable && width >= Sse41::A) Sse41::Yuv444pToRgbaV2(y, yStride, u, uStride, v, vStride, width, height, rgba, rgbaStride, alpha, yuvType); diff --git a/src/Test/TestYuvToBgra.cpp b/src/Test/TestYuvToBgra.cpp index 95db7e3e00..9e58b7236e 100644 --- a/src/Test/TestYuvToBgra.cpp +++ b/src/Test/TestYuvToBgra.cpp @@ -346,11 +346,11 @@ namespace Test result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Sse41::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1); #endif -//#ifdef SIMD_AVX2_ENABLE -// if (Simd::Avx2::Enable && TestAvx2()) -// result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1); -//#endif -// +#ifdef SIMD_AVX2_ENABLE + if (Simd::Avx2::Enable && TestAvx2()) + result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx2::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1); +#endif + //#ifdef SIMD_AVX512BW_ENABLE // if (Simd::Avx512bw::Enable && TestAvx512bw()) // result = result && YuvToBgra2AutoTest(FUNC_YUV2(Simd::Avx512bw::Yuv444pToRgbaV2), FUNC_YUV2(SimdYuv444pToRgbaV2), 1, 1);