Skip to content

Commit

Permalink
+add SSE4.1 optimizations of class SynetConvolution16bNhwcDepthwise.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Oct 9, 2024
1 parent b82f4f3 commit 3a3a5ae
Show file tree
Hide file tree
Showing 14 changed files with 322 additions and 19 deletions.
2 changes: 1 addition & 1 deletion docs/2024.html
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ <h3 id="R143">November X, 2024 (version X.X.143)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Base implementation of class SynetConvolution16bNhwcDepthwise.</li>
<li>Base implementation, SSE4.1 optimizations of class SynetConvolution16bNhwcDepthwise.</li>
</ul>
<h5>Improving</h5>
Expand Down
1 change: 1 addition & 0 deletions prj/vs2019/Sse41.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConversion.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16b.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNchwGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcDeptwise.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcDirect.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution32f.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions prj/vs2019/Sse41.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,9 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution32fNhwcDirect.cpp">
<Filter>Sse41</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcDeptwise.cpp">
<Filter>Sse41</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Sse41">
Expand Down
1 change: 1 addition & 0 deletions prj/vs2022/Sse41.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConversion.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16b.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNchwGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcDeptwise.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcDirect.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcGemm.cpp" />
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution32f.cpp" />
Expand Down
3 changes: 3 additions & 0 deletions prj/vs2022/Sse41.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,9 @@
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution32fNhwcDirect.cpp">
<Filter>Sse41</Filter>
</ClCompile>
<ClCompile Include="..\..\src\Simd\SimdSse41SynetConvolution16bNhwcDeptwise.cpp">
<Filter>Sse41</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Sse41">
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdAmxBf16SynetConvolution16b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace Simd
if (SynetConvolution16bNchwGemm::Preferable(param))
return new AmxBf16::SynetConvolution16bNchwGemm(param);
if (Base::SynetConvolution16bNhwcDepthwise::Preferable(param))
return new Base::SynetConvolution16bNhwcDepthwise(param);
return new Sse41::SynetConvolution16bNhwcDepthwise(param);
return new Base::SynetConvolution16bGemm(param);
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdAvx2SynetConvolution16b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace Simd
if (Base::SynetConvolution16bNchwGemm::Preferable(param))
return new Avx2::SynetConvolution16bNchwGemm(param);
if (Base::SynetConvolution16bNhwcDepthwise::Preferable(param))
return new Base::SynetConvolution16bNhwcDepthwise(param);
return new Sse41::SynetConvolution16bNhwcDepthwise(param);
return new Base::SynetConvolution16bGemm(param);
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdAvx512bwSynetConvolution16b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace Simd
if (Base::SynetConvolution16bNchwGemm::Preferable(param))
return new Avx512bw::SynetConvolution16bNchwGemm(param);
if (Base::SynetConvolution16bNhwcDepthwise::Preferable(param))
return new Base::SynetConvolution16bNhwcDepthwise(param);
return new Sse41::SynetConvolution16bNhwcDepthwise(param);
return new Base::SynetConvolution16bGemm(param);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/Simd/SimdSse41SynetConvolution16b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ namespace Simd
return new Sse41::SynetConvolution16bNhwcGemm(param);
if (SynetConvolution16bNchwGemm::Preferable(param))
return new Sse41::SynetConvolution16bNchwGemm(param);
if (Base::SynetConvolution16bNhwcDepthwise::Preferable(param))
return new Base::SynetConvolution16bNhwcDepthwise(param);
if (SynetConvolution16bNhwcDepthwise::Preferable(param))
return new SynetConvolution16bNhwcDepthwise(param);
return new Base::SynetConvolution16bGemm(param);
}
}
Expand Down
283 changes: 283 additions & 0 deletions src/Simd/SimdSse41SynetConvolution16bNhwcDeptwise.cpp

Large diffs are not rendered by default.

14 changes: 1 addition & 13 deletions src/Simd/SimdSse41SynetMergedConvolution16bDepthwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,7 @@ namespace Simd
using AlgParam = Base::SynetMergedConvolution16b::AlgParam;
using DepthwisePtr = Base::SynetMergedConvolution16b::DepthwiseConvolutionPtr;

// Loads source elements of type T into a single 128-bit float register.
// Only the float and uint16_t specializations that follow are defined.
template <class T> SIMD_INLINE __m128 LoadSrc(const T* src);

// float source: unaligned load of four 32-bit floats, passed through unchanged.
template <> SIMD_INLINE __m128 LoadSrc<float>(const float* src)
{
    const __m128 values = _mm_loadu_ps(src);
    return values;
}

// uint16_t source: reads 64 bits (four 16-bit values), combines them with K_ZERO
// through UnpackU16<0> and reinterprets the resulting bits as floats (no numeric conversion).
// NOTE(review): presumably each 16-bit value ends up in the upper half of a 32-bit lane
// (bf16 -> fp32 widening) - confirm against UnpackU16 and K_ZERO.
template <> SIMD_INLINE __m128 LoadSrc<uint16_t>(const uint16_t* src)
{
    const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));
    const __m128i widened = UnpackU16<0>(K_ZERO, packed);
    return _mm_castsi128_ps(widened);
}

//---------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------

template<typename T, Term16bType term, SimdConvolutionActivationType type> void DepthwiseConvolution(const uint8_t* src8, const ConvParam& p, const AlgParam& a,
size_t maC, size_t yBeg, size_t yEnd, const float* weight, const float* bias, const float* params, uint8_t* dst)
Expand Down
8 changes: 8 additions & 0 deletions src/Simd/SimdSynetConvolution16b.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,14 @@ namespace Simd
virtual String Ext() const { return "Sse41"; }
};

// SSE4.1 counterpart of Base::SynetConvolution16bNhwcDepthwise.
// Only the constructor and the extension tag are declared here; everything else
// is inherited from the base class.
class SynetConvolution16bNhwcDepthwise : public Base::SynetConvolution16bNhwcDepthwise
{
public:
// Constructs the convolution from the parameter set p (defined outside this header).
SynetConvolution16bNhwcDepthwise(const ConvParam& p);

// Returns the tag "Sse41" identifying this implementation.
virtual String Ext() const { return "Sse41"; }
};

class SynetConvolution16bNchwGemm : public Base::SynetConvolution16bNchwGemm
{
public:
Expand Down
14 changes: 14 additions & 0 deletions src/Simd/SimdSynetConvolution16bCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,20 @@ namespace Simd
#ifdef SIMD_SSE41_ENABLE
namespace Sse41
{
// Loads source elements of type T into a single 128-bit float register.
// Only the float and uint16_t specializations that follow are defined.
template <class T> SIMD_INLINE __m128 LoadSrc(const T* src);

// float source: unaligned load of four 32-bit floats, passed through unchanged.
template <> SIMD_INLINE __m128 LoadSrc<float>(const float* src)
{
    const __m128 values = _mm_loadu_ps(src);
    return values;
}

// uint16_t source: reads 64 bits (four 16-bit values), combines them with K_ZERO
// through UnpackU16<0> and reinterprets the resulting bits as floats (no numeric conversion).
// NOTE(review): presumably each 16-bit value ends up in the upper half of a 32-bit lane
// (bf16 -> fp32 widening) - confirm against UnpackU16 and K_ZERO.
template <> SIMD_INLINE __m128 LoadSrc<uint16_t>(const uint16_t* src)
{
    const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));
    const __m128i widened = UnpackU16<0>(K_ZERO, packed);
    return _mm_castsi128_ps(widened);
}

//-------------------------------------------------------------------------------------------------

template <Term16bType term> struct Term16b
{
template<SimdConvolutionActivationType type, int index> static SIMD_INLINE void Save(uint16_t* ptr, __m128 value, const __m128* bias, const __m128* params);
Expand Down
2 changes: 2 additions & 0 deletions src/Test/TestSynetConvolution16b.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,11 +283,13 @@ namespace Test
#endif
#if 1
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 76, 64, 64, 76, _7, _1, _1, _3, _3, 76, aId, tT, b16, f32), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 76, 64, 64, 76, _3, _1, _1, _1, _1, 76, aId, tT, f32, f32), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 152, 32, 32, 152, _7, _1, _1, _3, _3, 152, aPr, tT, b16, f32), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 304, 16, 16, 304, _7, _1, _1, _3, _3, 304, aMi, tT, f32, f32), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 304, 16, 16, 304, _3, _1, _1, _1, _1, 304, aRe, tT, f32, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 304, 17, 15, 304, _7, _1, _1, _3, _3, 304, aGe, tT, b16, f32), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 608, 8, 8, 608, _7, _1, _1, _3, _3, 608, aSw, tT, b16, b16), c, f1, f2);
result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 152, 32, 32, 152, _5, _1, _2, _2, _2, 152, aPr, tT, b16, b16), c, f1, f2);
#endif

#else
Expand Down

0 comments on commit 3a3a5ae

Please sign in to comment.