Skip to content

Commit

Permalink
*improve SSE4.1 optimizations of class ResizerFloatBilinear (part 6: …
Browse files Browse the repository at this point in the history
…case of large scale, channels = 2).
  • Loading branch information
ermig1979 committed Jan 15, 2025
1 parent 858d4af commit 218fa22
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 8 deletions.
6 changes: 2 additions & 4 deletions src/Simd/SimdBaseResizerBilinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,13 +334,11 @@ namespace Simd
ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param)
: Resizer(param)
{
_rowBuf = _param.align < 16 || _param.channels < 4 || _param.dstH >= _param.srcH;
_rowBuf = _param.align < 16 || ((_param.channels < 4 && _param.align > 16) || _param.channels == 3) || _param.dstH >= _param.srcH;
#if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)
_rowBuf = true;
#else
if ((_param.align == 16 && _param.channels < 2))
_rowBuf = false;
#endif

_ay.Resize(_param.dstH, false, _param.align);
_iy.Resize(_param.dstH, false, _param.align);
EstimateIndexAlpha(_param, _param.srcH, _param.dstH, 1, 1, _iy.data, _ay.data);
Expand Down
7 changes: 6 additions & 1 deletion src/Simd/SimdLoad.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ namespace Simd
return _mm_load_ps(p);
}

// Loads two consecutive floats (64 bits) starting at p into the low half of
// an SSE register; the high half is set to zero. The 64-bit load has no
// alignment requirement. The cast keeps the const qualifier of p, since the
// intrinsic only reads through the pointer.
SIMD_INLINE __m128 LoadHalf(const float* p)
{
    return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)p));
}

SIMD_INLINE __m128 Load(const float * p0, const float * p1)
{
return _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)p0), (__m64*)p1);
Expand Down Expand Up @@ -96,7 +101,7 @@ namespace Simd

// Loads the 64 bits at p into the low half of an integer SSE register and
// zeroes the high half. A single integer load is used instead of going through
// the float domain (_mm_loadl_pi on a zeroed register plus a cast); the
// resulting register contents are the same.
SIMD_INLINE __m128i LoadHalf(const __m128i* p)
{
    return _mm_loadl_epi64(p);
}

SIMD_INLINE __m128i Load(const __m128i* p0, const __m128i* p1)
Expand Down
31 changes: 29 additions & 2 deletions src/Simd/SimdSse41ResizerBilinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ namespace Simd
void ResizerFloatBilinear::Run(const float* src, size_t srcStride, float* dst, size_t dstStride)
{
size_t cn = _param.channels, cnF = AlignLo(cn, F), cnT = cn - cnF, cnL = cnT - F;
size_t dw = _param.dstW, dw4 = AlignLo(dw, 4);
size_t dw = _param.dstW, dw2= AlignLo(dw, 2), dw4 = AlignLo(dw, 4);
__m128 _1 = _mm_set1_ps(1.0f);
if (_rowBuf)
{
Expand Down Expand Up @@ -722,7 +722,6 @@ namespace Simd
size_t dx = 0;
for (; dx < dw4; dx += 4)
{
size_t os = _ix[dx];
__m128 fx1 = _mm_loadu_ps(_ax.data + dx);
__m128 fx0 = _mm_sub_ps(_1, fx1);
__m128 s00 = Load(src0 + _ix[dx + 0], src0 + _ix[dx + 1]);
Expand All @@ -743,6 +742,34 @@ namespace Simd
_mm_store_ss(dst + dx, _mm_add_ps(_mm_mul_ps(r0, fy0), _mm_mul_ps(r1, fy1)));
}
}
else if (cn == 2)
{
    // Two interleaved channels: each 4-float load at src + _ix[dx] fetches the
    // left neighbour (lanes 0-1) and the right neighbour (lanes 2-3) of one
    // destination pixel, both channels at once.
    size_t dx = 0, od = 0;
    // Main loop handles two destination pixels per iteration, so it is bounded
    // by dw2 (dstW rounded down to a multiple of 2), not dw4: otherwise a
    // vectorizable pair is left to the scalar tail whenever dstW % 4 >= 2.
    for (; dx < dw2; dx += 2, od += 4)
    {
        // Horizontal weights a0, a1 expanded to [a0, a0, a1, a1] so that each
        // weight covers both channels of its pixel.
        __m128 fx = LoadHalf(_ax.data + dx);
        __m128 fx1 = _mm_unpacklo_ps(fx, fx);
        __m128 fx0 = _mm_sub_ps(_1, fx1);
        // Upper source row: 0x44 gathers the left neighbours of both pixels,
        // 0xEE gathers the right neighbours.
        __m128 s00 = _mm_loadu_ps(src0 + _ix[dx + 0]);
        __m128 s01 = _mm_loadu_ps(src0 + _ix[dx + 1]);
        __m128 r0 = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(s00, s01, 0x44), fx0), _mm_mul_ps(_mm_shuffle_ps(s00, s01, 0xEE), fx1));
        // Lower source row, same layout.
        __m128 s10 = _mm_loadu_ps(src1 + _ix[dx + 0]);
        __m128 s11 = _mm_loadu_ps(src1 + _ix[dx + 1]);
        __m128 r1 = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(s10, s11, 0x44), fx0), _mm_mul_ps(_mm_shuffle_ps(s10, s11, 0xEE), fx1));
        // Vertical blend of the two rows; writes 2 pixels x 2 channels.
        _mm_storeu_ps(dst + od, _mm_add_ps(_mm_mul_ps(r0, fy0), _mm_mul_ps(r1, fy1)));
    }
    // Tail: at most one remaining destination pixel, computed with the same
    // arithmetic so the result does not depend on which loop produced it.
    for (; dx < dw; dx++, od += 2)
    {
        size_t os = _ix[dx];
        __m128 fx1 = _mm_set1_ps(_ax[dx]);
        __m128 fx0 = _mm_sub_ps(_1, fx1);
        // Lanes 0-1 hold the left neighbour; the 0xEE shuffle moves the right
        // neighbour (lanes 2-3) down so only the low two lanes are meaningful.
        __m128 s0 = _mm_loadu_ps(src0 + os);
        __m128 r0 = _mm_add_ps(_mm_mul_ps(s0, fx0), _mm_mul_ps(_mm_shuffle_ps(s0, s0, 0xEE), fx1));
        __m128 s1 = _mm_loadu_ps(src1 + os);
        __m128 r1 = _mm_add_ps(_mm_mul_ps(s1, fx0), _mm_mul_ps(_mm_shuffle_ps(s1, s1, 0xEE), fx1));
        // NOTE(review): StoreHalf<0> is presumed to store the low two floats - confirm against SimdStore.h.
        StoreHalf<0>(dst + od, _mm_add_ps(_mm_mul_ps(r0, fy0), _mm_mul_ps(r1, fy1)));
    }
}
else
{
for (size_t dx = 0; dx < dw; dx++)
Expand Down
2 changes: 1 addition & 1 deletion src/Test/TestResize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ namespace Test
bool ResizerAutoTest(const FuncRS & f1, const FuncRS & f2)
{
bool result = true;

result = result && ResizerAutoTest(SimdResizeMethodBilinear, SimdResizeChannelFloat, 2, f1, f2);
#if 1
#if defined(SIMD_X64_ENABLE)
result = result && ResizerAutoTest(SimdResizeMethodBilinear, SimdResizeChannelFloat, 64, f1, f2);
Expand Down

0 comments on commit 218fa22

Please sign in to comment.