From 3bafc892db3f6b279064a75c8f9c378214b7eb4e Mon Sep 17 00:00:00 2001
From: Yermalayeu Ihar
Date: Thu, 16 Jan 2025 18:08:52 +0300
Subject: [PATCH] *improve AVX2 optimizations of class ResizerFloatBilinear
 (part 5: case of large scale, channels = 1).

---
 src/Simd/SimdAvx2ResizerBilinear.cpp | 69 +++++++++++++++++++++++++++-
 src/Simd/SimdBaseResizerBilinear.cpp |  2 +-
 src/Test/TestResize.cpp              |  2 +-
 3 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/src/Simd/SimdAvx2ResizerBilinear.cpp b/src/Simd/SimdAvx2ResizerBilinear.cpp
index ca6bc02481..b1e6294670 100644
--- a/src/Simd/SimdAvx2ResizerBilinear.cpp
+++ b/src/Simd/SimdAvx2ResizerBilinear.cpp
@@ -809,6 +809,7 @@ namespace Simd
 
             size_t cn = _param.channels, cnH = AlignLo(cn, HF), cnTH = cn - cnH, cnLH = cnTH - HF,
                 cnF = AlignLo(cn, F), cnTF = cn - cnF, cnLF = cnTF - F;
+            size_t dw = _param.dstW, dw2 = AlignLo(dw, 2), dw4 = AlignLo(dw, 4), dw8 = AlignLo(dw, 8), dw1 = dw - 1;
             __m256 _1 = _mm256_set1_ps(1.0f);
             if (_rowBuf)
             {
@@ -1005,7 +1006,73 @@ namespace Simd
             }
             else
             {
-                Sse41::ResizerFloatBilinear::Run(src, srcStride, dst, dstStride);
+                if (cn > 1)
+                {
+                    Sse41::ResizerFloatBilinear::Run(src, srcStride, dst, dstStride);
+                    return;
+                }
+                for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
+                {
+                    __m256 fy1 = _mm256_set1_ps(_ay[dy]);
+                    __m256 fy0 = _mm256_sub_ps(_1, fy1);
+                    const float* src0 = src + _iy[dy] * srcStride, * src1 = src0 + srcStride;
+                    if (cn == 1)
+                    {
+                        size_t dx = 0;
+                        if (Avx2::SlowGather)
+                        {
+                            for (; dx < dw8; dx += 8)
+                            {
+                                __m256 fx1 = _mm256_load_ps(_ax.data + dx);
+                                __m256 fx0 = _mm256_sub_ps(_1, fx1);
+                                __m256 s00 = Load(src0 + _ix[dx + 0], src0 + _ix[dx + 1], src0 + _ix[dx + 4], src0 + _ix[dx + 5]);
+                                __m256 s01 = Load(src0 + _ix[dx + 2], src0 + _ix[dx + 3], src0 + _ix[dx + 6], src0 + _ix[dx + 7]);
+                                __m256 r0 = _mm256_fmadd_ps(_mm256_shuffle_ps(s00, s01, 0x88), fx0, _mm256_mul_ps(_mm256_shuffle_ps(s00, s01, 0xDD), fx1));
+                                __m256 s10 = Load(src1 + _ix[dx + 0], src1 + _ix[dx + 1], src1 + _ix[dx + 4], src1 + _ix[dx + 5]);
+                                __m256 s11 = Load(src1 + _ix[dx + 2], src1 + _ix[dx + 3], src1 + _ix[dx + 6], src1 + _ix[dx + 7]);
+                                __m256 r1 = _mm256_fmadd_ps(_mm256_shuffle_ps(s10, s11, 0x88), fx0, _mm256_mul_ps(_mm256_shuffle_ps(s10, s11, 0xDD), fx1));
+                                _mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(r0, fy0, _mm256_mul_ps(r1, fy1)));
+                            }
+                        }
+                        else
+                        {
+                            for (; dx < dw8; dx += 8)
+                            {
+                                __m256 fx1 = _mm256_load_ps(_ax.data + dx);
+                                __m256 fx0 = _mm256_sub_ps(_1, fx1);
+                                __m256i idx = Avx2::LoadPermuted((__m256i*)(_ix.data + dx));
+                                __m256 s00 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)src0, _mm256_extracti128_si256(idx, 0), 4));
+                                __m256 s01 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)src0, _mm256_extracti128_si256(idx, 1), 4));
+                                __m256 r0 = _mm256_fmadd_ps(_mm256_shuffle_ps(s00, s01, 0x88), fx0, _mm256_mul_ps(_mm256_shuffle_ps(s00, s01, 0xDD), fx1));
+                                __m256 s10 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)src1, _mm256_extracti128_si256(idx, 0), 4));
+                                __m256 s11 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)src1, _mm256_extracti128_si256(idx, 1), 4));
+                                __m256 r1 = _mm256_fmadd_ps(_mm256_shuffle_ps(s10, s11, 0x88), fx0, _mm256_mul_ps(_mm256_shuffle_ps(s10, s11, 0xDD), fx1));
+                                _mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(r0, fy0, _mm256_mul_ps(r1, fy1)));
+                            }
+                        }
+                        for (; dx < dw4; dx += 4)
+                        {
+                            __m128 fx1 = _mm_loadu_ps(_ax.data + dx);
+                            __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
+                            __m128 s00 = Sse41::Load(src0 + _ix[dx + 0], src0 + _ix[dx + 1]);
+                            __m128 s01 = Sse41::Load(src0 + _ix[dx + 2], src0 + _ix[dx + 3]);
+                            __m128 r0 = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(s00, s01, 0x88), fx0), _mm_mul_ps(_mm_shuffle_ps(s00, s01, 0xDD), fx1));
+                            __m128 s10 = Sse41::Load(src1 + _ix[dx + 0], src1 + _ix[dx + 1]);
+                            __m128 s11 = Sse41::Load(src1 + _ix[dx + 2], src1 + _ix[dx + 3]);
+                            __m128 r1 = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(s10, s11, 0x88), fx0), _mm_mul_ps(_mm_shuffle_ps(s10, s11, 0xDD), fx1));
+                            _mm_storeu_ps(dst + dx, _mm_add_ps(_mm_mul_ps(r0, _mm256_castps256_ps128(fy0)), _mm_mul_ps(r1, _mm256_castps256_ps128(fy1))));
+                        }
+                        for (; dx < dw; dx++)
+                        {
+                            size_t os = _ix[dx];
+                            __m128 fx1 = _mm_set1_ps(_ax[dx]);
+                            __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
+                            __m128 r0 = _mm_add_ps(_mm_mul_ps(_mm_load_ss(src0 + os), fx0), _mm_mul_ps(_mm_load_ss(src0 + os + 1), fx1));
+                            __m128 r1 = _mm_add_ps(_mm_mul_ps(_mm_load_ss(src1 + os), fx0), _mm_mul_ps(_mm_load_ss(src1 + os + 1), fx1));
+                            _mm_store_ss(dst + dx, _mm_add_ps(_mm_mul_ps(r0, _mm256_castps256_ps128(fy0)), _mm_mul_ps(r1, _mm256_castps256_ps128(fy1))));
+                        }
+                    }
+                }
             }
         }
     }
diff --git a/src/Simd/SimdBaseResizerBilinear.cpp b/src/Simd/SimdBaseResizerBilinear.cpp
index c9e80675bb..61f5941af5 100644
--- a/src/Simd/SimdBaseResizerBilinear.cpp
+++ b/src/Simd/SimdBaseResizerBilinear.cpp
@@ -334,7 +334,7 @@ namespace Simd
         ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param)
             : Resizer(param)
         {
-            _rowBuf = _param.align < 16 || (_param.channels < 4 && _param.align > 16) || _param.dstH >= _param.srcH;
+            _rowBuf = _param.align < 16 || (_param.channels < 4 && (_param.align > 32 || _param.channels < 2)) || _param.dstH >= _param.srcH;
 #if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)
             _rowBuf = true;
 #endif
diff --git a/src/Test/TestResize.cpp b/src/Test/TestResize.cpp
index c9c0eac19a..85de04af9e 100644
--- a/src/Test/TestResize.cpp
+++ b/src/Test/TestResize.cpp
@@ -244,7 +244,7 @@ namespace Test
 
     bool ResizerAutoTest(const FuncRS & f1, const FuncRS & f2)
     {
-        //return ResizerAutoTest(SimdResizeMethodBilinear, SimdResizeChannelFloat, 10, f1, f2);
+        //return ResizerAutoTest(SimdResizeMethodBilinear, SimdResizeChannelFloat, 1, f1, f2);
         bool result = true;
 #if 1
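
Note on the AVX2 inner loops above: with channels = 1 the two horizontal taps of an output pixel, src[ix] and src[ix + 1], are adjacent floats, so the fast path fetches four such pairs at once as 64-bit elements with _mm256_i32gather_pd (scale = 4, since _ix stores offsets in floats, not bytes), and the 0x88/0xDD shuffles then split the pairs into "left tap" and "right tap" vectors. Avx2::LoadPermuted pre-swaps the middle 64-bit quadrants of the index vector so the shuffled results come out in pixel order; on CPUs where hardware gathers are microcoded (the Avx2::SlowGather branch) the same pair layout is assembled from four two-float loads per register instead, letting both branches share the shuffle and weighting code.

The following standalone sketch (not Simd library code) demonstrates the pair-gather trick and checks it against the scalar formula dst[x] = src[ix] * (1 - ax) + src[ix + 1] * ax. The index and weight values are made up for illustration, and _mm256_permute4x64_epi64 with 0xD8 is assumed to be equivalent to Avx2::LoadPermuted; compile with -mavx2 -mfma (or /arch:AVX2).

#include <immintrin.h>
#include <cstdio>

int main()
{
    // Source row and 8 output pixels, each defined by a left-tap offset and a right-tap weight.
    alignas(32) float src[32];
    for (int i = 0; i < 32; ++i)
        src[i] = float((i * 7) % 13);

    alignas(32) int ix[8] = { 0, 3, 7, 10, 14, 17, 21, 24 };                       // hypothetical offsets
    alignas(32) float ax[8] = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f };  // hypothetical weights

    // Pre-permute the indices (the role of Avx2::LoadPermuted): swap the middle 64-bit
    // quadrants so the 0x88/0xDD shuffles below produce results in pixel order.
    __m256i idx = _mm256_permute4x64_epi64(_mm256_load_si256((const __m256i*)ix), 0xD8);

    // Each gathered "double" is really the adjacent float pair (src[ix], src[ix + 1]);
    // scale = 4 because the indices count floats, not bytes.
    __m256 s0 = _mm256_castpd_ps(_mm256_i32gather_pd((const double*)src, _mm256_extracti128_si256(idx, 0), 4));
    __m256 s1 = _mm256_castpd_ps(_mm256_i32gather_pd((const double*)src, _mm256_extracti128_si256(idx, 1), 4));

    __m256 left = _mm256_shuffle_ps(s0, s1, 0x88);   // even lanes: left taps
    __m256 right = _mm256_shuffle_ps(s0, s1, 0xDD);  // odd lanes: right taps

    __m256 fx1 = _mm256_load_ps(ax);
    __m256 fx0 = _mm256_sub_ps(_mm256_set1_ps(1.0f), fx1);
    alignas(32) float row[8];
    _mm256_store_ps(row, _mm256_fmadd_ps(left, fx0, _mm256_mul_ps(right, fx1)));

    // Verify against the scalar bilinear formula for one row.
    for (int x = 0; x < 8; ++x)
    {
        float ref = src[ix[x]] * (1.0f - ax[x]) + src[ix[x] + 1] * ax[x];
        printf("x=%d simd=%g ref=%g\n", x, row[x], ref);
    }
    return 0;
}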