Skip to content

Commit

Permalink
*improve AVX2 optimizations of class ResizerFloatBilinear (part 4: case of small scale, channels >= 8).
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Jan 16, 2025
1 parent c8b0b06 commit 1fc4efe
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 24 deletions.
49 changes: 26 additions & 23 deletions src/Simd/SimdAvx2ResizerBilinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -960,18 +960,20 @@ namespace Simd
Store<false>(pb + dx, pb + dx + cnHF, _mm256_fmadd_ps(fx0, s0, _mm256_mul_ps(fx1, s1)));
}
}
else if (!Avx2::SlowGather)
else
{
__m256i _cn = _mm256_set1_epi32((int)cn);
for (; dx < rsF; dx += F)
for (; dx < rs;)
{
__m256i i0 = _mm256_load_si256((__m256i*)(_ix.data + dx));
__m256i i1 = _mm256_add_epi32(i0, _cn);
__m256 s0 = _mm256_i32gather_ps(ps, i0, 4);
__m256 s1 = _mm256_i32gather_ps(ps, i1, 4);
__m256 fx1 = _mm256_load_ps(_ax.data + dx);
const float* ps0 = ps + _ix[dx];
__m256 fx1 = _mm256_set1_ps(_ax[dx]);
__m256 fx0 = _mm256_sub_ps(_1, fx1);
_mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1)));
for (size_t eF = dx + cnF; dx < eF; dx += F, ps0 += F)
_mm256_storeu_ps(pb + dx, _mm256_fmadd_ps(fx0, _mm256_loadu_ps(ps0), _mm256_mul_ps(fx1, _mm256_loadu_ps(ps0 + cn))));
if (cnTF)
{
_mm256_storeu_ps(pb + dx + cnLF, _mm256_fmadd_ps(fx0, _mm256_loadu_ps(ps0 + cnLF), _mm256_mul_ps(fx1, _mm256_loadu_ps(ps0 + cn + cnLF))));
dx += cnTF;
}
}
}
for (; dx < rs; dx++)
Expand Down Expand Up @@ -1182,7 +1184,20 @@ namespace Simd
_mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, Sse41::BFloat16ToFloat32<0>(_src)), _mm_mul_ps(fx1, Sse41::BFloat16ToFloat32<1>(_src))));
}
}
if (cn >= 8)
else if (cn < 8)
{
for (; dx < rs;)
{
const uint16_t* ps0 = ps + _ix[dx];
__m128 fx1 = _mm_set1_ps(_ax[dx]);
__m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
for (size_t end = dx + cnH; dx < end; dx += Sse41::F, ps0 += Sse41::F)
_mm_storeu_ps(pb + dx, BilinearRowSumBf16(ps0, cn, fx0, fx1));
if (cnTH)
_mm_storeu_ps(pb + dx + cnLH, BilinearRowSumBf16(ps0 + cnLH, cn, fx0, fx1)), dx += cnTH;
}
}
else
{
for (; dx < rs;)
{
Expand All @@ -1195,19 +1210,7 @@ namespace Simd
_mm256_storeu_ps(pb + dx + cnLF, BilinearRowSumBf16(ps0 + cnLF, cn, fx0, fx1)), dx += cnTF;
}
}
else if (cn > 4)
{
for (; dx < rs;)
{
const uint16_t* ps0 = ps + _ix[dx];
__m128 fx1 = _mm_set1_ps(_ax[dx]);
__m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
for (size_t end = dx + cnH; dx < end; dx += Sse41::F, ps0 += Sse41::F)
_mm_storeu_ps(pb + dx, BilinearRowSumBf16(ps0, cn, fx0, fx1));
if (cnTH)
_mm_storeu_ps(pb + dx + cnLH, BilinearRowSumBf16(ps0 + cnLH, cn, fx0, fx1)), dx += cnTH;
}
}

for (; dx < rs; dx++)
{
int32_t sx = _ix[dx];
Expand Down
2 changes: 1 addition & 1 deletion src/Test/TestResize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ namespace Test

bool ResizerAutoTest(const FuncRS & f1, const FuncRS & f2)
{
//return ResizerAutoTest(SimdResizeMethodBilinear, SimdResizeChannelFloat, 6, f1, f2);
//return ResizerAutoTest(SimdResizeMethodBilinear, SimdResizeChannelFloat, 10, f1, f2);
bool result = true;

#if 1
Expand Down

0 comments on commit 1fc4efe

Please sign in to comment.