Commit

+add AVX-512BW optimizations of function SynetNormalizeLayerForwardV4.
ermig1979 committed Dec 7, 2023
1 parent 7066a01 commit 5a7a45e
Showing 6 changed files with 104 additions and 8 deletions.
2 changes: 1 addition & 1 deletion docs/2024.html
@@ -41,7 +41,7 @@ <h3 id="R133">January X, 2024 (version 5.4.133)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
-        <li>Base implementation, SSE4.1, AVX2 optimizations of function SynetNormalizeLayerForwardV4.</li>
+        <li>Base implementation, SSE4.1, AVX2, AVX-512BW optimizations of function SynetNormalizeLayerForwardV4.</li>
</ul>

<h4>Test framework</h4>
2 changes: 0 additions & 2 deletions src/Simd/SimdAvx2SynetNormalize.cpp
@@ -517,9 +517,7 @@ namespace Simd
                {
                    __m256 _sqsum = _mm256_setzero_ps();
                    for (s = 0; s < spatialF; s += F, o += F)
-                   {
                        _sqsum = _mm256_add_ps(Square(_mm256_loadu_ps(src + o)), _sqsum);
-                   }
                    float sqsum = Avx::ExtractSum(_sqsum);
                    for (; s < spatial; ++s, ++o)
                        sqsum += Simd::Square(src[o]);
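Avx::ExtractSum above collapses the eight-lane __m256 accumulator into a scalar before the remainder loop finishes the sum. A minimal sketch of such a horizontal reduction is shown below; this is the common idiom, not necessarily how the library implements ExtractSum:

#include <immintrin.h>

// Horizontal sum of a __m256: add the two 128-bit halves, then fold twice.
static inline float HorizontalSum256(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);    // lanes 0..3
    __m128 hi = _mm256_extractf128_ps(v, 1);  // lanes 4..7
    __m128 sum = _mm_add_ps(lo, hi);          // four partial sums
    sum = _mm_hadd_ps(sum, sum);              // two partial sums
    sum = _mm_hadd_ps(sum, sum);              // total in lane 0
    return _mm_cvtss_f32(sum);
}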
3 changes: 3 additions & 0 deletions src/Simd/SimdAvx512bw.h
@@ -568,6 +568,9 @@ namespace Simd
        void SynetNormalizeLayerForwardV3(const float* src, size_t batch, size_t channels, size_t spatial,
            const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst);

+       void SynetNormalizeLayerForwardV4(const float* src, size_t batch, size_t channels, size_t spatial,
+           const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst);

        void SynetPoolingAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
            size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format);

95 changes: 95 additions & 0 deletions src/Simd/SimdAvx512bwSynetNormalize.cpp
@@ -530,6 +530,101 @@ namespace Simd
            else
                assert(0);
        }

        //-------------------------------------------------------------------------------------------------

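        // NormalizeNchwV4: for each batch item, buf[c] receives the L2 norm of
        // channel c over its spatial extent; each channel is then rescaled by
        // alpha = 1 + scale[c] * buf[c] / (mean(buf) + eps) and offset by shift[c],
        // with masked loads and stores covering the spatial tail.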
        void NormalizeNchwV4(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
        {
            float k = 1.0f / float(channels);
            size_t spatialF = AlignLo(spatial, F), s;
            __mmask16 spatialM = TailMask16(spatial - spatialF);
            for (size_t b = 0; b < batch; ++b)
            {
                float sum = 0;
                for (size_t c = 0; c < channels; ++c)
                {
                    __m512 _sqsum = _mm512_setzero_ps();
                    const float* ps = src + c * spatial;
                    for (s = 0; s < spatialF; s += F)
                        _sqsum = _mm512_add_ps(Square(_mm512_loadu_ps(ps + s)), _sqsum);
                    if (s < spatial)
                        _sqsum = _mm512_add_ps(Square(_mm512_maskz_loadu_ps(spatialM, ps + s)), _sqsum);
                    float sqsum = Avx512bw::ExtractSum(_sqsum);
                    buf[c] = sqrt(sqsum);
                    sum += buf[c];
                }
                float norm = 1.0f / (sum * k + eps);
                for (size_t c = 0; c < channels; ++c)
                {
                    __m512 _alpha = _mm512_set1_ps(1.0f + scale[c] * buf[c] * norm);
                    __m512 _shift = _mm512_set1_ps(shift[c]);
                    for (s = 0; s < spatialF; s += F)
                        _mm512_storeu_ps(dst + s, _mm512_add_ps(_mm512_mul_ps(_mm512_loadu_ps(src + s), _alpha), _shift));
                    if (s < spatial)
                        _mm512_mask_storeu_ps(dst + s, spatialM, _mm512_add_ps(_mm512_mul_ps(_mm512_maskz_loadu_ps(spatialM, src + s), _alpha), _shift));
                    dst += spatial;
                    src += spatial;
                }
            }
        }

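        // NormalizeNhwcV4: the same normalization for NHWC layout; squared sums are
        // accumulated per channel in buf[] across all spatial positions, after which
        // buf[c] is reused to hold the final multiplier 1 + scale[c] * buf[c] * norm.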
        void NormalizeNhwcV4(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
        {
            float k = 1.0f / float(channels);
            size_t channelsF = AlignLo(channels, F), c;
            __mmask16 channelsM = TailMask16(channels - channelsF);
            __m512 _eps = _mm512_set1_ps(eps), _k = _mm512_set1_ps(k), _1 = _mm512_set1_ps(1.0f);
            for (size_t b = 0; b < batch; ++b)
            {
                for (c = 0; c < channelsF; c += F)
                    _mm512_storeu_ps(buf + c, _mm512_setzero_ps());
                if (c < channels)
                    _mm512_mask_storeu_ps(buf + c, channelsM, _mm512_setzero_ps());
                for (size_t s = 0; s < spatial; ++s)
                {
                    const float* ps = src + s * channels;
                    for (c = 0; c < channelsF; c += F)
                        _mm512_storeu_ps(buf + c, _mm512_add_ps(Square(_mm512_loadu_ps(ps + c)), _mm512_loadu_ps(buf + c)));
                    if (c < channels)
                        _mm512_mask_storeu_ps(buf + c, channelsM, _mm512_add_ps(Square(_mm512_maskz_loadu_ps(channelsM, ps + c)), _mm512_maskz_loadu_ps(channelsM, buf + c)));
                }
                float sum = 0;
                for (size_t c = 0; c < channels; ++c)
                {
                    buf[c] = sqrt(buf[c]);
                    sum += buf[c];
                }
                float norm = 1.0f / (sum * k + eps);
                for (size_t c = 0; c < channels; ++c)
                    buf[c] = 1.0f + scale[c] * buf[c] * norm;
                for (size_t s = 0, o = 0; s < spatial; ++s)
                {
                    for (c = 0; c < channelsF; c += F)
                        _mm512_storeu_ps(dst + c, _mm512_add_ps(_mm512_mul_ps(_mm512_loadu_ps(src + c), _mm512_loadu_ps(buf + c)), _mm512_loadu_ps(shift + c)));
                    if (c < channels)
                        _mm512_mask_storeu_ps(dst + c, channelsM, _mm512_add_ps(_mm512_mul_ps(_mm512_maskz_loadu_ps(channelsM, src + c), _mm512_maskz_loadu_ps(channelsM, buf + c)), _mm512_maskz_loadu_ps(channelsM, shift + c)));
                    src += channels;
                    dst += channels;
                }
            }
        }

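        // Dispatcher: selects the kernel for the tensor format; if the caller passes
        // buf == NULL, a temporary per-channel buffer is allocated internally.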
        void SynetNormalizeLayerForwardV4(const float* src, size_t batch, size_t channels, size_t spatial,
            const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst)
        {
            Array32f _buf;
            if (buf == NULL)
            {
                _buf.Resize(channels);
                buf = _buf.data;
            }
            if (format == SimdTensorFormatNchw)
                NormalizeNchwV4(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
            else if (format == SimdTensorFormatNhwc)
                NormalizeNhwcV4(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
            else
                assert(0);
        }
    }
#endif
}
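For reference, here is a minimal scalar sketch of the same V4 algorithm in NCHW layout, mirroring the vector code above (the function name is illustrative, not part of the library). In the vector code, TailMask16(spatial - spatialF) yields a __mmask16 with the low bits set for the leftover elements, so the masked loads and stores cover exactly the positions that this sketch's plain loops visit one by one:

#include <cmath>
#include <cstddef>

// Scalar reference for NormalizeNchwV4 (illustration only; buf must hold
// channels floats, as in the vector version).
static void NormalizeNchwV4Ref(const float* src, size_t batch, size_t channels, size_t spatial,
    const float* scale, const float* shift, float eps, float* buf, float* dst)
{
    for (size_t b = 0; b < batch; ++b)
    {
        float sum = 0;
        for (size_t c = 0; c < channels; ++c)
        {
            float sqsum = 0;
            for (size_t s = 0; s < spatial; ++s)
                sqsum += src[c * spatial + s] * src[c * spatial + s];
            buf[c] = sqrtf(sqsum);                         // L2 norm of channel c
            sum += buf[c];
        }
        float norm = 1.0f / (sum / float(channels) + eps); // 1 / (mean norm + eps)
        for (size_t c = 0; c < channels; ++c)
        {
            float alpha = 1.0f + scale[c] * buf[c] * norm;
            for (size_t s = 0; s < spatial; ++s)
                dst[s] = src[s] * alpha + shift[c];
            src += spatial;
            dst += spatial;
        }
    }
}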
2 changes: 1 addition & 1 deletion src/Simd/SimdLib.cpp
@@ -6740,7 +6740,7 @@ SIMD_API void SimdSynetNormalizeLayerForwardV4(const float* src, size_t batch, s
#if defined(SIMD_SYNET_ENABLE)
    typedef void(*SimdSynetNormalizeLayerForwardV4Ptr) (const float* src, size_t batch, size_t channels, size_t spatial,
        const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst);
-   const static SimdSynetNormalizeLayerForwardV4Ptr simdSynetNormalizeLayerForwardV4 = SIMD_FUNC2(SynetNormalizeLayerForwardV4, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);// , SIMD_AVX512BW_FUNC, SIMD_NEON_FUNC);
+   const static SimdSynetNormalizeLayerForwardV4Ptr simdSynetNormalizeLayerForwardV4 = SIMD_FUNC3(SynetNormalizeLayerForwardV4, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);// , SIMD_NEON_FUNC);

    simdSynetNormalizeLayerForwardV4(src, batch, channels, spatial, scale, shift, eps, format, buf, dst);
#else
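The switch from SIMD_FUNC2 to SIMD_FUNC3 puts the new AVX-512BW kernel first in the runtime dispatch chain. A sketch of what the selected pointer amounts to is below; the exact macro expansion is an assumption, but the priority order follows from the argument list:

// Hypothetical shape of the SIMD_FUNC3 selection (checked left to right
// at initialization; the first extension enabled on the host CPU wins):
//   Simd::Avx512bw::Enable ? Simd::Avx512bw::SynetNormalizeLayerForwardV4 :
//   Simd::Avx2::Enable     ? Simd::Avx2::SynetNormalizeLayerForwardV4 :
//   Simd::Sse41::Enable    ? Simd::Sse41::SynetNormalizeLayerForwardV4 :
//                            Simd::Base::SynetNormalizeLayerForwardV4;

A call through the public C API then looks as follows; eps is passed by pointer, and buf may be NULL, in which case the implementation allocates its per-channel scratch buffer internally (tensor sizes here are illustrative):

#include "Simd/SimdLib.h"

void NormalizeExample(const float* src, const float* scale, const float* shift, float* dst)
{
    float eps = 1e-6f;
    SimdSynetNormalizeLayerForwardV4(src, 1, 64, 56 * 56,
        scale, shift, &eps, SimdTensorFormatNchw, NULL, dst);
}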
8 changes: 4 additions & 4 deletions src/Test/TestSynetNormalize.cpp
@@ -280,10 +280,10 @@ namespace Test
            result = result && SynetNormalizeLayerForwardV2AutoTest(FUNC_SNLF2(Simd::Avx2::SynetNormalizeLayerForwardV4), FUNC_SNLF2(SimdSynetNormalizeLayerForwardV4));
#endif

-//#ifdef SIMD_AVX512BW_ENABLE
-//        if (Simd::Avx512bw::Enable)
-//            result = result && SynetNormalizeLayerForwardV2AutoTest(FUNC_SNLF2(Simd::Avx512bw::SynetNormalizeLayerForwardV4), FUNC_SNLF2(SimdSynetNormalizeLayerForwardV4));
-//#endif
+#ifdef SIMD_AVX512BW_ENABLE
+        if (Simd::Avx512bw::Enable)
+            result = result && SynetNormalizeLayerForwardV2AutoTest(FUNC_SNLF2(Simd::Avx512bw::SynetNormalizeLayerForwardV4), FUNC_SNLF2(SimdSynetNormalizeLayerForwardV4));
+#endif

        return result;
    }
