Skip to content

Commit

Permalink
*RELEASE 6.1.137.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed May 2, 2024
1 parent 2a134f4 commit 6b090e8
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 28 deletions.
2 changes: 1 addition & 1 deletion docs/2024.html
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ <h1>Simd Library Release Notes (2024).</h1>

<a href="#HOME">Home</a>
<hr/>
<h3 id="R137">May X, 2024 (version X.X.137)</h3>
<h3 id="R137">May 2, 2024 (version 6.1.137)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
Expand Down
3 changes: 3 additions & 0 deletions docs/download.html
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ <h1>Simd Library Download.</h1>
<h3>2024</h3>
<table width=1012 border="1" style='border-collapse:collapse'>
<tr align="center" style="background-color:#e0e0e0; font-weight:bold"> <td>Release Notes</td> <td>Download Link</td> <td>Size</td> </tr>
<tr> <td align="center"> <a href="2024.html#R137">May 2, 2024</a> </td>
<td align="center"> <a href="https://github.com/ermig1979/Simd/archive/refs/tags/v6.1.137.zip">Simd-6.1.137.zip</a> </td>
<td align="center">5.8 MB</td> </tr>
<tr> <td align="center"> <a href="2024.html#R136">April 2, 2024</a> </td>
<td align="center"> <a href="https://github.com/ermig1979/Simd/archive/refs/tags/v6.1.136.zip">Simd-6.1.136.zip</a> </td>
<td align="center">5.7 MB</td> </tr>
Expand Down
2 changes: 1 addition & 1 deletion prj/txt/UserVersion.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
6.1.136
6.1.137
112 changes: 86 additions & 26 deletions src/Simd/SimdAmxBf16SynetConvolution16bNhwcGemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ namespace Simd
typedef Base::SynetConvolution16bNhwcGemm::AlgParam AlgParam;
typedef Base::SynetConvolution16bNhwcGemm::ConvolutionPtr Convolution;

//-----------------------------------------------------------------------------------------
#define SIMD_CONV_REORDER_TYPE 1

#define SIMD_CONV_REORDER_TYPE 1
//-----------------------------------------------------------------------------------------

static void Convert16bNhwcGemm(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
{
static void Convert16bNhwcGemmD(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
{
const float* src = (float*)src8;
size_t srcC32 = AlignLo(p.srcC, 32);
__mmask16 srcMask[2];
Expand All @@ -59,7 +59,6 @@ namespace Simd
for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
{
uint16_t* row = dst + dr * a.bufK;

for (size_t ky = 0, k = 0; ky < p.kernelY; ky++)
{
size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
Expand Down Expand Up @@ -96,15 +95,78 @@ namespace Simd
}
}

static void Convert16bNhwcGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
// Fills the GEMM input buffer for output rows [yBeg, yEnd) of batch item b.
// For every output pixel one buffer row of a.bufK 16-bit elements is written:
// each kernel tap contributes p.srcC elements, produced by ConvertA from the
// float source when the tap lies inside the image and by SetZero otherwise.
// Only valid when p.srcC is a multiple of 32 (asserted below), so no masked
// tail handling is required.
// NOTE(review): ConvertA presumably converts 32 floats to 32 16-bit values and
// SetZero presumably clears 32 16-bit values - both are defined elsewhere, confirm.
static void Convert16bNhwcGemmR(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
{
    const float* src = reinterpret_cast<const float*>(src8);
    size_t srcC32 = AlignLo(p.srcC, 32);
    assert(p.srcC == srcC32);
    // Index of the first destination row: the yBeg offset is applied only when a.macroK < a.bufK.
    size_t dr = (a.macroK < a.bufK ? yBeg * p.dstW : 0) + b * p.dstH * p.dstW;
    for (size_t dy = yBeg; dy < yEnd; ++dy)
    {
        for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
        {
            uint16_t* row = dst + dr * a.bufK;
            for (size_t ky = 0; ky < p.kernelY; ky++)
            {
                // Unsigned arithmetic: a position above the image wraps to a huge value and fails the range test.
                size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
                if (sy >= p.srcH)
                {
                    // The whole kernel row is outside the image: zero kernelX * srcC elements at once.
                    size_t n = p.kernelX * p.srcC;
                    for (size_t sc = 0; sc < n; sc += 32)
                        SetZero(row + sc);
                    row += n;
                    continue;
                }
                for (size_t kx = 0; kx < p.kernelX; kx++)
                {
                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
                    if (sx < p.srcW)
                    {
                        const float* ps = src + (sy * p.srcW + sx) * p.srcC;
                        for (size_t sc = 0; sc < srcC32; sc += 32)
                            ConvertA(ps + sc, row + sc);
                    }
                    else
                    {
                        for (size_t sc = 0; sc < srcC32; sc += 32)
                            SetZero(row + sc);
                    }
                    // Every tap, converted or zeroed, occupies srcC elements of the row.
                    row += p.srcC;
                }
            }
        }
    }
}

// Converts the float source pixels of rows [yBeg, yEnd) of batch item b
// directly, pixel by pixel: each pixel's p.srcC channels go to one destination
// row, and destination rows are a.bufK 16-bit elements apart.
// Channels are processed in groups of 32 by ConvertA; when p.srcC is not a
// multiple of 32 one extra masked ConvertA call handles the remainder.
// NOTE(review): ConvertA and TailMask16 are defined elsewhere - the masks
// presumably select the remaining channels of the two 16-lane halves, confirm.
static void Convert16bNhwcGemm1x1D(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
{
    size_t srcC32 = AlignLo(p.srcC, 32);
    size_t n = (yEnd - yBeg) * p.dstW;
    __mmask16 srcMask0 = TailMask16(p.srcC - srcC32 - F * 0);
    __mmask16 srcMask1 = TailMask16(p.srcC - srcC32 - F * 1);
    // Start of the first source row handled by this call.
    const float* src = reinterpret_cast<const float*>(src8) + yBeg * p.srcW * p.srcC;
    // First destination row: the yBeg offset is applied only when a.macroK < a.bufK.
    uint16_t* out = dst + ((a.macroK < a.bufK ? yBeg * p.dstW : 0) + b * p.dstH * p.dstW) * a.bufK;
    for (size_t i = 0; i < n; ++i, src += p.srcC, out += a.bufK)
    {
        size_t sc = 0;
        while (sc < srcC32)
        {
            ConvertA(src + sc, out + sc);
            sc += 32;
        }
        if (srcC32 < p.srcC)
            ConvertA(src + sc, out + sc, srcMask0, srcMask1);
    }
}

static void Convert16bNhwcGemm1x1R(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
{
const float* src = (float*)src8;
size_t srcC32 = AlignLo(p.srcC, 32), n = (yEnd - yBeg) * p.dstW;
__mmask16 srcMask0 = TailMask16(p.srcC - srcC32 - F * 0);
__mmask16 srcMask1 = TailMask16(p.srcC - srcC32 - F * 1);
src += yBeg * p.srcW * p.srcC;
dst += ((a.macroK < a.bufK ? yBeg * p.dstW : 0) + b * p.dstH * p.dstW) * a.bufK;
#if SIMD_CONV_REORDER_TYPE
for (size_t i = 0; i < n; i += 16)
{
size_t m = Min(i + 16, n) - i;
Expand All @@ -114,32 +176,16 @@ namespace Simd
size_t j = 0;
for(; j < m; ++j)
ConvertA(src + sc + j * p.srcC, dst + j * 32 + sc * 16);
for (; j < 16; ++j)
SetZero(dst + j * 32 + sc * 16);
}
if (srcC32 < p.srcC)
{
size_t j = 0;
for (; j < m; ++j)
ConvertA(src + sc + j * p.srcC, dst + j * 32 + sc * 16, srcMask0, srcMask1);
for (; j < 16; ++j)
SetZero(dst + j * 32 + sc * 16);
}
src += p.srcC * 16;
dst += a.bufK * 16;
}
#else
for (size_t i = 0; i < n; ++i)
{
size_t sc = 0;
for (; sc < srcC32; sc += 32)
ConvertA(src + sc, dst + sc);
if (srcC32 < p.srcC)
ConvertA(src + sc, dst + sc, srcMask0, srcMask1);
src += p.srcC;
dst += a.bufK;
}
#endif
}

static void Reorder16bNhwcGemm(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
Expand Down Expand Up @@ -479,12 +525,26 @@ namespace Simd
{
if (_is1x1)
{
_convert = Convert16bNhwcGemm1x1;
a.reorderType = SIMD_CONV_REORDER_TYPE;
#if SIMD_CONV_REORDER_TYPE
_convert = Convert16bNhwcGemm1x1R;
a.reorderType = 1;
#else
_convert = Convert16bNhwcGemm1x1D;
a.reorderType = 0;
#endif
}
else
{
_convert = Convert16bNhwcGemm;
if (p.srcC == AlignLo(p.srcC, 32))
{
_convert = Convert16bNhwcGemmR;
a.reorderType = 0;
}
else
{
_convert = Convert16bNhwcGemmD;
a.reorderType = 0;
}
}
}
switch (p.activation)
Expand Down
14 changes: 14 additions & 0 deletions src/Simd/SimdSynetConvolution16bCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,20 @@ namespace Simd
_mm512_mask_storeu_epi16(dst, dstMask, _mm512_setzero_si512());
}

// Copies one 512-bit vector (up to 32 16-bit elements) from src to dst.
// Lanes cleared in srcMask are not read and are treated as zero; only lanes
// set in dstMask are written. Both masks default to all lanes enabled.
SIMD_INLINE void Copy(const uint16_t* src, uint16_t* dst, __mmask32 srcMask = __mmask32(-1), __mmask32 dstMask = __mmask32(-1))
{
    __m512i value = _mm512_maskz_loadu_epi16(srcMask, src);
    _mm512_mask_storeu_epi16(dst, dstMask, value);
}

// Copies a run of 16-bit elements from src to dst.
// Full 32-element vectors are copied while the offset is below size32, so a
// size32 that is not a multiple of 32 makes the last step run past size32.
// If tail is non-zero, one more vector is copied at the next offset using
// tail as both the load mask and the store mask.
SIMD_INLINE void Copy(const uint16_t* src, uint16_t* dst, size_t size32, __mmask32 tail = __mmask32(0))
{
    size_t offset = 0;
    while (offset < size32)
    {
        __m512i block = _mm512_loadu_epi16(src + offset);
        _mm512_storeu_epi16(dst + offset, block);
        offset += 32;
    }
    if (tail)
    {
        __m512i rest = _mm512_maskz_loadu_epi16(tail, src + offset);
        _mm512_mask_storeu_epi16(dst + offset, tail, rest);
    }
}

//-------------------------------------------------------------------------------------------------

template <Term16bType term> struct Term16b
Expand Down

0 comments on commit 6b090e8

Please sign in to comment.