Skip to content

Commit

Permalink
+add Support of RGBA-32 input image format in AVX-512BW, NEON optimiz…
Browse files Browse the repository at this point in the history
…ations of function SynetSetInput.
  • Loading branch information
ermig1979 committed Nov 29, 2023
1 parent bae41b8 commit a17f9c4
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docs/2023.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ <h3 id="R132">December X, 2023 (version X.X.132)</h3>
<h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Support of RGBA-32 input image format in base implementation, SSE4.1, AVX2 optimizations of function SynetSetInput.</li>
<li>Support of RGBA-32 input image format in base implementation, SSE4.1, AVX2, AVX-512BW, NEON optimizations of function SynetSetInput.</li>
</ul>

<h5>Bug fixing</h5>
Expand Down
78 changes: 76 additions & 2 deletions src/Simd/SimdAvx512bwSynetConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ namespace Simd
}

const __m512i K16_BLUE_RED = SIMD_MM512_SET2_EPI16(Base::BLUE_TO_GRAY_WEIGHT, Base::RED_TO_GRAY_WEIGHT);
const __m512i K16_RED_BLUE = SIMD_MM512_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT);
const __m512i K16_GREEN_0000 = SIMD_MM512_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0x0000);
const __m512i K32_ROUND_TERM = SIMD_MM512_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM);

Expand Down Expand Up @@ -420,6 +421,22 @@ namespace Simd
StoreScaled<false>(dst + 3 * F, BgraToGray32(_mm512_shuffle_epi8(bgra3, K8_SHUFFLE_RGB_TO_BGRA)), scale, shift);
}

SIMD_INLINE __m512i RgbaToGray32(__m512i rgba)
{
const __m512i g0a0 = _mm512_shuffle_epi8(rgba, K8_SUFFLE_BGRA_TO_G0A0);
const __m512i r0b0 = _mm512_and_si512(rgba, K16_00FF);
const __m512i weightedSum = _mm512_add_epi32(_mm512_madd_epi16(g0a0, K16_GREEN_0000), _mm512_madd_epi16(r0b0, K16_RED_BLUE));
return _mm512_srli_epi32(_mm512_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
}

template<> SIMD_INLINE void SynetSetInput1<SimdPixelFormatRgba32>(const uint8_t* src, __m512 scale, __m512 shift, float* dst)
{
StoreScaled<false>(dst + 0 * F, RgbaToGray32(Load<false>((__m512i*)src + 0)), scale, shift);
StoreScaled<false>(dst + 1 * F, RgbaToGray32(Load<false>((__m512i*)src + 1)), scale, shift);
StoreScaled<false>(dst + 2 * F, RgbaToGray32(Load<false>((__m512i*)src + 2)), scale, shift);
StoreScaled<false>(dst + 3 * F, RgbaToGray32(Load<false>((__m512i*)src + 3)), scale, shift);
}

template<SimdPixelFormatType format, size_t step> void SynetSetInput1(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
{
__m512 _scale = _mm512_set1_ps(scale[0]);
Expand Down Expand Up @@ -544,6 +561,21 @@ namespace Simd
SynetSetInputNchw3Rgb(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_END, Load<false>(src + 8 * F)), scale, shift, dst + 3 * F, channel);
}

SIMD_INLINE void SynetSetInputNchw3Rgba(__m512i rgba, const __m512* scale, const __m512* shift, float* dst, size_t channel)
{
StoreScaled<false>(dst + 0 * channel, _mm512_shuffle_epi8(rgba, K8_SHUFFLE_BGRA_TO_R32), scale[0], shift[0]);
StoreScaled<false>(dst + 1 * channel, _mm512_shuffle_epi8(rgba, K8_SHUFFLE_BGRA_TO_G32), scale[1], shift[1]);
StoreScaled<false>(dst + 2 * channel, _mm512_shuffle_epi8(rgba, K8_SHUFFLE_BGRA_TO_B32), scale[2], shift[2]);
}

template<> SIMD_INLINE void SynetSetInputNchw3A<SimdPixelFormatRgba32>(const uint8_t* src, const __m512* scale, const __m512* shift, float* dst, size_t channel)
{
SynetSetInputNchw3Rgba(Load<false>(src + 0 * A), scale, shift, dst + 0 * F, channel);
SynetSetInputNchw3Rgba(Load<false>(src + 1 * A), scale, shift, dst + 1 * F, channel);
SynetSetInputNchw3Rgba(Load<false>(src + 2 * A), scale, shift, dst + 2 * F, channel);
SynetSetInputNchw3Rgba(Load<false>(src + 3 * A), scale, shift, dst + 3 * F, channel);
}

template<SimdPixelFormatType format> SIMD_INLINE void SynetSetInputNchw3F(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel);

template<> SIMD_INLINE void SynetSetInputNchw3F<SimdPixelFormatGray8>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst, size_t channel)
Expand All @@ -569,6 +601,11 @@ namespace Simd
SynetSetInputNchw3Rgb(_mm512_permutexvar_epi32(K32_PERMUTE_BGR_TO_BGRA_END, Load<false>(src - F)), scale, shift, dst, channel);
}

template<> SIMD_INLINE void SynetSetInputNchw3F<SimdPixelFormatRgba32>(const uint8_t* src, const __m512* scale, const __m512* shift, float* dst, size_t channel)
{
SynetSetInputNchw3Rgba(Load<false>(src), scale, shift, dst, channel);
}

template<SimdPixelFormatType format, size_t step> void SynetSetInputNchw3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
{
size_t widthF = AlignLo(width, F), widthA = AlignLo(width, A), channel = width * height;
Expand Down Expand Up @@ -687,6 +724,32 @@ namespace Simd
StoreScaled<false>(dst + 0xB * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 2)), scale[2], shift[2]);
}

const __m512i K8_SUFFLE_RGBA_TO_BGR = SIMD_MM512_SETR_EPI8(
0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1,
0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1,
0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1,
0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1);

template<> SIMD_INLINE void SynetSetInputNhwc3A<SimdPixelFormatRgba32>(const uint8_t* src, const __m512* scale, const __m512* shift, float* dst)
{
__m512i bgr0 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src + 0 * A), K8_SUFFLE_RGBA_TO_BGR));
StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 0)), scale[0], shift[0]);
StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 1)), scale[1], shift[1]);
StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr0, 2)), scale[2], shift[2]);
__m512i bgr1 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src + 1 * A), K8_SUFFLE_RGBA_TO_BGR));
StoreScaled<false>(dst + 0x3 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 0)), scale[0], shift[0]);
StoreScaled<false>(dst + 0x4 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 1)), scale[1], shift[1]);
StoreScaled<false>(dst + 0x5 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr1, 2)), scale[2], shift[2]);
__m512i bgr2 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src + 2 * A), K8_SUFFLE_RGBA_TO_BGR));
StoreScaled<false>(dst + 0x6 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 0)), scale[0], shift[0]);
StoreScaled<false>(dst + 0x7 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 1)), scale[1], shift[1]);
StoreScaled<false>(dst + 0x8 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr2, 2)), scale[2], shift[2]);
__m512i bgr3 = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src + 3 * A), K8_SUFFLE_RGBA_TO_BGR));
StoreScaled<false>(dst + 0x9 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 0)), scale[0], shift[0]);
StoreScaled<false>(dst + 0xA * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 1)), scale[1], shift[1]);
StoreScaled<false>(dst + 0xB * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr3, 2)), scale[2], shift[2]);
}

template<SimdPixelFormatType format> SIMD_INLINE void SynetSetInputNhwc3F(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst);

template<> SIMD_INLINE void SynetSetInputNhwc3F<SimdPixelFormatGray8>(const uint8_t * src, const __m512 * scale, const __m512 * shift, float * dst)
Expand Down Expand Up @@ -720,6 +783,14 @@ namespace Simd
StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 2)), scale[2], shift[2]);
}

template<> SIMD_INLINE void SynetSetInputNhwc3F<SimdPixelFormatRgba32>(const uint8_t* src, const __m512* scale, const __m512* shift, float* dst)
{
__m512i bgr = _mm512_permutexvar_epi32(K32_PERMUTE_BGRA_TO_BGR, _mm512_shuffle_epi8(Load<false>(src), K8_SUFFLE_RGBA_TO_BGR));
StoreScaled<false>(dst + 0x0 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 0)), scale[0], shift[0]);
StoreScaled<false>(dst + 0x1 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 1)), scale[1], shift[1]);
StoreScaled<false>(dst + 0x2 * F, _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(bgr, 2)), scale[2], shift[2]);
}

template<SimdPixelFormatType format, size_t step> void SynetSetInputNhwc3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
{
size_t widthF = AlignLo(width, F);
Expand All @@ -733,8 +804,8 @@ namespace Simd
for (size_t y = 0; y < height; ++y)
{
size_t x = 0;
for (; x < widthA; x += A)
SynetSetInputNhwc3A<format>(src + step * x, _scale, _shift, dst + 3 * x);
//for (; x < widthA; x += A)
// SynetSetInputNhwc3A<format>(src + step * x, _scale, _shift, dst + 3 * x);
for (; x < widthF; x += F)
SynetSetInputNhwc3F<format>(src + step * x, _scale, _shift, dst + 3 * x);
if (widthF < width)
Expand All @@ -761,6 +832,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInput1<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInput1<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInput1<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgba32: SynetSetInput1<SimdPixelFormatRgba32, 4>(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
}
break;
Expand All @@ -774,6 +846,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNchw3<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNchw3<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNchw3<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgba32: SynetSetInputNchw3<SimdPixelFormatRgba32, 4>(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
}
break;
Expand All @@ -784,6 +857,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNhwc3<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNhwc3<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNhwc3<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgba32: SynetSetInputNhwc3<SimdPixelFormatRgba32, 4>(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
}
break;
Expand Down
46 changes: 45 additions & 1 deletion src/Simd/SimdNeonSynetConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,13 @@ namespace Simd
SynetSetInput1Bgr16(UnpackU8<1>(rgb.val[2]), UnpackU8<1>(rgb.val[1]), UnpackU8<1>(rgb.val[0]), scale, shift, dst + 2 * F);
}

template<> SIMD_INLINE void SynetSetInput1<SimdPixelFormatRgba32>(const uint8_t* src, float32x4_t scale, float32x4_t shift, float* dst)
{
uint8x16x4_t rgba = Load4<false>(src);
SynetSetInput1Bgr16(UnpackU8<0>(rgba.val[2]), UnpackU8<0>(rgba.val[1]), UnpackU8<0>(rgba.val[0]), scale, shift, dst + 0 * F);
SynetSetInput1Bgr16(UnpackU8<1>(rgba.val[2]), UnpackU8<1>(rgba.val[1]), UnpackU8<1>(rgba.val[0]), scale, shift, dst + 2 * F);
}

template<SimdPixelFormatType format, size_t step> void SynetSetInput1(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
{
float32x4_t _scale = vdupq_n_f32(scale[0]);
Expand Down Expand Up @@ -287,6 +294,14 @@ namespace Simd
SynetSetInput1Gray8(rgb.val[0], scale[2], shift[2], dst + 2 * channel);
}

template<> SIMD_INLINE void SynetSetInputNchw3<SimdPixelFormatRgba32>(const uint8_t* src, const float32x4_t* scale, const float32x4_t* shift, float* dst, size_t channel)
{
uint8x16x4_t rgba = Load4<false>(src);
SynetSetInput1Gray8(rgba.val[2], scale[0], shift[0], dst + 0 * channel);
SynetSetInput1Gray8(rgba.val[1], scale[1], shift[1], dst + 1 * channel);
SynetSetInput1Gray8(rgba.val[0], scale[2], shift[2], dst + 2 * channel);
}

template<SimdPixelFormatType format, size_t step> void SynetSetInputNchw3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
{
size_t aligned = AlignLo(width, A), channel = width * height;
Expand Down Expand Up @@ -419,6 +434,32 @@ namespace Simd
StoreScaled<false>(dst + 0xB * F, UnpackU16<1>(bgr5), scale[2], shift[2]);
}

const uint8x8_t K8_TBL_RGBA_TO_BGR_0 = SIMD_VEC_SETR_PI8(0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9);
const uint8x8_t K8_TBL_RGBA_TO_BGR_1 = SIMD_VEC_SETR_PI8(0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE);
const uint8x8_t K8_TBL_RGBA_TO_BGR_2 = SIMD_VEC_SETR_PI8(0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC);

template<> SIMD_INLINE void SynetSetInputNhwc3<SimdPixelFormatRgba32>(const uint8_t* src, const float32x4_t* scale, const float32x4_t* shift, float* dst)
{
uint16x8_t bgr0 = vmovl_u8(Shuffle(Load<false>(src + 0), K8_TBL_RGBA_TO_BGR_0));
StoreScaled<false>(dst + 0x0 * F, UnpackU16<0>(bgr0), scale[0], shift[0]);
StoreScaled<false>(dst + 0x1 * F, UnpackU16<1>(bgr0), scale[1], shift[1]);
uint16x8_t bgr1 = vmovl_u8(Shuffle(Load<false>(src + 8), K8_TBL_RGBA_TO_BGR_1));
StoreScaled<false>(dst + 0x2 * F, UnpackU16<0>(bgr1), scale[2], shift[2]);
StoreScaled<false>(dst + 0x3 * F, UnpackU16<1>(bgr1), scale[0], shift[0]);
uint16x8_t bgr2 = vmovl_u8(Shuffle(Load<false>(src + 16), K8_TBL_RGBA_TO_BGR_2));
StoreScaled<false>(dst + 0x4 * F, UnpackU16<0>(bgr2), scale[1], shift[1]);
StoreScaled<false>(dst + 0x5 * F, UnpackU16<1>(bgr2), scale[2], shift[2]);
uint16x8_t bgr3 = vmovl_u8(Shuffle(Load<false>(src + 32), K8_TBL_RGBA_TO_BGR_0));
StoreScaled<false>(dst + 0x6 * F, UnpackU16<0>(bgr3), scale[0], shift[0]);
StoreScaled<false>(dst + 0x7 * F, UnpackU16<1>(bgr3), scale[1], shift[1]);
uint16x8_t bgr4 = vmovl_u8(Shuffle(Load<false>(src + 40), K8_TBL_RGBA_TO_BGR_1));
StoreScaled<false>(dst + 0x8 * F, UnpackU16<0>(bgr4), scale[2], shift[2]);
StoreScaled<false>(dst + 0x9 * F, UnpackU16<1>(bgr4), scale[0], shift[0]);
uint16x8_t bgr5 = vmovl_u8(Shuffle(Load<false>(src + 48), K8_TBL_RGBA_TO_BGR_2));
StoreScaled<false>(dst + 0xA * F, UnpackU16<0>(bgr5), scale[1], shift[1]);
StoreScaled<false>(dst + 0xB * F, UnpackU16<1>(bgr5), scale[2], shift[2]);
}

template<SimdPixelFormatType format, size_t step> void SynetSetInputNhwc3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
{
size_t aligned = AlignLo(width, A);
Expand Down Expand Up @@ -457,6 +498,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInput1<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInput1<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInput1<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgba32: SynetSetInput1<SimdPixelFormatRgba32, 4>(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
}
break;
Expand All @@ -470,6 +512,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNchw3<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNchw3<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNchw3<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgba32: SynetSetInputNchw3<SimdPixelFormatRgba32, 4>(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
}
break;
Expand All @@ -480,7 +523,8 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNhwc3<SimdPixelFormatBgr24, 3>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNhwc3<SimdPixelFormatBgra32, 4>(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNhwc3<SimdPixelFormatRgb24, 3>(src, width, height, stride, scale, lower, dst); return;
default: return Base::SynetSetInput(src, width, height, stride, srcFormat, lower, upper, dst, channels, dstFormat); assert(0);
case SimdPixelFormatRgba32: SynetSetInputNhwc3<SimdPixelFormatRgba32, 4>(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
}
break;
default: assert(0);
Expand Down
2 changes: 1 addition & 1 deletion src/Test/TestSynetConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ namespace Test

result = result && SynetSetInputAutoTest(3, 112, 96, View::Rgb24, SimdTensorFormatNhwc, f1, f2);

for (int s = 0; s < 4; ++s)
for (int s = 0; s < 5; ++s)
{
for (int c = 0; c < 2; ++c)
{
Expand Down

0 comments on commit a17f9c4

Please sign in to comment.