Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SIMD playground #306

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Magnum/Math/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ set(MagnumMath_HEADERS
Packing.h
Range.h
RectangularMatrix.h
Simd.h
StrictWeakOrdering.h
Swizzle.h
Tags.h
Expand Down
71 changes: 71 additions & 0 deletions src/Magnum/Math/Packing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@

#include "Packing.h"

#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>

namespace Magnum { namespace Math {

namespace {
Expand Down Expand Up @@ -101,4 +105,71 @@ UnsignedShort packHalf(const Float value) {
return h;
}

namespace Implementation {

void unpackUnsignedByteToShort(Simd::NoneT, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
for(std::size_t i = 0; i < in.size(); ++i) out[i] = in[i];
}

void unpackUnsignedByteToShort(Simd::Sse2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
__m128i* out128 = reinterpret_cast<__m128i*>(out.data());
for(std::size_t i = 0; i < in.size()/16; ++i) {
__m128i a = _mm_loadu_si128(in128 + i);
_mm_storeu_si128(out128 + i*2 + 0, _mm_unpacklo_epi8(a, _mm_setzero_si128()));
_mm_storeu_si128(out128 + i*2 + 1, _mm_unpacklo_epi8(a, _mm_setzero_si128()));
}
}

void unpackUnsignedByteToShort(Simd::Sse41T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
__m128i* out128 = reinterpret_cast<__m128i*>(out.data());
for(std::size_t i = 0; i < in.size()/16; ++i) {
__m128i a = _mm_loadu_si128(in128 + i);
_mm_storeu_si128(out128 + i*2 + 0, _mm_cvtepu8_epi16(a));
_mm_storeu_si128(out128 + i*2 + 1, _mm_cvtepu8_epi16(_mm_srli_si128(a, 8)));
}
}

void unpackUnsignedByteToShort(Simd::Avx2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
__m256i* out256 = reinterpret_cast<__m256i*>(out.data());
for(std::size_t i = 0; i < in.size()/16; ++i) {
__m128i a = _mm_load_si128(in128 + i);
_mm256_store_si256(out256 + i, _mm256_cvtepu8_epi16(a));
}
}

}

namespace {

__attribute__ ((target ("default"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
Implementation::unpackUnsignedByteToShort(Simd::Sse2, in, out);
}

// TODO: why gcc complains about unused functions here?!
__attribute__ ((target ("sse2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
Implementation::unpackUnsignedByteToShort(Simd::Sse2, in, out);
}

__attribute__ ((target ("sse4.1"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out);
}

__attribute__ ((target ("avx2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out);
}

}

void unpackUnsignedByteToShort(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
CORRADE_ASSERT(in.size() == out.size(), "Math::unpackUnsignedByteToShort(): input has" << in.size() << "elements while output has" << out.size(), );
CORRADE_ASSERT(!(reinterpret_cast<std::uintptr_t>(in.data())%16) && !(reinterpret_cast<std::uintptr_t>(in.data())%16), "Math::unpackUnsignedByteToShort(): the data are not 16-byte aligned", );

/** @todo run only for a multiple of 16, do the rest scalar */
CORRADE_INTERNAL_ASSERT(!(in.size()%16));
unpackUnsignedByteToShortDispatch(in, out);
}

}}
34 changes: 34 additions & 0 deletions src/Magnum/Math/Packing.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
*/

#include "Magnum/Math/Functions.h"
#include "Magnum/Math/Simd.h"

namespace Magnum { namespace Math {

Expand Down Expand Up @@ -209,6 +210,39 @@ template<std::size_t size> Vector<size, Float> unpackHalf(const Vector<size, Uns
return out;
}

namespace Implementation {
// TODO: expose these publicly? would make sense, otherwise the tags are useless
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::NoneT, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse41T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Avx2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
}

/**
@brief Unpack an array of 8-bit unsigned integers to 16-bit

The @p in and @p out are expected to have the same size and be aligned to 16
bytes.
*/
// TODO: mention SIMD?
MAGNUM_EXPORT void unpackUnsignedByteToShort(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);

/**
@brief Unpack an array of 8-bit unsigned integers to 32-bit

The @p in and @p out are expected to have the same size and be aligned to 16
bytes.
*/
MAGNUM_EXPORT void unpackUnsignedByteToInt(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedInt> out);

/**
@brief Unpack an array of 16-bit unsigned integers to 32-bit

The @p in and @p out are expected to have the same size and be aligned to 16
bytes.
*/
MAGNUM_EXPORT void unpackUnsignedShortToInt(Corrade::Containers::ArrayView<const UnsignedShort> in, Corrade::Containers::ArrayView<UnsignedInt> out);

}}

#endif
148 changes: 148 additions & 0 deletions src/Magnum/Math/Simd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#ifndef Magnum_Math_Simd_h
#define Magnum_Math_Simd_h
/*
This file is part of Magnum.

Copyright © 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019
Vladimír Vondruš <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

/** @file
* @brief Namespace @ref Magnum::Math::Simd
*/

/** @namespace Magnum::Math::Simd
@brief SIMD dispatch tags

Tags for dispatching to particular SIMD-optimized versions of batch math
algorithms.

This library is built as part of Magnum by default. To use this library with
CMake, you need to find the `Magnum` package and link to the `Magnum::Magnum`
target:

@code{.cmake}
find_package(Magnum REQUIRED)

# ...
target_link_libraries(your-app Magnum::Magnum)
@endcode

See @ref building and @ref cmake for more information.
*/
namespace Magnum { namespace Math { namespace Simd {

/**
@brief No SIMD acceleration tag type

Used to distinguish algorithms that have no explicit SIMD optimizations, apart
from compiler magic.
@see @ref None
*/
/* Explicit constructor to avoid ambiguous calls when using {} */
struct NoneT {
#ifndef DOXYGEN_GENERATING_OUTPUT
struct Init{};
constexpr explicit NoneT(Init) {}
#endif
};

/**
@brief SSE2 SIMD acceleration tag type

Used to distinguish algorithms that use at most the
[SSE2](https://en.wikipedia.org/wiki/SSE2) instruction set.
@see @ref Sse2
*/
/* Explicit constructor to avoid ambiguous calls when using {} */
struct Sse2T {
#ifndef DOXYGEN_GENERATING_OUTPUT
struct Init{};
constexpr explicit Sse2T(Init) {}
#endif
};

/**
@brief SSE4.1 SIMD acceleration tag type

Used to distinguish algorithms that use at most the
[SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) instruction set.
@see @ref Sse41
*/
/* Explicit constructor to avoid ambiguous calls when using {} */
struct Sse41T {
#ifndef DOXYGEN_GENERATING_OUTPUT
struct Init{};
constexpr explicit Sse41T(Init) {}
#endif
};

/**
@brief AVX2 SIMD acceleration tag type

Used to distinguish algorithms that use at most the
[AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX2)
instruction set.
@see @ref Avx2
*/
/* Explicit constructor to avoid ambiguous calls when using {} */
struct Avx2T {
#ifndef DOXYGEN_GENERATING_OUTPUT
struct Init{};
constexpr explicit Avx2T(Init) {}
#endif
};

/**
@brief No SIMD acceleration tag

Use for selecting algorithms with no explicit SIMD optimizations.
*/
constexpr NoneT None{NoneT::Init{}};

/**
@brief SSE2 SIMD acceleration tag

Use for selecting algorithms that use at most the
[SSE2](https://en.wikipedia.org/wiki/SSE2) instruction set.
*/
constexpr Sse2T Sse2{Sse2T::Init{}};

/**
@brief SSE4.1 SIMD acceleration tag

Use for selecting algorithms that use at most the
[SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) instruction set.
*/
constexpr Sse41T Sse41{Sse41T::Init{}};

/**
@brief AVX2 SIMD acceleration tag type

Use for selecting algorithms that use at most the
[AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX2)
instruction set.
*/
constexpr Avx2T Avx2{Avx2T::Init{}};

}}}

#endif
50 changes: 50 additions & 0 deletions src/Magnum/Math/Test/PackingTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
*/

#include <limits>
#include <Corrade/Containers/Array.h>
#include <Corrade/TestSuite/Tester.h>

#include "Magnum/Math/Packing.h"
Expand All @@ -46,6 +47,9 @@ struct PackingTest: Corrade::TestSuite::Tester {

/* Half (un)pack functions are tested and benchmarked in HalfTest.cpp,
because there's involved comparison and benchmarks to ground truth */

void unpackUnsignedByteToShortBenchmark();
template<class T> void unpackUnsignedByteToShortBenchmark();
};

typedef Math::Vector3<Float> Vector3;
Expand All @@ -62,6 +66,14 @@ PackingTest::PackingTest() {
&PackingTest::reunpackUnsinged,
&PackingTest::reunpackSinged,
&PackingTest::unpackTypeDeduction});

addBenchmarks<PackingTest>({
&PackingTest::unpackUnsignedByteToShortBenchmark,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::NoneT>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse2T>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse41T>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Avx2T>
}, 1000);
}

void PackingTest::bitMax() {
Expand Down Expand Up @@ -279,6 +291,44 @@ void PackingTest::unpackTypeDeduction() {
CORRADE_COMPARE((Math::unpack<Float, Byte>('\x7F')), 1.0f);
}

void PackingTest::unpackUnsignedByteToShortBenchmark() {
Corrade::Containers::Array<UnsignedByte> in{20000};
Corrade::Containers::Array<UnsignedShort> out{20000};
UnsignedByte a = 0;
for(auto& i: in) i = a++;

CORRADE_BENCHMARK(100)
unpackUnsignedByteToShort(in, out);
}

// TODO: uh provide this elsewhere
template<class> struct SimdTraits;
template<> struct SimdTraits<Simd::NoneT> {
static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::NoneT>"; }
};
template<> struct SimdTraits<Simd::Sse2T> {
static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Sse2T>"; }
};
template<> struct SimdTraits<Simd::Sse41T> {
static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Sse41T>"; }
};
template<> struct SimdTraits<Simd::Avx2T> {
static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Avx2T>"; }
};

template<class T> void PackingTest::unpackUnsignedByteToShortBenchmark() {
setTestCaseName(SimdTraits<T>::name());

Corrade::Containers::Array<UnsignedByte> in{20000};
Corrade::Containers::Array<UnsignedShort> out{20000};
UnsignedByte a = 0;
for(auto& i: in) i = a++;

CORRADE_BENCHMARK(100)
// TODO: uh the typename wat
Implementation::unpackUnsignedByteToShort(T{typename T::Init{}}, in, out);
}

}}}}

CORRADE_TEST_MAIN(Magnum::Math::Test::PackingTest)