mosra · mosra · Jan 19, 2019 · Jan 19, 2019
diff --git a/src/Magnum/Math/CMakeLists.txt b/src/Magnum/Math/CMakeLists.txt
@@ -48,6 +48,7 @@ set(MagnumMath_HEADERS
     Packing.h
     Range.h
     RectangularMatrix.h
+    Simd.h
     StrictWeakOrdering.h
     Swizzle.h
     Tags.h

diff --git a/src/Magnum/Math/Packing.cpp b/src/Magnum/Math/Packing.cpp
@@ -25,6 +25,10 @@
 
 #include "Packing.h"
 
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
 namespace Magnum { namespace Math {
 
 namespace {
@@ -101,4 +105,71 @@ UnsignedShort packHalf(const Float value) {
     return h;
 }
 
+namespace Implementation {
+
+void unpackUnsignedByteToShort(Simd::NoneT, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    for(std::size_t i = 0; i < in.size(); ++i) out[i] = in[i];
+}
+
+void unpackUnsignedByteToShort(Simd::Sse2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
+    __m128i* out128 = reinterpret_cast<__m128i*>(out.data());
+    for(std::size_t i = 0; i < in.size()/16; ++i) {
+        __m128i a = _mm_loadu_si128(in128 + i);
+        _mm_storeu_si128(out128 + i*2 + 0, _mm_unpacklo_epi8(a, _mm_setzero_si128()));
+        _mm_storeu_si128(out128 + i*2 + 1, _mm_unpacklo_epi8(a, _mm_setzero_si128()));
+    }
+}
+
+void unpackUnsignedByteToShort(Simd::Sse41T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
+    __m128i* out128 = reinterpret_cast<__m128i*>(out.data());
+    for(std::size_t i = 0; i < in.size()/16; ++i) {
+        __m128i a = _mm_loadu_si128(in128 + i);
+        _mm_storeu_si128(out128 + i*2 + 0, _mm_cvtepu8_epi16(a));
+        _mm_storeu_si128(out128 + i*2 + 1, _mm_cvtepu8_epi16(_mm_srli_si128(a, 8)));
+    }
+}
+
+void unpackUnsignedByteToShort(Simd::Avx2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
+    __m256i* out256 = reinterpret_cast<__m256i*>(out.data());
+    for(std::size_t i = 0; i < in.size()/16; ++i) {
+        __m128i a = _mm_load_si128(in128 + i);
+        _mm256_store_si256(out256 + i, _mm256_cvtepu8_epi16(a));
+    }
+}
+
+}
+
+namespace {
+
+__attribute__ ((target ("default"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    Implementation::unpackUnsignedByteToShort(Simd::Sse2, in, out);
+}
+
+// TODO: why gcc complains about unused functions here?!
+__attribute__ ((target ("sse2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    Implementation::unpackUnsignedByteToShort(Simd::Sse2, in, out);
+}
+
+__attribute__ ((target ("sse4.1"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out);
+}
+
+__attribute__ ((target ("avx2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out);
+}
+
+}
+
+void unpackUnsignedByteToShort(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    CORRADE_ASSERT(in.size() == out.size(), "Math::unpackUnsignedByteToShort(): input has" << in.size() << "elements while output has" << out.size(), );
+    CORRADE_ASSERT(!(reinterpret_cast<std::uintptr_t>(in.data())%16) && !(reinterpret_cast<std::uintptr_t>(in.data())%16), "Math::unpackUnsignedByteToShort(): the data are not 16-byte aligned", );
+
+    /** @todo run only for a multiple of 16, do the rest scalar */
+    CORRADE_INTERNAL_ASSERT(!(in.size()%16));
+    unpackUnsignedByteToShortDispatch(in, out);
+}
+
 }}
diff --git a/src/Magnum/Math/Packing.h b/src/Magnum/Math/Packing.h
@@ -30,6 +30,7 @@
  */
 
 #include "Magnum/Math/Functions.h"
+#include "Magnum/Math/Simd.h"
 
 namespace Magnum { namespace Math {
 
@@ -209,6 +210,39 @@ template<std::size_t size> Vector<size, Float> unpackHalf(const Vector<size, Uns
     return out;
 }
 
+namespace Implementation {
+    // TODO: expose these publicly? would make sense, otherwise the tags are useless
+    MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::NoneT, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+    MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+    MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse41T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+    MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Avx2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+}
+
+/**
+@brief Unpack an array of 8-bit unsigned integers to 16-bit
+
+The @p in and @p out are expected to have the same size and be aligned to 16
+bytes.
+*/
+// TODO: mention SIMD?
+MAGNUM_EXPORT void unpackUnsignedByteToShort(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+
+/**
+@brief Unpack an array of 8-bit unsigned integers to 32-bit
+
+The @p in and @p out are expected to have the same size and be aligned to 16
+bytes.
+*/
+MAGNUM_EXPORT void unpackUnsignedByteToInt(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedInt> out);
+
+/**
+@brief Unpack an array of 16-bit unsigned integers to 32-bit
+
+The @p in and @p out are expected to have the same size and be aligned to 16
+bytes.
+*/
+MAGNUM_EXPORT void unpackUnsignedShortToInt(Corrade::Containers::ArrayView<const UnsignedShort> in, Corrade::Containers::ArrayView<UnsignedInt> out);
+
 }}
 
 #endif
diff --git a/src/Magnum/Math/Simd.h b/src/Magnum/Math/Simd.h
@@ -0,0 +1,148 @@
+#ifndef Magnum_Math_Simd_h
+#define Magnum_Math_Simd_h
+/*
+    This file is part of Magnum.
+
+    Copyright © 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019
+              Vladimír Vondruš <[email protected]>
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of this software and associated documentation files (the "Software"),
+    to deal in the Software without restriction, including without limitation
+    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+    and/or sell copies of the Software, and to permit persons to whom the
+    Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+*/
+
+/** @file
+ * @brief Namespace @ref Magnum::Math::Simd
+ */
+
+/** @namespace Magnum::Math::Simd
+@brief SIMD dispatch tags
+
+Tags for dispatching to particular SIMD-optimized versions of batch math
+algorithms.
+
+This library is built as part of Magnum by default. To use this library with
+CMake, you need to find the `Magnum` package and link to the `Magnum::Magnum`
+target:
+
+@code{.cmake}
+find_package(Magnum REQUIRED)
+
+# ...
+target_link_libraries(your-app Magnum::Magnum)
+@endcode
+
+See @ref building and @ref cmake for more information.
+*/
+namespace Magnum { namespace Math { namespace Simd {
+
+/**
+@brief No SIMD acceleration tag type
+
+Used to distinguish algorithms that have no explicit SIMD optimizations, apart
+from compiler magic.
+@see @ref None
+*/
+/* Explicit constructor to avoid ambiguous calls when using {} */
+struct NoneT {
+    #ifndef DOXYGEN_GENERATING_OUTPUT
+    struct Init{};
+    constexpr explicit NoneT(Init) {}
+    #endif
+};
+
+/**
+@brief SSE2 SIMD acceleration tag type
+
+Used to distinguish algorithms that use at most the
+[SSE2](https://en.wikipedia.org/wiki/SSE2) instruction set.
+@see @ref Sse2
+*/
+/* Explicit constructor to avoid ambiguous calls when using {} */
+struct Sse2T {
+    #ifndef DOXYGEN_GENERATING_OUTPUT
+    struct Init{};
+    constexpr explicit Sse2T(Init) {}
+    #endif
+};
+
+/**
+@brief SSE4.1 SIMD acceleration tag type
+
+Used to distinguish algorithms that use at most the
+[SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) instruction set.
+@see @ref Sse41
+*/
+/* Explicit constructor to avoid ambiguous calls when using {} */
+struct Sse41T {
+    #ifndef DOXYGEN_GENERATING_OUTPUT
+    struct Init{};
+    constexpr explicit Sse41T(Init) {}
+    #endif
+};
+
+/**
+@brief AVX2 SIMD acceleration tag type
+
+Used to distinguish algorithms that use at most the
+[AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX2)
+instruction set.
+@see @ref Avx2
+*/
+/* Explicit constructor to avoid ambiguous calls when using {} */
+struct Avx2T {
+    #ifndef DOXYGEN_GENERATING_OUTPUT
+    struct Init{};
+    constexpr explicit Avx2T(Init) {}
+    #endif
+};
+
+/**
+@brief No SIMD acceleration tag
+
+Use for selecting algorithms with no explicit SIMD optimizations.
+*/
+constexpr NoneT None{NoneT::Init{}};
+
+/**
+@brief SSE2 SIMD acceleration tag
+
+Use for selecting algorithms that use at most the
+[SSE2](https://en.wikipedia.org/wiki/SSE2) instruction set.
+*/
+constexpr Sse2T Sse2{Sse2T::Init{}};
+
+/**
+@brief SSE4.1 SIMD acceleration tag
+
+Use for selecting algorithms that use at most the
+[SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) instruction set.
+*/
+constexpr Sse41T Sse41{Sse41T::Init{}};
+
+/**
+@brief AVX2 SIMD acceleration tag type
+
+Use for selecting algorithms that use at most the
+[AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX2)
+instruction set.
+*/
+constexpr Avx2T Avx2{Avx2T::Init{}};
+
+}}}
+
+#endif
diff --git a/src/Magnum/Math/Test/PackingTest.cpp b/src/Magnum/Math/Test/PackingTest.cpp
@@ -24,6 +24,7 @@
 */
 
 #include <limits>
+#include <Corrade/Containers/Array.h>
 #include <Corrade/TestSuite/Tester.h>
 
 #include "Magnum/Math/Packing.h"
@@ -46,6 +47,9 @@ struct PackingTest: Corrade::TestSuite::Tester {
 
     /* Half (un)pack functions are tested and benchmarked in HalfTest.cpp,
        because there's involved comparison and benchmarks to ground truth */
+
+    void unpackUnsignedByteToShortBenchmark();
+    template<class T> void unpackUnsignedByteToShortBenchmark();
 };
 
 typedef Math::Vector3<Float> Vector3;
@@ -62,6 +66,14 @@ PackingTest::PackingTest() {
               &PackingTest::reunpackUnsinged,
               &PackingTest::reunpackSinged,
               &PackingTest::unpackTypeDeduction});
+
+    addBenchmarks<PackingTest>({
+        &PackingTest::unpackUnsignedByteToShortBenchmark,
+        &PackingTest::unpackUnsignedByteToShortBenchmark<Simd::NoneT>,
+        &PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse2T>,
+        &PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse41T>,
+        &PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Avx2T>
+    }, 1000);
 }
 
 void PackingTest::bitMax() {
@@ -279,6 +291,44 @@ void PackingTest::unpackTypeDeduction() {
     CORRADE_COMPARE((Math::unpack<Float, Byte>('\x7F')), 1.0f);
 }
 
+void PackingTest::unpackUnsignedByteToShortBenchmark() {
+    Corrade::Containers::Array<UnsignedByte> in{20000};
+    Corrade::Containers::Array<UnsignedShort> out{20000};
+    UnsignedByte a = 0;
+    for(auto& i: in) i = a++;
+
+    CORRADE_BENCHMARK(100)
+        unpackUnsignedByteToShort(in, out);
+}
+
+// TODO: uh provide this elsewhere
+template<class> struct SimdTraits;
+template<> struct SimdTraits<Simd::NoneT> {
+    static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::NoneT>"; }
+};
+template<> struct SimdTraits<Simd::Sse2T> {
+    static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Sse2T>"; }
+};
+template<> struct SimdTraits<Simd::Sse41T> {
+    static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Sse41T>"; }
+};
+template<> struct SimdTraits<Simd::Avx2T> {
+    static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Avx2T>"; }
+};
+
+template<class T> void PackingTest::unpackUnsignedByteToShortBenchmark() {
+    setTestCaseName(SimdTraits<T>::name());
+
+    Corrade::Containers::Array<UnsignedByte> in{20000};
+    Corrade::Containers::Array<UnsignedShort> out{20000};
+    UnsignedByte a = 0;
+    for(auto& i: in) i = a++;
+
+    CORRADE_BENCHMARK(100)
+        // TODO: uh the typename wat
+        Implementation::unpackUnsignedByteToShort(T{typename T::Init{}}, in, out);
+}
+
 }}}}
 
 CORRADE_TEST_MAIN(Magnum::Math::Test::PackingTest)