From 1075472af267695730adba03311d5b62b266cdf9 Mon Sep 17 00:00:00 2001
From: Rafael Gago <rafael_gago_81@hotmail.com>
Date: Tue, 31 Aug 2021 20:49:25 +0200
Subject: [PATCH] nearbyint: Fix for ffast-math

The fix on 45cad81a305b09c4b1c9db55c23fcbb0f4e01ee4 wasn't working on
Clang.

Under ffast-math the compiler is free to assume that "x + v - v = x".
45cad81a305 worked around this by storing "x + v" in a volatile
variable.

For Clang this wasn't enough to stop optimizing, as it correctly
detected that the variable is local-scope, so no one can take a
reference to it.

This commit reworks the fix by defining a function to do the operation
and disabling optimizations on that function for all supported
compilers (and those using the same frontend).

For unsupported compilers an #error is emitted, as the old workaround
wasn't safe enough. It could even break between compiler versions. This
avoids potentially weird behaviour in the future.
---
 .../xsimd/arch/generic/xsimd_generic_math.hpp | 46 ++++++++++++++-----
 1 file changed, 35 insertions(+), 11 deletions(-)
diff --git a/include/xsimd/arch/generic/xsimd_generic_math.hpp b/include/xsimd/arch/generic/xsimd_generic_math.hpp
index 56e4d98bb..a5d2b57a6 100644
--- a/include/xsimd/arch/generic/xsimd_generic_math.hpp
+++ b/include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -1707,6 +1707,40 @@ namespace xsimd {
     }
 
 
+#if !defined(__FAST_MATH__)  // without fast-math no workaround is applied
+    template <class T>  // returns x + v - v
+    T conformant_add_then_sub (T x, T v)
+    {
+        return x + v - v;
+    }
+#else  // __FAST_MATH__ is defined: pick a per-compiler way to disable optimizations
+#if defined(__clang__)
+    // optnone attribute: available on clang 4
+    #define XSIMD_NO_OPTIMIZATION_ATTRIBUTE __attribute__((optnone))
+    #define XSIMD_NO_OPTIMIZATION_PRAGMA
+#elif defined(__GNUC__)
+    // optimize("O0") attribute: available on GCC 4.9
+    #define XSIMD_NO_OPTIMIZATION_ATTRIBUTE __attribute__((optimize("O0")))
+    #define XSIMD_NO_OPTIMIZATION_PRAGMA
+#elif defined(_MSC_VER)
+    // available on Visual Studio 2015; NOTE(review): no matching optimize("", on) is visible here -- confirm intent
+    #define XSIMD_NO_OPTIMIZATION_ATTRIBUTE
+    #define XSIMD_NO_OPTIMIZATION_PRAGMA __pragma(optimize("", off))
+#else
+    // Under fast-math, the compiler will assume (x + v - v = x).
+    //
+    // If this error is hit, it is because you are using an unsupported compiler.
+    // Consider submitting a patch, as working around it is easy.
+    #error "Unoptimized version of x + y - y required. See the code for details."
+#endif  // compiler selection
+    XSIMD_NO_OPTIMIZATION_PRAGMA
+    template <class T>  // returns x + v - v, compiled with the no-optimization marker selected above
+    XSIMD_NO_OPTIMIZATION_ATTRIBUTE T conformant_add_then_sub (T x, T v)
+    {
+        return x + v - v;
+    }
+#endif  // !defined(__FAST_MATH__)
+
     // nearbyint
     template<class A, class T, class=typename std::enable_if<std::is_integral<T>::value, void>::type>
     batch<T, A> nearbyint(batch<T, A> const& self, requires_arch<generic>) {
@@ -1718,16 +1752,7 @@ namespace xsimd {
         batch_type s = bitofsign(self);
         batch_type v = self ^ s;
         batch_type t2n = constants::twotonmb<batch_type>();
-        // Under fast-math, reordering is possible and the compiler optimizes d
-        // to v. That's not what we want, so prevent compiler optimization here.
-        // FIXME: it may be better to emit a memory barrier here (?).
-#ifdef __FAST_MATH__
-        volatile batch_type d0 = v + t2n;
-        batch_type d = *(batch_type*)(void*)(&d0) - t2n;
-#else
-        batch_type d0 = v + t2n;
-        batch_type d = d0 - t2n;
-#endif
+        batch_type d = conformant_add_then_sub (v, t2n);
         return s ^ select(v < t2n, d, v);
       }
     }
@@ -2199,4 +2224,3 @@ namespace xsimd {
 }
 
 #endif
-