From c1667e1db74eeac0788238874e82963ae0b6cb24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Thu, 24 Oct 2019 10:48:54 +0200
Subject: [PATCH] Update with Manu benchmarks and workaround
 https://github.com/numforge/laser/issues/40

---
 benchmarks/gemm/gemm_bench_float32.nim |  35 ++++----
 benchmarks/gemm/gemm_bench_float64.nim | 106 +++++++++++++++----------
 benchmarks/third_party/manu            |   1 +
 3 files changed, 80 insertions(+), 62 deletions(-)
 create mode 160000 benchmarks/third_party/manu

diff --git a/benchmarks/gemm/gemm_bench_float32.nim b/benchmarks/gemm/gemm_bench_float32.nim
index 2707f8a..30ba846 100644
--- a/benchmarks/gemm/gemm_bench_float32.nim
+++ b/benchmarks/gemm/gemm_bench_float32.nim
@@ -335,6 +335,7 @@ when isMainModule:
   echo &"Theoretical peak single-core:  {TheoSerialPeak:>9.3f} GFLOP/s"
   echo &"Theoretical peak multi:        {TheoThreadedPeak:>9.3f} GFLOP/s"
   echo "Make sure to not bench Apple Accelerate or the default Linux BLAS."
+  echo "Due to strange OpenMP interferences, separate the run of code-sections using OpenMP, see https://github.com/numforge/laser/issues/40"
   block:
     let a = newSeqWith(M*K, float32 rand(-0.1..0.1))
     let b = newSeqWith(K*N, float32 rand(-0.1..0.1))
@@ -342,32 +343,32 @@ when isMainModule:
     # let reference = benchReference(a, b, NbSamples)
     # let simpleTiling = benchSimpleTiling(a, b, NbSamples)
     # let arraymancer = benchArraymancerFallback(a, b, NbSamples)
-    let vendorBlas = benchOpenBLAS(a, b, NbSamples)
     let laser = benchLaserGEMM(a, b, NbSamples)
-    let glow = benchPyTorchGlow(a, b, NbSamples)
-    let mkldnnref = benchMkldnnRef(a, b, NbSamples)
-    let mkldnnjitavx = benchMkldnnJitAVX(a, b, NbSamples)
-    let mkldnnjitavx512 = benchMkldnnJitAVX512(a, b, NbSamples)
+    # let vendorBlas = benchOpenBLAS(a, b, NbSamples)
+    # let glow = benchPyTorchGlow(a, b, NbSamples)
+    # let mkldnnref = benchMkldnnRef(a, b, NbSamples)
+    # let mkldnnjitavx = benchMkldnnJitAVX(a, b, NbSamples)
+    # let mkldnnjitavx512 = benchMkldnnJitAVX512(a, b, NbSamples)
 
-    block:
-      # var error = mean_relative_error(vendorBlas, reference)
-      # echo "Mean Relative Error of OpenBLAS vs reference: ", error
-      # doAssert error <= 1e-5'f32, $error
+    # block:
+    #   # var error = mean_relative_error(vendorBlas, reference)
+    #   # echo "Mean Relative Error of OpenBLAS vs reference: ", error
+    #   # doAssert error <= 1e-5'f32, $error
 
-      # error = mean_relative_error(challenger, reference)
-      # echo "Mean Relative Error compared to Reference: ", error
-      # doAssert error <= 1e-5'f32, $error
+    #   # error = mean_relative_error(challenger, reference)
+    #   # echo "Mean Relative Error compared to Reference: ", error
+    #   # doAssert error <= 1e-5'f32, $error
 
-      var error = mean_relative_error(vendorBlas, laser)
-      echo "Mean Relative Error compared to vendor BLAS: ", error
-      doAssert error <= 1e-5'f32, $error
+    #   var error = mean_relative_error(vendorBlas, laser)
+    #   echo "Mean Relative Error compared to vendor BLAS: ", error
+    #   doAssert error <= 1e-5'f32, $error
 
 # Seems like my original Arraymancer BLAS has false sharing issue
 # FYI Apple accelerate is about 117~122GFLOP/s on my machine.
 
 ###############################
 # Compilation command
-# $ nim cpp -r -d:release -d:openmp -o:build/bench_gemm benchmarks/gemm/gemm_bench_float32.nim
+# $ nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float32.nim
 
 # Don't forget to add OpenBLAS in your path:
 # For example on Mac with OpenBLAS from Homebrew
@@ -379,8 +380,6 @@ when isMainModule:
 # i9_9980XE Skylake-X 18 cores overclocked 4.1 GHz all-turbo, 4.0 GHz AVX turbo, 3.5 GHz AVX512 turbo
 # PyTorch Glow compiled with AVX2 as AVX512 is slower
 
-# nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float32.nim
-
 # A matrix shape: (M: 1920, N: 1920)
 # B matrix shape: (M: 1920, N: 1920)
 # Output shape: (M: 1920, N: 1920)
diff --git a/benchmarks/gemm/gemm_bench_float64.nim b/benchmarks/gemm/gemm_bench_float64.nim
index 4077096..a8c837b 100644
--- a/benchmarks/gemm/gemm_bench_float64.nim
+++ b/benchmarks/gemm/gemm_bench_float64.nim
@@ -49,6 +49,8 @@ import
   ./arraymancer/blas_l3_gemm,
   ../../laser/primitives/matrix_multiplication/gemm
 
+import ../third_party/manu/manu/matrix as manu
+
 const
   M     = 8*6*20
   K     = 8*6*20
@@ -145,6 +147,22 @@ proc benchLaserGEMM(a, b: seq[float64], nb_samples: int) =
               b_ptr, N, 1,
       0'f64,  c_ptr, N, 1
     )
+
+proc benchManu(a, b: seq[float64], nb_samples: int) = # Benchmarks Manu's `*` on matrices built from `a` and `b`
+  let Amat = manu.matrix(a, M)
+  let Bmat = manu.matrix(N, b)
+  var C: manu.Matrix
+  # `let output = C.data.addr` is not possible: `data` is not exposed by Manu :/
+  var output: array[1, float64] # The bench displays the first item as a sanity check
+
+  bench("Manu implementation"):
+    # No initialization needed: Manu doesn't work in-place, C is assigned in the work section below
+    discard
+  do:
+    # Main work: multiply, then copy C[0, 0] into `output` (NOTE(review): `matrix(a, M)` vs `matrix(N, b)` argument order differs - confirm layout)
+    C = Amat * Bmat
+    output[0] = C[0, 0]
+
 # ###########################################
 
 when defined(fast_math):
@@ -170,6 +188,7 @@ when isMainModule:
   echo &"Theoretical peak single-core:  {CpuGhz * CpuFlopCycle:>9.3f} GFLOP/s"
   echo &"Theoretical peak multi:        {CpuGhz * CpuFlopCycle * NumCpuCores:>9.3f} GFLOP/s"
   echo "Make sure to not bench Apple Accelerate or the default Linux BLAS."
+  echo "Due to strange OpenMP interferences, separate the run of code-sections using OpenMP, see https://github.com/numforge/laser/issues/40"
   block:
     let a = newSeqWith(M*K, float64 rand(1.0))
     let b = newSeqWith(K*N, float64 rand(1.0))
@@ -177,16 +196,18 @@ when isMainModule:
     # when not defined(openmp):
     #   benchSimpleTiling(a, b, NbSamples) # for some reason stalled with OpenMP
     # benchArraymancerFallback(a, b, NbSamples)
-    benchOpenBLAS(a, b, NbSamples)
+    # benchOpenBLAS(a, b, NbSamples)
     benchLaserGEMM(a, b, NbSamples)
+    benchManu(a, b, NbSamples)
 
 # Seems like my original Arraymancer BLAS has false sharing issue
-# FYI Apple accelerate is about 117~122GFLOP/s on my machine.
 
 ###############################
 # OpenMP
+# Due to strange OpenMP interference, code sections using OpenMP should be run independently.
+# See https://github.com/numforge/laser/issues/40
 
-# Warmup: 1.1890 s, result 224 (displayed to avoid compiler optimizing warmup away)
+# Run 1: OpenBLAS vs Manu
 
 # A matrix shape: (M: 960, N: 960)
 # B matrix shape: (M: 960, N: 960)
@@ -197,33 +218,30 @@ when isMainModule:
 # Theoretical peak single-core:     43.200 GFLOP/s
 # Theoretical peak multi:           86.400 GFLOP/s
 # Make sure to not bench Apple Accelerate or the default Linux BLAS.
-
+#
 # OpenBLAS benchmark
-# Collected 10 samples in 0.315 seconds
-# Average time: 31.429 ms
-# Stddev  time: 2.503 ms
-# Min     time: 29.868 ms
-# Max     time: 37.536 ms
-# Perf:         56.300 GFLOP/s
-
+# Collected 10 samples in 0.056 seconds
+# Average time: 5.589 ms
+# Stddev  time: 6.702 ms
+# Min     time: 3.004 ms
+# Max     time: 24.487 ms
+# Perf:         316.588 GFLOP/s
+#
 # Display output[0] to make sure it's not optimized away
 # 232.3620566397699
-
-# Laser production implementation
-# Collected 10 samples in 0.327 seconds
-# Average time: 32.625 ms
-# Stddev  time: 3.080 ms
-# Min     time: 31.182 ms
-# Max     time: 41.327 ms
-# Perf:         54.236 GFLOP/s
-
+#
+# Manu implementation
+# Collected 10 samples in 8.470 seconds
+# Average time: 846.977 ms
+# Stddev  time: 0.884 ms
+# Min     time: 845.685 ms
+# Max     time: 848.072 ms
+# Perf:         2.089 GFLOP/s
+#
 # Display output[0] to make sure it's not optimized away
-# 232.36205663977
-
-###############################
-# Serial
+# 237.8399578000516
 
-# Warmup: 1.1948 s, result 224 (displayed to avoid compiler optimizing warmup away)
+# Run 2: Laser vs Manu
 
 # A matrix shape: (M: 960, N: 960)
 # B matrix shape: (M: 960, N: 960)
@@ -234,25 +252,25 @@ when isMainModule:
 # Theoretical peak single-core:     43.200 GFLOP/s
 # Theoretical peak multi:           86.400 GFLOP/s
 # Make sure to not bench Apple Accelerate or the default Linux BLAS.
-
-# OpenBLAS benchmark
-# Collected 10 samples in 0.566 seconds
-# Average time: 56.528 ms
-# Stddev  time: 2.482 ms
-# Min     time: 55.359 ms
-# Max     time: 63.552 ms
-# Perf:         31.303 GFLOP/s
-
-# Display output[0] to make sure it's not optimized away
-# 232.3620566397699
-
+#
 # Laser production implementation
-# Collected 10 samples in 0.531 seconds
-# Average time: 53.075 ms
-# Stddev  time: 1.592 ms
-# Min     time: 51.679 ms
-# Max     time: 55.885 ms
-# Perf:         33.339 GFLOP/s
-
+# Collected 10 samples in 0.053 seconds
+# Average time: 5.270 ms
+# Stddev  time: 9.205 ms
+# Min     time: 2.245 ms
+# Max     time: 31.464 ms
+# Perf:         335.751 GFLOP/s
+#
 # Display output[0] to make sure it's not optimized away
 # 232.36205663977
+#
+# Manu implementation
+# Collected 10 samples in 8.503 seconds
+# Average time: 850.315 ms
+# Stddev  time: 0.787 ms
+# Min     time: 848.843 ms
+# Max     time: 850.849 ms
+# Perf:         2.081 GFLOP/s
+#
+# Display output[0] to make sure it's not optimized away
+# 237.8399578000516
diff --git a/benchmarks/third_party/manu b/benchmarks/third_party/manu
new file mode 160000
index 0000000..17aa803
--- /dev/null
+++ b/benchmarks/third_party/manu
@@ -0,0 +1 @@
+Subproject commit 17aa803dc4efefb12bc46031ae6df6a84267f5d1