From c7ddceb0d54390f622c48b7fcb4bb44964961d6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= <mamy_github@numforge.co>
Date: Thu, 26 Sep 2019 15:45:53 +0200
Subject: [PATCH] Update OpenMP and Laser BLAS for Nim v1.0.0

---
 benchmarks/gemm/gemm_bench_float32.nim | 98 ++++++++++----------------
 laser/openmp.nim                       |  4 +-
 nim.cfg                                |  6 +-
 3 files changed, 42 insertions(+), 66 deletions(-)

diff --git a/benchmarks/gemm/gemm_bench_float32.nim b/benchmarks/gemm/gemm_bench_float32.nim
index 0fc9c6b..2707f8a 100644
--- a/benchmarks/gemm/gemm_bench_float32.nim
+++ b/benchmarks/gemm/gemm_bench_float32.nim
@@ -379,7 +379,7 @@ when isMainModule:
 # i9_9980XE Skylake-X 18 cores overclocked 4.1 GHz all-turbo, 4.0 GHz AVX turbo, 3.5 GHz AVX512 turbo
 # PyTorch Glow compiled with AVX2 as AVX512 is slower
 
-# Warmup: 0.9063 s, result 224 (displayed to avoid compiler optimizing warmup away)
+# nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float32.nim
 
 # A matrix shape: (M: 1920, N: 1920)
 # B matrix shape: (M: 1920, N: 1920)
@@ -391,75 +391,51 @@ when isMainModule:
 # Theoretical peak multi:         4032.000 GFLOP/s
 # Make sure to not bench Apple Accelerate or the default Linux BLAS.
 
-# Reference loop
-# Collected 10 samples in 10.352 seconds
-# Average time: 1034.621 ms
-# Stddev  time: 3.193 ms
-# Min     time: 1029.729 ms
-# Max     time: 1040.034 ms
-# Perf:         13.682 GFLOP/s
-
-# Simple Tiling
-# Collected 10 samples in 16.658 seconds
-# Average time: 1665.251 ms
-# Stddev  time: 488.574 ms
-# Min     time: 274.804 ms
-# Max     time: 1825.817 ms
-# Perf:         8.501 GFLOP/s
-
-# Arraymancer fallback BLAS
-# Collected 10 samples in 22.953 seconds
-# Average time: 2294.844 ms
-# Stddev  time: 1.488 ms
-# Min     time: 2293.406 ms
-# Max     time: 2297.158 ms
-# Perf:         6.169 GFLOP/s
-
 # OpenBLAS benchmark
-# Collected 10 samples in 0.090 seconds
-# Average time: 8.344 ms
-# Stddev  time: 5.493 ms
-# Min     time: 6.586 ms
-# Max     time: 23.977 ms
-# Perf:         1696.506 GFLOP/s
+# Collected 10 samples in 0.089 seconds
+# Average time: 8.172 ms
+# Stddev  time: 5.513 ms
+# Min     time: 6.410 ms
+# Max     time: 23.863 ms
+# Perf:         1732.227 GFLOP/s
 
 # Laser production implementation
-# Collected 10 samples in 0.089 seconds
-# Average time: 8.396 ms
-# Stddev  time: 3.306 ms
-# Min     time: 7.219 ms
-# Max     time: 17.793 ms
-# Perf:         1686.090 GFLOP/s
+# Collected 10 samples in 0.082 seconds
+# Average time: 7.553 ms
+# Stddev  time: 4.509 ms
+# Min     time: 5.866 ms
+# Max     time: 20.314 ms
+# Perf:         1874.073 GFLOP/s
 
 # PyTorch Glow: libjit matmul implementation (with AVX+FMA)
-# Collected 10 samples in 1.895 seconds
-# Average time: 189.521 ms
-# Stddev  time: 2.362 ms
-# Min     time: 188.692 ms
-# Max     time: 196.239 ms
-# Perf:         74.693 GFLOP/s
+# Collected 10 samples in 2.042 seconds
+# Average time: 204.186 ms
+# Stddev  time: 0.598 ms
+# Min     time: 203.783 ms
+# Max     time: 205.815 ms
+# Perf:         69.328 GFLOP/s
 
 # MKL-DNN reference GEMM benchmark
-# Collected 10 samples in 0.381 seconds
-# Average time: 37.376 ms
-# Stddev  time: 5.534 ms
-# Min     time: 34.748 ms
-# Max     time: 49.298 ms
-# Perf:         378.741 GFLOP/s
+# Collected 10 samples in 0.331 seconds
+# Average time: 32.286 ms
+# Stddev  time: 4.983 ms
+# Min     time: 30.018 ms
+# Max     time: 46.264 ms
+# Perf:         438.449 GFLOP/s
 
 # MKL-DNN JIT AVX benchmark
-# Collected 10 samples in 0.101 seconds
-# Average time: 9.385 ms
-# Stddev  time: 4.980 ms
-# Min     time: 7.717 ms
-# Max     time: 23.549 ms
-# Perf:         1508.331 GFLOP/s
+# Collected 10 samples in 0.105 seconds
+# Average time: 9.752 ms
+# Stddev  time: 5.647 ms
+# Min     time: 7.749 ms
+# Max     time: 25.768 ms
+# Perf:         1451.603 GFLOP/s
 
 # MKL-DNN JIT AVX512 benchmark
-# Collected 10 samples in 0.084 seconds
-# Average time: 7.798 ms
-# Stddev  time: 9.361 ms
-# Min     time: 4.685 ms
-# Max     time: 34.417 ms
-# Perf:         1815.302 GFLOP/s
+# Collected 10 samples in 0.088 seconds
+# Average time: 8.148 ms
+# Stddev  time: 10.751 ms
+# Min     time: 4.572 ms
+# Max     time: 38.731 ms
+# Perf:         1737.346 GFLOP/s
 # Mean Relative Error compared to vendor BLAS: 3.045843413929106e-06
diff --git a/laser/openmp.nim b/laser/openmp.nim
index 092dfaa..21af751 100644
--- a/laser/openmp.nim
+++ b/laser/openmp.nim
@@ -119,8 +119,8 @@ template omp_parallel*(body: untyped): untyped =
   block: body
 
 template omp_parallel_if*(condition: bool, body: untyped) =
-  let predicate = condition # Make symbol valid and ensure it's lvalue
-  {.emit: "#pragma omp parallel if (`predicate`)".}
+  let predicate = condition # Make symbol valid and ensure it's an lvalue
+  {.emit: ["#pragma omp parallel if (",predicate,")"].}
   block: body
 
 template omp_for*(
diff --git a/nim.cfg b/nim.cfg
index 777d27b..580b183 100644
--- a/nim.cfg
+++ b/nim.cfg
@@ -8,10 +8,10 @@
   stackTrace:off
   threads:on
   @if macosx: # Default compiler on Mac is clang without OpenMP and gcc is an alias to clang.
-              # Use Homebrew GCC instead for OpenMP support. GCC (v7), must be properly linked via `brew link gcc`
+              # Use Homebrew GCC instead for OpenMP support. GCC (v8) must be properly linked via `brew link gcc`
     cc:"gcc"
-    gcc.exe:"/usr/local/bin/gcc-7"
-    gcc.linkerexe:"/usr/local/bin/gcc-7"
+    gcc.exe:"/usr/local/bin/gcc-8"
+    gcc.linkerexe:"/usr/local/bin/gcc-8"
   @end
 @end