From c1667e1db74eeac0788238874e82963ae0b6cb24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Thu, 24 Oct 2019 10:48:54 +0200 Subject: [PATCH] Update with Manu benchmarks and workaround https://github.com/numforge/laser/issues/40 --- benchmarks/gemm/gemm_bench_float32.nim | 35 ++++---- benchmarks/gemm/gemm_bench_float64.nim | 106 +++++++++++++++---------- benchmarks/third_party/manu | 1 + 3 files changed, 80 insertions(+), 62 deletions(-) create mode 160000 benchmarks/third_party/manu diff --git a/benchmarks/gemm/gemm_bench_float32.nim b/benchmarks/gemm/gemm_bench_float32.nim index 2707f8a..30ba846 100644 --- a/benchmarks/gemm/gemm_bench_float32.nim +++ b/benchmarks/gemm/gemm_bench_float32.nim @@ -335,6 +335,7 @@ when isMainModule: echo &"Theoretical peak single-core: {TheoSerialPeak:>9.3f} GFLOP/s" echo &"Theoretical peak multi: {TheoThreadedPeak:>9.3f} GFLOP/s" echo "Make sure to not bench Apple Accelerate or the default Linux BLAS." + echo "Due to strange OpenMP interferences, separate the run of code-sections using OpenMP, see https://github.com/numforge/laser/issues/40" block: let a = newSeqWith(M*K, float32 rand(-0.1..0.1)) let b = newSeqWith(K*N, float32 rand(-0.1..0.1)) @@ -342,32 +343,32 @@ when isMainModule: # let reference = benchReference(a, b, NbSamples) # let simpleTiling = benchSimpleTiling(a, b, NbSamples) # let arraymancer = benchArraymancerFallback(a, b, NbSamples) - let vendorBlas = benchOpenBLAS(a, b, NbSamples) let laser = benchLaserGEMM(a, b, NbSamples) - let glow = benchPyTorchGlow(a, b, NbSamples) - let mkldnnref = benchMkldnnRef(a, b, NbSamples) - let mkldnnjitavx = benchMkldnnJitAVX(a, b, NbSamples) - let mkldnnjitavx512 = benchMkldnnJitAVX512(a, b, NbSamples) + # let vendorBlas = benchOpenBLAS(a, b, NbSamples) + # let glow = benchPyTorchGlow(a, b, NbSamples) + # let mkldnnref = benchMkldnnRef(a, b, NbSamples) + # let mkldnnjitavx = benchMkldnnJitAVX(a, b, NbSamples) + # let mkldnnjitavx512 = 
benchMkldnnJitAVX512(a, b, NbSamples) - block: - # var error = mean_relative_error(vendorBlas, reference) - # echo "Mean Relative Error of OpenBLAS vs reference: ", error - # doAssert error <= 1e-5'f32, $error + # block: + # # var error = mean_relative_error(vendorBlas, reference) + # # echo "Mean Relative Error of OpenBLAS vs reference: ", error + # # doAssert error <= 1e-5'f32, $error - # error = mean_relative_error(challenger, reference) - # echo "Mean Relative Error compared to Reference: ", error - # doAssert error <= 1e-5'f32, $error + # # error = mean_relative_error(challenger, reference) + # # echo "Mean Relative Error compared to Reference: ", error + # # doAssert error <= 1e-5'f32, $error - var error = mean_relative_error(vendorBlas, laser) - echo "Mean Relative Error compared to vendor BLAS: ", error - doAssert error <= 1e-5'f32, $error + # var error = mean_relative_error(vendorBlas, laser) + # echo "Mean Relative Error compared to vendor BLAS: ", error + # doAssert error <= 1e-5'f32, $error # Seems like my original Arraymancer BLAS has false sharing issue # FYI Apple accelerate is about 117~122GFLOP/s on my machine. 
############################### # Compilation command -# $ nim cpp -r -d:release -d:openmp -o:build/bench_gemm benchmarks/gemm/gemm_bench_float32.nim +# $ nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float32.nim # Don't forget to add OpenBLAS in your path: # For example on Mac with OpenBLAS from Homebrew @@ -379,8 +380,6 @@ when isMainModule: # i9_9980XE Skylake-X 18 cores overclocked 4.1 GHz all-turbo, 4.0 GHz AVX turbo, 3.5 GHz AVX512 turbo # PyTorch Glow compiled with AVX2 as AVX512 is slower -# nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float32.nim - # A matrix shape: (M: 1920, N: 1920) # B matrix shape: (M: 1920, N: 1920) # Output shape: (M: 1920, N: 1920) diff --git a/benchmarks/gemm/gemm_bench_float64.nim b/benchmarks/gemm/gemm_bench_float64.nim index 4077096..a8c837b 100644 --- a/benchmarks/gemm/gemm_bench_float64.nim +++ b/benchmarks/gemm/gemm_bench_float64.nim @@ -49,6 +49,8 @@ import ./arraymancer/blas_l3_gemm, ../../laser/primitives/matrix_multiplication/gemm +import ../third_party/manu/manu/matrix as manu + const M = 8*6*20 K = 8*6*20 @@ -145,6 +147,22 @@ proc benchLaserGEMM(a, b: seq[float64], nb_samples: int) = b_ptr, N, 1, 0'f64, c_ptr, N, 1 ) + +proc benchManu(a, b: seq[float64], nb_samples: int) = + let Amat = manu.matrix(a, M) + let Bmat = manu.matrix(N, b) + var C: manu.Matrix + # let output = C.data.addr # data is not exposed :/ + var output: array[1, float64] # The bench displays the first item for sanity checks + + bench("Manu implementation"): + # No initialization needed, Manu doesn't work in-place + discard + do: + # Main work + C = Amat * Bmat + output[0] = C[0, 0] + # ########################################### when defined(fast_math): @@ -170,6 +188,7 @@ when isMainModule: echo &"Theoretical peak single-core: {CpuGhz * CpuFlopCycle:>9.3f} GFLOP/s" echo &"Theoretical peak multi: {CpuGhz * CpuFlopCycle * NumCpuCores:>9.3f} GFLOP/s" echo "Make sure to not bench 
Apple Accelerate or the default Linux BLAS." + echo "Due to strange OpenMP interferences, separate the run of code-sections using OpenMP, see https://github.com/numforge/laser/issues/40" block: let a = newSeqWith(M*K, float64 rand(1.0)) let b = newSeqWith(K*N, float64 rand(1.0)) @@ -177,16 +196,18 @@ when isMainModule: # when not defined(openmp): # benchSimpleTiling(a, b, NbSamples) # for some reason stalled with OpenMP # benchArraymancerFallback(a, b, NbSamples) - benchOpenBLAS(a, b, NbSamples) + # benchOpenBLAS(a, b, NbSamples) benchLaserGEMM(a, b, NbSamples) + benchManu(a, b, NbSamples) # Seems like my original Arraymancer BLAS has false sharing issue -# FYI Apple accelerate is about 117~122GFLOP/s on my machine. ############################### # OpenMP +# Due to strange OpenMP interferences, OpenMP code sections should be run independently +# see https://github.com/numforge/laser/issues/40 -# Warmup: 1.1890 s, result 224 (displayed to avoid compiler optimizing warmup away) +# Run 1: OpenBLAS vs Manu # A matrix shape: (M: 960, N: 960) # B matrix shape: (M: 960, N: 960) @@ -197,33 +218,30 @@ when isMainModule: # Theoretical peak single-core: 43.200 GFLOP/s # Theoretical peak multi: 86.400 GFLOP/s # Make sure to not bench Apple Accelerate or the default Linux BLAS. 
- +# # OpenBLAS benchmark -# Collected 10 samples in 0.315 seconds -# Average time: 31.429 ms -# Stddev time: 2.503 ms -# Min time: 29.868 ms -# Max time: 37.536 ms -# Perf: 56.300 GFLOP/s - +# Collected 10 samples in 0.056 seconds +# Average time: 5.589 ms +# Stddev time: 6.702 ms +# Min time: 3.004 ms +# Max time: 24.487 ms +# Perf: 316.588 GFLOP/s +# # Display output[0] to make sure it's not optimized away # 232.3620566397699 - -# Laser production implementation -# Collected 10 samples in 0.327 seconds -# Average time: 32.625 ms -# Stddev time: 3.080 ms -# Min time: 31.182 ms -# Max time: 41.327 ms -# Perf: 54.236 GFLOP/s - +# +# Manu implementation +# Collected 10 samples in 8.470 seconds +# Average time: 846.977 ms +# Stddev time: 0.884 ms +# Min time: 845.685 ms +# Max time: 848.072 ms +# Perf: 2.089 GFLOP/s +# # Display output[0] to make sure it's not optimized away -# 232.36205663977 - -############################### -# Serial +# 237.8399578000516 -# Warmup: 1.1948 s, result 224 (displayed to avoid compiler optimizing warmup away) +# Run 2: Laser vs Manu # A matrix shape: (M: 960, N: 960) # B matrix shape: (M: 960, N: 960) @@ -234,25 +252,25 @@ when isMainModule: # Theoretical peak single-core: 43.200 GFLOP/s # Theoretical peak multi: 86.400 GFLOP/s # Make sure to not bench Apple Accelerate or the default Linux BLAS. 
- -# OpenBLAS benchmark -# Collected 10 samples in 0.566 seconds -# Average time: 56.528 ms -# Stddev time: 2.482 ms -# Min time: 55.359 ms -# Max time: 63.552 ms -# Perf: 31.303 GFLOP/s - -# Display output[0] to make sure it's not optimized away -# 232.3620566397699 - +# # Laser production implementation -# Collected 10 samples in 0.531 seconds -# Average time: 53.075 ms -# Stddev time: 1.592 ms -# Min time: 51.679 ms -# Max time: 55.885 ms -# Perf: 33.339 GFLOP/s - +# Collected 10 samples in 0.053 seconds +# Average time: 5.270 ms +# Stddev time: 9.205 ms +# Min time: 2.245 ms +# Max time: 31.464 ms +# Perf: 335.751 GFLOP/s +# # Display output[0] to make sure it's not optimized away # 232.36205663977 +# +# Manu implementation +# Collected 10 samples in 8.503 seconds +# Average time: 850.315 ms +# Stddev time: 0.787 ms +# Min time: 848.843 ms +# Max time: 850.849 ms +# Perf: 2.081 GFLOP/s +# +# Display output[0] to make sure it's not optimized away +# 237.8399578000516 diff --git a/benchmarks/third_party/manu b/benchmarks/third_party/manu new file mode 160000 index 0000000..17aa803 --- /dev/null +++ b/benchmarks/third_party/manu @@ -0,0 +1 @@ +Subproject commit 17aa803dc4efefb12bc46031ae6df6a84267f5d1