Update with Manu benchmarks and workaround #40
mratsim committed Oct 24, 2019
1 parent c7ddceb commit c1667e1
Showing 3 changed files with 80 additions and 62 deletions.
35 changes: 17 additions & 18 deletions benchmarks/gemm/gemm_bench_float32.nim
@@ -335,39 +335,40 @@ when isMainModule:
echo &"Theoretical peak single-core: {TheoSerialPeak:>9.3f} GFLOP/s"
echo &"Theoretical peak multi: {TheoThreadedPeak:>9.3f} GFLOP/s"
echo "Make sure to not bench Apple Accelerate or the default Linux BLAS."
echo "Due to strange OpenMP interferences, separate the run of code-sections using OpenMP, see https://github.com/numforge/laser/issues/40"
block:
let a = newSeqWith(M*K, float32 rand(-0.1..0.1))
let b = newSeqWith(K*N, float32 rand(-0.1..0.1))

# let reference = benchReference(a, b, NbSamples)
# let simpleTiling = benchSimpleTiling(a, b, NbSamples)
# let arraymancer = benchArraymancerFallback(a, b, NbSamples)
let vendorBlas = benchOpenBLAS(a, b, NbSamples)
let laser = benchLaserGEMM(a, b, NbSamples)
let glow = benchPyTorchGlow(a, b, NbSamples)
let mkldnnref = benchMkldnnRef(a, b, NbSamples)
let mkldnnjitavx = benchMkldnnJitAVX(a, b, NbSamples)
let mkldnnjitavx512 = benchMkldnnJitAVX512(a, b, NbSamples)
# let vendorBlas = benchOpenBLAS(a, b, NbSamples)
# let glow = benchPyTorchGlow(a, b, NbSamples)
# let mkldnnref = benchMkldnnRef(a, b, NbSamples)
# let mkldnnjitavx = benchMkldnnJitAVX(a, b, NbSamples)
# let mkldnnjitavx512 = benchMkldnnJitAVX512(a, b, NbSamples)

block:
# var error = mean_relative_error(vendorBlas, reference)
# echo "Mean Relative Error of OpenBLAS vs reference: ", error
# doAssert error <= 1e-5'f32, $error
# block:
# # var error = mean_relative_error(vendorBlas, reference)
# # echo "Mean Relative Error of OpenBLAS vs reference: ", error
# # doAssert error <= 1e-5'f32, $error

# error = mean_relative_error(challenger, reference)
# echo "Mean Relative Error compared to Reference: ", error
# doAssert error <= 1e-5'f32, $error
# # error = mean_relative_error(challenger, reference)
# # echo "Mean Relative Error compared to Reference: ", error
# # doAssert error <= 1e-5'f32, $error

var error = mean_relative_error(vendorBlas, laser)
echo "Mean Relative Error compared to vendor BLAS: ", error
doAssert error <= 1e-5'f32, $error
# var error = mean_relative_error(vendorBlas, laser)
# echo "Mean Relative Error compared to vendor BLAS: ", error
# doAssert error <= 1e-5'f32, $error

# Seems like my original Arraymancer BLAS has a false sharing issue
# FYI Apple Accelerate is about 117~122 GFLOP/s on my machine.

###############################
# Compilation command
# $ nim cpp -r -d:release -d:openmp -o:build/bench_gemm benchmarks/gemm/gemm_bench_float32.nim
# $ nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float32.nim

# Don't forget to add OpenBLAS in your path:
# For example on Mac with OpenBLAS from Homebrew
@@ -379,8 +380,6 @@ when isMainModule:
# i9_9980XE Skylake-X 18 cores overclocked 4.1 GHz all-turbo, 4.0 GHz AVX turbo, 3.5 GHz AVX512 turbo
# PyTorch Glow compiled with AVX2 as AVX512 is slower

# nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float32.nim

# A matrix shape: (M: 1920, N: 1920)
# B matrix shape: (M: 1920, N: 1920)
# Output shape: (M: 1920, N: 1920)
106 changes: 62 additions & 44 deletions benchmarks/gemm/gemm_bench_float64.nim
@@ -49,6 +49,8 @@ import
./arraymancer/blas_l3_gemm,
../../laser/primitives/matrix_multiplication/gemm

import ../third_party/manu/manu/matrix as manu

const
M = 8*6*20
K = 8*6*20
@@ -145,6 +147,22 @@ proc benchLaserGEMM(a, b: seq[float64], nb_samples: int) =
b_ptr, N, 1,
0'f64, c_ptr, N, 1
)
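# ^ argument layout note (laser's gemm_strided, row-major data):
#   b_ptr is passed with row stride N and column stride 1,
#   beta = 0'f64 so C is overwritten rather than accumulated,
#   c_ptr likewise has row stride N and column stride 1.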

proc benchManu(a, b: seq[float64], nb_samples: int) =
let Amat = manu.matrix(a, M)
let Bmat = manu.matrix(N, b)
var C: manu.Matrix
# let output = C.data.addr # data is not exposed :/
var output: array[1, float64] # The bench displays the first item for sanity checks

bench("Manu implementation"):
# No initialization needed, Manu doesn't work in-place
discard
do:
# Main work
C = Amat * Bmat
output[0] = C[0, 0]
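# For reference, a minimal standalone sketch of the Manu usage pattern above.
# NB: how matrix(data, m) and matrix(n, data) pack the seq is inferred from
# the shapes in benchManu — check the manu submodule for the exact semantics.
#
#   import ../third_party/manu/manu/matrix as manu
#   let
#     x = @[1.0, 2.0, 3.0, 4.0]    # 2x2 input, packed
#     y = @[5.0, 6.0, 7.0, 8.0]
#     A = manu.matrix(x, 2)        # seq plus row count, as for Amat above
#     B = manu.matrix(2, y)        # count plus seq, as for Bmat above
#     C = A * B                    # allocates a fresh matrix, no in-place GEMM
#   echo C[0, 0]                   # touch the result so it is not optimized away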

# ###########################################

when defined(fast_math):
@@ -170,23 +188,26 @@ when isMainModule:
echo &"Theoretical peak single-core: {CpuGhz * CpuFlopCycle:>9.3f} GFLOP/s"
echo &"Theoretical peak multi: {CpuGhz * CpuFlopCycle * NumCpuCores:>9.3f} GFLOP/s"
echo "Make sure to not bench Apple Accelerate or the default Linux BLAS."
echo "Due to strange OpenMP interferences, separate the run of code-sections using OpenMP, see https://github.com/numforge/laser/issues/40"
block:
let a = newSeqWith(M*K, float64 rand(1.0))
let b = newSeqWith(K*N, float64 rand(1.0))

# when not defined(openmp):
# benchSimpleTiling(a, b, NbSamples) # for some reason stalled with OpenMP
# benchArraymancerFallback(a, b, NbSamples)
benchOpenBLAS(a, b, NbSamples)
# benchOpenBLAS(a, b, NbSamples)
benchLaserGEMM(a, b, NbSamples)
benchManu(a, b, NbSamples)

# Seems like my original Arraymancer BLAS has a false sharing issue
# FYI Apple Accelerate is about 117~122 GFLOP/s on my machine.

###############################
# OpenMP
# Due to strange OpenMP interferences, OpenMP code sections should be run independently
# see https://github.com/numforge/laser/issues/40
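# Concretely, the workaround is two compilations of this same file, each with
# a different OpenMP-using benchmark left active (the toggled calls in
# isMainModule above); the pairing mirrors the two runs recorded below:
#
#   Run 1: benchOpenBLAS + benchManu active, benchLaserGEMM commented out
#   $ nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float64.nim
#   Run 2: benchLaserGEMM + benchManu active, benchOpenBLAS commented out
#   $ nim cpp -r -d:release -d:danger -d:openmp --outdir:build benchmarks/gemm/gemm_bench_float64.nim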

# Warmup: 1.1890 s, result 224 (displayed to avoid compiler optimizing warmup away)
# Run 1: OpenBLAS vs Manu

# A matrix shape: (M: 960, N: 960)
# B matrix shape: (M: 960, N: 960)
@@ -197,33 +218,30 @@ when isMainModule:
# Theoretical peak single-core: 43.200 GFLOP/s
# Theoretical peak multi: 86.400 GFLOP/s
# Make sure to not bench Apple Accelerate or the default Linux BLAS.

#
# OpenBLAS benchmark
# Collected 10 samples in 0.315 seconds
# Average time: 31.429 ms
# Stddev time: 2.503 ms
# Min time: 29.868 ms
# Max time: 37.536 ms
# Perf: 56.300 GFLOP/s

# Collected 10 samples in 0.056 seconds
# Average time: 5.589 ms
# Stddev time: 6.702 ms
# Min time: 3.004 ms
# Max time: 24.487 ms
# Perf: 316.588 GFLOP/s
#
# Display output[0] to make sure it's not optimized away
# 232.3620566397699

# Laser production implementation
# Collected 10 samples in 0.327 seconds
# Average time: 32.625 ms
# Stddev time: 3.080 ms
# Min time: 31.182 ms
# Max time: 41.327 ms
# Perf: 54.236 GFLOP/s

#
# Manu implementation
# Collected 10 samples in 8.470 seconds
# Average time: 846.977 ms
# Stddev time: 0.884 ms
# Min time: 845.685 ms
# Max time: 848.072 ms
# Perf: 2.089 GFLOP/s
#
# Display output[0] to make sure it's not optimized away
# 232.36205663977
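# Cross-checking the Perf figures above: a 960x960x960 GEMM needs
# 2*M*N*K = 2 * 960^3 ≈ 1.769e9 FLOPs, so GFLOP/s = 1.769 / average time:
#
#   let requiredGFlop = 2.0 * 960.0 * 960.0 * 960.0 / 1e9  # ≈ 1.769
#   echo requiredGFlop / 31.429e-3   # ≈ 56.30 GFLOP/s (OpenBLAS above)
#   echo requiredGFlop / 846.977e-3  # ≈ 2.089 GFLOP/s (Manu above)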

###############################
# Serial
# 237.8399578000516

# Warmup: 1.1948 s, result 224 (displayed to avoid compiler optimizing warmup away)
# Run 2: Laser vs Manu

# A matrix shape: (M: 960, N: 960)
# B matrix shape: (M: 960, N: 960)
@@ -234,25 +252,25 @@ when isMainModule:
# Theoretical peak single-core: 43.200 GFLOP/s
# Theoretical peak multi: 86.400 GFLOP/s
# Make sure to not bench Apple Accelerate or the default Linux BLAS.

# OpenBLAS benchmark
# Collected 10 samples in 0.566 seconds
# Average time: 56.528 ms
# Stddev time: 2.482 ms
# Min time: 55.359 ms
# Max time: 63.552 ms
# Perf: 31.303 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 232.3620566397699

#
# Laser production implementation
# Collected 10 samples in 0.531 seconds
# Average time: 53.075 ms
# Stddev time: 1.592 ms
# Min time: 51.679 ms
# Max time: 55.885 ms
# Perf: 33.339 GFLOP/s

# Collected 10 samples in 0.053 seconds
# Average time: 5.270 ms
# Stddev time: 9.205 ms
# Min time: 2.245 ms
# Max time: 31.464 ms
# Perf: 335.751 GFLOP/s
#
# Display output[0] to make sure it's not optimized away
# 232.36205663977
#
# Manu implementation
# Collected 10 samples in 8.503 seconds
# Average time: 850.315 ms
# Stddev time: 0.787 ms
# Min time: 848.843 ms
# Max time: 850.849 ms
# Perf: 2.081 GFLOP/s
#
# Display output[0] to make sure it's not optimized away
# 237.8399578000516
1 change: 1 addition & 0 deletions benchmarks/third_party/manu
Submodule manu added at 17aa80
