Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial support for NVidia GPUs #5

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ target_link_libraries(_sharpy PRIVATE
${imex_dialect_libs}
${imex_conversion_libs}
IMEXTransforms
MLIRCopyOpInterface
IMEXUtil
LLVMX86CodeGen
LLVMX86AsmParser
Expand Down
26 changes: 17 additions & 9 deletions examples/black_scholes.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,15 +246,23 @@ def eval():
info(f"Median rate: {perf_rate:.5f} Mopts/s")

# verify
call, put = args[-2], args[-1]
expected_call = 16.976097804669887
expected_put = 0.34645174725098116
call_value = float(call[0])
put_value = float(put[0])
assert numpy.allclose(call_value, expected_call)
assert numpy.allclose(put_value, expected_put)

info("SUCCESS")
if device:
# FIXME gpu.memcpy to host requires identity layout
# FIXME reduction on gpu
# call = args[-2].to_device()
# put = args[-1].to_device()
pass
else:
call = args[-2]
put = args[-1]
expected_call = 16.976097804669887
expected_put = 0.34645174725098116
call_value = float(call[0])
put_value = float(put[0])
assert numpy.allclose(call_value, expected_call)
assert numpy.allclose(put_value, expected_put)
info("SUCCESS")

fini()


Expand Down
140 changes: 79 additions & 61 deletions examples/shallow_water.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,10 @@ def ind_arr(shape, columns=False):
"""Construct an (nx, ny) array where each row/col is an arange"""
nx, ny = shape
if columns:
ind = np.arange(0, nx * ny, 1, dtype=np.int32) % nx
ind = np.arange(0, nx * ny, 1, dtype=np.int64) % nx
ind = transpose(np.reshape(ind, (ny, nx)))
else:
ind = np.arange(0, nx * ny, 1, dtype=np.int32) % ny
ind = np.arange(0, nx * ny, 1, dtype=np.int64) % ny
ind = np.reshape(ind, (nx, ny))
return ind.astype(dtype)

Expand Down Expand Up @@ -156,7 +156,7 @@ def ind_arr(shape, columns=False):
q = create_full(F_shape, 0.0, dtype)

# bathymetry
h = create_full(T_shape, 0.0, dtype)
h = create_full(T_shape, 1.0, dtype) # HACK init with 1

hu = create_full(U_shape, 0.0, dtype)
hv = create_full(V_shape, 0.0, dtype)
Expand Down Expand Up @@ -205,13 +205,15 @@ def bathymetry(x_t_2d, y_t_2d, lx, ly):
return bath * create_full(T_shape, 1.0, dtype)

# set bathymetry
h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly)
# h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device)
# steady state potential energy
pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny
# pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny
pe_offset = 0.5 * g * float(1.0) / nx / ny

# compute time step
alpha = 0.5
h_max = float(np.max(h, all_axes))
# h_max = float(np.max(h, all_axes))
h_max = float(1.0)
c = (g * h_max) ** 0.5
dt = alpha * dx / c
dt = t_export / int(math.ceil(t_export / dt))
Expand Down Expand Up @@ -328,9 +330,9 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
u0, v0, e0 = exact_solution(
0, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d
)
e[:, :] = e0
u[:, :] = u0
v[:, :] = v0
e[:, :] = e0.to_device(device)
u[:, :] = u0.to_device(device)
v[:, :] = v0.to_device(device)

t = 0
i_export = 0
Expand All @@ -344,30 +346,41 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
t = i * dt

if t >= next_t_export - 1e-8:
_elev_max = np.max(e, all_axes)
_u_max = np.max(u, all_axes)
_q_max = np.max(q, all_axes)
_total_v = np.sum(e + h, all_axes)

# potential energy
_pe = 0.5 * g * (e + h) * (e - h) + pe_offset
_total_pe = np.sum(_pe, all_axes)

# kinetic energy
u2 = u * u
v2 = v * v
u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :])
v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1])
_ke = 0.5 * (u2_at_t + v2_at_t) * (e + h)
_total_ke = np.sum(_ke, all_axes)

total_pe = float(_total_pe) * dx * dy
total_ke = float(_total_ke) * dx * dy
total_e = total_ke + total_pe
elev_max = float(_elev_max)
u_max = float(_u_max)
q_max = float(_q_max)
total_v = float(_total_v) * dx * dy
if device:
# FIXME gpu.memcpy to host requires identity layout
# FIXME reduction on gpu
elev_max = 0
u_max = 0
q_max = 0
diff_e = 0
diff_v = 0
total_pe = 0
total_ke = 0
else:
_elev_max = np.max(e, all_axes)
_u_max = np.max(u, all_axes)
_q_max = np.max(q, all_axes)
_total_v = np.sum(e + h, all_axes)

# potential energy
_pe = 0.5 * g * (e + h) * (e - h) + pe_offset
_total_pe = np.sum(_pe, all_axes)

# kinetic energy
u2 = u * u
v2 = v * v
u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :])
v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1])
_ke = 0.5 * (u2_at_t + v2_at_t) * (e + h)
_total_ke = np.sum(_ke, all_axes)

total_pe = float(_total_pe) * dx * dy
total_ke = float(_total_ke) * dx * dy
total_e = total_ke + total_pe
elev_max = float(_elev_max)
u_max = float(_u_max)
q_max = float(_q_max)
total_v = float(_total_v) * dx * dy

if i_export == 0:
initial_v = total_v
Expand Down Expand Up @@ -402,35 +415,40 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
duration = time_mod.perf_counter() - tic
info(f"Duration: {duration:.2f} s")

e_exact = exact_solution(t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d)[
2
]
err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly
err_L2 = math.sqrt(float(np.sum(err2, all_axes)))
info(f"L2 error: {err_L2:7.15e}")

if nx < 128 or ny < 128:
info("Skipping correctness test due to small problem size.")
elif not benchmark_mode:
tolerance_ene = 1e-7 if datatype == "f32" else 1e-9
assert (
diff_e < tolerance_ene
), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}"
if nx == 128 and ny == 128:
if datatype == "f32":
assert numpy.allclose(
err_L2, 4.3127859e-05, rtol=1e-5
), "L2 error does not match"
else:
assert numpy.allclose(
err_L2, 4.315799035627906e-05
), "L2 error does not match"
else:
tolerance_l2 = 1e-4
if device:
# FIXME gpu.memcpy to host requires identity layout
# FIXME reduction on gpu
pass
else:
e_exact = exact_solution(
t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d
)[2]
err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly
err_L2 = math.sqrt(float(np.sum(err2, all_axes)))
info(f"L2 error: {err_L2:7.15e}")

if nx < 128 or ny < 128:
info("Skipping correctness test due to small problem size.")
elif not benchmark_mode:
tolerance_ene = 1e-7 if datatype == "f32" else 1e-9
assert (
err_L2 < tolerance_l2
), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}"
info("SUCCESS")
diff_e < tolerance_ene
), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}"
if nx == 128 and ny == 128:
if datatype == "f32":
assert numpy.allclose(
err_L2, 4.3127859e-05, rtol=1e-5
), "L2 error does not match"
else:
assert numpy.allclose(
err_L2, 4.315799035627906e-05
), "L2 error does not match"
else:
tolerance_l2 = 1e-4
assert (
err_L2 < tolerance_l2
), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}"
info("SUCCESS")

fini()

Expand Down
36 changes: 27 additions & 9 deletions examples/wave_equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,19 +115,21 @@ def ind_arr(shape, columns=False):
"""Construct an (nx, ny) array where each row/col is an arange"""
nx, ny = shape
if columns:
ind = np.arange(0, nx * ny, 1, dtype=np.int32) % nx
ind = np.arange(0, nx * ny, 1, dtype=np.int64) % nx
ind = transpose(np.reshape(ind, (ny, nx)))
else:
ind = np.arange(0, nx * ny, 1, dtype=np.int32) % ny
ind = np.arange(0, nx * ny, 1, dtype=np.int64) % ny
ind = np.reshape(ind, (nx, ny))
return ind.astype(dtype)

# coordinate arrays
T_shape = (nx, ny)
U_shape = (nx + 1, ny)
V_shape = (nx, ny + 1)
sync()
x_t_2d = xmin + ind_arr(T_shape, True) * dx + dx / 2
y_t_2d = ymin + ind_arr(T_shape) * dy + dy / 2
sync()

dofs_T = int(numpy.prod(numpy.asarray(T_shape)))
dofs_U = int(numpy.prod(numpy.asarray(U_shape)))
Expand All @@ -151,6 +153,8 @@ def ind_arr(shape, columns=False):
u2 = create_full(U_shape, 0.0, dtype)
v2 = create_full(V_shape, 0.0, dtype)

sync()

def exact_elev(t, x_t_2d, y_t_2d, lx, ly):
"""
Exact solution for elevation field.
Expand Down Expand Up @@ -224,7 +228,7 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
sync()

# initial solution
e[:, :] = exact_elev(0.0, x_t_2d, y_t_2d, lx, ly)
e[:, :] = exact_elev(0.0, x_t_2d, y_t_2d, lx, ly).to_device(device)
u[:, :] = create_full(U_shape, 0.0, dtype)
v[:, :] = create_full(V_shape, 0.0, dtype)
sync()
Expand All @@ -240,9 +244,15 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
t = i * dt

if t >= next_t_export - 1e-8:
_elev_max = np.max(e, all_axes)
_u_max = np.max(u, all_axes)
_total_v = np.sum(e + h, all_axes)
sync()
H_tmp = e + h
sync()
_elev_max = np.max(e, all_axes).to_device()
# NOTE max(u) segfaults, shape (n+1, n) too large for tiling
_u_max = np.max(u[1:, :], all_axes).to_device()
_total_v = np.sum(H_tmp, all_axes).to_device()
# NOTE this segfaults
# _total_v = np.sum(e + h, all_axes).to_device() # segfaults

elev_max = float(_elev_max)
u_max = float(_u_max)
Expand Down Expand Up @@ -277,12 +287,20 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2):
duration = time_mod.perf_counter() - tic
info(f"Duration: {duration:.2f} s")

e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly)
e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly).to_device(device)
err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly
err_L2 = math.sqrt(float(np.sum(err2, all_axes)))
err2sum = np.sum(err2, all_axes).to_device()
sync()
# e_host = e.to_device()
# sync()
# e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly).to_device()
# err2 = (e_exact - e_host) * (e_exact - e_host) * dx * dy / lx / ly
# err2sum = np.sum(err2, all_axes)
sync()
err_L2 = math.sqrt(float(err2sum))
info(f"L2 error: {err_L2:7.5e}")

if nx == 128 and ny == 128 and not benchmark_mode:
if nx == 128 and ny == 128 and not benchmark_mode and not device:
if datatype == "f32":
assert numpy.allclose(err_L2, 7.2235471e-03, rtol=1e-4)
else:
Expand Down
Loading