Skip to content

Commit

Permalink
Merge #821
Browse files Browse the repository at this point in the history
821: make GC deterministic in distributed r=simonbyrne a=simonbyrne

# PULL REQUEST

## Purpose and Content
This should reduce MPI Waitall time by manually triggering the GC across all processes at the same time.

## Benefits and Risks
The number of steps between GC checks (currently 1000) will require some tuning to avoid out-of-memory errors.

## Linked Issues
- Item 3 of #635 
- Mentioned in #686
- Supersedes #687


## PR Checklist
- [x] This PR has a corresponding issue OR is linked to an SDI.
- [x] I have followed CliMA's codebase [contribution](https://clima.github.io/ClimateMachine.jl/latest/Contributing/) and [style](https://clima.github.io/ClimateMachine.jl/latest/DevDocs/CodeStyle/) guidelines OR N/A.
- [x] I have followed CliMA's [documentation policy](https://github.com/CliMA/policies/wiki/Documentation-Policy).
- [x] I have checked all issues and PRs and I certify that this PR does not duplicate an open PR.
- [x] I linted my code on my local machine prior to submission OR N/A.
- [x] Unit tests are included OR N/A.
- [x] Code used in an integration test OR N/A.
- [x] All tests ran successfully on my local machine OR N/A.
- [x] All classes, modules, and function contain docstrings OR N/A.
- [x] Documentation has been added/updated OR N/A.


Co-authored-by: Simon Byrne <[email protected]>
  • Loading branch information
bors[bot] and simonbyrne authored Oct 3, 2022
2 parents b657535 + c192129 commit 9d5d78e
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .buildkite/scaling/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ FT="Float32"
resolutions=("low" "mid" "high")
max_procs_per_node=16 # limit this artificially for profiling
profiling=enable
exclusive=false
exclusive=true
mpi_impl="openmpi"

# set up environment and agents
Expand Down
24 changes: 24 additions & 0 deletions examples/hybrid/callbacks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,18 @@ function get_callbacks(parsed_args, simulation, model_spec, params)
else
call_every_dt(save_restart_func, dt_save_restart)
end

gc_callback = if simulation.is_distributed
call_every_n_steps(gc_func, 1000)
else
nothing
end

return ODE.CallbackSet(
dss_cb,
save_to_disk_callback,
save_restart_callback,
gc_callback,
additional_callbacks...,
)
end
Expand Down Expand Up @@ -395,3 +403,19 @@ function save_restart_func(integrator)
Base.close(hdfwriter)
return nothing
end

"""
    gc_func(integrator)

Check how much system memory is free and, if the smallest free fraction
reported by any MPI rank is below 20%, run a garbage collection.

The free-memory fraction is reduced with `min` over `comms_ctx.mpicomm`, so
every rank sees the same value and takes the same branch. `comms_ctx` is not an
argument: it is read from the enclosing scope and must be defined before this
function is called.

# Arguments
- `integrator`: accepted so the function has the one-argument callback
  signature; it is not used.

# Returns
`nothing`.
"""
function gc_func(integrator)
    free_mem = Sys.free_memory()
    total_mem = Sys.total_memory()
    p_free_mem = free_mem / total_mem
    # Collective reduction: all ranks receive the minimum free fraction, which
    # keeps the decision below identical on every rank.
    min_p_free_mem =
        ClimaCommsMPI.MPI.Allreduce(p_free_mem, min, comms_ctx.mpicomm)
    do_gc = min_p_free_mem < 0.2
    @info "GC check" "free mem (MB)" = free_mem / 2^20 "total mem (MB)" =
        total_mem / 2^20 "Minimum free memory (%)" = min_p_free_mem * 100 "Calling GC" =
        do_gc
    if do_gc
        # A caller may have switched the collector off with `GC.enable(false)`;
        # while it is off, `GC.gc()` returns without collecting. Turn it on for
        # the duration of the collection and then restore whatever state the
        # caller had (`GC.enable` returns the previous state).
        was_enabled = GC.enable(true)
        try
            GC.gc()
        finally
            GC.enable(was_enabled)
        end
    end
    return nothing
end
3 changes: 3 additions & 0 deletions examples/hybrid/driver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ end
@info "Running job:`$(simulation.job_id)`"
if simulation.is_distributed
OrdinaryDiffEq.step!(integrator)
GC.enable(false)
GC.gc()
ClimaComms.barrier(comms_ctx)
if ClimaComms.iamroot(comms_ctx)
@timev begin
Expand All @@ -256,6 +258,7 @@ if simulation.is_distributed
walltime = @elapsed sol = OrdinaryDiffEq.solve!(integrator)
end
ClimaComms.barrier(comms_ctx)
GC.enable(true)
else
sol = @timev OrdinaryDiffEq.solve!(integrator)
end
Expand Down

0 comments on commit 9d5d78e

Please sign in to comment.