-
Notifications
You must be signed in to change notification settings - Fork 26
Update benchmark_step.jl for CUDA benchmarking with useful kernel names
#4055
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 49 commits
abe1f57
cbad8ac
df5f349
606f584
f369d6c
e4ce7a2
a521070
04e0454
8a931f5
215255f
0b362e7
cba2694
6620982
3dcca85
b228a35
4439d43
f36f405
a09bbc0
317dca8
1153ca3
7aa7dea
f4429ff
57fba6c
b073672
3824cde
cb6bc31
e41ea59
efb94cf
ac635a4
90ccf5b
63e213d
512471e
175164c
7dd4645
88cc058
7e959cd
77ef5e8
d7befca
5559604
adde261
427b8db
ad20c71
39c6438
395a0f0
e92a8a5
b6ca744
df705c4
5adf780
d7c6177
1f431bc
d847d5d
5fed428
8616282
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,7 @@ steps: | |
| - label: "init :computer:" | ||
| key: "init_cpu_env" | ||
| concurrency: 1 | ||
| concurrency_group: 'depot/climaatmos-ci' | ||
| concurrency_group: "depot/climaatmos-ci" | ||
| command: | ||
| - "echo $$JULIA_DEPOT_PATH" | ||
|
|
||
|
|
@@ -41,13 +41,11 @@ steps: | |
|
|
||
| - group: "Reproducibility infrastructure" | ||
| steps: | ||
|
|
||
| - label: ":computer: Test reproducibility infrastructure" | ||
| command: "julia --color=yes --project=.buildkite test/unit_reproducibility_infra.jl" | ||
|
|
||
| - group: "Radiation" | ||
| steps: | ||
|
|
||
| - label: ":computer: single column radiative equilibrium gray" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -98,7 +96,6 @@ steps: | |
|
|
||
| - group: "Gravity wave" | ||
| steps: | ||
|
|
||
| - label: ":computer: non-orographic gravity wave parameterization unit test 3d" | ||
| command: "julia --color=yes --project=.buildkite test/parameterized_tendencies/gravity_wave/non_orographic_gravity_wave/nogw_test_3d.jl" | ||
| artifact_paths: "nonorographic_gravity_wave_test_3d/*" | ||
|
|
@@ -132,7 +129,6 @@ steps: | |
|
|
||
| - group: "Column Examples" | ||
| steps: | ||
|
|
||
| - label: ":computer: single column hydrostatic balance float64" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -149,7 +145,6 @@ steps: | |
|
|
||
| - group: "Box Examples" | ||
| steps: | ||
|
|
||
| - label: ":computer: Box hydrostatic balance" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -188,7 +183,6 @@ steps: | |
|
|
||
| - group: "Plane Examples" | ||
| steps: | ||
|
|
||
| - label: ":computer: Density current experiment" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -287,7 +281,6 @@ steps: | |
|
|
||
| - group: "Conservation check" | ||
| steps: | ||
|
|
||
| - label: ":computer: baroclinic wave check conservation" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_conservation.yml | ||
|
|
@@ -324,7 +317,6 @@ steps: | |
|
|
||
| - group: "Sphere Examples (Dycore)" | ||
| steps: | ||
|
|
||
| - label: ":computer: hydrostatic balance float64" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -394,7 +386,6 @@ steps: | |
|
|
||
| - group: "Sphere Examples (Aquaplanet)" | ||
| steps: | ||
|
|
||
| - label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 1-moment" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -462,7 +453,6 @@ steps: | |
|
|
||
| - group: "Sphere Examples (Topography)" | ||
| steps: | ||
|
|
||
| - label: ":computer: baroclinic wave topography (dcmip)" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -489,7 +479,6 @@ steps: | |
|
|
||
| - group: "Restarting" | ||
| steps: | ||
|
|
||
| - label: ":computer: test restart" | ||
| command: > | ||
| julia --color=yes --project=.buildkite test/restart.jl | ||
|
|
@@ -554,7 +543,6 @@ steps: | |
|
|
||
| - group: "MPI Examples" | ||
| steps: | ||
|
|
||
| - label: ":computer: Prep restart for MPI" | ||
| key: "mpi_baro_wave_make_restart" | ||
| command: > | ||
|
|
@@ -624,7 +612,6 @@ steps: | |
|
|
||
| - group: "EDOnlyEDMFX" | ||
| steps: | ||
|
|
||
| - label: ":man_in_business_suit_levitating: EDOnly EDMFX aquaplanet" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -647,7 +634,6 @@ steps: | |
|
|
||
| - group: "Diagnostic EDMFX" | ||
| steps: | ||
|
|
||
| - label: ":genie: Diagnostic EDMFX test in a box" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -746,14 +732,12 @@ steps: | |
|
|
||
| - group: "Prognostic EDMFX" | ||
| steps: | ||
|
|
||
| - label: ":genie: Prognostic EDMFX advection test in a column" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
| --config_file $CONFIG_PATH/prognostic_edmfx_adv_test_column.yml | ||
| --job_id prognostic_edmfx_adv_test_column | ||
| artifact_paths: "prognostic_edmfx_adv_test_column/output_active/*" | ||
|
|
||
| agents: | ||
| slurm_mem: 20GB | ||
|
|
||
|
|
@@ -948,7 +932,6 @@ steps: | |
|
|
||
| - group: "Autodiff" | ||
| steps: | ||
|
|
||
| - label: "baroclinic wave moist check conservation float64 sparse autodiff" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_equil_conservation_ft64_sparse_autodiff.yml | ||
|
|
@@ -1044,7 +1027,6 @@ steps: | |
|
|
||
| - group: "GPU" | ||
| steps: | ||
|
|
||
| - label: "GPU: Gravity waves" | ||
| command: > | ||
| julia --color=yes --project=.buildkite .buildkite/ci_driver.jl | ||
|
|
@@ -1113,7 +1095,6 @@ steps: | |
| - "baroclinic_wave" | ||
| - "baroclinic_wave_gpu" | ||
|
|
||
|
|
||
| - label: "GPU: baroclinic wave - 2 gpus" | ||
| key: "baroclinic_wave_2gpu" | ||
| command: | ||
|
|
@@ -1233,7 +1214,6 @@ steps: | |
|
|
||
| - group: "Benchmarks" | ||
| steps: | ||
|
|
||
| - label: ":computer: Benchmark: CPU baroclinic wave moist" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this label is incorrect |
||
| command: > | ||
| julia --color=yes --project=.buildkite perf/benchmark_step.jl | ||
|
|
@@ -1251,6 +1231,7 @@ steps: | |
| artifact_paths: "bm_baroclinic_wave_moist_gpu/output_active/*" | ||
| env: | ||
| CLIMACOMMS_DEVICE: "CUDA" | ||
| CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" | ||
| agents: | ||
| slurm_mem: 16G | ||
| slurm_gpus: 1 | ||
|
|
@@ -1272,6 +1253,7 @@ steps: | |
| --job_id bm_default_gpu | ||
| env: | ||
| CLIMACOMMS_DEVICE: "CUDA" | ||
| CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" | ||
| agents: | ||
| slurm_mem: 24GB | ||
| slurm_gpus: 1 | ||
|
|
@@ -1284,6 +1266,7 @@ steps: | |
| --job_id bm_diag_edmf_gpu | ||
| env: | ||
| CLIMACOMMS_DEVICE: "CUDA" | ||
| CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" | ||
| agents: | ||
| slurm_mem: 24GB | ||
| slurm_gpus: 1 | ||
|
|
@@ -1296,13 +1279,13 @@ steps: | |
| --job_id bm_prog_edmf_gpu | ||
| env: | ||
| CLIMACOMMS_DEVICE: "CUDA" | ||
| CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" | ||
| agents: | ||
| slurm_mem: 24GB | ||
| slurm_gpus: 1 | ||
|
|
||
| - group: "Flame graphs" | ||
| steps: | ||
|
|
||
| - label: ":fire: Flame graph: gpu job" | ||
| command: > | ||
| julia --color=yes --project=.buildkite perf/flame.jl | ||
|
|
@@ -1420,7 +1403,6 @@ steps: | |
|
|
||
| - group: "Checkbounds/Inference/Invalidations" | ||
| steps: | ||
|
|
||
| # TODO: we should somehow decouple this unit test from the perf env / scripts | ||
| # Checkbounds | ||
| - label: ":computer: checkbounds" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,10 +14,12 @@ include(joinpath("perf", "benchmark_step.jl")); | |
| redirect_stderr(IOContext(stderr, :stacktrace_types_limited => Ref(false))) | ||
| import ClimaComms | ||
| ClimaComms.@import_required_backends | ||
| import ClimaCore | ||
|
||
| import Random | ||
| Random.seed!(1234) | ||
| import ClimaAtmos as CA | ||
| import ClimaComms | ||
| import CUDA | ||
|
|
||
| include("common.jl") | ||
| (; config_file, job_id) = CA.commandline_kwargs() | ||
|
|
@@ -26,17 +28,44 @@ config = CA.AtmosConfig(config_file; job_id) | |
| simulation = CA.get_simulation(config) | ||
| (; integrator) = simulation; | ||
| Y₀ = deepcopy(integrator.u); | ||
| # Run one step to compile | ||
| @info "Compiling benchmark_step!..." | ||
| CA.benchmark_step!(integrator, Y₀); # compile first | ||
| CA.benchmark_step!(integrator, Y₀); | ||
|
|
||
| @info "Running benchmark_step!..." | ||
| n_steps = 10 | ||
| comms_ctx = ClimaComms.context(integrator.u.c) | ||
| device = ClimaComms.device(comms_ctx) | ||
| local e | ||
| s = CA.@timed_str begin | ||
| e = ClimaComms.elapsed(device) do | ||
| CA.benchmark_step!(integrator, Y₀, n_steps) # run | ||
|
|
||
| # If we're running on CUDA, use CUDA's profiler | ||
| if device isa ClimaComms.CUDADevice | ||
| e = 0.0 | ||
| n_steps = 5 | ||
| use_external_profiler = CUDA.Profile.detect_cupti() | ||
| if use_external_profiler | ||
| @info "Using external CUDA profiler" | ||
| CUDA.@profile external = true begin | ||
| e = CUDA.@elapsed begin | ||
| CA.benchmark_step!(integrator, Y₀, n_steps) | ||
| end | ||
| end | ||
| else | ||
| @info "Using internal CUDA profiler" | ||
| res = CUDA.@profile external = false begin | ||
| e = CUDA.@elapsed begin | ||
petebachant marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| CA.benchmark_step!(integrator, Y₀, n_steps) | ||
| end | ||
| end | ||
| show(IOContext(stdout, :limit => false), res) | ||
| end | ||
| @info "Ran step! with CUDA $n_steps times in $e s, ($(CA.prettytime(e/n_steps*1e9)) per step)" | ||
| else | ||
| # Profile with Julia's built-in profiler | ||
| n_steps = 10 | ||
| local e | ||
| s = CA.@timed_str begin | ||
| e = ClimaComms.elapsed(device) do | ||
| CA.benchmark_step!(integrator, Y₀, n_steps) # run | ||
| end | ||
| end | ||
| @info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)" | ||
| end | ||
| @info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These changes were made by a YAML auto-formatter in VS Code. Is there a style guide I might be breaking here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure... this is something I have been wondering as well. I considered following this example, which is used in Buildkite's docs.