Skip to content

Commit 24688f0

Browse files
authored
Merge pull request #817 from CliMA/js/table-atmos
add atmos to table
2 parents a3bffcb + b011d91 commit 24688f0

File tree

8 files changed

+187
-94
lines changed

8 files changed

+187
-94
lines changed

.buildkite/benchmarks/README.md

+2-6
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,5 @@ temperature and sea ice
3939
### Comparison Metrics
4040
- Simulated years per day (SYPD): The number of years of simulation time we
4141
can run in 1 day of walltime
42-
- CPU simulation object allocations: The allocations in GB of the simulation
43-
object, which contains everything needed to run the simulation.
44-
In the atmosphere-only case, this is the `AtmosSimulation` object.
45-
In the coupled case, this is the `CoupledSimulation` object, which includes
46-
all of the component models, coupler fields, and auxiliary objects. More
47-
information on this object can be found in the `Interfacer` docs.
42+
- CPU maximum resident set size (max RSS): The peak amount of CPU (host) memory
43+
held by this process since it began. This is measured for both CPU and GPU runs.

.buildkite/benchmarks/pipeline.yml

+25-1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,18 @@ steps:
4343

4444
- group: "CPU benchmarks"
4545
steps:
46+
- label: "CPU ClimaAtmos without diagnostic EDMF"
47+
key: "climaatmos"
48+
command: "srun julia --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos.yml --job_id climaatmos"
49+
artifact_paths: "experiments/ClimaEarth/output/climaatmos/climaatmos_artifacts/*"
50+
env:
51+
BUILD_HISTORY_HANDLE: ""
52+
CLIMACOMMS_DEVICE: "CPU"
53+
agents:
54+
slurm_ntasks_per_node: 64
55+
slurm_nodes: 1
56+
slurm_mem_per_cpu: 4GB
57+
4658
- label: "CPU ClimaAtmos with diagnostic EDMF"
4759
key: "climaatmos_diagedmf"
4860
command: "srun julia --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos_diagedmf.yml --job_id climaatmos_diagedmf"
@@ -69,6 +81,16 @@ steps:
6981

7082
- group: "GPU benchmarks"
7183
steps:
84+
- label: "GPU ClimaAtmos without diagnostic EDMF"
85+
key: "gpu_climaatmos"
86+
command: "srun julia --threads=3 --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos.yml --job_id gpu_climaatmos"
87+
artifact_paths: "experiments/ClimaEarth/output/climaatmos/gpu_climaatmos_artifacts/*"
88+
agents:
89+
slurm_gpus_per_task: 1
90+
slurm_cpus_per_task: 4
91+
slurm_ntasks: 4
92+
slurm_mem: 16GB
93+
7294
- label: "GPU ClimaAtmos with diagnostic EDMF"
7395
key: "gpu_climaatmos_diagedmf"
7496
command: "srun julia --threads=3 --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos_diagedmf.yml --job_id gpu_climaatmos_diagedmf"
@@ -93,11 +115,13 @@ steps:
93115
steps:
94116
- label: "Compare AMIP/Atmos-only with diagnostic EDMF"
95117
key: "compare_amip_climaatmos_amip_diagedmf"
96-
command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/user_io/benchmarks.jl --cpu_job_id_coupled amip_diagedmf --cpu_job_id_atmos climaatmos_diagedmf --gpu_job_id_coupled gpu_amip_diagedmf --gpu_job_id_atmos gpu_climaatmos_diagedmf --mode_name amip --build_id $BUILDKITE_BUILD_NUMBER"
118+
command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/user_io/benchmarks.jl --cpu_job_id_coupled amip_diagedmf --cpu_job_id_atmos_diagedmf climaatmos_diagedmf --cpu_job_id_atmos climaatmos --build_id $BUILDKITE_BUILD_NUMBER"
97119
artifact_paths: "experiments/ClimaEarth/output/compare_amip_climaatmos_amip_diagedmf/*"
98120
depends_on:
121+
- "climaatmos"
99122
- "climaatmos_diagedmf"
100123
- "amip_diagedmf"
124+
- "gpu_climaatmos"
101125
- "gpu_climaatmos_diagedmf"
102126
- "gpu_amip_diagedmf"
103127

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
FLOAT_TYPE: "Float32"
2+
approximate_linear_solve_iters: 2
3+
dt: 120secs
4+
dt_cloud_fraction: 1hours
5+
dt_rad: 1hours
6+
dt_save_state_to_disk: "Inf"
7+
dt_save_to_sol: "Inf"
8+
dz_bottom: 30.0
9+
dz_top: 3000.0
10+
h_elem: 30
11+
moist: equil
12+
output_default_diagnostics: false
13+
precip_model: 0M
14+
prognostic_tke: true
15+
rad: allskywithclear
16+
surface_setup: DefaultMoninObukhov
17+
t_end: 12hours
18+
turb_flux_partition: "CombinedStateFluxesMOST"
19+
vert_diff: "true"
20+
z_elem: 63
21+
z_max: 55000.0

experiments/ClimaEarth/run_amip.jl

+3-3
Original file line numberDiff line numberDiff line change
@@ -823,9 +823,9 @@ if ClimaComms.iamroot(comms_ctx)
823823
sypd_filename = joinpath(dir_paths.artifacts, "sypd.txt")
824824
write(sypd_filename, "$sypd")
825825

826-
cpu_allocs_GB = Utilities.show_memory_usage(comms_ctx)
827-
cpu_allocs_filename = joinpath(dir_paths.artifacts, "allocations_cpu.txt")
828-
write(cpu_allocs_filename, cpu_allocs_GB)
826+
cpu_max_rss_GB = Utilities.show_memory_usage(comms_ctx)
827+
cpu_max_rss_filename = joinpath(dir_paths.artifacts, "max_rss_cpu.txt")
828+
write(cpu_max_rss_filename, cpu_max_rss_GB)
829829
end
830830

831831
#=

experiments/ClimaEarth/user_io/benchmarks.jl

+104-74
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
Our goal here is to output a table displaying some results from benchmark runs
33
in the coupler. We want to be able to compare between CPU and GPU runs, as well
44
as between coupled and atmos-only runs. The metrics we want to compare are
5-
SYPD, allocations, and the maximum, median, and mean differences between the
6-
CPU and GPU states.
5+
SYPD, memory usage, allocations, and the maximum, median, and mean differences
6+
between the CPU and GPU states.
77
88
The table should look something like this (note that the last 3 columns will be
99
added in a future PR):
@@ -12,11 +12,11 @@ added in a future PR):
1212
------------------------------------------------------------------------------------
1313
| | $job_id | $job_id | | | |
1414
| Coupled run | $SYPD | $SYPD | $max_diff | $median_diff | $mean_diff |
15-
| | $cpu_allocs | $cpu_allocs | | | |
15+
| | $cpu_max_rss | $cpu_max_rss| | | |
1616
------------------------------------------------------------------------------------
1717
| | $job_id | $job_id | | | |
1818
| Atmos-only | $SYPD | $SYPD | $max_diff | $median_diff | $mean_diff |
19-
| | $cpu_allocs | $cpu_allocs | | | |
19+
| | $cpu_max_rss | $cpu_max_rss| | | |
2020
------------------------------------------------------------------------------------
2121
2222
=#
@@ -37,15 +37,19 @@ function argparse_settings()
3737
arg_type = String
3838
default = nothing
3939
"--cpu_job_id_atmos"
40-
help = "The name of the CPU atmos-only run we want to compare. User must specify CPU and/or GPU atmos-only run name."
40+
help = "The name of the CPU atmos-only run without diagnostic EDMF we want to compare. User must specify CPU and/or GPU atmos-only non-EDMF run name."
4141
arg_type = String
4242
default = nothing
4343
"--gpu_job_id_atmos"
4444
help = "The name of the GPU atmos-only run we want to compare."
4545
arg_type = String
4646
default = nothing
47-
"--mode_name"
48-
help = "The mode of the simulations being compared (`slabplanet` or `AMIP`)."
47+
"--cpu_job_id_atmos_diagedmf"
48+
help = "The name of the CPU atmos-only run with diagnostic EDMF we want to compare. User must specify CPU and/or GPU atmos-only EDMF run name."
49+
arg_type = String
50+
default = nothing
51+
"--gpu_job_id_atmos_diagedmf"
52+
help = "The name of the GPU atmos-only run we want to compare."
4953
arg_type = String
5054
default = nothing
5155
"--coupler_output_dir"
@@ -60,97 +64,123 @@ function argparse_settings()
6064
return s
6165
end
6266

63-
# Parse command line arguments
64-
parsed_args = ArgParse.parse_args(ARGS, argparse_settings())
"""
    get_run_info(parsed_args, run_type; output_dir = parsed_args["coupler_output_dir"])

Use the input `parsed_args` to get the job IDs and artifacts directories for
both the CPU and GPU runs of the given `run_type`.

# Arguments
- `parsed_args`: dictionary of parsed command line arguments. The entries
  `"cpu_job_id_<run_type>"` and `"gpu_job_id_<run_type>"` are read from it.
- `run_type`: must be one of "coupled", "atmos", or "atmos_diagedmf".

# Keywords
- `output_dir`: root directory that contains the run output. Defaults to the
  `"coupler_output_dir"` entry of `parsed_args`.

# Returns
The tuple `(cpu_job_id, gpu_job_id, cpu_artifacts_dir, gpu_artifacts_dir)`.

# Throws
An error if `run_type` is invalid, if neither the CPU nor the GPU job ID is
provided, or if the CPU job ID has to be inferred from a GPU job ID that does
not start with "gpu_".
"""
function get_run_info(parsed_args, run_type; output_dir = parsed_args["coupler_output_dir"])
    # Each run type stores its artifacts under a directory named after its mode
    if run_type == "coupled"
        mode_name = "amip"
    elseif run_type == "atmos_diagedmf" || run_type == "atmos"
        mode_name = "climaatmos"
    else
        error("Invalid run type: $run_type")
    end

    # Read in CPU and GPU job ID info from command line
    cpu_job_id = parsed_args["cpu_job_id_$(run_type)"]
    gpu_job_id = parsed_args["gpu_job_id_$(run_type)"]

    # Verify that the user has provided the necessary job IDs
    # If only one job ID of the CPU/GPU run pair is provided, the other is inferred
    gpu_prefix = "gpu_"
    if isnothing(cpu_job_id) && isnothing(gpu_job_id)
        error("Must pass CPU and/or GPU $run_type run name to compare them.")
    elseif isnothing(gpu_job_id)
        gpu_job_id = gpu_prefix * cpu_job_id
    elseif isnothing(cpu_job_id)
        # Only strip the prefix when it is actually there, rather than blindly
        # dropping the first four characters of the GPU job ID
        startswith(gpu_job_id, gpu_prefix) ||
            error("Cannot infer the CPU job ID from GPU job ID \"$gpu_job_id\": expected the prefix \"$gpu_prefix\".")
        cpu_job_id = gpu_job_id[(ncodeunits(gpu_prefix) + 1):end]
    end

    # Construct CPU and GPU artifacts directories
    cpu_artifacts_dir = joinpath(output_dir, mode_name, cpu_job_id) * "_artifacts"
    gpu_artifacts_dir = joinpath(output_dir, mode_name, gpu_job_id) * "_artifacts"

    return (cpu_job_id, gpu_job_id, cpu_artifacts_dir, gpu_artifacts_dir)
end
88109

89-
# Read in mode name from command line (or retrieve from run name).
90-
# Note that we expect this to be the same for all 4 simulations being compared.
91-
mode_name = parsed_args["mode_name"]
92-
if isnothing(mode_name)
93-
mode_name =
94-
occursin("amip", cpu_job_id_coupled) ? "amip" :
95-
(occursin("slabplanet", cpu_job_id_coupled) ? "slabplanet" : error("Please provide a valid `mode_name`."))
96-
end
"""
    get_run_data(artifacts_dir)

Read in run data from the artifacts directory `artifacts_dir`, currently SYPD
and max RSS on the CPU.

# Returns
The tuple `(sypd, cpu_max_rss)`, where
- `sypd` is the content of `sypd.txt` parsed as a `Float64` and rounded to
  4 digits, and
- `cpu_max_rss` is the content of `max_rss_cpu.txt` as a `String`, with any
  leading/trailing whitespace removed.

An error is thrown if either file cannot be read, or if the content of
`sypd.txt` cannot be parsed as a `Float64`.
"""
function get_run_data(artifacts_dir)
    # Read in SYPD info
    sypd_str = strip(read(joinpath(artifacts_dir, "sypd.txt"), String))
    sypd = round(parse(Float64, sypd_str), digits = 4)

    # Read in max RSS info. The value is displayed verbatim as a table cell,
    # so strip whitespace (e.g. a trailing newline) that would break the layout.
    cpu_max_rss = String(strip(read(joinpath(artifacts_dir, "max_rss_cpu.txt"), String)))

    return (sypd, cpu_max_rss)
end
113128

114-
mode_name_atmos = "climaatmos"
115-
gpu_artifacts_dir_atmos = joinpath(output_dir, mode_name_atmos, gpu_job_id_atmos) * "_artifacts"
116-
cpu_artifacts_dir_atmos = joinpath(output_dir, mode_name_atmos, cpu_job_id_atmos) * "_artifacts"
"""
    append_table_data(table_data, setup_id, cpu_job_id, gpu_job_id, cpu_artifacts_dir, gpu_artifacts_dir)

Return a new table made of `table_data` followed by three rows describing one
setup: the CPU/GPU job IDs, the CPU/GPU SYPD (labelled with `setup_id`), and
the CPU max RSS of the CPU and GPU runs.

The SYPD and max RSS values are read from `cpu_artifacts_dir` and
`gpu_artifacts_dir` using `get_run_data`.
"""
function append_table_data(table_data, setup_id, cpu_job_id, gpu_job_id, cpu_artifacts_dir, gpu_artifacts_dir)
    # Look up SYPD and CPU max RSS for the CPU run and for the GPU run
    (sypd_cpu, max_rss_cpu) = get_run_data(cpu_artifacts_dir)
    (sypd_gpu, max_rss_gpu) = get_run_data(gpu_artifacts_dir)

    # One 1x4 row per quantity; only the SYPD row carries the setup label
    job_id_row = ["" "job ID:" cpu_job_id gpu_job_id]
    sypd_row = [setup_id "SYPD:" sypd_cpu sypd_gpu]
    max_rss_row = ["" "CPU max RSS:" max_rss_cpu max_rss_gpu]

    # Stack the rows for this setup, then add them below the existing table
    setup_rows = vcat(job_id_row, sypd_row, max_rss_row)
    return vcat(table_data, setup_rows)
end
117147

118-
# Read in SYPD and allocations info from artifacts directories
119-
function get_sypd_allocs(artifacts_dir)
120-
# Read in SYPD info
121-
sypd_file = open(joinpath(artifacts_dir, "sypd.txt"), "r")
122-
sypd = round(parse(Float64, read(sypd_file, String)), digits = 4)
123148

124-
# Read in allocations info
125-
cpu_allocs_file = open(joinpath(artifacts_dir, "allocations_cpu.txt"), "r")
126-
cpu_allocs = read(cpu_allocs_file, String)
# Read in command line arguments
parsed_args = ArgParse.parse_args(ARGS, argparse_settings())
# Root directory of the run output; also read as a global by `get_run_info`
output_dir = parsed_args["coupler_output_dir"]

# Access the buildkite build ID passed via `--build_id`
# (the benchmark pipeline passes `$BUILDKITE_BUILD_NUMBER`)
build_id = parsed_args["build_id"]
build_id_str = "Build ID: " * (isnothing(build_id) ? "N/A" : string(build_id))

# Read in run info for each of the cases we want to compare
run_info_coupled = get_run_info(parsed_args, "coupled")
run_info_atmos_diagedmf = get_run_info(parsed_args, "atmos_diagedmf")
run_info_atmos = get_run_info(parsed_args, "atmos")

# Set up info for PrettyTables.jl
headers = [build_id_str, "Horiz. res.: 30 elems", "CPU Run [64 processes]", "GPU Run [4 A100s]"]
data = [
    ["" "Vert. res.: 63 levels" "" ""]
    ["" "dt: 120secs" "" ""]
]

# Append data to the table for each of the cases we want to compare
data = append_table_data(data, "Coupled", run_info_coupled...)
data = append_table_data(data, "Atmos with diag. EDMF", run_info_atmos_diagedmf...)
data = append_table_data(data, "Atmos without diag. EDMF", run_info_atmos...)

# Use the coupled CPU job ID for the output dir
cpu_job_id_coupled = run_info_coupled[1]
table_output_dir = joinpath(output_dir, "compare_amip_climaatmos_$(cpu_job_id_coupled)")
# `mkpath` also creates missing parent directories and is a no-op if the
# directory already exists
mkpath(table_output_dir)
table_path = joinpath(table_output_dir, "table.txt")

# Horizontal lines: one above the header (0), then one after every group of
# 3 lines. The header plus the 2 general info rows form the first group, and
# every setup appended above adds exactly 3 rows, so the last line closes the
# table. For the 3 setups compared here this is [0, 3, 6, 9, 12].
table_hlines = vcat(0, 3:3:(size(data, 1) + 1))

open(table_path, "w") do f
    # Output the table, including lines before and after the header
    PrettyTables.pretty_table(f, data, header = headers, hlines = table_hlines)
end

src/Utilities.jl

+4-4
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,12 @@ CPU of this process since it began.
8181
`comms_ctx`: the communication context being used to run the model
8282
"""
function show_memory_usage(comms_ctx)
    # Only the root process measures and logs; all other processes return ""
    cpu_max_rss_GB = ""
    if ClimaComms.iamroot(comms_ctx)
        # `Sys.maxrss()` is in bytes; dividing by 1e9 gives decimal gigabytes,
        # so the value is labelled GB (a GiB would be 2^30 bytes)
        cpu_max_rss_GB = string(round(Sys.maxrss() / 1e9, digits = 3)) * " GB"
        @info cpu_max_rss_GB
    end
    return cpu_max_rss_GB
end
9191

9292
end # module

test/component_model_tests/climaatmos_standalone/atmos_driver.jl

+8-6
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,15 @@ es = CA.EfficiencyStats(tspan, walltime)
6767
sypd = CA.simulated_years_per_day(es)
6868
@info "SYPD: $sypd"
6969

70-
## Save the SYPD and allocation information
70+
## Save the SYPD and max RSS information
7171
comms_ctx = atmos_config.comms_ctx
if ClimaComms.iamroot(comms_ctx)
    # Only the root process writes the benchmark artifacts

    # Simulated years per day
    open(joinpath(output_dir, "sypd.txt"), "w") do sypd_io
        write(sypd_io, string(sypd))
    end

    # CPU max RSS, as the string returned by `Utilities.show_memory_usage`
    open(joinpath(output_dir, "max_rss_cpu.txt"), "w") do max_rss_io
        max_rss_str = Utilities.show_memory_usage(comms_ctx)
        write(max_rss_io, max_rss_str)
    end
end

toml/diagnostic_edmfx.toml

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[entr_inv_tau]
2+
value = 0.002
3+
4+
[entr_coeff]
5+
value = 0
6+
7+
[detr_inv_tau]
8+
value = 0
9+
10+
[detr_buoy_coeff]
11+
value = 0.12
12+
13+
[detr_vertdiv_coeff]
14+
value = 0.6
15+
16+
[min_area_limiter_scale]
17+
value = 0
18+
19+
[max_area_limiter_scale]
20+
value = 0

0 commit comments

Comments
 (0)