Merge pull request #817 from CliMA/js/table-atmos

juliasloan25 · web-flow · commit 24688f0f5b3b · 2024-06-07T19:38:56.000-07:00
add atmos to table
diff --git a/.buildkite/benchmarks/README.md b/.buildkite/benchmarks/README.md
@@ -39,9 +39,5 @@ temperature and sea ice
 ### Comparison Metrics
 - Simulated years per day (SYPD): The number of years of simulation time we
 can run in 1 day of walltime
-- CPU simulation object allocations: The allocations in GB of the simulation
-object, which contains everything needed to run the simulation.
-In the atmosphere-only case, this is the `AtmosSimulation` object.
-In the coupled case, this is the `CoupledSimulation` object, which includes
-all of the component models, coupler fields, and auxiliary objects. More
-information on this object can be found in the `Interfacer` docs.
+- CPU maximum resident set size (max RSS): The max RSS memory footprint on the
+CPU of this process since it began. This is measured for both CPU and GPU runs.
diff --git a/.buildkite/benchmarks/pipeline.yml b/.buildkite/benchmarks/pipeline.yml
@@ -43,6 +43,18 @@ steps:
 
   - group: "CPU benchmarks"
     steps:
+      - label: "CPU ClimaAtmos without diagnostic EDMF"
+        key: "climaatmos"
+        command: "srun julia --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos.yml --job_id climaatmos"
+        artifact_paths: "experiments/ClimaEarth/output/climaatmos/climaatmos_artifacts/*"
+        env:
+          BUILD_HISTORY_HANDLE: ""
+          CLIMACOMMS_DEVICE: "CPU"
+        agents:
+          slurm_ntasks_per_node: 64
+          slurm_nodes: 1
+          slurm_mem_per_cpu: 4GB
+
       - label: "CPU ClimaAtmos with diagnostic EDMF"
         key: "climaatmos_diagedmf"
         command: "srun julia --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos_diagedmf.yml --job_id climaatmos_diagedmf"
@@ -69,6 +81,16 @@ steps:
 
   - group: "GPU benchmarks"
     steps:
+      - label: "GPU ClimaAtmos without diagnostic EDMF"
+        key: "gpu_climaatmos"
+        command: "srun julia --threads=3 --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos.yml --job_id gpu_climaatmos"
+        artifact_paths: "experiments/ClimaEarth/output/climaatmos/gpu_climaatmos_artifacts/*"
+        agents:
+          slurm_gpus_per_task: 1
+          slurm_cpus_per_task: 4
+          slurm_ntasks: 4
+          slurm_mem: 16GB
+
       - label: "GPU ClimaAtmos with diagnostic EDMF"
         key: "gpu_climaatmos_diagedmf"
         command: "srun julia --threads=3 --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos_diagedmf.yml --job_id gpu_climaatmos_diagedmf"
@@ -93,11 +115,13 @@ steps:
     steps:
       - label: "Compare AMIP/Atmos-only with diagnostic EDMF"
         key: "compare_amip_climaatmos_amip_diagedmf"
-        command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/user_io/benchmarks.jl --cpu_job_id_coupled amip_diagedmf --cpu_job_id_atmos climaatmos_diagedmf --gpu_job_id_coupled gpu_amip_diagedmf --gpu_job_id_atmos gpu_climaatmos_diagedmf --mode_name amip --build_id $BUILDKITE_BUILD_NUMBER"
+        command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/user_io/benchmarks.jl --cpu_job_id_coupled amip_diagedmf --cpu_job_id_atmos_diagedmf climaatmos_diagedmf --cpu_job_id_atmos climaatmos --build_id $BUILDKITE_BUILD_NUMBER"
         artifact_paths: "experiments/ClimaEarth/output/compare_amip_climaatmos_amip_diagedmf/*"
         depends_on:
+          - "climaatmos"
           - "climaatmos_diagedmf"
           - "amip_diagedmf"
+          - "gpu_climaatmos"
           - "gpu_climaatmos_diagedmf"
           - "gpu_amip_diagedmf"
 
diff --git a/config/benchmark_configs/climaatmos.yml b/config/benchmark_configs/climaatmos.yml
@@ -0,0 +1,21 @@
+FLOAT_TYPE: "Float32"
+approximate_linear_solve_iters: 2
+dt: 120secs
+dt_cloud_fraction: 1hours
+dt_rad: 1hours
+dt_save_state_to_disk: "Inf"
+dt_save_to_sol: "Inf"
+dz_bottom: 30.0
+dz_top: 3000.0
+h_elem: 30
+moist: equil
+output_default_diagnostics: false
+precip_model: 0M
+prognostic_tke: true
+rad: allskywithclear
+surface_setup: DefaultMoninObukhov
+t_end: 12hours
+turb_flux_partition: "CombinedStateFluxesMOST"
+vert_diff: "true"
+z_elem: 63
+z_max: 55000.0
diff --git a/experiments/ClimaEarth/run_amip.jl b/experiments/ClimaEarth/run_amip.jl
@@ -823,9 +823,9 @@ if ClimaComms.iamroot(comms_ctx)
     sypd_filename = joinpath(dir_paths.artifacts, "sypd.txt")
     write(sypd_filename, "$sypd")
 
-    cpu_allocs_GB = Utilities.show_memory_usage(comms_ctx)
-    cpu_allocs_filename = joinpath(dir_paths.artifacts, "allocations_cpu.txt")
-    write(cpu_allocs_filename, cpu_allocs_GB)
+    cpu_max_rss_GB = Utilities.show_memory_usage(comms_ctx)
+    cpu_max_rss_filename = joinpath(dir_paths.artifacts, "max_rss_cpu.txt")
+    write(cpu_max_rss_filename, cpu_max_rss_GB)
 end
 
 #=
diff --git a/experiments/ClimaEarth/user_io/benchmarks.jl b/experiments/ClimaEarth/user_io/benchmarks.jl
@@ -2,8 +2,8 @@
 Our goal here is to output a table displaying some results from benchmark runs
 in the coupler. We want to be able to compare between CPU and GPU runs, as well
 as between coupled and atmos-only runs. The metrics we want to compare are
-SYPD, allocations, and the maximum, median, and mean differences between the
-CPU and GPU states.
+SYPD, CPU memory usage (max RSS), and the maximum, median, and mean differences
+between the CPU and GPU states.
 
 The table should look something like this (note that the last 3 columns will be
 added in a future PR):
@@ -12,11 +12,11 @@ added in a future PR):
 ------------------------------------------------------------------------------------
 |             |   $job_id    |   $job_id   |           |              |              |
 | Coupled run |    $SYPD     |    $SYPD    | $max_diff | $median_diff |  $mean_diff  |
-|             | $cpu_allocs  | $cpu_allocs |           |              |              |
+|             | $cpu_max_rss | $cpu_max_rss|           |              |              |
 ------------------------------------------------------------------------------------
 |             |   $job_id    |   $job_id   |           |              |              |
 | Atmos-only  |    $SYPD     |    $SYPD    | $max_diff | $median_diff |  $mean_diff  |
-|             | $cpu_allocs  | $cpu_allocs |           |              |              |
+|             | $cpu_max_rss | $cpu_max_rss|           |              |              |
 ------------------------------------------------------------------------------------
 
 =#
@@ -37,15 +37,19 @@ function argparse_settings()
         arg_type = String
         default = nothing
         "--cpu_job_id_atmos"
-        help = "The name of the CPU atmos-only run we want to compare. User must specify CPU and/or GPU atmos-only run name."
+        help = "The name of the CPU atmos-only run without diagnostic EDMF we want to compare. User must specify CPU and/or GPU atmos-only non-EDMF run name."
         arg_type = String
         default = nothing
         "--gpu_job_id_atmos"
         help = "The name of the GPU atmos-only run we want to compare."
         arg_type = String
         default = nothing
-        "--mode_name"
-        help = "The mode of the simulations being compared (`slabplanet` or `AMIP`)."
+        "--cpu_job_id_atmos_diagedmf"
+        help = "The name of the CPU atmos-only run with diagnostic EDMF we want to compare. User must specify CPU and/or GPU atmos-only EDMF run name."
+        arg_type = String
+        default = nothing
+        "--gpu_job_id_atmos_diagedmf"
+        help = "The name of the GPU atmos-only run we want to compare."
         arg_type = String
         default = nothing
         "--coupler_output_dir"
@@ -60,97 +64,123 @@ function argparse_settings()
     return s
 end
 
-# Parse command line arguments
-parsed_args = ArgParse.parse_args(ARGS, argparse_settings())
+"""
+    get_run_info(parsed_args, run_type)
+
+Use the input `parsed_args` to get the job ID and artifacts directories for
+both the CPU and GPU runs of the given `run_type`.
+
+`run_type` must be one of "coupled", "atmos", or "atmos_diagedmf".
+"""
+function get_run_info(parsed_args, run_type)
+    # Read in CPU and GPU job ID info from command line
+    if run_type == "coupled"
+        cpu_job_id = parsed_args["cpu_job_id_coupled"]
+        gpu_job_id = parsed_args["gpu_job_id_coupled"]
+        mode_name = "amip"
+    elseif run_type == "atmos_diagedmf"
+        cpu_job_id = parsed_args["cpu_job_id_atmos_diagedmf"]
+        gpu_job_id = parsed_args["gpu_job_id_atmos_diagedmf"]
+        mode_name = "climaatmos"
+    elseif run_type == "atmos"
+        cpu_job_id = parsed_args["cpu_job_id_atmos"]
+        gpu_job_id = parsed_args["gpu_job_id_atmos"]
+        mode_name = "climaatmos"
+    else
+        error("Invalid run type: $run_type")
+    end
 
-# Access buildkite pipeline ID (from `BUILDKITE_GITHUB_DEPLOYMENT_ID` variable)
-build_id = parsed_args["build_id"]
-if !isnothing(build_id)
-    build_id_str = "Build ID: $build_id"
-else
-    build_id_str = "Build ID: N/A"
-end
+    # Verify that the user has provided the necessary job IDs
+    # If only one job ID of the CPU/GPU run pair is provided, the other will be inferred
+    if isnothing(cpu_job_id) && isnothing(gpu_job_id)
+        error("Must pass CPU and/or GPU $run_type run name to compare them.")
+    elseif isnothing(gpu_job_id)
+        gpu_job_id = "gpu_" * cpu_job_id
+    elseif isnothing(cpu_job_id)
+        cpu_job_id = gpu_job_id[5:end]
+    end
 
-# Construct CPU and GPU artifacts directories
-output_dir = parsed_args["coupler_output_dir"]
+    # Construct CPU and GPU artifacts directories
+    cpu_artifacts_dir = joinpath(output_dir, mode_name, cpu_job_id) * "_artifacts"
+    gpu_artifacts_dir = joinpath(output_dir, mode_name, gpu_job_id) * "_artifacts"
 
-# Coupled runs
-# Read in CPU and GPU run name info from command line
-cpu_job_id_coupled = parsed_args["cpu_job_id_coupled"]
-gpu_job_id_coupled = parsed_args["gpu_job_id_coupled"]
-if isnothing(cpu_job_id_coupled) && isnothing(gpu_job_id_coupled)
-    error("Must pass CPU and/or GPU coupled run name to compare them.")
-elseif isnothing(gpu_job_id_coupled)
-    gpu_job_id_coupled = "gpu_" * cpu_job_id_coupled
-elseif isnothing(cpu_job_id_coupled)
-    cpu_job_id_coupled = gpu_job_id_coupled[5:end]
+    return (cpu_job_id, gpu_job_id, cpu_artifacts_dir, gpu_artifacts_dir)
 end
 
-# Read in mode name from command line (or retrieve from run name).
-# Note that we expect this to be the same for all 4 simulations being compared.
-mode_name = parsed_args["mode_name"]
-if isnothing(mode_name)
-    mode_name =
-        occursin("amip", cpu_job_id_coupled) ? "amip" :
-        (occursin("slabplanet", cpu_job_id_coupled) ? "slabplanet" : error("Please provide a valid `mode_name`."))
-end
+"""
+    get_run_data(artifacts_dir)
 
-gpu_artifacts_dir_coupled = joinpath(output_dir, mode_name, gpu_job_id_coupled) * "_artifacts"
-cpu_artifacts_dir_coupled = joinpath(output_dir, mode_name, cpu_job_id_coupled) * "_artifacts"
-
-# Atmos-only runs
-# Read in CPU and GPU run name info from command line
-cpu_job_id_atmos = parsed_args["cpu_job_id_atmos"]
-gpu_job_id_atmos = parsed_args["gpu_job_id_atmos"]
-if isnothing(cpu_job_id_atmos) && isnothing(gpu_job_id_atmos)
-    error("Must pass CPU and/or GPU coupled run name to compare them.")
-elseif isnothing(gpu_job_id_atmos)
-    gpu_job_id_atmos = "gpu_" * cpu_job_id_atmos
-elseif isnothing(cpu_job_id_atmos)
-    cpu_job_id_atmos = gpu_job_id_atmos[5:end]
-    cpu_artifacts_dir_atmos = joinpath(output_dir, cpu_job_id_atmos)
+Read in run data from artifacts directories, currently SYPD and max RSS on the CPU.
+"""
+function get_run_data(artifacts_dir)
+    # Read in SYPD info
+    sypd = open(joinpath(artifacts_dir, "sypd.txt"), "r") do sypd_file
+        round(parse(Float64, read(sypd_file, String)), digits = 4)
+    end
+
+    # Read in max RSS info
+    cpu_max_rss = open(joinpath(artifacts_dir, "max_rss_cpu.txt"), "r") do cpu_max_rss_file
+        read(cpu_max_rss_file, String)
+    end
+
+    return (sypd, cpu_max_rss)
 end
 
-mode_name_atmos = "climaatmos"
-gpu_artifacts_dir_atmos = joinpath(output_dir, mode_name_atmos, gpu_job_id_atmos) * "_artifacts"
-cpu_artifacts_dir_atmos = joinpath(output_dir, mode_name_atmos, cpu_job_id_atmos) * "_artifacts"
+"""
+    append_table_data(table_data, setup_id, cpu_job_id, gpu_job_id, cpu_artifacts_dir, gpu_artifacts_dir)
+
+Append data for a given setup to the table data.
+"""
+function append_table_data(table_data, setup_id, cpu_job_id, gpu_job_id, cpu_artifacts_dir, gpu_artifacts_dir)
+    # Get SYPD and CPU max RSS info for both input runs
+    cpu_sypd, cpu_max_rss = get_run_data(cpu_artifacts_dir)
+    gpu_sypd, gpu_cpu_max_rss = get_run_data(gpu_artifacts_dir)
+
+    # Create rows containing data for these runs
+    new_table_data = [
+        ["" "job ID:" cpu_job_id gpu_job_id]
+        [setup_id "SYPD:" cpu_sypd gpu_sypd]
+        ["" "CPU max RSS:" cpu_max_rss gpu_cpu_max_rss]
+    ]
+    return vcat(table_data, new_table_data)
+end
 
-# Read in SYPD and allocations info from artifacts directories
-function get_sypd_allocs(artifacts_dir)
-    # Read in SYPD info
-    sypd_file = open(joinpath(artifacts_dir, "sypd.txt"), "r")
-    sypd = round(parse(Float64, read(sypd_file, String)), digits = 4)
 
-    # Read in allocations info
-    cpu_allocs_file = open(joinpath(artifacts_dir, "allocations_cpu.txt"), "r")
-    cpu_allocs = read(cpu_allocs_file, String)
+# Read in command line arguments
+parsed_args = ArgParse.parse_args(ARGS, argparse_settings())
+output_dir = parsed_args["coupler_output_dir"]
 
-    return (sypd, cpu_allocs)
+# Access the Buildkite build number (passed via `--build_id`, e.g. `$BUILDKITE_BUILD_NUMBER`)
+build_id = parsed_args["build_id"]
+if !isnothing(build_id)
+    build_id_str = "Build ID: $build_id"
+else
+    build_id_str = "Build ID: N/A"
 end
 
-cpu_sypd_coupled, cpu_allocs_coupled = get_sypd_allocs(cpu_artifacts_dir_coupled)
-gpu_sypd_coupled, gpu_cpu_allocs_coupled = get_sypd_allocs(gpu_artifacts_dir_coupled)
-cpu_sypd_atmos, cpu_allocs_atmos = get_sypd_allocs(cpu_artifacts_dir_atmos)
-gpu_sypd_atmos, gpu_cpu_allocs_atmos = get_sypd_allocs(gpu_artifacts_dir_atmos)
+# Read in run info for each of the cases we want to compare
+run_info_coupled = get_run_info(parsed_args, "coupled")
+run_info_atmos_diagedmf = get_run_info(parsed_args, "atmos_diagedmf")
+run_info_atmos = get_run_info(parsed_args, "atmos")
 
 # Set up info for PrettyTables.jl
 headers = [build_id_str, "Horiz. res.: 30 elems", "CPU Run [64 processes]", "GPU Run [4 A100s]"]
 data = [
     ["" "Vert. res.: 63 levels" "" ""]
     ["" "dt: 120secs" "" ""]
-    ["" "job ID:" cpu_job_id_coupled gpu_job_id_coupled]
-    ["Coupled" "SYPD:" cpu_sypd_coupled gpu_sypd_coupled]
-    ["" "CPU max RSS allocs:" cpu_allocs_coupled gpu_cpu_allocs_coupled]
-    ["" "job ID:" cpu_job_id_atmos gpu_job_id_atmos]
-    ["Atmos-only" "SYPD:" cpu_sypd_atmos gpu_sypd_atmos]
-    ["" "CPU max RSS allocs:" cpu_allocs_atmos gpu_cpu_allocs_atmos]
 ]
 
+# Append data to the table for each of the cases we want to compare
+data = append_table_data(data, "Coupled", run_info_coupled...)
+data = append_table_data(data, "Atmos with diag. EDMF", run_info_atmos_diagedmf...)
+data = append_table_data(data, "Atmos without diag. EDMF", run_info_atmos...)
+
 # Use the coupled CPU job ID for the output dir
-table_output_dir = joinpath(output_dir, "compare_$(mode_name)_$(mode_name_atmos)_$(cpu_job_id_coupled)")
+cpu_job_id_coupled = run_info_coupled[1]
+table_output_dir = joinpath(output_dir, "compare_amip_climaatmos_$(cpu_job_id_coupled)")
 !isdir(table_output_dir) && mkdir(table_output_dir)
 table_path = joinpath(table_output_dir, "table.txt")
 open(table_path, "w") do f
     # Output the table, including lines before and after the header
-    PrettyTables.pretty_table(f, data, header = headers, hlines = [0, 3, 6, 9])
+    PrettyTables.pretty_table(f, data, header = headers, hlines = [0, 3, 6, 9, 12]) # TODO don't hardcode hlines
 end
diff --git a/src/Utilities.jl b/src/Utilities.jl
@@ -81,12 +81,12 @@ CPU of this process since it began.
 `comms_ctx`: the communication context being used to run the model
 """
 function show_memory_usage(comms_ctx)
-    cpu_allocs_GB = ""
+    cpu_max_rss_GB = ""
     if ClimaComms.iamroot(comms_ctx)
-        cpu_allocs_GB = "CPU: " * string(round(Sys.maxrss() / 1e9, digits = 3)) * " GiB"
-        @info cpu_allocs_GB
+        cpu_max_rss_GB = string(round(Sys.maxrss() / 1e9, digits = 3)) * " GB" # bytes / 1e9 is decimal GB, not GiB
+        @info cpu_max_rss_GB
     end
-    return cpu_allocs_GB
+    return cpu_max_rss_GB
 end
 
 end # module
diff --git a/test/component_model_tests/climaatmos_standalone/atmos_driver.jl b/test/component_model_tests/climaatmos_standalone/atmos_driver.jl
@@ -67,13 +67,15 @@ es = CA.EfficiencyStats(tspan, walltime)
 sypd = CA.simulated_years_per_day(es)
 @info "SYPD: $sypd"
 
-## Save the SYPD and allocation information
+## Save the SYPD and max RSS information
 comms_ctx = atmos_config.comms_ctx
 if ClimaComms.iamroot(comms_ctx)
-    sypd_filename = joinpath(output_dir, "sypd.txt")
-    write(sypd_filename, "$sypd")
+    open(joinpath(output_dir, "sypd.txt"), "w") do sypd_filename
+        write(sypd_filename, "$sypd")
+    end
 
-    cpu_allocs_GB = Utilities.show_memory_usage(comms_ctx)
-    cpu_allocs_filename = joinpath(output_dir, "allocations_cpu.txt")
-    write(cpu_allocs_filename, cpu_allocs_GB)
+    open(joinpath(output_dir, "max_rss_cpu.txt"), "w") do cpu_max_rss_filename
+        cpu_max_rss_GB = Utilities.show_memory_usage(comms_ctx)
+        write(cpu_max_rss_filename, cpu_max_rss_GB)
+    end
 end
diff --git a/toml/diagnostic_edmfx.toml b/toml/diagnostic_edmfx.toml
@@ -0,0 +1,20 @@
+[entr_inv_tau]
+value = 0.002
+
+[entr_coeff]
+value = 0
+
+[detr_inv_tau]
+value = 0
+
+[detr_buoy_coeff]
+value = 0.12
+
+[detr_vertdiv_coeff]
+value = 0.6
+
+[min_area_limiter_scale]
+value = 0
+
+[max_area_limiter_scale]
+value = 0