Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

some new scaling numbers on Frontier #2904

Merged
merged 7 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# scaling numbers for the 3D XRB -- using subch_simple
# using the same inputs.He.25cm.static.1000Hz as previously
# modules:
#
# module load PrgEnv-gnu
# module load cray-mpich/8.1.27
# module load craype-accel-amd-gfx90a
# module load amd-mixed/6.0.0
# module unload darshan-runtime
#
# build info:
#
# EOS: /ccs/home/zingale/Microphysics/EOS/helmholtz
# NETWORK: /ccs/home/zingale/Microphysics/networks/subch_simple
# CONDUCTIVITY: /ccs/home/zingale/Microphysics/conductivity/stellar
# INTEGRATOR: VODE
# SCREENING: screen5
#
# Castro git describe: 24.07
# AMReX git describe: 24.07-3-g7dc2081a9
# Microphysics git describe: 24.07-1-g0a96241b


# nodes rocm max_grid_size avg time / std dev
# step
# 48 6.0 128 --- crashes due to not enough GPU memory ---
64 6.0 128 167.502 0.419448
128 6.0 128 85.4082 0.29416
256 6.0 128 46.4092 0.876868
512 6.0 128 25.5446 0.123848
1024 6.0 128 17.3517 0.0857189
2048 6.0 128 13.564 0.0498023

Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# new scaling numbers for the 3D XRB
# using the same inputs.He.25cm.static.1000Hz as previously
# modules:
#
# module load PrgEnv-gnu
# module load cray-mpich/8.1.27
# module load craype-accel-amd-gfx90a
# module load amd-mixed/6.0.0
# module unload darshan-runtime
#
# build info:
#
# EOS: /ccs/home/zingale/Microphysics/EOS/helmholtz
# NETWORK: /ccs/home/zingale/Microphysics/networks/iso7
# CONDUCTIVITY: /ccs/home/zingale/Microphysics/conductivity/stellar
# INTEGRATOR: VODE
# SCREENING: screen5
#
# Castro git describe: 24.07
# AMReX git describe: 24.07-3-g7dc2081a9
# Microphysics git describe: 24.07-1-g0a96241b

# nodes rocm max_grid_size avg time / std dev
# step
48 6.0 128 69.3646 0.286297
64 6.0 128 50.2029 0.681104
128 6.0 128 28.4001 0.221368
256 6.0 128 15.7771 0.133056
512 6.0 128 9.6077 0.331669
1024 6.0 128 6.66329 0.103599
2048 6.0 128 5.15287 0.0542774

# note that the 2048-node run uses a blocking factor of 16

# in contrast to the previous run, we've disabled all inlining with
# ROCm to get around some compiler bugs, so that might explain some
# slight slowdown here.

Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# new scaling numbers for the 3D XRB
# using the same inputs.He.25cm.static.1000Hz as previously
# modules:
#
# module load PrgEnv-gnu
# module load cray-mpich/8.1.27
# module load craype-accel-amd-gfx90a
# module load amd-mixed/6.0.0
# module unload darshan-runtime
#
# build info:
#
# EOS: /ccs/home/zingale/Microphysics/EOS/helmholtz
# NETWORK: /ccs/home/zingale/Microphysics/networks/iso7
# CONDUCTIVITY: /ccs/home/zingale/Microphysics/conductivity/stellar
# INTEGRATOR: RKC
# SCREENING: screen5
#
# Castro git describe: 24.07
# AMReX git describe: 24.07-3-g7dc2081a9
# Microphysics git describe: 24.07-1-g0a96241b

# nodes rocm max_grid_size avg time / std dev
# step
48 6.0 128 54.7583 0.287289
64 6.0 128 39.4336 0.32115
128 6.0 128 22.8627 0.22725
256 6.0 128 12.9558 0.12379
512 6.0 128 7.9563 0.104311
1024 6.0 128 5.65801 0.109306
2048 6.0 128 4.33241 0.0577508

# note that the 2048-node run uses blocking_factor = 16


# in contrast to the previous run, we've disabled all inlining with
# ROCm to get around some compiler bugs, so that might explain some
# slight slowdown here.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
54 changes: 24 additions & 30 deletions Exec/science/flame_wave/scaling/frontier/frontier_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
plt.rc("axes", linewidth=1.5)
plt.rc("lines", markeredgewidth=1.5)

frontier_data = np.loadtxt("frontier-scaling-2023-04-06.txt")
frontier_data = np.loadtxt("frontier-scaling-2024-07-04.txt")

frontier_nodes = frontier_data[:, 0]
frontier_times = frontier_data[:, 3]
frontier_std = frontier_data[:, 4]

frontier_rkc_data = np.loadtxt("frontier-scaling-rkc-2023-05-31.txt")
frontier_rkc_data = np.loadtxt("frontier-scaling-rkc-2024-07-04.txt")

frontier_rkc_nodes = frontier_rkc_data[:, 0]
frontier_rkc_times = frontier_rkc_data[:, 3]
Expand All @@ -27,6 +27,12 @@
summit_times = summit_data[:, 2]
summit_std = summit_data[:, 3]

frontier_bignet_data = np.loadtxt("frontier-scaling-2024-07-04-subch_simple.txt")

frontier_bignet_nodes = frontier_bignet_data[:, 0]
frontier_bignet_times = frontier_bignet_data[:, 3]
frontier_bignet_std = frontier_bignet_data[:, 4]


def trend_line(c, t):
cnew = np.array(sorted(list(set(c))))
Expand All @@ -39,43 +45,31 @@ def trend_line(c, t):

fig, ax = plt.subplots(1)

ax.errorbar(frontier_nodes, frontier_times, yerr=frontier_std, ls="None", marker="x", label="Frontier (ROCm 5.3)")
ax.errorbar(frontier_rkc_nodes, frontier_rkc_times, yerr=frontier_rkc_std, ls="None", marker="x", label="Frontier (RKC integrator)")
ax.errorbar(summit_nodes, summit_times, yerr=summit_std, ls="None", marker="x", label="Summit (CUDA 11.4)")
ax.errorbar(frontier_nodes, frontier_times, yerr=frontier_std,
ls="None", marker="x", label="Frontier (ROCm 6.0)")
ax.errorbar(frontier_rkc_nodes, frontier_rkc_times, yerr=frontier_rkc_std,
ls="None", marker="x", label="Frontier (ROCm 6.0; RKC integrator)")
ax.errorbar(summit_nodes, summit_times, yerr=summit_std,
ls="None", marker="^", label="Summit (CUDA 11.4)")
ax.errorbar(frontier_bignet_nodes, frontier_bignet_times, yerr=frontier_bignet_std,
ls="None", marker="o", label="Frontier (ROCm 6.0; big network)")

c, t = trend_line(frontier_nodes, frontier_times)
ax.plot(c, t, alpha=0.5, linestyle=":")

ax.set_ylabel("wallclock time / step")
ax.set_xlabel("number of nodes")

ax.set_xscale("log")
ax.set_yscale("log")

ax.legend()
ax.plot(c, t, alpha=0.5, linestyle=":", color="k")

fig.savefig("frontier_flame_wave_scaling.png")


# now by GPUs

fig, ax = plt.subplots(1)
c, t = trend_line(frontier_bignet_nodes, frontier_bignet_times)
ax.plot(c, t, alpha=0.5, linestyle=":", color="k")

nfrontier_gpu = 8
nsummit_gpu = 6

ax.errorbar(frontier_nodes * nfrontier_gpu, frontier_times, yerr=frontier_std, ls="None", marker="x", label="Frontier (ROCm 5.3)")
ax.errorbar(summit_nodes * nsummit_gpu, summit_times, yerr=summit_std, ls="None", marker="x", label="Summit (CUDA 11.4)")

c, t = trend_line(frontier_nodes * nfrontier_gpu, frontier_times)
ax.plot(c, t, alpha=0.5, linestyle=":")

ax.set_ylabel("wallclock time / step")
ax.set_xlabel("number of GPUs")
ax.set_xlabel("number of nodes")

ax.set_xscale("log")
ax.set_yscale("log")

ax.legend()

fig.savefig("frontier_flame_wave_scaling_by_gpus.png")
ax.set_title("3D XRB flame scaling")

fig.tight_layout()
fig.savefig("frontier_flame_wave_scaling.png")