Skip to content

Commit

Permalink
some new scaling numbers on Frontier (#2904)
Browse files Browse the repository at this point in the history
  • Loading branch information
zingale authored Jul 12, 2024
1 parent 6b4f1c3 commit 2fe8c1f
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 30 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# scaling numbers for the 3D XRB -- using subch_simple
# using the same inputs.He.25cm.static.1000Hz as previously
# modules:
#
# module load PrgEnv-gnu
# module load cray-mpich/8.1.27
# module load craype-accel-amd-gfx90a
# module load amd-mixed/6.0.0
# module unload darshan-runtime
#
# build info:
#
# EOS: /ccs/home/zingale/Microphysics/EOS/helmholtz
# NETWORK: /ccs/home/zingale/Microphysics/networks/subch_simple
# CONDUCTIVITY: /ccs/home/zingale/Microphysics/conductivity/stellar
# INTEGRATOR: VODE
# SCREENING: screen5
#
# Castro git describe: 24.07
# AMReX git describe: 24.07-3-g7dc2081a9
# Microphysics git describe: 24.07-1-g0a96241b


# nodes   rocm   mag_grid_size   avg time / step   std dev
# 48 6.0 128 --- crashes due to not enough GPU memory ---
64 6.0 128 167.502 0.419448
128 6.0 128 85.4082 0.29416
256 6.0 128 46.4092 0.876868
512 6.0 128 25.5446 0.123848
1024 6.0 128 17.3517 0.0857189
2048 6.0 128 13.564 0.0498023

Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# new scaling numbers for the 3D XRB
# using the same inputs.He.25cm.static.1000Hz as previously
# modules:
#
# module load PrgEnv-gnu
# module load cray-mpich/8.1.27
# module load craype-accel-amd-gfx90a
# module load amd-mixed/6.0.0
# module unload darshan-runtime
#
# build info:
#
# EOS: /ccs/home/zingale/Microphysics/EOS/helmholtz
# NETWORK: /ccs/home/zingale/Microphysics/networks/iso7
# CONDUCTIVITY: /ccs/home/zingale/Microphysics/conductivity/stellar
# INTEGRATOR: VODE
# SCREENING: screen5
#
# Castro git describe: 24.07
# AMReX git describe: 24.07-3-g7dc2081a9
# Microphysics git describe: 24.07-1-g0a96241b

# nodes   rocm   mag_grid_size   avg time / step   std dev
48 6.0 128 69.3646 0.286297
64 6.0 128 50.2029 0.681104
128 6.0 128 28.4001 0.221368
256 6.0 128 15.7771 0.133056
512 6.0 128 9.6077 0.331669
1024 6.0 128 6.66329 0.103599
2048 6.0 128 5.15287 0.0542774

# note that the 2048-node run uses a blocking factor of 16

# in contrast to the previous run, we've disabled all inlining with
# ROCm to get around some compiler bugs, so that might explain some
# slight slowdown here.

Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# new scaling numbers for the 3D XRB
# using the same inputs.He.25cm.static.1000Hz as previously
# modules:
#
# module load PrgEnv-gnu
# module load cray-mpich/8.1.27
# module load craype-accel-amd-gfx90a
# module load amd-mixed/6.0.0
# module unload darshan-runtime
#
# build info:
#
# EOS: /ccs/home/zingale/Microphysics/EOS/helmholtz
# NETWORK: /ccs/home/zingale/Microphysics/networks/iso7
# CONDUCTIVITY: /ccs/home/zingale/Microphysics/conductivity/stellar
# INTEGRATOR: RKC
# SCREENING: screen5
#
# Castro git describe: 24.07
# AMReX git describe: 24.07-3-g7dc2081a9
# Microphysics git describe: 24.07-1-g0a96241b

# nodes   rocm   mag_grid_size   avg time / step   std dev
48 6.0 128 54.7583 0.287289
64 6.0 128 39.4336 0.32115
128 6.0 128 22.8627 0.22725
256 6.0 128 12.9558 0.12379
512 6.0 128 7.9563 0.104311
1024 6.0 128 5.65801 0.109306
2048 6.0 128 4.33241 0.0577508

# note that the 2048-node run uses blocking_factor = 16


# in contrast to the previous run, we've disabled all inlining with
# ROCm to get around some compiler bugs, so that might explain some
# slight slowdown here.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
54 changes: 24 additions & 30 deletions Exec/science/flame_wave/scaling/frontier/frontier_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
plt.rc("axes", linewidth=1.5)
plt.rc("lines", markeredgewidth=1.5)

frontier_data = np.loadtxt("frontier-scaling-2023-04-06.txt")
frontier_data = np.loadtxt("frontier-scaling-2024-07-04.txt")

frontier_nodes = frontier_data[:, 0]
frontier_times = frontier_data[:, 3]
frontier_std = frontier_data[:, 4]

frontier_rkc_data = np.loadtxt("frontier-scaling-rkc-2023-05-31.txt")
frontier_rkc_data = np.loadtxt("frontier-scaling-rkc-2024-07-04.txt")

frontier_rkc_nodes = frontier_rkc_data[:, 0]
frontier_rkc_times = frontier_rkc_data[:, 3]
Expand All @@ -27,6 +27,12 @@
summit_times = summit_data[:, 2]
summit_std = summit_data[:, 3]

frontier_bignet_data = np.loadtxt("frontier-scaling-2024-07-04-subch_simple.txt")

frontier_bignet_nodes = frontier_bignet_data[:, 0]
frontier_bignet_times = frontier_bignet_data[:, 3]
frontier_bignet_std = frontier_bignet_data[:, 4]


def trend_line(c, t):
cnew = np.array(sorted(list(set(c))))
Expand All @@ -39,43 +45,31 @@ def trend_line(c, t):

fig, ax = plt.subplots(1)

ax.errorbar(frontier_nodes, frontier_times, yerr=frontier_std, ls="None", marker="x", label="Frontier (ROCm 5.3)")
ax.errorbar(frontier_rkc_nodes, frontier_rkc_times, yerr=frontier_rkc_std, ls="None", marker="x", label="Frontier (RKC integrator)")
ax.errorbar(summit_nodes, summit_times, yerr=summit_std, ls="None", marker="x", label="Summit (CUDA 11.4)")
ax.errorbar(frontier_nodes, frontier_times, yerr=frontier_std,
ls="None", marker="x", label="Frontier (ROCm 6.0)")
ax.errorbar(frontier_rkc_nodes, frontier_rkc_times, yerr=frontier_rkc_std,
ls="None", marker="x", label="Frontier (ROCm 6.0; RKC integrator)")
ax.errorbar(summit_nodes, summit_times, yerr=summit_std,
ls="None", marker="^", label="Summit (CUDA 11.4)")
ax.errorbar(frontier_bignet_nodes, frontier_bignet_times, yerr=frontier_bignet_std,
ls="None", marker="o", label="Frontier (ROCm 6.0; big network)")

c, t = trend_line(frontier_nodes, frontier_times)
ax.plot(c, t, alpha=0.5, linestyle=":")

ax.set_ylabel("wallclock time / step")
ax.set_xlabel("number of nodes")

ax.set_xscale("log")
ax.set_yscale("log")

ax.legend()
ax.plot(c, t, alpha=0.5, linestyle=":", color="k")

fig.savefig("frontier_flame_wave_scaling.png")


# now by GPUs

fig, ax = plt.subplots(1)
c, t = trend_line(frontier_bignet_nodes, frontier_bignet_times)
ax.plot(c, t, alpha=0.5, linestyle=":", color="k")

nfrontier_gpu = 8
nsummit_gpu = 6

ax.errorbar(frontier_nodes * nfrontier_gpu, frontier_times, yerr=frontier_std, ls="None", marker="x", label="Frontier (ROCm 5.3)")
ax.errorbar(summit_nodes * nsummit_gpu, summit_times, yerr=summit_std, ls="None", marker="x", label="Summit (CUDA 11.4)")

c, t = trend_line(frontier_nodes * nfrontier_gpu, frontier_times)
ax.plot(c, t, alpha=0.5, linestyle=":")

ax.set_ylabel("wallclock time / step")
ax.set_xlabel("number of GPUs")
ax.set_xlabel("number of nodes")

ax.set_xscale("log")
ax.set_yscale("log")

ax.legend()

fig.savefig("frontier_flame_wave_scaling_by_gpus.png")
ax.set_title("3D XRB flame scaling")

fig.tight_layout()
fig.savefig("frontier_flame_wave_scaling.png")

0 comments on commit 2fe8c1f

Please sign in to comment.