Skip to content

Commit 34cf245

Browse files
authored Apr 23, 2025
Fix nccl-tests-ami.sbatch
1 parent cc161b1 commit 34cf245

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed
 

‎micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,8 @@ set -ex
1313

1414
# This script is designed to run by default on the Deep Learning AMI, Ubuntu 20.04
1515
# See https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-20-04/
16-
ALL_REDUCE_BINARY=${1:-/usr/local/cuda-12.3/efa/test-cuda-12.3/all_reduce_perf}
17-
ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-12.3/lib}
16+
ALL_REDUCE_BINARY=${1:-/usr/local/cuda-12.4/efa/test-cuda-12.4/all_reduce_perf}
17+
ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-12.4/lib}
1818

1919
# Get Hostname to Instance ID mapping
2020
mpirun -N 1 bash -c 'echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'
@@ -35,12 +35,12 @@ mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \
3535
-x FI_PROVIDER=efa \
3636
-x FI_EFA_USE_DEVICE_RDMA=1 \
3737
-x FI_EFA_FORK_SAFE=1 \
38-
-x LD_LIBRARY_PATH=$ADDITIONAL_LD_LIBRARY_PATH:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
38+
-x LD_LIBRARY_PATH=$ADDITIONAL_LD_LIBRARY_PATH:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/amazon/ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
3939
-x NCCL_DEBUG=INFO \
4040
-x NCCL_SOCKET_IFNAME=^docker,lo,veth \
4141
-x NCCL_BUFFSIZE=8388608 \
4242
-x NCCL_P2P_NET_CHUNKSIZE=524288 \
43-
-x NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/lib/libnccl-ofi-tuner.so \
43+
-x NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \
4444
--mca pml ^ucx \
4545
--mca btl tcp,self \
4646
--mca btl_tcp_if_exclude lo,docker0,veth_def_agent \

0 commit comments

Comments
 (0)