File tree 1 file changed +4
-4
lines changed
micro-benchmarks/nccl-tests/slurm 1 file changed +4
-4
lines changed Original file line number Diff line number Diff line change @@ -13,8 +13,8 @@ set -ex
13
13
14
14
# This script is designed to run by default on the Deep Learning AMI, Ubuntu 20.04
15
15
# See https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-20-04/
16
- ALL_REDUCE_BINARY=${1:-/usr/local/cuda-12.3/efa/test-cuda-12.3/all_reduce_perf}
17
- ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-12.3/lib}
16
+ ALL_REDUCE_BINARY=${1:-/usr/local/cuda-12.4/efa/test-cuda-12.4/all_reduce_perf}
17
+ ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-12.4/lib}
18
18
19
19
# Get Hostname to Instance ID mapping
20
20
mpirun -N 1 bash -c ' echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'
@@ -35,12 +35,12 @@ mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \
35
35
-x FI_PROVIDER=efa \
36
36
-x FI_EFA_USE_DEVICE_RDMA=1 \
37
37
-x FI_EFA_FORK_SAFE=1 \
38
- -x LD_LIBRARY_PATH=$ADDITIONAL_LD_LIBRARY_PATH:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
38
+ -x LD_LIBRARY_PATH=$ADDITIONAL_LD_LIBRARY_PATH:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/amazon/ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
39
39
-x NCCL_DEBUG=INFO \
40
40
-x NCCL_SOCKET_IFNAME=^docker,lo,veth \
41
41
-x NCCL_BUFFSIZE=8388608 \
42
42
-x NCCL_P2P_NET_CHUNKSIZE=524288 \
43
- -x NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/lib/libnccl-ofi-tuner.so \
43
+ -x NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/libnccl-ofi-tuner.so \
44
44
--mca pml ^ucx \
45
45
--mca btl tcp,self \
46
46
--mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
You can’t perform that action at this time.
0 commit comments