
Commit 41708e5

youkaichao and Muralidhar Andoorveedu authored
[ci] try to add multi-node tests (#6280)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
1 parent d80aef3 commit 41708e5

File tree: 13 files changed, +230 -275 lines changed


.buildkite/run-multi-node-test.sh

Lines changed: 40 additions & 12 deletions
@@ -2,16 +2,17 @@
 
 set -euox pipefail
 
-if [[ $# -lt 3 ]]; then
-  echo "Please provide the number of nodes and GPU per node."
+if [[ $# -lt 4 ]]; then
+  echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
   exit 1
 fi
 
-NUM_NODES=$1
-NUM_GPUS=$2
-DOCKER_IMAGE=$3
+WORKING_DIR=$1
+NUM_NODES=$2
+NUM_GPUS=$3
+DOCKER_IMAGE=$4
 
-shift 3
+shift 4
 COMMANDS=("$@")
 if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
   echo "The number of commands must be equal to the number of nodes."
@@ -40,13 +41,40 @@ start_nodes() {
       fi
     done
     GPU_DEVICES+='"'
-    # echo "Starting node$node with GPU devices: $GPU_DEVICES"
-    docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null
+
+    # start the container in detached mode
+    # things to note:
+    # 1. --shm-size=10.24gb is required. don't use --ipc=host
+    # 2. pass HF_TOKEN to the container
+    # 3. map the huggingface cache directory to the container
+    # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
+    # starting from 192.168.10.11)
+    docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
+
+    # organize containers into a ray cluster
+    if [ $node -eq 0 ]; then
+      # start the ray head node
+      docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+      # wait for the head node to be ready
+      sleep 10
+    else
+      # start the ray worker nodes, and connect them to the head node
+      docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+    fi
   done
+
+  # wait for the cluster to be ready
+  sleep 10
+
+  # print the cluster status
+  docker exec node0 /bin/bash -c "ray status"
 }
 
 run_nodes() {
-  for node in $(seq 0 $(($NUM_NODES-1))); do
+  # important: iterate in reverse order to start the head node last
+  # we start the worker nodes first, in detached mode, and then start the head node
+  # in the foreground, so that the output of the head node is visible in the buildkite logs
+  for node in $(seq $(($NUM_NODES - 1)) -1 0); do
    GPU_DEVICES='"device='
    for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
      DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
@@ -57,10 +85,10 @@ run_nodes() {
    done
    GPU_DEVICES+='"'
    echo "Running node$node with GPU devices: $GPU_DEVICES"
-   if [ $node -lt $(($NUM_NODES - 1)) ]; then
-     docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
+   if [ $node -ne 0 ]; then
+     docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
    else
-     docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
+     docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
    fi
  done
 }
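
Note on the new calling convention: the script now takes the working directory first, then the node count, GPUs per node, Docker image, and one command string per node. Per the comments in the diff, the first command belongs to the head node (192.168.10.10) and runs in the foreground, while the remaining commands run detached in the worker containers. A minimal invocation sketch; the image tag and per-node commands below are placeholders, not values from this commit:

# hypothetical example, not part of the diff: 2 nodes, 2 GPUs per node
.buildkite/run-multi-node-test.sh \
  /vllm-workspace/tests 2 2 vllm-ci-test-image:latest \
  "pytest -v -s distributed/test_pipeline_parallel.py" \
  "tail -f /dev/null"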

.buildkite/test-pipeline.yaml

Lines changed: 15 additions & 1 deletion
@@ -68,6 +68,17 @@ steps:
   - pytest -v -s distributed/test_comm_ops.py
   - pytest -v -s distributed/test_shm_broadcast.py
 
+- label: 2 Node Tests (4 GPUs in total)
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+
 - label: Distributed Tests (2 GPUs)
   mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
@@ -213,7 +224,10 @@ steps:
 
 - label: Tensorizer Test
   #mirror_hardwares: [amd]
-  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+  commands:
+  - apt-get install curl libsodium23
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s tensorizer_loader
 
 - label: Metrics Test
   mirror_hardwares: [amd]
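
How the new num_nodes / num_gpus fields get turned into an actual multi-node run is not visible in this diff; presumably the pipeline generator hands them, together with the per-node command lists, to the .buildkite/run-multi-node-test.sh script added above. A speculative sketch of what that generated call might look like for the "2 Node Tests" step, where $DOCKER_IMAGE stands in for whatever image the pipeline builds and the joining of each node's commands into a single string is an assumption:

# speculative sketch, not generated by this commit
.buildkite/run-multi-node-test.sh \
  /vllm-workspace/tests 2 2 "$DOCKER_IMAGE" \
  "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py && TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py" \
  "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py"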

tests/async_engine/test_openapi_server_ray.py

Lines changed: 14 additions & 23 deletions
@@ -1,35 +1,26 @@
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--max-model-len",
-        "2048",
-        "--enforce-eager",
-        "--engine-use-ray"
-    ])
+def server():
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "float16",
+            "--max-model-len",
+            "2048",
+            "--enforce-eager",
+            "--engine-use-ray"
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")

tests/distributed/test_pipeline_parallel.py

Lines changed: 4 additions & 13 deletions
@@ -2,11 +2,8 @@
 
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # downloading lora to test lora requests
 
@@ -21,14 +18,7 @@
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
     args = [
         "--model",
         MODEL_NAME,
@@ -50,7 +40,8 @@ def server(ray_ctx):
     args += [
         "--enforce-eager",
     ]
-    return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE)
+    with RemoteOpenAIServer(args) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")

tests/distributed/test_same_node.py

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@
 
 expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
 assert test_result == expected, f"Expected {expected}, got {test_result}"
+print("Same node test passed!")

tests/entrypoints/openai/test_chat.py

Lines changed: 24 additions & 33 deletions
@@ -6,15 +6,12 @@
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 import torch
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -29,35 +26,29 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "128",
-    ])
+def server(zephyr_lora_files):
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "8192",
+            "--enforce-eager",
+            # lora config below
+            "--enable-lora",
+            "--lora-modules",
+            f"zephyr-lora={zephyr_lora_files}",
+            f"zephyr-lora2={zephyr_lora_files}",
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+            "--max-num-seqs",
+            "128",
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")

tests/entrypoints/openai/test_completion.py

Lines changed: 24 additions & 33 deletions
@@ -6,17 +6,14 @@
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 import requests
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -31,35 +28,29 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "128",
-    ])
+def server(zephyr_lora_files):
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "8192",
+            "--enforce-eager",
+            # lora config below
+            "--enable-lora",
+            "--lora-modules",
+            f"zephyr-lora={zephyr_lora_files}",
+            f"zephyr-lora2={zephyr_lora_files}",
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+            "--max-num-seqs",
+            "128",
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")
