
Commit 41708e5

youkaichao and Muralidhar Andoorveedu authored
[ci] try to add multi-node tests (#6280)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
1 parent d80aef3 commit 41708e5

File tree: 13 files changed, +230 -275 lines changed


.buildkite/run-multi-node-test.sh

Lines changed: 40 additions & 12 deletions
@@ -2,16 +2,17 @@
 
 set -euox pipefail
 
-if [[ $# -lt 3 ]]; then
-  echo "Please provide the number of nodes and GPU per node."
+if [[ $# -lt 4 ]]; then
+  echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
   exit 1
 fi
 
-NUM_NODES=$1
-NUM_GPUS=$2
-DOCKER_IMAGE=$3
+WORKING_DIR=$1
+NUM_NODES=$2
+NUM_GPUS=$3
+DOCKER_IMAGE=$4
 
-shift 3
+shift 4
 COMMANDS=("$@")
 if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
   echo "The number of commands must be equal to the number of nodes."
@@ -40,13 +41,40 @@ start_nodes() {
       fi
     done
     GPU_DEVICES+='"'
-    # echo "Starting node$node with GPU devices: $GPU_DEVICES"
-    docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null
+
+    # start the container in detached mode
+    # things to note:
+    # 1. --shm-size=10.24gb is required. don't use --ipc=host
+    # 2. pass HF_TOKEN to the container
+    # 3. map the huggingface cache directory to the container
+    # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
+    # starting from 192.168.10.11)
+    docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
+
+    # organize containers into a ray cluster
+    if [ $node -eq 0 ]; then
+      # start the ray head node
+      docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+      # wait for the head node to be ready
+      sleep 10
+    else
+      # start the ray worker nodes, and connect them to the head node
+      docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+    fi
   done
+
+  # wait for the cluster to be ready
+  sleep 10
+
+  # print the cluster status
+  docker exec node0 /bin/bash -c "ray status"
 }
 
 run_nodes() {
-  for node in $(seq 0 $(($NUM_NODES-1))); do
+  # important: iterate in reverse order to start the head node last
+  # we start the worker nodes first, in detached mode, and then start the head node
+  # in the foreground, so that the output of the head node is visible in the buildkite logs
+  for node in $(seq $(($NUM_NODES - 1)) -1 0); do
    GPU_DEVICES='"device='
    for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
      DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
@@ -57,10 +85,10 @@ run_nodes() {
    done
    GPU_DEVICES+='"'
    echo "Running node$node with GPU devices: $GPU_DEVICES"
-   if [ $node -lt $(($NUM_NODES - 1)) ]; then
-     docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
+   if [ $node -ne 0 ]; then
+     docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
    else
-     docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
+     docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
    fi
  done
 }
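
Note on the new calling convention: the script now takes the working directory first, then the node count, GPUs per node, Docker image, and one command string per node. Per the comments in the diff, the first command belongs to the head node (192.168.10.10) and runs in the foreground, while the remaining commands run detached in the worker containers. A minimal invocation sketch; the image tag and per-node commands below are placeholders, not values from this commit:

# hypothetical example, not part of the diff: 2 nodes, 2 GPUs per node
.buildkite/run-multi-node-test.sh \
  /vllm-workspace/tests 2 2 vllm-ci-test-image:latest \
  "pytest -v -s distributed/test_pipeline_parallel.py" \
  "tail -f /dev/null"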

.buildkite/test-pipeline.yaml

Lines changed: 15 additions & 1 deletion
@@ -68,6 +68,17 @@ steps:
   - pytest -v -s distributed/test_comm_ops.py
   - pytest -v -s distributed/test_shm_broadcast.py
 
+- label: 2 Node Tests (4 GPUs in total)
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+
 - label: Distributed Tests (2 GPUs)
   mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
@@ -213,7 +224,10 @@ steps:
 
 - label: Tensorizer Test
   #mirror_hardwares: [amd]
-  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+  commands:
+  - apt-get install curl libsodium23
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s tensorizer_loader
 
 - label: Metrics Test
   mirror_hardwares: [amd]
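
How the new num_nodes / num_gpus fields get turned into an actual multi-node run is not visible in this diff; presumably the pipeline generator hands them, together with the per-node command lists, to the .buildkite/run-multi-node-test.sh script added above. A speculative sketch of what that generated call might look like for the "2 Node Tests" step, where $DOCKER_IMAGE stands in for whatever image the pipeline builds and the joining of each node's commands into a single string is an assumption:

# speculative sketch, not generated by this commit
.buildkite/run-multi-node-test.sh \
  /vllm-workspace/tests 2 2 "$DOCKER_IMAGE" \
  "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py && TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py" \
  "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py"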

tests/async_engine/test_openapi_server_ray.py

Lines changed: 14 additions & 23 deletions
@@ -1,35 +1,26 @@
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--max-model-len",
-        "2048",
-        "--enforce-eager",
-        "--engine-use-ray"
-    ])
+def server():
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "float16",
+            "--max-model-len",
+            "2048",
+            "--enforce-eager",
+            "--engine-use-ray"
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")

tests/distributed/test_pipeline_parallel.py

Lines changed: 4 additions & 13 deletions
@@ -2,11 +2,8 @@
 
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 # downloading lora to test lora requests
 
@@ -21,14 +18,7 @@
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
     args = [
         "--model",
         MODEL_NAME,
@@ -50,7 +40,8 @@ def server(ray_ctx):
     args += [
         "--enforce-eager",
     ]
-    return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE)
+    with RemoteOpenAIServer(args) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")

tests/distributed/test_same_node.py

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@
 
 expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
 assert test_result == expected, f"Expected {expected}, got {test_result}"
+print("Same node test passed!")

tests/entrypoints/openai/test_chat.py

Lines changed: 24 additions & 33 deletions
@@ -6,15 +6,12 @@
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 import torch
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -29,35 +26,29 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "128",
-    ])
+def server(zephyr_lora_files):
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "8192",
+            "--enforce-eager",
+            # lora config below
+            "--enable-lora",
+            "--lora-modules",
+            f"zephyr-lora={zephyr_lora_files}",
+            f"zephyr-lora2={zephyr_lora_files}",
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+            "--max-num-seqs",
+            "128",
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")

tests/entrypoints/openai/test_completion.py

Lines changed: 24 additions & 33 deletions
@@ -6,17 +6,14 @@
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
 import requests
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -31,35 +28,29 @@ def zephyr_lora_files():
 
 
 @pytest.fixture(scope="module")
-def ray_ctx():
-    ray.init(runtime_env={"working_dir": VLLM_PATH})
-    yield
-    ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
-    return RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "128",
-    ])
+def server(zephyr_lora_files):
+    with RemoteOpenAIServer([
+            "--model",
+            MODEL_NAME,
+            # use half precision for speed and memory savings in CI environment
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "8192",
+            "--enforce-eager",
+            # lora config below
+            "--enable-lora",
+            "--lora-modules",
+            f"zephyr-lora={zephyr_lora_files}",
+            f"zephyr-lora2={zephyr_lora_files}",
+            "--max-lora-rank",
+            "64",
+            "--max-cpu-loras",
+            "2",
+            "--max-num-seqs",
+            "128",
+    ]) as remote_server:
+        yield remote_server
 
 
 @pytest.fixture(scope="module")
