Commit 61e5927
[Core] Introduce SPMD worker execution using Ray accelerated DAG (#6032)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Stephanie Wang <swang@cs.berkeley.edu>
1 parent d25877d commit 61e5927
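
Per the new `vllm/envs.py` comment below, this mode runs all workers as separate processes from the engine and drives them through Ray's compiled DAG. As a minimal sketch of enabling it from user code, assuming the standard `vllm.LLM` entrypoint and a 2-GPU Ray setup (the model name and parallel size here are illustrative, not part of this commit):

```python
import os

# Both flags come from this commit's envs.py additions; the CI entries
# below set the same pair. Setting them before importing vllm is the
# safe ordering for the lazy env readers in vllm/envs.py.
os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"

from vllm import LLM

# SPMD mode only applies to the Ray backend; the "mp" backend asserts
# against it (see the llm_engine.py hunk below).
llm = LLM(model="facebook/opt-125m",
          tensor_parallel_size=2,
          distributed_executor_backend="ray")
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```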

File tree

8 files changed: +218 −121 lines changed


.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 0 deletions
```diff
@@ -84,6 +84,8 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
@@ -108,6 +110,7 @@ steps:
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
```
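
The new entries rerun the existing Ray correctness tests with both flags enabled rather than adding a separate test file. A sketch of reproducing one entry locally, assuming a vLLM checkout with the `distributed/` test directory on the working path (CI runs this as a one-line shell command; the subprocess wrapper is only for illustration):

```python
import os
import subprocess

# Same environment-variable convention as the pipeline entry above.
env = dict(
    os.environ,
    TEST_DIST_MODEL="facebook/opt-125m",
    DISTRIBUTED_EXECUTOR_BACKEND="ray",
    VLLM_USE_RAY_SPMD_WORKER="1",
    VLLM_USE_RAY_COMPILED_DAG="1",
)
subprocess.run(
    ["pytest", "-v", "-s", "distributed/test_basic_distributed_correctness.py"],
    env=env,
    check=True,
)
```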

vllm/engine/llm_engine.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -6,6 +6,7 @@
 
 from transformers import PreTrainedTokenizer
 
+import vllm.envs as envs
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
                          LoRAConfig, ModelConfig, MultiModalConfig,
                          ObservabilityConfig, ParallelConfig,
@@ -414,6 +415,9 @@ def from_engine_args(
         elif distributed_executor_backend == "mp":
             from vllm.executor.multiproc_gpu_executor import (
                 MultiprocessingGPUExecutor)
+            assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
+                "multiprocessing distributed executor backend does not "
+                "support VLLM_USE_RAY_SPMD_WORKER=1")
             executor_class = MultiprocessingGPUExecutor
         else:
             from vllm.executor.gpu_executor import GPUExecutor
@@ -426,6 +430,7 @@ def from_engine_args(
             usage_context=usage_context,
             stat_loggers=stat_loggers,
         )
+
         return engine
 
     def __reduce__(self):
```
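
The new assert fails fast when the flag is combined with the multiprocessing backend, since SPMD execution is only implemented on the Ray path. A standalone sketch of the resulting failure mode (the backend value is hard-coded here for illustration; in real use it comes from the engine arguments):

```python
import os

os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"

import vllm.envs as envs

# Mirrors the from_engine_args() branch above with the backend pinned
# to "mp".
distributed_executor_backend = "mp"
if distributed_executor_backend == "mp":
    # Raises AssertionError with the message below.
    assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
        "multiprocessing distributed executor backend does not "
        "support VLLM_USE_RAY_SPMD_WORKER=1")
```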

vllm/envs.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -34,6 +34,7 @@
     VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
     VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    VLLM_USE_RAY_SPMD_WORKER: bool = False
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
@@ -261,6 +262,13 @@ def get_default_config_root():
     "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
     lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
 
+    # If the env var is set, then all workers will execute as separate
+    # processes from the engine, and we use the same mechanism to trigger
+    # execution on all workers.
+    # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
+    "VLLM_USE_RAY_SPMD_WORKER":
+    lambda: bool(os.getenv("VLLM_USE_RAY_SPMD_WORKER", 0)),
+
     # If the env var is set, it uses the Ray's compiled DAG API
     # which optimizes the control plane overhead.
     # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
```
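
One subtlety this entry inherits from the neighboring `VLLM_USE_RAY_COMPILED_DAG` parser: `bool()` over `os.getenv()` tests string emptiness, not numeric value, so any non-empty setting, including `"0"`, reads as enabled. A quick demonstration:

```python
import os

# bool() on a string tests emptiness, not numeric value: any non-empty
# setting, even "0", enables the flag.
os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "0"
print(bool(os.getenv("VLLM_USE_RAY_SPMD_WORKER", 0)))  # True

del os.environ["VLLM_USE_RAY_SPMD_WORKER"]
print(bool(os.getenv("VLLM_USE_RAY_SPMD_WORKER", 0)))  # False (int default 0)
```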

vllm/executor/distributed_gpu_executor.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -64,16 +64,18 @@ def initialize_cache(self, num_gpu_blocks: int,
                              num_cpu_blocks=num_cpu_blocks)
 
     def execute_model(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> Optional[List[SamplerOutput]]:
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
         if self.parallel_worker_tasks is None:
             self.parallel_worker_tasks = self._run_workers(
                 "start_worker_execution_loop",
                 async_run_tensor_parallel_workers_only=True,
                 **self.extra_execute_model_run_workers_kwargs)
 
         # Only the driver worker returns the sampling results.
-        return self._driver_execute_model(execute_model_req)
+        driver_outputs = self._driver_execute_model(execute_model_req)
+        assert driver_outputs is not None
+        return driver_outputs
 
     def stop_remote_worker_execution_loop(self) -> None:
         if self.parallel_worker_tasks is None:
```
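
The tightened signature drops `Optional` from `execute_model`'s return type: the driver helper's declared return stays `Optional[List[SamplerOutput]]`, and the new assert is what discharges the `None` case, both at runtime and for static checkers. A toy illustration of the narrowing pattern (the names here are stand-ins, not vLLM's API):

```python
from typing import List, Optional

# Toy stand-in for _driver_execute_model: the declared type is Optional
# even though this call path always produces outputs.
def driver_step() -> Optional[List[int]]:
    return [1, 2, 3]

def execute_model() -> List[int]:
    outputs = driver_step()
    # Documents the invariant and lets type checkers (e.g. mypy) narrow
    # Optional[List[int]] to List[int], matching the new signature above.
    assert outputs is not None
    return outputs

print(execute_model())  # [1, 2, 3]
```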
