
Commit 5419a10

russellb authored and SzymonOzog committed
[Core] Don't do platform detection at import time (vllm-project#12933)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
1 parent cd836cb commit 5419a10

File tree

vllm/executor/executor_base.py
vllm/executor/ray_utils.py
vllm/platforms/cuda.py

3 files changed: +8 -8 lines changed
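The change is about binding time: `from vllm.platforms import current_platform` resolves the attribute while the importing module is itself being imported, so platform detection runs as an import side effect; `import vllm.platforms` followed by `vllm.platforms.current_platform` at the call site defers the lookup until that line actually executes. Below is a minimal sketch of the deferral mechanism using PEP 562's module-level `__getattr__`; the module name `platforms_demo` and the probe `_detect_platform` are hypothetical stand-ins, not vLLM's actual implementation:

    # platforms_demo.py -- illustrative lazy-attribute module, not vLLM code
    import functools

    @functools.cache
    def _detect_platform() -> str:
        # Hypothetical stand-in for probing CUDA/ROCm/CPU on first use.
        print("running platform detection ...")
        return "cuda"

    def __getattr__(name: str):
        # Called only for attributes not found in the module (PEP 562).
        if name == "current_platform":
            return _detect_platform()
        raise AttributeError(name)

`import platforms_demo` prints nothing; the first access to `platforms_demo.current_platform` runs the probe, and `functools.cache` makes every later access return the memoized result.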

vllm/executor/executor_base.py

Lines changed: 3 additions & 3 deletions

@@ -8,11 +8,11 @@
 import torch.nn as nn
 from typing_extensions import TypeVar
 
+import vllm.platforms
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, PoolerOutput
 from vllm.utils import make_async
@@ -108,8 +108,8 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         """
         # NOTE: This is logged in the executor because there can be >1 workers.
         logger.info("# %s blocks: %d, # CPU blocks: %d",
-                    current_platform.dispatch_key, num_gpu_blocks,
-                    num_cpu_blocks)
+                    vllm.platforms.current_platform.dispatch_key,
+                    num_gpu_blocks, num_cpu_blocks)
         max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
                            self.model_config.max_model_len)
         logger.info("Maximum concurrency for %s tokens per request: %.2fx",

vllm/executor/ray_utils.py

Lines changed: 3 additions & 3 deletions

@@ -7,10 +7,10 @@
 
 import msgspec
 
+import vllm.platforms
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -54,10 +54,10 @@ def get_node_ip(self) -> str:
 
     def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
         node_id = ray.get_runtime_context().get_node_id()
-        device_key = current_platform.ray_device_key
+        device_key = vllm.platforms.current_platform.ray_device_key
         if not device_key:
             raise RuntimeError("current platform %s does not support ray.",
-                               current_platform.device_name)
+                               vllm.platforms.current_platform.device_name)
         gpu_ids = ray.get_runtime_context().get_accelerator_ids(
         )[device_key]
         return node_id, gpu_ids
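In ray_utils.py the deferral matters across process boundaries: the module is imported on the Ray driver, which may not have the accelerators the workers do, so resolving `current_platform` inside `get_node_and_gpu_ids` lets each process probe its own environment. A sketch of that boundary, assuming Ray is installed and reusing the hypothetical `platforms_demo` module (the `Worker` actor is illustrative):

    import ray

    @ray.remote
    class Worker:
        def device(self) -> str:
            import platforms_demo  # detection runs here, in the actor process
            return platforms_demo.current_platform

    ray.init()
    print(ray.get(Worker.remote().device.remote()))  # probed on the worker
    ray.shutdown()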

vllm/platforms/cuda.py

Lines changed: 2 additions & 2 deletions

@@ -334,10 +334,10 @@ def log_warnings(cls):
         if (len(set(device_names)) > 1
                 and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
             logger.warning(
-                "Detected different devices in the system: \n%s\nPlease"
+                "Detected different devices in the system: %s. Please"
                 " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
                 "avoid unexpected behavior.",
-                "\n".join(device_names),
+                ", ".join(device_names),
             )
 
 
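The cuda.py hunk only reshapes the warning: joining device names with ", " instead of "\n" keeps the whole message on one log line. A quick reproduction with the standard logging module; the logger name and device list are made up:

    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("cuda-demo")
    device_names = ["NVIDIA A100-SXM4-80GB", "NVIDIA T4"]
    logger.warning(
        "Detected different devices in the system: %s. Please"
        " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
        "avoid unexpected behavior.",
        ", ".join(device_names),
    )
    # Emits one line: "...in the system: NVIDIA A100-SXM4-80GB, NVIDIA T4.
    # Please make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to avoid
    # unexpected behavior."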
