
Commit 5419a10

russellb authored and SzymonOzog committed
[Core] Don't do platform detection at import time (vllm-project#12933)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
1 parent cd836cb commit 5419a10

File tree

vllm/executor/executor_base.py
vllm/executor/ray_utils.py
vllm/platforms/cuda.py

3 files changed: +8 -8 lines changed
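The change is about binding time: `from vllm.platforms import current_platform` resolves the attribute while the importing module is itself being imported, so platform detection runs as an import side effect; `import vllm.platforms` followed by `vllm.platforms.current_platform` at the call site defers the lookup until that line actually executes. Below is a minimal sketch of the deferral mechanism using PEP 562's module-level `__getattr__`; the module name `platforms_demo` and the probe `_detect_platform` are hypothetical stand-ins, not vLLM's actual implementation:

    # platforms_demo.py -- illustrative lazy-attribute module, not vLLM code
    import functools

    @functools.cache
    def _detect_platform() -> str:
        # Hypothetical stand-in for probing CUDA/ROCm/CPU on first use.
        print("running platform detection ...")
        return "cuda"

    def __getattr__(name: str):
        # Called only for attributes not found in the module (PEP 562).
        if name == "current_platform":
            return _detect_platform()
        raise AttributeError(name)

`import platforms_demo` prints nothing; the first access to `platforms_demo.current_platform` runs the probe, and `functools.cache` makes every later access return the memoized result.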

vllm/executor/executor_base.py

Lines changed: 3 additions & 3 deletions

@@ -8,11 +8,11 @@
 import torch.nn as nn
 from typing_extensions import TypeVar
 
+import vllm.platforms
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, PoolerOutput
 from vllm.utils import make_async
@@ -108,8 +108,8 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         """
         # NOTE: This is logged in the executor because there can be >1 workers.
         logger.info("# %s blocks: %d, # CPU blocks: %d",
-                    current_platform.dispatch_key, num_gpu_blocks,
-                    num_cpu_blocks)
+                    vllm.platforms.current_platform.dispatch_key,
+                    num_gpu_blocks, num_cpu_blocks)
         max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
                            self.model_config.max_model_len)
         logger.info("Maximum concurrency for %s tokens per request: %.2fx",

vllm/executor/ray_utils.py

Lines changed: 3 additions & 3 deletions

@@ -7,10 +7,10 @@
 
 import msgspec
 
+import vllm.platforms
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -54,10 +54,10 @@ def get_node_ip(self) -> str:
 
     def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
         node_id = ray.get_runtime_context().get_node_id()
-        device_key = current_platform.ray_device_key
+        device_key = vllm.platforms.current_platform.ray_device_key
         if not device_key:
             raise RuntimeError("current platform %s does not support ray.",
-                               current_platform.device_name)
+                               vllm.platforms.current_platform.device_name)
         gpu_ids = ray.get_runtime_context().get_accelerator_ids(
         )[device_key]
         return node_id, gpu_ids
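In ray_utils.py the deferral matters across process boundaries: the module is imported on the Ray driver, which may not have the accelerators the workers do, so resolving `current_platform` inside `get_node_and_gpu_ids` lets each process probe its own environment. A sketch of that boundary, assuming Ray is installed and reusing the hypothetical `platforms_demo` module (the `Worker` actor is illustrative):

    import ray

    @ray.remote
    class Worker:
        def device(self) -> str:
            import platforms_demo  # detection runs here, in the actor process
            return platforms_demo.current_platform

    ray.init()
    print(ray.get(Worker.remote().device.remote()))  # probed on the worker
    ray.shutdown()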

vllm/platforms/cuda.py

Lines changed: 2 additions & 2 deletions

@@ -334,10 +334,10 @@ def log_warnings(cls):
         if (len(set(device_names)) > 1
                 and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
             logger.warning(
-                "Detected different devices in the system: \n%s\nPlease"
+                "Detected different devices in the system: %s. Please"
                 " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
                 "avoid unexpected behavior.",
-                "\n".join(device_names),
+                ", ".join(device_names),
             )
 
 
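The cuda.py hunk only reshapes the warning: joining device names with ", " instead of "\n" keeps the whole message on one log line. A quick reproduction with the standard logging module; the logger name and device list are made up:

    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("cuda-demo")
    device_names = ["NVIDIA A100-SXM4-80GB", "NVIDIA T4"]
    logger.warning(
        "Detected different devices in the system: %s. Please"
        " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
        "avoid unexpected behavior.",
        ", ".join(device_names),
    )
    # Emits one line: "...in the system: NVIDIA A100-SXM4-80GB, NVIDIA T4.
    # Please make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to avoid
    # unexpected behavior."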
