Commit f6e5dec

[CI] upgrade to vllm 0.9.0 (#959)
Upgrade to vLLM 0.9.0; vLLM 0.8.5 is no longer supported.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent e2a0c19 commit f6e5dec
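
With 0.8.5 support dropped, the version gate used throughout vllm-ascend reduces to a single check. A minimal sketch of the pattern, condensed from the patch loaders changed below (vllm_version_is comes from vllm_ascend.utils; nothing here is new API):

    from vllm_ascend.utils import vllm_version_is

    # Import version-specific patches: only v0.9.0 and the main branch remain.
    if vllm_version_is("0.9.0"):
        from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
        from vllm_ascend.patch.platform import patch_common  # noqa: F401
    else:
        from vllm_ascend.patch.platform import patch_common  # noqa: F401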

File tree

16 files changed: +79, -146 lines

.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     concurrency:
       group: >
         ${{

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     name: vLLM Ascend long term test
     runs-on: linux-arm64-npu-1
     container:

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ jobs:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
     strategy:
       matrix:
-        vllm_verison: [main, v0.8.5.post1]
+        vllm_verison: [main, v0.9.0]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

Dockerfile.openEuler

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

tests/long_term/spec_decode/e2e/conftest.py

Lines changed: 3 additions & 3 deletions
@@ -26,9 +26,9 @@
 from vllm import SamplingParams
 from vllm.sequence import PromptLogprobs, SampleLogprobs
 
-from ....model_utils import (TokensTextLogprobs,
-                             TokensTextLogprobsPromptLogprobs,
-                             check_logprobs_close, check_outputs_equal)
+from tests.model_utils import (TokensTextLogprobs,
+                               TokensTextLogprobsPromptLogprobs,
+                               check_logprobs_close, check_outputs_equal)
 
 PROMPTS = [
     "Hello, my name is",

vllm_ascend/attention/attention_v1.py

Lines changed: 5 additions & 9 deletions
@@ -30,7 +30,6 @@
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
-from vllm_ascend.utils import vllm_version_is
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -142,14 +141,11 @@ def reorder_batch(self, input_batch: "InputBatch",
 
     def build(self, num_reqs, num_actual_tokens, max_query_len,
               common_prefix_len):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
 
         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
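
The deleted branch handled 0.8.5's flat block table; the surviving path assumes the 0.9.0 layout in which InputBatch.block_table is indexed per KV-cache group. A hypothetical helper illustrating that assumption (get_block_table is not part of the commit; the attribute accesses follow the diff):

    def get_block_table(runner, num_reqs: int):
        # Assumption: vLLM >= 0.9.0 keeps one block table per KV-cache group,
        # and group 0 is the only group used by these builders.
        block_table = runner.input_batch.block_table[0].get_device_tensor()
        # Same in-place slice update as in the metadata builders changed here.
        block_table[:num_reqs, :runner.max_num_blocks_per_req] = (
            block_table[:num_reqs])
        return block_table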

vllm_ascend/attention/mla_v1.py

Lines changed: 5 additions & 9 deletions
@@ -16,7 +16,6 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 if TYPE_CHECKING:
@@ -239,14 +238,11 @@ def build(self,
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
         device = self.runner.device
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
 
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(

vllm_ascend/ops/fused_moe.py

Lines changed: 38 additions & 70 deletions
@@ -26,18 +26,10 @@
     tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
-
-from vllm_ascend.utils import vllm_version_is
-
-if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-    from vllm.model_executor.layers.fused_moe.layer import (
-        FusedMoEParallelConfig, MoEConfig)
-else:
-    MoEConfig = None
-
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod,
+    determine_expert_map)
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizationConfig
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
@@ -587,10 +579,8 @@ def select_experts(
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
     def __init__(self, moe: MoEConfig = None):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            super().__init__()
-        else:
-            super().__init__(moe=moe)
+
+        super().__init__(moe=moe)
         vllm_config = get_current_vllm_config()
 
         ep_group = get_ep_group()
@@ -731,24 +721,17 @@ def __init__(
            params_dtype = torch.get_default_dtype()
 
        vllm_config = get_current_vllm_config()
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.ep_size = get_ep_group().world_size
-            self.tp_size = get_etp_group().world_size
-            self.dp_size = (dp_size if dp_size is not None else
-                            get_dp_group().world_size)
-            self.dp_rank = (0 if self.dp_size == 1 else
-                            get_dp_group().rank_in_group)
-        else:
-            self.moe_parallel_config: FusedMoEParallelConfig = (
-                FusedMoEParallelConfig.make(
-                    tp_size_=(tp_size if tp_size is not None else
-                              get_tensor_model_parallel_world_size()),
-                    dp_size_=(dp_size if dp_size is not None else
-                              get_dp_group().world_size),
-                    vllm_parallel_config=vllm_config.parallel_config))
 
-            self.moe_parallel_config.ep_size = get_ep_group().world_size
-            self.moe_parallel_config.tp_size = get_etp_group().world_size
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.moe_parallel_config.ep_size = get_ep_group().world_size
+        self.moe_parallel_config.tp_size = get_etp_group().world_size
 
        self.top_k = top_k
        self.num_experts = num_experts
@@ -773,54 +756,39 @@ def __init__(
            self.local_num_experts, self.expert_map = determine_expert_map(
                self.ep_size,
                get_ep_group().rank_in_group, self.global_num_experts)
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = get_etp_group().rank_in_group
-                self.ep_rank = get_ep_group().rank_in_group
-            else:
-                self.moe_parallel_config.tp_rank = get_etp_group(
-                ).rank_in_group
-                self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
+
+            self.moe_parallel_config.tp_rank = get_etp_group().rank_in_group
+            self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
 
        else:
            # Adjust TP size for DP attention
            # haven't test its functionality yet, may remove in the future
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = self.tp_size * self.dp_rank
-                self.ep_rank = 0
-                self.tp_size = self.tp_size * self.dp_size
-                self.ep_size = 1
-            else:
-                self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
-                self.moe_parallel_config.ep_rank = 0
-                self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
-                self.moe_parallel_config.ep_size = 1
+
+            self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
+            self.moe_parallel_config.ep_rank = 0
+            self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
+            self.moe_parallel_config.ep_size = 1
 
            self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                        None)
        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
                             "non-grouped topk.")
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            if quant_config is None:
-                self.quant_method: Optional[QuantizeMethodBase] = (
-                    AscendUnquantizedFusedMoEMethod())
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
-        else:
-            moe = MoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-            )
 
-            if quant_config is None:
-                self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
+        moe = MoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            # TODO (bnell): this needs to be fixed for quantized types.
+            in_dtype=params_dtype,
+        )
+
+        if quant_config is None:
+            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)
 
        assert self.quant_method is not None
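
For readers skimming the hunk above, the 0.9.0-only construction that survives can be condensed as follows. This is an annotated condensation of the added lines, not a drop-in excerpt; it omits the rank bookkeeping and the data-parallel else branch:

    # Parallel topology comes from vLLM 0.9.0's FusedMoEParallelConfig ...
    self.moe_parallel_config = FusedMoEParallelConfig.make(
        tp_size_=(tp_size if tp_size is not None else
                  get_tensor_model_parallel_world_size()),
        dp_size_=(dp_size if dp_size is not None else
                  get_dp_group().world_size),
        vllm_parallel_config=vllm_config.parallel_config)
    # ... then overridden with the Ascend expert / expert-tensor parallel groups.
    self.moe_parallel_config.ep_size = get_ep_group().world_size
    self.moe_parallel_config.tp_size = get_etp_group().world_size

    # MoEConfig is now always constructed; the unquantized Ascend method
    # receives it directly, while quantized configs still supply their own method.
    moe = MoEConfig(num_experts=self.global_num_experts,
                    experts_per_token=top_k,
                    hidden_dim=hidden_size,
                    num_local_experts=self.local_num_experts,
                    moe_parallel_config=self.moe_parallel_config,
                    in_dtype=params_dtype)
    if quant_config is None:
        self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
    else:
        self.quant_method = quant_config.get_quant_method(self, prefix)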

vllm_ascend/patch/__init__.py

Lines changed: 2 additions & 9 deletions
@@ -24,16 +24,9 @@
 # each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-# - patch_0_8_5: contains the patches applied when vllm version is 0.8.5.
+# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
 # - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.8.5 and main branch.
-#
-# In the future, with the vllm version upgrade, the new patch folder such as
-# patch_0_8_5, patch_0_8_6, etc. will be added to manage the patch for different
-# vllm version. And the patch_common will contain the patches applied in all the
-# vllm version.
-# Once the vllm version is too old that vllm-ascend will not support, the related
-# patch folder will be removed as well.
+# - patch_common: contains the patches applied in both 0.9.0 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------

vllm_ascend/patch/platform/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.platform import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.platform import patch_common  # noqa: F401

vllm_ascend/patch/worker/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.worker import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.worker import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.worker import patch_common  # noqa: F401

vllm_ascend/worker/model_runner.py

Lines changed: 5 additions & 12 deletions
@@ -64,8 +64,6 @@
     _init_attn_metadata_from_tensor_dict,
     _init_sampling_metadata_from_tensor_dict)
 
-from vllm_ascend.utils import vllm_version_is
-
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
 
@@ -1017,10 +1015,8 @@ def save_sharded_state(
         pattern: Optional[str] = None,
         max_size: Optional[int] = None,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import ShardedStateLoader  # type: ignore[import] # isort: skip # noqa
-        else:
-            from vllm.model_executor.model_loader import ShardedStateLoader
+
+        from vllm.model_executor.model_loader import ShardedStateLoader
         ShardedStateLoader.save_model(
             self.model,
             path,
@@ -1032,12 +1028,9 @@ def save_tensorized_model(
         self,
         tensorizer_config: TensorizerConfig,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import \
-                TensorizerLoader  # type: ignore # noqa
-        else:
-            from vllm.model_executor.model_loader import \
-                TensorizerLoader  # type: ignore # noqa
+
+        from vllm.model_executor.model_loader import \
+            TensorizerLoader  # type: ignore # noqa
         TensorizerLoader.save_model(
             self.model,
             tensorizer_config=tensorizer_config,
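
Both save helpers now import their loader from the vLLM 0.9.0 location unconditionally. A minimal sketch of the assumed import path (both names appear in the hunks above; assuming vLLM >= 0.9.0 is installed):

    # Assumption: vLLM >= 0.9.0 exposes the loader classes from
    # vllm.model_executor.model_loader instead of ...model_loader.loader.
    from vllm.model_executor.model_loader import (ShardedStateLoader,
                                                  TensorizerLoader)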
