Commit f6e5dec

[CI] upgrade to vllm 0.9.0 (#959)
Upgrade to vLLM 0.9.0; vLLM 0.8.5 is no longer supported.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent e2a0c19 commit f6e5dec
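
With 0.8.5 support dropped, the version gate used throughout vllm-ascend reduces to a single check. A minimal sketch of the pattern, condensed from the patch loaders changed below (vllm_version_is comes from vllm_ascend.utils; nothing here is new API):

    from vllm_ascend.utils import vllm_version_is

    # Import version-specific patches: only v0.9.0 and the main branch remain.
    if vllm_version_is("0.9.0"):
        from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
        from vllm_ascend.patch.platform import patch_common  # noqa: F401
    else:
        from vllm_ascend.patch.platform import patch_common  # noqa: F401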

File tree

16 files changed: +79, -146 lines

.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     concurrency:
       group: >
         ${{

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     name: vLLM Ascend long term test
     runs-on: linux-arm64-npu-1
     container:

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ jobs:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
     strategy:
       matrix:
-        vllm_verison: [main, v0.8.5.post1]
+        vllm_verison: [main, v0.9.0]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

Dockerfile.openEuler

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

tests/long_term/spec_decode/e2e/conftest.py

Lines changed: 3 additions & 3 deletions
@@ -26,9 +26,9 @@
 from vllm import SamplingParams
 from vllm.sequence import PromptLogprobs, SampleLogprobs
 
-from ....model_utils import (TokensTextLogprobs,
-                             TokensTextLogprobsPromptLogprobs,
-                             check_logprobs_close, check_outputs_equal)
+from tests.model_utils import (TokensTextLogprobs,
+                               TokensTextLogprobsPromptLogprobs,
+                               check_logprobs_close, check_outputs_equal)
 
 PROMPTS = [
     "Hello, my name is",

vllm_ascend/attention/attention_v1.py

Lines changed: 5 additions & 9 deletions
@@ -30,7 +30,6 @@
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
-from vllm_ascend.utils import vllm_version_is
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -142,14 +141,11 @@ def reorder_batch(self, input_batch: "InputBatch",
 
     def build(self, num_reqs, num_actual_tokens, max_query_len,
               common_prefix_len):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
 
         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
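
The deleted branch handled 0.8.5's flat block table; the surviving path assumes the 0.9.0 layout in which InputBatch.block_table is indexed per KV-cache group. A hypothetical helper illustrating that assumption (get_block_table is not part of the commit; the attribute accesses follow the diff):

    def get_block_table(runner, num_reqs: int):
        # Assumption: vLLM >= 0.9.0 keeps one block table per KV-cache group,
        # and group 0 is the only group used by these builders.
        block_table = runner.input_batch.block_table[0].get_device_tensor()
        # Same in-place slice update as in the metadata builders changed here.
        block_table[:num_reqs, :runner.max_num_blocks_per_req] = (
            block_table[:num_reqs])
        return block_table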

vllm_ascend/attention/mla_v1.py

Lines changed: 5 additions & 9 deletions
@@ -16,7 +16,6 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 if TYPE_CHECKING:
@@ -239,14 +238,11 @@ def build(self,
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
         device = self.runner.device
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
 
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(

vllm_ascend/ops/fused_moe.py

Lines changed: 38 additions & 70 deletions
@@ -26,18 +26,10 @@
     tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
-
-from vllm_ascend.utils import vllm_version_is
-
-if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-    from vllm.model_executor.layers.fused_moe.layer import (
-        FusedMoEParallelConfig, MoEConfig)
-else:
-    MoEConfig = None
-
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod,
+    determine_expert_map)
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizationConfig
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
@@ -587,10 +579,8 @@ def select_experts(
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
     def __init__(self, moe: MoEConfig = None):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            super().__init__()
-        else:
-            super().__init__(moe=moe)
+
+        super().__init__(moe=moe)
         vllm_config = get_current_vllm_config()
 
         ep_group = get_ep_group()
@@ -731,24 +721,17 @@ def __init__(
            params_dtype = torch.get_default_dtype()
 
        vllm_config = get_current_vllm_config()
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.ep_size = get_ep_group().world_size
-            self.tp_size = get_etp_group().world_size
-            self.dp_size = (dp_size if dp_size is not None else
-                            get_dp_group().world_size)
-            self.dp_rank = (0 if self.dp_size == 1 else
-                            get_dp_group().rank_in_group)
-        else:
-            self.moe_parallel_config: FusedMoEParallelConfig = (
-                FusedMoEParallelConfig.make(
-                    tp_size_=(tp_size if tp_size is not None else
-                              get_tensor_model_parallel_world_size()),
-                    dp_size_=(dp_size if dp_size is not None else
-                              get_dp_group().world_size),
-                    vllm_parallel_config=vllm_config.parallel_config))
 
-            self.moe_parallel_config.ep_size = get_ep_group().world_size
-            self.moe_parallel_config.tp_size = get_etp_group().world_size
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.moe_parallel_config.ep_size = get_ep_group().world_size
+        self.moe_parallel_config.tp_size = get_etp_group().world_size
 
        self.top_k = top_k
        self.num_experts = num_experts
@@ -773,54 +756,39 @@ def __init__(
            self.local_num_experts, self.expert_map = determine_expert_map(
                self.ep_size,
                get_ep_group().rank_in_group, self.global_num_experts)
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = get_etp_group().rank_in_group
-                self.ep_rank = get_ep_group().rank_in_group
-            else:
-                self.moe_parallel_config.tp_rank = get_etp_group(
-                ).rank_in_group
-                self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
+
+            self.moe_parallel_config.tp_rank = get_etp_group().rank_in_group
+            self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
 
        else:
            # Adjust TP size for DP attention
            # haven't test its functionality yet, may remove in the future
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = self.tp_size * self.dp_rank
-                self.ep_rank = 0
-                self.tp_size = self.tp_size * self.dp_size
-                self.ep_size = 1
-            else:
-                self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
-                self.moe_parallel_config.ep_rank = 0
-                self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
-                self.moe_parallel_config.ep_size = 1
+
+            self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
+            self.moe_parallel_config.ep_rank = 0
+            self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
+            self.moe_parallel_config.ep_size = 1
 
            self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                        None)
        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
                             "non-grouped topk.")
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            if quant_config is None:
-                self.quant_method: Optional[QuantizeMethodBase] = (
-                    AscendUnquantizedFusedMoEMethod())
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
-        else:
-            moe = MoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-            )
 
-            if quant_config is None:
-                self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
+        moe = MoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            # TODO (bnell): this needs to be fixed for quantized types.
+            in_dtype=params_dtype,
+        )
+
+        if quant_config is None:
+            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)
 
        assert self.quant_method is not None
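
For readers skimming the hunk above, the 0.9.0-only construction that survives can be condensed as follows. This is an annotated condensation of the added lines, not a drop-in excerpt; it omits the rank bookkeeping and the data-parallel else branch:

    # Parallel topology comes from vLLM 0.9.0's FusedMoEParallelConfig ...
    self.moe_parallel_config = FusedMoEParallelConfig.make(
        tp_size_=(tp_size if tp_size is not None else
                  get_tensor_model_parallel_world_size()),
        dp_size_=(dp_size if dp_size is not None else
                  get_dp_group().world_size),
        vllm_parallel_config=vllm_config.parallel_config)
    # ... then overridden with the Ascend expert / expert-tensor parallel groups.
    self.moe_parallel_config.ep_size = get_ep_group().world_size
    self.moe_parallel_config.tp_size = get_etp_group().world_size

    # MoEConfig is now always constructed; the unquantized Ascend method
    # receives it directly, while quantized configs still supply their own method.
    moe = MoEConfig(num_experts=self.global_num_experts,
                    experts_per_token=top_k,
                    hidden_dim=hidden_size,
                    num_local_experts=self.local_num_experts,
                    moe_parallel_config=self.moe_parallel_config,
                    in_dtype=params_dtype)
    if quant_config is None:
        self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
    else:
        self.quant_method = quant_config.get_quant_method(self, prefix)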

vllm_ascend/patch/__init__.py

Lines changed: 2 additions & 9 deletions
@@ -24,16 +24,9 @@
 # each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-# - patch_0_8_5: contains the patches applied when vllm version is 0.8.5.
+# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
 # - patch_main: contains the patches applied when vllm version is main branch.
-# - patch_common: contains the patches applied in both 0.8.5 and main branch.
-#
-# In the future, with the vllm version upgrade, the new patch folder such as
-# patch_0_8_5, patch_0_8_6, etc. will be added to manage the patch for different
-# vllm version. And the patch_common will contain the patches applied in all the
-# vllm version.
-# Once the vllm version is too old that vllm-ascend will not support, the related
-# patch folder will be removed as well.
+# - patch_common: contains the patches applied in both 0.9.0 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------

vllm_ascend/patch/platform/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.platform import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.platform import patch_common  # noqa: F401

vllm_ascend/patch/worker/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.worker import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.worker import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.worker import patch_common  # noqa: F401

vllm_ascend/worker/model_runner.py

Lines changed: 5 additions & 12 deletions
@@ -64,8 +64,6 @@
     _init_attn_metadata_from_tensor_dict,
     _init_sampling_metadata_from_tensor_dict)
 
-from vllm_ascend.utils import vllm_version_is
-
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
 
@@ -1017,10 +1015,8 @@ def save_sharded_state(
         pattern: Optional[str] = None,
         max_size: Optional[int] = None,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import ShardedStateLoader  # type: ignore[import] # isort: skip # noqa
-        else:
-            from vllm.model_executor.model_loader import ShardedStateLoader
+
+        from vllm.model_executor.model_loader import ShardedStateLoader
         ShardedStateLoader.save_model(
             self.model,
             path,
@@ -1032,12 +1028,9 @@ def save_tensorized_model(
         self,
         tensorizer_config: TensorizerConfig,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import \
-                TensorizerLoader  # type: ignore # noqa
-        else:
-            from vllm.model_executor.model_loader import \
-                TensorizerLoader  # type: ignore # noqa
+
+        from vllm.model_executor.model_loader import \
+            TensorizerLoader  # type: ignore # noqa
         TensorizerLoader.save_model(
             self.model,
             tensorizer_config=tensorizer_config,
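
Both save helpers now import their loader from the vLLM 0.9.0 location unconditionally. A minimal sketch of the assumed import path (both names appear in the hunks above; assuming vLLM >= 0.9.0 is installed):

    # Assumption: vLLM >= 0.9.0 exposes the loader classes from
    # vllm.model_executor.model_loader instead of ...model_loader.loader.
    from vllm.model_executor.model_loader import (ShardedStateLoader,
                                                  TensorizerLoader)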
