Commit 30b0cd3

merge: branch 'main' of github.com:vllm-project/vllm into feat/support-thinking-struct-outputs

2 parents: 1dd058c + 92edf35

File tree: 201 files changed, +13130 -2777 lines


.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 26 additions & 2 deletions
@@ -5,10 +5,34 @@
 set -ex
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
+remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-docker build -t cpu-test -f docker/Dockerfile.ppc64le .
+podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
+
+# Run the image
+podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
+
+function cpu_tests() {
+
+  # offline inference
+  podman exec cpu-test-ubi9-ppc bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run basic model test
+  podman exec cpu-test-ubi9-ppc bash -c "
+    set -e
+    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
+    pip install sentence-transformers datamodel_code_generator
+    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
+    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
+}
+
+# All CPU tests are expected to finish in less than 40 mins.
+export -f cpu_tests
+timeout 40m bash -c cpu_tests
 

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+docker build -t cpu-test -f docker/Dockerfile.s390x .

.buildkite/test-pipeline.yaml

Lines changed: 14 additions & 1 deletion
@@ -341,6 +341,13 @@ steps:
   commands:
   - bash scripts/run-benchmarks.sh
 
+- label: Benchmarks CLI Test  # 10min
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
+
 - label: Quantization Test  # 33min
   source_file_dependencies:
   - csrc/
@@ -393,8 +400,9 @@ steps:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
   # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
 
 - label: Language Models Test (Standard)  # 32min
   #mirror_hardwares: [amd]
@@ -404,6 +412,8 @@
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
   commands:
+  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pip install causal-conv1d
   - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/language -m core_model
 
@@ -415,6 +425,8 @@
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
   commands:
+  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pip install causal-conv1d
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/language -m 'not core_model'
 
@@ -540,6 +552,7 @@
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 
 - label: Plugin Tests (2 GPUs)  # 40min
   working_dir: "/vllm-workspace/tests"

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -203,3 +203,6 @@ benchmarks/**/*.json
 # Linting
 actionlint
 shellcheck*/
+
+# Ignore moe/marlin_moe gen code
+csrc/moe/marlin_moe_wna16/kernel_*

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion
@@ -11,7 +11,6 @@ repos:
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
-    additional_dependencies: [toml]  # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
   rev: v0.9.3
   hooks:

CMakeLists.txt

Lines changed: 41 additions & 11 deletions
@@ -609,21 +609,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
-    set(MARLIN_MOE_SRC
-        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
-        "csrc/moe/marlin_moe_ops.cu")
 
+    #
+    # For the Marlin MOE kernels we automatically generate sources for various
+    # preselected input type pairs and schedules.
+    # Generate sources:
+    set(MOE_MARLIN_GEN_SCRIPT
+        ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
+    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+
+    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+
+    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
+        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+        RESULT_VARIABLE moe_marlin_generation_result
+        OUTPUT_VARIABLE moe_marlin_generation_output
+        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+      )
+
+      if (NOT moe_marlin_generation_result EQUAL 0)
+        message(FATAL_ERROR "Marlin MOE generation failed."
+                " Result: \"${moe_marlin_generation_result}\""
+                "\nCheck the log for details: "
+                "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
+      else()
+        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
+        message(STATUS "Marlin MOE generation completed successfully.")
+      endif()
+    else()
+      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
+    endif()
+
+    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
     set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_MOE_SRC}"
+      SRCS "${MOE_WNAA16_MARLIN_SRC}"
       CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
 
-    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
+    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+
     message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
   else()
     message(STATUS "Not building Marlin MOE kernels as no compatible archs found"

csrc/attention/merge_attn_states.cu

Lines changed: 15 additions & 10 deletions
@@ -107,13 +107,14 @@ __global__ void merge_attn_states_kernel(
 
 #define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS)                     \
   {                                                                         \
-    vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS><<<grid, block>>>( \
-        reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr,     \
-        reinterpret_cast<scalar_t*>(prefix_output.data_ptr()),              \
-        reinterpret_cast<float*>(prefix_lse.data_ptr()),                    \
-        reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),              \
-        reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,        \
-        num_heads, head_size);                                              \
+    vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS>                   \
+        <<<grid, block, 0, stream>>>(                                       \
+            reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr, \
+            reinterpret_cast<scalar_t*>(prefix_output.data_ptr()),          \
+            reinterpret_cast<float*>(prefix_lse.data_ptr()),                \
+            reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),          \
+            reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,    \
+            num_heads, head_size);                                          \
   }
 
 /*@brief Merges the attention states from prefix and suffix
@@ -122,10 +123,10 @@ __global__ void merge_attn_states_kernel(
 * @param output [n,h,d] The output tensor to store the merged attention states.
 * @param output_lse [h,d] Optional tensor to store the log-sum-exp values.
 * @param prefix_output [n,h,d] The prefix attention states.
-* @param prefix_lse [h,d] The log-sum-exp values for the prefix attention
+* @param prefix_lse [h,n] The log-sum-exp values for the prefix attention
 * states.
 * @param suffix_output [n,h,d] The suffix attention states.
-* @param suffix_lse [h,d] The log-sum-exp values for the suffix attention
+* @param suffix_lse [h,n] The log-sum-exp values for the suffix attention
 * states.
 */
 template <typename scalar_t>
@@ -146,13 +147,17 @@ void merge_attn_states_launcher(torch::Tensor& output,
   if (output_lse.has_value()) {
     output_lse_ptr = output_lse.value().data_ptr<float>();
   }
-  // process one pack elements per thread. float -> 4, half/bf16 -> 8
+  // Process one pack of elements per thread. For float the pack_size
+  // is 4; for half/bf16 the pack_size is 8.
   const uint threads_per_head = head_size / pack_size;
   const uint total_threads = num_tokens * num_heads * threads_per_head;
 
   dim3 block(NUM_THREADS);
   dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
 
+  const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device());
+  auto stream = at::cuda::getCurrentCUDAStream();
+
   LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
 }
 
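
For context on what the kernel fixed above computes: merging a prefix and a suffix attention result is the standard log-sum-exp reweighting used in split-KV attention, where each partial output is scaled by the share of softmax mass its chunk contributed, recovering exact attention over the full sequence. A NumPy sketch of that merge, using the [n,h,d] output and [h,n] LSE layouts from the doc comment in the diff (illustrative only, not the kernel source):

import numpy as np

def merge_attn_states(prefix_out, prefix_lse, suffix_out, suffix_lse):
    """prefix_out/suffix_out: [n, h, d]; prefix_lse/suffix_lse: [h, n]."""
    p_lse = prefix_lse.T[:, :, None]    # -> [n, h, 1]
    s_lse = suffix_lse.T[:, :, None]
    max_lse = np.maximum(p_lse, s_lse)  # subtract the max for numerical stability
    p_w = np.exp(p_lse - max_lse)       # unnormalized softmax mass of each chunk
    s_w = np.exp(s_lse - max_lse)
    out = (prefix_out * p_w + suffix_out * s_w) / (p_w + s_w)
    out_lse = (max_lse + np.log(p_w + s_w))[:, :, 0].T  # back to [h, n]
    return out, out_lse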

csrc/moe/marlin_moe_wna16/generate_kernels.py

Lines changed: 103 additions & 0 deletions

@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+import glob
+import itertools
+import os
+import subprocess
+
+import jinja2
+
+FILE_HEAD = """
+// auto generated by generate.py
+// clang-format off
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+""".strip()
+
+TEMPLATE = ("template __global__ void Marlin<"
+            "{{scalar_t}}, "
+            "{{w_type_id}}, "
+            "{{threads}}, "
+            "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
+            "{{'true' if has_act_order else 'false'}}, "
+            "{{'true' if has_zp else 'false'}}, "
+            "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")
+
+# int8 with zero point case (vllm::kU8) is also supported,
+# but we don't add it here, to reduce wheel size.
+SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128"]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
+
+THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
+# group_blocks:
+#   = 0 : act order case
+#   = -1 : channelwise quantization
+#   > 0 : group_size=16*group_blocks
+GROUP_BLOCKS = [0, -1, 2, 4, 8]
+DTYPES = ["fp16", "bf16"]
+
+
+def remove_old_kernels():
+    for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"):
+        subprocess.call(["rm", "-f", filename])
+
+
+def generate_new_kernels():
+    for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
+        has_zp = "B" not in scalar_type
+        all_template_str_list = []
+
+        for group_blocks, m_blocks, thread_configs in itertools.product(
+                GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
+
+            has_act_order = group_blocks == 0
+            if has_zp and has_act_order:
+                continue
+            if thread_configs[2] == 256:
+                if m_blocks <= 1 and thread_configs[0] != 128:
+                    continue
+                if m_blocks > 1 and thread_configs[0] != 64:
+                    continue
+
+            k_blocks = thread_configs[0] // 16
+            n_blocks = thread_configs[1] // 16
+            threads = thread_configs[2]
+
+            c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
+
+            template_str = jinja2.Template(TEMPLATE).render(
+                scalar_t=c_dtype,
+                w_type_id=scalar_type + ".id()",
+                threads=threads,
+                thread_m_blocks=max(m_blocks, 1),
+                thread_n_blocks=n_blocks,
+                thread_k_blocks=k_blocks,
+                m_block_size_8=m_blocks == 0.5,
+                stages="pipe_stages",
+                has_act_order=has_act_order,
+                has_zp=has_zp,
+                group_blocks=group_blocks,
+                is_zp_float=False,
+            )
+
+            all_template_str_list.append(template_str)
+
+        file_content = FILE_HEAD + "\n\n"
+        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+        filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu"
+
+        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+            f.write(file_content)
+
+
+if __name__ == "__main__":
+    remove_old_kernels()
+    generate_new_kernels()
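
To make the template concrete, here is one instantiation rendered by hand; the parameter values are a single valid combination (the (128, 128, 256) thread config with m_blocks=1 and channelwise scales), chosen for illustration rather than captured from the generator's output:

import jinja2

from generate_kernels import TEMPLATE  # the template string defined above

# half + kU4B8 (no zero points), channelwise scales (group_blocks=-1),
# 256 threads => 8x8 k/n blocks from the (128, 128, 256) config.
print(jinja2.Template(TEMPLATE).render(
    scalar_t="half", w_type_id="vllm::kU4B8.id()", threads=256,
    thread_m_blocks=1, thread_n_blocks=8, thread_k_blocks=8,
    m_block_size_8=False, stages="pipe_stages", has_act_order=False,
    has_zp=False, group_blocks=-1, is_zp_float=False))
# template __global__ void Marlin<half, vllm::kU4B8.id(), 256, 1, 8, 8,
# false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );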

csrc/moe/marlin_moe_wna16/kernel.h

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+
+#ifndef MARLIN_NAMESPACE_NAME
+  #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
+#endif
+
+#include "quantization/gptq_marlin/marlin.cuh"
+#include "quantization/gptq_marlin/marlin_dtypes.cuh"
+#include "core/scalar_type.hpp"
+
+#define MARLIN_KERNEL_PARAMS                                                \
+    const int4 *__restrict__ A, const int4 *__restrict__ B,                 \
+    int4 *__restrict__ C, int4 *__restrict__ C_tmp,                         \
+    const int4 *__restrict__ scales_ptr, const int4 *__restrict__ zp_ptr,   \
+    const int *__restrict__ g_idx,                                          \
+    const int32_t *__restrict__ sorted_token_ids_ptr,                       \
+    const int32_t *__restrict__ expert_ids_ptr,                             \
+    const int32_t *__restrict__ num_tokens_past_padded_ptr,                 \
+    const float *__restrict__ topk_weights_ptr, int top_k,                  \
+    bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,          \
+    int prob_n, int prob_k, int *locks, bool use_atomic_add,                \
+    bool use_fp32_reduce
+
+namespace MARLIN_NAMESPACE_NAME {
+template <typename scalar_t,          // compute dtype, half or nv_bfloat16
+          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const bool m_block_size_8,  // whether m_block_size == 8
+                                      // only works when thread_m_blocks == 1
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const bool has_act_order,   // whether act_order is enabled
+          const bool has_zp,          // whether zero-points are enabled
+          const int group_blocks,     // number of consecutive 16x16 blocks
+                                      // with a separate quantization scale
+          const bool is_zp_float      // is zero point of float16 type?
+          >
+__global__ void Marlin(MARLIN_KERNEL_PARAMS);
+
+}
