Skip to content

Commit 3783696

Browse files
authored
Merge pull request #76 from neuralmagic/disagg_pd_dev_merge_main
[2025-5-5] Upstream sync
2 parents 8ce4c07 + bf0be1b commit 3783696

File tree

150 files changed

+4712
-4444
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

150 files changed

+4712
-4444
lines changed

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ steps:
3939
- pip install -r ../../requirements/docs.txt
4040
- SPHINXOPTS="-W" make html
4141
# Check API reference (if it fails, you may have missing mock imports)
42-
- grep "sig sig-object py" build/html/api/inference_params.html
42+
- grep "sig sig-object py" build/html/api/vllm/vllm.sampling_params.html
4343

4444
- label: Async Engine, Inputs, Utils, Worker Test # 24min
4545
source_file_dependencies:

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ instance/
8080
# Sphinx documentation
8181
docs/_build/
8282
docs/source/getting_started/examples/
83+
docs/source/api/vllm
8384

8485
# PyBuilder
8586
.pybuilder/

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,8 @@ repos:
101101
args:
102102
- -c
103103
- |
104-
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
105-
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
104+
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
105+
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
106106
fi
107107
language: system
108108
verbose: true

CMakeLists.txt

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
250250
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
251251

252252
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
253-
set(CUTLASS_REVISION "v3.9.1" CACHE STRING "CUTLASS revision to use")
253+
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
254254

255255
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
256256
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -301,8 +301,52 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
301301
# are not supported by Machete yet.
302302
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
303303
if (MARLIN_ARCHS)
304+
305+
#
306+
# For the Marlin kernels we automatically generate sources for various
307+
# preselected input type pairs and schedules.
308+
# Generate sources:
309+
set(MARLIN_GEN_SCRIPT
310+
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
311+
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
312+
313+
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
314+
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
315+
316+
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
317+
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
318+
execute_process(
319+
COMMAND ${CMAKE_COMMAND} -E env
320+
PYTHONPATH=$PYTHONPATH
321+
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
322+
RESULT_VARIABLE marlin_generation_result
323+
OUTPUT_VARIABLE marlin_generation_result
324+
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
325+
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
326+
)
327+
328+
if (NOT marlin_generation_result EQUAL 0)
329+
message(FATAL_ERROR "Marlin generation failed."
330+
" Result: \"${marlin_generation_result}\""
331+
"\nCheck the log for details: "
332+
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
333+
else()
334+
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
335+
CACHE STRING "Last run Marlin generate script hash" FORCE)
336+
message(STATUS "Marlin generation completed successfully.")
337+
endif()
338+
else()
339+
message(STATUS "Marlin generation script has not changed, skipping generation.")
340+
endif()
341+
342+
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
343+
set_gencode_flags_for_srcs(
344+
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
345+
CUDA_ARCHS "${MARLIN_ARCHS}")
346+
347+
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
348+
304349
set(MARLIN_SRCS
305-
"csrc/quantization/fp8/fp8_marlin.cu"
306350
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
307351
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
308352
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -644,7 +688,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
644688
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
645689
execute_process(
646690
COMMAND ${CMAKE_COMMAND} -E env
647-
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
691+
PYTHONPATH=$PYTHONPATH
648692
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
649693
RESULT_VARIABLE moe_marlin_generation_result
650694
OUTPUT_VARIABLE moe_marlin_generation_output

benchmarks/benchmark_serving_structured_output.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,6 @@ async def benchmark(
414414
ignore_eos: bool,
415415
max_concurrency: Optional[int],
416416
structured_output_ratio: float,
417-
structured_output_backend: str,
418417
goodput_config_dict: Optional[dict[str, float]] = None,
419418
):
420419
if backend in ASYNC_REQUEST_FUNCS:
@@ -426,8 +425,6 @@ def prepare_extra_body(request) -> dict:
426425
extra_body = {}
427426
# Add the schema to the extra_body
428427
extra_body[request.structure_type] = request.schema
429-
# Add the specific structured_output_backend
430-
extra_body["guided_decoding_backend"] = structured_output_backend
431428
return extra_body
432429

433430
print("Starting initial single prompt test run...")
@@ -785,7 +782,6 @@ def main(args: argparse.Namespace):
785782
ignore_eos=args.ignore_eos,
786783
max_concurrency=args.max_concurrency,
787784
structured_output_ratio=args.structured_output_ratio,
788-
structured_output_backend=args.structured_output_backend,
789785
goodput_config_dict=goodput_config_dict,
790786
))
791787

@@ -1000,14 +996,6 @@ def main(args: argparse.Namespace):
1000996
type=float,
1001997
default=1.0,
1002998
help="Ratio of Structured Outputs requests")
1003-
parser.add_argument("--structured-output-backend",
1004-
type=str,
1005-
choices=[
1006-
"outlines", "lm-format-enforcer", "xgrammar",
1007-
"guidance", "auto"
1008-
],
1009-
default="auto",
1010-
help="Backend to use for structured outputs")
1011999

10121000
args = parser.parse_args()
10131001
main(args)

benchmarks/run_structured_output_benchmark.sh

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,10 @@ BACKEND=${2:-"vllm"}
99
# Define the dataset to use
1010
DATASET=${3:-"xgrammar_bench"}
1111

12-
# Define the guided decoding backend
13-
GUIDED_BACKEND=${4:-"xgrammar"}
14-
1512
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
16-
OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"}
13+
OUTPUT_DIR=${4:-"$SCRIPT_DIR/structured_output_benchmark_results"}
1714

18-
GUIDED_RATIO=${6:-0.5}
15+
GUIDED_RATIO=${5:-0.5}
1916

2017
# Create output directory if it doesn't exist
2118
mkdir -p "$OUTPUT_DIR"
@@ -27,15 +24,13 @@ QPS_VALUES=(70 60 50 25 20 15 10)
2724
COMMON_PARAMS="--backend $BACKEND \
2825
--model $MODEL \
2926
--dataset $DATASET \
30-
--structured-output-backend $GUIDED_BACKEND \
3127
--structured-output-ratio $GUIDED_RATIO \
3228
--save-results \
3329
--result-dir $OUTPUT_DIR"
3430

3531
echo "Starting structured output benchmark with model: $MODEL"
3632
echo "Backend: $BACKEND"
3733
echo "Dataset: $DATASET"
38-
echo "Structured output backend: $GUIDED_BACKEND"
3934
echo "Results will be saved to: $OUTPUT_DIR"
4035
echo "----------------------------------------"
4136

@@ -48,7 +43,7 @@ for qps in "${QPS_VALUES[@]}"; do
4843
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
4944

5045
# Construct filename for this run
51-
FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
46+
FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
5247

5348
# Run the benchmark
5449
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \

csrc/moe/marlin_moe_wna16/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
kernel_*.cu

csrc/moe/marlin_moe_wna16/generate_kernels.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,13 @@
2525
"{{thread_k_blocks}}, "
2626
"{{'true' if m_block_size_8 else 'false'}}, "
2727
"{{stages}}, "
28-
"{{'true' if has_act_order else 'false'}}, "
29-
"{{'true' if has_zp else 'false'}}, "
3028
"{{group_blocks}}, "
3129
"{{'true' if is_zp_float else 'false'}}>"
3230
"( MARLIN_KERNEL_PARAMS );")
3331

3432
# int8 with zero point case (vllm::kU8) is also supported,
3533
# we don't add it to reduce wheel size.
36-
SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128"]
34+
SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn"]
3735
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
3836

3937
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
@@ -52,21 +50,29 @@ def remove_old_kernels():
5250

5351
def generate_new_kernels():
5452
for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
55-
has_zp = "B" not in scalar_type
5653
all_template_str_list = []
5754

5855
for group_blocks, m_blocks, thread_configs in itertools.product(
5956
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
6057

61-
has_act_order = group_blocks == 0
62-
if has_zp and has_act_order:
58+
# act order case only support gptq-int4 and gptq-int8
59+
if group_blocks == 0 and scalar_type not in [
60+
"vllm::kU4B8", "vllm::kU8B128"
61+
]:
6362
continue
6463
if thread_configs[2] == 256:
64+
# for small batch (m_blocks == 1), we only need (128, 128, 256)
65+
# for large batch (m_blocks > 1), we only need (64, 256, 256)
6566
if m_blocks <= 1 and thread_configs[0] != 128:
6667
continue
6768
if m_blocks > 1 and thread_configs[0] != 64:
6869
continue
6970

71+
# we only support channelwise quantization and group_size == 128
72+
# for fp8
73+
if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
74+
continue
75+
7076
k_blocks = thread_configs[0] // 16
7177
n_blocks = thread_configs[1] // 16
7278
threads = thread_configs[2]
@@ -82,8 +88,6 @@ def generate_new_kernels():
8288
thread_k_blocks=k_blocks,
8389
m_block_size_8=m_blocks == 0.5,
8490
stages="pipe_stages",
85-
has_act_order=has_act_order,
86-
has_zp=has_zp,
8791
group_blocks=group_blocks,
8892
is_zp_float=False,
8993
)

csrc/moe/marlin_moe_wna16/kernel.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
const float *__restrict__ topk_weights_ptr, int top_k, \
1919
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
2020
int prob_n, int prob_k, int *locks, bool use_atomic_add, \
21-
bool use_fp32_reduce
21+
bool use_fp32_reduce, int max_shared_mem
2222

2323
namespace MARLIN_NAMESPACE_NAME {
2424
template <typename scalar_t, // compute dtype, half or nv_float16
@@ -33,11 +33,9 @@ template <typename scalar_t, // compute dtype, half or nv_float16
3333
// only works when thread_m_blocks == 1
3434
const int stages, // number of stages for the async global->shared
3535
// fetch pipeline
36-
const bool has_act_order, // whether act_order is enabled
37-
const bool has_zp, // whether zero-points are enabled
38-
const int group_blocks, // number of consecutive 16x16 blocks
39-
// with a separate quantization scale
40-
const bool is_zp_float // is zero point of float16 type?
36+
const int group_blocks, // number of consecutive 16x16 blocks
37+
// with a separate quantization scale
38+
const bool is_zp_float // is zero point of float16 type?
4139
>
4240
__global__ void Marlin(MARLIN_KERNEL_PARAMS);
4341

0 commit comments

Comments (0)