Skip to content

Commit 3783696

Browse files
authored
Merge pull request #76 from neuralmagic/disagg_pd_dev_merge_main
[2025-5-5] Upstream sync
2 parents 8ce4c07 + bf0be1b commit 3783696

File tree

150 files changed

+4712
-4444
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

150 files changed

+4712
-4444
lines changed

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ steps:
3939
- pip install -r ../../requirements/docs.txt
4040
- SPHINXOPTS="-W" make html
4141
# Check API reference (if it fails, you may have missing mock imports)
42-
- grep "sig sig-object py" build/html/api/inference_params.html
42+
- grep "sig sig-object py" build/html/api/vllm/vllm.sampling_params.html
4343

4444
- label: Async Engine, Inputs, Utils, Worker Test # 24min
4545
source_file_dependencies:

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ instance/
8080
# Sphinx documentation
8181
docs/_build/
8282
docs/source/getting_started/examples/
83+
docs/source/api/vllm
8384

8485
# PyBuilder
8586
.pybuilder/

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,8 @@ repos:
101101
args:
102102
- -c
103103
- |
104-
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
105-
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
104+
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
105+
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
106106
fi
107107
language: system
108108
verbose: true

CMakeLists.txt

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
250250
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
251251

252252
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
253-
set(CUTLASS_REVISION "v3.9.1" CACHE STRING "CUTLASS revision to use")
253+
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
254254

255255
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
256256
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -301,8 +301,52 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
301301
# are not supported by Machete yet.
302302
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
303303
if (MARLIN_ARCHS)
304+
305+
#
306+
# For the Marlin kernels we automatically generate sources for various
307+
# preselected input type pairs and schedules.
308+
# Generate sources:
309+
set(MARLIN_GEN_SCRIPT
310+
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
311+
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
312+
313+
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
314+
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
315+
316+
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
317+
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
318+
execute_process(
319+
COMMAND ${CMAKE_COMMAND} -E env
320+
PYTHONPATH=$PYTHONPATH
321+
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
322+
RESULT_VARIABLE marlin_generation_result
323+
OUTPUT_VARIABLE marlin_generation_result
324+
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
325+
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
326+
)
327+
328+
if (NOT marlin_generation_result EQUAL 0)
329+
message(FATAL_ERROR "Marlin generation failed."
330+
" Result: \"${marlin_generation_result}\""
331+
"\nCheck the log for details: "
332+
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
333+
else()
334+
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
335+
CACHE STRING "Last run Marlin generate script hash" FORCE)
336+
message(STATUS "Marlin generation completed successfully.")
337+
endif()
338+
else()
339+
message(STATUS "Marlin generation script has not changed, skipping generation.")
340+
endif()
341+
342+
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
343+
set_gencode_flags_for_srcs(
344+
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
345+
CUDA_ARCHS "${MARLIN_ARCHS}")
346+
347+
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
348+
304349
set(MARLIN_SRCS
305-
"csrc/quantization/fp8/fp8_marlin.cu"
306350
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
307351
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
308352
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -644,7 +688,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
644688
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
645689
execute_process(
646690
COMMAND ${CMAKE_COMMAND} -E env
647-
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
691+
PYTHONPATH=$PYTHONPATH
648692
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
649693
RESULT_VARIABLE moe_marlin_generation_result
650694
OUTPUT_VARIABLE moe_marlin_generation_output

benchmarks/benchmark_serving_structured_output.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,6 @@ async def benchmark(
414414
ignore_eos: bool,
415415
max_concurrency: Optional[int],
416416
structured_output_ratio: float,
417-
structured_output_backend: str,
418417
goodput_config_dict: Optional[dict[str, float]] = None,
419418
):
420419
if backend in ASYNC_REQUEST_FUNCS:
@@ -426,8 +425,6 @@ def prepare_extra_body(request) -> dict:
426425
extra_body = {}
427426
# Add the schema to the extra_body
428427
extra_body[request.structure_type] = request.schema
429-
# Add the specific structured_output_backend
430-
extra_body["guided_decoding_backend"] = structured_output_backend
431428
return extra_body
432429

433430
print("Starting initial single prompt test run...")
@@ -785,7 +782,6 @@ def main(args: argparse.Namespace):
785782
ignore_eos=args.ignore_eos,
786783
max_concurrency=args.max_concurrency,
787784
structured_output_ratio=args.structured_output_ratio,
788-
structured_output_backend=args.structured_output_backend,
789785
goodput_config_dict=goodput_config_dict,
790786
))
791787

@@ -1000,14 +996,6 @@ def main(args: argparse.Namespace):
1000996
type=float,
1001997
default=1.0,
1002998
help="Ratio of Structured Outputs requests")
1003-
parser.add_argument("--structured-output-backend",
1004-
type=str,
1005-
choices=[
1006-
"outlines", "lm-format-enforcer", "xgrammar",
1007-
"guidance", "auto"
1008-
],
1009-
default="auto",
1010-
help="Backend to use for structured outputs")
1011999

10121000
args = parser.parse_args()
10131001
main(args)

benchmarks/run_structured_output_benchmark.sh

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,10 @@ BACKEND=${2:-"vllm"}
99
# Define the dataset to use
1010
DATASET=${3:-"xgrammar_bench"}
1111

12-
# Define the guided decoding backend
13-
GUIDED_BACKEND=${4:-"xgrammar"}
14-
1512
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
16-
OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"}
13+
OUTPUT_DIR=${4:-"$SCRIPT_DIR/structured_output_benchmark_results"}
1714

18-
GUIDED_RATIO=${6:-0.5}
15+
GUIDED_RATIO=${5:-0.5}
1916

2017
# Create output directory if it doesn't exist
2118
mkdir -p "$OUTPUT_DIR"
@@ -27,15 +24,13 @@ QPS_VALUES=(70 60 50 25 20 15 10)
2724
COMMON_PARAMS="--backend $BACKEND \
2825
--model $MODEL \
2926
--dataset $DATASET \
30-
--structured-output-backend $GUIDED_BACKEND \
3127
--structured-output-ratio $GUIDED_RATIO \
3228
--save-results \
3329
--result-dir $OUTPUT_DIR"
3430

3531
echo "Starting structured output benchmark with model: $MODEL"
3632
echo "Backend: $BACKEND"
3733
echo "Dataset: $DATASET"
38-
echo "Structured output backend: $GUIDED_BACKEND"
3934
echo "Results will be saved to: $OUTPUT_DIR"
4035
echo "----------------------------------------"
4136

@@ -48,7 +43,7 @@ for qps in "${QPS_VALUES[@]}"; do
4843
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
4944

5045
# Construct filename for this run
51-
FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
46+
FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
5247

5348
# Run the benchmark
5449
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \

csrc/moe/marlin_moe_wna16/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
kernel_*.cu

csrc/moe/marlin_moe_wna16/generate_kernels.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,13 @@
2525
"{{thread_k_blocks}}, "
2626
"{{'true' if m_block_size_8 else 'false'}}, "
2727
"{{stages}}, "
28-
"{{'true' if has_act_order else 'false'}}, "
29-
"{{'true' if has_zp else 'false'}}, "
3028
"{{group_blocks}}, "
3129
"{{'true' if is_zp_float else 'false'}}>"
3230
"( MARLIN_KERNEL_PARAMS );")
3331

3432
# int8 with zero point case (vllm::kU8) is also supported,
3533
# we don't add it to reduce wheel size.
36-
SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128"]
34+
SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn"]
3735
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
3836

3937
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
@@ -52,21 +50,29 @@ def remove_old_kernels():
5250

5351
def generate_new_kernels():
5452
for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
55-
has_zp = "B" not in scalar_type
5653
all_template_str_list = []
5754

5855
for group_blocks, m_blocks, thread_configs in itertools.product(
5956
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
6057

61-
has_act_order = group_blocks == 0
62-
if has_zp and has_act_order:
58+
# act order case only support gptq-int4 and gptq-int8
59+
if group_blocks == 0 and scalar_type not in [
60+
"vllm::kU4B8", "vllm::kU8B128"
61+
]:
6362
continue
6463
if thread_configs[2] == 256:
64+
# for small batch (m_blocks == 1), we only need (128, 128, 256)
65+
# for large batch (m_blocks > 1), we only need (64, 256, 256)
6566
if m_blocks <= 1 and thread_configs[0] != 128:
6667
continue
6768
if m_blocks > 1 and thread_configs[0] != 64:
6869
continue
6970

71+
# we only support channelwise quantization and group_size == 128
72+
# for fp8
73+
if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
74+
continue
75+
7076
k_blocks = thread_configs[0] // 16
7177
n_blocks = thread_configs[1] // 16
7278
threads = thread_configs[2]
@@ -82,8 +88,6 @@ def generate_new_kernels():
8288
thread_k_blocks=k_blocks,
8389
m_block_size_8=m_blocks == 0.5,
8490
stages="pipe_stages",
85-
has_act_order=has_act_order,
86-
has_zp=has_zp,
8791
group_blocks=group_blocks,
8892
is_zp_float=False,
8993
)

csrc/moe/marlin_moe_wna16/kernel.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
const float *__restrict__ topk_weights_ptr, int top_k, \
1919
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
2020
int prob_n, int prob_k, int *locks, bool use_atomic_add, \
21-
bool use_fp32_reduce
21+
bool use_fp32_reduce, int max_shared_mem
2222

2323
namespace MARLIN_NAMESPACE_NAME {
2424
template <typename scalar_t, // compute dtype, half or nv_float16
@@ -33,11 +33,9 @@ template <typename scalar_t, // compute dtype, half or nv_float16
3333
// only works when thread_m_blocks == 1
3434
const int stages, // number of stages for the async global->shared
3535
// fetch pipeline
36-
const bool has_act_order, // whether act_order is enabled
37-
const bool has_zp, // whether zero-points are enabled
38-
const int group_blocks, // number of consecutive 16x16 blocks
39-
// with a separate quantization scale
40-
const bool is_zp_float // is zero point of float16 type?
36+
const int group_blocks, // number of consecutive 16x16 blocks
37+
// with a separate quantization scale
38+
const bool is_zp_float // is zero point of float16 type?
4139
>
4240
__global__ void Marlin(MARLIN_KERNEL_PARAMS);
4341

0 commit comments

Comments (0)