
Commit 817c623

Merge remote-tracking branch 'origin/nomic' into nomic
2 parents: ed18ba7 + 4605be9

147 files changed: +6208, -1091 lines


.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 3 additions & 0 deletions
@@ -50,6 +50,9 @@ docker run --privileged --net host --shm-size=16G -it \
     && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py \
     && echo TEST_12 \
     && pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" \
+# Disable the TPU LoRA tests until the feature is activated
+#   && echo TEST_13 \
+#   && pytest -s -v /workspace/vllm/tests/tpu/lora/" \


 # TODO: This test fails because it uses RANDOM_SEED sampling

.github/ISSUE_TEMPLATE/400-bug-report.yml

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ body:
       ```

       ```
-      The error message you got, with the full traceback.
+      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
       ```
   validations:
     required: true

benchmarks/kernels/benchmark_moe.py

Lines changed: 15 additions & 9 deletions
@@ -6,15 +6,16 @@
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
+from types import SimpleNamespace
 from typing import Any, TypedDict

 import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm
-from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
+from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser

@@ -534,8 +535,12 @@ def get_weight_block_size_safety(config, default_value=None):
 def main(args: argparse.Namespace):
     print(args)

-    config = AutoConfig.from_pretrained(
-        args.model, trust_remote_code=args.trust_remote_code)
+    config = get_config(model=args.model,
+                        trust_remote_code=args.trust_remote_code)
+    if args.model_prefix:
+        config = getattr(config, args.model_prefix)
+        config = SimpleNamespace(**config)
+
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
@@ -546,15 +551,14 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif (config.architectures[0] == "DeepseekV3ForCausalLM"
-          or config.architectures[0] == "DeepseekV2ForCausalLM"):
+    elif (config.architectures[0]
+          in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM")):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in [
-            "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"
-    ]:
+    elif config.architectures[0] in ("Qwen2MoeForCausalLM",
+                                     "Qwen3MoeForCausalLM"):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
@@ -569,7 +573,8 @@ def main(args: argparse.Namespace):
     shard_intermediate_size = 2 * intermediate_size // args.tp_size

     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else getattr(
+        torch, config.torch_dtype)
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
@@ -659,6 +664,7 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]:
     parser.add_argument("--batch-size", type=int, required=False)
     parser.add_argument("--tune", action="store_true")
     parser.add_argument("--trust-remote-code", action="store_true")
+    parser.add_argument("--model-prefix", type=str, required=False)
     args = parser.parse_args()

     main(args)
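
Note: taken together, the benchmark_moe.py changes swap transformers' AutoConfig for vLLM's get_config, add a --model-prefix flag for drilling into a nested sub-config, wrap that sub-config in a SimpleNamespace so attribute access keeps working, and resolve torch_dtype from a string via getattr(torch, ...). The sketch below is a minimal, dependency-light illustration of that pattern, not the benchmark itself; the nested dict and the helper names (resolve_config, resolve_dtype) are stand-ins invented here for illustration.

# Minimal sketch (not vLLM code): mimics how the updated benchmark handles
# a prefixed sub-config and a string-valued torch_dtype.
from types import SimpleNamespace
from typing import Optional

import torch


def resolve_config(config, model_prefix: Optional[str]):
    # Descend into the prefixed sub-config (e.g. a text backbone inside a
    # multimodal config) and wrap it so attribute access keeps working.
    if model_prefix:
        config = getattr(config, model_prefix)
        if isinstance(config, dict):
            config = SimpleNamespace(**config)
    return config


def resolve_dtype(config) -> torch.dtype:
    # get_config() can leave torch_dtype as a string such as "bfloat16",
    # hence the getattr(torch, ...) lookup in the diff.
    dt = config.torch_dtype
    return getattr(torch, dt) if isinstance(dt, str) else dt


if __name__ == "__main__":
    # Hypothetical top-level config whose MoE text backbone sits under
    # "text_config"; field names mirror the Qwen3-MoE branch of the diff.
    top_level = SimpleNamespace(text_config={
        "architectures": ["Qwen3MoeForCausalLM"],
        "num_experts": 128,
        "num_experts_per_tok": 8,
        "moe_intermediate_size": 768,
        "hidden_size": 2048,
        "torch_dtype": "bfloat16",
    })
    cfg = resolve_config(top_level, "text_config")
    print(cfg.architectures[0], cfg.num_experts, resolve_dtype(cfg))

Assuming the flag behaves the way the diff suggests, a tuning run for such a model would then add the new option, e.g. python benchmarks/kernels/benchmark_moe.py --model <hf-model-id> --model-prefix text_config --tune (the model id and prefix value here are placeholders).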

cmake/cpu_extension.cmake

Lines changed: 32 additions & 1 deletion
@@ -167,6 +167,33 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
 
     FetchContent_MakeAvailable(oneDNN)
 
+    list(APPEND LIBS dnnl)
+elseif(POWER10_FOUND)
+    FetchContent_Declare(
+        oneDNN
+        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+        GIT_TAG v3.7.2
+        GIT_PROGRESS TRUE
+        GIT_SHALLOW TRUE
+    )
+
+    set(ONEDNN_LIBRARY_TYPE "STATIC")
+    set(ONEDNN_BUILD_DOC "OFF")
+    set(ONEDNN_BUILD_EXAMPLES "OFF")
+    set(ONEDNN_BUILD_TESTS "OFF")
+    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
+    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+    set(ONEDNN_BUILD_GRAPH "OFF")
+    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
+    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
+    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
+    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
+    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+    set(DNNL_CPU_RUNTIME "OMP")
+
+    FetchContent_MakeAvailable(oneDNN)
+
     list(APPEND LIBS dnnl)
 endif()
 
@@ -197,6 +224,10 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
         "csrc/cpu/quant.cpp"
         "csrc/cpu/shm.cpp"
         ${VLLM_EXT_SRC})
+elseif(POWER10_FOUND)
+    set(VLLM_EXT_SRC
+        "csrc/cpu/quant.cpp"
+        ${VLLM_EXT_SRC})
 endif()
 
 #
@@ -214,4 +245,4 @@ define_gpu_extension_target(
     WITH_SOABI
 )
 
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
