vllm-project
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
Lines changed: 2 additions & 1 deletion
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
Lines changed: 2 additions & 2 deletions
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
Lines changed: 5 additions & 3 deletions
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
Lines changed: 9 additions & 10 deletions
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
Lines changed: 7 additions & 6 deletions
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
Lines changed: 2 additions & 1 deletion
@@ -90,7 +90,8 @@ def bench_run(results: list[benchmark.Measurement], model: str,
 
     score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
 
-    topk_weights, topk_ids, token_expert_indices = fused_topk(a, score, topk, renormalize=False)
+    topk_weights, topk_ids, token_expert_indices = fused_topk(
+        a, score, topk, renormalize=False)
 
     def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
                        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
 
@@ -115,8 +115,8 @@ def run():
         from vllm.model_executor.layers.fused_moe import override_config
         with override_config(config):
             if use_deep_gemm:
-                topk_weights, topk_ids,token_expert_indices = fused_topk(x, input_gating, topk,
-                                                    False)
+                topk_weights, topk_ids, token_expert_indices = fused_topk(
+                    x, input_gating, topk, False)
                 return fused_experts(
                     x,
                     w1,
 
@@ -127,7 +127,8 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
 }
 
 __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
-                                       const int* expert_map_ptr, int num_experts) {
+                                       const int* expert_map_ptr,
+                                       int num_experts) {
   auto tidx = threadIdx.x;
   auto bidx = blockIdx.x;
   auto lidx = tidx & 31;
@@ -157,8 +158,9 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
     topk_id_ptr[offset + tidx] = topk_id;
   }
 }
-void preprocessTopkIdLauncher(int* topk_id_ptr, int size, const int* expert_map_ptr,
-                              int num_experts, cudaStream_t stream) {
+void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
+                              const int* expert_map_ptr, int num_experts,
+                              cudaStream_t stream) {
   int block = std::min(size, 1024);
   int grid = (size + block - 1) / block;
   int smem_size = (num_experts) * sizeof(int);
 
@@ -18,7 +18,7 @@ inline T* get_ptr(torch::Tensor& t) {
 
 template <typename T>
 inline const T* get_ptr(const torch::Tensor& t) {
-    return reinterpret_cast<const T*>(t.data_ptr());
+  return reinterpret_cast<const T*>(t.data_ptr());
 }
 
 class CubKeyValueSorter {
@@ -42,7 +42,6 @@ class CubKeyValueSorter {
   int num_bits_;
 };
 
-
 void computeExpertFirstTokenOffset(int const* sorted_indices,
                                    int const total_indices,
                                    int const num_experts,
@@ -54,19 +53,18 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
                        int64_t* expert_first_token_offset, int num_rows,
                        int num_experts, int num_experts_per_node, int k,
                        CubKeyValueSorter& sorter, void* sorter_ws,
-                       cudaStream_t stream); 
-
+                       cudaStream_t stream);
 
 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output, const float* unpermuted_scales,
-    int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row,
+    T const* unpermuted_input, T* permuted_output,
+    const float* unpermuted_scales, int* sorted_experts,
+    int const* expanded_dest_row_to_expanded_source_row,
     int* expanded_source_row_to_expanded_dest_row,
     int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
     int num_local_experts, const int& align_block_size, cudaStream_t stream);
 
-
 // Final kernel to unpermute and scale
 // This kernel unpermutes the original data, does the k-way reduction and
 // performs the final skip connection.
@@ -83,10 +81,11 @@ void finalizeMoeRoutingKernelLauncher(
     float const* scales, int const* expanded_source_row_to_expanded_dest_row,
     int const* expert_for_source_row, int64_t const num_rows,
     int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
-    cudaStream_t stream); 
+    cudaStream_t stream);
 
-void preprocessTopkIdLauncher(int* topk_id_ptr, int size, const int* expert_map_ptr,
-                              int num_experts, cudaStream_t stream);
+void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
+                              const int* expert_map_ptr, int num_experts,
+                              cudaStream_t stream);
 
 void getMIndices(int64_t* expert_first_token_offset,
                  int64_t* align_expert_first_token_offset, int* m_indices,
 
@@ -1,9 +1,10 @@
-#pragma once 
+#pragma once
 
 template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
 __global__ void expandInputRowsKernel(
-    T const* unpermuted_input, T* permuted_output, const float* unpermuted_scales,
-    int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row,
+    T const* unpermuted_input, T* permuted_output,
+    const float* unpermuted_scales, int* sorted_experts,
+    int const* expanded_dest_row_to_expanded_source_row,
     int* expanded_source_row_to_expanded_dest_row,
     int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_dest_rows, int64_t const cols, int64_t k,
@@ -81,11 +82,11 @@ __global__ void expandInputRowsKernel(
   }
 }
 
-
 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output, const float* unpermuted_scales,
-    int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row,
+    T const* unpermuted_input, T* permuted_output,
+    const float* unpermuted_scales, int* sorted_experts,
+    int const* expanded_dest_row_to_expanded_source_row,
     int* expanded_source_row_to_expanded_dest_row,
     int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
 
@@ -420,7 +420,8 @@ def test_fused_marlin_moe(
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
 
-    topk_weights, topk_ids, token_expert_indices = fused_topk(a, score, topk, False)
+    topk_weights, topk_ids, token_expert_indices = fused_topk(
+        a, score, topk, False)
 
     torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map)