Skip to content

Commit 840cd41

Browse files
committed
fix pre-commit failed
Signed-off-by: Caleb_Du <Caleb_Du@zju.edu.cn>
1 parent c7e5830 commit 840cd41

File tree

13 files changed

+178
-159
lines changed

13 files changed

+178
-159
lines changed

benchmarks/kernels/benchmark_grouped_gemm_cutlass.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ def bench_run(results: list[benchmark.Measurement], model: str,
9090

9191
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
9292

93-
topk_weights, topk_ids, token_expert_indices = fused_topk(a, score, topk, renormalize=False)
93+
topk_weights, topk_ids, token_expert_indices = fused_topk(
94+
a, score, topk, renormalize=False)
9495

9596
def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
9697
topk_weights: torch.Tensor, topk_ids: torch.Tensor,

benchmarks/kernels/benchmark_moe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ def run():
115115
from vllm.model_executor.layers.fused_moe import override_config
116116
with override_config(config):
117117
if use_deep_gemm:
118-
topk_weights, topk_ids,token_expert_indices = fused_topk(x, input_gating, topk,
119-
False)
118+
topk_weights, topk_ids, token_expert_indices = fused_topk(
119+
x, input_gating, topk, False)
120120
return fused_experts(
121121
x,
122122
w1,

csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
127127
}
128128

129129
__global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
130-
const int* expert_map_ptr, int num_experts) {
130+
const int* expert_map_ptr,
131+
int num_experts) {
131132
auto tidx = threadIdx.x;
132133
auto bidx = blockIdx.x;
133134
auto lidx = tidx & 31;
@@ -157,8 +158,9 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
157158
topk_id_ptr[offset + tidx] = topk_id;
158159
}
159160
}
160-
void preprocessTopkIdLauncher(int* topk_id_ptr, int size, const int* expert_map_ptr,
161-
int num_experts, cudaStream_t stream) {
161+
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
162+
const int* expert_map_ptr, int num_experts,
163+
cudaStream_t stream) {
162164
int block = std::min(size, 1024);
163165
int grid = (size + block - 1) / block;
164166
int smem_size = (num_experts) * sizeof(int);

csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ inline T* get_ptr(torch::Tensor& t) {
1818

1919
template <typename T>
2020
inline const T* get_ptr(const torch::Tensor& t) {
21-
return reinterpret_cast<const T*>(t.data_ptr());
21+
return reinterpret_cast<const T*>(t.data_ptr());
2222
}
2323

2424
class CubKeyValueSorter {
@@ -42,7 +42,6 @@ class CubKeyValueSorter {
4242
int num_bits_;
4343
};
4444

45-
4645
void computeExpertFirstTokenOffset(int const* sorted_indices,
4746
int const total_indices,
4847
int const num_experts,
@@ -54,19 +53,18 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
5453
int64_t* expert_first_token_offset, int num_rows,
5554
int num_experts, int num_experts_per_node, int k,
5655
CubKeyValueSorter& sorter, void* sorter_ws,
57-
cudaStream_t stream);
58-
56+
cudaStream_t stream);
5957

6058
template <typename T>
6159
void expandInputRowsKernelLauncher(
62-
T const* unpermuted_input, T* permuted_output, const float* unpermuted_scales,
63-
int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row,
60+
T const* unpermuted_input, T* permuted_output,
61+
const float* unpermuted_scales, int* sorted_experts,
62+
int const* expanded_dest_row_to_expanded_source_row,
6463
int* expanded_source_row_to_expanded_dest_row,
6564
int64_t* expert_first_token_offset, int64_t const num_rows,
6665
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
6766
int num_local_experts, const int& align_block_size, cudaStream_t stream);
6867

69-
7068
// Final kernel to unpermute and scale
7169
// This kernel unpermutes the original data, does the k-way reduction and
7270
// performs the final skip connection.
@@ -83,10 +81,11 @@ void finalizeMoeRoutingKernelLauncher(
8381
float const* scales, int const* expanded_source_row_to_expanded_dest_row,
8482
int const* expert_for_source_row, int64_t const num_rows,
8583
int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
86-
cudaStream_t stream);
84+
cudaStream_t stream);
8785

88-
void preprocessTopkIdLauncher(int* topk_id_ptr, int size, const int* expert_map_ptr,
89-
int num_experts, cudaStream_t stream);
86+
void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
87+
const int* expert_map_ptr, int num_experts,
88+
cudaStream_t stream);
9089

9190
void getMIndices(int64_t* expert_first_token_offset,
9291
int64_t* align_expert_first_token_offset, int* m_indices,

csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
#pragma once
1+
#pragma once
22

33
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
44
__global__ void expandInputRowsKernel(
5-
T const* unpermuted_input, T* permuted_output, const float* unpermuted_scales,
6-
int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row,
5+
T const* unpermuted_input, T* permuted_output,
6+
const float* unpermuted_scales, int* sorted_experts,
7+
int const* expanded_dest_row_to_expanded_source_row,
78
int* expanded_source_row_to_expanded_dest_row,
89
int64_t* expert_first_token_offset, int64_t const num_rows,
910
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
@@ -81,11 +82,11 @@ __global__ void expandInputRowsKernel(
8182
}
8283
}
8384

84-
8585
template <typename T>
8686
void expandInputRowsKernelLauncher(
87-
T const* unpermuted_input, T* permuted_output, const float* unpermuted_scales,
88-
int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row,
87+
T const* unpermuted_input, T* permuted_output,
88+
const float* unpermuted_scales, int* sorted_experts,
89+
int const* expanded_dest_row_to_expanded_source_row,
8990
int* expanded_source_row_to_expanded_dest_row,
9091
int64_t* expert_first_token_offset, int64_t const num_rows,
9192
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,

tests/kernels/moe/test_moe.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,8 @@ def test_fused_marlin_moe(
420420

421421
score = torch.randn((m, e), device="cuda", dtype=dtype)
422422

423-
topk_weights, topk_ids, token_expert_indices = fused_topk(a, score, topk, False)
423+
topk_weights, topk_ids, token_expert_indices = fused_topk(
424+
a, score, topk, False)
424425

425426
torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map)
426427

0 commit comments

Comments
 (0)