From 536709057a39391405cdf7370a5f6ac68a91d418 Mon Sep 17 00:00:00 2001 From: Ycros <18012+ycros@users.noreply.github.com> Date: Sat, 16 Dec 2023 10:16:08 +0000 Subject: [PATCH 1/5] Hacky poc of logging expert order. --- ggml-cuda.cu | 29 +++++++++++++++++++++++++++++ ggml.c | 4 ++++ llama.cpp | 2 ++ 3 files changed, 35 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 146659c9f212a..510e8b10906fc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7628,6 +7628,32 @@ inline void ggml_cuda_op_argsort( argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); + // printf ncols, nrows + // printf("\nncols: %d, nrows: %d", ncols, nrows); + + int* dst_host = (int *) malloc(ncols * nrows * sizeof(int)); + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemcpy(dst_host, dst_dd, ncols * nrows * sizeof(int), cudaMemcpyDeviceToHost)); + // printf("\n"); + printf("%s: \n", ggml_get_name(dst)); + // CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + // for (int i = 0; i < ncols; i++) { + // printf("%d ", ((int *)dst_host)[i]); + // } + + // row major order + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + printf("%d ", dst_host[i * ncols + j]); + } + printf("\n"); + } + + // for (int i = 0; i < ggml_nelements(dst); i++) { + // printf("%d ", ggml_get_i32_1d(dst, i)); + // } + // printf("\n"); + (void) src1; (void) dst; (void) src1_dd; @@ -9662,12 +9688,15 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph } } + printf("\n"); bool ok = ggml_cuda_compute_forward(¶ms, node); + printf("\n"); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); + #if 0 if (node->type == GGML_TYPE_F32) { cudaDeviceSynchronize(); diff --git a/ggml.c b/ggml.c index ee207a5d19454..bd7c283f92c14 100644 --- a/ggml.c +++ b/ggml.c @@ -12357,6 +12357,10 @@ static void ggml_compute_forward_argsort( case GGML_TYPE_F32: { ggml_compute_forward_argsort_f32(params, src0, dst); + for (int i = 0; i < ggml_nelements(dst); i++) { + printf("%d ", ggml_get_i32_1d(dst, i)); + } + printf("\n"); } break; default: { diff --git a/llama.cpp b/llama.cpp index fc38e18f6a84e..ec64717c9a491 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6074,6 +6074,8 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + printf("\n"); + ggml_allocr_reset(lctx.alloc); ggml_cgraph * gf = llama_build_graph(lctx, batch); From dae4ade6c8c59bd107822019420505f4dca72d34 Mon Sep 17 00:00:00 2001 From: Ycros <18012+ycros@users.noreply.github.com> Date: Sat, 16 Dec 2023 13:11:45 +0000 Subject: [PATCH 2/5] ggml-cuda: Replace bitonic argsort with insertion sort. 
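
The per-row ordering the new kernel computes is equivalent to the host-side C++ sketch below (a minimal illustration with made-up names, not code from this patch). Insertion sort compares through the index array, keeps equal scores in their original relative order, and has no power-of-two length requirement, which is why the GGML_ASSERT((ncols & (ncols - 1)) == 0) guard in argsort_f32_i32_cuda is dropped further down in this patch.

    // Argsort of one row by insertion sort: dst ends up holding the column
    // indices of x ordered by value. SORT_ASC/SORT_DESC are illustrative
    // stand-ins for the GGML sort-order enum.
    #include <cstdio>
    #include <utility>
    #include <vector>

    enum sort_order { SORT_ASC, SORT_DESC };

    static void argsort_row(const float * x, int * dst, int ncols, sort_order order) {
        for (int i = 0; i < ncols; i++) {
            dst[i] = i;                              // identity permutation
        }
        for (int i = 1; i < ncols; i++) {
            for (int j = i; j > 0; j--) {
                const bool out_of_order = order == SORT_ASC
                    ? x[dst[j - 1]] > x[dst[j]]
                    : x[dst[j - 1]] < x[dst[j]];
                if (!out_of_order) {
                    break;
                }
                std::swap(dst[j], dst[j - 1]);
            }
        }
    }

    int main() {
        const std::vector<float> scores = {0.1f, 0.7f, 0.3f, 0.7f};
        std::vector<int> idx(scores.size());
        argsort_row(scores.data(), idx.data(), (int) scores.size(), SORT_DESC);
        for (int i : idx) {
            printf("%d ", i);                        // prints "1 3 2 0"
        }
        printf("\n");
    }
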
--- ggml-cuda.cu | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 510e8b10906fc..8347b2f510d8c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5105,7 +5105,6 @@ static inline __device__ void swap(T & a, T & b) { template static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) { - // bitonic sort int col = threadIdx.x; int row = blockIdx.y; @@ -5114,31 +5113,32 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n const float * x_row = x + row * ncols; int * dst_row = dst + row * ncols; - // initialize indices - if (col < ncols) { - dst_row[col] = col; + // Initialize indices + for (int i = 0; i < ncols; i++) { + dst_row[i] = i; } - __syncthreads(); + __syncthreads(); // Ensure all indices are initialized - for (int k = 2; k <= ncols; k *= 2) { - for (int j = k / 2; j > 0; j /= 2) { - int ixj = col ^ j; - if (ixj > col) { - if ((col & k) == 0) { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { - swap(dst_row[col], dst_row[ixj]); - } + // Insertion sort + for (int i = 1; i < ncols; i++) { + int j = i; + while (j > 0) { + bool condition = order == GGML_SORT_ASC ? + x_row[dst_row[j-1]] > x_row[dst_row[j]] : + x_row[dst_row[j-1]] < x_row[dst_row[j]]; + + if (condition) { + // Swap + swap(dst_row[j], dst_row[j - 1]); } else { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { - swap(dst_row[col], dst_row[ixj]); + break; } - } - } - __syncthreads(); + j--; } } } + static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { const int col = blockDim.y*blockIdx.y + threadIdx.y; const int row = blockDim.x*blockIdx.x + threadIdx.x; @@ -6474,9 +6474,6 @@ static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, con } static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { - // bitonic sort requires ncols to be power of 2 - GGML_ASSERT((ncols & (ncols - 1)) == 0); - const dim3 block_dims(ncols, 1, 1); const dim3 block_nums(1, nrows, 1); if (order == GGML_SORT_ASC) { From 7a051397d003937feeeb466b5c881a0ca00e11b2 Mon Sep 17 00:00:00 2001 From: Ycros <18012+ycros@users.noreply.github.com> Date: Sat, 16 Dec 2023 14:03:02 +0000 Subject: [PATCH 3/5] Count and print experts used per token. 
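
What the counting amounts to is sketched below in plain C++ (made-up argsort output; the 8-expert / 2-experts-per-token values mirror the defaults visible in this patch). Each row of the argsort result is one token, its first experts_per_tok entries are the experts the router picked, and each pick increments a per-expert tally that print_expert_counter later dumps.

    // Host-side sketch of the tally in ggml_cuda_op_argsort: rows are tokens,
    // the first experts_per_tok columns of each row are the selected experts.
    #include <cstdio>
    #include <vector>

    int main() {
        const int expert_count    = 8;   // n_expert
        const int experts_per_tok = 2;   // n_expert_used
        const int nrows           = 3;   // tokens in this batch (illustrative)
        const int ncols           = expert_count;

        // Pretend argsort output, row-major, best-scoring expert first.
        const std::vector<int> dst_host = {
            3, 1, 0, 2, 4, 5, 6, 7,
            1, 5, 2, 0, 3, 4, 6, 7,
            3, 5, 1, 0, 2, 4, 6, 7,
        };

        std::vector<unsigned> expert_counter(expert_count, 0);
        for (int i = 0; i < nrows; i++) {
            for (int j = 0; j < experts_per_tok; j++) {   // only the chosen top-k
                expert_counter[dst_host[i * ncols + j]]++;
            }
        }
        for (int e = 0; e < expert_count; e++) {
            printf("%d:%2u ", e, expert_counter[e]);
        }
        printf("\n");  // 0: 0 1: 2 2: 0 3: 2 4: 0 5: 2 6: 0 7: 0
    }
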
--- ggml-cuda.cu | 52 +++++++++++++++++++++++++++++++++++----------------- ggml-cuda.h | 4 ++++ llama.cpp | 11 +++++++++++ 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 8347b2f510d8c..85e685e454c0a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5130,9 +5130,9 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n if (condition) { // Swap swap(dst_row[j], dst_row[j - 1]); - } else { + } else { break; - } + } j--; } } @@ -7611,6 +7611,27 @@ inline void ggml_cuda_op_sum_rows( (void) src1_dd; } +// const size_t TOTAL_EXPERTS = 8; +static uint * expert_counter = nullptr; +static size_t expert_count; +static uint experts_per_tok = 2; + +void reset_expert_counter(uint experts_per_tok, size_t expert_count) { + if (expert_counter != nullptr) { + free(expert_counter); + } + expert_counter = (uint *) calloc(expert_count, sizeof(uint)); + ::experts_per_tok = experts_per_tok; + ::expert_count = expert_count; +} + +void print_expert_counter() { + for (int i = 0; i < expert_count; i++) { + printf("%u:%2u ", i, expert_counter[i]); + } + printf("\n"); +} + inline void ggml_cuda_op_argsort( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { @@ -7631,26 +7652,23 @@ inline void ggml_cuda_op_argsort( int* dst_host = (int *) malloc(ncols * nrows * sizeof(int)); CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpy(dst_host, dst_dd, ncols * nrows * sizeof(int), cudaMemcpyDeviceToHost)); - // printf("\n"); - printf("%s: \n", ggml_get_name(dst)); - // CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); - // for (int i = 0; i < ncols; i++) { - // printf("%d ", ((int *)dst_host)[i]); - // } + // printf("%s: \n", ggml_get_name(dst)); // row major order for (int i = 0; i < nrows; i++) { for (int j = 0; j < ncols; j++) { - printf("%d ", dst_host[i * ncols + j]); + int val = dst_host[i * ncols + j]; + if (j < experts_per_tok) { + GGML_ASSERT(val < expert_count); + expert_counter[val]++; + } else { + break; + } + // printf("%d ", val); } - printf("\n"); + // printf("\n"); } - // for (int i = 0; i < ggml_nelements(dst); i++) { - // printf("%d ", ggml_get_i32_1d(dst, i)); - // } - // printf("\n"); - (void) src1; (void) dst; (void) src1_dd; @@ -9685,9 +9703,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph } } - printf("\n"); + // printf("\n"); bool ok = ggml_cuda_compute_forward(¶ms, node); - printf("\n"); + // printf("\n"); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } diff --git a/ggml-cuda.h b/ggml-cuda.h index cdb0c0c41618a..8ff238506c1c5 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -59,6 +59,10 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); // pinned host buffer for use with CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +void reset_expert_counter(uint experts_per_tok, size_t expert_count); + +void print_expert_counter(); + #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index ec64717c9a491..d3b50c4958c9c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4199,6 +4199,11 @@ struct llm_build_context { } struct ggml_cgraph * build_llama() { + #ifdef GGML_USE_CUBLAS + // printf("\nexpert_used: %u\n", n_expert_used); + reset_expert_counter(n_expert_used, n_expert); + #endif + 
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6173,6 +6178,12 @@ static int llama_decode_internal( ggml_graph_print(gf); #endif +#ifdef GGML_USE_CUBLAS + print_expert_counter(); +#endif + + // ycros_moe_debug(gf); + // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); From 31491d18c72622fa0335e2e0b1c1bb144db946fc Mon Sep 17 00:00:00 2001 From: Ycros <18012+ycros@users.noreply.github.com> Date: Sat, 16 Dec 2023 14:09:50 +0000 Subject: [PATCH 4/5] Kill ggml cpu debug prints. --- ggml.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index bd7c283f92c14..594a8da40c909 100644 --- a/ggml.c +++ b/ggml.c @@ -12357,10 +12357,17 @@ static void ggml_compute_forward_argsort( case GGML_TYPE_F32: { ggml_compute_forward_argsort_f32(params, src0, dst); - for (int i = 0; i < ggml_nelements(dst); i++) { - printf("%d ", ggml_get_i32_1d(dst, i)); - } - printf("\n"); + + // TODO: for cpu only or partial offload, the counters state would need to be + // moved out of ggml-cuda.cu and maybe into here, and then this would need to + // perform a similar update to what the CUDA code does. + // Additionally the state reset and print functions are currently only being + // called from CUBLAS ifdefs, they'd need to be moved outside of those. + + // for (int i = 0; i < ggml_nelements(dst); i++) { + // printf("%d ", ggml_get_i32_1d(dst, i)); + // } + // printf("\n"); } break; default: { From 8ee4c4d486f6161c48ea5e75b4aaa71083ded0e2 Mon Sep 17 00:00:00 2001 From: Ycros <18012+ycros@users.noreply.github.com> Date: Sun, 17 Dec 2023 11:05:14 +0000 Subject: [PATCH 5/5] Aggregate expert counters by prompt/gen. Print percentages as well. 
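
The aggregation follows roughly the lifecycle below (a standalone sketch; expert_stats and the phase labels are illustrative, not names from this patch): tally over prompt processing, print counts and percentages and reset once sampling starts, then tally and print again for generation.

    #include <algorithm>
    #include <cstdio>
    #include <initializer_list>
    #include <vector>

    struct expert_stats {
        std::vector<unsigned> counts;

        explicit expert_stats(size_t n_expert) : counts(n_expert, 0) {}

        void add(int expert_id) { counts[expert_id]++; }
        void reset()            { std::fill(counts.begin(), counts.end(), 0u); }

        void print(const char * phase) const {
            unsigned total = 0;
            for (unsigned c : counts) total += c;
            printf("%s experts:\n", phase);
            for (size_t i = 0; i < counts.size(); i++) {
                printf("%zu:%6u ", i, counts[i]);
            }
            printf("\n");
            for (size_t i = 0; i < counts.size(); i++) {
                printf("%zu:%6.2f%% ", i, total ? 100.0f * counts[i] / total : 0.0f);
            }
            printf("\n");
        }
    };

    int main() {
        expert_stats stats(8);
        // prompt processing: every routed token bumps its experts' counters
        for (int e : {3, 1, 1, 5, 3, 5}) stats.add(e);
        stats.print("prompt");
        stats.reset();      // printed and reset once sampling starts
        // generation: a second, independent tally, printed at the end
        for (int e : {0, 7, 0, 2}) stats.add(e);
        stats.print("gen");
    }
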
--- ggml-cuda.cu | 47 ++++++++++++++++++++++++++++----------------- gpttype_adapter.cpp | 24 +++++++++++++++++++++++ llama.cpp | 6 +++--- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a67d2353b33c4..d8d8b91dd95c5 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7614,7 +7614,7 @@ inline void ggml_cuda_op_sum_rows( // const size_t TOTAL_EXPERTS = 8; static uint * expert_counter = nullptr; static size_t expert_count; -static uint experts_per_tok = 2; +static uint experts_per_tok = 0; void reset_expert_counter(uint experts_per_tok, size_t expert_count) { if (expert_counter != nullptr) { @@ -7626,9 +7626,18 @@ void reset_expert_counter(uint experts_per_tok, size_t expert_count) { } void print_expert_counter() { + printf("\n"); + uint total = 0; for (int i = 0; i < expert_count; i++) { - printf("%u:%2u ", i, expert_counter[i]); + printf("%u:%6u ", i, expert_counter[i]); + total += expert_counter[i]; } + + printf("\n"); + for (int i = 0; i < expert_count; i++) { + printf("%u:%6.2f%% ", i, 100.0f * expert_counter[i] / total); + } + printf("\n"); } @@ -7649,24 +7658,26 @@ inline void ggml_cuda_op_argsort( // printf ncols, nrows // printf("\nncols: %d, nrows: %d", ncols, nrows); - int* dst_host = (int *) malloc(ncols * nrows * sizeof(int)); - CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy(dst_host, dst_dd, ncols * nrows * sizeof(int), cudaMemcpyDeviceToHost)); - // printf("%s: \n", ggml_get_name(dst)); - - // row major order - for (int i = 0; i < nrows; i++) { - for (int j = 0; j < ncols; j++) { - int val = dst_host[i * ncols + j]; - if (j < experts_per_tok) { - GGML_ASSERT(val < expert_count); - expert_counter[val]++; - } else { - break; + if (expert_counter != nullptr) { + int* dst_host = (int *) malloc(ncols * nrows * sizeof(int)); + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemcpy(dst_host, dst_dd, ncols * nrows * sizeof(int), cudaMemcpyDeviceToHost)); + // printf("%s: \n", ggml_get_name(dst)); + + // row major order + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + int val = dst_host[i * ncols + j]; + if (j < experts_per_tok) { + GGML_ASSERT(val < expert_count); + expert_counter[val]++; + } else { + break; + } + // printf("%d ", val); } - // printf("%d ", val); + // printf("\n"); } - // printf("\n"); } (void) src1; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 0e8b4b259bbab..1496ce0bbbec1 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -99,6 +99,9 @@ static std::mutex concat_output_mtx; static std::string concat_output = ""; static std::string concat_output_reader_copy = ""; +static uint32_t xx_n_expert = 0; +static uint32_t xx_n_expert_used = 0; + const int extra_context_handle_fragmentation = 80; inline bool IsNanCheck(float f) @@ -972,6 +975,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } } + xx_n_expert = llamamodel->hparams.n_expert; + xx_n_expert_used = llamamodel->hparams.n_expert_used; + llama_ctx_v4 = llama_new_context_with_model(llamamodel, llama_ctx_params); if (llama_ctx_v4 == NULL) @@ -1721,6 +1727,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o printf("%s\n\n", RemoveBell(outstr).c_str()); } +#ifdef GGML_USE_CUBLAS + if (xx_n_expert > 0) { + reset_expert_counter(xx_n_expert_used, xx_n_expert); + } +#endif + while (remaining_tokens > 0) { gpt_vocab::id id = 0; @@ -1841,6 +1853,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o if 
(!startedsampling) { + #ifdef GGML_USE_CUBLAS + if (xx_n_expert > 0) { + print_expert_counter(); + reset_expert_counter(xx_n_expert_used, xx_n_expert); + } + #endif startedsampling = true; params.n_batch = original_batch; params.n_threads = original_threads; @@ -2004,5 +2022,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o total_gens += 1; snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str()); +#ifdef GGML_USE_CUBLAS + if (xx_n_expert > 0) { + print_expert_counter(); + } +#endif + return output; } diff --git a/llama.cpp b/llama.cpp index af95d838bcb43..365f9249235c2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4205,7 +4205,7 @@ struct llm_build_context { struct ggml_cgraph * build_llama() { #ifdef GGML_USE_CUBLAS // printf("\nexpert_used: %u\n", n_expert_used); - reset_expert_counter(n_expert_used, n_expert); + // reset_expert_counter(n_expert_used, n_expert); #endif struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -6083,7 +6083,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - printf("\n"); + // printf("\n"); ggml_allocr_reset(lctx.alloc); @@ -6183,7 +6183,7 @@ static int llama_decode_internal( #endif #ifdef GGML_USE_CUBLAS - print_expert_counter(); + // print_expert_counter(); #endif // ycros_moe_debug(gf);
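
On the TODO left in ggml.c by patch 4 (the counters currently only work with full CUDA offload): a hypothetical shape for the CPU-side update, assuming the counter state were moved behind a small backend-agnostic hook. expert_counter_add is an assumed name, not a function that exists in this series; ggml_get_i32_1d is the same accessor the commented-out debug loop used.

    // Sketch only: mirrors the CUDA-side tally using the argsort result that
    // ggml_compute_forward_argsort_f32 just wrote. Assumes ggml.h and a hook
    // (expert_counter_add) that owns the counter state.
    #include "ggml.h"

    static void count_experts_cpu(const struct ggml_tensor * dst,
                                  int experts_per_tok,
                                  void (*expert_counter_add)(int expert_id)) {
        const int ncols = (int) dst->ne[0];       // experts scored per token
        const int nrows = (int) ggml_nrows(dst);  // one row per token
        for (int i = 0; i < nrows; i++) {
            for (int j = 0; j < experts_per_tok && j < ncols; j++) {
                expert_counter_add(ggml_get_i32_1d(dst, i * ncols + j));
            }
        }
    }
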