Skip to content

Commit 053b3f9

Browse files
authored
ggml-cpu : update KleidiAI to v1.5.0 (ggml-org#12568)
ggml-cpu : bug fix related to KleidiAI LHS packing

Signed-off-by: Dan Johansson <dan.johansson@arm.com>
1 parent e2f5601 commit 053b3f9

File tree

4 files changed

+7
-14
lines changed

4 files changed

+7
-14
lines changed

ggml/src/ggml-cpu/CMakeLists.txt

+2 -2
Original file line number | Diff line number | Diff line change
@@ -359,9 +359,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
359359

360360
# Fetch KleidiAI sources:
361361
include(FetchContent)
362-
set(KLEIDIAI_COMMIT_TAG "v1.3.0")
362+
set(KLEIDIAI_COMMIT_TAG "v1.5.0")
363363
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
364-
set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9")
364+
set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
365365

366366
if (POLICY CMP0135)
367367
cmake_policy(SET CMP0135 NEW)

ggml/src/ggml-cpu/kleidiai/kernels.cpp

+2 -7
Original file line number | Diff line number | Diff line change
@@ -51,11 +51,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
5151
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
5252
},
5353
/* .lhs_info = */ {
54-
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
55-
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
54+
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
55+
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
5656
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
5757
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
58-
/* .require_aligned_m_idx = */ true,
5958
},
6059
/* .rhs_info = */ {
6160
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
@@ -100,7 +99,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
10099
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
101100
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
102101
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
103-
/* .require_aligned_m_idx = */ false,
104102
},
105103
/* .rhs_info = */ {
106104
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -144,7 +142,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
144142
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
145143
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
146144
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
147-
/* .require_aligned_m_idx = */ false,
148145
},
149146
/* .rhs_info = */ {
150147
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -189,7 +186,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
189186
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
190187
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
191188
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
192-
/* .require_aligned_m_idx = */ false,
193189
},
194190
/* .rhs_info = */ {
195191
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -233,7 +229,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
233229
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
234230
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
235231
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
236-
/* .require_aligned_m_idx = */ false,
237232
},
238233
/* .rhs_info = */ {
239234
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,

ggml/src/ggml-cpu/kleidiai/kernels.h

-1
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,6 @@ struct lhs_packing_info {
4040
size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
4141
void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
4242
size_t lhs_stride, void* lhs_packed);
43-
bool require_aligned_m_idx;
4443
};
4544

4645
struct rhs_packing_info {

ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

+3 -4
Original file line number | Diff line number | Diff line change
@@ -124,8 +124,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
124124
size_t sr = kernel->get_sr();
125125

126126
// Calculate number of columns to be processed per thread
127-
const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true;
128-
const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m;
127+
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
129128
const size_t m_start = ith * num_m_per_thread;
130129
size_t m_to_process = num_m_per_thread;
131130
if ((m_start + m_to_process) > m) {
@@ -135,11 +134,11 @@ class tensor_traits : public ggml::cpu::tensor_traits {
135134
if(m_start < m) {
136135
// Transform LHS
137136
const size_t src_stride = src1->nb[1];
138-
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1]));
137+
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
139138
const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
140139
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
141140

142-
lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr);
141+
lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
143142
}
144143

145144
ggml_barrier(params->threadpool);

0 commit comments

Comments
 (0)