Skip to content

Commit 0bb2919

Browse files
Djip007
and
philou
authored
llama : change cpu_buft_list order: ACCEL -> GPU host -> CPU extra -> CPU (ggml-org#12632)
This allows using the GPU host buffer when possible instead of CPU repack. This has the same effect of resolving issue ggml-org#12498 without completely disabling the CPU extra buffer. Co-authored-by: philou <philou@framework>
1 parent a69f846 commit 0bb2919

File tree

1 file changed

+15
-27
lines changed

1 file changed

+15
-27
lines changed

src/llama-model.cpp

+15-27
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
256256
return nullptr;
257257
}
258258

259-
// CPU: ACCEL -> CPU extra -> GPU host -> CPU
259+
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
260260
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
261261
buft_list_t buft_list;
262262

@@ -272,32 +272,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
272272
}
273273
}
274274

275-
bool has_gpu_device = false;
276-
for (auto * dev : devices) {
277-
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
278-
has_gpu_device = true;
279-
break;
280-
}
281-
}
282-
283-
// add extra buffer types, only if no GPU device is present
284-
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
285-
if (!has_gpu_device) {
286-
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
287-
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
288-
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
289-
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
290-
if (ggml_backend_dev_get_extra_bufts_fn) {
291-
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
292-
while (extra_bufts && *extra_bufts) {
293-
buft_list.emplace_back(cpu_dev, *extra_bufts);
294-
++extra_bufts;
295-
}
296-
}
297-
} else {
298-
LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
299-
}
300-
301275
// add a host buffer type
302276
// storing the tensors in a host buffer is useful when the processing of large batches
303277
// is offloaded to a GPU device, since it reduces the time spent on data transfers
@@ -312,6 +286,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
312286
}
313287
}
314288

289+
// add extra buffer types (e.g. CPU repack); placed after the GPU host buffer so the host buffer is preferred when a GPU device is present
290+
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
291+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
292+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
293+
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
294+
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
295+
if (ggml_backend_dev_get_extra_bufts_fn) {
296+
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
297+
while (extra_bufts && *extra_bufts) {
298+
buft_list.emplace_back(cpu_dev, *extra_bufts);
299+
++extra_bufts;
300+
}
301+
}
302+
315303
// add the CPU buffer type
316304
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
317305
ggml_backend_dev_t dev = ggml_backend_dev_get(i);

0 commit comments

Comments
 (0)