@@ -256,7 +256,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     return nullptr;
 }
 
-// CPU: ACCEL -> CPU extra -> GPU host -> CPU
+// CPU: ACCEL -> GPU host -> CPU extra -> CPU
 static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
     buft_list_t buft_list;
 
@@ -272,32 +272,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    bool has_gpu_device = false;
-    for (auto * dev : devices) {
-        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            has_gpu_device = true;
-            break;
-        }
-    }
-
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    if (!has_gpu_device) {
-        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-        if (ggml_backend_dev_get_extra_bufts_fn) {
-            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-            while (extra_bufts && *extra_bufts) {
-                buft_list.emplace_back(cpu_dev, *extra_bufts);
-                ++extra_bufts;
-            }
-        }
-    } else {
-        LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
-    }
-
     // add a host buffer type
     // storing the tensors in a host buffer is useful when the processing of large batches
     // is offloaded to a GPU device, since it reduces the time spent on data transfers
@@ -312,6 +286,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
+    // add extra buffer types
+    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+    if (ggml_backend_dev_get_extra_bufts_fn) {
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+        while (extra_bufts && *extra_bufts) {
+            buft_list.emplace_back(cpu_dev, *extra_bufts);
+            ++extra_bufts;
+        }
+    }
+
     // add the CPU buffer type
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
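
The diff only shows the list being built, so here is a hedged illustration of why the ordering matters: entries earlier in `buft_list` take priority when several buffer types could hold a weight, so moving the GPU host buffer ahead of the CPU extra (repacked) buffer types changes which one actually gets used. This is a self-contained sketch, not the llama.cpp implementation; `fake_dev`, `fake_buft`, and `select_first_supported` are invented stand-ins for `ggml_backend_dev_t`, `ggml_backend_buffer_type_t`, and the real selection logic in `select_weight_buft`.

```cpp
// Minimal sketch of "first supported buffer type in the list wins".
// The list order below mirrors the new priority from this change:
// ACCEL -> GPU host -> CPU extra -> CPU.
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

struct fake_dev  { std::string name; };                  // stand-in for ggml_backend_dev_t
struct fake_buft { std::string name; bool supported; };  // stand-in for ggml_backend_buffer_type_t

using buft_entry  = std::pair<fake_dev, fake_buft>;
using buft_list_t = std::vector<buft_entry>;

// Walk the list in order and return the first entry whose buffer type
// supports the weight; the list order is the entire selection policy.
static const buft_entry * select_first_supported(const buft_list_t & list) {
    for (const auto & entry : list) {
        if (entry.second.supported) {
            return &entry;
        }
    }
    return nullptr;
}

int main() {
    buft_list_t buft_list = {
        { {"ACCEL"}, {"accel_buft",  false} },  // no ACCEL device available here
        { {"CUDA0"}, {"cuda_host",   true } },  // GPU host buffer, now ahead of CPU extra
        { {"CPU"},   {"cpu_repack",  true } },  // extra (repacked) buffer type
        { {"CPU"},   {"cpu_default", true } },  // plain CPU buffer, final fallback
    };

    if (const buft_entry * e = select_first_supported(buft_list)) {
        // prints "selected: cuda_host on CUDA0": the host buffer wins over repacking
        std::printf("selected: %s on %s\n", e->second.name.c_str(), e->first.name.c_str());
    }
    return 0;
}
```

Under this ordering, a weight that a GPU host buffer can hold is placed there, and the repacked CPU buffer types only apply when no earlier entry supports it, which is why the guard and its warning log become unnecessary.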