@@ -180,6 +180,25 @@ static std::string format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }

+static bool clblast_offload_fallback_mode = false; //used when regular offload will segfault
+static int clblast_offload_fallback_layers = 0;
+static int layer_name_to_number(std::string inputString)
+{
+    size_t firstDotPosition = inputString.find('.');
+    int converted = -1;
+
+    if (firstDotPosition != std::string::npos) {
+        size_t secondDotPosition = inputString.find('.', firstDotPosition + 1);
+        if (secondDotPosition != std::string::npos) {
+            std::string numbersPortion = inputString.substr(firstDotPosition + 1, secondDotPosition - firstDotPosition - 1);
+            try{converted = std::stoi(numbersPortion);}
+            catch (const std::invalid_argument& e) {}
+            catch (const std::out_of_range& e) {}
+        }
+    }
+    return converted;
+}
+
 //
 // gguf constants (sync with gguf.py)
 //
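Not part of the patch: a rough sketch of how layer_name_to_number() behaves, assuming GGUF-style tensor names of the form "blk.<N>.<suffix>" and assuming the call is made from the same translation unit (the helper is static). It returns the integer between the first two dots, or -1 when the name has no such numeric segment:

    // Hypothetical driver; the tensor names are illustrative.
    #include <cassert>

    int main() {
        assert(layer_name_to_number("blk.12.attn_norm.weight") == 12); // per-layer tensor -> layer index
        assert(layer_name_to_number("blk.0.ffn_down.weight") == 0);
        assert(layer_name_to_number("output.weight") == -1);           // fewer than two dots
        assert(layer_name_to_number("blk.x.weight") == -1);            // std::stoi throws, caught -> -1
        return 0;
    }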
@@ -2628,6 +2647,19 @@ struct llama_model_loader {
             }
         }

+#if defined(GGML_USE_CLBLAST)
+        if(clblast_offload_fallback_mode)
+        {
+            int layernum = layer_name_to_number(cur->name);
+            bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum);
+            if(shouldoffload)
+            {
+                cur->backend = GGML_BACKEND_GPU;
+                ggml_cl_transform_tensor(cur->data, cur);
+            }
+        }
+#endif
+
         size_done += ggml_nbytes(cur);
     }

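Not part of the patch: in fallback mode the offload decision above is made per tensor, from its name alone. A minimal restatement of the predicate with a hypothetical helper name, assuming clblast_offload_fallback_layers holds the requested layer count:

    // Hypothetical helper mirroring the shouldoffload test in the CLBlast branch.
    // With clblast_offload_fallback_layers == 10, tensors of layers 0..9 are
    // marked GGML_BACKEND_GPU and handed to ggml_cl_transform_tensor(); names
    // that parse to -1 (embeddings, output, non-layer tensors) stay on the CPU.
    #include <string>

    static bool would_offload(const std::string & name, int fallback_layers) {
        int layernum = layer_name_to_number(name);
        return layernum >= 0 && fallback_layers > layernum;
    }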
@@ -3330,7 +3362,14 @@ static bool llm_load_tensors(
     model.n_gpu_layers = n_gpu_layers;

     const int64_t n_layer = hparams.n_layer;
-    const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+    int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+
+    if(clblast_offload_fallback_mode)
+    {
+        printf("\nOpenCL GPU Offload Fallback...");
+        clblast_offload_fallback_layers = n_gpu_layers;
+        i_gpu_start = std::max((int64_t) hparams.n_layer, (int64_t) 0);
+    }

     // there is very little benefit to offloading the input layer, so always keep it on the CPU
     model.buft_input = llama_default_buffer_type_cpu(true);
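Not part of the patch: a worked example of how the two settings interact, with hypothetical numbers:

    // Assume fallback mode is on, n_layer = 32, n_gpu_layers = 10.
    // i_gpu_start = max(32, 0) = 32, so the buffer-type assignment below keeps
    // every repeating layer (and, via the changed condition further down, the
    // output layer) on CPU buffer types.
    // clblast_offload_fallback_layers = 10, so load_all_data() later offloads
    // the tensors whose names parse to layer indices 0..9 through
    // ggml_cl_transform_tensor().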
@@ -3401,7 +3440,7 @@ static bool llm_load_tensors(
         };
     }
     // assign the output layer
-    if (n_gpu_layers > n_layer) {
+    if (n_gpu_layers > n_layer && !clblast_offload_fallback_mode ) {
         model.buft_output = {
             split_buft,
             llama_default_buffer_type_offload(main_gpu)