
Commit 762eeb6

triage for opencl
1 parent 481f7a6 commit 762eeb6

4 files changed, +60 −6 lines changed


gpttype_adapter.cpp (+11 −3)

@@ -934,10 +934,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     model_params.n_gpu_layers = inputs.gpulayers;

     #if defined(GGML_USE_CLBLAST)
-    if(file_format==FileFormat::GGUF_GENERIC && (file_format_meta.model_architecture == GGUFArch::FALCON || file_format_meta.model_architecture == GGUFArch::PHI) && model_params.n_gpu_layers>0)
+    if(file_format==FileFormat::GGUF_GENERIC && model_params.n_gpu_layers>0)
     {
-        printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
-        model_params.n_gpu_layers = 0;
+        if(file_format_meta.model_architecture == GGUFArch::FALCON)
+        {
+            printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
+            model_params.n_gpu_layers = 0;
+        }
+        else if(file_format_meta.model_architecture == GGUFArch::PHI || file_format_meta.n_expert_count>1)
+        {
+            printf("\nOpenCL cannot use regular GPU offloading for this model architecture. A fallback GPU offloader will be used with degraded performance.\n");
+            clblast_offload_fallback_mode = true;
+        }
     }
     #endif
     #if defined(GGML_USE_CUBLAS)
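
In effect, the triage above now splits CLBlast users into three cases. A minimal standalone restatement of the architecture branch only, using hypothetical names (OffloadMode, pick_offload_mode) that are not part of the commit:

    // Illustrative sketch only, not code from this commit.
    enum class OffloadMode { Regular, Fallback, Disabled };

    static OffloadMode pick_offload_mode(GGUFArch arch, int n_expert_count)
    {
        if (arch == GGUFArch::FALCON) return OffloadMode::Disabled;                    // offload turned off entirely
        if (arch == GGUFArch::PHI || n_expert_count > 1) return OffloadMode::Fallback; // degraded-performance path
        return OffloadMode::Regular;                                                   // normal CLBlast offload
    }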

llama.cpp (+41 −2)

@@ -180,6 +180,25 @@ static std::string format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }

+static bool clblast_offload_fallback_mode = false; //used when regular offload will segfault
+static int clblast_offload_fallback_layers = 0;
+static int layer_name_to_number(std::string inputString)
+{
+    size_t firstDotPosition = inputString.find('.');
+    int converted = -1;
+
+    if (firstDotPosition != std::string::npos) {
+        size_t secondDotPosition = inputString.find('.', firstDotPosition + 1);
+        if (secondDotPosition != std::string::npos) {
+            std::string numbersPortion = inputString.substr(firstDotPosition + 1, secondDotPosition - firstDotPosition - 1);
+            try{converted = std::stoi(numbersPortion);}
+            catch (const std::invalid_argument& e) {}
+            catch (const std::out_of_range& e) {}
+        }
+    }
+    return converted;
+}
+
 //
 // gguf constants (sync with gguf.py)
 //
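
As a rough usage note (not part of the diff): llama.cpp names per-layer tensors in the form blk.<n>.<suffix>, so the helper returns the number between the first two dots and -1 for anything else. Illustrative expectations, assuming that naming convention:

    // Illustrative only, not code from this commit.
    assert(layer_name_to_number("blk.12.attn_norm.weight") == 12);
    assert(layer_name_to_number("blk.0.ffn_down.weight")   == 0);
    assert(layer_name_to_number("output.weight")           == -1); // only one dot, no layer index
    assert(layer_name_to_number("token_embd.weight")       == -1); // not a per-layer tensor
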
@@ -2628,6 +2647,19 @@ struct llama_model_loader {
                }
            }

+#if defined(GGML_USE_CLBLAST)
+            if(clblast_offload_fallback_mode)
+            {
+                int layernum = layer_name_to_number(cur->name);
+                bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum);
+                if(shouldoffload)
+                {
+                    cur->backend = GGML_BACKEND_GPU;
+                    ggml_cl_transform_tensor(cur->data, cur);
+                }
+            }
+#endif
+
            size_done += ggml_nbytes(cur);
        }

@@ -3330,7 +3362,14 @@ static bool llm_load_tensors(
    model.n_gpu_layers = n_gpu_layers;

    const int64_t n_layer = hparams.n_layer;
-    const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+    int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+
+    if(clblast_offload_fallback_mode)
+    {
+        printf("\nOpenCL GPU Offload Fallback...");
+        clblast_offload_fallback_layers = n_gpu_layers;
+        i_gpu_start = std::max((int64_t) hparams.n_layer, (int64_t) 0);
+    }

    // there is very little benefit to offloading the input layer, so always keep it on the CPU
    model.buft_input = llama_default_buffer_type_cpu(true);
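
A worked example with illustrative numbers (not from the commit), say n_layer = 32 and n_gpu_layers = 20:

    // Regular path : i_gpu_start = max(32 - 20, 0) = 12 -> layers 12..31 get GPU buffer types.
    // Fallback path: i_gpu_start = max(32, 0)      = 32 -> no layer gets a GPU buffer type here;
    //                clblast_offload_fallback_layers = 20, so the loader hook added above
    //                offloads layers 0..19 tensor-by-tensor via ggml_cl_transform_tensor instead.
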
@@ -3401,7 +3440,7 @@ static bool llm_load_tensors(
            };
        }
        // assign the output layer
-        if (n_gpu_layers > n_layer) {
+        if (n_gpu_layers > n_layer && !clblast_offload_fallback_mode) {
            model.buft_output = {
                split_buft,
                llama_default_buffer_type_offload(main_gpu)

model_adapter.cpp (+7 −1)

@@ -272,10 +272,16 @@ void print_tok_vec(std::vector<float> &embd)
    if(modelarch!="" && fileformatmeta!=nullptr)
    {
        std::string fkey = modelarch+".context_length";
-        auto keyidx = gguf_find_key(ctx, fkey.c_str());
+        int keyidx = gguf_find_key(ctx, fkey.c_str());
        if (keyidx != -1) {
            fileformatmeta->n_ctx_train = gguf_get_val_u32(ctx, keyidx);
        }
+        fkey = modelarch+".expert_count";
+        keyidx = gguf_find_key(ctx, fkey.c_str());
+        if (keyidx != -1) {
+            fileformatmeta->n_expert_count = gguf_get_val_u32(ctx, keyidx);
+        }
+
        int filever = gguf_get_version(ctx);
        fileformatmeta->fileversion = filever;
        fileformatmeta->model_architecture = GGUFArch::DEFAULT;
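
For context (an illustration, not part of the diff): GGUF stores the expert count under the "<arch>.expert_count" key, so for a llama-architecture MoE model such as Mixtral 8x7B the lookup would roughly resolve as:

    // Illustrative values only.
    // modelarch = "llama"            -> fkey = "llama.expert_count"
    // gguf_get_val_u32(ctx, keyidx)  -> 8 experts per layer
    // => fileformatmeta->n_expert_count = 8, so n_expert_count > 1 makes the
    //    CLBlast triage in gpttype_adapter.cpp pick the fallback offloader.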

model_adapter.h (+1 −0)

@@ -62,6 +62,7 @@ struct FileFormatExtraMeta
    int n_ctx_train = 2048;
    int fileversion = 0;
    GGUFArch model_architecture = GGUFArch::DEFAULT;
+    int n_expert_count = 0;
 };

 enum ModelLoadResult
