@@ -180,6 +180,25 @@ static std::string format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }

+static bool clblast_offload_fallback_mode = false; //used when regular offload will segfault
+static int clblast_offload_fallback_layers = 0;
+static int layer_name_to_number(std::string inputString)
+{
+    size_t firstDotPosition = inputString.find('.');
+    int converted = -1;
+
+    if (firstDotPosition != std::string::npos) {
+        size_t secondDotPosition = inputString.find('.', firstDotPosition + 1);
+        if (secondDotPosition != std::string::npos) {
+            std::string numbersPortion = inputString.substr(firstDotPosition + 1, secondDotPosition - firstDotPosition - 1);
+            try{converted = std::stoi(numbersPortion);}
+            catch (const std::invalid_argument& e) {}
+            catch (const std::out_of_range& e) {}
+        }
+    }
+    return converted;
+}
+
 //
 // gguf constants (sync with gguf.py)
 //
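Not part of the patch: a rough sketch of how layer_name_to_number() behaves, assuming GGUF-style tensor names of the form "blk.<N>.<suffix>" and assuming the call is made from the same translation unit (the helper is static). It returns the integer between the first two dots, or -1 when the name has no such numeric segment:

    // Hypothetical driver; the tensor names are illustrative.
    #include <cassert>

    int main() {
        assert(layer_name_to_number("blk.12.attn_norm.weight") == 12); // per-layer tensor -> layer index
        assert(layer_name_to_number("blk.0.ffn_down.weight") == 0);
        assert(layer_name_to_number("output.weight") == -1);           // fewer than two dots
        assert(layer_name_to_number("blk.x.weight") == -1);            // std::stoi throws, caught -> -1
        return 0;
    }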
@@ -2628,6 +2647,19 @@ struct llama_model_loader {
             }
         }

+#if defined(GGML_USE_CLBLAST)
+        if(clblast_offload_fallback_mode)
+        {
+            int layernum = layer_name_to_number(cur->name);
+            bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum);
+            if(shouldoffload)
+            {
+                cur->backend = GGML_BACKEND_GPU;
+                ggml_cl_transform_tensor(cur->data, cur);
+            }
+        }
+#endif
+
         size_done += ggml_nbytes(cur);
     }

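Not part of the patch: in fallback mode the offload decision above is made per tensor, from its name alone. A minimal restatement of the predicate with a hypothetical helper name, assuming clblast_offload_fallback_layers holds the requested layer count:

    // Hypothetical helper mirroring the shouldoffload test in the CLBlast branch.
    // With clblast_offload_fallback_layers == 10, tensors of layers 0..9 are
    // marked GGML_BACKEND_GPU and handed to ggml_cl_transform_tensor(); names
    // that parse to -1 (embeddings, output, non-layer tensors) stay on the CPU.
    #include <string>

    static bool would_offload(const std::string & name, int fallback_layers) {
        int layernum = layer_name_to_number(name);
        return layernum >= 0 && fallback_layers > layernum;
    }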
@@ -3330,7 +3362,14 @@ static bool llm_load_tensors(
     model.n_gpu_layers = n_gpu_layers;

     const int64_t n_layer = hparams.n_layer;
-    const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+    int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+
+    if(clblast_offload_fallback_mode)
+    {
+        printf("\nOpenCL GPU Offload Fallback...");
+        clblast_offload_fallback_layers = n_gpu_layers;
+        i_gpu_start = std::max((int64_t) hparams.n_layer, (int64_t) 0);
+    }

     // there is very little benefit to offloading the input layer, so always keep it on the CPU
     model.buft_input = llama_default_buffer_type_cpu(true);
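Not part of the patch: a worked example of how the two settings interact, with hypothetical numbers:

    // Assume fallback mode is on, n_layer = 32, n_gpu_layers = 10.
    // i_gpu_start = max(32, 0) = 32, so the buffer-type assignment below keeps
    // every repeating layer (and, via the changed condition further down, the
    // output layer) on CPU buffer types.
    // clblast_offload_fallback_layers = 10, so load_all_data() later offloads
    // the tensors whose names parse to layer indices 0..9 through
    // ggml_cl_transform_tensor().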
@@ -3401,7 +3440,7 @@ static bool llm_load_tensors(
         };
     }
     // assign the output layer
-    if (n_gpu_layers > n_layer) {
+    if (n_gpu_layers > n_layer && !clblast_offload_fallback_mode ) {
         model.buft_output = {
             split_buft,
             llama_default_buffer_type_offload(main_gpu)