
Commit 8551c44

context : always use non-causal attention for encoder graphs (ggml-org#12447)
* context : always use non-causal attention for encoder graphs

ggml-ci

* context : move the change to llama_context::encode()

ggml-ci
1 parent 35cae5b commit 8551c44

1 file changed: +9, -0 lines changed

src/llama-context.cpp

Lines changed: 9 additions & 0 deletions
@@ -1057,13 +1057,22 @@ int llama_context::encode(llama_batch & inp_batch) {
     ggml_backend_sched_reset(sched.get());
     ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
+    const auto causal_attn_org = cparams.causal_attn;
+
+    // always use non-causal attention for encoder graphs
+    // TODO: this is a tmp solution until we have a proper way to support enc-dec models
+    // ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
+    cparams.causal_attn = false;
+
     auto * gf = graph_init();
     auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);
 
     ggml_backend_sched_alloc_graph(sched.get(), gf);
 
     res->set_inputs(&ubatch);
 
+    cparams.causal_attn = causal_attn_org;
+
     const auto compute_status = graph_compute(gf, n_tokens > 1);
     switch (compute_status) {
         case GGML_STATUS_SUCCESS:
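The diff overrides cparams.causal_attn by hand and restores it after the encoder graph has been built and its inputs set, so the override only needs to be live during graph construction. Below is a minimal standalone sketch of the same save/restore idea expressed as a small RAII guard; the scoped_flag_override type and the surrounding toy code are illustrative assumptions, not llama.cpp API or part of this commit.

#include <cassert>

// Sketch: temporarily override a boolean flag and restore the original
// value on scope exit, mirroring the manual save/restore in the commit.
// The type name is hypothetical, not part of llama.cpp.
struct scoped_flag_override {
    bool & flag;     // the flag being overridden (e.g. cparams.causal_attn)
    bool original;   // value captured on entry, restored on exit

    scoped_flag_override(bool & f, bool tmp_value) : flag(f), original(f) {
        flag = tmp_value;
    }
    ~scoped_flag_override() {
        flag = original;  // restored even on early return or exception
    }
};

int main() {
    bool causal_attn = true;
    {
        // encoder graphs always run with non-causal attention
        scoped_flag_override guard(causal_attn, false);
        assert(causal_attn == false);
        // ... build the encoder graph and set its inputs here ...
    }
    assert(causal_attn == true);  // original setting is back
    return 0;
}

A guard like this would also restore the flag on any early-return path, whereas the commit uses an explicit restore placed right before graph_compute(); either way, the original causal_attn setting is back in effect for subsequent decode calls.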
