@@ -7347,6 +7347,7 @@ struct llm_build_context {
     const int32_t n_tokens;
     const int32_t n_kv;     // size of KV cache to consider (n_kv <= kv_self.size)
     const int32_t n_outputs;
+    const int32_t n_enc_outputs;
     const int32_t kv_head;  // index of where we store new KV data in the cache
     const int32_t n_ctx_orig;
 
@@ -7396,6 +7397,7 @@ struct llm_build_context {
         n_tokens      (batch.n_tokens),
         n_kv          (worst_case ? kv_self.size : kv_self.n),
         n_outputs     (worst_case ? n_tokens : lctx.n_outputs),
+        n_enc_outputs (worst_case ? n_tokens : lctx.encoder_output.size() / hparams.n_embd),
         kv_head       (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
         n_ctx_orig    (cparams.n_ctx_orig_yarn),
         flash_attn    (cparams.flash_attn),
@@ -7660,14 +7662,14 @@ struct llm_build_context {
 
     struct ggml_tensor * llm_build_inp_enc_output() {
         const int64_t n_embd = hparams.n_embd;
-        lctx.inp_enc_output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, lctx.encoder_output.size() == 0 ? 512 : lctx.encoder_output.size() / n_embd);
+        lctx.inp_enc_output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc_outputs);
         ggml_set_input(lctx.inp_enc_output);
         cb(lctx.inp_enc_output, "enc_output", -1);
         return lctx.inp_enc_output;
     }
 
     struct ggml_tensor * llm_build_inp_cross_KQ_mask() {
-        lctx.inp_cross_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, lctx.encoder_output.size() == 0 ? 512 : lctx.encoder_output.size() / n_embd, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        lctx.inp_cross_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc_outputs, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         ggml_set_input(lctx.inp_cross_KQ_mask);
         cb(lctx.inp_cross_KQ_mask, "enc_mask", -1);
         return lctx.inp_cross_KQ_mask;
@@ -11717,7 +11719,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        const int32_t n_enc_output = lctx.encoder_output.size() == 0 ? 512 : lctx.encoder_output.size() / n_embd;
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -11926,7 +11927,7 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_enc_output);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_enc_outputs);
 
                 struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
                 struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
@@ -11937,10 +11938,10 @@ struct llm_build_context {
                 kq = ggml_soft_max_ext(ctx0, kq, enc_KQ_mask, 1.0f, hparams.f_max_alibi_bias);
                 cb(kq, "kq_soft_max_ext", il);
 
-                struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_enc_output)));
+                struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_enc_outputs)));
                 cb(v, "v", il);
 
-                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_enc_output, n_embd_head, n_head_kv), kq);
+                struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_enc_outputs, n_embd_head, n_head_kv), kq);
                 cb(kqv, "kqv", il);
 
                 struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
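For readers following the sizing logic, here is a small standalone sketch (not part of the patch) of how the cross-attention sequence length is derived: during worst-case graph reservation it falls back to n_tokens, otherwise it is the number of rows in the flat encoder output buffer, i.e. encoder_output.size() / n_embd, which then sizes inp_enc_output and inp_cross_KQ_mask. All concrete values below (n_embd = 512, a 3-row encoder output, 16 decoder tokens) are hypothetical and chosen only for illustration.

// Standalone illustration of the n_enc_outputs sizing rule used in the diff above.
// The concrete numbers are made up; only the expression mirrors the patch.
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <vector>

int main() {
    const int64_t n_embd   = 512;  // hypothetical embedding width
    const int32_t n_tokens = 16;   // hypothetical decoder batch size

    // Flat encoder output buffer: one n_embd-sized row per encoded token.
    std::vector<float> encoder_output(3 * n_embd);  // pretend the encoder produced 3 rows

    for (bool worst_case : {true, false}) {
        // Same expression as the initializer in the diff:
        //   worst_case ? n_tokens : encoder_output.size() / n_embd
        const int32_t n_enc_outputs = worst_case
            ? n_tokens
            : (int32_t) (encoder_output.size() / n_embd);

        // inp_enc_output would then be an [n_embd, n_enc_outputs] tensor and
        // inp_cross_KQ_mask an [n_enc_outputs, GGML_PAD(n_tokens, ...)] tensor.
        std::printf("worst_case=%d -> n_enc_outputs=%d\n", (int) worst_case, n_enc_outputs);
    }
    return 0;
}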