Add an env var to set the self attn text context factor

WilliamTambellini · WilliamTambellini · commit d93ce40c307f · 2024-09-26T12:01:11.000-07:00
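Add an environment variable, WHISPER_SELFATTN_CACHE_TEXT_CTX_FACTOR, to set the text context factor used to size the self-attention KV cache. The default remains 3 when the variable is unset or not a positive integer. Resolves: ggerganov#2334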
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -1005,7 +1005,7 @@ static bool whisper_kv_cache_find_slot(
         }
 
         if (n_tested >= n_ctx) {
-            //WHISPER_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+            WHISPER_LOG_ERROR("%s: failed to find a slot for %d tokens. n_tested=%d n_ctx=%d cache.head=%d\n", __func__, n_tokens, n_tested, n_ctx, cache.head);
             return false;
         }
     }
@@ -3409,8 +3409,11 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     }
 
     // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
-    // in theory, there can be a case where this is not enough, but in practice it should always be enough
-    const int factor = 3;
+    // Note: there are cases where 3 is not enough specially when increasing beamsize
+    static const char* text_ctx_factor_evstr = getenv("WHISPER_SELFATTN_CACHE_TEXT_CTX_FACTOR");
+    static const int text_ctx_factor_ev = text_ctx_factor_evstr ? atoi(text_ctx_factor_evstr) : 0;
+    const int factor =  text_ctx_factor_ev > 0 ? text_ctx_factor_ev : 3;
+    WHISPER_LOG_DEBUG("%s: init self-attn cache: n_ctx: %d factor: %d\n", __func__, factor*ctx->model.hparams.n_text_ctx, factor);
 
     if (!whisper_kv_cache_init(state->kv_self, state->backends[0], ctx->itype,
                 ctx->model.hparams.n_text_state,

Original file line number	Diff line number	Diff line change
`@@ -1005,7 +1005,7 @@ static bool whisper_kv_cache_find_slot(`
`1005`	`1005`	`}`
`1006`	`1006`
`1007`	`1007`	`if (n_tested >= n_ctx) {`
`1008`		`- //WHISPER_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);`
	`1008`	`+ WHISPER_LOG_ERROR("%s: failed to find a slot for %d tokens. n_tested=%d n_ctx=%d cache.head=%d\n", __func__, n_tokens, n_tested, n_ctx, cache.head);`
`1009`	`1009`	`return false;`
`1010`	`1010`	`}`
`1011`	`1011`	`}`
`@@ -3409,8 +3409,11 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {`
`3409`	`3409`	`}`
`3410`	`3410`
`3411`	`3411`	`// at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx`
`3412`		`- // in theory, there can be a case where this is not enough, but in practice it should always be enough`
`3413`		`- const int factor = 3;`
	`3412`	`+ // Note: there are cases where 3 is not enough specially when increasing beamsize`
	`3413`	`+ static const char* text_ctx_factor_evstr = getenv("WHISPER_SELFATTN_CACHE_TEXT_CTX_FACTOR");`
	`3414`	`+ static const int text_ctx_factor_ev = text_ctx_factor_evstr ? atoi(text_ctx_factor_evstr) : 0;`
	`3415`	`+ const int factor = text_ctx_factor_ev > 0 ? text_ctx_factor_ev : 3;`
	`3416`	`+ WHISPER_LOG_DEBUG("%s: init self-attn cache: n_ctx: %d factor: %d\n", __func__, factor*ctx->model.hparams.n_text_ctx, factor);`
`3414`	`3417`
`3415`	`3418`	`if (!whisper_kv_cache_init(state->kv_self, state->backends[0], ctx->itype,`
`3416`	`3419`	`ctx->model.hparams.n_text_state,`