@@ -198,7 +198,7 @@ void FreeCtx(llama_context* ctx)
198
198
// Resets cached state for `ctx`: empties prevTokens, then invokes the
// KV-cache clear call on the context.
// NOTE(review): prevTokens is declared outside this excerpt; elsewhere in this
// diff it is passed to common_lcp() together with the incoming tokens, so it is
// presumably the token history of the previous request -- confirm.
void ClearContextKVCache (llama_context* ctx)
199
199
{
200
200
prevTokens.clear ();
201
// Line removed by this diff: the previously used clear call.
- llama_kv_cache_clear (ctx);
201
// Line added by this diff: replacement call, invoked with the same `ctx` argument.
+ llama_kv_self_clear (ctx);
202
202
}
203
203
204
204
void FreeModel (llama_model* model)
@@ -799,7 +799,7 @@ const char* InferToReadbackBuffer(
799
799
800
800
// Check when tokens diverge and remove everything after the common prefix
801
801
const size_t prefixEnd = common_lcp (tokens, prevTokens);
802
- llama_kv_cache_seq_rm (context, 0 , prefixEnd, -1 );
802
+ llama_kv_self_seq_rm (context, 0 , prefixEnd, -1 );
803
803
804
804
for (size_t i = prefixEnd; i < tokens.size (); i += batchSize) {
805
805
const size_t remaining = tokens.size () - i;
@@ -895,7 +895,7 @@ const char* InferToReadbackBuffer(
895
895
auto [newTokenId, isEnd] = gen (firstBatch, sampler);
896
896
897
897
// Extra samplers - Banned strings
898
- int rewindPos = llama_get_kv_cache_used_cells (context);
898
+ int rewindPos = llama_kv_self_used_cells (context);
899
899
int rewindTokenId = 0 ;
900
900
int tokenCount = 0 ;
901
901
int rewindTokenCount = 0 ;
@@ -952,7 +952,7 @@ const char* InferToReadbackBuffer(
952
952
buffer = " " ;
953
953
954
954
// Save last known accept point in case we have to rewind back to the last accept.
955
- rewindPos = llama_get_kv_cache_used_cells (context);
955
+ rewindPos = llama_kv_self_used_cells (context);
956
956
rewindTokenId = newTokenId;
957
957
rewindTokenCount = tokenCount;
958
958
@@ -979,7 +979,7 @@ const char* InferToReadbackBuffer(
979
979
continue ;
980
980
}
981
981
case MatchTrie::MatchResult::MATCHED_REWIND: {
982
- llama_kv_cache_seq_rm (context, 0 , rewindPos, -1 );
982
+ llama_kv_self_seq_rm (context, 0 , rewindPos, -1 );
983
983
984
984
// Reset the detokenizer too when rewinding
985
985
if (readbackBufferPtr->detokenizer ) {
0 commit comments