@@ -592,7 +592,15 @@ const char* InferToReadbackBuffer(
     // Tokenize and determine the amount of tokens to generate
     // addSpecial - Special tokens in this case are BOS tokens
     // parseSpecial is always true since special tokens should be parsed
-    auto promptTokens = Tokenize(model, prompt, addSpecial, true).value();
+    auto promptTokenResult = Tokenize(model, prompt, addSpecial, true);
+    if (!promptTokenResult) {
+        finishReason = "TokenEncode";
+        readbackBufferPtr->jsonOutputBuffer = strdup(MakeJsonOutputString(context, finishReason, stoppedAt).c_str());
+        readbackBufferPtr->done = true;
+        return nullptr;
+    }
+
+    auto promptTokens = promptTokenResult.value();
 
     // Process the prompt in chunked batches
     if (!processPromptBatches(promptTokens)) {
@@ -602,8 +610,14 @@ const char* InferToReadbackBuffer(
     }
 
     MatchTrie::MatchTrie matchingTrie;
-    matchingTrie.AddMatchableWords(rewindStrings, numRewindStrings, MatchTrie::MatchType::REWIND);
-    matchingTrie.AddMatchableWords(stoppingStrings, numStoppingStrings, MatchTrie::MatchType::STOP);
+
+    if (rewindStrings != nullptr && numRewindStrings > 0) {
+        matchingTrie.AddMatchableWords(rewindStrings, numRewindStrings, MatchTrie::MatchType::REWIND);
+    }
+
+    if (stoppingStrings != nullptr && numStoppingStrings > 0) {
+        matchingTrie.AddMatchableWords(stoppingStrings, numStoppingStrings, MatchTrie::MatchType::STOP);
+    }
 
     std::string response;
     std::string buffer;
@@ -643,18 +657,18 @@ const char* InferToReadbackBuffer(
         // Abort if callback is fired
         if (isEnd) {
             finishReason = "StopToken";
-            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value();
+            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value_or("");
             break;
         }
 
         // End on length if max tokens is exceeded
         if (tokenCount + batch.n_tokens > numberTokensToPredict) {
             finishReason = "MaxNewTokens";
-            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value();
+            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value_or("");
             break;
         }
 
-        const std::string piece = TokenToPiece(model, newTokenId, decodeSpecial).value();
+        auto piece = TokenToPiece(model, newTokenId, decodeSpecial).value_or("");
 
         buffer += piece;
         tokenCount += batch.n_tokens;
@@ -685,15 +699,17 @@ const char* InferToReadbackBuffer(
             WriteToReadbackBuffer(readbackBufferPtr, strdup(partialBuffer.c_str()), newTokenId);
             response += buffer;
 
-            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value();
+            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value_or("");
             finishReason = "StopString";
             break;
         } else if (matchInfo.result == MatchTrie::MatchResult::MATCHED_REWIND) {
             llama_kv_cache_seq_rm(context, 0, rewindPos, -1);
 
             const auto tokens = Tokenize(model, buffer, false, false);
-            for (const llama_token token : tokens.value()) {
-                biases.push_back({token, -50000.0f});
+            if (tokens) {
+                for (const llama_token token : tokens.value()) {
+                    biases.push_back({token, -50000.0f});
+                }
             }
 
             if (banSampler == nullptr) {
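For readers outside this codebase, the pattern the diff adopts can be shown in isolation. The sketch below is a hypothetical, self-contained C++ example of checking an optional result before use and substituting value_or("") for unchecked value() calls; TryTokenize and TryTokenToPiece are illustrative stand-ins, not the project's actual Tokenize/TokenToPiece helpers.

// Minimal sketch of the error-handling pattern introduced by this diff:
// check an optional-returning tokenizer before use, and fall back to an
// empty string with value_or() instead of calling value() unconditionally.
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

using Token = std::int32_t;

// May fail; returns std::nullopt instead of throwing or returning garbage.
std::optional<std::vector<Token>> TryTokenize(const std::string& text) {
    if (text.empty()) {
        return std::nullopt;
    }
    return std::vector<Token>{1, 2, 3};  // placeholder token ids
}

// May fail for an unknown token id.
std::optional<std::string> TryTokenToPiece(Token id) {
    if (id < 0) {
        return std::nullopt;
    }
    return std::string("<piece>");  // placeholder piece text
}

int main() {
    // Mirrors the promptTokenResult guard: bail out early on encode failure.
    const auto tokens = TryTokenize("hello world");
    if (!tokens) {
        return 1;  // a "TokenEncode"-style finish reason would be reported here
    }

    // Mirrors the value_or("") changes: a missing piece degrades to an empty
    // string rather than throwing std::bad_optional_access.
    const std::string piece = TryTokenToPiece(tokens->front()).value_or("");
    return piece.empty() ? 1 : 0;
}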