@@ -592,7 +592,15 @@ const char* InferToReadbackBuffer(
     // Tokenize and determine the amount of tokens to generate
     // addSpecial - Special tokens in this case are BOS tokens
     // parseSpecial is always true since special tokens should be parsed
-    auto promptTokens = Tokenize(model, prompt, addSpecial, true).value();
+    auto promptTokenResult = Tokenize(model, prompt, addSpecial, true);
+    if (!promptTokenResult) {
+        finishReason = "TokenEncode";
+        readbackBufferPtr->jsonOutputBuffer = strdup(MakeJsonOutputString(context, finishReason, stoppedAt).c_str());
+        readbackBufferPtr->done = true;
+        return nullptr;
+    }
+
+    auto promptTokens = promptTokenResult.value();
 
     // Process the prompt in chunked batches
     if (!processPromptBatches(promptTokens)) {
@@ -602,8 +610,14 @@ const char* InferToReadbackBuffer(
     }
 
     MatchTrie::MatchTrie matchingTrie;
-    matchingTrie.AddMatchableWords(rewindStrings, numRewindStrings, MatchTrie::MatchType::REWIND);
-    matchingTrie.AddMatchableWords(stoppingStrings, numStoppingStrings, MatchTrie::MatchType::STOP);
+
+    if (rewindStrings != nullptr && numRewindStrings > 0) {
+        matchingTrie.AddMatchableWords(rewindStrings, numRewindStrings, MatchTrie::MatchType::REWIND);
+    }
+
+    if (stoppingStrings != nullptr && numStoppingStrings > 0) {
+        matchingTrie.AddMatchableWords(stoppingStrings, numStoppingStrings, MatchTrie::MatchType::STOP);
+    }
 
     std::string response;
     std::string buffer;
@@ -643,18 +657,18 @@ const char* InferToReadbackBuffer(
         // Abort if callback is fired
         if (isEnd) {
             finishReason = "StopToken";
-            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value();
+            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value_or("");
             break;
         }
 
         // End on length if max tokens is exceeded
         if (tokenCount + batch.n_tokens > numberTokensToPredict) {
             finishReason = "MaxNewTokens";
-            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value();
+            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value_or("");
             break;
         }
 
-        const std::string piece = TokenToPiece(model, newTokenId, decodeSpecial).value();
+        auto piece = TokenToPiece(model, newTokenId, decodeSpecial).value_or("");
 
         buffer += piece;
         tokenCount += batch.n_tokens;
@@ -685,15 +699,17 @@ const char* InferToReadbackBuffer(
             WriteToReadbackBuffer(readbackBufferPtr, strdup(partialBuffer.c_str()), newTokenId);
             response += buffer;
 
-            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value();
+            stoppedAt = TokenToPiece(model, newTokenId, decodeSpecial).value_or("");
             finishReason = "StopString";
             break;
         } else if (matchInfo.result == MatchTrie::MatchResult::MATCHED_REWIND) {
             llama_kv_cache_seq_rm(context, 0, rewindPos, -1);
 
             const auto tokens = Tokenize(model, buffer, false, false);
-            for (const llama_token token : tokens.value()) {
-                biases.push_back({token, -50000.0f});
+            if (tokens) {
+                for (const llama_token token : tokens.value()) {
+                    biases.push_back({token, -50000.0f});
+                }
             }
 
             if (banSampler == nullptr) {
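For readers outside this codebase, the pattern the diff adopts can be shown in isolation. The sketch below is a hypothetical, self-contained C++ example of checking an optional result before use and substituting value_or("") for unchecked value() calls; TryTokenize and TryTokenToPiece are illustrative stand-ins, not the project's actual Tokenize/TokenToPiece helpers.

// Minimal sketch of the error-handling pattern introduced by this diff:
// check an optional-returning tokenizer before use, and fall back to an
// empty string with value_or() instead of calling value() unconditionally.
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

using Token = std::int32_t;

// May fail; returns std::nullopt instead of throwing or returning garbage.
std::optional<std::vector<Token>> TryTokenize(const std::string& text) {
    if (text.empty()) {
        return std::nullopt;
    }
    return std::vector<Token>{1, 2, 3};  // placeholder token ids
}

// May fail for an unknown token id.
std::optional<std::string> TryTokenToPiece(Token id) {
    if (id < 0) {
        return std::nullopt;
    }
    return std::string("<piece>");  // placeholder piece text
}

int main() {
    // Mirrors the promptTokenResult guard: bail out early on encode failure.
    const auto tokens = TryTokenize("hello world");
    if (!tokens) {
        return 1;  // a "TokenEncode"-style finish reason would be reported here
    }

    // Mirrors the value_or("") changes: a missing piece degrades to an empty
    // string rather than throwing std::bad_optional_access.
    const std::string piece = TryTokenToPiece(tokens->front()).value_or("");
    return piece.empty() ? 1 : 0;
}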