Commit 239200a
Fix tokenization edge case where llama output does not start with a space
See this notebook: https://colab.research.google.com/drive/1Ooz11nFPk19zyJdMDx42CeesU8aWZMdI#scrollTo=oKpHw5PZ30uC
1 parent c50d330 commit 239200a

File tree: 1 file changed (+2 −2 lines)
llama_cpp/_internals.py

@@ -201,7 +201,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         # NOTE: Llama1 models automatically added a space at the start of the prompt
         # this line removes a leading space if the first token is a beginning of sentence token
         return (
-            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" " else output
         )

     # Extra
@@ -796,4 +796,4 @@ def sample(
     def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
         if apply_grammar and self.grammar is not None:
             ctx_main.grammar_accept_token(self.grammar, id)
-        self.prev.append(id)
\ No newline at end of file
+        self.prev.append(id)
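
For context, here is a minimal standalone sketch of the bug this commit fixes. It is an illustration, not the library's actual code: the `bos` value and the byte strings are made up for the example. Before the change, detokenize unconditionally stripped the first byte whenever the token list began with a BOS token; the fix adds a guard so the byte is only stripped when it actually is a space.

# A minimal standalone sketch of the edge case (not the library's code):
# `bos` stands in for self.token_bos(), and `output` for the bytes produced
# by the llama.cpp detokenizer.

def old_detokenize(tokens: list[int], output: bytes, bos: int = 1) -> bytes:
    # Old behavior: after a BOS token, always drop the first byte, on the
    # assumption that the detokenizer emitted a leading space.
    return output[1:] if len(tokens) > 0 and tokens[0] == bos else output

def new_detokenize(tokens: list[int], output: bytes, bos: int = 1) -> bytes:
    # Fixed behavior: only drop the first byte when it really is a space.
    return (
        output[1:]
        if len(tokens) > 0 and tokens[0] == bos and output[0:1] == b" "
        else output
    )

# When the output starts with a space, both versions agree:
assert old_detokenize([1, 2], b" hello") == b"hello"
assert new_detokenize([1, 2], b" hello") == b"hello"

# When it does not (the case from the linked notebook), the old version
# silently ate the first real byte of the output:
assert old_detokenize([1, 2], b"hello") == b"ello"   # corrupted
assert new_detokenize([1, 2], b"hello") == b"hello"  # preserved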
