Commit c17cf9a

enable skipping_special_tokens in hf_tokenizer detokenize()
1 parent: 5816343

File tree

1 file changed (+5, -4 lines)


llama_cpp/llama_tokenizer.py

Lines changed: 5 additions & 4 deletions
@@ -78,18 +78,19 @@ def tokenize(
         )

     def detokenize(
-        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
+        self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = True
     ) -> bytes:
+        skip_special_tokens = not special
         if prev_tokens is not None:
-            text = self.hf_tokenizer.decode(prev_tokens + tokens).encode(
+            text = self.hf_tokenizer.decode(prev_tokens + tokens, skip_special_tokens=skip_special_tokens).encode(
                 "utf-8", errors="ignore"
             )
-            prev_text = self.hf_tokenizer.decode(prev_tokens).encode(
+            prev_text = self.hf_tokenizer.decode(prev_tokens, skip_special_tokens=skip_special_tokens).encode(
                 "utf-8", errors="ignore"
             )
             return text[len(prev_text) :]
         else:
-            return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
+            return self.hf_tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens).encode("utf-8", errors="ignore")

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
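For reference, a minimal usage sketch of the new flag (not part of the commit; "gpt2" is only an illustrative placeholder for any Hugging Face model with special tokens):

from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Hypothetical example model name, used only for illustration.
tokenizer = LlamaHFTokenizer.from_pretrained("gpt2")
tokens = tokenizer.tokenize(b"Hello, world!")

# Default (special=True) preserves the previous behavior: special tokens
# such as BOS/EOS remain in the decoded bytes.
print(tokenizer.detokenize(tokens))

# special=False maps to skip_special_tokens=True on hf_tokenizer.decode(),
# stripping special-token markers from the output.
print(tokenizer.detokenize(tokens, special=False))

Because special defaults to True and is inverted into skip_special_tokens, existing callers of detokenize() see no change in behavior; only callers that pass special=False get the stripped output.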
