Skip to content

Commit 732b5fb

Browse files
convert : avoid calls to tokenizer.added_tokens_decoder (ggml-org#12473)
tokenizer.added_tokens_decoder rebuilds and returns a fresh dict on every access, and does so relatively slowly (~0.04s per call on average), which results in massive slowdowns when we have a huge number of added tokens
1 parent 568013d commit 732b5fb

File tree

1 file changed

+4
-2
lines changed

1 file changed

+4
-2
lines changed

convert_hf_to_gguf.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
529529
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
530530
added_vocab = tokenizer.get_added_vocab()
531531

532+
added_tokens_decoder = tokenizer.added_tokens_decoder
533+
532534
for i in range(vocab_size):
533535
if i not in reverse_vocab:
534536
tokens.append(f"[PAD{i}]")
@@ -538,13 +540,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
538540
if token in added_vocab:
539541
# The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
540542
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
541-
if not tokenizer.added_tokens_decoder[i].normalized:
543+
if not added_tokens_decoder[i].normalized:
542544
previous_token = token
543545
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
544546
if previous_token != token:
545547
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
546548

547-
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
549+
if added_tokens_decoder[i].special or self.does_token_look_special(token):
548550
toktypes.append(gguf.TokenType.CONTROL)
549551
else:
550552
# NOTE: this was added for Gemma.

0 commit comments

Comments
 (0)