Skip to content

Commit 40a53ff

Browse files
authored
Merge pull request axolotl-ai-cloud#307 from OpenAccess-AI-Collective/xgen-user-sharegpt-tokens
better handling since xgen tokenizer breaks with convert_tokens_to_ids
2 parents dcdec44 + 2a428e8 commit 40a53ff

File tree

1 file changed

+12
-6
lines changed

1 file changed

+12
-6
lines changed

src/axolotl/prompt_tokenizers.py

+12 -6
Original file line number | Diff line number | Diff line change
@@ -48,16 +48,22 @@ def tokenize_prompt(self, prompt):
48 48

49 49
@functools.lru_cache(maxsize=128)
50 50
def _get_user_token(self):
51-
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
52-
if isinstance(id_or_ids, (int,)):
53-
return id_or_ids
51+
try:
52+
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
53+
if isinstance(id_or_ids, (int,)):
54+
return id_or_ids
55+
except KeyError:
56+
pass
54 57
return False
55 58

56 59
@functools.lru_cache(maxsize=128)
57 60
def _get_assistant_token(self):
58-
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
59-
if isinstance(id_or_ids, (int,)):
60-
return id_or_ids
61+
try:
62+
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
63+
if isinstance(id_or_ids, (int,)):
64+
return id_or_ids
65+
except KeyError:
66+
pass
61 67
return False
62 68

63 69
def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):

0 commit comments

Comments
 (0)