We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 4d64733 commit ec2cb7f — Copy full SHA for ec2cb7f
converter/convert-tokenizer-hf.py
@@ -72,7 +72,12 @@ def resolveLlamaTokenizer(self):
72
t = processor.id_to_piece(i)
73
s = processor.get_score(i)
74
t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
75
- b = t.encode('utf-8')
+ # Check for byte characters
76
+ if len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
77
+ # For example, "<0x0A>" is a newline character
78
+ b = bytearray.fromhex(t[3:-1])
79
+ else:
80
+ b = t.encode('utf-8')
81
self.tokens.append(b)
82
self.scores.append(s)
83
0 commit comments