Skip to content

Commit ec2cb7f

Browse files
authored
fix: converter hf now handles byte characters (#189)
1 parent 4d64733 commit ec2cb7f

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

converter/convert-tokenizer-hf.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,12 @@ def resolveLlamaTokenizer(self):
7272
t = processor.id_to_piece(i)
7373
s = processor.get_score(i)
7474
t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
75-
b = t.encode('utf-8')
75+
# Check for byte characters
76+
if len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
77+
# For example, "<0x0A>" is a newline character
78+
b = bytearray.fromhex(t[3:-1])
79+
else:
80+
b = t.encode('utf-8')
7681
self.tokens.append(b)
7782
self.scores.append(s)
7883

0 commit comments

Comments
 (0)