Skip to content

Commit 7c610fa

Browse files
authored
add t5 tokenizer tests
* convert : add t5 tokenizer tests, use "slow" tokenizer
* llama : UGM tokenizer init with UNK tokens instead of PAD
1 parent 1d1cb01 commit 7c610fa

28 files changed: +58 −9 lines changed

convert-hf-to-gguf-update.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class TOKENIZER_TYPE(IntEnum):
4545
SPM = auto()
4646
BPE = auto()
4747
WPM = auto()
48+
UGM = auto()
4849

4950

5051
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
@@ -85,6 +86,7 @@ class TOKENIZER_TYPE(IntEnum):
8586
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
8687
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
8788
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
89+
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
8890
]
8991

9092

@@ -106,9 +108,13 @@ def download_model(model):
106108
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
107109

108110
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
111+
109112
if tokt == TOKENIZER_TYPE.SPM:
110113
files.append("tokenizer.model")
111114

115+
if tokt == TOKENIZER_TYPE.UGM:
116+
files.append("spiece.model")
117+
112118
for file in files:
113119
save_path = f"models/tokenizers/{name}/{file}"
114120
if os.path.isfile(save_path):
@@ -131,7 +137,7 @@ def download_model(model):
131137
name = model["name"]
132138
tokt = model["tokt"]
133139

134-
if tokt == TOKENIZER_TYPE.SPM:
140+
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
135141
continue
136142

137143
# Skip if the tokenizer folder does not exist or there are other download issues previously
@@ -141,7 +147,10 @@ def download_model(model):
141147

142148
# create the tokenizer
143149
try:
144-
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
150+
if name == "t5":
151+
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
152+
else:
153+
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
145154
except OSError as e:
146155
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
147156
continue # Skip to the next model if the tokenizer can't be loaded
@@ -262,6 +271,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
262271
"\n =",
263272
"' era",
264273
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
274+
"!!!!!!",
265275
"3",
266276
"33",
267277
"333",
@@ -299,7 +309,10 @@ def get_vocab_base_pre(self, tokenizer) -> str:
299309

300310
# create the tokenizer
301311
try:
302-
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
312+
if name == "t5":
313+
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
314+
else:
315+
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
303316
except OSError as e:
304317
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
305318
continue # Skip this model and continue with the next one in the loop

models/ggml-vocab-bert-bge.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-bert-bge.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
1027
3232
1005 3690
3333
7592 1010 1061 1005 2035 999 2129 2024 2017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995
34+
999 999 999 999 999 999
3435
1017
3536
3943
3637
21211

models/ggml-vocab-command-r.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-command-r.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
206 1857
3232
14 4515
3333
28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372
34+
57178 10251
3435
26
3536
26 26
3637
26 26 26

models/ggml-vocab-deepseek-coder.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-deepseek-coder.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
185 405
3232
6 2895
3333
17535 11 320 6 435 0 1717 417 340 12394 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239
34+
15330 3023
3435
18
3536
18 18
3637
18 18 18

models/ggml-vocab-deepseek-llm.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-deepseek-llm.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
185 403
3232
6 2906
3333
17464 11 320 6 436 0 1724 418 340 33701 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239
34+
15278 3033
3435
18
3536
18 18
3637
18 18 18

models/ggml-vocab-falcon.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-falcon.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
1212 40
3232
18 4932
3333
9856 23 291 18 436 12 1265 362 299 8196 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236
34+
51520
3435
30
3536
3138
3637
22287

models/ggml-vocab-gpt-2.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-gpt-2.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
198 796
3232
6 6980
3333
15496 11 331 6 439 0 1374 389 345 30325 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252
34+
13896 3228
3435
18
3536
2091
3637
20370

models/ggml-vocab-llama-bpe.gguf.inp

+2-2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33
@@ -104,5 +106,3 @@ __ggml_vocab_test__
104106

105107
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
106108
__ggml_vocab_test__
107-
Việt
108-
__ggml_vocab_test__

models/ggml-vocab-llama-bpe.gguf.out

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
198 284
3232
6 11639
3333
9906 11 379 65948 0 2650 527 499 27623 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909
34+
17523 3001
3435
18
3536
1644
3637
8765
@@ -41,4 +42,3 @@
4142
8765 8765 1644
4243
8765 8765 8765
4344
198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
44-
101798

models/ggml-vocab-llama-spm.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-llama-spm.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
29871 13 353
3232
525 3152
3333
15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739
34+
1738 6824 21004
3435
29871 29941
3536
29871 29941 29941
3637
29871 29941 29941 29941

models/ggml-vocab-mpt.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-mpt.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
187 426
3232
8 8685
3333
12092 13 340 8 455 2 1359 403 368 49042 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241
34+
18963 4672
3435
20
3536
1610
3637
20084

models/ggml-vocab-phi-3.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-phi-3.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
29871 13 353
3232
525 3152
3333
15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739
34+
1738 6824 21004
3435
29871 29941
3536
29871 29941 29941
3637
29871 29941 29941 29941

models/ggml-vocab-qwen2.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-qwen2.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
198 284
3232
6 11385
3333
9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
34+
17085 2928
3435
18
3536
18 18
3637
18 18 18

models/ggml-vocab-refact.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-refact.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
203 280
3232
25 34666
3333
8279 30 533 25 464 19 4971 884 844 18458 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838
34+
9163 3202
3435
37
3536
37 37
3637
37 37 37

models/ggml-vocab-starcoder.gguf.inp

+2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ __ggml_vocab_test__
7373
__ggml_vocab_test__
7474
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
7575
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
7678
3
7779
__ggml_vocab_test__
7880
33

models/ggml-vocab-starcoder.gguf.out

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
222 299
3232
44 34719
3333
8302 49 553 44 483 38 4998 904 863 18445 247 1037 4995 13379 2924 9515 17823 54 56 54 57 54 58 54 11904 47892
34+
9221 3226
3435
56
3536
56 56
3637
56 56 56

src/llama.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -14882,9 +14882,9 @@ struct llm_tokenizer_ugm {
1488214882
}
1488314883

1488414884
// initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
14885-
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
14885+
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
1488614886
// at the beginning tokenization score is zero
14887-
tokenization_results[0] = { 0, 0, 0 };
14887+
tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
1488814888

1488914889
for (size_t input_offset = 0; input_offset < input_len;) {
1489014890
size_t prefix_offset = input_offset;
@@ -14904,7 +14904,7 @@ struct llm_tokenizer_ugm {
1490414904
single_codepoint_token_found = true;
1490514905
}
1490614906
llama_token token_id = node->value;
14907-
const auto &token_data = vocab.id_to_token[token_id];
14907+
const auto & token_data = vocab.id_to_token[token_id];
1490814908

1490914909
// we set the user-defined token scores to 0 to make them more likely to be selected
1491014910
// (normal token scores are log probabilities, so they are negative)

0 commit comments

Comments (0)