@@ -4547,35 +4547,6 @@ static void llm_load_vocab(
4547
4547
vocab.special_cls_id = -1;
4548
4548
vocab.special_mask_id = -1;
4549
4549
4550
- // For Fill-In-the-Middle (FIM)/infill models which where converted
4551
- // prior to support of FIM special tokens in GGUF, the following
4552
- // will allow those models to continue to work. The general names
4553
- // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4554
- // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4555
- // new versions of these models have been published.
4556
- std::string gen_name;
4557
- ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4558
-
4559
- std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4560
- [](unsigned char c){ return std::tolower(c); });
4561
-
4562
- if (gen_name.find("code") != std::string::npos) {
4563
- if (model.arch == LLM_ARCH_LLAMA && gen_name.find("llama") != std::string::npos) {
4564
- vocab.special_prefix_id = 32007;
4565
- vocab.special_suffix_id = 32008;
4566
- vocab.special_middle_id = 32009;
4567
- vocab.special_eot_id = 32010;
4568
- } else if (model.arch == LLM_ARCH_GEMMA) {
4569
- vocab.special_prefix_id = 67;
4570
- vocab.special_suffix_id = 69;
4571
- vocab.special_middle_id = 68;
4572
- // TODO: this is not EOT, it is "file separator" token, needs fix
4573
- // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4574
- //vocab.special_eot_id = 70;
4575
- vocab.special_eot_id = 107;
4576
- }
4577
- }
4578
-
4579
4550
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4580
4551
if (add_space_prefix_keyidx != -1) {
4581
4552
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4747,6 +4718,45 @@ static void llm_load_vocab(
4747
4718
4748
4719
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
4749
4720
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
4721
+ // For Fill-In-the-Middle (FIM)/infill models which were converted
4722
+ // prior to support of FIM special tokens in GGUF, the following
4723
+ // will allow those models to continue to work. The general names
4724
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4725
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4726
+ // new versions of these models have been published.
4727
+ std::string gen_name;
4728
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4729
+
4730
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4731
+ [](unsigned char c){ return std::tolower(c); });
4732
+
4733
+ if (gen_name.find("code") != std::string::npos) {
4734
+ if (model.arch == LLM_ARCH_LLAMA
4735
+ && 32010 < vocab.id_to_token.size()
4736
+ && vocab.id_to_token[32007].text == "<PRE>"
4737
+ && vocab.id_to_token[32008].text == "<SUF>"
4738
+ && vocab.id_to_token[32009].text == "<MID>"
4739
+ && vocab.id_to_token[32010].text == "<EOT>") {
4740
+ vocab.special_prefix_id = 32007;
4741
+ vocab.special_suffix_id = 32008;
4742
+ vocab.special_middle_id = 32009;
4743
+ vocab.special_eot_id = 32010;
4744
+ } else if (model.arch == LLM_ARCH_GEMMA
4745
+ && 107 < vocab.id_to_token.size()
4746
+ && vocab.id_to_token[67].text == "<|fim_prefix|>"
4747
+ && vocab.id_to_token[69].text == "<|fim_suffix|>"
4748
+ && vocab.id_to_token[68].text == "<|fim_middle|>"
4749
+ && vocab.id_to_token[107].text == "<end_of_turn>") {
4750
+ vocab.special_prefix_id = 67;
4751
+ vocab.special_suffix_id = 69;
4752
+ vocab.special_middle_id = 68;
4753
+ // TODO: this is not EOT, it is "file separator" token, needs fix
4754
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4755
+ //vocab.special_eot_id = 70;
4756
+ vocab.special_eot_id = 107;
4757
+ }
4758
+ }
4759
+
4750
4760
try {
4751
4761
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
4752
4762
} catch (const std::exception & e) {
0 commit comments