Skip to content

Commit f75eedf

Browse files
authored
extensive token check
1 parent 29a9884 commit f75eedf

File tree

1 file changed

+39
-29
lines changed

1 file changed

+39
-29
lines changed

llama.cpp

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4547,35 +4547,6 @@ static void llm_load_vocab(
45474547
vocab.special_cls_id = -1;
45484548
vocab.special_mask_id = -1;
45494549

4550-
// For Fill-In-the-Middle (FIM)/infill models which were converted
4551-
// prior to support of FIM special tokens in GGUF, the following
4552-
// will allow those models to continue to work. The general names
4553-
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4554-
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4555-
// new versions of these models have been published.
4556-
std::string gen_name;
4557-
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4558-
4559-
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4560-
[](unsigned char c){ return std::tolower(c); });
4561-
4562-
if (gen_name.find("code") != std::string::npos) {
4563-
if (model.arch == LLM_ARCH_LLAMA && gen_name.find("llama") != std::string::npos) {
4564-
vocab.special_prefix_id = 32007;
4565-
vocab.special_suffix_id = 32008;
4566-
vocab.special_middle_id = 32009;
4567-
vocab.special_eot_id = 32010;
4568-
} else if (model.arch == LLM_ARCH_GEMMA) {
4569-
vocab.special_prefix_id = 67;
4570-
vocab.special_suffix_id = 69;
4571-
vocab.special_middle_id = 68;
4572-
// TODO: this is not EOT, it is "file separator" token, needs fix
4573-
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4574-
//vocab.special_eot_id = 70;
4575-
vocab.special_eot_id = 107;
4576-
}
4577-
}
4578-
45794550
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
45804551
if (add_space_prefix_keyidx != -1) {
45814552
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4747,6 +4718,45 @@ static void llm_load_vocab(
47474718

47484719
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
47494720
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
4721+
// For Fill-In-the-Middle (FIM)/infill models which were converted
4722+
// prior to support of FIM special tokens in GGUF, the following
4723+
// will allow those models to continue to work. The general names
4724+
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4725+
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4726+
// new versions of these models have been published.
4727+
std::string gen_name;
4728+
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4729+
4730+
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4731+
[](unsigned char c){ return std::tolower(c); });
4732+
4733+
if (gen_name.find("code") != std::string::npos) {
4734+
if (model.arch == LLM_ARCH_LLAMA
4735+
&& 32010 < vocab.id_to_token.size()
4736+
&& vocab.id_to_token[32007].text == "<PRE>"
4737+
&& vocab.id_to_token[32008].text == "<SUF>"
4738+
&& vocab.id_to_token[32009].text == "<MID>"
4739+
&& vocab.id_to_token[32010].text == "<EOT>") {
4740+
vocab.special_prefix_id = 32007;
4741+
vocab.special_suffix_id = 32008;
4742+
vocab.special_middle_id = 32009;
4743+
vocab.special_eot_id = 32010;
4744+
} else if (model.arch == LLM_ARCH_GEMMA
4745+
&& 107 < vocab.id_to_token.size()
4746+
&& vocab.id_to_token[67].text == "<|fim_prefix|>"
4747+
&& vocab.id_to_token[69].text == "<|fim_suffix|>"
4748+
&& vocab.id_to_token[68].text == "<|fim_middle|>"
4749+
&& vocab.id_to_token[107].text == "<end_of_turn>") {
4750+
vocab.special_prefix_id = 67;
4751+
vocab.special_suffix_id = 69;
4752+
vocab.special_middle_id = 68;
4753+
// TODO: this is not EOT, it is "file separator" token, needs fix
4754+
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4755+
//vocab.special_eot_id = 70;
4756+
vocab.special_eot_id = 107;
4757+
}
4758+
}
4759+
47504760
try {
47514761
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
47524762
} catch (const std::exception & e) {

0 commit comments

Comments
 (0)