Skip to content

Commit f675b20

Browse files
kustaaya and kustaaya authored
Added support for Viking pre-tokenizer (ggml-org#8135)
Co-authored-by: kustaaya <kustaaya@protonmail.com>
1 parent 911e35b commit f675b20

File tree

4 files changed: +14 -0 lines changed

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,7 @@ class TOKENIZER_TYPE(IntEnum):
8585
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
8686
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
8787
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
88+
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
8889
]
8990

9091

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -487,6 +487,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
487487
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
488488
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
489489
res = "jina-v2-code"
490+
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
491+
# ref: https://huggingface.co/LumiOpen/Viking-7B
492+
res = "viking"
490493

491494
if res is None:
492495
logger.warning("\n")

include/llama.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@ extern "C" {
8888
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
8989
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
9090
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
91+
LLAMA_VOCAB_PRE_TYPE_VIKING = 16,
9192
};
9293

9394
// note: these values should be synchronized with ggml_rope

src/llama.cpp

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5067,6 +5067,9 @@ static void llm_load_vocab(
50675067
} else if (
50685068
tokenizer_pre == "poro-chat") {
50695069
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
5070+
} else if (
5071+
tokenizer_pre == "viking") {
5072+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
50705073
} else {
50715074
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
50725075
}
@@ -13703,6 +13706,12 @@ struct llm_tokenizer_bpe {
1370313706
" ?[^(\\s|.,!?…。,、।۔،)]+",
1370413707
};
1370513708
break;
13709+
case LLAMA_VOCAB_PRE_TYPE_VIKING:
13710+
regex_exprs = {
13711+
"\\p{N}",
13712+
" ?[^(\\s|.,!?…。,、।۔،)]+",
13713+
};
13714+
break;
1370613715
default:
1370713716
// default regex for BPE tokenization pre-processing
1370813717
regex_exprs = {

0 commit comments

Comments
 (0)