Skip to content

Commit 7e42358

Browse files
authored
support --spm-infill
1 parent 5471e75 commit 7e42358

File tree

2 files changed

+12
-5
lines changed

2 files changed

+12
-5
lines changed

examples/server/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co
7373
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
7474
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
7575
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache data type for V (default: `f16`, see `-ctk` for options)
76+
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
7677

7778
**If compiled with `LLAMA_SERVER_SSL=ON`**
7879
- `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key

examples/server/server.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2020,6 +2020,7 @@ struct server_context {
20202020
slot.t_start_generation = 0;
20212021

20222022
if (slot.infill) {
2023+
const bool add_bos = llama_should_add_bos_token(model);
20232024
bool suff_rm_leading_spc = true;
20242025
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
20252026
params.input_suffix.erase(0, 1);
@@ -2035,16 +2036,21 @@ struct server_context {
20352036
}
20362037

20372038
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
2038-
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
2039-
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
2040-
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
2039+
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
2040+
2041+
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
2042+
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
2043+
if (add_bos) {
2044+
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
2045+
}
2046+
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
20412047

20422048
const llama_token middle_token = llama_token_middle(model);
20432049
if (middle_token >= 0) {
2044-
prefix_tokens.push_back(middle_token);
2050+
embd_inp.push_back(middle_token);
20452051
}
20462052

2047-
prompt_tokens = prefix_tokens;
2053+
prompt_tokens = embd_inp;
20482054
} else {
20492055
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
20502056
}

0 commit comments

Comments
 (0)