
Commit 9999720

Merge pull request #117 from ggerganov/master (b2849)

2 parents: d11afd6 + 3292733

34 files changed: +775 −911 lines

common/common.cpp

+9-5
@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1367,14 +1371,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
     }
 
     if (params.prompt_cache_all &&
@@ -1422,6 +1424,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -h, --help show this help message and exit\n");
     printf(" --version show version and build info\n");
     printf(" -i, --interactive run in interactive mode\n");
+    printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf(" --interactive-first run in interactive mode and wait for input right away\n");
     printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -2652,6 +2655,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

common/common.h

+1
@@ -140,6 +140,7 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
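
Note: a hypothetical invocation with the new flag (the model path here is only an example) would be `./main -m models/7B/ggml-model-q4_0.gguf -i --interactive-specials`, which lets special tokens typed as interactive input (e.g. `<|user|>`) be tokenized as special tokens rather than as plain text.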

common/grammar-parser.cpp

+9
@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {
 
                 out_elements.push_back({type, char_pair.first});
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
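
These three checks guard the literal-string and character-range loops against grammars that end in the middle of a token. A minimal sketch of the case they close, assuming the `grammar-parser.h` header from this repo (at this point `grammar_parser::parse()` catches internal exceptions, logs the message, and returns an empty `parse_state`):

```cpp
// Sketch only: hand the parser a grammar whose quoted literal is never closed.
// Previously the while (*pos != '"') loop could scan past the terminating NUL;
// with the added checks parsing aborts with "unexpected end of input".
#include "grammar-parser.h"

#include <cstdio>

int main() {
    const char * truncated = "root ::= \"hello";   // unterminated string literal

    grammar_parser::parse_state state = grammar_parser::parse(truncated);
    if (state.rules.empty()) {
        fprintf(stderr, "grammar did not parse\n");
    }
    return 0;
}
```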

common/sampling.cpp

+3-3
@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
-    result->n_considered = 0;
+    result->n_valid = 0;
 
     llama_sampling_set_rng_seed(result, params.seed);
 
@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
-    ctx->n_considered = 0;
+    ctx->n_valid = 0;
 }
 
 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -256,7 +256,7 @@ static llama_token llama_sampling_sample_impl(
         }
     }
 
-    ctx_sampling->n_considered = cur_p.size;
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
 
     return id;
 }

common/sampling.h

+1-1
@@ -81,7 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
    std::vector<llama_token> prev;
    std::vector<llama_token_data> cur;
-    size_t n_considered;
+    size_t n_valid; // Number of correct top tokens with correct probabilities.
 
     std::mt19937 rng;
 };
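
After the rename, `n_valid` tells a caller how many entries at the front of `cur` carry usable probabilities for the token that was just sampled, and it is forced to 0 in the greedy case (`temp == 0.0f`). A minimal sketch of reading it, assuming the `sampling.h` header from this repo and contexts initialized elsewhere:

```cpp
// Sketch only: ctx_sampling and ctx_main are assumed to come from
// llama_sampling_init() and llama_new_context_with_model() respectively.
#include "sampling.h"

#include <cstdio>

static void sample_and_report(llama_sampling_context * ctx_sampling, llama_context * ctx_main) {
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_main, nullptr);
    printf("sampled token: %d\n", id);

    // n_valid is 0 when temp == 0.0f, so the loop is skipped for greedy sampling
    for (size_t i = 0; i < ctx_sampling->n_valid; ++i) {
        const llama_token_data & cand = ctx_sampling->cur[i];
        printf("  candidate %d: p = %.4f\n", cand.id, cand.p);
    }
}
```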

convert-hf-to-gguf-update.py

+25-2
@@ -74,6 +74,9 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
     {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
     {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+    {"name": "jina-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
+    {"name": "jina-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
+    {"name": "jina-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
@@ -142,8 +145,17 @@ def download_file_with_auth(url, token, save_path):
     if tokt == TOKENIZER_TYPE.SPM:
         continue
 
+    # Skip if the tokenizer folder does not exist or there are other download issues previously
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+        continue
+
     # create the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    except OSError as e:
+        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+        continue  # Skip to the next model if the tokenizer can't be loaded
 
     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -161,6 +173,8 @@ def download_file_with_auth(url, token, save_path):
     logger.info("normalizer: " + json.dumps(normalizer, indent=4))
     pre_tokenizer = cfg["pre_tokenizer"]
     logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+    if "ignore_merges" in cfg["model"]:
+        logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
 
     logger.info("")
 
@@ -282,8 +296,17 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     name = model["name"]
     tokt = model["tokt"]
 
+    # Skip if the tokenizer folder does not exist or there are other download issues previously
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+        continue
+
     # create the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    except OSError as e:
+        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
+        continue  # Skip this model and continue with the next one in the loop
 
     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
         for text in tests:
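
(For context: this script encodes a fixed check string with each downloaded tokenizer and hashes the resulting token ids into `chkhsh`; `get_vocab_base_pre()` in convert-hf-to-gguf.py, updated in the next file, matches that hash against its known values to pick the pre-tokenizer name for the new `jina-*` entries.)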

convert-hf-to-gguf.py

+59-1
@@ -404,8 +404,17 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-instruct
+            # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
+            res = "jina-en"
+        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
+            res = "jina-es"
+        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
+            res = "jina-de"
 
         if res is None:
             logger.warning("\n")
@@ -1013,6 +1022,18 @@ def set_gguf_parameters(self):
 class RefactModel(Model):
     model_arch = gguf.MODEL_ARCH.REFACT
 
+    def set_vocab(self):
+        super().set_vocab()
+
+        # TODO: how to determine special FIM tokens automatically?
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+        special_vocab._set_special_token("prefix", 1)
+        special_vocab._set_special_token("suffix", 3)
+        special_vocab._set_special_token("middle", 2)
+        special_vocab._set_special_token("fsep", 4) # is this correct?
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def set_gguf_parameters(self):
         hidden_dim = self.hparams["n_embd"]
         inner_dim = 4 * hidden_dim
@@ -2277,6 +2298,43 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("JinaBertModel", "JinaBertForMaskedLM")
+class JinaBertV2Model(BertModel):
+    model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.intermediate_size = self.hparams["intermediate_size"]
+
+    def get_tensors(self):
+        for name, data in super().get_tensors():
+            if 'gated_layers' in name:
+                d1 = data[:self.intermediate_size, :]
+                name1 = name.replace('gated_layers', 'gated_layers_w')
+                d2 = data[self.intermediate_size:, :]
+                name2 = name.replace('gated_layers', 'gated_layers_v')
+                yield name1, d1
+                yield name2, d2
+                continue
+
+            yield name, data
+
+    def set_vocab(self, *args, **kwargs):
+        tokenizer_class = 'BertTokenizer'
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_class = json.load(f)['tokenizer_class']
+
+        if tokenizer_class == 'BertTokenizer':
+            super().set_vocab()
+        elif tokenizer_class == 'RobertaTokenizer':
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_token_type_count(2)
+        else:
+            raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+
 ###### CONVERSION LOGIC ######
 
 
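
A note on the tensor split above: `gated_layers` is a single fused projection whose first `intermediate_size` rows are emitted as `gated_layers_w` and whose remaining rows become `gated_layers_v`; this appears to correspond to the gate/value pair of the model's gated feed-forward block, which the GGUF graph expects as two separate tensors.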

examples/embedding/embedding.cpp

+10-2
@@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + batch.seq_id[i][0] * n_embd;
+        //TODO: I would also add a parameter here to enable normalization or not.
+        /*fprintf(stdout, "unnormalized_embedding:");
+        for (int hh = 0; hh < n_embd; hh++) {
+            fprintf(stdout, "%9.6f ", embd[hh]);
+        }
+        fprintf(stdout, "\n");*/
         llama_embd_normalize(embd, out, n_embd);
     }
 }
@@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // add SEP if not present
+    // check if the last token is SEP
+    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            inp.push_back(llama_token_sep(model));
+            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
 
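
The commented-out block above would dump the raw embedding before `llama_embd_normalize`, which rescales the vector to unit L2 norm (the TODO proposes making that optional). A minimal sketch of what the helper does, assuming the three-argument declaration from common.h that is used above:

```cpp
// Sketch only: normalize a toy 4-component "embedding" to unit length.
#include "common.h"

#include <cstdio>

int main() {
    const float raw[4] = { 3.0f, 0.0f, 4.0f, 0.0f };
    float       out[4];

    llama_embd_normalize(raw, out, 4);   // divides by sqrt(3*3 + 4*4) = 5

    for (int i = 0; i < 4; i++) {
        printf("%9.6f ", out[i]);        // 0.600000 0.000000 0.800000 0.000000
    }
    printf("\n");
    return 0;
}
```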

examples/llama-bench/README.md

+12-6
@@ -26,27 +26,33 @@ options:
   -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
   -p, --n-prompt <n> (default: 512)
   -n, --n-gen <n> (default: 128)
-  -b, --batch-size <n> (default: 512)
-  -ctk <t>, --cache-type-k <t> (default: f16)
-  -ctv <t>, --cache-type-v <t> (default: f16)
-  -t, --threads <n> (default: 112)
+  -pg <pp,tg> (default: 512,128)
+  -b, --batch-size <n> (default: 2048)
+  -ub, --ubatch-size <n> (default: 512)
+  -ctk, --cache-type-k <t> (default: f16)
+  -ctv, --cache-type-v <t> (default: f16)
+  -t, --threads <n> (default: 16)
   -ngl, --n-gpu-layers <n> (default: 99)
   -sm, --split-mode <none|layer|row> (default: layer)
   -mg, --main-gpu <i> (default: 0)
   -nkvo, --no-kv-offload <0|1> (default: 0)
+  -fa, --flash-attn <0|1> (default: 0)
   -mmp, --mmap <0|1> (default: 1)
-  -ts, --tensor_split <ts0/ts1/..> (default: 0)
+  --numa <distribute|isolate|numactl> (default: disabled)
+  -embd, --embeddings <0|1> (default: 0)
+  -ts, --tensor-split <ts0/ts1/..> (default: 0)
   -r, --repetitions <n> (default: 5)
   -o, --output <csv|json|md|sql> (default: md)
   -v, --verbose (default: 0)
 
 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```
 
-llama-bench can perform two types of tests:
+llama-bench can perform three types of tests:
 
 - Prompt processing (pp): processing a prompt in batches (`-p`)
 - Text generation (tg): generating a sequence of tokens (`-n`)
+- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
 
 With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
 
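
As an illustration of the new `-pg` mode (model path and sizes are only examples), `./llama-bench -m models/7B/ggml-model-q4_0.gguf -pg 512,128` benchmarks processing a 512-token prompt followed by generating 128 tokens; like most options here it can be given more than once to cover several prompt/generation combinations.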
