
Commit bbbc6ff

Merge branch 'ggerganov:master' into t5
2 parents b6f7cc6 + 6a2f298 commit bbbc6ff

22 files changed: +21413 −20499 lines

.editorconfig (+1)

@@ -28,4 +28,5 @@ indent_size = 2
 indent_style = tab
 
 [examples/cvector-generator/*.txt]
+trim_trailing_whitespace = unset
 insert_final_newline = unset

.github/workflows/server.yml (+1 −1)

@@ -30,7 +30,7 @@ jobs:
 
     strategy:
      matrix:
-       sanitizer: [ADDRESS, THREAD, UNDEFINED]
+       sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
        build_type: [RelWithDebInfo]
        include:
          - build_type: Release

Makefile (+1 −1)

@@ -1051,7 +1051,7 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

common/common.cpp (+2 −2)

@@ -1989,8 +1989,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "cvector", "       --completions-file FNAME",
                                                             "completions file (default: '%s')", params.cvector_completions_file.c_str() });
     options.push_back({ "cvector", "       --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
-    options.push_back({ "cvector", "       --batch-pca N",   "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
-    options.push_back({ "cvector", "       --iter-pca N",    "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+    options.push_back({ "cvector", "       --pca-batch N",   "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+    options.push_back({ "cvector", "       --pca-iter N",    "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
 
     printf("usage: %s [options]\n", argv[0]);

convert-hf-to-gguf.py (+5 −1)

@@ -967,7 +967,11 @@ def set_vocab(self):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model)
         vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
+        # because vocab_size is the count of items, and indexes start at 0.
+        max_vocab_index = max(tokenizer.get_vocab().values())
+        if max_vocab_index >= vocab_size:
+            raise ValueError("Vocabulary size exceeds expected maximum size.")
 
         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
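The replaced assert and its successor enforce the same zero-based invariant: a vocabulary with vocab_size entries may only use token indices 0 .. vocab_size - 1, so the maximum index must be strictly less than the count. A minimal standalone sketch of that check; the toy_vocab dict here is a hypothetical stand-in for tokenizer.get_vocab(), not taken from the converter:

    # toy_vocab maps token strings to zero-based indices, like tokenizer.get_vocab()
    toy_vocab = {"<s>": 0, "</s>": 1, "hello": 2, "world": 3}
    vocab_size = 4  # count of entries; valid indices are 0..3

    max_vocab_index = max(toy_vocab.values())
    if max_vocab_index >= vocab_size:  # index 4 in a size-4 vocab would be out of range
        raise ValueError("Vocabulary size exceeds expected maximum size.")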

examples/cvector-generator/README.md (+1 −1)

@@ -17,7 +17,7 @@ Related PRs:
 ./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
 
 # With advanced options
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
 
 # To see help message
 ./cvector-generator -h

examples/cvector-generator/cvector-generator.cpp (+3 −3)

@@ -40,7 +40,7 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     printf("\nexample usage:\n");
     printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
     printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
-    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
+    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
     printf("\n");
 }
 
@@ -377,8 +377,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
     // create templated prompts
     std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
     auto format_template = [](std::string persona, std::string suffix) {
-        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
-        return persona + " " + suffix;
+        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
+        return persona + suffix;
     };
     for (size_t i = 0; i < positive_prompts.size(); ++i) {
         for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {

examples/cvector-generator/negative.txt (+1 −1)

@@ -1 +1 @@
-[INST] Act like a person who is extremely sad. [/INST]
+[INST] Act like a person who is extremely sad. [/INST] 

examples/cvector-generator/positive.txt (+1 −1)

@@ -1 +1 @@
-[INST] Act like a person who is extremely happy. [/INST]
+[INST] Act like a person who is extremely happy. [/INST] 

(the only change in each file is the trailing space added after "[/INST]")
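
Taken together with the .editorconfig hunk above, these changes are a single fix: the separator space that format_template used to insert between the persona prompt and the completion suffix now lives at the end of each prompt-file entry, and trim_trailing_whitespace is unset so editors cannot strip it. A minimal Python sketch (illustrative only; the real code is the C++ lambda above, and the suffix value is a hypothetical completions-file line) showing the concatenated prompt is unchanged:

    persona_old = "[INST] Act like a person who is extremely sad. [/INST]"
    persona_new = "[INST] Act like a person who is extremely sad. [/INST] "  # trailing space
    suffix = "I feel"  # hypothetical entry from the completions file

    # the old code inserted the separator itself; the new code concatenates directly
    assert persona_old + " " + suffix == persona_new + suffix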

examples/quantize/quantize.cpp (+23 −23)

@@ -16,41 +16,41 @@ struct quant_option {
 };
 
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0",    LLAMA_FTYPE_MOSTLY_Q4_0,    " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
-    { "Q4_1",    LLAMA_FTYPE_MOSTLY_Q4_1,    " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
-    { "Q5_0",    LLAMA_FTYPE_MOSTLY_Q5_0,    " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
-    { "Q5_1",    LLAMA_FTYPE_MOSTLY_Q5_1,    " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "Q4_0",    LLAMA_FTYPE_MOSTLY_Q4_0,    " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "Q4_1",    LLAMA_FTYPE_MOSTLY_Q4_1,    " 4.78G, +0.4511 ppl @ Llama-3-8B",  },
+    { "Q5_0",    LLAMA_FTYPE_MOSTLY_Q5_0,    " 5.21G, +0.1316 ppl @ Llama-3-8B",  },
+    { "Q5_1",    LLAMA_FTYPE_MOSTLY_Q5_1,    " 5.65G, +0.1062 ppl @ Llama-3-8B",  },
     { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization",            },
     { "IQ2_XS",  LLAMA_FTYPE_MOSTLY_IQ2_XS,  " 2.31 bpw quantization",            },
     { "IQ2_S",   LLAMA_FTYPE_MOSTLY_IQ2_S,   " 2.5  bpw quantization",            },
     { "IQ2_M",   LLAMA_FTYPE_MOSTLY_IQ2_M,   " 2.7  bpw quantization",            },
     { "IQ1_S",   LLAMA_FTYPE_MOSTLY_IQ1_S,   " 1.56 bpw quantization",            },
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization",            },
-    { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
-    { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
+    { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.96G, +3.5199 ppl @ Llama-3-8B",  },
+    { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.96G, +3.1836 ppl @ Llama-3-8B",  },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization",            },
     { "IQ3_S",   LLAMA_FTYPE_MOSTLY_IQ3_S,   " 3.44 bpw quantization",            },
     { "IQ3_M",   LLAMA_FTYPE_MOSTLY_IQ3_M,   " 3.66 bpw quantization mix",        },
-    { "Q3_K",    LLAMA_FTYPE_MOSTLY_Q3_K_M,  "alias for Q3_K_M" },
-    { "IQ3_XS",  LLAMA_FTYPE_MOSTLY_IQ3_XS,  " 3.3 bpw quantization" , },
-    { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_M",  LLAMA_FTYPE_MOSTLY_Q3_K_M,  " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+    { "Q3_K",    LLAMA_FTYPE_MOSTLY_Q3_K_M,  "alias for Q3_K_M" },
+    { "IQ3_XS",  LLAMA_FTYPE_MOSTLY_IQ3_XS,  " 3.3 bpw quantization",             },
+    { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 3.41G, +1.6321 ppl @ Llama-3-8B",  },
+    { "Q3_K_M",  LLAMA_FTYPE_MOSTLY_Q3_K_M,  " 3.74G, +0.6569 ppl @ Llama-3-8B",  },
+    { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 4.03G, +0.5562 ppl @ Llama-3-8B",  },
     { "IQ4_NL",  LLAMA_FTYPE_MOSTLY_IQ4_NL,  " 4.50 bpw non-linear quantization", },
     { "IQ4_XS",  LLAMA_FTYPE_MOSTLY_IQ4_XS,  " 4.25 bpw non-linear quantization", },
-    { "Q4_K",    LLAMA_FTYPE_MOSTLY_Q4_K_M,  "alias for Q4_K_M", },
-    { "Q4_K_S",  LLAMA_FTYPE_MOSTLY_Q4_K_S,  " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
-    { "Q4_K_M",  LLAMA_FTYPE_MOSTLY_Q4_K_M,  " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
-    { "Q5_K",    LLAMA_FTYPE_MOSTLY_Q5_K_M,  "alias for Q5_K_M", },
-    { "Q5_K_S",  LLAMA_FTYPE_MOSTLY_Q5_K_S,  " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
-    { "Q5_K_M",  LLAMA_FTYPE_MOSTLY_Q5_K_M,  " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
-    { "Q6_K",    LLAMA_FTYPE_MOSTLY_Q6_K,    " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
-    { "Q8_0",    LLAMA_FTYPE_MOSTLY_Q8_0,    " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",     LLAMA_FTYPE_MOSTLY_F16,     "14.00G, -0.0020 ppl @ Mistral-7B", },
-    { "BF16",    LLAMA_FTYPE_MOSTLY_BF16,    "14.00G, -0.0050 ppl @ Mistral-7B", },
-    { "F32",     LLAMA_FTYPE_ALL_F32,        "26.00G @ 7B", },
+    { "Q4_K",    LLAMA_FTYPE_MOSTLY_Q4_K_M,  "alias for Q4_K_M",                  },
+    { "Q4_K_S",  LLAMA_FTYPE_MOSTLY_Q4_K_S,  " 4.37G, +0.2689 ppl @ Llama-3-8B",  },
+    { "Q4_K_M",  LLAMA_FTYPE_MOSTLY_Q4_K_M,  " 4.58G, +0.1754 ppl @ Llama-3-8B",  },
+    { "Q5_K",    LLAMA_FTYPE_MOSTLY_Q5_K_M,  "alias for Q5_K_M",                  },
+    { "Q5_K_S",  LLAMA_FTYPE_MOSTLY_Q5_K_S,  " 5.21G, +0.1049 ppl @ Llama-3-8B",  },
+    { "Q5_K_M",  LLAMA_FTYPE_MOSTLY_Q5_K_M,  " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
+    { "Q6_K",    LLAMA_FTYPE_MOSTLY_Q6_K,    " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
+    { "Q8_0",    LLAMA_FTYPE_MOSTLY_Q8_0,    " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
+    { "F16",     LLAMA_FTYPE_MOSTLY_F16,     "14.00G, +0.0020 ppl @ Mistral-7B",  },
+    { "BF16",    LLAMA_FTYPE_MOSTLY_BF16,    "14.00G, -0.0050 ppl @ Mistral-7B",  },
+    { "F32",     LLAMA_FTYPE_ALL_F32,        "26.00G @ 7B",                       },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
-    { "COPY",    LLAMA_FTYPE_ALL_F32,        "only copy tensors, no quantizing", },
+    { "COPY",    LLAMA_FTYPE_ALL_F32,        "only copy tensors, no quantizing",  },
 };
 
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
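
The third column pairs the model file size with the perplexity increase over the FP16 baseline, re-measured here on Llama-3-8B instead of LLaMA-v1-7B. One way to read the table is as a size/quality trade-off; a hypothetical Python helper (values copied from the Llama-3-8B entries above; not part of the tool) that picks the smallest type within a perplexity budget:

    # (name, size as printed in G, ppl increase vs FP16) copied from QUANT_OPTIONS above
    options = [
        ("Q2_K",   2.96, 3.5199),
        ("Q3_K_M", 3.74, 0.6569),
        ("Q4_K_M", 4.58, 0.1754),
        ("Q5_K_M", 5.33, 0.0569),
        ("Q6_K",   6.14, 0.0217),
        ("Q8_0",   7.96, 0.0026),
    ]

    def smallest_under_budget(max_ppl_increase: float):
        # keep only the types whose quality loss fits the budget, then take the smallest file
        fitting = [(name, size) for name, size, dppl in options if dppl <= max_ppl_increase]
        return min(fitting, key=lambda t: t[1]) if fitting else None

    print(smallest_under_budget(0.2))  # -> ('Q4_K_M', 4.58)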

examples/server/public/index-new.html (+2 −2)

@@ -634,12 +634,12 @@
           <div>
             <div class="grammar">
               <label for="template"></label>
-              <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+              <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
             </div>
             <div class="grammar-columns">
               <div class="json-schema-controls">
                 <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-                <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
+                <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
               </div>
             </div>
           </div>
