Skip to content

Commit 392ded3

Browse files
committed
IQ4_S
1 parent 9b63d76 commit 392ded3

File tree

5 files changed

+18
-0
lines changed

5 files changed

+18
-0
lines changed

examples/quantize/quantize.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
4141
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
4242
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
4343
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
44+
{ "IQ4_S", LLAMA_FTYPE_MOSTLY_IQ4_S, " 4.75 bpw non-linear quant mix", },
4445
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
4546
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
4647
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },

gguf-py/gguf/constants.py

+2
Original file line numberDiff line numberDiff line change
@@ -1815,6 +1815,8 @@ class LlamaFileType(IntEnum):
18151815
MOSTLY_TQ1_0 = 36 # except 1d tensors
18161816
MOSTLY_TQ2_0 = 37 # except 1d tensors
18171817

1818+
MOSTLY_IQ4_S = 630 # except 1d tensors
1819+
18181820
GUESSED = 1024 # not specified in the model file
18191821

18201822

include/llama.h

+2
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ extern "C" {
182182
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
183183
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
184184

185+
LLAMA_FTYPE_MOSTLY_IQ4_S = 630, // except 1d tensors
186+
185187
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
186188
};
187189

src/llama-model-loader.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
5757
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
5858
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
5959
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
60+
case LLAMA_FTYPE_MOSTLY_IQ4_S: return "IQ4_S - 4.75 bpw";
6061
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
6162
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
6263

src/llama-quant.cpp

+12
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
257257
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 2) {
258258
new_type = GGML_TYPE_Q6_K;
259259
}
260+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_S && qs.model.hparams.n_gqa() >= 2) {
261+
new_type = GGML_TYPE_Q6_K;
262+
}
260263
// else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
261264
// use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
262265
// else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -292,6 +295,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
292295
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL)
293296
&& qs.model.hparams.n_gqa() >= 2)
294297
new_type = GGML_TYPE_Q5_K;
298+
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_S)
299+
&& qs.model.hparams.n_gqa() >= 2)
300+
new_type = GGML_TYPE_Q5_K;
295301
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K)
296302
&& qs.model.hparams.n_gqa() >= 2)
297303
new_type = GGML_TYPE_Q3_K;
@@ -352,6 +358,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
352358
// new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
353359
new_type = GGML_TYPE_Q4_K;
354360
}
361+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_S) {
362+
// new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
363+
new_type = GGML_TYPE_Q5_K;
364+
}
355365
// else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
356366
// if (arch == LLM_ARCH_FALCON) {
357367
// new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
@@ -398,6 +408,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
398408
} else {
399409
// if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
400410
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
411+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_S) new_type = GGML_TYPE_Q5_K;
401412
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
402413
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
403414
}
@@ -571,6 +582,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
571582
case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
572583
case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
573584
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
585+
case LLAMA_FTYPE_MOSTLY_IQ4_S: default_type = GGML_TYPE_IQ4_XS; break;
574586
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
575587
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
576588

0 commit comments

Comments
 (0)