Commit 3a027b8

Revamp IQ4_XSR, remove IQ3_XXXL

1 parent e05da54
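Summary of the change, as read from the diff below: the IQ3_XXXL file type (enum value 45) is removed everywhere it appears, with IQ3_XL and IQ3_XXL re-centered at 3.90 and 4.10 bpw to cover its range. IQ4_XSR is revamped: its attention tensor rules move from flat Q5_K (or IQ4_XS) picks to Q6_K/Q5_K and IQ4_XS/IQ3_S mixes gated by the difquant_* layer selectors, with explicit non-GQA fallbacks, and it gains its own ffn_down, ffn_gate, and ffn_up rules. Several IQ2/IQ3 ffn_down mixes also gain GQA/MoE-aware branches.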

File tree (4 files changed, +63 −44 lines):

    examples/quantize/quantize.cpp
    gguf-py/gguf/constants.py
    include/llama.h
    src/llama.cpp


Diff for: examples/quantize/quantize.cpp (+2 −3)

@@ -35,9 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization", },
     { "IQ3_S",    LLAMA_FTYPE_MOSTLY_IQ3_S,    " 3.44 bpw quantization", },
     { "IQ3_M",    LLAMA_FTYPE_MOSTLY_IQ3_M,    " 3.70 bpw quantization mix", },
-    { "IQ3_XL",   LLAMA_FTYPE_MOSTLY_IQ3_XL,   " 3.80 bpw quantization mix", },
-    { "IQ3_XXL",  LLAMA_FTYPE_MOSTLY_IQ3_XXL,  " 3.95 bpw quantization mix", },
-    { "IQ3_XXXL", LLAMA_FTYPE_MOSTLY_IQ3_XXXL, " 4.10 bpw quantization mix", },
+    { "IQ3_XL",   LLAMA_FTYPE_MOSTLY_IQ3_XL,   " 3.90 bpw quantization mix", },
+    { "IQ3_XXL",  LLAMA_FTYPE_MOSTLY_IQ3_XXL,  " 4.10 bpw quantization mix", },
     { "Q3_K",     LLAMA_FTYPE_MOSTLY_Q3_K_M,   "alias for Q3_K_M" },
     { "IQ3_XS",   LLAMA_FTYPE_MOSTLY_IQ3_XS,   " 3.3 bpw quantization", },
     { "Q3_K_S",   LLAMA_FTYPE_MOSTLY_Q3_K_S,   " 3.41G, +1.6321 ppl @ Llama-3-8B", },

Diff for: gguf-py/gguf/constants.py (+0 −1)

@@ -1257,7 +1257,6 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ1_XL   = 42  # except 1d tensors
     MOSTLY_IQ4_XSR  = 43  # except 1d tensors
     MOSTLY_IQ3_XXL  = 44  # except 1d tensors
-    MOSTLY_IQ3_XXXL = 45  # except 1d tensors

     GUESSED = 1024  # not specified in the model file

Diff for: include/llama.h (+0 −1)

@@ -173,7 +173,6 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_IQ1_XL   = 42, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ4_XSR  = 43, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ3_XXL  = 44, // except 1d tensors
-    LLAMA_FTYPE_MOSTLY_IQ3_XXXL = 45, // except 1d tensors
     LLAMA_FTYPE_CQS             = 99, // except 1d tensors

     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
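Since value 45 is deleted outright in both the C header and the Python constants rather than kept as a reserved placeholder, GGUF files already written with general.file_type = 45 no longer map to a known type. A minimal self-contained sketch of the consequence (the demo enum and function are hypothetical stand-ins, not code from this commit):

#include <cstdio>

// Hypothetical subset of the ftype table after this commit.
enum demo_ftype { DEMO_IQ4_XSR = 43, DEMO_IQ3_XXL = 44 };

static const char * demo_ftype_name(int ftype) {
    switch (ftype) {
        case DEMO_IQ4_XSR: return "IQ4_XS mix - 4.xx bpw";
        case DEMO_IQ3_XXL: return "IQ3_S mix - 4.10 bpw";
        // the retired value 45 now falls through to the unknown default
        default:           return "unknown, may not work";
    }
}

int main() {
    std::printf("%s\n", demo_ftype_name(45)); // prints the unknown default
    return 0;
}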

Diff for: src/llama.cpp (+61 −39)
@@ -4528,9 +4528,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.70 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.80 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  return "IQ3_S mix - 3.95 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.10 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.90 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  return "IQ3_S mix - 4.10 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR:  return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
@@ -15981,7 +15980,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
@@ -16038,7 +16037,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q4_K;
         }
@@ -16048,9 +16047,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
-                           difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
+                           difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
@@ -16143,16 +16143,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-            else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
-                           difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
+                           difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wk;
     } else if (name.find("attn_q.weight") != std::string::npos) {
@@ -16170,16 +16166,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ3_S :
-                           difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S;
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
+                           difquant_more_fl_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         ++qs.i_attention_wq;
     } else if (name.find("ffn_down") != std::string::npos) {
@@ -16231,37 +16228,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = difquant_first_last_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = difquant_five_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            new_type = difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
-            new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS :
-                           difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
+                           difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
+            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
@@ -16275,10 +16284,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
             new_type = GGML_TYPE_Q5_K;
         }
@@ -16305,13 +16313,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL)
+            new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS :
-                           difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q6_K :
+                           difquant_more_fl_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
     }
 } else {
@@ -16340,7 +16349,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
+                           difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
         ++qs.i_attention_wv;
     } else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -16434,7 +16449,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
         ++qs.i_ffn_gate;
     } else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
@@ -16528,7 +16547,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
         ++qs.i_ffn_up;
     }

@@ -16683,7 +16706,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     case LLAMA_FTYPE_MOSTLY_IQ3_M:    default_type = GGML_TYPE_IQ3_S; break;
     case LLAMA_FTYPE_MOSTLY_IQ3_XL:   default_type = GGML_TYPE_IQ3_S; break;
     case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  default_type = GGML_TYPE_IQ3_S; break;
-    case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: default_type = GGML_TYPE_IQ3_S; break;
     case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
     case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
     case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
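The revamped IQ4_XSR rules hinge on the difquant_* selectors, which decide what share of a tensor family is promoted to the larger type. Their definitions live elsewhere in this fork's llama.cpp and are not part of this diff; the stand-ins below are assumptions that only preserve the apparent contract (true for roughly the named fraction of [0, n), biased toward the first and last layers). The sketch replays the shape of the commit's new attn_v.weight rule for IQ4_XSR:

#include <cstdio>

// Demo stand-ins for the ggml types picked by the IQ4_XSR attn_v rule.
enum demo_type { DEMO_Q5_K, DEMO_Q6_K };

// Assumed behavior, NOT the fork's real definitions: select roughly the
// named fraction of layers, favoring the first and last ones.
static bool difquant_more_fl_tensors(int i, int n)      { return i < n/8 || i >= n - n/8; } // ~2/8
static bool difquant_three_eights_tensors(int i, int n) { return i < n/8 || i >= n - n/4; } // ~3/8

// Mirrors the + lines of the attn_v.weight hunk above.
static demo_type pick_attn_v(int i_wv, int n_wv, bool gqa_or_moe) {
    if (gqa_or_moe) {
        return i_wv < n_wv/8                        ? DEMO_Q6_K :
               difquant_more_fl_tensors(i_wv, n_wv) ? DEMO_Q6_K : DEMO_Q5_K;
    }
    return difquant_three_eights_tensors(i_wv, n_wv) ? DEMO_Q6_K : DEMO_Q5_K;
}

int main() {
    // Print the resulting per-layer schedule for a 32-layer GQA model.
    for (int i = 0; i < 32; ++i) {
        std::printf("layer %2d: %s\n", i, pick_attn_v(i, 32, true) == DEMO_Q6_K ? "Q6_K" : "Q5_K");
    }
    return 0;
}

Under these stand-ins the first and last eighths of the layers land on Q6_K while the middle stays at Q5_K, which matches the commit's direction of spending the extra bits where attention tensors are most sensitive; the exact split depends on the fork's real selectors.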
