
Commit 4fb1af9

Update llama-quant.cpp

1 parent 784a67e

1 file changed: src/llama-quant.cpp (+44, −13 lines)
@@ -163,6 +163,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
              ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
+                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = !qs.has_output ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+        }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
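
Note on the added branch: in upstream llama.cpp, qs.has_output is set while pre-scanning the model for a separate output.weight tensor; when none exists the token embedding is tied to the output projection, so it earns the stronger Q6_K. A minimal sketch of just that decision, with the quantizer state stubbed out (names here are illustrative, not the real API):

    // Sketch only: the tied-embedding bump in isolation. The real
    // qs.has_output flag comes from llama.cpp's pre-scan for an
    // output.weight tensor; here it is just a parameter.
    #include <cassert>

    enum sketch_type { Q5_K, Q6_K }; // stand-ins for GGML_TYPE_Q5_K / GGML_TYPE_Q6_K

    static sketch_type pick_embd_type(bool has_output) {
        // No separate output head: token_embd doubles as the output
        // projection, so spend more bits on it.
        return !has_output ? Q6_K : Q5_K;
    }

    int main() {
        assert(pick_embd_type(/*has_output=*/false) == Q6_K); // tied embeddings
        assert(pick_embd_type(/*has_output=*/true)  == Q5_K); // separate output.weight
        return 0;
    }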
@@ -236,7 +240,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 2) {
             new_type = GGML_TYPE_Q5_K;
@@ -256,14 +260,15 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         // else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
         //          use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         // else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M ||
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_Q3_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K) new_type = GGML_TYPE_Q6_K;
         }
         if (qs.model.hparams.n_expert >= 2) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
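
The 70B comment above makes a size argument that is easy to sanity-check. A back-of-envelope sketch (Llama-2-70B shapes and approximate ggml bit rates assumed; not part of the commit):

    // Rough cost of the stronger attn_v bump (Q4_K -> Q6_K) on a 70B model.
    #include <cstdio>

    int main() {
        // With 8 KV heads shared by 64 query heads, attn_v.weight is
        // ~1/8 the size of attn_q.weight in each layer.
        const double n_embd   = 8192;    // Llama-2-70B embedding width
        const double n_layer  = 80;
        const double v_elems  = n_embd * n_embd / 8.0;
        const double bpw_q4k  = 4.5;     // approx. bits/weight for Q4_K
        const double bpw_q6k  = 6.5625;  // approx. bits/weight for Q6_K

        const double extra_bits = (bpw_q6k - bpw_q4k) * v_elems * n_layer;
        printf("extra size: ~%.0f MiB over the whole model\n",
               extra_bits / 8.0 / (1024.0 * 1024.0)); // ~165 MiB on a ~40 GiB model
    }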
@@ -277,15 +282,22 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q6_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M
+                 && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_IQ4_XS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_IQ3_S;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL)
             && qs.model.hparams.n_gqa() >= 2)
             new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K)
+            && qs.model.hparams.n_gqa() >= 2)
+            new_type = GGML_TYPE_Q3_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)
+            && qs.model.hparams.n_gqa() >= 2)
+            new_type = GGML_TYPE_Q4_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M)
             && qs.model.hparams.n_gqa() >= 2)
             new_type = GGML_TYPE_Q5_K;
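
Most of these new attn_v rules hinge on qs.model.hparams.n_gqa() >= 2; upstream defines n_gqa() as n_head / n_head_kv, so the check reads "the model uses grouped-query attention". (Worth noting: in the first added branch, && binds tighter than ||, so as written the GQA gate applies only to the IQ3_M term.) A sketch of the test, with illustrative head counts:

    // Sketch of the GQA check used throughout this section (upstream's
    // n_gqa() is n_head / n_head_kv; values below are illustrative).
    #include <cstdio>

    struct hparams_sketch {
        int n_head, n_head_kv;
        int n_gqa() const { return n_head / n_head_kv; }
    };

    int main() {
        hparams_sketch mha {32, 32}; // e.g. Llama-2-7B: every head has its own K/V
        hparams_sketch gqa {64, 8};  // e.g. Llama-2-70B: 8 query heads per K/V head
        printf("MHA n_gqa = %d -> rules above do not fire\n", mha.n_gqa());
        printf("GQA n_gqa = %d -> attn_v gets the bumped types\n", gqa.n_gqa());
    }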
@@ -296,11 +308,27 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS && qs.has_imatrix) {
+            new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S)
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && qs.has_imatrix) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
             new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+            new_type = GGML_TYPE_Q4_0;
+        }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
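
This hunk (the branch just before ffn_down, i.e. attn_q in upstream llama.cpp) moves in the opposite direction from attn_v: several ftypes step down one tier, with two downgrades gated on qs.has_imatrix, which upstream sets when an importance matrix is supplied and lower bit rates stay accurate. A condensed, hypothetical view of the new mapping (the real code is the else-if chain above, mutating new_type in place):

    // Condensed map of the attn_q overrides in this hunk (sketch only).
    #include <cstdio>
    #include <cstring>

    // Returns the override for a base ftype, or nullptr for "no change".
    static const char * attn_q_override(const char * ftype, bool has_imatrix) {
        if (!strcmp(ftype, "IQ3_XS"))                return "IQ3_XXS";
        if (!strcmp(ftype, "IQ4_XS") && has_imatrix) return "IQ3_S";
        if (!strcmp(ftype, "IQ3_XXS"))               return "IQ2_S";
        if (!strcmp(ftype, "Q3_K_S") && has_imatrix) return "Q2_K";
        if (!strcmp(ftype, "Q4_K_S"))                return "Q3_K";
        if (!strcmp(ftype, "Q5_K_S"))                return "Q4_K";
        if (!strcmp(ftype, "Q6_K"))                  return "Q5_K";
        if (!strcmp(ftype, "Q5_0"))                  return "Q4_0";
        return nullptr;
    }

    int main() {
        const char * r = attn_q_override("Q4_K_S", /*has_imatrix=*/false);
        printf("Q4_K_S attn_q -> %s\n", r ? r : "unchanged"); // prints Q3_K
    }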
@@ -332,11 +360,13 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         //     if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         //     }
         // }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS)
+                 && !qs.has_imatrix && use_more_bits(i_layer, n_layer)) {
             new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
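
The new ffn_down branches lean on use_more_bits(i_layer, n_layer). In upstream llama.cpp this helper spends extra bits on roughly the first eighth of layers, the last eighth, and every third layer in between; a self-contained sketch of that schedule:

    // Layer schedule behind use_more_bits (mirrors upstream's
    // llama_tensor_use_more_bits), printed for a 32-layer model.
    #include <cstdio>

    static bool use_more_bits(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 ||
               (i_layer - n_layers/8) % 3 == 2;
    }

    int main() {
        const int n_layers = 32;
        printf("layers getting extra bits:");
        for (int i = 0; i < n_layers; ++i) {
            if (use_more_bits(i, n_layers)) printf(" %d", i);
        }
        printf("\n"); // 0-3, 28-31, and 6, 9, ..., 27
    }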
@@ -363,9 +393,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_IQ4_XS;
         }
     } else {
+        // if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
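
To exercise the changed rules end to end, the usual workflow should apply: quantize with one of the affected ftypes, e.g. `llama-quantize --imatrix imatrix.dat model-f16.gguf model-iq3_m.gguf IQ3_M` (binary and flag names as in current llama.cpp), then compare file size and perplexity against a build without this commit. Note that the imatrix-gated branches only fire when an importance matrix is supplied.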
