@@ -163,6 +163,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
+                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = !qs.has_output ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+        }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
@@ -236,7 +240,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 2) {
             new_type = GGML_TYPE_Q5_K;
@@ -256,14 +260,15 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         // else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
         //     use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         // else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M ||
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_Q3_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K) new_type = GGML_TYPE_Q6_K;
         }
         if (qs.model.hparams.n_expert >= 2) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
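A rough, hedged size check for the 70B comment in the hunk above, which argues the attn_v bump is nearly free because attn_v.weight is 8x narrower than attn_q.weight. The numbers below are my own illustration (Llama-2-70B-like shapes and the usual k-quant bits-per-weight), not part of the commit:

// Back-of-the-envelope sketch only -- assumed shapes: n_embd = 8192, 80 layers,
// 64 query heads sharing 8 KV heads (KV width 1024), Q4_K ~4.5 bpw, Q6_K ~6.5625 bpw.
#include <cstdio>

int main() {
    const double n_layer   = 80;
    const double n_embd    = 8192;
    const double n_embd_kv = 1024;  // 8 KV heads * 128 head dim
    const double attn_v_weights = n_embd * n_embd_kv * n_layer;  // attn_v params across all layers

    const double bpw_q4_k = 4.5, bpw_q6_k = 6.5625;
    const double extra_bytes = attn_v_weights * (bpw_q6_k - bpw_q4_k) / 8.0;

    // Because attn_v has 1024 output columns instead of 8192, upgrading it from
    // Q4_K to Q6_K adds only on the order of 150-200 MiB to a ~40 GB quantized model.
    std::printf("extra size from Q4_K -> Q6_K on attn_v: %.1f MiB\n", extra_bytes / (1024.0 * 1024.0));
}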
@@ -277,15 +282,22 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q6_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M
+                 && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_IQ4_XS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ2_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_IQ3_S;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL)
                  && qs.model.hparams.n_gqa() >= 2)
             new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K)
+                 && qs.model.hparams.n_gqa() >= 2)
+            new_type = GGML_TYPE_Q3_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)
+                 && qs.model.hparams.n_gqa() >= 2)
+            new_type = GGML_TYPE_Q4_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M)
                  && qs.model.hparams.n_gqa() >= 2)
             new_type = GGML_TYPE_Q5_K;
@@ -296,11 +308,27 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS && qs.has_imatrix) {
+            new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S)
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && qs.has_imatrix) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
             new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+            new_type = GGML_TYPE_Q4_0;
+        }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
@@ -332,11 +360,13 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         //         if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         //     }
         // }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS)
+                 && !qs.has_imatrix && use_more_bits(i_layer, n_layer)) {
             new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
@@ -363,9 +393,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_IQ4_XS;
             }
         } else {
+            // if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
             if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
             if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;