@@ -257,6 +257,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 2) {
             new_type = GGML_TYPE_Q6_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_S && qs.model.hparams.n_gqa() >= 2) {
+            new_type = GGML_TYPE_Q6_K;
+        }
         // else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
         //          use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         // else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -292,6 +295,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL)
                  && qs.model.hparams.n_gqa() >= 2)
             new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_S)
+                 && qs.model.hparams.n_gqa() >= 2)
+            new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K)
                  && qs.model.hparams.n_gqa() >= 2)
             new_type = GGML_TYPE_Q3_K;
@@ -352,6 +358,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             // new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
             new_type = GGML_TYPE_Q4_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_S) {
+            // new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = GGML_TYPE_Q5_K;
+        }
         // else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
         //     if (arch == LLM_ARCH_FALCON) {
         //         new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
@@ -398,6 +408,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         } else {
             // if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_S) new_type = GGML_TYPE_Q5_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         }
@@ -571,6 +582,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         case LLAMA_FTYPE_MOSTLY_IQ1_M:  default_type = GGML_TYPE_IQ1_M;  break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_S:  default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  default_type = GGML_TYPE_IQ3_S;  break;
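
Taken together, the hunks add a new LLAMA_FTYPE_MOSTLY_IQ4_S file type that defaults to IQ4_XS blocks (the new case in llama_model_quantize_impl) and then upgrades a handful of GQA-sensitive tensors to Q5_K or Q6_K, mirroring the existing IQ4_NL/IQ4_XS branches. Below is a minimal standalone sketch of that selection pattern; the QType and Rule enums are stand-ins of mine, not the ggml/llama.cpp API, and which tensor names each rule matches is decided by code outside these hunks.

#include <cstdint>

// Stand-in for the ggml_type values the patch assigns (not the ggml enum).
enum class QType { IQ4_XS, Q5_K, Q6_K };

// Which override rule a tensor falls under. The real code dispatches on the
// tensor name inside llama_tensor_get_type(); those name checks are outside
// the hunks above, so these labels are illustrative only.
enum class Rule { GqaQ6K, GqaQ5K, AlwaysQ5K, None };

// Start from the IQ4_XS default, then upgrade when the condition holds.
QType iq4_s_type(Rule rule, uint32_t n_gqa) {
    switch (rule) {
        case Rule::GqaQ6K:    return n_gqa >= 2 ? QType::Q6_K : QType::IQ4_XS; // first hunk
        case Rule::GqaQ5K:    return n_gqa >= 2 ? QType::Q5_K : QType::IQ4_XS; // second hunk
        case Rule::AlwaysQ5K: return QType::Q5_K;                              // third and fourth hunks
        case Rule::None:      break;
    }
    return QType::IQ4_XS; // the new default_type case: IQ4_S maps to IQ4_XS blocks
}

Note that this diff only covers the type-selection logic; the string-to-ftype name mapping that would let a quantize invocation request IQ4_S is not part of these hunks and would have to be added elsewhere.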