@@ -16676,7 +16676,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K ;
+        else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K ;
         else new_type = GGML_TYPE_IQ4_XS;
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
@@ -16712,7 +16712,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 2 ) new_type = GGML_TYPE_Q5_K ;
+        else if (qs.model.hparams.n_gqa() >= 8 ) new_type = GGML_TYPE_Q4_K ;
         else new_type = GGML_TYPE_IQ4_XS;
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
@@ -16761,7 +16761,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 2 ) new_type = GGML_TYPE_Q5_K ;
+        else if (qs.model.hparams.n_gqa() >= 8 ) new_type = GGML_TYPE_Q4_K ;
         else new_type = GGML_TYPE_IQ4_XS;
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
@@ -16798,7 +16798,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 2 ) new_type = GGML_TYPE_Q5_K ;
+        else if (qs.model.hparams.n_gqa() >= 8 ) new_type = GGML_TYPE_Q4_K ;
         else new_type = GGML_TYPE_IQ4_XS;
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
@@ -16863,8 +16863,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K ;
-        else new_type = GGML_TYPE_Q4_K ;
+        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K ;
+        else new_type = GGML_TYPE_IQ4_XS ;
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
@@ -16904,8 +16904,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 2 ) new_type = GGML_TYPE_Q5_K ;
-        else new_type = GGML_TYPE_Q5_K ;
+        else if (qs.model.hparams.n_gqa() >= 8 ) new_type = GGML_TYPE_Q4_K ;
+        else new_type = GGML_TYPE_IQ4_XS ;
         ++qs.i_attention_wv;
     }
     else if (name.find("attn_output.weight") != std::string::npos) {
@@ -16949,8 +16949,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K ;
-        else new_type = GGML_TYPE_Q4_K ;
+        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K ;
+        else new_type = GGML_TYPE_IQ4_XS ;
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
@@ -16999,8 +16999,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K ;
-        else new_type = GGML_TYPE_Q4_K ;
+        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K ;
+        else new_type = GGML_TYPE_IQ4_XS ;
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
@@ -17049,8 +17049,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K;
-        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K ;
-        else new_type = GGML_TYPE_Q4_K ;
+        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K ;
+        else new_type = GGML_TYPE_IQ4_XS ;
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;