4 files changed (+527, -79 lines)
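The first hunk adds per-tensor quantization type overrides, alongside the existing `output_tensor_type` and `token_embedding_type` fields. The hunk header only shows the enclosing `extern "C"` block; in mainline llama.cpp these fields live in `struct llama_model_quantize_params` in `llama.h`, which is presumably the struct being extended here.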
```diff
@@ -426,6 +426,14 @@ extern "C" {
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
         enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type attn_q_type;          // attention query tensor type
+        enum ggml_type attn_k_type;          // attention key tensor type
+        enum ggml_type attn_v_type;          // attention value tensor type
+        enum ggml_type attn_qkv_type;        // attention query-key-value tensor type
+        enum ggml_type attn_output_type;     // attention output tensor type
+        enum ggml_type ffn_gate_type;        // feedforward network gate type
+        enum ggml_type ffn_down_type;        // feedforward network down type
+        enum ggml_type ffn_up_type;          // feedforward network up type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
```
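The diff itself shows only the struct change, so here is a minimal sketch of how the new fields could be driven from client code. It assumes the public `llama_model_quantize_default_params()` / `llama_model_quantize()` API from `llama.h`, and that the new fields follow the convention of the existing overrides, where `GGML_TYPE_COUNT` acts as the "unset, fall back to the ftype recipe" sentinel; neither assumption is confirmed by this diff.

```cpp
#include "llama.h" // also pulls in ggml.h for the GGML_TYPE_* enums

int main() {
    // Start from the library defaults. The defaults are assumed to leave every
    // per-tensor override unset (GGML_TYPE_COUNT), matching the behavior of the
    // pre-existing output_tensor_type / token_embedding_type fields.
    llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype         = LLAMA_FTYPE_MOSTLY_IQ3_KS; // base recipe for all tensors
    params.attn_v_type   = GGML_TYPE_Q6_K;            // but keep attention V weights larger
    params.ffn_down_type = GGML_TYPE_Q5_K;            // and the FFN down projection as well

    // Returns 0 on success; tensors without an override follow the ftype recipe.
    return llama_model_quantize("model-f16.gguf", "model-iq3_ks.gguf", &params);
}
```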
The second hunk fixes a copy-paste typo in the human-readable ftype name table: the `LLAMA_FTYPE_MOSTLY_IQ3_KS` case previously printed `IQ3_K` instead of `IQ3_KS`.

```diff
@@ -68,7 +68,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ2_KT: return "IQ2_KT - 2.125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_K:  return "IQ2_K - 2.375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_K:  return "IQ3_K - 3.4325 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_KS: return "IQ3_K - 3.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_KS: return "IQ3_KS - 3.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_KL: return "IQ3_KL - 4 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_KT: return "IQ3_KT - 3.125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_KT: return "IQ4_KT - 4.0 bpw";
```