@@ -339,14 +339,27 @@ def prepare_tensors(self):
                     for key in (
                         gguf.MODEL_TENSOR.TOKEN_EMBD,
                         gguf.MODEL_TENSOR.OUTPUT,
+                        gguf.MODEL_TENSOR.ATTN_V,
+                        # gguf.MODEL_TENSOR.ATTN_K,
                     )
                 ):
                     if self.ftype in (
                         gguf.LlamaFileType.MOSTLY_TQ1_0,
                         gguf.LlamaFileType.MOSTLY_TQ2_0,
+                        gguf.LlamaFileType.MOSTLY_Q4_0,
+                        gguf.LlamaFileType.MOSTLY_Q4_1,
+                        gguf.LlamaFileType.MOSTLY_Q5_0,
+                        gguf.LlamaFileType.MOSTLY_Q5_1,
                     ):
-                        # TODO: use Q4_K and Q6_K
-                        data_qtype = gguf.GGMLQuantizationType.F16
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                        # if self.ftype in (
+                        #     gguf.LlamaFileType.MOSTLY_Q2_K,
+                        #     gguf.LlamaFileType.MOSTLY_Q3_K,
+                        #     gguf.LlamaFileType.MOSTLY_Q4_K,
+                        #     gguf.LlamaFileType.MOSTLY_Q5_K,
+                        #     gguf.LlamaFileType.MOSTLY_Q6_K,
+                        # ):
+                        #     data_qtype = gguf.GGMLQuantizationType.Q6_K

                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
@@ -356,15 +369,42 @@ def prepare_tensors(self):
                         data_qtype = gguf.GGMLQuantizationType.F16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_1
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_1
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
                         data_qtype = gguf.GGMLQuantizationType.TQ1_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
                         data_qtype = gguf.GGMLQuantizationType.TQ2_0
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q2_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q2_K
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q3_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q3_K
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q4_K
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q5_K
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q6_K
                     else:
                         raise ValueError(f"Unknown file type: {self.ftype.name}")

+                # if data_qtype in [
+                #     gguf.GGMLQuantizationType.Q4_0, gguf.GGMLQuantizationType.Q4_1,
+                #     gguf.GGMLQuantizationType.Q5_0, gguf.GGMLQuantizationType.Q5_1,
+                # ]:
+                #     logger.warning("**************************************************************************************")
+                #     logger.warning("** quantizing to Q4_0, Q4_1, Q5_0, or Q5_1 is not equivalent to using llama-quantize:")
+                #     logger.warning("** llama-quantize uses Q4_K and Q6_K for the token embeddings, but this code does not")
+                #     logger.warning("**************************************************************************************")
+
                 try:
                     data = gguf.quants.quantize(data, data_qtype)
                 except gguf.QuantError as e:
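For reference, a minimal standalone sketch of the gguf-py quantization call this hunk feeds into, with the same QuantError fallback the script applies (the tensor shape and the F16 fallback choice here are illustrative):

    import numpy as np
    import gguf

    data = np.random.rand(64, 256).astype(np.float32)  # illustrative FP32 tensor
    try:
        data = gguf.quants.quantize(data, gguf.GGMLQuantizationType.Q4_0)
    except gguf.QuantError:
        # raised e.g. when the last dimension is not a multiple of the block size (32 for Q4_0)
        data = gguf.quants.quantize(data, gguf.GGMLQuantizationType.F16)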
@@ -687,6 +727,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
             # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
             res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"

         if res is None:
             logger.warning("\n")
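For context, these chkhsh values are produced earlier in get_vocab_base_pre by hashing the token IDs that the model's tokenizer assigns to a fixed check string, roughly as follows (the exact multi-script check text is defined in the script itself):

    from hashlib import sha256

    chktok = tokenizer.encode(chktxt)  # chktxt: the fixed check string from the script
    chkhsh = sha256(str(chktok).encode()).hexdigest()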
@@ -3373,6 +3416,24 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)


+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COHERE2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        rotary_pct = self.hparams["rotary_pct"]
+        hidden_size = self.hparams["hidden_size"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 @Model.register("OlmoForCausalLM")
 @Model.register("OLMoForCausalLM")
 class OlmoModel(Model):
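To make the Cohere2 rope dimension count above concrete, a worked example with illustrative numbers (not taken from any real config.json):

    rotary_pct = 0.25                              # fraction of each head dimension rotated by RoPE
    hidden_size, num_attention_heads = 4096, 32
    head_dim = hidden_size // num_attention_heads  # 128
    rope_dims = int(rotary_pct * head_dim)         # 32: only 32 of the 128 dims per head get RoPE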
@@ -3831,6 +3892,7 @@ def prepare_tensors(self):


 @Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
 class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

@@ -3852,6 +3914,15 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
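For intuition about the two gating metadata fields recorded above, a hedged sketch of sigmoid-scored top-k routing as described for DeepSeek-V3 (the helper and shapes are illustrative, not llama.cpp code):

    import numpy as np

    def route(logits: np.ndarray, bias: np.ndarray, top_k: int, norm_topk_prob: bool):
        scores = 1.0 / (1.0 + np.exp(-logits))        # scoring_func == "sigmoid"
        chosen = np.argsort(scores + bias)[-top_k:]   # e_score_correction.bias biases selection only
        weights = scores[chosen]                      # unbiased scores are used as weights
        if norm_topk_prob:                            # stored via add_expert_weights_norm
            weights = weights / weights.sum()
        return chosen, weights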
@@ -3864,6 +3935,16 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        block_count = self.hparams["num_hidden_layers"]
+        match = re.match(r"model.layers.(\d+)", name)
+        if match and int(match.group(1)) >= block_count:
+            return []
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
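As a concrete check of the MTP filter above: DeepSeek-V3 sets num_hidden_layers = 61 and stores its Multi-Token Prediction module at layer index 61, so a tensor named like the following (hypothetical name) is dropped:

    import re

    block_count = 61  # DeepSeek-V3 num_hidden_layers
    name = "model.layers.61.enorm.weight"  # hypothetical MTP tensor name
    match = re.match(r"model.layers.(\d+)", name)
    assert match is not None and int(match.group(1)) >= block_count  # -> modify_tensors returns []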
@@ -4423,7 +4504,10 @@ def set_vocab(self):
         special_vocab.merges = merges
         # only add special tokens when they were not already loaded from config.json
         special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # broken for glm4
+        token_id = tokenizer.get_command("<|user|>")  # fix for glm4
+        logger.debug("glm4 <|user|> token id: %s", token_id)  # fix for glm4
+        special_vocab._set_special_token("eot", token_id)  # fix for glm4
         # this one is usually not in config.json anyway
         special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)
@@ -4748,8 +4832,11 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "tq1_0", "tq2_0", "auto"], default="f16",
+        # "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q2_K", "q3_K", "q4_K", "q5_K", "q6_K", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, "
+             "q4_0, q4_1, q5_0, or q5_1 for basic quantized output (not equivalent to llama-quantize output), "
+             "tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4835,7 +4922,16 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+        "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
+        "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
+        "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        # "q2_K": gguf.LlamaFileType.MOSTLY_Q2_K,
+        # "q3_K": gguf.LlamaFileType.MOSTLY_Q3_K,
+        # "q4_K": gguf.LlamaFileType.MOSTLY_Q4_K,
+        # "q5_K": gguf.LlamaFileType.MOSTLY_Q5_K,
+        # "q6_K": gguf.LlamaFileType.MOSTLY_Q6_K,
         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,