
Commit e08b911

Custom convert for iMatrix
1 parent 1d3a02f commit e08b911

1 file changed: +101 −5 lines

convert_hf_to_gguf.py

+101 −5
@@ -339,14 +339,27 @@ def prepare_tensors(self):
                     for key in (
                         gguf.MODEL_TENSOR.TOKEN_EMBD,
                         gguf.MODEL_TENSOR.OUTPUT,
+                        gguf.MODEL_TENSOR.ATTN_V,
+                        # gguf.MODEL_TENSOR.ATTN_K,
                     )
                 ):
                     if self.ftype in (
                         gguf.LlamaFileType.MOSTLY_TQ1_0,
                         gguf.LlamaFileType.MOSTLY_TQ2_0,
+                        gguf.LlamaFileType.MOSTLY_Q4_0,
+                        gguf.LlamaFileType.MOSTLY_Q4_1,
+                        gguf.LlamaFileType.MOSTLY_Q5_0,
+                        gguf.LlamaFileType.MOSTLY_Q5_1,
                     ):
-                        # TODO: use Q4_K and Q6_K
-                        data_qtype = gguf.GGMLQuantizationType.F16
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                        # if self.ftype in (
+                        #     gguf.LlamaFileType.MOSTLY_Q2_K,
+                        #     gguf.LlamaFileType.MOSTLY_Q3_K,
+                        #     gguf.LlamaFileType.MOSTLY_Q4_K,
+                        #     gguf.LlamaFileType.MOSTLY_Q5_K,
+                        #     gguf.LlamaFileType.MOSTLY_Q6_K,
+                        # ):
+                        #     data_qtype = gguf.GGMLQuantizationType.Q6_K
 
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
@@ -356,15 +369,42 @@ def prepare_tensors(self):
                         data_qtype = gguf.GGMLQuantizationType.F16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_1
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_1
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
                         data_qtype = gguf.GGMLQuantizationType.TQ1_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
                         data_qtype = gguf.GGMLQuantizationType.TQ2_0
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q2_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q2_K
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q3_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q3_K
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q4_K
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q5_K
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_K:
+                    #     data_qtype = gguf.GGMLQuantizationType.Q6_K
                     else:
                         raise ValueError(f"Unknown file type: {self.ftype.name}")
 
+                # if data_qtype in [
+                #     gguf.GGMLQuantizationType.Q4_0, gguf.GGMLQuantizationType.Q4_1,
+                #     gguf.GGMLQuantizationType.Q5_0, gguf.GGMLQuantizationType.Q5_1,
+                # ]:
+                #     logger.warning("**************************************************************************************")
+                #     logger.warning("** quantizing to `Q4_0`, `Q4_1`, `Q5_0`, or `Q5_1` is not equivalent to `llama-quantize`")
+                #     logger.warning("** `llama-quantize` uses `Q4_K` and `Q6_K` for the token embeddings, but this code does not")
+                #     logger.warning("**************************************************************************************")
+
                 try:
                     data = gguf.quants.quantize(data, data_qtype)
                 except gguf.QuantError as e:
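
The final `gguf.quants.quantize(data, data_qtype)` call above is what actually packs the tensor. A minimal standalone sketch of that call, assuming the `gguf` Python package is importable and using a made-up tensor shape (the last dimension must be a multiple of Q8_0's block size of 32):

    import numpy as np
    import gguf

    data = np.random.rand(64, 256).astype(np.float32)                        # toy tensor, not from the converter
    packed = gguf.quants.quantize(data, gguf.GGMLQuantizationType.Q8_0)      # packed Q8_0 blocks as uint8
    restored = gguf.quants.dequantize(packed, gguf.GGMLQuantizationType.Q8_0)
    print(packed.dtype, restored.shape)                                      # float32 data of the original shape comes back
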
@@ -687,6 +727,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
             # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
             res = "megrez"
+        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+            res = "deepseek-v3"
 
         if res is None:
             logger.warning("\n")
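
For context, `chkhsh` in `get_vocab_base_pre()` is a SHA-256 over the token ids the tokenizer produces for a fixed probe string, so a new entry like the DeepSeek-V3 one above is derived roughly as follows (a sketch; `chktxt` stands in for the long probe string defined in that method):

    from hashlib import sha256
    from transformers import AutoTokenizer

    chktxt = "..."  # placeholder for the real multi-script probe text in get_vocab_base_pre()
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V3", trust_remote_code=True)
    chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
    print(chkhsh)  # with the real chktxt this should reproduce the hash matched above
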
@@ -3373,6 +3416,24 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
 
 
+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COHERE2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        rotary_pct = self.hparams["rotary_pct"]
+        hidden_size = self.hparams["hidden_size"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 @Model.register("OlmoForCausalLM")
 @Model.register("OLMoForCausalLM")
 class OlmoModel(Model):
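
The RoPE dimension count written for Cohere2 is the rotary fraction of the per-head dimension. A quick worked example with hypothetical hparams (illustrative values, not taken from any real config.json):

    rotary_pct = 0.5
    hidden_size = 4096
    num_attention_heads = 32

    head_dim = hidden_size // num_attention_heads   # 4096 // 32 = 128
    rope_dim = int(rotary_pct * head_dim)           # int(0.5 * 128) = 64, the value written to the GGUF header
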
@@ -3831,6 +3892,7 @@ def prepare_tensors(self):
 
 
 @Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
 class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
@@ -3852,6 +3914,15 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
         self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if hparams["scoring_func"] == "sigmoid":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        elif hparams["scoring_func"] == "softmax":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
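
The `scoring_func` field records how the MoE router turns expert logits into scores; DeepSeek-V3's published config specifies `sigmoid`, while earlier DeepseekV2 checkpoints use `softmax`. A rough numpy sketch of the difference (hypothetical logits, not the llama.cpp implementation):

    import numpy as np

    logits = np.array([0.2, -1.3, 0.9, 0.4])                    # router logits for 4 experts (made up)
    softmax_scores = np.exp(logits) / np.exp(logits).sum()      # normalized across experts, sums to 1
    sigmoid_scores = 1.0 / (1.0 + np.exp(-logits))              # independent per-expert scores in (0, 1)
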
@@ -3864,6 +3935,16 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        block_count = self.hparams["num_hidden_layers"]
+        match = re.match(r"model.layers.(\d+)", name)
+        if match and int(match.group(1)) >= block_count:
+            return []
+
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
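
The layer-index check above drops the extra Multi-Token Prediction block that sits after the regular decoder layers. A small illustration with a hypothetical tensor name and layer count (values for illustration only):

    import re

    block_count = 61                                  # hypothetical num_hidden_layers
    name = "model.layers.61.enorm.weight"             # hypothetical tensor from the trailing MTP block
    match = re.match(r"model.layers.(\d+)", name)
    if match and int(match.group(1)) >= block_count:
        print("MTP tensor skipped")                   # 61 >= 61, so modify_tensors() returns []
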
@@ -4423,7 +4504,10 @@ def set_vocab(self):
         special_vocab.merges = merges
         # only add special tokens when they were not already loaded from config.json
         special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # not fixed for glm4
+        token_id = tokenizer.get_command("<|user|>")  # fix glm4
+        print(token_id)  # fix glm4
+        special_vocab._set_special_token("eot", token_id)  # fix glm4
         # this one is usually not in config.json anyway
         special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)
@@ -4748,8 +4832,11 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
+        # "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q2_K", "q3_K", "q4_K", "q5_K", "q6_K", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, "
+             "q4_0 for Q4_0, q4_1 for Q4_1, q5_0 for Q5_0, q5_1 for Q5_1 (limited support), "
+             "and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4835,7 +4922,16 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+        "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
+        "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
+        "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        # "q2_K": gguf.LlamaFileType.MOSTLY_Q2_K,
+        # "q3_K": gguf.LlamaFileType.MOSTLY_Q3_K,
+        # "q4_K": gguf.LlamaFileType.MOSTLY_Q4_K,
+        # "q5_K": gguf.LlamaFileType.MOSTLY_Q5_K,
+        # "q6_K": gguf.LlamaFileType.MOSTLY_Q6_K,
         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
