gguf-py, convert-hf : store decoder_start_token_id from T5 config.json in model header

sszymczy · sszymczy · commit cd9a96913d70 · 2024-06-19T21:04:06.000+02:00
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
@@ -2822,6 +2822,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
         self.gguf_writer.add_file_type(self.ftype)
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
@@ -48,6 +48,7 @@ class LLM:
         EXPERT_WEIGHTS_SCALE       = "{arch}.expert_weights_scale"
         POOLING_TYPE               = "{arch}.pooling_type"
         LOGIT_SCALE                = "{arch}.logit_scale"
+        DECODER_START_TOKEN_ID     = "{arch}.decoder_start_token_id"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
@@ -397,6 +397,9 @@ def add_expert_feed_forward_length(self, length: int) -> None:
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
 
+    def add_decoder_start_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
+
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)