@@ -2342,13 +2342,6 @@ def set_vocab(self):
        special_vocab.add_to_gguf(self.gguf_writer)

-    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
    def set_gguf_parameters(self):
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
@@ -2367,26 +2360,22 @@ def set_gguf_parameters(self):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        num_heads = self.hparams["num_attention_heads"]
        num_kv_heads = self.hparams["num_key_value_heads"]
-        hidden_size = self.hparams["hidden_size"]
+        n_embd = self.hparams["hidden_size"]
        q_per_kv = num_heads // num_kv_heads
-        head_dim = hidden_size // num_heads
+        head_dim = n_embd // num_heads
        num_groups = num_heads // q_per_kv

-        qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
-
-        if re.match(qkv_pattern, name):
-            bid = re.findall(qkv_pattern, name)[0]
+        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
            qkv = data_torch
-            # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
-            qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
-            q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
+
+            qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
+            q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
+
            # The model weights of q and k require additional reshape.
-            # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
-            q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
-            # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
-            k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
-            # v = rearrange(v, " o g n i -> o (g n i)").T
-            v = v.reshape((v.shape[0], -1)).T
+            q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
+            k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
+            v = v.reshape((-1, v.shape[-1]))
+
            return [
                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
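For reference, a minimal standalone sketch of the new wqkv split above, using made-up toy dimensions (num_heads=8, num_kv_heads=2, head_dim=16) and a local permute helper copied from the _hf_permute_qk method removed above (presumably equivalent to LlamaModel.permute); the wqkv tensor here is random data, not a real checkpoint:

    import torch

    # Toy dimensions for illustration only; real values come from the model's hparams.
    num_heads, num_kv_heads, head_dim = 8, 2, 16
    n_embd = num_heads * head_dim            # 128
    q_per_kv = num_heads // num_kv_heads     # 4
    num_groups = num_heads // q_per_kv       # 2 (== num_kv_heads)

    def permute(weights, n_head, n_head_kv):
        # Same reshape/swapaxes as the removed _hf_permute_qk: reorders the rows of
        # each head from the HF rotary layout to the layout llama.cpp expects.
        if n_head_kv is not None and n_head != n_head_kv:
            n_head = n_head_kv
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))

    # Packed wqkv weight: (num_groups * (q_per_kv + 2) * head_dim, n_embd) = (192, 128)
    wqkv = torch.randn(num_groups * (q_per_kv + 2) * head_dim, n_embd)

    qkv = wqkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
    q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1]

    q = permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)     # -> (128, 128)
    k = permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)  # -> (32, 128)
    v = v.reshape((-1, v.shape[-1]))                                    # -> (32, 128)

    print(q.shape, k.shape, v.shape)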
@@ -3695,6 +3684,7 @@ def main() -> None:
                                     small_first_shard=args.no_tensor_first_split)

        logger.info("Set model parameters")
+        model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL)
        model_instance.set_gguf_parameters()

        logger.info("Set model tokenizer")