@@ -310,6 +310,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -388,21 +389,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,      "general.source.url" },
     { LLM_KV_GENERAL_SOURCE_HF_REPO,  "general.source.huggingface.repository" },
 
-    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size" },
-    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length" },
-    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length" },
-    { LLM_KV_BLOCK_COUNT,                       "%s.block_count" },
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count" },
-    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length" },
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length" },
-    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual" },
-    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout" },
-    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count" },
-    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count" },
-    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count" },
-    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale" },
-    { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type" },
-    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale" },
+    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size" },
+    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length" },
+    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length" },
+    { LLM_KV_BLOCK_COUNT,                       "%s.block_count" },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count" },
+    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length" },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length" },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual" },
+    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout" },
+    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count" },
+    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count" },
+    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count" },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale" },
+    { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type" },
+    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
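For context: the "%s" in each name is substituted with the architecture prefix at lookup time, so for Qwen2MoE the new key is read as qwen2moe.expert_shared_feed_forward_length. A minimal sketch of that substitution (kv_name is a hypothetical helper standing in for llama.cpp's LLM_KV functor, which performs the equivalent formatting):

```cpp
// Sketch only: "%s" in the KV name template is replaced by the arch prefix.
#include <cstdio>
#include <string>

static std::string kv_name(const char * fmt, const char * arch) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), fmt, arch);  // "%s" -> architecture prefix
    return buf;
}

int main() {
    // Prints: qwen2moe.expert_shared_feed_forward_length
    std::printf("%s\n", kv_name("%s.expert_shared_feed_forward_length", "qwen2moe").c_str());
    return 0;
}
```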
@@ -1878,6 +1880,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float    expert_weights_scale = 0.0;
 
@@ -1926,6 +1929,7 @@ struct llama_hparams {
         if (this->n_lora_q != other.n_lora_q) return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp != other.n_ff_exp) return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;
 
         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -4386,6 +4390,9 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
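The `false` passed as the third argument makes these reads optional: if a GGUF was converted before these keys existed, the hparams fields keep their zero defaults and the tensor-loading code below falls back to derived sizes. A rough sketch of that optional-read behavior, under the assumption that a required key aborts loading while an optional one is silently skipped (this models the semantics, not llama.cpp's actual loader):

```cpp
// Sketch of an optional metadata read: absence leaves `out` untouched,
// so n_ff_exp / n_ff_shexp stay 0 when the keys are missing.
#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>

static bool get_key_u32(const std::map<std::string, uint32_t> & meta,
                        const std::string & key, uint32_t & out, bool required = true) {
    auto it = meta.find(key);
    if (it == meta.end()) {
        if (required) throw std::runtime_error("key not found in model: " + key);
        return false;  // optional key absent: `out` keeps its default
    }
    out = it->second;
    return true;
}
```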
@@ -5202,6 +5209,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp           = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }
 
 // Returns false if cancelled by progress_callback
@@ -5995,16 +6007,17 @@ static bool llm_load_tensors(
                     GGML_ASSERT(hparams.n_expert_used > 0);
 
                     // MoE branch
-                    auto n_ff_exp = n_ff / hparams.n_expert_used;
+                    auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                     layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
                     layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
                     layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert});
 
                     // Shared expert branch
+                    auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                     layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff  });
-                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff , n_embd});
-                    layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff  });
+                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp });
+                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp , n_embd});
+                    layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp });
                 }
             } break;
         case LLM_ARCH_PHI2:
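The net effect of the two fallbacks: explicit metadata wins, and only when it is absent do the old derivations apply (per-expert width n_ff / n_expert_used, shared-expert width n_ff). A self-contained sketch of that logic with illustrative numbers (5632 and 4 are placeholders, not taken from any particular checkpoint):

```cpp
// Sketch of the shape fallback above: prefer sizes read from GGUF
// metadata; derive them from n_ff only when the keys were absent.
#include <cstdint>
#include <cstdio>

struct hparams_t {
    uint32_t n_ff_exp      = 0;  // %s.expert_feed_forward_length (optional)
    uint32_t n_ff_shexp    = 0;  // %s.expert_shared_feed_forward_length (optional)
    uint32_t n_expert_used = 4;  // illustrative
};

int main() {
    hparams_t hparams;           // pretend the new keys were missing
    const uint32_t n_ff = 5632;  // illustrative dense FFN width

    const uint32_t n_ff_exp   = hparams.n_ff_exp   ? hparams.n_ff_exp   : n_ff / hparams.n_expert_used;
    const uint32_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;

    // Prints: n_ff_exp = 1408, n_ff_shexp = 5632
    std::printf("n_ff_exp = %u, n_ff_shexp = %u\n", n_ff_exp, n_ff_shexp);
    return 0;
}
```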