Skip to content

Commit d11afd6

Browse files
authored
llava : fix moondream support (ggml-org#7163)
* Revert "Revert "llava : add support for moondream vision language model (ggml-org#6899)"" — this reverts commit 9da243b. * Fix num_positions and embeddings initialization.
1 parent 8c570c9 commit d11afd6

File tree

2 files changed

+71
-19
lines changed

2 files changed

+71
-19
lines changed

README.md

+1
Original file line number · Diff line number · Diff line change
@@ -140,6 +140,7 @@ Typically finetunes of the base models below are supported as well.
140140
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
141141
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
142142
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
143+
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
143144

144145
**HTTP server**
145146

examples/llava/clip.cpp

+70-19
Original file line number · Diff line number · Diff line change
@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
104104
#define TN_POS_EMBD "%s.position_embd.weight"
105105
#define TN_CLASS_EMBD "v.class_embd"
106106
#define TN_PATCH_EMBD "v.patch_embd.weight"
107+
#define TN_PATCH_BIAS "v.patch_embd.bias"
107108
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
108109
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
109110
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -425,6 +426,7 @@ struct clip_vision_model {
425426
// embeddings
426427
struct ggml_tensor * class_embedding;
427428
struct ggml_tensor * patch_embeddings;
429+
struct ggml_tensor * patch_bias;
428430
struct ggml_tensor * position_embeddings;
429431

430432
struct ggml_tensor * pre_ln_w;
@@ -501,6 +503,11 @@ struct clip_ctx {
501503
bool use_gelu = false;
502504
int32_t ftype = 1;
503505

506+
bool has_class_embedding = true;
507+
bool has_pre_norm = true;
508+
bool has_post_norm = false;
509+
bool has_patch_bias = false;
510+
504511
struct gguf_context * ctx_gguf;
505512
struct ggml_context * ctx_data;
506513

@@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
526533
const int patch_size = hparams.patch_size;
527534
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
528535
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
529-
const int num_positions = num_patches + 1;
536+
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
530537
const int hidden_size = hparams.hidden_size;
531538
const int n_head = hparams.n_head;
532539
const int d_head = hidden_size / n_head;
@@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
557564
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
558565
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
559566

560-
// concat class_embeddings and patch_embeddings
561-
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
562-
ggml_set_name(embeddings, "embeddings");
563-
ggml_set_input(embeddings);
567+
if (ctx->has_patch_bias) {
568+
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
569+
inp = ggml_add(ctx0, inp, model.patch_bias);
570+
}
564571

565-
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
566-
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
572+
// concat class_embeddings and patch_embeddings
573+
struct ggml_tensor * embeddings = inp;
574+
if (ctx->has_class_embedding) {
575+
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
576+
ggml_set_name(embeddings, "embeddings");
577+
ggml_set_input(embeddings);
578+
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
579+
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
580+
embeddings = ggml_acc(ctx0, embeddings, inp,
581+
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
582+
}
567583

568-
embeddings = ggml_acc(ctx0, embeddings, inp,
569-
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
570584

571585
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
572586
ggml_set_name(positions, "positions");
@@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
576590
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
577591

578592
// pre-layernorm
579-
{
593+
if (ctx->has_pre_norm) {
580594
embeddings = ggml_norm(ctx0, embeddings, eps);
581595
ggml_set_name(embeddings, "pre_ln");
582596

@@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
664678
embeddings = cur;
665679
}
666680

681+
// post-layernorm
682+
if (ctx->has_post_norm) {
683+
embeddings = ggml_norm(ctx0, embeddings, eps);
684+
ggml_set_name(embeddings, "post_ln");
685+
686+
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
687+
}
688+
667689
// llava projector
668690
{
669691
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1148,12 +1170,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
11481170

11491171
}
11501172

1173+
try {
1174+
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
1175+
new_clip->has_class_embedding = true;
1176+
} catch (const std::exception& e) {
1177+
new_clip->has_class_embedding = false;
1178+
}
1179+
1180+
try {
1181+
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
1182+
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
1183+
new_clip->has_pre_norm = true;
1184+
} catch (std::exception & e) {
1185+
new_clip->has_pre_norm = false;
1186+
}
1187+
1188+
try {
1189+
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
1190+
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
1191+
new_clip->has_post_norm = true;
1192+
} catch (std::exception & e) {
1193+
new_clip->has_post_norm = false;
1194+
}
1195+
1196+
try {
1197+
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
1198+
new_clip->has_patch_bias = true;
1199+
} catch (std::exception & e) {
1200+
new_clip->has_patch_bias = false;
1201+
}
1202+
11511203
try {
11521204
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
1153-
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
11541205
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
1155-
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
1156-
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
11571206
} catch(const std::exception& e) {
11581207
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
11591208
}
@@ -1797,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
17971846
const int image_size = hparams.image_size;
17981847
const int patch_size = hparams.patch_size;
17991848
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
1800-
const int num_positions = num_patches + 1;
1849+
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
18011850

18021851
{
18031852
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1825,12 +1874,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
18251874
}
18261875

18271876
{
1828-
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
1877+
if (ctx->has_class_embedding) {
1878+
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
18291879

1830-
void* zero_mem = malloc(ggml_nbytes(embeddings));
1831-
memset(zero_mem, 0, ggml_nbytes(embeddings));
1832-
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
1833-
free(zero_mem);
1880+
void* zero_mem = malloc(ggml_nbytes(embeddings));
1881+
memset(zero_mem, 0, ggml_nbytes(embeddings));
1882+
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
1883+
free(zero_mem);
1884+
}
18341885
}
18351886

18361887
{

0 commit comments

Comments (0)