diff --git a/CMakeLists.txt b/CMakeLists.txt index 91d3508a..bf95d42e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,12 +2,12 @@ cmake_minimum_required(VERSION 3.5) project(mnn-llm) option(BUILD_FOR_ANDROID "Build for android whith mini memory mode." OFF) -option(USING_VISUAL_MODEL "Using visual model will need dpes: MNNOpenCV and httplib." OFF) +option(LLM_SUPPORT_VISION "Llm model support vision input." OFF) option(DUMP_PROFILE_INFO "Dump profile info when chat." OFF) option(BUILD_JNI "Build JNI for android app." OFF) -if (USING_VISUAL_MODEL) - add_definitions(-DUSING_VISUAL_MODEL) +if (LLM_SUPPORT_VISION) + add_definitions(-DLLM_SUPPORT_VISION) endif() if (DUMP_PROFILE_INFO) @@ -24,7 +24,7 @@ set(MNN_SUPPORT_TRANSFORMER_FUSE ON CACHE BOOL "Open MNN_SUPPORT_TRANSFORMER_FUS if (BUILD_FOR_ANDROID) set(MNN_ARM82 ON CACHE BOOL "Open MNN_ARM82" FORCE) endif() -if (USING_VISUAL_MODEL) +if (LLM_SUPPORT_VISION) set(MNN_BUILD_OPENCV ON CACHE BOOL "Open MNN_BUILD_OPENCV" FORCE) set(MNN_IMGCODECS ON CACHE BOOL "Open MNN_IMGCODECS" FORCE) endif() @@ -33,7 +33,7 @@ add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/MNN) # include dir include_directories(${CMAKE_CURRENT_LIST_DIR}/include/ ${CMAKE_CURRENT_LIST_DIR}/MNN/include/ - ${CMAKE_CURRENT_LIST_DIR}/MNN/tools/cv/include/cv/ + ${CMAKE_CURRENT_LIST_DIR}/MNN/tools/cv/include/ ) # source files @@ -58,7 +58,7 @@ else() set_target_properties(llm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) target_link_libraries(llm MNN MNN_Express) - if (USING_VISUAL_MODEL) + if (LLM_SUPPORT_VISION) target_link_libraries(llm MNNOpenCV) endif() endif() diff --git a/README.md b/README.md index f14205fd..337f9d5f 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ llm模型导出`onnx`和`mnn`模型请使用[llm-export](https://github.com/wang `modelscope`模型下载:
 <details>
-  <summary>qwen系列</summary>
+  <summary>qwen</summary>
 
 - [modelscope-qwen-1.8b-chat]
 - [modelscope-qwen-7b-chat]
@@ -31,14 +31,16 @@ llm模型导出`onnx`和`mnn`模型请使用[llm-export](https://github.com/wang
 - [modelscope-qwen1.5-1.8b-chat]
 - [modelscope-qwen1.5-4b-chat]
 - [modelscope-qwen1.5-7b-chat]
-- [modelscope-qwen2-0.5b-chat]
-- [modelscope-qwen2-1.5b-chat]
-- [modelscope-qwen2-7b-chat]
+- [modelscope-qwen2-0.5b-instruct]
+- [modelscope-qwen2-1.5b-instruct]
+- [modelscope-qwen2-7b-instruct]
+- [modelscope-qwen2-vl-2b-instruct]
+- [modelscope-qwen2-vl-7b-instruct]
 
 </details>
 
 <details>
-  <summary>glm系列</summary>
+  <summary>glm</summary>
 
 - [modelscope-chatglm-6b]
 - [modelscope-chatglm2-6b]
@@ -49,7 +51,7 @@ llm模型导出`onnx`和`mnn`模型请使用[llm-export](https://github.com/wang
 </details>
 
 <details>
-  <summary>llama系列</summary>
+  <summary>llama</summary>
 
 - [modelscope-llama2-7b-chat]
 - [modelscope-llama3-8b-instruct]
@@ -62,10 +64,17 @@ llm模型导出`onnx`和`mnn`模型请使用[llm-export](https://github.com/wang
 </details>
 
 <details>
-  <summary>其他</summary>
+  <summary>phi</summary>
 
 - [modelscope-phi-2]
+
+</details>
+
+<details>
+  <summary>embedding</summary>
+
 - [modelscope-bge-large-zh]
+- [modelscope-gte_sentence-embedding_multilingual-base]
 
 </details>
@@ -77,9 +86,11 @@ llm模型导出`onnx`和`mnn`模型请使用[llm-export](https://github.com/wang [modelscope-qwen1.5-1.8b-chat]: https://modelscope.cn/models/zhaode/Qwen1.5-1.8B-Chat-MNN/files [modelscope-qwen1.5-4b-chat]: https://modelscope.cn/models/zhaode/Qwen1.5-4B-Chat-MNN/files [modelscope-qwen1.5-7b-chat]: https://modelscope.cn/models/zhaode/Qwen1.5-7B-Chat-MNN/files -[modelscope-qwen2-0.5b-chat]: https://modelscope.cn/models/zhaode/Qwen2-0.5B-Instruct-MNN/files -[modelscope-qwen2-1.5b-chat]: https://modelscope.cn/models/zhaode/Qwen2-1.5B-Instruct-MNN/files -[modelscope-qwen2-7b-chat]: https://modelscope.cn/models/zhaode/Qwen2-7B-Instruct-MNN/files +[modelscope-qwen2-0.5b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-0.5B-Instruct-MNN/files +[modelscope-qwen2-1.5b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-1.5B-Instruct-MNN/files +[modelscope-qwen2-7b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-7B-Instruct-MNN/files +[modelscope-qwen2-vl-2b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-VL-2B-Instruct-MNN/files +[modelscope-qwen2-vl-7b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-VL-7B-Instruct-MNN/files [modelscope-chatglm-6b]: https://modelscope.cn/models/zhaode/chatglm-6b-MNN/files [modelscope-chatglm2-6b]: https://modelscope.cn/models/zhaode/chatglm2-6b-MNN/files @@ -96,6 +107,7 @@ llm模型导出`onnx`和`mnn`模型请使用[llm-export](https://github.com/wang [modelscope-tinyllama-1.1b-chat]: https://modelscope.cn/models/zhaode/TinyLlama-1.1B-Chat-MNN/files [modelscope-phi-2]: https://modelscope.cn/models/zhaode/phi-2-MNN/files [modelscope-bge-large-zh]: https://modelscope.cn/models/zhaode/bge-large-zh-MNN/files +[modelscope-gte_sentence-embedding_multilingual-base]: https://modelscope.cn/models/zhaode/gte_sentence-embedding_multilingual-base-MNN/files ## 构建 @@ -151,13 +163,13 @@ cd mnn-llm 一些编译宏: - `BUILD_FOR_ANDROID`: 编译到Android设备; -- `USING_VISUAL_MODEL`: 支持多模态能力的模型,需要依赖`libMNNOpenCV`; +- `LLM_SUPPORT_VISION`: 是否支持视觉处理能力; - `DUMP_PROFILE_INFO`: 每次对话后dump出性能数据到命令行中; -默认使用`CPU`后端且不实用多模态能力,如果使用其他后端或能力,可以在编译MNN的脚本中添加`MNN`编译宏 +默认使用`CPU`,如果使用其他后端或能力,可以在编译MNN时添加`MNN`编译宏 - cuda: `-DMNN_CUDA=ON` - opencl: `-DMNN_OPENCL=ON` -- opencv: `-DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON` +- metal: `-DMNN_METAL=ON` ### 4. 执行 @@ -181,27 +193,35 @@ adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo ./Qwen2-
reference +- [cpp-httplib](https://github.com/yhirose/cpp-httplib) +- [chatgpt-web](https://github.com/xqdoo00o/chatgpt-web) +- [ChatViewDemo](https://github.com/BrettFX/ChatViewDemo) +- [nlohmann/json](https://github.com/nlohmann/json) +- [Qwen-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary) +- [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary) +- [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) +- [Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary) +- [Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary) +- [Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary) +- [Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary) +- [Qwen2-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary) +- [Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct/summary) +- [Qwen2-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-7B-Instruct/summary) +- [Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary) +- [Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary) - [chatglm-6b](https://modelscope.cn/models/ZhipuAI/chatglm-6b/summary) - [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary) -- [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) - [codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary) -- [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary) -- [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary) -- [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) -- [Qwen-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary) +- [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) +- [glm4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat/summary) - [Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary) +- [Llama-3-8B-Instruct](https://modelscope.cn/models/modelscope/Meta-Llama-3-8B-Instruct/summary) +- [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary) - [internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary) +- [Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary) +- [deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary) +- [TinyLlama-1.1B-Chat-v0.6](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.6) - [phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary) - [bge-large-zh](https://modelscope.cn/models/AI-ModelScope/bge-large-zh/summary) -- [TinyLlama-1.1B-Chat-v0.6](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.6) -- [Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary) -- [Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary) -- [Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary) -- [Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary) -- [Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary) -- [cpp-httplib](https://github.com/yhirose/cpp-httplib) -- [chatgpt-web](https://github.com/xqdoo00o/chatgpt-web) -- [ChatViewDemo](https://github.com/BrettFX/ChatViewDemo) -- [nlohmann/json](https://github.com/nlohmann/json) - +- 
[gte_sentence-embedding_multilingual-base](https://modelscope.cn/models/iic/gte_sentence-embedding_multilingual-base/summary)
\ No newline at end of file diff --git a/README_en.md b/README_en.md index 9d666ea9..a7c5d1ed 100644 --- a/README_en.md +++ b/README_en.md @@ -32,9 +32,11 @@ Download models from `modelscope`: - [modelscope-qwen1.5-1.8b-chat] - [modelscope-qwen1.5-4b-chat] - [modelscope-qwen1.5-7b-chat] -- [modelscope-qwen2-0.5b-chat] -- [modelscope-qwen2-1.5b-chat] -- [modelscope-qwen2-7b-chat] +- [modelscope-qwen2-0.5b-instruct] +- [modelscope-qwen2-1.5b-instruct] +- [modelscope-qwen2-7b-instruct] +- [modelscope-qwen2-vl-2b-instruct] +- [modelscope-qwen2-vl-7b-instruct] @@ -63,10 +65,17 @@ Download models from `modelscope`:
 </details>
 
 <details>
-  <summary>others</summary>
+  <summary>phi</summary>
 
 - [modelscope-phi-2]
+
+</details>
+
+<details>
+  <summary>embedding</summary>
+
 - [modelscope-bge-large-zh]
+- [modelscope-gte_sentence-embedding_multilingual-base]
 
 </details>
@@ -78,9 +87,11 @@ Download models from `modelscope`: [modelscope-qwen1.5-1.8b-chat]: https://modelscope.cn/models/zhaode/Qwen1.5-1.8B-Chat-MNN/files [modelscope-qwen1.5-4b-chat]: https://modelscope.cn/models/zhaode/Qwen1.5-4B-Chat-MNN/files [modelscope-qwen1.5-7b-chat]: https://modelscope.cn/models/zhaode/Qwen1.5-7B-Chat-MNN/files -[modelscope-qwen2-0.5b-chat]: https://modelscope.cn/models/zhaode/Qwen2-0.5B-Instruct-MNN/files -[modelscope-qwen2-1.5b-chat]: https://modelscope.cn/models/zhaode/Qwen2-1.5B-Instruct-MNN/files -[modelscope-qwen2-7b-chat]: https://modelscope.cn/models/zhaode/Qwen2-7B-Instruct-MNN/files +[modelscope-qwen2-0.5b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-0.5B-Instruct-MNN/files +[modelscope-qwen2-1.5b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-1.5B-Instruct-MNN/files +[modelscope-qwen2-7b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-7B-Instruct-MNN/files +[modelscope-qwen2-vl-2b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-VL-2B-Instruct-MNN/files +[modelscope-qwen2-vl-7b-instruct]: https://modelscope.cn/models/zhaode/Qwen2-VL-7B-Instruct-MNN/files [modelscope-chatglm-6b]: https://modelscope.cn/models/zhaode/chatglm-6b-MNN/files [modelscope-chatglm2-6b]: https://modelscope.cn/models/zhaode/chatglm2-6b-MNN/files @@ -97,6 +108,7 @@ Download models from `modelscope`: [modelscope-tinyllama-1.1b-chat]: https://modelscope.cn/models/zhaode/TinyLlama-1.1B-Chat-MNN/files [modelscope-phi-2]: https://modelscope.cn/models/zhaode/phi-2-MNN/files [modelscope-bge-large-zh]: https://modelscope.cn/models/zhaode/bge-large-zh-MNN/files +[modelscope-gte_sentence-embedding_multilingual-base]: https://modelscope.cn/models/zhaode/gte_sentence-embedding_multilingual-base-MNN/files ## Building @@ -147,9 +159,10 @@ cd mnn-llm ./script/ios_build.sh ``` -The default backend used is `CPU`. If you want to use a different backend, you can add a MNN compilation macro within the script: +The default backend used is `CPU`. If you want to use a different backend, you can add a MNN compilation macro: - cuda: `-DMNN_CUDA=ON` - opencl: `-DMNN_OPENCL=ON` +- metal: `-DMNN_METAL=ON` ### 4. Execution @@ -174,27 +187,36 @@ adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo ./Qwen2-
reference +- [cpp-httplib](https://github.com/yhirose/cpp-httplib) +- [chatgpt-web](https://github.com/xqdoo00o/chatgpt-web) +- [ChatViewDemo](https://github.com/BrettFX/ChatViewDemo) +- [nlohmann/json](https://github.com/nlohmann/json) +- [Qwen-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary) +- [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary) +- [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) +- [Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary) +- [Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary) +- [Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary) +- [Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary) +- [Qwen2-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary) +- [Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct/summary) +- [Qwen2-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-7B-Instruct/summary) +- [Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary) +- [Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary) - [chatglm-6b](https://modelscope.cn/models/ZhipuAI/chatglm-6b/summary) - [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary) -- [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) - [codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary) -- [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary) -- [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary) -- [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) -- [Qwen-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary) +- [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) +- [glm4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat/summary) - [Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary) +- [Llama-3-8B-Instruct](https://modelscope.cn/models/modelscope/Meta-Llama-3-8B-Instruct/summary) +- [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary) - [internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary) +- [Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary) +- [deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary) +- [TinyLlama-1.1B-Chat-v0.6](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.6) - [phi-2](https://modelscope.cn/models/AI-ModelScope/phi-2/summary) - [bge-large-zh](https://modelscope.cn/models/AI-ModelScope/bge-large-zh/summary) -- [TinyLlama-1.1B-Chat-v0.6](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.6) -- [Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary) -- [Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary) -- [Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary) -- [Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary) -- [Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary) -- [cpp-httplib](https://github.com/yhirose/cpp-httplib) -- [chatgpt-web](https://github.com/xqdoo00o/chatgpt-web) -- [ChatViewDemo](https://github.com/BrettFX/ChatViewDemo) -- [nlohmann/json](https://github.com/nlohmann/json) +- 
[gte_sentence-embedding_multilingual-base](https://modelscope.cn/models/iic/gte_sentence-embedding_multilingual-base/summary)
\ No newline at end of file
diff --git a/demo/embedding_demo.cpp b/demo/embedding_demo.cpp
index c998717e..457e21f5 100644
--- a/demo/embedding_demo.cpp
+++ b/demo/embedding_demo.cpp
@@ -31,9 +31,9 @@ int main(int argc, const char* argv[]) {
     std::string model_dir = argv[1];
     std::cout << "model path is " << model_dir << std::endl;
     std::unique_ptr<Embedding> embedding(Embedding::createEmbedding(model_dir));
-    auto vec_0 = embedding->embedding("在春暖花开的季节,走在樱花缤纷的道路上,人们纷纷拿出手机拍照留念。樱花树下,情侣手牵手享受着这绝美的春光。孩子们在树下追逐嬉戏,脸上洋溢着纯真的笑容。春天的气息在空气中弥漫,一切都显得那么生机勃勃,充满希望。");
-    auto vec_1 = embedding->embedding("春天到了,樱花树悄然绽放,吸引了众多游客前来观赏。小朋友们在花瓣飘落的树下玩耍,而恋人们则在这浪漫的景色中尽情享受二人世界。每个人的脸上都挂着幸福的笑容,仿佛整个世界都被春天温暖的阳光和满树的樱花渲染得更加美好。");
-    auto vec_2 = embedding->embedding("在炎热的夏日里,沙滩上的游客们穿着泳装享受着海水的清凉。孩子们在海边堆沙堡,大人们则在太阳伞下品尝冷饮,享受悠闲的时光。远处,冲浪者们挑战着波涛,体验着与海浪争斗的刺激。夏天的海滩,总是充满了活力和热情。");
+    auto vec_0 = embedding->txt_embedding("在春暖花开的季节,走在樱花缤纷的道路上,人们纷纷拿出手机拍照留念。樱花树下,情侣手牵手享受着这绝美的春光。孩子们在树下追逐嬉戏,脸上洋溢着纯真的笑容。春天的气息在空气中弥漫,一切都显得那么生机勃勃,充满希望。");
+    auto vec_1 = embedding->txt_embedding("春天到了,樱花树悄然绽放,吸引了众多游客前来观赏。小朋友们在花瓣飘落的树下玩耍,而恋人们则在这浪漫的景色中尽情享受二人世界。每个人的脸上都挂着幸福的笑容,仿佛整个世界都被春天温暖的阳光和满树的樱花渲染得更加美好。");
+    auto vec_2 = embedding->txt_embedding("在炎热的夏日里,沙滩上的游客们穿着泳装享受着海水的清凉。孩子们在海边堆沙堡,大人们则在太阳伞下品尝冷饮,享受悠闲的时光。远处,冲浪者们挑战着波涛,体验着与海浪争斗的刺激。夏天的海滩,总是充满了活力和热情。");
     dumpVARP(vec_0);
     dumpVARP(vec_1);
     dumpVARP(vec_2);
diff --git a/include/llm.hpp b/include/llm.hpp
index 393b6f44..2a61eb74 100644
--- a/include/llm.hpp
+++ b/include/llm.hpp
@@ -277,37 +277,41 @@ class Llm {
 class Lvlm : public Llm {
 public:
     Lvlm(std::shared_ptr<LlmConfig> config) : Llm(config) {
-        img_size_ = config->llm_config_.value("img_size", img_size_);
-        imgpad_len_ = config->llm_config_.value("imgpad_len", imgpad_len_);
-        img_start_ = config->llm_config_.value("img_start", img_start_);
-        img_end_ = config->llm_config_.value("img_end", img_end_);
-        img_pad_ = config->llm_config_.value("img_pad", img_pad_);
+        image_size_ = config->llm_config_.value("image_size", image_size_);
+        image_pad_ = config->llm_config_.value("image_pad", image_pad_);
+        vision_start_ = config->llm_config_.value("vision_start", vision_start_);
+        vision_end_ = config->llm_config_.value("vision_end", vision_end_);
+        image_mean_ = config->llm_config_.value("image_mean", image_mean_);
+        image_norm_ = config->llm_config_.value("image_norm", image_norm_);
     }
     ~Lvlm() { visual_module_.reset(); }
     virtual void load() override;
+    virtual std::vector<int> tokenizer(const std::string& query) override;
+    virtual MNN::Express::VARP embedding(const std::vector<int>& input_ids) override;
 private:
-    int img_size_ = 448, imgpad_len_ = 256, img_start_ = 151857, img_end_ = 151858, img_pad_ = 151859;
+    int image_size_ = 448, vision_start_ = 151857, vision_end_ = 151858, image_pad_ = 151859;
+    std::vector<float> image_mean_ {122.7709383 , 116.7460125 , 104.09373615};
+    std::vector<float> image_norm_ {0.01459843, 0.01500777, 0.01422007};
+    std::vector<int> image_process(const std::string& img_info);
     std::shared_ptr<Module> visual_module_;
-    VARP visual_embedding(const std::vector<int>& input_ids);
-    std::vector<int> url_encode(const std::string& url);
-    virtual std::vector<int> tokenizer(const std::string& query) override;
-    virtual VARP embedding(const std::vector<int>& input_ids) override;
+    std::vector<MNN::Express::VARP> image_embeddings_;
 };
 // Llm end
 
 // Embedding start
 class Embedding : public Llm {
 public:
-    Embedding(std::shared_ptr<LlmConfig> config) : Llm(config) {}
-    static Embedding* createEmbedding(const std::string& config_path);
-    static float dist(VARP var0, VARP var1);
+    Embedding(std::shared_ptr<LlmConfig> config);
+    static Embedding* createEmbedding(const std::string& config_path, bool load = true);
+    static float dist(MNN::Express::VARP var0, MNN::Express::VARP var1);
     virtual void load() override;
-    VARP embedding(const std::string& txt);
-    int dim() { return config_->hidden_size(); }
+    MNN::Express::VARP ids_embedding(const std::vector<int>& ids);
+    MNN::Express::VARP txt_embedding(const std::string& txt);
+    int dim() const;
 private:
     virtual std::vector<int> tokenizer(const std::string& query) override;
-    virtual VARP gen_attention_mask(int seq_len) override;
-    virtual VARP gen_position_ids(int seq_len) override;
+    virtual MNN::Express::VARP gen_attention_mask(int seq_len) override;
+    virtual MNN::Express::VARP gen_position_ids(int seq_len) override;
 };
 // Embedding end
diff --git a/src/llm.cpp b/src/llm.cpp
index e59d55f2..321152ea 100644
--- a/src/llm.cpp
+++ b/src/llm.cpp
@@ -17,7 +17,7 @@
 #include "llm.hpp"
 #include "tokenizer.hpp"
-#ifdef USING_VISUAL_MODEL
+#ifdef LLM_SUPPORT_VISION
 #include "httplib.h"
 #include <cv/cv.hpp>
 #endif
@@ -499,77 +499,22 @@ void Lvlm::load() {
     Module::Config module_config;
     module_config.shapeMutable = true;
     module_config.rearrange = false;
+    runtime_manager_->setExternalFile(config_->visual_model() + ".weight");
     visual_module_.reset(Module::load({}, {}, config_->visual_model().c_str(), runtime_manager_, &module_config));
 }
 
-std::vector<int> Lvlm::url_encode(const std::string& url) {
-    std::vector<int> ascii_values(imgpad_len_ + 2, img_pad_);
-    ascii_values[0] = img_start_;
-    ascii_values[imgpad_len_ + 1] = img_end_;
-    for (int i = 0; i < url.size(); i++) {
-        ascii_values[i + 1] = static_cast<int>(url[i]);
-    }
-    return ascii_values;
-}
-
-std::vector<int> Lvlm::tokenizer(const std::string& query) {
-    auto prompt = apply_prompt_template(query);
-    // split query
-    std::regex img_regex("<img>(.*?)</img>");
-    std::string::const_iterator searchStart(prompt.cbegin());
-    std::smatch match;
-    std::vector<std::string> img_info, txt_info;
-    std::vector<int> ids {};
-    while (std::regex_search(searchStart, prompt.cend(), match, img_regex)) {
-        std::cout << match[1].str() << std::endl;
-        auto txt_ids = tokenizer_->encode(match.prefix().str());
-        ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
-        auto img_ids = url_encode(match[1].str());
-        ids.insert(ids.end(), img_ids.begin(), img_ids.end());
-        searchStart = match.suffix().first;
-    }
-    if (searchStart != prompt.cend()) {
-        auto txt_ids = tokenizer_->encode(std::string(searchStart, prompt.cend()));
-        ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
-    }
-    return ids;
-}
-
-VARP Lvlm::embedding(const std::vector<int>& input_ids) {
-#ifdef USING_VISUAL_MODEL
-    int start_pos = 0, pad_pos = 0, end_pos = 0;
-    for (int i = 0; i < input_ids.size(); i++) {
-        int id = input_ids[i];
-        if (id == img_start_ && !start_pos) {
-            start_pos = i;
-        }
-        if (id == img_pad_ && !pad_pos) {
-            pad_pos = i;
-        }
-        if (id == img_end_ && !end_pos) {
-            end_pos = i;
-        }
-    }
-    if (!start_pos) {
-        return Llm::embedding(input_ids);
-    }
-    std::vector<int> prefix(input_ids.begin(), input_ids.begin() + start_pos + 1);
-    std::vector<int> img_ascii(input_ids.begin() + start_pos + 1, input_ids.begin() + pad_pos);
-    std::vector<int> suffix(input_ids.begin() + end_pos, input_ids.end());
-    std::string img_path;
-    for (auto ascii_val : img_ascii) {
-        img_path += static_cast<char>(ascii_val);
-    }
+std::vector<int> Lvlm::image_process(const std::string& image_info) {
+#ifdef LLM_SUPPORT_VISION
     VARP image = nullptr;
-    if (img_path.substr(0, 4) == "http") {
+    if (image_info.substr(0, 4) == "http") {
         std::regex url_regex(R"(^https?://([^/]+)(/.*))");
         std::smatch url_match_result;
         std::string host, path;
-        if (std::regex_search(img_path, url_match_result, url_regex) && url_match_result.size() == 3) {
+        if (std::regex_search(image_info, url_match_result, url_regex) && url_match_result.size() == 3) {
             host = url_match_result[1].str();
             path = url_match_result[2].str();
         }
-        std::cout << host << "#" << path << std::endl;
+        // std::cout << host << "#" << path << std::endl;
         httplib::Client cli(host);
         auto res = cli.Get(path);
         std::string img_file = "downloaded_image.jpg";
@@ -589,21 +534,77 @@ VARP Lvlm::embedding(const std::vector<int>& input_ids) {
         }
         image = MNN::CV::imread(img_file);
     } else {
-        image = MNN::CV::imread(img_path);
+        image = MNN::CV::imread(image_info);
     }
-    image = MNN::CV::resize(image, {img_size_, img_size_}, 0, 0, MNN::CV::INTER_LINEAR, MNN::CV::COLOR_BGR2RGB,
-                            {123.25239296, 117.20384, 104.50194688}, {0.0145414 , 0.01494914, 0.01416452});
+    image = MNN::CV::resize(image, {image_size_, image_size_}, 0, 0, MNN::CV::INTER_LINEAR, MNN::CV::COLOR_BGR2RGB, image_mean_, image_norm_);
     image = MNN::Express::_Unsqueeze(image, {0});
     image = MNN::Express::_Convert(image, NC4HW4);
     auto image_embedding = visual_module_->forward(image);
-    image_embedding = MNN::Express::_Permute(image_embedding, {1, 0, 2});
-    auto prefix_embedding = Llm::embedding(prefix);
-    auto suffix_embedding = Llm::embedding(suffix);
-    auto embeddings = MNN::Express::_Concat({prefix_embedding, image_embedding, suffix_embedding}, 0);
+    image_embeddings_.push_back(image_embedding);
+    int visual_len = image_embedding->getInfo()->dim[0];
+    std::vector<int> img_ids(visual_len, image_pad_);
+    img_ids.insert(img_ids.begin(), vision_start_);
+    img_ids.push_back(vision_end_);
+    return img_ids;
 #else
-    auto embeddings = Llm::embedding(input_ids);
+    return std::vector<int>(0);
 #endif
-    return embeddings;
+}
+
+std::vector<int> Lvlm::tokenizer(const std::string& query) {
+    auto prompt = apply_prompt_template(query);
+    // split query
+    std::regex img_regex("<img>(.*?)</img>");
+    std::string::const_iterator searchStart(prompt.cbegin());
+    std::smatch match;
+    std::vector<std::string> img_infos;
+    std::vector<int> ids {};
+
+    while (std::regex_search(searchStart, prompt.cend(), match, img_regex)) {
+        // std::cout << "img match: " << match[1].str() << std::endl;
+        auto txt_ids = tokenizer_->encode(match.prefix().str());
+        ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
+        auto img_ids = image_process(match[1].str());
+        ids.insert(ids.end(), img_ids.begin(), img_ids.end());
+        searchStart = match.suffix().first;
+    }
+    if (searchStart != prompt.cend()) {
+        auto txt_ids = tokenizer_->encode(std::string(searchStart, prompt.cend()));
+        ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
+    }
+    // printf("ids = ["); for (auto id : ids) printf("%d, ", id); printf("]\n");
+    return ids;
+}
+
+VARP Lvlm::embedding(const std::vector<int>& input_ids) {
+    if (input_ids.size() == 1) {
+        return Llm::embedding(input_ids);
+    }
+    std::vector<VARP> embeddings;
+    int img_idx = 0;
+    std::vector<int> cur_txt_ids;
+    for (int i = 0; i < input_ids.size(); i++) {
+        int id = input_ids[i];
+        if (id == image_pad_) {
+            continue;
+        }
+        cur_txt_ids.push_back(id);
+        if (id == vision_start_) {
+            auto txt_embedding = Llm::embedding(cur_txt_ids);
+            auto img_embedding = image_embeddings_[img_idx++];
+            embeddings.push_back(txt_embedding);
+            embeddings.push_back(img_embedding);
+        } else if (id == vision_end_) {
+            cur_txt_ids.clear();
+            cur_txt_ids.push_back(id);
+        }
+    }
+    if (!cur_txt_ids.empty()) {
+        auto txt_embedding = Llm::embedding(cur_txt_ids);
+        embeddings.push_back(txt_embedding);
+    }
+    auto embedding = MNN::Express::_Concat(embeddings, 0);
+    return embedding;
 }
 // Llm end
@@ -614,13 +615,19 @@ float Embedding::dist(VARP var0, VARP var1) {
     return dist;
 }
 
-Embedding* Embedding::createEmbedding(const std::string& config_path) {
+Embedding* Embedding::createEmbedding(const std::string& config_path, bool load) {
     std::shared_ptr<LlmConfig> config(new LlmConfig(config_path));
     Embedding* embedding = new Embedding(config);
-    embedding->load();
+    if (load) {
+        embedding->load();
+    }
    return embedding;
 }
 
+Embedding::Embedding(std::shared_ptr<LlmConfig> config) : Llm(config) {}
+
+int Embedding::dim() const { return config_->hidden_size(); }
+
 void Embedding::load() {
     init_runtime();
     printf("load tokenizer\n");
@@ -636,15 +643,14 @@ void Embedding::load() {
     MNN_PRINT("load %s ... ", model_path.c_str());
     modules_.resize(1);
     modules_[0].reset(Module::load(
-        {"input_ids", "attention_mask", "position_ids"},
-        {"sentence_embeddings"}, model_path.c_str(), runtime_manager_, &module_config));
+                      {"input_ids", "attention_mask", "position_ids"},
+                      {"sentence_embeddings"}, model_path.c_str(), runtime_manager_, &module_config));
     MNN_PRINT("Done!\n");
 }
 
-VARP Embedding::embedding(const std::string& txt) {
-    auto ids = tokenizer(txt);
+VARP Embedding::ids_embedding(const std::vector<int>& ids) {
     int prompt_len = ids.size();
-    auto inputs_ids = _Const(ids.data(), {prompt_len}, NCHW, halide_type_of<int>());
+    auto inputs_ids = embedding(ids);
     auto attention_mask = gen_attention_mask(prompt_len);
     auto position_ids = gen_position_ids(prompt_len);
     auto outputs = modules_[0]->onForward({inputs_ids, attention_mask, position_ids});
@@ -652,12 +658,12 @@
     return sentence_embeddings;
 }
 
+VARP Embedding::txt_embedding(const std::string& txt) {
+    return ids_embedding(tokenizer(txt));
+}
+
 std::vector<int> Embedding::tokenizer(const std::string& query) {
-    auto prompt = query;
-    if (query.size() <= 256) {
-        prompt = "为这个句子生成表示以用于检索相关文章:" + query;
-    }
-    prompt = apply_prompt_template(prompt);
+    auto prompt = apply_prompt_template(query);
     auto ids = tokenizer_->encode(prompt);
     return ids;
 }
@@ -770,7 +776,7 @@ VARP TextVectorStore::text2vector(const std::string& text) {
         std::cerr << "Not set embedding for TextVectorStore." << std::endl;
         return nullptr;
     }
-    auto vector = embedding_->embedding(text);
+    auto vector = embedding_->txt_embedding(text);
     return vector;
 }
 // TextVectorStore end
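
Note on the renamed embedding interface: the hunks above replace the old `embedding(txt)` entry point with `txt_embedding`/`ids_embedding` and give `createEmbedding` a `load` flag. Below is a minimal usage sketch of that interface; it relies only on signatures visible in this diff, and the config path plus the two sample sentences are illustrative assumptions, not files or data shipped with the repo.

```cpp
// Sketch only: drives the Embedding API as declared in include/llm.hpp after this patch.
// The config path is a placeholder; any exported embedding model config should work.
#include <iostream>
#include <memory>
#include <string>
#include "llm.hpp"

int main(int argc, const char* argv[]) {
    if (argc < 2) {
        std::cout << "Usage: ./embedding_sketch <config_path>" << std::endl;
        return 0;
    }
    std::string config_path = argv[1];
    // The new `load` flag defaults to true; passing false defers weight loading
    // until load() is called explicitly.
    std::unique_ptr<Embedding> embedding(Embedding::createEmbedding(config_path, true));
    // txt_embedding() tokenizes the text and runs the sentence-embedding module;
    // ids_embedding() accepts already-tokenized ids instead.
    auto vec_0 = embedding->txt_embedding("how is the weather today");
    auto vec_1 = embedding->txt_embedding("what is the weather like now");
    // dist() compares two sentence embeddings; smaller values mean closer sentences.
    std::cout << "dim = " << embedding->dim()
              << ", dist = " << Embedding::dist(vec_0, vec_1) << std::endl;
    return 0;
}
```

This mirrors what `demo/embedding_demo.cpp` and `TextVectorStore::text2vector` now do, so callers migrating from the old `embedding(txt)` method only need to switch to `txt_embedding(txt)`.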