diff --git a/CMakeLists.txt b/CMakeLists.txt
index 610820c4..1419bf69 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,13 +3,8 @@ project(mnn-llm)
 option(BUILD_FOR_ANDROID "Build for android whith mini memory mode." OFF)
 option(USING_VISUAL_MODEL "Using visual model will need dpes: MNNOpenCV and httplib." OFF)
-option(USING_DISK_EMBED "Using disk embedding to save memory." OFF)
 option(DUMP_PROFILE_INFO "Dump profile info when chat." OFF)
 
-if (USING_DISK_EMBED)
-    add_definitions(-DUSING_DISK_EMBED)
-endif()
-
 if (USING_VISUAL_MODEL)
     add_definitions(-DUSING_VISUAL_MODEL)
 endif()
 
diff --git a/include/llm.hpp b/include/llm.hpp
index 70b10576..1fb7bb66 100644
--- a/include/llm.hpp
+++ b/include/llm.hpp
@@ -105,11 +105,13 @@ class Llm {
 protected:
     // model configs
     bool is_single_ = false;
+    bool is_disk_embedding_ = false;
     bool is_visual_ = false;
     int layer_nums_ = 0;
    int hidden_size_ = 4096;
     std::vector<int> key_value_shape_ = {};
     std::string model_name_ = "";
+    std::string disk_embedding_file_ = "";
     // gen info
     float load_progress_ = 0.f;
     // tokenizer
diff --git a/src/llm.cpp b/src/llm.cpp
index 177c11ff..11ab9bff 100644
--- a/src/llm.cpp
+++ b/src/llm.cpp
@@ -214,6 +214,7 @@ void Llm::load(const std::string& model_dir) {
     cpuBackendConfig.memory = BackendConfig::Memory_Low;
     config.backendConfig = &cpuBackendConfig;
     runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
+    runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0);
     if (config.type == MNN_FORWARD_OPENCL) {
         const char* cacheFileName = ".tempcache";
         // runtime_manager_->setCache(cacheFileName);
@@ -225,12 +226,20 @@ void Llm::load(const std::string& model_dir) {
     if (is_single_) {
         size_t pos = model_dir.find_last_of("/\\");
         std::string dir_path = (pos != std::string::npos) ? model_dir.substr(0, pos + 1) : "";
+        model_dir_ = dir_path;
         tokenizer_path = dir_path + "/tokenizer.txt";
     }
     load_progress_ += 5.f;
     tokenizer_->load(tokenizer_path);
     load_progress_ += 5.f;
     printf("load tokenizer Done\n");
+    {
+        disk_embedding_file_ = model_dir_ + "/embeddings_bf16.bin";
+        std::ifstream embedding_bin(disk_embedding_file_);
+        is_disk_embedding_ = embedding_bin.good();
+        MNN_PRINT("### disk embedding is %d\n", is_disk_embedding_);
+        embedding_bin.close();
+    }
     // 2. load model
     Module::Config module_config;
     module_config.shapeMutable = true;
@@ -254,17 +263,17 @@
         char buffer[50];
         // load lm model
         std::string lm_model_path = model_dir + "/lm.mnn";
-        std::string embedding_model_path = model_dir + "/embedding.mnn";
         MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, lm_model_path.c_str());
         modules_[layer_nums_].reset(Module::load({}, {}, lm_model_path.c_str(), runtime_manager_, &module_config));
         MNN_PRINT("Done!\n");
         load_progress_ += step;
-#ifndef USING_DISK_EMBED
-        MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, embedding_model_path.c_str());fflush(stdout);
-        modules_[layer_nums_ + 1].reset(Module::load({}, {}, embedding_model_path.c_str(), runtime_manager_, &module_config));
-        MNN_PRINT("Done!\n");
-        load_progress_ += step;
-#endif
+        if (!is_disk_embedding_) {
+            std::string embedding_model_path = model_dir + "/embedding.mnn";
+            MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, embedding_model_path.c_str());fflush(stdout);
+            modules_[layer_nums_ + 1].reset(Module::load({}, {}, embedding_model_path.c_str(), runtime_manager_, &module_config));
+            MNN_PRINT("Done!\n");
+            load_progress_ += step;
+        }
         if (is_visual_) {
             std::string visual_model_path = model_dir + "/visual.mnn";
             MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, visual_model_path.c_str());fflush(stdout);
@@ -308,18 +317,23 @@ void Llm::warmup() {
 
 int Llm::forward(const std::vector<int>& input_ids) {
     int seq_len = input_ids.size();
-    auto inputs_ids_ = _Const(input_ids.data(), {seq_len}, NCHW, halide_type_of<int>());
     auto attention_mask = gen_attention_mask(seq_len);
     auto position_ids = gen_position_ids(seq_len);
     int id = -1;
     if (is_single_) {
         // single model
-        auto outputs = modules_.back()->onForward({inputs_ids_, attention_mask, position_ids, past_key_values_[0]});
+        auto hidden_states = _Const(input_ids.data(), {seq_len}, NCHW, halide_type_of<int>());
+        if (is_disk_embedding_) {
+            hidden_states = embedding(input_ids);
+        }
+        auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]});
+        ExecutorScope::Current()->gc(Executor::FULL);
         id = outputs[0]->readMap<int>()[0];
         past_key_values_[0] = outputs[1];
     } else {
         // split block models
         auto hidden_states = embedding(input_ids);
+        ExecutorScope::Current()->gc(Executor::FULL);
         for (int i = 0; i < layer_nums_; i++) {
             AUTOTIME;
             auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]});
@@ -339,19 +353,18 @@ int Llm::forward(const std::vector<int>& input_ids) {
 }
 
 VARP Llm::txt_embedding(const std::vector<int>& input_ids) {
-#ifndef USING_DISK_EMBED
-    // using model forward
-    auto inputs_ids_ = _Const(input_ids.data(), {static_cast<int>(input_ids.size())}, NCHW, halide_type_of<int>());
-    auto hidden_states = modules_[layer_nums_ + 1]->onForward({inputs_ids_})[0];
-    return hidden_states;
-#endif
+    if (!is_disk_embedding_) {
+        // using model forward
+        auto inputs_ids_ = _Const(input_ids.data(), {static_cast<int>(input_ids.size())}, NCHW, halide_type_of<int>());
+        auto hidden_states = modules_[layer_nums_ + 1]->onForward({inputs_ids_})[0];
+        return hidden_states;
+    }
     AUTOTIME;
     // disk embedding to save memory
     size_t seq_len = input_ids.size();
     auto embedding = _Input({static_cast<int>(seq_len), 1, hidden_size_}, NCHW);
     size_t size = hidden_size_ * sizeof(int16_t);
-    std::string file_path = model_dir_ + "/embeddings_bf16.bin";
-    FILE* file = fopen(file_path.c_str(), "rb");
+    FILE* file = fopen(disk_embedding_file_.c_str(), "rb");
     std::unique_ptr<int16_t[]> buffer(new int16_t[hidden_size_]);
     for (size_t i = 0; i < seq_len; i++) {
         fseek(file, input_ids[i] * size, SEEK_SET);