From 0a6c344f65edcd46b90607b077e1fe13a3ec5157 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=81=E8=A1=8C?= Date: Thu, 9 May 2024 17:51:29 +0800 Subject: [PATCH] add config --- demo/cli_demo.cpp | 2 +- demo/memory_demo.cpp | 2 +- demo/tokenizer_demo.cpp | 6 +- demo/web_demo.cpp | 2 +- include/llm.hpp | 90 +++++++++++-- include/tokenizer.hpp | 1 + src/llm.cpp | 277 +++++++++++++++------------------------- src/tokenizer.cpp | 16 +++ 8 files changed, 206 insertions(+), 190 deletions(-) diff --git a/demo/cli_demo.cpp b/demo/cli_demo.cpp index e4c399bc..450c1313 100644 --- a/demo/cli_demo.cpp +++ b/demo/cli_demo.cpp @@ -54,7 +54,7 @@ int main(int argc, const char* argv[]) { std::string model_dir = argv[1]; std::cout << "model path is " << model_dir << std::endl; std::unique_ptr llm(Llm::createLLM(model_dir)); - llm->load(model_dir); + llm->load(); if (argc < 3) { llm->chat(); } diff --git a/demo/memory_demo.cpp b/demo/memory_demo.cpp index 776fd81e..a1efa480 100644 --- a/demo/memory_demo.cpp +++ b/demo/memory_demo.cpp @@ -22,7 +22,7 @@ int main(int argc, const char* argv[]) { if (argc == 4) { auto llm_dir = argv[3]; std::shared_ptr llm(Llm::createLLM(llm_dir)); - llm->load(llm_dir); + llm->load(); chat_memory->summarize(llm); chat_memory->save(memory_dir); } diff --git a/demo/tokenizer_demo.cpp b/demo/tokenizer_demo.cpp index 6a630ecb..6b13d466 100644 --- a/demo/tokenizer_demo.cpp +++ b/demo/tokenizer_demo.cpp @@ -16,14 +16,16 @@ int main(int argc, const char* argv[]) { std::unique_ptr tokenizer_(new Tiktoken); tokenizer_->load(tokenizer_path); const std::string system_str = "Youare a helpful assistant."; - const std::string user_str = "<|endoftext|>"; + const std::string user_str = "Hello"; // const std::string query = "\n<|im_start|>system\n" + system_str + "<|im_end|>\n<|im_start|>\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n"; - const std::string query = system_str + "\n" + user_str; + const std::string query = "\n<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n"; + // const std::string query = system_str + "\n" + user_str; auto tokens = tokenizer_->encode(query); std::string decode_str; printf("encode tokens = [ "); for (auto token : tokens) { + printf("%d, ", token); decode_str += tokenizer_->decode(token); } printf("]\n"); diff --git a/demo/web_demo.cpp b/demo/web_demo.cpp index a37a733e..c0e14704 100644 --- a/demo/web_demo.cpp +++ b/demo/web_demo.cpp @@ -20,7 +20,7 @@ int main(int argc, const char* argv[]) { std::string web_dir = argv[2]; std::cout << "model path is " << model_dir << std::endl; std::unique_ptr llm(Llm::createLLM(model_dir)); - llm->load(model_dir); + llm->load(); std::stringstream ss; httplib::Server svr; diff --git a/include/llm.hpp b/include/llm.hpp index 9ef2491e..95b4b59e 100644 --- a/include/llm.hpp +++ b/include/llm.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,79 @@ struct Prompt { std::vector tokens; }; +class LlmConfig { +public: + LlmConfig() {} + LlmConfig(const std::string& dir) { + base_dir_ = dir + "/"; + std::ifstream config_file(dir + "/config.json"); + if (config_file.is_open()) { + config_ = json::parse(config_file); + } else { + std::cerr << "Unable to open config file: " << dir << std::endl; + } + } + + std::string model_type() const { + return config_.value("model_type", "unknow"); + } + + std::string tokenizer_type() const { + return config_.value("tokenizer_type", "tiktoken"); + } + + std::string llm_model() const { + return base_dir_ + config_.value("llm_model", 
"llm.mnn"); + } + + std::string llm_weight() const { + return base_dir_ + config_.value("llm_weight", "llm.mnn.weight"); + } + + std::string embedding_file() const { + return base_dir_ + config_.value("embedding_file", "embeddings_bf16.bin"); + } + + std::string tokenizer_file() const { + return base_dir_ + config_.value("tokenizer_file", "tokenizer.txt"); + } + + int hidden_size() const { + return config_.value("hidden_size", 4096); + } + + std::vector key_value_shape() const { + return config_.value("key_value_shape", std::vector{}); + } + + std::vector stop_ids() const { + return config_.value("stop_ids", std::vector{}); + } + + std::string prompt_template() const { + return config_.value("prompt_template", ""); + } + + std::string backend_type() const { + return config_.value("backend_type", "cpu"); + } + + int thread_num() const { + return config_.value("thread_num", 4); + } + + std::string precision() const { + return config_.value("precision", "low"); + } + + std::string memory() const { + return config_.value("memory", "low"); + } +private: + std::string base_dir_; + json config_; +}; + class Llm { public: Llm() { @@ -75,7 +149,7 @@ class Llm { runtime_manager_.reset(); } static Llm* createLLM(const std::string& path, std::string model_type = "auto"); - void load(const std::string& model_dir); + void load(); void chat(); void warmup(); std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr); @@ -104,6 +178,7 @@ class Llm { // time int64_t prefill_us_ = 0; int64_t decode_us_ = 0; + LlmConfig config_; protected: VARP embedding(const std::vector& input_ids); VARP txt_embedding(const std::vector& input_ids); @@ -112,13 +187,12 @@ class Llm { protected: VARP inputs_embeds_, attention_mask_, position_ids_; // model configs - bool is_single_ = false; - bool is_disk_embedding_ = false; + bool is_single_ = true; + bool is_disk_embedding_ = true; bool is_visual_ = false; int layer_nums_ = 0; int hidden_size_ = 4096; std::vector key_value_shape_ = {}; - std::string disk_embedding_file_ = ""; // gen info float load_progress_ = 0.f; // tokenizer @@ -126,10 +200,10 @@ class Llm { std::shared_ptr visual_module_; private: virtual VARP visual_embedding(const std::vector& input_ids) { return nullptr; } - virtual std::vector tokenizer(const std::string& query) = 0; - virtual VARP gen_attention_mask(int seq_len) = 0; - virtual VARP gen_position_ids(int seq_len) = 0; - virtual bool is_stop(int token_id) = 0; + virtual std::vector tokenizer(const std::string& query); + virtual VARP gen_attention_mask(int seq_len); + virtual VARP gen_position_ids(int seq_len); + virtual bool is_stop(int token_id); private: // MNN Modules std::shared_ptr runtime_manager_; diff --git a/include/tokenizer.hpp b/include/tokenizer.hpp index 7711e0ee..cb81cff2 100644 --- a/include/tokenizer.hpp +++ b/include/tokenizer.hpp @@ -19,6 +19,7 @@ class Tokenizer { public: Tokenizer() = default; virtual ~Tokenizer() = default; + static Tokenizer* createTokenizer(const std::string& type); virtual bool load(const std::string& filename) = 0; virtual std::vector encode(const std::string& str) = 0; virtual std::string decode(int id) = 0; diff --git a/src/llm.cpp b/src/llm.cpp index 48acfc05..e3cfc31c 100644 --- a/src/llm.cpp +++ b/src/llm.cpp @@ -22,81 +22,54 @@ #endif // Llm start -Llm* Llm::createLLM(const std::string& path, std::string model_type) { - auto size = path.size(); +Llm* Llm::createLLM(const std::string& model_dir, std::string model_type) { + Llm* llm = new Llm; + 
llm->config_ = LlmConfig(model_dir); + llm->tokenizer_.reset(Tokenizer::createTokenizer(llm->config_.tokenizer_type())); + // llm->load(); + llm->key_value_shape_ = llm->config_.key_value_shape(); + llm->layer_nums_ = 24; + return llm; +} - // end with '.mnn' is single model file, otherwise split block models - bool is_single = (size > 4 && - path[size - 4] == '.' && - path[size - 3] == 'm' && - path[size - 2] == 'n' && - path[size - 1] == 'n'); - Llm* llm = nullptr; - if (model_type == "auto") { - model_type = path; - } - if (model_type.find("chatglm") != std::string::npos) { - if (model_type.find("chatglm2") != std::string::npos) { - llm = new Chatglm2_6b; - } else if (model_type.find("chatglm3") != std::string::npos) { - llm = new Chatglm2_6b; - llm->model_name_ = "Chatglm3_6b"; - } else { - llm = new Chatglm_6b; - } - } else if (model_type.find("codegeex2") != std::string::npos) { - llm = new Chatglm2_6b; - llm->model_name_ = "Codegeex2_6b"; - } else if (model_type.find("qwen1.5") != std::string::npos || - model_type.find("qwen2") != std::string::npos) { - if (model_type.find("0.5b") != std::string::npos) { - llm = new Qwen2_0_5b; - } else if (model_type.find("1.8b") != std::string::npos) { - llm = new Qwen2_1_8b; - } else if (model_type.find("4b") != std::string::npos) { - llm = new Qwen2_4b; - } else if (model_type.find("7b") != std::string::npos) { - llm = new Qwen2_7b; - } - } else if (model_type.find("qwen") != std::string::npos) { - if (model_type.find("1.8") != std::string::npos) { - llm = new Qwen_1_8b; - } else if (model_type.find("vl") != std::string::npos) { - llm = new Qwen_vl; - } else { - llm = new Qwen_7b; - } - } else if (model_type.find("llama2") != std::string::npos) { - llm = new Llama2_7b; - } else if (model_type.find("baichuan") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "Baichuan2_7b"; - } else if (model_type.find("phi2") != std::string::npos) { - llm = new Phi_2; - } else if (model_type.find("internlm") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "Internlm_7b"; - } else if (model_type.find("deepseek") != std::string::npos) { - llm = new Llama2_7b; - llm->model_name_ = "deepseek_7b"; - llm->layer_nums_ = 30; - } else if (model_type.find("tinyllama") != std::string::npos) { - llm = new TinyLlama; - llm->model_name_ = "TinyLlama"; - } else if (model_type.find("yi") != std::string::npos) { - llm = new Yi_6b; - llm->model_name_ = "Yi_6b"; - } else if (model_type.find("llama3") != std::string::npos) { - llm = new Llama3_8b; - llm->model_name_ = "Llama3_8b"; - } - if (!llm) { - std::cerr << "model type can't judge!" << std::endl; - return llm; +void Llm::load() { + // init runtime + ScheduleConfig config; + BackendConfig cpuBackendConfig; + config.type = static_cast(backend_type_);; + config.numThread = config_.thread_num(); + cpuBackendConfig.precision = BackendConfig::Precision_Low; + cpuBackendConfig.memory = BackendConfig::Memory_Low; + config.backendConfig = &cpuBackendConfig; + runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); + runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); + load_progress_ = 0.f; + printf("load tokenizer\n"); + // 1. 
load vocab + tokenizer_->load(config_.tokenizer_file()); + printf("load tokenizer Done\n"); + { + std::ifstream embedding_bin(config_.embedding_file()); + is_disk_embedding_ = embedding_bin.good(); + MNN_PRINT("### disk embedding is %d\n", is_disk_embedding_); + embedding_bin.close(); } - llm->is_single_ = is_single; - std::cout << "### model name : "<< llm->model_name_ << std::endl; - return llm; + // 2. load model + Module::Config module_config; + module_config.shapeMutable = true; + module_config.rearrange = true; + + key_value_shape_.insert(key_value_shape_.begin(), layer_nums_); + modules_.resize(1); + std::string model_path = config_.llm_model(); + std::string external_path = config_.llm_weight(); + MNN_PRINT("load %s ... ", model_path.c_str()); + runtime_manager_->setExternalFile(external_path); + modules_[0].reset(Module::load( + {"input_ids", "attention_mask", "position_ids", "past_key_values"}, + {"token_id", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + MNN_PRINT("Done!\n"); + load_progress_ += 90.f; } void Llm::chat() { @@ -231,103 +204,6 @@ void Llm::reset() { history_.clear(); } -void Llm::load(const std::string& model_dir) { - model_dir_ = model_dir; - // init - ScheduleConfig config; - BackendConfig cpuBackendConfig; - config.type = static_cast(backend_type_);; - config.numThread = thread_num_; - if (low_precision_) { - cpuBackendConfig.precision = BackendConfig::Precision_Low; - } - cpuBackendConfig.memory = BackendConfig::Memory_Low; - config.backendConfig = &cpuBackendConfig; - runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); - runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); - if (config.type == MNN_FORWARD_OPENCL) { - const char* cacheFileName = ".tempcache"; - // runtime_manager_->setCache(cacheFileName); - } - load_progress_ = 0.f; - printf("load tokenizer\n"); - // 1. load vocab - std::string tokenizer_path = model_dir + "/tokenizer.txt"; - if (is_single_) { - size_t pos = model_dir.find_last_of("/\\"); - std::string dir_path = (pos != std::string::npos) ? model_dir.substr(0, pos + 1) : ""; - model_dir_ = dir_path; - tokenizer_path = dir_path + "/tokenizer.txt"; - } - load_progress_ += 5.f; - tokenizer_->load(tokenizer_path); - load_progress_ += 5.f; - printf("load tokenizer Done\n"); - { - disk_embedding_file_ = model_dir_ + "/embeddings_bf16.bin"; - std::ifstream embedding_bin(disk_embedding_file_); - is_disk_embedding_ = embedding_bin.good(); - MNN_PRINT("### disk embedding is %d\n", is_disk_embedding_); - embedding_bin.close(); - } - // 2. load model - Module::Config module_config; - module_config.shapeMutable = true; - module_config.rearrange = true; - if (is_single_) { - key_value_shape_.insert(key_value_shape_.begin(), layer_nums_); - modules_.resize(1); - std::string model_path = model_dir; - std::string external_path = model_dir + ".weight"; - MNN_PRINT("load %s ... ", model_path.c_str()); - runtime_manager_->setExternalFile(external_path); - modules_[0].reset(Module::load( - {"input_ids", "attention_mask", "position_ids", "past_key_values"}, - {"token_id", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += 90.f; - } else { - // 2. load models - modules_.resize(layer_nums_ + 2); - float step = 90.0 / modules_.size(); - char buffer[50]; - // load lm model - std::string lm_model_path = model_dir + "/lm.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... 
", load_progress_, lm_model_path.c_str()); - modules_[layer_nums_].reset(Module::load({}, {}, lm_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += step; - if (!is_disk_embedding_) { - std::string embedding_model_path = model_dir + "/embedding.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, embedding_model_path.c_str());fflush(stdout); - modules_[layer_nums_ + 1].reset(Module::load({}, {}, embedding_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - load_progress_ += step; - } - if (is_visual_) { - std::string visual_model_path = model_dir + "/visual.mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, visual_model_path.c_str());fflush(stdout); - module_config.rearrange = false; - visual_module_.reset(Module::load({}, {}, visual_model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - module_config.rearrange = true; - } - // load glm_block models - for (int i = 0; i < layer_nums_; i++) { - load_progress_ += step; - std::string model_path = model_dir + "/block_" + std::to_string(i) + ".mnn"; - MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, model_path.c_str()); - modules_[i].reset(Module::load( - {"inputs_embeds", "attention_mask", "position_ids", "past_key_values"}, - {"hidden_states", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); - MNN_PRINT("Done!\n"); - } - } - if (config.type == MNN_FORWARD_OPENCL) { - // warmup(); - } -} - void Llm::warmup() { // warmup MNN_PRINT("### warmup ... "); @@ -401,19 +277,20 @@ VARP Llm::txt_embedding(const std::vector& input_ids) { } AUTOTIME; // disk embedding to save memory + int hidden_size = config_.hidden_size(); int seq_len = static_cast(input_ids.size()); if (needNewVar(inputs_embeds_, 0, seq_len)) { - inputs_embeds_ = _Input({seq_len, 1, hidden_size_}, NCHW); + inputs_embeds_ = _Input({seq_len, 1, hidden_size}, NCHW); } - size_t size = hidden_size_ * sizeof(int16_t); - FILE* file = fopen(disk_embedding_file_.c_str(), "rb"); - std::unique_ptr buffer(new int16_t[hidden_size_]); + size_t size = hidden_size * sizeof(int16_t); + FILE* file = fopen(config_.embedding_file().c_str(), "rb"); + std::unique_ptr buffer(new int16_t[hidden_size]); for (size_t i = 0; i < seq_len; i++) { fseek(file, input_ids[i] * size, SEEK_SET); fread(buffer.get(), 1, size, file); - auto ptr = inputs_embeds_->writeMap() + i * hidden_size_ * 2; - for (int j = 0; j < hidden_size_; j++) { + auto ptr = inputs_embeds_->writeMap() + i * hidden_size * 2; + for (int j = 0; j < hidden_size; j++) { ptr[j * 2] = 0; ptr[j * 2 + 1] = buffer[j]; } @@ -444,6 +321,51 @@ std::string Llm::decode(int id) { return word; } +// Llm +std::vector Llm::tokenizer(const std::string& query) { + auto ids = tokenizer_encode(query); + // auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n"; + ids.insert(ids.begin(), {198, 151644, 872, 198}); + ids.insert(ids.end(), {151645, 198, 151644, 77091, 198}); + return ids; +} + +VARP Llm::gen_attention_mask(int seq_len) { + if (needNewVar(attention_mask_, 2, seq_len)) { + attention_mask_ = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of()); + } else { + return attention_mask_; + } + auto ptr = attention_mask_->writeMap(); + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) { + ptr[seq_len * i + j] = j <= i; + } + } + return attention_mask_; +} + +VARP Llm::gen_position_ids(int seq_len) { + if (needNewVar(position_ids_, 0, seq_len) || 0) { + 
position_ids_ = _Input({seq_len}, NCHW, halide_type_of()); + } + auto ptr = position_ids_->writeMap(); + if (seq_len == 1) { + ptr[0] = all_seq_len_; + } else { + for (int i = 0; i < seq_len; i++) { + ptr[i] = i; + } + } + return position_ids_; +} + +bool Llm::is_stop(int token_id) { + // <|endoftext|> <|im_end|> + return token_id == 151643 || token_id == 151645; +} + +#if 0 // Chatglm_6b std::vector Chatglm_6b::tokenizer(const std::string& query) { auto ids = tokenizer_encode(query); @@ -832,6 +754,7 @@ std::vector Llama3_8b::tokenizer(const std::string& query) { bool Llama3_8b::is_stop(int token_id) { return token_id == 128001 || token_id == 128009; } +#endif // Llm end // Embedding start @@ -1316,7 +1239,7 @@ Pipeline* Pipeline::load(const std::string& path) { } if (config.contains("llm")) { pipeline->llm_.reset(Llm::createLLM(config["llm"])); - pipeline->llm_->load(config["llm"]); + pipeline->llm_->load(); } if (config.contains("embedding")) { pipeline->embedding_.reset(Embedding::createEmbedding(config["embedding"])); diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 63be777b..641eaa2e 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -78,6 +78,22 @@ static inline void to_lower_case(std::string& str) { } } +Tokenizer* Tokenizer::createTokenizer(const std::string& type) { + if (type == "sentencepiece") { + return new Sentencepiece(); + } + if (type == "tiktoken") { + return new Tiktoken(); + } + if (type == "bert") { + return new BertTokenizer(); + } + if (type == "huggingface") { + return new HuggingfaceTokenizer(); + } + return nullptr; +} + bool Sentencepiece::load(const std::string& filename) { std::ifstream tok_file(filename); std::string line, token;
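
For reference, a minimal config.json covering the keys that LlmConfig reads in this patch. Every value below is either the fallback default written in the code or taken from the patch itself; key_value_shape and stop_ids default to empty lists in the code, so the shape shown is only an illustrative Qwen-style placeholder and the stop ids simply mirror the ids hardcoded in Llm::is_stop:

    {
        "model_type": "unknow",
        "tokenizer_type": "tiktoken",
        "llm_model": "llm.mnn",
        "llm_weight": "llm.mnn.weight",
        "embedding_file": "embeddings_bf16.bin",
        "tokenizer_file": "tokenizer.txt",
        "hidden_size": 4096,
        "key_value_shape": [2, 1, 0, 16, 128],
        "stop_ids": [151643, 151645],
        "prompt_template": "",
        "backend_type": "cpu",
        "thread_num": 4,
        "precision": "low",
        "memory": "low"
    }

As of this patch only tokenizer_type, tokenizer_file, embedding_file, llm_model, llm_weight, hidden_size, key_value_shape and thread_num are actually consumed: load() still hardcodes Precision_Low/Memory_Low and the backend_type_ member, and tokenizer()/is_stop() still hardcode the Qwen prompt and stop token ids, so backend_type, precision, memory, prompt_template and stop_ids are parsed but not yet used.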