From fc9e3fcac461ca4653ecead0e2b195e0eecd4181 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 12 May 2025 14:47:10 -0700
Subject: [PATCH 1/2] Tokenizer clang format

Summary: (Diff train forward fix)

Differential Revision: D74597256
---
 src/hf_tokenizer.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/hf_tokenizer.cpp b/src/hf_tokenizer.cpp
index a04005b..84467ed 100644
--- a/src/hf_tokenizer.cpp
+++ b/src/hf_tokenizer.cpp
@@ -147,15 +147,15 @@ Error HFTokenizer::load(const std::string& path) {
 
   // Pull out the token strings
   try {
-    const std::string bos_token =
-        parsed_config_json.contains("bos_token") && !parsed_config_json["bos_token"].is_null()
-            ? parsed_config_json["bos_token"].get<std::string>()
-            : "";
+    const std::string bos_token = parsed_config_json.contains("bos_token") &&
+            !parsed_config_json["bos_token"].is_null()
+        ? parsed_config_json["bos_token"].get<std::string>()
+        : "";
 
-    const std::string eos_token =
-        parsed_config_json.contains("eos_token") && !parsed_config_json["eos_token"].is_null()
-            ? parsed_config_json["eos_token"].get<std::string>()
-            : "";
+    const std::string eos_token = parsed_config_json.contains("eos_token") &&
+            !parsed_config_json["eos_token"].is_null()
+        ? parsed_config_json["eos_token"].get<std::string>()
+        : "";
     const auto bos_res = special_token_map_->tryGetInteger(bos_token);
     const auto eos_res = special_token_map_->tryGetInteger(eos_token);
     if (!bos_res) {

From 59c4d65669fa494b5c66ddbc2115bd2a62c9ad59 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Mon, 12 May 2025 14:47:10 -0700
Subject: [PATCH 2/2] Tokenizer clang format

Differential Revision: D74609261
---
 src/hf_tokenizer.cpp | 3 ++-
 src/tiktoken.cpp     | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/hf_tokenizer.cpp b/src/hf_tokenizer.cpp
index 84467ed..fa62264 100644
--- a/src/hf_tokenizer.cpp
+++ b/src/hf_tokenizer.cpp
@@ -71,7 +71,8 @@ Error HFTokenizer::load(const std::string& path) {
         [](const auto& it) -> std::uint64_t { return it.at("id"); }));
 
     // Create special token regex to help later with encoding.
-    special_token_regex_ = TK_UNWRAP(detail::build_special_token_regex(special_token_map));
+    special_token_regex_ =
+        TK_UNWRAP(detail::build_special_token_regex(special_token_map));
 
     // Store for future use.
     special_token_map_.emplace(std::move(special_token_map));
diff --git a/src/tiktoken.cpp b/src/tiktoken.cpp
index 9a3565d..c112221 100644
--- a/src/tiktoken.cpp
+++ b/src/tiktoken.cpp
@@ -46,7 +46,6 @@ static Result<std::unique_ptr<IRegex>> _create_regex(
   return create_regex(pattern);
 }
 
-
 static Result<std::pair<std::string, std::uint64_t>> _parse(
     const std::string& line) {
   // Tiktoken format
@@ -138,7 +137,7 @@ Error Tiktoken::load(const std::string& path) {
   _regex = TK_UNWRAP(_create_regex(_pattern));
 
   special_token_regex_ =
-    TK_UNWRAP(detail::build_special_token_regex(TokenMap(special_token_map)));
+      TK_UNWRAP(detail::build_special_token_regex(TokenMap(special_token_map)));
 
   // initialize vocab_size, bos_tok, eos_tok
   vocab_size_ = token_map_->size() + special_token_map_->size();
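
Note: both patches above are formatting-only; the logic they touch is unchanged. As a minimal, self-contained illustration of the null-safe token extraction that PATCH 1/2 rewraps, the sketch below reproduces the same pattern in a standalone program. It assumes nlohmann::json (suggested by the contains()/is_null()/get<std::string>() calls in the diff); the helper name get_token_or_empty and the sample config are hypothetical, not taken from the tokenizer sources.

// Minimal sketch (not from the repository): the null-safe extraction that
// PATCH 1/2 rewraps, shown as a standalone program using nlohmann::json.
#include <iostream>
#include <string>

#include <nlohmann/json.hpp>

// Hypothetical helper: returns the string value for `key`, or "" when the
// key is missing or explicitly null in the parsed config.
static std::string get_token_or_empty(
    const nlohmann::json& config,
    const std::string& key) {
  return config.contains(key) && !config[key].is_null()
      ? config[key].get<std::string>()
      : "";
}

int main() {
  // Mimics a tokenizer_config.json where eos_token is explicitly null.
  const auto parsed_config_json =
      nlohmann::json::parse(R"({"bos_token": "<s>", "eos_token": null})");

  const std::string bos_token =
      get_token_or_empty(parsed_config_json, "bos_token");
  const std::string eos_token =
      get_token_or_empty(parsed_config_json, "eos_token");

  std::cout << "bos=\"" << bos_token << "\" eos=\"" << eos_token << "\"\n";
  return 0;
}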