Skip to content

Commit 506c6f4

Browse files
committed
feat: add detokenize method to Model
1 parent 2613efe commit 506c6f4

File tree

4 files changed

+53
-0
lines changed

4 files changed

+53
-0
lines changed

ext/llama_cpp/dummy.rb

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,14 @@ def has_encoder?; end # rubocop:disable Naming/PredicateName
# The token id that must be fed to the decoder first to begin generating
# the output sequence of an encoder-decoder model.
#
# @return [Integer]
def decoder_start_token; end
# Converts the given token ids back into the text they encode.
#
# @param tokens [Array<Integer>] The token ids to convert.
# @param remove_special [Boolean] Whether removing BOS/EOS tokens from the result is allowed.
# @param unparse_special [Boolean] Whether special tokens are rendered in the output.
# @return [String]
def detokenize(tokens, remove_special: false, unparse_special: false); end
580588
end
581589

582590
# Class for model KV override.

ext/llama_cpp/llama_cpp.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1554,6 +1554,7 @@ class RbLLaMAModel {
15541554
rb_define_method(rb_cLLaMAModel, "token_is_control?", RUBY_METHOD_FUNC(_llama_model_token_is_control), 1);
15551555
rb_define_method(rb_cLLaMAModel, "has_encoder?", RUBY_METHOD_FUNC(_llama_model_has_encoder), 0);
15561556
rb_define_method(rb_cLLaMAModel, "decoder_start_token", RUBY_METHOD_FUNC(_llama_model_decoder_start_token), 0);
1557+
rb_define_method(rb_cLLaMAModel, "detokenize", RUBY_METHOD_FUNC(_llama_model_detokenize), -1);
15571558
}
15581559

15591560
private:
@@ -1906,6 +1907,48 @@ class RbLLaMAModel {
19061907
LLaMAModelWrapper* ptr = get_llama_model(self);
19071908
return INT2NUM(llama_model_decoder_start_token(ptr->model));
19081909
}
1910+
1911+
// Converts an array of token ids back into a UTF-8 Ruby String.
//
// Ruby signature:
//   detokenize(tokens, remove_special: false, unparse_special: false) -> String
//
// @param tokens [Array<Integer>] token ids to convert.
// @param remove_special [Boolean] whether removing BOS/EOS tokens is allowed.
// @param unparse_special [Boolean] whether special tokens are rendered in the output.
// @return [String] the detokenized text.
// @raise [ArgumentError] if tokens is not an Array.
// @raise [RuntimeError] if detokenization fails even after growing the buffer.
static VALUE _llama_model_detokenize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[2] = { rb_intern("remove_special"), rb_intern("unparse_special") };
  VALUE kw_values[2] = { Qundef, Qundef };
  VALUE tokens_ = Qnil;
  rb_scan_args(argc, argv, "1:", &tokens_, &kw_args);
  rb_get_kwargs(kw_args, kw_table, 0, 2, kw_values);

  if (!RB_TYPE_P(tokens_, T_ARRAY)) {
    rb_raise(rb_eArgError, "tokens must be an array");
    return Qnil;
  }

  const int32_t n_tokens = static_cast<int32_t>(RARRAY_LEN(tokens_));
  llama_token* tokens = ALLOCA_N(llama_token, n_tokens);
  for (int32_t i = 0; i < n_tokens; i++) {
    tokens[i] = NUM2INT(rb_ary_entry(tokens_, i));
  }

  const bool remove_special = kw_values[0] != Qundef ? RTEST(kw_values[0]) : false;
  const bool unparse_special = kw_values[1] != Qundef ? RTEST(kw_values[1]) : false;

  LLaMAModelWrapper* ptr = get_llama_model(self);

  // First attempt with a buffer at least as large as the token count.
  // llama_detokenize returns the negated required length when the buffer
  // is too small.
  std::string text;
  text.resize(std::max(text.capacity(), static_cast<size_t>(n_tokens)));
  int32_t n_chars = llama_detokenize(ptr->model, tokens, n_tokens, &text[0], static_cast<int32_t>(text.size()), remove_special, unparse_special);
  if (n_chars < 0) {
    // Grow to the exact required length and retry with the NEW size.
    // (Bug fix: the original retried with the stale pre-resize length,
    // guaranteeing the retry would also fail.)
    text.resize(static_cast<size_t>(-n_chars));
    n_chars = llama_detokenize(ptr->model, tokens, n_tokens, &text[0], static_cast<int32_t>(text.size()), remove_special, unparse_special);
    // Bug fix: the original raised when `n_chars <= text.size()`, i.e. on
    // SUCCESS. A second failure is a negative result or one that still
    // exceeds the buffer.
    if (n_chars < 0 || static_cast<size_t>(n_chars) > text.size()) {
      rb_raise(rb_eRuntimeError, "Failed to detokenize");
      return Qnil;
    }
  }

  text.resize(static_cast<size_t>(n_chars));
  // Use the explicit-length constructor so embedded NUL bytes in the
  // detokenized text are preserved (rb_utf8_str_new_cstr would truncate).
  return rb_utf8_str_new(text.data(), static_cast<long>(text.size()));
}
19091952
};
19101953

19111954
const rb_data_type_t RbLLaMAModel::llama_model_type = {

ext/llama_cpp/llama_cpp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef LLAMA_CPP_RB_H
22
#define LLAMA_CPP_RB_H 1
33

4+
#include <algorithm>
45
#include <sstream>
56
#include <string>
67
#include <vector>

sig/llama_cpp.rbs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ module LLaMACpp
188188
def token_is_control?: (Integer) -> bool
189189
def has_encoder?: () -> bool
190190
def decoder_start_token: () -> Integer
191+
# Converts token ids back into text; keyword flags control special-token handling.
def detokenize: (Array[Integer], ?remove_special: bool, ?unparse_special: bool) -> String
191192
end
192193

193194
class Timings

0 commit comments

Comments
 (0)