Pass min_tokens through the chat completion handlers

twaka · twaka · commit c8e44a7a5846 · 2024-04-08T19:22:06.000+09:00
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -63,6 +63,7 @@ def __call__(
             llama_types.ChatCompletionRequestResponseFormat
         ] = None,
         max_tokens: Optional[int] = None,
+        min_tokens: int = 0,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
@@ -501,6 +502,7 @@ def chat_completion_handler(
             llama_types.ChatCompletionRequestResponseFormat
         ] = None,
         max_tokens: Optional[int] = None,
+        min_tokens: int = 0,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
@@ -586,6 +588,7 @@ def chat_completion_handler(
             stop=stop,
             seed=seed,
             max_tokens=max_tokens,
+            min_tokens=min_tokens,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
             repeat_penalty=repeat_penalty,
@@ -1295,6 +1298,7 @@ def functionary_chat_handler(
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
+    min_tokens: int = 0,
     presence_penalty: float = 0.0,
     frequency_penalty: float = 0.0,
     repeat_penalty: float = 1.1,
@@ -1501,6 +1505,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
             stream=stream,
             stop=["user:", "</s>"],
             max_tokens=max_tokens,
+            min_tokens=min_tokens,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
             repeat_penalty=repeat_penalty,
@@ -1577,6 +1582,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
         stream=False,
         grammar=grammar,
         max_tokens=max_tokens,
+        min_tokens=min_tokens,
         temperature=temperature,
         top_p=top_p,
         top_k=top_k,
@@ -1654,6 +1660,7 @@ def functionary_v1_v2_chat_handler(
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
+    min_tokens: int = 0,
     presence_penalty: float = 0.0,
     frequency_penalty: float = 0.0,
     repeat_penalty: float = 1.1,
@@ -1860,6 +1867,7 @@ def prepare_messages_for_inference(
             stream=stream,
             stop=stop,
             max_tokens=max_tokens,
+            min_tokens=min_tokens,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
             repeat_penalty=repeat_penalty,
@@ -1920,6 +1928,7 @@ def create_completion(stop):
             stream=False,
             stop=stop,
             max_tokens=max_tokens,
+            min_tokens=min_tokens,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
             repeat_penalty=repeat_penalty,
@@ -2157,6 +2166,7 @@ def __call__(
             llama_types.ChatCompletionRequestResponseFormat
         ] = None,
         max_tokens: Optional[int] = None,
+        min_tokens: int = 0,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
@@ -2269,6 +2279,7 @@ def __call__(
                 stream=stream,
                 stop=stop,
                 max_tokens=max_tokens,
+                min_tokens=min_tokens,
                 presence_penalty=presence_penalty,
                 frequency_penalty=frequency_penalty,
                 repeat_penalty=repeat_penalty,
@@ -2301,6 +2312,7 @@ def chatml_function_calling(
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
+    min_tokens: int = 0,
     presence_penalty: float = 0.0,
     frequency_penalty: float = 0.0,
     repeat_penalty: float = 1.1,
@@ -2427,6 +2439,7 @@ def chatml_function_calling(
                 stream=stream,
                 stop=stop,
                 max_tokens=max_tokens,
+                min_tokens=min_tokens,
                 presence_penalty=presence_penalty,
                 frequency_penalty=frequency_penalty,
                 repeat_penalty=repeat_penalty,
@@ -2479,6 +2492,7 @@ def chatml_function_calling(
             stream=stream,
             stop=stop,
             max_tokens=max_tokens,
+            min_tokens=min_tokens,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
             repeat_penalty=repeat_penalty,
@@ -2523,6 +2537,7 @@ def chatml_function_calling(
         stream=False,
         stop=[":"],
         max_tokens=None,
+        min_tokens=min_tokens,
         presence_penalty=presence_penalty,
         frequency_penalty=frequency_penalty,
         repeat_penalty=repeat_penalty,
@@ -2550,6 +2565,7 @@ def chatml_function_calling(
                 stream=stream,
                 stop=["<|im_end|>"],
                 max_tokens=None,
+                min_tokens=min_tokens,
                 presence_penalty=presence_penalty,
                 frequency_penalty=frequency_penalty,
                 repeat_penalty=repeat_penalty,
@@ -2597,6 +2613,7 @@ def chatml_function_calling(
                 stream=False,
                 stop=stop,
                 max_tokens=None,
+                min_tokens=min_tokens,
                 presence_penalty=presence_penalty,
                 frequency_penalty=frequency_penalty,
                 repeat_penalty=repeat_penalty,
@@ -2624,6 +2641,7 @@ def chatml_function_calling(
                 stream=False,
                 stop=stop,
                 max_tokens=None,
+                min_tokens=min_tokens,
                 presence_penalty=presence_penalty,
                 frequency_penalty=frequency_penalty,
                 repeat_penalty=repeat_penalty,