
Commit 0f93158

set default to 0
1 parent 76c7d76 commit 0f93158

2 files changed: +11, -14 lines changed

llama_cpp/llama.py

Lines changed: 8 additions & 11 deletions
@@ -891,7 +891,7 @@ def _create_completion(
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: Optional[int] = 1,
+        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -983,12 +983,11 @@ def logit_bias_processor(
             else (self._n_ctx - len(prompt_tokens))
         )

-        if min_tokens is not None:
+        if min_tokens > 0:
             def min_length_logits_processor(
                 input_ids: npt.NDArray[np.intc],
                 scores: npt.NDArray[np.single],
             ) -> npt.NDArray[np.single]:
-                print(f"{input_ids=}, {len(prompt_tokens)=}, {len(input_ids)=}, {self._token_eos=}")
                 # Does it make sense to copy the whole array or can we just overwrite the original one?
                 new_scores = np.copy(scores)
                 if len(input_ids) - len(prompt_tokens) < min_tokens:
@@ -1000,8 +999,6 @@ def min_length_logits_processor(
                 logits_processor = _min_length_logits_processor
             else:
                 logits_processor = logits_processor.extend(_min_length_logits_processor)
-        else:
-            assert False

         if stop != []:
             stop_sequences = [s.encode("utf-8") for s in stop]
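The processor touched by these two hunks suppresses the end-of-sequence token until at least min_tokens completion tokens have been produced. A minimal standalone sketch of that idea, assuming NumPy token IDs and an eos_token_id/prompt_length supplied by the caller (illustrative names, not the exact ones in llama.py):

import numpy as np
import numpy.typing as npt


def make_min_length_processor(min_tokens: int, prompt_length: int, eos_token_id: int):
    """Build a logits processor that blocks EOS until min_tokens have been generated."""

    def processor(
        input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
    ) -> npt.NDArray[np.single]:
        generated = len(input_ids) - prompt_length  # completion tokens emitted so far
        if generated < min_tokens:
            new_scores = np.copy(scores)        # leave the original logits untouched
            new_scores[eos_token_id] = -np.inf  # EOS can no longer be sampled
            return new_scores
        return scores

    return processor

With min_tokens now defaulting to 0, a processor like this is only registered when the caller explicitly asks for a minimum length (the `if min_tokens > 0:` guard above); the debug print and the dead `assert False` branch are removed.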
@@ -1420,7 +1417,7 @@ def create_completion(
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: Optional[int] = 1,
+        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -1450,7 +1447,7 @@ def create_completion(
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-            min_tokens: The minimum number of tokens to generate.
+            min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
@@ -1520,7 +1517,7 @@ def __call__(
         prompt: str,
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: Optional[int] = 1,
+        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -1550,7 +1547,7 @@ def __call__(
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-            min_tokens: The minimum number of tokens to generate.
+            min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
@@ -1627,7 +1624,7 @@ def create_chat_completion(
         seed: Optional[int] = None,
         response_format: Optional[ChatCompletionRequestResponseFormat] = None,
         max_tokens: Optional[int] = None,
-        min_tokens: Optional[int] = 1,
+        min_tokens: int = 0,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
@@ -1662,7 +1659,7 @@ def create_chat_completion(
            seed: The seed to use for sampling.
            response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json.
            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-           min_tokens: The minimum number of tokens to generate.
+           min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).
            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
            repeat_penalty: The penalty to apply to repeated tokens.
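Taken together, the llama.py changes make a minimum completion length strictly opt-in. A hedged usage sketch of the Python API after this commit (the model path is a placeholder):

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # placeholder path

# min_tokens now defaults to 0 (no minimum); pass it explicitly to enforce a floor.
out = llm.create_completion(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=64,
    min_tokens=8,  # EOS is suppressed until at least 8 tokens are generated
)
print(out["choices"][0]["text"])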

llama_cpp/server/types.py

Lines changed: 3 additions & 3 deletions
@@ -17,7 +17,7 @@
 )

 min_tokens_field = Field(
-    default=1, ge=1, description="The minimum number of tokens to generate."
+    default=0, ge=0, description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop)."
 )

 temperature_field = Field(
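For illustration, the ge=0 constraint on the field above means negative values are rejected at request-validation time. A small self-contained sketch using the same Field pattern (DemoRequest and min_tokens_demo_field are stand-ins, not the real server types):

from pydantic import BaseModel, Field, ValidationError

min_tokens_demo_field = Field(
    default=0, ge=0, description="The minimum number of tokens to generate."
)


class DemoRequest(BaseModel):
    min_tokens: int = min_tokens_demo_field


print(DemoRequest().min_tokens)  # -> 0, the new default
try:
    DemoRequest(min_tokens=-1)   # violates ge=0
except ValidationError:
    print("min_tokens=-1 rejected by the ge=0 constraint")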
@@ -117,7 +117,7 @@ class CreateCompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(
         default=16, ge=0, description="The maximum number of tokens to generate."
     )
-    min_tokens: Optional[int] = min_tokens_field
+    min_tokens: int = min_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
@@ -213,7 +213,7 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
-    min_tokens: Optional[int] = min_tokens_field
+    min_tokens: int = min_tokens_field
     logprobs: Optional[bool] = Field(
         default=False,
         description="Whether to output the logprobs or not. Default is True"
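On the server side the same default applies, so request bodies may omit min_tokens entirely. A hedged example against a locally running llama_cpp.server instance (URL and port are assumptions):

import json
import urllib.request

# Assumes `python -m llama_cpp.server` is listening on localhost:8000.
body = json.dumps({
    "prompt": "Write one sentence about llamas.",
    "max_tokens": 32,
    "min_tokens": 4,  # optional now that the default is 0
}).encode("utf-8")

req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=body,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["choices"][0]["text"])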
