
Commit 0f93158

set default to 0
1 parent 76c7d76 commit 0f93158

2 files changed: +11, -14 lines changed

llama_cpp/llama.py

Lines changed: 8 additions & 11 deletions
@@ -891,7 +891,7 @@ def _create_completion(
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: Optional[int] = 1,
+        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -983,12 +983,11 @@ def logit_bias_processor(
             else (self._n_ctx - len(prompt_tokens))
         )

-        if min_tokens is not None:
+        if min_tokens > 0:
             def min_length_logits_processor(
                 input_ids: npt.NDArray[np.intc],
                 scores: npt.NDArray[np.single],
             ) -> npt.NDArray[np.single]:
-                print(f"{input_ids=}, {len(prompt_tokens)=}, {len(input_ids)=}, {self._token_eos=}")
                 # Does it make sense to copy the whole array or can we just overwrite the original one?
                 new_scores = np.copy(scores)
                 if len(input_ids) - len(prompt_tokens) < min_tokens:
@@ -1000,8 +999,6 @@ def min_length_logits_processor(
                 logits_processor = _min_length_logits_processor
             else:
                 logits_processor = logits_processor.extend(_min_length_logits_processor)
-        else:
-            assert False

         if stop != []:
             stop_sequences = [s.encode("utf-8") for s in stop]
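The processor touched by these two hunks suppresses the end-of-sequence token until at least min_tokens completion tokens have been produced. A minimal standalone sketch of that idea, assuming NumPy token IDs and an eos_token_id/prompt_length supplied by the caller (illustrative names, not the exact ones in llama.py):

import numpy as np
import numpy.typing as npt


def make_min_length_processor(min_tokens: int, prompt_length: int, eos_token_id: int):
    """Build a logits processor that blocks EOS until min_tokens have been generated."""

    def processor(
        input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
    ) -> npt.NDArray[np.single]:
        generated = len(input_ids) - prompt_length  # completion tokens emitted so far
        if generated < min_tokens:
            new_scores = np.copy(scores)        # leave the original logits untouched
            new_scores[eos_token_id] = -np.inf  # EOS can no longer be sampled
            return new_scores
        return scores

    return processor

With min_tokens now defaulting to 0, a processor like this is only registered when the caller explicitly asks for a minimum length (the `if min_tokens > 0:` guard above); the debug print and the dead `assert False` branch are removed.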
@@ -1420,7 +1417,7 @@ def create_completion(
         prompt: Union[str, List[int]],
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: Optional[int] = 1,
+        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -1450,7 +1447,7 @@ def create_completion(
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-            min_tokens: The minimum number of tokens to generate.
+            min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
@@ -1520,7 +1517,7 @@ def __call__(
         prompt: str,
         suffix: Optional[str] = None,
         max_tokens: Optional[int] = 16,
-        min_tokens: Optional[int] = 1,
+        min_tokens: int = 0,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -1550,7 +1547,7 @@ def __call__(
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-            min_tokens: The minimum number of tokens to generate.
+            min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
@@ -1627,7 +1624,7 @@ def create_chat_completion(
         seed: Optional[int] = None,
         response_format: Optional[ChatCompletionRequestResponseFormat] = None,
         max_tokens: Optional[int] = None,
-        min_tokens: Optional[int] = 1,
+        min_tokens: int = 0,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
@@ -1662,7 +1659,7 @@ def create_chat_completion(
            seed: The seed to use for sampling.
            response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json.
            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
-           min_tokens: The minimum number of tokens to generate.
+           min_tokens: The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).
            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
            repeat_penalty: The penalty to apply to repeated tokens.
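Taken together, the llama.py changes make a minimum completion length strictly opt-in. A hedged usage sketch of the Python API after this commit (the model path is a placeholder):

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # placeholder path

# min_tokens now defaults to 0 (no minimum); pass it explicitly to enforce a floor.
out = llm.create_completion(
    "Q: Name the planets in the solar system. A: ",
    max_tokens=64,
    min_tokens=8,  # EOS is suppressed until at least 8 tokens are generated
)
print(out["choices"][0]["text"])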

llama_cpp/server/types.py

Lines changed: 3 additions & 3 deletions
@@ -17,7 +17,7 @@
 )

 min_tokens_field = Field(
-    default=1, ge=1, description="The minimum number of tokens to generate."
+    default=0, ge=0, description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop)."
 )

 temperature_field = Field(
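For illustration, the ge=0 constraint on the field above means negative values are rejected at request-validation time. A small self-contained sketch using the same Field pattern (DemoRequest and min_tokens_demo_field are stand-ins, not the real server types):

from pydantic import BaseModel, Field, ValidationError

min_tokens_demo_field = Field(
    default=0, ge=0, description="The minimum number of tokens to generate."
)


class DemoRequest(BaseModel):
    min_tokens: int = min_tokens_demo_field


print(DemoRequest().min_tokens)  # -> 0, the new default
try:
    DemoRequest(min_tokens=-1)   # violates ge=0
except ValidationError:
    print("min_tokens=-1 rejected by the ge=0 constraint")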
@@ -117,7 +117,7 @@ class CreateCompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(
         default=16, ge=0, description="The maximum number of tokens to generate."
     )
-    min_tokens: Optional[int] = min_tokens_field
+    min_tokens: int = min_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
@@ -213,7 +213,7 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
-    min_tokens: Optional[int] = min_tokens_field
+    min_tokens: int = min_tokens_field
     logprobs: Optional[bool] = Field(
         default=False,
         description="Whether to output the logprobs or not. Default is True"
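On the server side the same default applies, so request bodies may omit min_tokens entirely. A hedged example against a locally running llama_cpp.server instance (URL and port are assumptions):

import json
import urllib.request

# Assumes `python -m llama_cpp.server` is listening on localhost:8000.
body = json.dumps({
    "prompt": "Write one sentence about llamas.",
    "max_tokens": 32,
    "min_tokens": 4,  # optional now that the default is 0
}).encode("utf-8")

req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=body,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["choices"][0]["text"])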
