From a2aaec5e3e0495bd23a2469ef05cce097a0d128d Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 21 May 2025 15:49:09 -0700 Subject: [PATCH 1/2] [Serve, LLM] feat: add missing `repetition_penalty` vLLM sampling param Signed-off-by: Arthur --- .../llm/_internal/serve/deployments/llm/vllm/vllm_engine.py | 3 +++ .../llm/_internal/serve/deployments/llm/vllm/vllm_models.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index b70b3ea13314a..554a363677716 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -919,6 +919,9 @@ def _parse_sampling_params( frequency_penalty=sampling_params.frequency_penalty if sampling_params.frequency_penalty is not None else 0.0, + repetition_penalty=sampling_params.repetition_penalty + if sampling_params.repetition_penalty is not None + else 1.0, temperature=sampling_params.temperature if sampling_params.temperature is not None else 1.0, diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py index 39080404e8bdb..4c323a90192c7 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py @@ -211,11 +211,16 @@ class VLLMSamplingParams(SamplingParams): Args: top_k: The number of highest probability vocabulary tokens to keep for top-k-filtering. seed: Seed for deterministic sampling with temperature>0. + repetition_penalty: Float that penalizes new tokens based on whether they + appear in the prompt and the generated text so far. Values > 1 encourage + the model to use new tokens, while values < 1 encourage the model to repeat + tokens. """ _ignored_fields = {"best_of", "n", "logit_bias"} top_k: Optional[int] = None + repetition_penalty: Optional[float] = None seed: Optional[int] = None From abc4fc4e61ee80ef306ca5f9120588c3c8825ab2 Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 21 May 2025 16:45:35 -0700 Subject: [PATCH 2/2] [Serve, LLM] tests: updated tests with new `repetition_penalty` param Signed-off-by: Arthur --- .../cpu/deployments/llm/multiplex/test_multiplex_deployment.py | 2 ++ python/ray/llm/tests/serve/mocks/mock_vllm_engine.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_multiplex_deployment.py b/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_multiplex_deployment.py index 8bf0c152beb8c..d19dc4537479b 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_multiplex_deployment.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/multiplex/test_multiplex_deployment.py @@ -74,6 +74,7 @@ "ignore_eos": None, "presence_penalty": None, "frequency_penalty": None, + "repetition_penalty": None, "best_of": 1, "response_format": None, "top_k": None, @@ -177,6 +178,7 @@ async def test_multiplex_deployment( "ignore_eos": None, "presence_penalty": None, "frequency_penalty": None, + "repetition_penalty": None, "top_k": None, "response_format": None, "logprobs": None, diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 6a6850db88228..fbc69b554e13f 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -162,6 +162,9 @@ def _parse_sampling_params( frequency_penalty=sampling_params.frequency_penalty if sampling_params.frequency_penalty is not None else 0.0, + repetition_penalty=sampling_params.repetition_penalty + if sampling_params.repetition_penalty is not None + else 1.0, temperature=sampling_params.temperature if sampling_params.temperature is not None else 1.0,