Commit 1c6c748

Merge remote-tracking branch 'upstream/main' into kuntai-fix-a100-perf
2 parents 2ed6ffc + 22481fb commit 1c6c748

30 files changed (+466, −322 lines)

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
(deployment-dify)=

# Dify

[Dify](https://github.com/langgenius/dify) is an open-source LLM app development platform. Its intuitive interface combines agentic AI workflow, RAG pipeline, agent capabilities, model management, observability features, and more, allowing you to quickly move from prototype to production.

It supports vLLM as a model provider to efficiently serve large language models.

This guide walks you through deploying Dify using a vLLM backend.

## Prerequisites

- Set up the vLLM environment
- Install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/)

## Deploy

- Start the vLLM server with a supported chat completion model, e.g.

```console
vllm serve Qwen/Qwen1.5-7B-Chat
```

- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):

```console
git clone https://github.com/langgenius/dify.git
cd dify
cd docker
cp .env.example .env
docker compose up -d
```

- Open your browser at `http://localhost/install`, configure the basic login information, and log in.

- In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.

- Fill in the model provider details as follows:
  - **Model Type**: `LLM`
  - **Model Name**: `Qwen/Qwen1.5-7B-Chat`
  - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1`
  - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
  - **Completion Mode**: `Completion`

:::{image} /assets/deployment/dify-settings.png
:::

- To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:

:::{image} /assets/deployment/dify-create-chatbot.png
:::

- Click the chatbot you just created to open the chat interface and start interacting with the model:

:::{image} /assets/deployment/dify-chat.png
:::
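
Before pointing Dify at the endpoint configured above, it can help to confirm that the vLLM server answers OpenAI-compatible chat requests. Below is a minimal sketch, not part of the committed guide, assuming the server started with `vllm serve Qwen/Qwen1.5-7B-Chat` is reachable at `http://localhost:8000` (the host, port, and placeholder API key are assumptions):

```python
# Minimal connectivity check against the vLLM OpenAI-compatible API.
# Assumes `vllm serve Qwen/Qwen1.5-7B-Chat` is running on localhost:8000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen1.5-7B-Chat",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```

If this prints a reply, the same `/v1` base URL can be entered as the API Endpoint URL in the Dify model provider form.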

docs/source/deployment/frameworks/index.md

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ anything-llm
 bentoml
 cerebrium
 chatbox
+dify
 dstack
 helm
 lws

docs/source/design/v1/prefix_caching.md

Lines changed: 1 addition & 1 deletion

@@ -86,7 +86,7 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache
     {"role": "user", "content": "Here is a document with details about the world series: ..."},
     {"role": "user", "content": "Who won the world series in 2020?"}
   ],
-  "cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ=="
+  "cache_salt": "your-cache-salt"
 }
 ```
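
The `cache_salt` field shown in this hunk is sent as an extra field in the chat completion request body. A minimal sketch of one way to pass it from the OpenAI Python client, with the server address, model name, and salt value as illustrative assumptions:

```python
# Sketch: send a chat request with a per-tenant cache_salt so its prefix
# cache entries are isolated from other callers. Values are illustrative.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen1.5-7B-Chat",
    messages=[
        {"role": "user", "content": "Here is a document with details about the world series: ..."},
        {"role": "user", "content": "Who won the world series in 2020?"},
    ],
    # Extra, non-standard request fields are forwarded in the JSON body.
    extra_body={"cache_salt": "your-cache-salt"},
)
print(response.choices[0].message.content)
```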

docs/source/features/reasoning_outputs.md

Lines changed: 8 additions & 6 deletions

@@ -17,7 +17,9 @@ vLLM currently supports the following reasoning models:
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` |||
 | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` ||
 
-- IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
+:::{note}
+IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
+:::
 
 ## Quickstart
 
@@ -83,7 +85,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
 }
 ```
 
-OpenAI Python client library does not officially support `reasoning_content` attribute for streaming output. But the client support extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example:
+OpenAI Python client library does not officially support `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example:
 
 ```python
 from openai import OpenAI
@@ -221,15 +223,15 @@ print(f"Function called: {tool_call.name}")
 print(f"Arguments: {tool_call.arguments}")
 ```
 
-For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py> .
+For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
 
 ## Limitations
 
 - The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
 
 ## How to support a new reasoning model
 
-You can add a new `ReasoningParser` similar to `vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py`.
+You can add a new `ReasoningParser` similar to <gh-file:vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py>.
 
 ```python
 # import the required packages
@@ -286,7 +288,7 @@ class ExampleParser(ReasoningParser):
 """
 ```
 
-Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in `vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py`.
+Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py>.
 
 ```python
 @dataclass
@@ -312,7 +314,7 @@ class DeepSeekReasoner(Reasoner):
 ...
 ```
 
-The structured output engine like `xgrammar` will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case.
+The structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case.
 
 Finally, you can enable reasoning for the model by using the `--reasoning-parser` flags.
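
The doc's streaming example is only partially visible in this diff (the `from openai import OpenAI` context line). A minimal sketch of the `hasattr` pattern the changed paragraph describes, with the server address and model name as assumptions rather than values from the commit:

```python
# Sketch of the hasattr() check for streamed reasoning output.
# Assumes a reasoning model is served with a matching --reasoning-parser.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "Which is larger, 9.11 or 9.8?"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    # reasoning_content is an extra attribute, so guard with hasattr().
    if hasattr(delta, "reasoning_content") and delta.reasoning_content:
        print(delta.reasoning_content, end="", flush=True)
    elif delta.content:
        print(delta.content, end="", flush=True)
```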

examples/online_serving/ray_serve_deepseek.py

Lines changed: 3 additions & 1 deletion

@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 """
 Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
-See Ray Serve LLM documentation at:
+See more details at:
+https://docs.ray.io/en/latest/serve/tutorials/serve-deepseek.html
+And see Ray Serve LLM documentation at:
 https://docs.ray.io/en/latest/serve/llm/serving-llms.html
 
 Run `python3 ray_serve_deepseek.py` to deploy the model.

tests/kernels/moe/test_moe.py

Lines changed: 1 addition & 0 deletions

@@ -286,6 +286,7 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
                                atol=mixtral_moe_tol[dtype])
 
 
+@pytest.mark.flaky(reruns=2)
 @pytest.mark.parametrize("m", [1, 123, 666])
 @pytest.mark.parametrize("n", [128, 1024])
 @pytest.mark.parametrize("k", [256, 2048])

tests/samplers/test_sampler.py

Lines changed: 1 addition & 1 deletion

@@ -478,7 +478,7 @@ def test_sampler_mixed(seed: int, device: str):
     sampling_params = SamplingParams(
         temperature=random.random() + 0.1,
         top_p=min(random.random() + 0.1, 1),
-        top_k=random.randint(0, 10) or -1,
+        top_k=random.randint(0, 10),
         n=n,
         presence_penalty=random.randint(0, 1),
     )

tests/tensorizer_loader/conftest.py

Lines changed: 0 additions & 33 deletions

@@ -1,12 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-
-import functools
-import gc
-from typing import Callable, TypeVar
-
 import pytest
-import torch
-from typing_extensions import ParamSpec
 
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@@ -25,32 +18,6 @@ def cleanup():
     cleanup_dist_env_and_memory(shutdown_ray=True)
 
 
-_P = ParamSpec("_P")
-_R = TypeVar("_R")
-
-
-def retry_until_skip(n: int):
-
-    def decorator_retry(func: Callable[_P, _R]) -> Callable[_P, _R]:
-
-        @functools.wraps(func)
-        def wrapper_retry(*args: _P.args, **kwargs: _P.kwargs) -> _R:
-            for i in range(n):
-                try:
-                    return func(*args, **kwargs)
-                except AssertionError:
-                    gc.collect()
-                    torch.cuda.empty_cache()
-                    if i == n - 1:
-                        pytest.skip(f"Skipping test after {n} attempts.")
-
-            raise AssertionError("Code should not be reached")
-
-        return wrapper_retry
-
-    return decorator_retry
-
-
 @pytest.fixture(autouse=True)
 def tensorizer_config():
     config = TensorizerConfig(tensorizer_uri="vllm")

tests/tensorizer_loader/test_tensorizer.py

Lines changed: 1 addition & 2 deletions

@@ -28,7 +28,6 @@
 from vllm.utils import PlaceholderModule, import_from_path
 
 from ..utils import VLLM_PATH, RemoteOpenAIServer
-from .conftest import retry_until_skip
 
 try:
     from tensorizer import EncryptionParams
@@ -325,7 +324,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
     assert outputs == deserialized_outputs
 
 
-@retry_until_skip(3)
+@pytest.mark.flaky(reruns=3)
 def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     gc.collect()
     torch.cuda.empty_cache()
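
The two test changes above replace the hand-rolled `retry_until_skip` decorator (removed from `conftest.py`) with the `pytest.mark.flaky` marker from the pytest-rerunfailures plugin. A minimal, self-contained sketch of that pattern; the test body is hypothetical and only illustrates the marker:

```python
# Sketch of the pytest-rerunfailures pattern adopted in this commit.
# Requires the pytest-rerunfailures plugin; the test body is hypothetical.
import random

import pytest


@pytest.mark.flaky(reruns=3)
def test_occasionally_flaky():
    # The marker re-runs a failing test up to `reruns` extra times before
    # reporting it as failed, replacing the manual retry/skip loop.
    assert random.random() < 0.9
```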

tests/v1/core/test_kv_cache_utils.py

Lines changed: 3 additions & 3 deletions

@@ -539,7 +539,7 @@ def test_allocate_with_lookahead():
                                         max_model_len=100)
     blocks = kv_cache_manager.allocate_slots(
         request,
-        num_tokens=3,
+        num_new_tokens=3,
         num_lookahead_tokens=2,  # Total required: 3+2=5 tokens
     )
     assert len(blocks.blocks) == 2  # ceil(5/4)=2 blocks
@@ -550,7 +550,7 @@ def test_allocate_with_lookahead():
     # required_blocks = ceil((3 + 2) /4) = 2
     blocks = kv_cache_manager.allocate_slots(
         request,
-        num_tokens=3,
+        num_new_tokens=3,
        num_lookahead_tokens=2,
     )
     assert len(blocks.blocks) == 2
@@ -561,7 +561,7 @@ def test_allocate_with_lookahead():
                                         max_model_len=100)
     blocks = kv_cache_manager.allocate_slots(
         request,
-        num_tokens=3,
+        num_new_tokens=3,
         num_lookahead_tokens=4,
     )
     assert len(blocks.blocks) == 2

tests/v1/core/test_prefix_caching.py

Lines changed: 8 additions & 5 deletions

@@ -299,7 +299,8 @@ def test_decode():
     req0.append_output_token_ids(8)
     new_blocks = manager.allocate_slots(req0, 4)
     assert new_blocks is not None and len(new_blocks.blocks) == 0
-    assert manager.req_to_blocks[req0.request_id][-1].block_hash is None
+    assert manager.single_type_manager.req_to_blocks[
+        req0.request_id][-1].block_hash is None
 
     # Append slots with allocating a new block.
     req0.num_computed_tokens = 59
@@ -309,8 +310,10 @@ def test_decode():
     req0.append_output_token_ids(7)
     new_blocks = manager.allocate_slots(req0, 19)
     assert new_blocks is not None and len(new_blocks.blocks) == 1
-    assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None
-    assert manager.req_to_blocks[req0.request_id][-1].block_hash is None
+    assert manager.single_type_manager.req_to_blocks[
+        req0.request_id][-2].block_hash is not None
+    assert manager.single_type_manager.req_to_blocks[
+        req0.request_id][-1].block_hash is None
 
 
 def test_evict():
@@ -689,15 +692,15 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     assert not computed_blocks.blocks
     assert num_computed_tokens == 0
     manager.allocate_slots(req0, 48, computed_blocks)
-    block_part0 = manager.req_to_blocks[req0.request_id]
+    block_part0 = manager.single_type_manager.req_to_blocks[req0.request_id]
 
     # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
     req1 = make_request("1", common_token_ids * 2)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert computed_blocks.blocks == block_part0
     assert num_computed_tokens == 3 * 16
     manager.allocate_slots(req1, 48, computed_blocks)
-    block_part1 = manager.req_to_blocks[req1.request_id]
+    block_part1 = manager.single_type_manager.req_to_blocks[req1.request_id]
     # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
     # | Req1-5(F)| ... |
     manager.free(req1)

tests/v1/core/test_scheduler.py

Lines changed: 8 additions & 5 deletions

@@ -812,10 +812,11 @@ def _assert_right_kv_cache_manager(
     # Make sure the request stats are right.
     EXPECTED_TOTAL_BLOCKS = num_tokens // block_size
     for req_id in req_ids:
-        blocks = scheduler.kv_cache_manager.req_to_blocks[req_id]
+        blocks = (scheduler.kv_cache_manager.single_type_manager.
+                  req_to_blocks[req_id])
         hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id]
-        assert (scheduler.kv_cache_manager.num_cached_block[req_id] ==
-                EXPECTED_TOTAL_BLOCKS)
+        assert (scheduler.kv_cache_manager.single_type_manager.
+                num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS)
         assert len(blocks) == EXPECTED_TOTAL_BLOCKS
         assert len(hashes) == EXPECTED_TOTAL_BLOCKS
 
@@ -1195,9 +1196,11 @@ def assert_scheduler_empty(scheduler: Scheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(scheduler.kv_cache_manager.req_to_blocks) == 0
+    assert len(
+        scheduler.kv_cache_manager.single_type_manager.req_to_blocks) == 0
     assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
-    assert len(scheduler.kv_cache_manager.num_cached_block) == 0
+    assert len(
+        scheduler.kv_cache_manager.single_type_manager.num_cached_block) == 0
     num_free_blocks = (
         scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
     assert num_free_blocks == (
