[v1] Move block management logic from KVCacheManager to SpecializedManager (vllm-project#17474)

heheda12345 · mawong-amd · commit c530f825b841 · 2025-05-13T21:07:23.000Z
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
@@ -539,7 +539,7 @@ def test_allocate_with_lookahead():
                                       max_model_len=100)
     blocks = kv_cache_manager.allocate_slots(
         request,
-        num_tokens=3,
+        num_new_tokens=3,
         num_lookahead_tokens=2,  # Total required: 3+2=5 tokens
     )
     assert len(blocks.blocks) == 2  # ceil(5/4)=2 blocks
@@ -550,7 +550,7 @@ def test_allocate_with_lookahead():
     # required_blocks = ceil((3 + 2) /4) = 2
     blocks = kv_cache_manager.allocate_slots(
         request,
-        num_tokens=3,
+        num_new_tokens=3,
         num_lookahead_tokens=2,
     )
     assert len(blocks.blocks) == 2
@@ -561,7 +561,7 @@ def test_allocate_with_lookahead():
                                       max_model_len=100)
     blocks = kv_cache_manager.allocate_slots(
         request,
-        num_tokens=3,
+        num_new_tokens=3,
         num_lookahead_tokens=4,
     )
     assert len(blocks.blocks) == 2
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
@@ -299,7 +299,8 @@ def test_decode():
         req0.append_output_token_ids(8)
     new_blocks = manager.allocate_slots(req0, 4)
     assert new_blocks is not None and len(new_blocks.blocks) == 0
-    assert manager.req_to_blocks[req0.request_id][-1].block_hash is None
+    assert manager.single_type_manager.req_to_blocks[
+        req0.request_id][-1].block_hash is None
 
     # Append slots with allocating a new block.
     req0.num_computed_tokens = 59
@@ -309,8 +310,10 @@ def test_decode():
         req0.append_output_token_ids(7)
     new_blocks = manager.allocate_slots(req0, 19)
     assert new_blocks is not None and len(new_blocks.blocks) == 1
-    assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None
-    assert manager.req_to_blocks[req0.request_id][-1].block_hash is None
+    assert manager.single_type_manager.req_to_blocks[
+        req0.request_id][-2].block_hash is not None
+    assert manager.single_type_manager.req_to_blocks[
+        req0.request_id][-1].block_hash is None
 
 
 def test_evict():
@@ -689,15 +692,15 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     assert not computed_blocks.blocks
     assert num_computed_tokens == 0
     manager.allocate_slots(req0, 48, computed_blocks)
-    block_part0 = manager.req_to_blocks[req0.request_id]
+    block_part0 = manager.single_type_manager.req_to_blocks[req0.request_id]
 
     # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
     req1 = make_request("1", common_token_ids * 2)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert computed_blocks.blocks == block_part0
     assert num_computed_tokens == 3 * 16
     manager.allocate_slots(req1, 48, computed_blocks)
-    block_part1 = manager.req_to_blocks[req1.request_id]
+    block_part1 = manager.single_type_manager.req_to_blocks[req1.request_id]
     # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
     # | Req1-5(F)| ... |
     manager.free(req1)
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
@@ -812,10 +812,11 @@ def _assert_right_kv_cache_manager(
     # Make sure the request stats are right.
     EXPECTED_TOTAL_BLOCKS = num_tokens // block_size
     for req_id in req_ids:
-        blocks = scheduler.kv_cache_manager.req_to_blocks[req_id]
+        blocks = (scheduler.kv_cache_manager.single_type_manager.
+                  req_to_blocks[req_id])
         hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id]
-        assert (scheduler.kv_cache_manager.num_cached_block[req_id] ==
-                EXPECTED_TOTAL_BLOCKS)
+        assert (scheduler.kv_cache_manager.single_type_manager.
+                num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS)
         assert len(blocks) == EXPECTED_TOTAL_BLOCKS
         assert len(hashes) == EXPECTED_TOTAL_BLOCKS
 
@@ -1195,9 +1196,11 @@ def assert_scheduler_empty(scheduler: Scheduler):
     assert len(scheduler.encoder_cache_manager.cached) == 0
 
     # KVCache Manager.
-    assert len(scheduler.kv_cache_manager.req_to_blocks) == 0
+    assert len(
+        scheduler.kv_cache_manager.single_type_manager.req_to_blocks) == 0
     assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0
-    assert len(scheduler.kv_cache_manager.num_cached_block) == 0
+    assert len(
+        scheduler.kv_cache_manager.single_type_manager.num_cached_block) == 0
     num_free_blocks = (
         scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks)
     assert num_free_blocks == (
diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py
@@ -8,6 +8,14 @@
 from vllm.v1.kv_cache_interface import SlidingWindowSpec
 
 
+def get_sliding_window_manager(sliding_window_spec, block_pool):
+    return SlidingWindowManager(sliding_window_spec,
+                                block_pool,
+                                use_eagle=False,
+                                num_kv_cache_groups=1,
+                                caching_hash_fn=lambda x: x)
+
+
 def test_sliding_window_possible_cached_prefix():
     sliding_window_spec = SlidingWindowSpec(
         block_size=2,
@@ -19,9 +27,7 @@ def test_sliding_window_possible_cached_prefix():
     )
 
     block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
-    manager = SlidingWindowManager(sliding_window_spec,
-                                   block_pool,
-                                   use_eagle=False)
+    manager = get_sliding_window_manager(sliding_window_spec, block_pool)
 
     def run_one_case(block_is_cached, expect_length):
         block_hash_list = [
@@ -81,9 +87,7 @@ def test_sliding_window_remove_skipped_blocks():
 
     block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
 
-    manager = SlidingWindowManager(sliding_window_spec,
-                                   block_pool,
-                                   use_eagle=False)
+    manager = get_sliding_window_manager(sliding_window_spec, block_pool)
 
     null_block_id = block_pool.null_block.block_id
 
@@ -104,39 +108,35 @@ def assert_block_id(block_table, ids):
         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
     ]
     block_table = id_to_block_table(original_block_ids)
-    removed = manager.remove_skipped_blocks(block_table, 0)
-    assert_block_id(removed, [])
+    manager.req_to_blocks["test"] = block_table
+
+    manager.remove_skipped_blocks("test", 0)
     assert_block_id(block_table, original_block_ids)
 
     # 4 tokens are computed. Only token 0 is out of the sliding window. As
     # block 1000 also contains token 1 that is in the sliding window, block 1000
     # cannot be removed.
-    removed = manager.remove_skipped_blocks(block_table, 4)
-    assert_block_id(removed, [])
+    manager.remove_skipped_blocks("test", 4)
     assert_block_id(block_table, original_block_ids)
 
     # 5 tokens are computed. Token 0 & 1 are out of the sliding window.
     # Block 1000 can be removed.
-    removed = manager.remove_skipped_blocks(block_table, 5)
-    assert_block_id(removed, [original_block_ids[0]])
+    manager.remove_skipped_blocks("test", 5)
     assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
 
     # 6 tokens are computed. Token 0-2 are out of the sliding window.
     # Cannot remove new block as the block 1001 is still used by token 3.
-    removed = manager.remove_skipped_blocks(block_table, 6)
-    assert_block_id(removed, [])
+    manager.remove_skipped_blocks("test", 6)
     assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
 
     # 7 tokens are computed. Token 0-3 are out of the sliding window.
     # Block 1001 can be removed and block 1000 is already removed.
-    removed = manager.remove_skipped_blocks(block_table, 7)
-    assert_block_id(removed, [original_block_ids[1]])
+    manager.remove_skipped_blocks("test", 7)
     assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])
 
     # 11 tokens are computed. Token 0-7 are out of the sliding window.
     # Block 1002 & 1003 can be removed now. Block 1003 represents a longer
     # sequence, and is expected to be evicted earlier than 1002, so the order
     # of removed blocks should be [1003, 1002].
-    removed = manager.remove_skipped_blocks(block_table, 11)
-    assert_block_id(removed, [original_block_ids[3], original_block_ids[2]])
+    manager.remove_skipped_blocks("test", 11)
     assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
diff --git a/vllm/v1/core/specialized_manager.py b/vllm/v1/core/specialized_manager.py