
Commit d191102

Authored by robertgshaw2-redhat, tlrmchlsmth, ApostaC, Robert Shaw, and mgoin
[P/D] NIXL Integration (#17751)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Brent Salisbury <bsalisbu@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: ApostaC <yihua98@uchicago.edu>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Brent Salisbury <bsalisbu@redhat.com>
1 parent 05a4324 · commit d191102

34 files changed: +2724 −109 lines

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -214,6 +214,7 @@ steps:
 - pytest -v -s v1/worker
 - pytest -v -s v1/structured_output
 - pytest -v -s v1/spec_decode
+- pytest -v -s v1/kv_connector/unit
 - pytest -v -s v1/test_serial_utils.py
 - pytest -v -s v1/test_stats.py
 - pytest -v -s v1/test_utils.py
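This registers the PR's new KV-connector unit tests in the CI matrix; locally, the equivalent invocation from a source checkout's tests/ directory should be `pytest -v -s v1/kv_connector/unit`.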

tests/v1/core/test_scheduler.py

Lines changed: 3 additions & 3 deletions
@@ -870,7 +870,7 @@ def test_kv_connector_basic():
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
     scheduler.connector.get_num_new_matched_tokens.return_value = (
-        NUM_MATCHED_NEW_TOKENS)
+        NUM_MATCHED_NEW_TOKENS, False)
 
     ######################################################
     # FIRST SET OF REQUESTS - External Hit Only
@@ -981,7 +981,7 @@ def test_kv_connector_unable_to_allocate():
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
     scheduler.connector.get_num_new_matched_tokens.return_value = (
-        NUM_MATCHED_NEW_TOKENS)
+        NUM_MATCHED_NEW_TOKENS, False)
 
     # Create two requests. The second request will not be able to
     # allocate slots because it will not have enough blocks.
@@ -1060,7 +1060,7 @@ def test_kv_connector_handles_preemption():
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
     scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
     scheduler.connector.get_num_new_matched_tokens.return_value = (
-        NUM_MATCHED_NEW_TOKENS)
+        NUM_MATCHED_NEW_TOKENS, False)
 
     # Create two requests.
     # Both can be scheduled at first, but the second request
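All three hunks track the same interface change: the connector's get_num_new_matched_tokens now returns a pair rather than a bare token count. A minimal sketch of the new mock contract, assuming (only the tuple shape is visible in this diff) that the second element flags whether the matched external KV cache is loaded asynchronously:

from unittest.mock import Mock

# Hypothetical stand-in for scheduler.connector in the tests above.
connector = Mock(name="connector")

# Old contract returned a bare int; the new contract returns a pair.
# The False here presumably means "not loaded asynchronously".
connector.get_num_new_matched_tokens.return_value = (32, False)

num_tokens, load_async = connector.get_num_new_matched_tokens()
assert num_tokens == 32 and load_async is False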
tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
+#!/bin/bash
+set -xe
+
+# Models to run
+MODELS=(
+    "Qwen/Qwen3-0.6B"
+)
+
+# Number of prefill and decode instances to create
+NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1}  # Default to 1
+NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-2}  # Default to 2
+
+# Find the git repository root directory
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+# Trap the SIGINT signal (triggered by Ctrl+C)
+trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
+
+# Waits for vLLM to start.
+wait_for_server() {
+  local port=$1
+  timeout 1200 bash -c "
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+# Function to clean up previous instances
+cleanup_instances() {
+  echo "Cleaning up any running vLLM instances..."
+  pkill -f "vllm serve" || true
+  sleep 2
+}
+
+# Helper to get model-specific arguments (currently only needed for deepseek)
+get_model_args() {
+  local model_name=$1
+  local extra_args=""
+
+  if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then
+    extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code"
+  fi
+
+  echo "$extra_args"
+}
+
+# Function to run tests for a specific model
+run_tests_for_model() {
+  local model_name=$1
+  echo "================================"
+  echo "Testing model: $model_name"
+  echo "================================"
+
+  # Get model-specific arguments
+  local model_args=$(get_model_args "$model_name")
+
+  # Arrays to store all hosts and ports
+  PREFILL_HOSTS=()
+  PREFILL_PORTS=()
+  DECODE_HOSTS=()
+  DECODE_PORTS=()
+
+  # Start prefill instances
+  for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
+    # Calculate GPU ID - we'll distribute across available GPUs
+    GPU_ID=$((i % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # Calculate port number (base port + instance number)
+    PORT=$((8100 + i))
+    # Calculate side channel port
+    SIDE_CHANNEL_PORT=$((5559 + i))
+
+    echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
+
+    # Build the command with or without model-specific args
+    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
+      --port $PORT \
+      --enforce-eager \
+      --disable-log-requests \
+      --gpu-memory-utilization 0.2 \
+      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+
+    if [ -n "$model_args" ]; then
+      FULL_CMD="$BASE_CMD $model_args"
+    else
+      FULL_CMD="$BASE_CMD"
+    fi
+
+    eval "$FULL_CMD &"
+
+    # Store host and port for proxy configuration
+    PREFILL_HOSTS+=("localhost")
+    PREFILL_PORTS+=($PORT)
+  done
+
+  # Start decode instances
+  for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
+    # Calculate GPU ID - distribute across available GPUs, starting after the prefill GPUs
+    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    # Calculate port number (base port + instance number)
+    PORT=$((8200 + i))
+    # Calculate side channel port
+    SIDE_CHANNEL_PORT=$((5659 + i))
+
+    echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"
+
+    # Build the command with or without model-specific args
+    BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
+      --port $PORT \
+      --enforce-eager \
+      --disable-log-requests \
+      --gpu-memory-utilization 0.2 \
+      --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+
+    if [ -n "$model_args" ]; then
+      FULL_CMD="$BASE_CMD $model_args"
+    else
+      FULL_CMD="$BASE_CMD"
+    fi
+
+    eval "$FULL_CMD &"
+
+    # Store host and port for proxy configuration
+    DECODE_HOSTS+=("localhost")
+    DECODE_PORTS+=($PORT)
+  done
+
+  # Wait for all instances to start
+  for PORT in "${PREFILL_PORTS[@]}"; do
+    echo "Waiting for prefill instance on port $PORT to start..."
+    wait_for_server $PORT
+  done
+
+  for PORT in "${DECODE_PORTS[@]}"; do
+    echo "Waiting for decode instance on port $PORT to start..."
+    wait_for_server $PORT
+  done
+
+  # Build the command for the proxy server with all the hosts and ports
+  PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192"
+
+  # Add all prefill hosts and ports
+  PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}"
+  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}"
+
+  # Add all decode hosts and ports
+  PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}"
+  PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}"
+
+  # Start the proxy server
+  echo "Starting proxy server with command: $PROXY_CMD"
+  $PROXY_CMD &
+
+  # Wait for the proxy to start
+  sleep 5
+
+  # Run lm eval for this model
+  echo "Running tests for $model_name"
+  TEST_MODEL=$model_name python -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+
+  # Clean up before running next model
+  cleanup_instances
+  sleep 3
+}
+
+# Run tests for each model
+for model in "${MODELS[@]}"; do
+  run_tests_for_model "$model"
+done
+
+echo "All tests completed!"
tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+#!/bin/bash
+set -xe
+
+# Models to run
+MODELS=(
+    "Qwen/Qwen3-0.6B"
+)
+
+# Find the git repository root directory
+GIT_ROOT=$(git rev-parse --show-toplevel)
+
+# Trap the SIGINT signal (triggered by Ctrl+C)
+trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
+
+# Waits for vLLM to start.
+wait_for_server() {
+  local port=$1
+  timeout 1200 bash -c "
+    until curl -s localhost:${port}/v1/completions > /dev/null; do
+      sleep 1
+    done" && return 0 || return 1
+}
+
+# Function to clean up previous instances
+cleanup_instances() {
+  echo "Cleaning up any running vLLM instances..."
+  pkill -f "vllm serve" || true
+  sleep 2
+}
+
+# Helper to get model-specific arguments (currently only needed for deepseek)
+get_model_args() {
+  local model_name=$1
+  local extra_args=""
+
+  if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then
+    extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code"
+  fi
+
+  echo "$extra_args"
+}
+
+# Function to run tests for a specific model
+run_tests_for_model() {
+  local model_name=$1
+  echo "================================"
+  echo "Testing model: $model_name"
+  echo "================================"
+
+  # Get model-specific arguments
+  local model_args=$(get_model_args "$model_name")
+
+  # Start prefill instance
+  PREFILL_PORT=8001
+
+  # Build the command with or without model-specific args
+  BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
+    --port $PREFILL_PORT \
+    --enforce-eager \
+    --disable-log-requests \
+    --gpu-memory-utilization 0.2 \
+    --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+
+  if [ -n "$model_args" ]; then
+    FULL_CMD="$BASE_CMD $model_args"
+  else
+    FULL_CMD="$BASE_CMD"
+  fi
+
+  eval "$FULL_CMD &"
+
+  # Start decode instance
+  DECODE_PORT=8002
+
+  # Build the command with or without model-specific args
+  BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
+    --port $DECODE_PORT \
+    --enforce-eager \
+    --disable-log-requests \
+    --gpu-memory-utilization 0.2 \
+    --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+
+  if [ -n "$model_args" ]; then
+    FULL_CMD="$BASE_CMD $model_args"
+  else
+    FULL_CMD="$BASE_CMD"
+  fi
+
+  eval "$FULL_CMD &"
+
+  # Wait for both instances to start
+  echo "Waiting for prefill instance on port $PREFILL_PORT to start..."
+  wait_for_server $PREFILL_PORT
+  echo "Waiting for decode instance on port $DECODE_PORT to start..."
+  wait_for_server $DECODE_PORT
+
+  # Build the command for the proxy server
+  PROXY_PORT=8192
+  PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port $PROXY_PORT"
+  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}"
+  PROXY_CMD+=" --decoder-ports ${DECODE_PORT}"
+
+  # Start the proxy server
+  echo "Starting proxy server with command: $PROXY_CMD"
+  $PROXY_CMD &
+
+  # Wait for the proxy to start
+  sleep 5
+
+  # Run the edge-case tests for this model
+  echo "Running tests for $model_name"
+  PREFILL_PORT=$PREFILL_PORT DECODE_PORT=$DECODE_PORT PROXY_PORT=$PROXY_PORT python -m pytest -s -v ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_edge_cases.py
+
+  # Clean up before running next model
+  cleanup_instances
+  sleep 3
+}
+
+# Run tests for each model
+for model in "${MODELS[@]}"; do
+  run_tests_for_model "$model"
+done
+
+echo "All tests completed!"
tests/v1/kv_connector/nixl_integration/test_accuracy.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+import lm_eval
+import openai
+
+BASE_URL = "http://localhost:8192/v1"
+NUM_CONCURRENT = 100
+TASK = "gsm8k"
+FILTER = "exact_match,strict-match"
+RTOL = 0.03
+
+# Model-specific expected values
+EXPECTED_VALUES = {
+    "Qwen/Qwen3-0.6B": 0.41,
+}
+
+SIMPLE_PROMPT = "The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means"  # noqa: E501
+
+# Get model name from environment variable
+MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")
+
+
+def run_simple_prompt():
+    client = openai.OpenAI(api_key="EMPTY", base_url=BASE_URL)
+    completion = client.completions.create(model=MODEL_NAME,
+                                           prompt=SIMPLE_PROMPT)
+
+    print("-" * 50)
+    print(f"Completion results for {MODEL_NAME}:")
+    print(completion)
+    print("-" * 50)
+
+
+def test_accuracy():
+    """Run the end to end accuracy test."""
+    run_simple_prompt()
+
+    model_args = (f"model={MODEL_NAME},"
+                  f"base_url={BASE_URL}/completions,"
+                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+
+    results = lm_eval.simple_evaluate(
+        model="local-completions",
+        model_args=model_args,
+        tasks=TASK,
+    )
+
+    measured_value = results["results"][TASK][FILTER]
+    expected_value = EXPECTED_VALUES.get(MODEL_NAME)
+
+    if expected_value is None:
+        print(f"Warning: No expected value found for {MODEL_NAME}. "
+              "Skipping accuracy check.")
+        print(f"Measured value: {measured_value}")
+        return
+
+    assert (measured_value - RTOL < expected_value
+            and measured_value + RTOL > expected_value
+            ), f"Expected: {expected_value} | Measured: {measured_value}"
