Commit 7bdf67c

Merge branch 'main' into custom_executor
2 parents: b2032fd + dbe5588

36 files changed: +969 -358 lines
Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.758
+  - name: "exact_match,flexible-extract"
+    value: 0.759
+limit: 1000
+num_fewshot: 5
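
Note: the CI harness under .buildkite/lm-eval-harness consumes configs like this one and compares measured GSM8K scores against the expected values. The actual comparison code is not part of this commit; the sketch below only illustrates the idea. The yaml layout matches the config above, but the function name, the measured-results shape, and the RTOL tolerance are assumptions made for illustration.

# Illustrative sketch only: compare measured lm-eval metrics against the
# expected values from a config like the one above. check_results, RTOL,
# and the `measured` dict shape are assumed, not taken from the repo.
import yaml

RTOL = 0.02  # assumed relative tolerance


def check_results(config_path: str, measured: dict) -> None:
    """`measured` maps task name -> {metric name -> value}, as lm-eval reports."""
    with open(config_path) as f:
        expected = yaml.safe_load(f)
    for task in expected["tasks"]:
        for metric in task["metrics"]:
            got = measured[task["name"]][metric["name"]]
            want = metric["value"]
            assert abs(got - want) <= RTOL * want, (
                f"{task['name']}/{metric['name']}: got {got}, expected {want}")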

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 0 deletions

@@ -2,4 +2,5 @@ Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml

.buildkite/test-pipeline.yaml

Lines changed: 6 additions & 5 deletions

@@ -21,7 +21,7 @@ steps:
   fast_check: true
   fast_check_only: true
   commands:
-  - apt-get install curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
+  - apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
   - pytest -v -s metrics # Metrics
   - "pip install \
       opentelemetry-sdk \
@@ -46,14 +46,15 @@ steps:
   commands:
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Core Test
   mirror_hardwares: [amd]
   fast_check: true
-  commands:
+  commands:
   - pytest -v -s core
   - pytest -v -s distributed/test_parallel_state.py

@@ -72,7 +73,7 @@ steps:
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
-  - pytest -v -s distributed/test_pipeline_parallel.py
+  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

@@ -122,7 +123,7 @@ steps:

 - label: Engine Test
   mirror_hardwares: [amd]
-  commands:
+  commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization

@@ -220,7 +221,7 @@ steps:
 - label: Tensorizer Test
   #mirror_hardwares: [amd]
   commands:
-  - apt-get install curl libsodium23
+  - apt-get install -y curl libsodium23
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s tensorizer_loader

Dockerfile

Lines changed: 27 additions & 9 deletions

@@ -8,10 +8,10 @@
 ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base

 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3
+ARG PYTHON_VERSION=3.10

 ENV DEBIAN_FRONTEND=noninteractive

@@ -21,13 +21,16 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && apt-get install -y ccache software-properties-common \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
     && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
-    && python3 --version \
-    && python3 -m pip --version
+    && python3 --version

 RUN apt-get update -y \
-    && apt-get install -y python3-pip git curl sudo
+    && apt-get install -y git curl sudo
+
+# Install pip s.t. it will be compatible with our PYTHON_VERSION
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
+RUN python3 -m pip --version

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -58,7 +61,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build

-ARG PYTHON_VERSION=3
+ARG PYTHON_VERSION=3.10

 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
@@ -149,12 +152,27 @@ RUN pip --verbose wheel -r requirements-mamba.txt \

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
+ARG PYTHON_VERSION=3.10
 WORKDIR /vllm-workspace

+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
+    && python3 --version
+
 RUN apt-get update -y \
-    && apt-get install -y python3-pip git vim
+    && apt-get install -y python3-pip git curl
+
+# Install pip s.t. it will be compatible with our PYTHON_VERSION
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
+RUN python3 -m pip --version

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
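
The new get-pip.py steps exist presumably because the distro python3-pip package ties pip to the system interpreter, while the images now pin PYTHON_VERSION=3.10 from the deadsnakes PPA (per the Dockerfile comment "Install pip s.t. it will be compatible with our PYTHON_VERSION"). As an illustration only, not part of the Dockerfile, one could verify inside the built image that pip reports the same interpreter that python3 resolves to:

# Illustrative check (not part of this diff): pip bootstrapped via get-pip.py
# should report the same Python minor version as the running interpreter.
import subprocess
import sys

pip_version = subprocess.run([sys.executable, "-m", "pip", "--version"],
                             capture_output=True, text=True,
                             check=True).stdout.strip()
print(pip_version)  # e.g. "pip 24.0 from ... (python 3.10)"
assert f"python {sys.version_info.major}.{sys.version_info.minor}" in pip_version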

Dockerfile.openvino

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.

-FROM ubuntu:22.04 AS dev
+FROM ubuntu:20.04 AS dev

 RUN apt-get update -y && \
     apt-get install -y python3-pip git

Dockerfile.xpu

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
+FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+from ..utils import compare_two_settings
+
+
+def test_cpu_offload():
+    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
+                         ["--cpu-offload-gb", "4"])
+    compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+                         [], ["--cpu-offload-gb", "1"])
Lines changed: 25 additions & 94 deletions

@@ -1,30 +1,42 @@
+import os
+
 import pytest
-from transformers import AutoTokenizer

-from ..utils import RemoteOpenAIServer
+from ..utils import compare_two_settings
+
+VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"


 @pytest.mark.parametrize(
-    "TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME", [
-        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B"),
-        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B"),
-        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B"),
-        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B"),
-        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B"),
+    "TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND",
+    [
+        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
     ])
-def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
+                    DIST_BACKEND):
+    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
+        pytest.skip("Skipping multi-node pipeline parallel test for "
+                    "multiprocessing distributed backend")

     pp_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "bfloat16",
+        "float16",
         "--pipeline-parallel-size",
         str(PP_SIZE),
         "--tensor-parallel-size",
         str(TP_SIZE),
         "--distributed-executor-backend",
-        "ray",
+        DIST_BACKEND,
     ]

     # compare without pipeline parallelism
@@ -48,85 +60,4 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
         pp_args.append("--enforce-eager")
         tp_args.append("--enforce-eager")

-    prompt = "Hello, my name is"
-    token_ids = tokenizer(prompt)["input_ids"]
-    results = []
-    for args in (pp_args, tp_args):
-        with RemoteOpenAIServer(MODEL_NAME, args) as server:
-            client = server.get_client()
-
-            # test models list
-            models = client.models.list()
-            models = models.data
-            served_model = models[0]
-            results.append({
-                "test": "models_list",
-                "id": served_model.id,
-                "root": served_model.root,
-            })
-
-            # test with text prompt
-            completion = client.completions.create(model=MODEL_NAME,
-                                                   prompt=prompt,
-                                                   max_tokens=5,
-                                                   temperature=0.0)
-
-            results.append({
-                "test": "single_completion",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test using token IDs
-            completion = client.completions.create(
-                model=MODEL_NAME,
-                prompt=token_ids,
-                max_tokens=5,
-                temperature=0.0,
-            )
-
-            results.append({
-                "test": "token_ids",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test simple list
-            batch = client.completions.create(
-                model=MODEL_NAME,
-                prompt=[prompt, prompt],
-                max_tokens=5,
-                temperature=0.0,
-            )
-
-            results.append({
-                "test": "simple_list",
-                "text0": batch.choices[0].text,
-                "text1": batch.choices[1].text,
-            })
-
-            # test streaming
-            batch = client.completions.create(
-                model=MODEL_NAME,
-                prompt=[prompt, prompt],
-                max_tokens=5,
-                temperature=0.0,
-                stream=True,
-            )
-            texts = [""] * 2
-            for chunk in batch:
-                assert len(chunk.choices) == 1
-                choice = chunk.choices[0]
-                texts[choice.index] += choice.text
-            results.append({
-                "test": "streaming",
-                "texts": texts,
-            })
-
-    n = len(results) // 2
-    pp_results = results[:n]
-    tp_results = results[n:]
-    for pp, tp in zip(pp_results, tp_results):
-        assert pp == tp
+    compare_two_settings(MODEL_NAME, pp_args, tp_args)
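
The OpenAI-client checks deleted above were not dropped: they were folded into the shared compare_two_settings helper in tests/utils.py, which both this test and the new CPU-offload test use. A condensed sketch of what that helper plausibly does, reconstructed from the removed inline code (the real helper covers more cases, e.g. token-id prompts, batched prompts, and streaming):

# Condensed reconstruction of the deleted inline logic; everything except
# RemoteOpenAIServer is illustrative, not the actual tests/utils.py code.
from tests.utils import RemoteOpenAIServer


def compare_two_settings_sketch(model: str, args_a: list, args_b: list):
    prompt = "Hello, my name is"
    results = []
    for args in (args_a, args_b):
        # launch an OpenAI-compatible server with this set of CLI args
        with RemoteOpenAIServer(model, args) as server:
            client = server.get_client()
            completion = client.completions.create(model=model,
                                                   prompt=prompt,
                                                   max_tokens=5,
                                                   temperature=0.0)
            results.append({
                "text": completion.choices[0].text,
                "finish_reason": completion.choices[0].finish_reason,
            })
    # both settings must produce identical greedy output
    assert results[0] == results[1]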

tests/samplers/test_rejection_sampler.py

Lines changed: 53 additions & 3 deletions

@@ -150,9 +150,54 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
                                     high=vocab_size,
                                     size=(batch_size, k),
                                     dtype=torch.int64)
+    generators = [None] * batch_size

     rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                      draft_token_ids)
+                      draft_token_ids, generators)
+
+
+@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0])
+@pytest.mark.parametrize("k", [1, 3, 6])
+@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
+@pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
+@pytest.mark.parametrize("n_rep", [100])
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
+                                   frac_seeded: float, n_rep: int,
+                                   device: str):
+    torch.set_default_device(device)
+    rejection_sampler = RejectionSampler()
+    rejection_sampler.init_gpu_tensors(rank=0)
+
+    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+    bonus_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, 1),
+                                    dtype=torch.int64)
+    draft_token_ids = torch.randint(low=0,
+                                    high=vocab_size,
+                                    size=(batch_size, k),
+                                    dtype=torch.int64)
+
+    seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
+
+    results = []
+    for _ in range(n_rep):
+        generators = [
+            torch.Generator(
+                device=device).manual_seed(i) if seeded_mask[i] else None
+            for i in range(batch_size)
+        ]
+        results.append(
+            rejection_sampler(target_probs, bonus_token_ids, draft_probs,
+                              draft_token_ids, generators))
+
+    for i in range(batch_size):
+        if seeded_mask[i]:
+            for j in range(1, n_rep):
+                assert torch.equal(results[j][i], results[0][i])


 @pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
@@ -197,10 +242,11 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
         raise AssertionError()

     oob_token_ids[0][0] = rogue_token_id
+    generators = [None] * batch_size

     with pytest.raises(AssertionError):
         rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                          draft_token_ids)
+                          draft_token_ids, generators)


 @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
@@ -371,11 +417,15 @@ def _estimate_rejection_sampling_pdf(
             dtype=torch.int64,
             device="cuda").repeat(num_samples, 1)

+        # unseeded
+        generators = [None]
+
         # Get output tokens via rejection sampling.
         output_token_ids = self.rejection_sampler(target_probs.to("cuda"),
                                                   bonus_token_ids.to("cuda"),
                                                   draft_probs.to("cuda"),
-                                                  draft_token_ids.to("cuda"))
+                                                  draft_token_ids.to("cuda"),
+                                                  generators)

         # Remove bonus tokens
         output_token_ids = output_token_ids[:, :-1].flatten()
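
The new trailing argument to rejection_sampler is a per-sequence list of torch.Generator objects (None for unseeded sequences), which is what the seeded-determinism test above relies on. As a standalone illustration of the underlying mechanism (plain PyTorch, not vLLM code): a draw made with a freshly seeded generator is exactly reproducible, while an unseeded draw consumes the global RNG state and is not guaranteed to repeat.

# Standalone illustration, not vLLM code: seeded per-call generators make
# sampling reproducible without touching the global RNG state.
import torch

probs = torch.softmax(torch.randn(4, 32_000), dim=-1)


def draw(seed=None):
    gen = torch.Generator().manual_seed(seed) if seed is not None else None
    return torch.multinomial(probs, num_samples=1, generator=gen)


assert torch.equal(draw(seed=0), draw(seed=0))  # seeded draws match exactly
print(draw(), draw())  # unseeded draws advance the global RNG and may differ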
