
Commit 5760bdd

Merge branch 'main' into simplify-vllm-config-post-init
2 parents: a8794da + e3f3aee

598 files changed: +21034 additions, -8288 deletions


.buildkite/check-wheel-size.py

Lines changed: 12 additions & 8 deletions
@@ -8,12 +8,12 @@
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))


 def print_top_10_largest_files(zip_file):
     """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, 'r') as z:
+    with zipfile.ZipFile(zip_file, "r") as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
@@ -28,14 +28,18 @@ def check_wheel_size(directory):
                 wheel_path = os.path.join(root, file_name)
                 wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                 if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(f"Not allowed: Wheel {wheel_path} is larger "
-                          f"({wheel_size_mb:.2f} MB) than the limit "
-                          f"({VLLM_MAX_SIZE_MB} MB).")
+                    print(
+                        f"Not allowed: Wheel {wheel_path} is larger "
+                        f"({wheel_size_mb:.2f} MB) than the limit "
+                        f"({VLLM_MAX_SIZE_MB} MB)."
+                    )
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
-                    print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb:.2f} MB).")
+                    print(
+                        f"Wheel {wheel_path} is within the allowed size "
+                        f"({wheel_size_mb:.2f} MB)."
+                    )
     return 0


@@ -45,4 +49,4 @@ def check_wheel_size(directory):
         sys.exit(1)

     directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
+    sys.exit(check_wheel_size(directory))
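
For context, a minimal runnable sketch of the check this script performs, assuming a dist/ directory of built wheels; it mirrors the reformatted logic above but omits the top-10 file listing:

import os
import sys

# Same default as the script: 400 MiB quota, overridable via VLLM_MAX_SIZE_MB.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))


def check_wheel_size(directory: str) -> int:
    """Return 1 if any .whl under `directory` exceeds the limit, else 0."""
    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".whl"):
                continue
            wheel_path = os.path.join(root, file_name)
            wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
            if wheel_size_mb > VLLM_MAX_SIZE_MB:
                print(
                    f"Not allowed: Wheel {wheel_path} is larger "
                    f"({wheel_size_mb:.2f} MB) than the limit ({VLLM_MAX_SIZE_MB} MB)."
                )
                return 1
    return 0


if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1] if len(sys.argv) > 1 else "dist"))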

.buildkite/generate_index.py

Lines changed: 2 additions & 2 deletions
@@ -22,5 +22,5 @@
     print(f"Generated index.html for {args.wheel}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename,
-                        wheel_html_escaped=filename.replace("+", "%2B")))
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+    )
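
The one functional detail in this hunk is escaping '+' for CloudFront; a small sketch of that substitution, using a stand-in template string and example filename rather than the script's real ones:

# Stand-in template; the real script builds index.html from its own template.
template = '<a href="{wheel_html_escaped}">{wheel}</a>'
filename = "vllm-0.9.0+cu124-cp38-abi3-manylinux1_x86_64.whl"  # example name only

# CloudFront requires the '+' character to be percent-encoded as %2B in links.
print(template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")))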

.buildkite/lm-eval-harness/conftest.py

Lines changed: 10 additions & 6 deletions
@@ -8,11 +8,14 @@ def pytest_addoption(parser):
     parser.addoption(
         "--config-list-file",
         action="store",
-        help="Path to the file listing model config YAMLs (one per line)")
-    parser.addoption("--tp-size",
-                     action="store",
-                     default="1",
-                     help="Tensor parallel size to use for evaluation")
+        help="Path to the file listing model config YAMLs (one per line)",
+    )
+    parser.addoption(
+        "--tp-size",
+        action="store",
+        default="1",
+        help="Tensor parallel size to use for evaluation",
+    )


 @pytest.fixture(scope="session")
@@ -33,7 +36,8 @@ def pytest_generate_tests(metafunc):
     config_dir = config_list_file.parent
     with open(config_list_file, encoding="utf-8") as f:
         configs = [
-            config_dir / line.strip() for line in f
+            config_dir / line.strip()
+            for line in f
             if line.strip() and not line.startswith("#")
         ]
     metafunc.parametrize("config_filename", configs)
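
A rough sketch of how the reflowed list comprehension filters the config list, using an in-memory list of lines and a made-up directory instead of the real fixture inputs:

from pathlib import Path

config_dir = Path("configs")  # made-up directory for illustration
lines = ["model-a.yaml\n", "# model-b.yaml\n", "\n", "model-c.yaml\n"]

# Skip blank lines and '#' comments; resolve entries relative to config_dir.
configs = [
    config_dir / line.strip()
    for line in lines
    if line.strip() and not line.startswith("#")
]
print(configs)  # two paths: configs/model-a.yaml and configs/model-c.yaml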

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 15 additions & 11 deletions
@@ -16,19 +16,22 @@


 def launch_lm_eval(eval_config, tp_size):
-    trust_remote_code = eval_config.get('trust_remote_code', False)
-    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={tp_size}," \
-                 f"enforce_eager=true," \
-                 f"add_bos_token=true," \
-                 f"trust_remote_code={trust_remote_code}"
+    trust_remote_code = eval_config.get("trust_remote_code", False)
+    model_args = (
+        f"pretrained={eval_config['model_name']},"
+        f"tensor_parallel_size={tp_size},"
+        f"enforce_eager=true,"
+        f"add_bos_token=true,"
+        f"trust_remote_code={trust_remote_code}"
+    )
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto")
+        batch_size="auto",
+    )
     return results


@@ -42,9 +45,10 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
-            print(f'{task["name"]} | {metric["name"]}: '
-                  f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and np.isclose(
-                ground_truth, measured_value, rtol=RTOL)
+            print(
+                f"{task['name']} | {metric['name']}: "
+                f"ground_truth={ground_truth} | measured={measured_value}"
+            )
+            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)

     assert success
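
The correctness check reduces to a relative-tolerance comparison with np.isclose; a standalone illustration with made-up numbers (the script defines its own RTOL constant):

import numpy as np

RTOL = 0.05  # assumed value, for illustration only

ground_truth = 0.664
measured_value = 0.671

# True when the two values agree to within the relative tolerance
# (plus numpy's tiny default absolute tolerance).
print(np.isclose(ground_truth, measured_value, rtol=RTOL))  # True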

.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 46 additions & 43 deletions
@@ -65,18 +65,18 @@ def read_markdown(file):


 def results_to_json(latency, throughput, serving):
-    return json.dumps({
-        'latency': latency.to_dict(),
-        'throughput': throughput.to_dict(),
-        'serving': serving.to_dict()
-    })
+    return json.dumps(
+        {
+            "latency": latency.to_dict(),
+            "throughput": throughput.to_dict(),
+            "serving": serving.to_dict(),
+        }
+    )


 if __name__ == "__main__":
-
     # collect results
     for test_file in results_folder.glob("*.json"):
-
         with open(test_file) as f:
             raw_result = json.loads(f.read())

@@ -120,7 +120,8 @@ def results_to_json(latency, throughput, serving):
             for perc in [10, 25, 50, 75, 90, 99]:
                 # Multiply 1000 to convert the time unit from s to ms
                 raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
+                )
             raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

             # add the result to raw_result
@@ -153,26 +154,27 @@ def results_to_json(latency, throughput, serving):
     serving_results = pd.DataFrame.from_dict(serving_results)
     throughput_results = pd.DataFrame.from_dict(throughput_results)

-    raw_results_json = results_to_json(latency_results, throughput_results,
-                                       serving_results)
+    raw_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

     # remapping the key, for visualization purpose
     if not latency_results.empty:
-        latency_results = latency_results[list(
-            latency_column_mapping.keys())].rename(
-                columns=latency_column_mapping)
+        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
+            columns=latency_column_mapping
+        )
     if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
     if not throughput_results.empty:
-        throughput_results = throughput_results[list(
-            throughput_results_column_mapping.keys())].rename(
-                columns=throughput_results_column_mapping)
+        throughput_results = throughput_results[
+            list(throughput_results_column_mapping.keys())
+        ].rename(columns=throughput_results_column_mapping)

-    processed_results_json = results_to_json(latency_results,
-                                             throughput_results,
-                                             serving_results)
+    processed_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

     for df in [latency_results, serving_results, throughput_results]:
         if df.empty:
@@ -184,38 +186,39 @@ def results_to_json(latency, throughput, serving):
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        )

     # get markdown tables
-    latency_md_table = tabulate(latency_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    serving_md_table = tabulate(serving_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    throughput_md_table = tabulate(throughput_results,
-                                   headers='keys',
-                                   tablefmt='pipe',
-                                   showindex=False)
+    latency_md_table = tabulate(
+        latency_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    serving_md_table = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    throughput_md_table = tabulate(
+        throughput_results, headers="keys", tablefmt="pipe", showindex=False
+    )

     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
-
-        results = read_markdown("../.buildkite/nightly-benchmarks/" +
-                                "performance-benchmarks-descriptions.md")
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/"
+            + "performance-benchmarks-descriptions.md"
+        )
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,
             serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json)
+            benchmarking_results_in_json_string=processed_results_json,
+        )
         f.write(results)

     # document benchmarking results in json
     with open(results_folder / "benchmark_results.json", "w") as f:
-
-        results = latency_results.to_dict(
-            orient='records') + throughput_results.to_dict(
-                orient='records') + serving_results.to_dict(orient='records')
+        results = (
+            latency_results.to_dict(orient="records")
+            + throughput_results.to_dict(orient="records")
+            + serving_results.to_dict(orient="records")
+        )
         f.write(json.dumps(results))
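
For reference, the markdown tables in this script come straight from tabulate with tablefmt="pipe"; a minimal sketch with a fabricated one-row DataFrame:

import pandas as pd
from tabulate import tabulate

# Fabricated row; the real script builds these frames from benchmark JSON files.
latency_results = pd.DataFrame(
    {"Test name": ["llama8B_tp1"], "Mean latency (ms)": [123.4], "P99": [150.2]}
)

md_table = tabulate(latency_results, headers="keys", tablefmt="pipe", showindex=False)
print(md_table)  # a GitHub-flavored markdown (pipe) table, ready to embed in the report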

.buildkite/nightly-benchmarks/scripts/download-tokenizer.py

Lines changed: 6 additions & 9 deletions
@@ -14,15 +14,12 @@ def main(model, cachedir):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer")
-    parser.add_argument("--model",
-                        type=str,
-                        required=True,
-                        help="Name of the model")
-    parser.add_argument("--cachedir",
-                        type=str,
-                        required=True,
-                        help="Directory to save the tokenizer")
+        description="Download and save Hugging Face tokenizer"
+    )
+    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+    parser.add_argument(
+        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
+    )

     args = parser.parse_args()
     main(args.model, args.cachedir)
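
Only the argparse wiring changes here; a sketch of how the two options could drive the download (the body of main is not shown in this hunk, so the tokenizer calls below are an assumption, not the script's actual code):

import argparse

from transformers import AutoTokenizer


def main(model: str, cachedir: str) -> None:
    # Assumed behaviour: fetch the tokenizer and persist it under cachedir.
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )
    args = parser.parse_args()
    main(args.model, args.cachedir)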

.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py

Lines changed: 20 additions & 21 deletions
@@ -11,41 +11,40 @@

 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description=
-        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument('--results-folder',
-                        type=str,
-                        required=True,
-                        help='The folder where the results are stored.')
-    parser.add_argument('--description',
-                        type=str,
-                        required=True,
-                        help='Description of the results.')
+        description="Parse command line arguments for summary-nightly-results script."
+    )
+    parser.add_argument(
+        "--results-folder",
+        type=str,
+        required=True,
+        help="The folder where the results are stored.",
+    )
+    parser.add_argument(
+        "--description", type=str, required=True, help="Description of the results."
+    )

     args = parser.parse_args()
     return args


 def get_perf(df, method, model, metric):
-
     means = []

     for qps in [2, 4, 8, 16, "inf"]:
-        target = df['Test name'].str.contains(model)
-        target = target & df['Engine'].str.contains(method)
-        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        target = df["Test name"].str.contains(model)
+        target = target & df["Engine"].str.contains(method)
+        target = target & df["Test name"].str.contains("qps_" + str(qps))
         filtered_df = df[target]

         if filtered_df.empty:
-            means.append(0.)
+            means.append(0.0)
         else:
             means.append(filtered_df[metric].values[0])

     return np.array(means)


 def get_perf_w_std(df, method, model, metric):
-
     if metric in ["TTFT", "ITL"]:
         mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
         mean = mean.tolist()
@@ -60,7 +59,8 @@ def get_perf_w_std(df, method, model, metric):
     else:
         assert metric == "Tput"
         mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)")
+            df, method, model, "Output Tput (tok/s)"
+        )
         mean = mean.tolist()
         std = None

@@ -80,18 +80,17 @@ def main(args):
     # generate markdown table
     df = pd.DataFrame.from_dict(results)

-    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)

     with open(args.description) as f:
         description = f.read()

-    description = description.format(
-        nightly_results_benchmarking_table=md_table)
+    description = description.format(nightly_results_benchmarking_table=md_table)

     with open("nightly_results.md", "w") as f:
         f.write(description)


-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_arguments()
     main(args)
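
The get_perf helper filters a results DataFrame by model, engine, and QPS bucket; a small standalone sketch with fabricated rows (column names follow the hunk above, the values are invented):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "Test name": ["llama8B_qps_2", "llama8B_qps_4", "llama8B_qps_inf"],
        "Engine": ["vllm", "vllm", "vllm"],
        "Tput (req/s)": [1.9, 3.7, 11.2],
    }
)


def get_perf(df, method, model, metric):
    means = []
    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]
        # Missing QPS points fall back to 0.0, as in the script.
        means.append(0.0 if filtered_df.empty else filtered_df[metric].values[0])
    return np.array(means)


print(get_perf(df, "vllm", "llama8B", "Tput (req/s)"))  # five QPS buckets: 1.9, 3.7, 0, 0, 11.2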
