
Commit 5760bdd

Merge branch 'main' into simplify-vllm-config-post-init
2 parents: a8794da + e3f3aee

598 files changed: +21034 additions, -8288 deletions


.buildkite/check-wheel-size.py

Lines changed: 12 additions & 8 deletions
@@ -8,12 +8,12 @@
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))


 def print_top_10_largest_files(zip_file):
     """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, 'r') as z:
+    with zipfile.ZipFile(zip_file, "r") as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
@@ -28,14 +28,18 @@ def check_wheel_size(directory):
                 wheel_path = os.path.join(root, file_name)
                 wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                 if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(f"Not allowed: Wheel {wheel_path} is larger "
-                          f"({wheel_size_mb:.2f} MB) than the limit "
-                          f"({VLLM_MAX_SIZE_MB} MB).")
+                    print(
+                        f"Not allowed: Wheel {wheel_path} is larger "
+                        f"({wheel_size_mb:.2f} MB) than the limit "
+                        f"({VLLM_MAX_SIZE_MB} MB)."
+                    )
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
-                    print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb:.2f} MB).")
+                    print(
+                        f"Wheel {wheel_path} is within the allowed size "
+                        f"({wheel_size_mb:.2f} MB)."
+                    )
     return 0


@@ -45,4 +49,4 @@ def check_wheel_size(directory):
         sys.exit(1)

     directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
+    sys.exit(check_wheel_size(directory))
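
For context, a minimal runnable sketch of the check this script performs, assuming a dist/ directory of built wheels; it mirrors the reformatted logic above but omits the top-10 file listing:

import os
import sys

# Same default as the script: 400 MiB quota, overridable via VLLM_MAX_SIZE_MB.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))


def check_wheel_size(directory: str) -> int:
    """Return 1 if any .whl under `directory` exceeds the limit, else 0."""
    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".whl"):
                continue
            wheel_path = os.path.join(root, file_name)
            wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
            if wheel_size_mb > VLLM_MAX_SIZE_MB:
                print(
                    f"Not allowed: Wheel {wheel_path} is larger "
                    f"({wheel_size_mb:.2f} MB) than the limit ({VLLM_MAX_SIZE_MB} MB)."
                )
                return 1
    return 0


if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1] if len(sys.argv) > 1 else "dist"))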

.buildkite/generate_index.py

Lines changed: 2 additions & 2 deletions
@@ -22,5 +22,5 @@
     print(f"Generated index.html for {args.wheel}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename,
-                        wheel_html_escaped=filename.replace("+", "%2B")))
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+    )
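
The one functional detail in this hunk is escaping '+' for CloudFront; a small sketch of that substitution, using a stand-in template string and example filename rather than the script's real ones:

# Stand-in template; the real script builds index.html from its own template.
template = '<a href="{wheel_html_escaped}">{wheel}</a>'
filename = "vllm-0.9.0+cu124-cp38-abi3-manylinux1_x86_64.whl"  # example name only

# CloudFront requires the '+' character to be percent-encoded as %2B in links.
print(template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")))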

.buildkite/lm-eval-harness/conftest.py

Lines changed: 10 additions & 6 deletions
@@ -8,11 +8,14 @@ def pytest_addoption(parser):
     parser.addoption(
         "--config-list-file",
         action="store",
-        help="Path to the file listing model config YAMLs (one per line)")
-    parser.addoption("--tp-size",
-                     action="store",
-                     default="1",
-                     help="Tensor parallel size to use for evaluation")
+        help="Path to the file listing model config YAMLs (one per line)",
+    )
+    parser.addoption(
+        "--tp-size",
+        action="store",
+        default="1",
+        help="Tensor parallel size to use for evaluation",
+    )


 @pytest.fixture(scope="session")
@@ -33,7 +36,8 @@ def pytest_generate_tests(metafunc):
     config_dir = config_list_file.parent
     with open(config_list_file, encoding="utf-8") as f:
         configs = [
-            config_dir / line.strip() for line in f
+            config_dir / line.strip()
+            for line in f
             if line.strip() and not line.startswith("#")
         ]
     metafunc.parametrize("config_filename", configs)
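
A rough sketch of how the reflowed list comprehension filters the config list, using an in-memory list of lines and a made-up directory instead of the real fixture inputs:

from pathlib import Path

config_dir = Path("configs")  # made-up directory for illustration
lines = ["model-a.yaml\n", "# model-b.yaml\n", "\n", "model-c.yaml\n"]

# Skip blank lines and '#' comments; resolve entries relative to config_dir.
configs = [
    config_dir / line.strip()
    for line in lines
    if line.strip() and not line.startswith("#")
]
print(configs)  # two paths: configs/model-a.yaml and configs/model-c.yaml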

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 15 additions & 11 deletions
@@ -16,19 +16,22 @@


 def launch_lm_eval(eval_config, tp_size):
-    trust_remote_code = eval_config.get('trust_remote_code', False)
-    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={tp_size}," \
-                 f"enforce_eager=true," \
-                 f"add_bos_token=true," \
-                 f"trust_remote_code={trust_remote_code}"
+    trust_remote_code = eval_config.get("trust_remote_code", False)
+    model_args = (
+        f"pretrained={eval_config['model_name']},"
+        f"tensor_parallel_size={tp_size},"
+        f"enforce_eager=true,"
+        f"add_bos_token=true,"
+        f"trust_remote_code={trust_remote_code}"
+    )
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto")
+        batch_size="auto",
+    )
     return results


@@ -42,9 +45,10 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
-            print(f'{task["name"]} | {metric["name"]}: '
-                  f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and np.isclose(
-                ground_truth, measured_value, rtol=RTOL)
+            print(
+                f"{task['name']} | {metric['name']}: "
+                f"ground_truth={ground_truth} | measured={measured_value}"
+            )
+            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)

     assert success
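
The correctness check reduces to a relative-tolerance comparison with np.isclose; a standalone illustration with made-up numbers (the script defines its own RTOL constant):

import numpy as np

RTOL = 0.05  # assumed value, for illustration only

ground_truth = 0.664
measured_value = 0.671

# True when the two values agree to within the relative tolerance
# (plus numpy's tiny default absolute tolerance).
print(np.isclose(ground_truth, measured_value, rtol=RTOL))  # True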

.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 46 additions & 43 deletions
@@ -65,18 +65,18 @@ def read_markdown(file):


 def results_to_json(latency, throughput, serving):
-    return json.dumps({
-        'latency': latency.to_dict(),
-        'throughput': throughput.to_dict(),
-        'serving': serving.to_dict()
-    })
+    return json.dumps(
+        {
+            "latency": latency.to_dict(),
+            "throughput": throughput.to_dict(),
+            "serving": serving.to_dict(),
+        }
+    )


 if __name__ == "__main__":
-
     # collect results
     for test_file in results_folder.glob("*.json"):
-
         with open(test_file) as f:
             raw_result = json.loads(f.read())

@@ -120,7 +120,8 @@ def results_to_json(latency, throughput, serving):
             for perc in [10, 25, 50, 75, 90, 99]:
                 # Multiply 1000 to convert the time unit from s to ms
                 raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
+                )
             raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

             # add the result to raw_result
@@ -153,26 +154,27 @@ def results_to_json(latency, throughput, serving):
     serving_results = pd.DataFrame.from_dict(serving_results)
     throughput_results = pd.DataFrame.from_dict(throughput_results)

-    raw_results_json = results_to_json(latency_results, throughput_results,
-                                       serving_results)
+    raw_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

     # remapping the key, for visualization purpose
     if not latency_results.empty:
-        latency_results = latency_results[list(
-            latency_column_mapping.keys())].rename(
-                columns=latency_column_mapping)
+        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
+            columns=latency_column_mapping
+        )
     if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
     if not throughput_results.empty:
-        throughput_results = throughput_results[list(
-            throughput_results_column_mapping.keys())].rename(
-                columns=throughput_results_column_mapping)
+        throughput_results = throughput_results[
+            list(throughput_results_column_mapping.keys())
+        ].rename(columns=throughput_results_column_mapping)

-    processed_results_json = results_to_json(latency_results,
-                                             throughput_results,
-                                             serving_results)
+    processed_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

     for df in [latency_results, serving_results, throughput_results]:
         if df.empty:
@@ -184,38 +186,39 @@ def results_to_json(latency, throughput, serving):
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        )

     # get markdown tables
-    latency_md_table = tabulate(latency_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    serving_md_table = tabulate(serving_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    throughput_md_table = tabulate(throughput_results,
-                                   headers='keys',
-                                   tablefmt='pipe',
-                                   showindex=False)
+    latency_md_table = tabulate(
+        latency_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    serving_md_table = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    throughput_md_table = tabulate(
+        throughput_results, headers="keys", tablefmt="pipe", showindex=False
+    )

     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
-
-        results = read_markdown("../.buildkite/nightly-benchmarks/" +
-                                "performance-benchmarks-descriptions.md")
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/"
+            + "performance-benchmarks-descriptions.md"
+        )
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,
             serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json)
+            benchmarking_results_in_json_string=processed_results_json,
+        )
         f.write(results)

     # document benchmarking results in json
     with open(results_folder / "benchmark_results.json", "w") as f:
-
-        results = latency_results.to_dict(
-            orient='records') + throughput_results.to_dict(
-                orient='records') + serving_results.to_dict(orient='records')
+        results = (
+            latency_results.to_dict(orient="records")
+            + throughput_results.to_dict(orient="records")
+            + serving_results.to_dict(orient="records")
+        )
         f.write(json.dumps(results))
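
For reference, the markdown tables in this script come straight from tabulate with tablefmt="pipe"; a minimal sketch with a fabricated one-row DataFrame:

import pandas as pd
from tabulate import tabulate

# Fabricated row; the real script builds these frames from benchmark JSON files.
latency_results = pd.DataFrame(
    {"Test name": ["llama8B_tp1"], "Mean latency (ms)": [123.4], "P99": [150.2]}
)

md_table = tabulate(latency_results, headers="keys", tablefmt="pipe", showindex=False)
print(md_table)  # a GitHub-flavored markdown (pipe) table, ready to embed in the report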

.buildkite/nightly-benchmarks/scripts/download-tokenizer.py

Lines changed: 6 additions & 9 deletions
@@ -14,15 +14,12 @@ def main(model, cachedir):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer")
-    parser.add_argument("--model",
-                        type=str,
-                        required=True,
-                        help="Name of the model")
-    parser.add_argument("--cachedir",
-                        type=str,
-                        required=True,
-                        help="Directory to save the tokenizer")
+        description="Download and save Hugging Face tokenizer"
+    )
+    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+    parser.add_argument(
+        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
+    )

     args = parser.parse_args()
     main(args.model, args.cachedir)
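
Only the argparse wiring changes here; a sketch of how the two options could drive the download (the body of main is not shown in this hunk, so the tokenizer calls below are an assumption, not the script's actual code):

import argparse

from transformers import AutoTokenizer


def main(model: str, cachedir: str) -> None:
    # Assumed behaviour: fetch the tokenizer and persist it under cachedir.
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )
    args = parser.parse_args()
    main(args.model, args.cachedir)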

.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py

Lines changed: 20 additions & 21 deletions
@@ -11,41 +11,40 @@

 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description=
-        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument('--results-folder',
-                        type=str,
-                        required=True,
-                        help='The folder where the results are stored.')
-    parser.add_argument('--description',
-                        type=str,
-                        required=True,
-                        help='Description of the results.')
+        description="Parse command line arguments for summary-nightly-results script."
+    )
+    parser.add_argument(
+        "--results-folder",
+        type=str,
+        required=True,
+        help="The folder where the results are stored.",
+    )
+    parser.add_argument(
+        "--description", type=str, required=True, help="Description of the results."
+    )

     args = parser.parse_args()
     return args


 def get_perf(df, method, model, metric):
-
     means = []

     for qps in [2, 4, 8, 16, "inf"]:
-        target = df['Test name'].str.contains(model)
-        target = target & df['Engine'].str.contains(method)
-        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        target = df["Test name"].str.contains(model)
+        target = target & df["Engine"].str.contains(method)
+        target = target & df["Test name"].str.contains("qps_" + str(qps))
         filtered_df = df[target]

         if filtered_df.empty:
-            means.append(0.)
+            means.append(0.0)
         else:
             means.append(filtered_df[metric].values[0])

     return np.array(means)


 def get_perf_w_std(df, method, model, metric):
-
     if metric in ["TTFT", "ITL"]:
         mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
         mean = mean.tolist()
@@ -60,7 +59,8 @@ def get_perf_w_std(df, method, model, metric):
     else:
         assert metric == "Tput"
         mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)")
+            df, method, model, "Output Tput (tok/s)"
+        )
         mean = mean.tolist()
         std = None

@@ -80,18 +80,17 @@ def main(args):
     # generate markdown table
     df = pd.DataFrame.from_dict(results)

-    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)

     with open(args.description) as f:
         description = f.read()

-    description = description.format(
-        nightly_results_benchmarking_table=md_table)
+    description = description.format(nightly_results_benchmarking_table=md_table)

     with open("nightly_results.md", "w") as f:
         f.write(description)


-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_arguments()
     main(args)
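
The get_perf helper filters a results DataFrame by model, engine, and QPS bucket; a small standalone sketch with fabricated rows (column names follow the hunk above, the values are invented):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "Test name": ["llama8B_qps_2", "llama8B_qps_4", "llama8B_qps_inf"],
        "Engine": ["vllm", "vllm", "vllm"],
        "Tput (req/s)": [1.9, 3.7, 11.2],
    }
)


def get_perf(df, method, model, metric):
    means = []
    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]
        # Missing QPS points fall back to 0.0, as in the script.
        means.append(0.0 if filtered_df.empty else filtered_df[metric].values[0])
    return np.array(means)


print(get_perf(df, "vllm", "llama8B", "Tput (req/s)"))  # five QPS buckets: 1.9, 3.7, 0, 0, 11.2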
