diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/constants.py b/src/c++/perf_analyzer/genai-pa/genai_pa/constants.py index e31d471a8..c288b3146 100644 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/constants.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/constants.py @@ -26,6 +26,10 @@ LOGGER_NAME: str = "genai-pa" +DEFAULT_HTTP_URL = "localhost:8000" +DEFAULT_GRPC_URL = "localhost:8001" + + OPEN_ORCA = "openorca" CNN_DAILY_MAIL = "cnn_dailymail" DEFAULT_INPUT_DATA_JSON = "llm_inputs.json" diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_inputs/llm_inputs.py index 64013c48e..dc40c5e40 100644 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_inputs/llm_inputs.py @@ -46,6 +46,8 @@ class LlmInputs: A library of methods that control the generation of LLM Inputs """ + OUTPUT_FILENAME = DEFAULT_INPUT_DATA_JSON + OPEN_ORCA_URL = "https://datasets-server.huggingface.co/rows?dataset=Open-Orca%2FOpenOrca&config=default&split=train" CNN_DAILYMAIL_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=train" diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_profile.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py similarity index 97% rename from src/c++/perf_analyzer/genai-pa/genai_pa/llm_profile.py rename to src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py index b57a7f372..80230d176 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_profile.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py @@ -26,12 +26,19 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import contextlib +import io from dataclasses import dataclass from itertools import pairwise import numpy as np from genai_pa.utils import load_json -from transformers import AutoTokenizer + +# Silence tokenizer warning on import +with contextlib.redirect_stdout(io.StringIO()) as stdout, contextlib.redirect_stderr( + io.StringIO() +) as stderr: + from transformers import AutoTokenizer @dataclass diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py index b0f5c1e5a..3a9246c55 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py @@ -25,24 +25,71 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import contextlib +import io import logging import sys from genai_pa import parser from genai_pa.constants import LOGGER_NAME from genai_pa.exceptions import GenAiPAException +from genai_pa.llm_inputs.llm_inputs import LlmInputs + +# Silence tokenizer warning on import +with contextlib.redirect_stdout(io.StringIO()) as stdout, contextlib.redirect_stderr( + io.StringIO() +) as stderr: + from genai_pa.llm_metrics import LLMProfileData + from transformers import AutoTokenizer as tokenizer + from transformers import logging as token_logger + + token_logger.set_verbosity_error() + logging.basicConfig(level=logging.INFO, format="%(name)s - %(levelname)s - %(message)s") logger = logging.getLogger(LOGGER_NAME) +def generate_inputs(args): + LlmInputs.create_openai_llm_inputs( + args.dataset, + LlmInputs.DEFAULT_STARTING_INDEX, + LlmInputs.DEFAULT_LENGTH, + args.model, + args.streaming, + ) + + +def calculate_metrics(file: str) -> LLMProfileData: + t = tokenizer.from_pretrained("gpt2") + return LLMProfileData(file, t) + + +def report_output(metrics: LLMProfileData, args): + if "concurrency_range" in args: + infer_mode = "concurrency" + load_level = args.concurrency_range + elif "request_rate_range" in args: + infer_mode = "request_rate" + load_level = args.request_rate_range + else: + raise GenAiPAException( + "Neither concurrency_range nor request_rate_range was found in args when reporting metrics" + ) + # TODO: metrics reporter class that consumes Stats class for nicer formatting + print(metrics.get_statistics(infer_mode, int(load_level))) + + # Separate function that can raise exceptions used for testing # to assert correct errors and messages. # Optional argv used for testing - will default to sys.argv if None. def run(argv=None): try: - args = parser.parse_args(argv) - args.func(args) + args, extra_args = parser.parse_args(argv) + generate_inputs(args) + args.func(args, extra_args) + metrics = calculate_metrics(args.profile_export_file) + report_output(metrics, args) except Exception as e: raise GenAiPAException(e) diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py index d4e0497b7..c45026f63 100644 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py @@ -28,12 +28,12 @@ import logging from pathlib import Path -from genai_pa.constants import LOGGER_NAME +from genai_pa.constants import CNN_DAILY_MAIL, DEFAULT_HTTP_URL, LOGGER_NAME, OPEN_ORCA logger = logging.getLogger(LOGGER_NAME) -def prune_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: +def _prune_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: """ Prune the parsed arguments to remove args with None or False values. """ @@ -42,7 +42,7 @@ def prune_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: ) -def update_load_manager_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: +def _update_load_manager_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: """ Update GenAI-PA load manager attributes to PA format """ @@ -57,17 +57,16 @@ def update_load_manager_args(args: argparse.ArgumentParser) -> argparse.Argument ### Handlers ### -# NOTE: Placeholder -def handler(args): +def handler(args, extra_args): from genai_pa.wrapper import Profiler - Profiler.run(model=args.model, args=args) + Profiler.run(model=args.model, args=args, extra_args=extra_args) ### Parsers ### -def add_model_args(parser): +def _add_model_args(parser): model_group = parser.add_argument_group("Model") model_group.add_argument( @@ -79,9 +78,9 @@ def add_model_args(parser): ) -def add_profile_args(parser): +def _add_profile_args(parser): profile_group = parser.add_argument_group("Profiling") - load_management_group = profile_group.add_mutually_exclusive_group() + load_management_group = profile_group.add_mutually_exclusive_group(required=True) profile_group.add_argument( "-b", @@ -152,14 +151,23 @@ def add_profile_args(parser): ) -def add_endpoint_args(parser): +def _add_endpoint_args(parser): endpoint_group = parser.add_argument_group("Endpoint") + endpoint_group.add_argument( + "-i", + type=str.lower, + choices=["http", "grpc"], + default="http", + required=False, + help=f"Sets the protocol used to communicate with inference service", + ) + endpoint_group.add_argument( "-u", "--url", type=str, - default="localhost:8001", + default=DEFAULT_HTTP_URL, required=False, dest="u", metavar="URL", @@ -167,19 +175,18 @@ def add_endpoint_args(parser): ) -def add_dataset_args(parser): - pass - +def _add_dataset_args(parser): dataset_group = parser.add_argument_group("Dataset") - # TODO: Do we want to remove dataset and tokenizer? - # dataset_group.add_argument( - # "--dataset", - # type=str, - # default="OpenOrca", - # choices=["OpenOrca", "cnn_dailymail"], - # required=False, - # help="HuggingFace dataset to use for the benchmark.", - # ) + + dataset_group.add_argument( + "--dataset", + type=str.lower, + default=OPEN_ORCA, + choices=[OPEN_ORCA, CNN_DAILY_MAIL], + required=False, + help="HuggingFace dataset to use for benchmarking.", + ) + # dataset_group.add_argument( # "--tokenizer", # type=str, @@ -202,14 +209,18 @@ def parse_args(argv=None): parser.set_defaults(func=handler) # Conceptually group args for easier visualization - add_model_args(parser) - add_profile_args(parser) - add_endpoint_args(parser) - add_dataset_args(parser) + _add_model_args(parser) + _add_profile_args(parser) + _add_endpoint_args(parser) + _add_dataset_args(parser) - args = parser.parse_args(argv) + args, extra_args = parser.parse_known_args(argv) + if extra_args: + # strip off the "--" demarking the pass through arguments + extra_args = extra_args[1:] + logger.info(f"Additional pass through args: {extra_args}") - args = update_load_manager_args(args) - args = prune_args(args) + args = _update_load_manager_args(args) + args = _prune_args(args) - return args + return args, extra_args diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py b/src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py index 25e776425..df8643b8e 100644 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py @@ -35,8 +35,8 @@ class Profiler: @staticmethod - def run(model, args=None): - skip_args = ["model", "func"] + def build_cmd(model, args, extra_args): + skip_args = ["model", "func", "dataset"] if hasattr(args, "version"): cmd = f"perf_analyzer --version" else: @@ -52,9 +52,22 @@ def run(model, args=None): cmd += f"-b {value} " else: if len(arg) == 1: - cmd += f"-{arg} {value}" + cmd += f"-{arg} {value} " else: arg = utils.convert_option_name(arg) cmd += f"--{arg} {value} " + + if extra_args is not None: + for arg in extra_args: + cmd += f"{arg} " + # TODO: Once the OpenAI endpoint support is in place in PA core, + # update the input-data option arg + # cmd += f"--input-data {DEFAULT_INPUT_DATA_JSON} -p 10000 -s 99" + cmd += f"--input-data ./input_data.json -p 10000 -s 99" + return cmd + + @staticmethod + def run(model, args=None, extra_args=None): + cmd = Profiler.build_cmd(model, args, extra_args) logger.info(f"Running Perf Analyzer : '{cmd}'") subprocess.run(cmd, shell=True, check=True) diff --git a/src/c++/perf_analyzer/genai-pa/pyproject.toml b/src/c++/perf_analyzer/genai-pa/pyproject.toml index 9e707fa1f..018d035e3 100644 --- a/src/c++/perf_analyzer/genai-pa/pyproject.toml +++ b/src/c++/perf_analyzer/genai-pa/pyproject.toml @@ -48,7 +48,8 @@ requires-python = ">=3.8,<4" dependencies = [ "numpy", "pytest", - "rich" + "rich", + "transformers" ] # CLI Entrypoint diff --git a/src/c++/perf_analyzer/genai-pa/tests/test_cli.py b/src/c++/perf_analyzer/genai-pa/tests/test_cli.py index 4aaff401b..42a0f0fe8 100644 --- a/src/c++/perf_analyzer/genai-pa/tests/test_cli.py +++ b/src/c++/perf_analyzer/genai-pa/tests/test_cli.py @@ -59,13 +59,11 @@ def test_help_arguments_output_and_exit(self, arg, expected_output, capsys): [ (["-b", "2"], {"batch_size": 2}), (["--batch-size", "2"], {"batch_size": 2}), - (["--concurrency", "3"], {"concurrency_range": "3"}), (["--max-threads", "4"], {"max_threads": 4}), ( ["--profile-export-file", "text.txt"], {"profile_export_file": Path("text.txt")}, ), - (["--request-rate", "1.5"], {"request_rate_range": "1.5"}), (["--service-kind", "triton"], {"service_kind": "triton"}), (["--service-kind", "openai"], {"service_kind": "openai"}), # TODO: Remove streaming from implementation. It is invalid with HTTP. @@ -76,8 +74,8 @@ def test_help_arguments_output_and_exit(self, arg, expected_output, capsys): ], ) def test_arguments_output(self, arg, expected_attributes, capsys): - combined_args = ["--model", "test_model"] + arg - args = parser.parse_args(combined_args) + combined_args = ["--model", "test_model", "--concurrency", "2"] + arg + args, _ = parser.parse_args(combined_args) # Check that the attributes are set correctly for key, value in expected_attributes.items(): @@ -96,4 +94,11 @@ def test_arguments_model_not_provided(self): def test_exception_on_nonzero_exit(self): with pytest.raises(GenAiPAException) as e: - run(["-m", "nonexistent_model"]) + run(["-m", "nonexistent_model", "--concurrency", "3"]) + + def test_pass_through_args(self): + args = ["-m", "test_model", "--concurrency", "1"] + other_args = ["--", "With", "great", "power"] + _, pass_through_args = parser.parse_args(args + other_args) + + assert pass_through_args == other_args[1:] diff --git a/src/c++/perf_analyzer/genai-pa/tests/test_llm_profile.py b/src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py similarity index 99% rename from src/c++/perf_analyzer/genai-pa/tests/test_llm_profile.py rename to src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py index a1d9332c8..b417c9a95 100755 --- a/src/c++/perf_analyzer/genai-pa/tests/test_llm_profile.py +++ b/src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py @@ -31,7 +31,7 @@ import numpy as np import pytest -from genai_pa.llm_profile import LLMMetrics, LLMProfileData +from genai_pa.llm_metrics import LLMMetrics, LLMProfileData from genai_pa.utils import remove_file from transformers import AutoTokenizer