
Commit 5b950cc

simon-mo authored and sumitd2 committed
Revert "[Core] Rename PromptInputs to PromptType, and inputs to prompt" (vllm-project#8750)
Signed-off-by: Sumit Dubey <sumit.dubey2@ibm.com>
1 parent 237cfc1 commit 5b950cc

18 files changed: +162 -157 lines

benchmarks/benchmark_latency.py

Lines changed: 4 additions & 4 deletions

@@ -11,7 +11,7 @@

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
-from vllm.inputs import PromptType
+from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser

@@ -61,7 +61,7 @@ def main(args: argparse.Namespace):
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_prompts: List[PromptType] = [{
+    dummy_inputs: List[PromptInputs] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]

@@ -74,13 +74,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
                 ],
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
                     str(profile_dir))) as p:
-            llm.generate(dummy_prompts,
+            llm.generate(dummy_inputs,
                          sampling_params=sampling_params,
                          use_tqdm=False)
         print(p.key_averages())
     else:
         start_time = time.perf_counter()
-        llm.generate(dummy_prompts,
+        llm.generate(dummy_inputs,
                      sampling_params=sampling_params,
                      use_tqdm=False)
         end_time = time.perf_counter()
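
For context, the user-facing effect of this revert on the offline API is that prompts passed to LLM.generate are again typed as PromptInputs. A minimal sketch mirroring the benchmark change above; the model name and token IDs are placeholders, not taken from this diff:

from typing import List

from vllm import LLM, SamplingParams
from vllm.inputs import PromptInputs

# Placeholder model; any model supported by vLLM works here.
llm = LLM(model="facebook/opt-125m")

# Token prompts are plain dicts conforming to PromptInputs, as in the benchmark.
dummy_inputs: List[PromptInputs] = [
    {"prompt_token_ids": [1, 2, 3, 4]},
    {"prompt_token_ids": [5, 6, 7, 8]},
]

outputs = llm.generate(dummy_inputs,
                       sampling_params=SamplingParams(max_tokens=16),
                       use_tqdm=False)
for out in outputs:
    print(out.outputs[0].text)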

docs/source/dev/multimodal/multimodal_index.rst

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ Multi-Modality
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.

 Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
-via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`.
+via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptInputs`.

 Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
 by following :ref:`this guide <adding_multimodal_plugin>`.

docs/source/dev/offline_inference/llm_inputs.rst

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 LLM Inputs
 ==========

-.. autodata:: vllm.inputs.PromptType
+.. autodata:: vllm.inputs.PromptInputs

 .. autoclass:: vllm.inputs.TextPrompt
     :show-inheritance:

docs/source/models/vlm.rst

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@ The :class:`~vllm.LLM` class can be instantiated in much the same way as languag
 We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
 the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model.

-To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`:
+To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:

 * ``prompt``: The prompt should follow the format that is documented on HuggingFace.
 * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`.
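
A hedged sketch of the image-prompt usage the restored docs describe, combining the ``prompt`` and ``multi_modal_data`` fields of PromptInputs; the model and prompt template below are illustrative assumptions and should be checked against the model card:

from PIL import Image

from vllm import LLM

# Illustrative vision-language model; the prompt format follows its HuggingFace card.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")  # any local RGB image

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is in this image?\nASSISTANT:",
    # The schema is defined by vllm.multimodal.MultiModalDataDict.
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)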

tests/mq_llm_engine/test_error_handling.py

Lines changed: 6 additions & 6 deletions

@@ -61,15 +61,15 @@ async def test_evil_forward(tmp_socket):

     # Throws an error in first forward pass.
     with pytest.raises(RAISED_ERROR):
-        async for _ in client.generate(prompt="Hello my name is",
+        async for _ in client.generate(inputs="Hello my name is",
                                        sampling_params=SamplingParams(),
                                        request_id=uuid.uuid4()):
             pass
     assert client.errored

     # Engine is errored, should get ENGINE_DEAD_ERROR.
     with pytest.raises(MQEngineDeadError):
-        async for _ in client.generate(prompt="Hello my name is",
+        async for _ in client.generate(inputs="Hello my name is",
                                        sampling_params=SamplingParams(),
                                        request_id=uuid.uuid4()):
             pass

@@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket):

     # Generate call should throw ENGINE_DEAD_ERROR
     with pytest.raises(MQEngineDeadError):
-        async for _ in client.generate(prompt="Hello my name is",
+        async for _ in client.generate(inputs="Hello my name is",
                                        sampling_params=SamplingParams(),
                                        request_id=uuid.uuid4()):
             pass

@@ -165,7 +165,7 @@ async def bad_abort_after_2s():
     # with reference to the original KeyError("foo")
     with pytest.raises(MQEngineDeadError) as execinfo:
         async for _ in client.generate(
-                prompt="Hello my name is",
+                inputs="Hello my name is",
                 sampling_params=SamplingParams(max_tokens=2000),
                 request_id=uuid.uuid4()):
             pass

@@ -190,7 +190,7 @@ async def test_bad_request(tmp_socket):

     # Invalid request should fail, but not crash the server.
     with pytest.raises(ValueError):
-        async for _ in client.generate(prompt="Hello my name is",
+        async for _ in client.generate(inputs="Hello my name is",
                                        sampling_params=SamplingParams(),
                                        request_id="abcd-1",
                                        lora_request=LoRARequest(

@@ -199,7 +199,7 @@ async def test_bad_request(tmp_socket):
             pass

     # This request should be okay.
-    async for _ in client.generate(prompt="Hello my name is",
+    async for _ in client.generate(inputs="Hello my name is",
                                    sampling_params=SamplingParams(),
                                    request_id="abcd-2"):
         pass

tests/mq_llm_engine/utils.py

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ async def generate(
     count = 0
     async for out in client.generate(
             request_id=request_id,
-            prompt="Hello my name is Robert and",
+            inputs="Hello my name is Robert and",
            sampling_params=SamplingParams(max_tokens=num_tokens,
                                           temperature=0)):
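
For reference, a minimal sketch of the restored keyword as used against the multiprocessing client, mirroring the tests above; it assumes an already-connected MQLLMEngineClient (construction is handled by the test fixtures and is elided here):

import uuid

from vllm import SamplingParams


async def stream_one(client):
    # `client` is a connected MQLLMEngineClient; after the revert the prompt
    # is passed via the `inputs=` keyword.
    final = None
    async for out in client.generate(inputs="Hello my name is Robert and",
                                     sampling_params=SamplingParams(
                                         max_tokens=16, temperature=0),
                                     request_id=str(uuid.uuid4())):
        final = out  # each iteration yields the latest RequestOutput
    return final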

vllm/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -5,7 +5,7 @@
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.llm import LLM
 from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.inputs import PromptType, TextPrompt, TokensPrompt
+from vllm.inputs import PromptInputs, TextPrompt, TokensPrompt
 from vllm.model_executor.models import ModelRegistry
 from vllm.outputs import (CompletionOutput, EmbeddingOutput,
                           EmbeddingRequestOutput, RequestOutput)

@@ -19,7 +19,7 @@
     "__version_tuple__",
     "LLM",
     "ModelRegistry",
-    "PromptType",
+    "PromptInputs",
     "TextPrompt",
     "TokensPrompt",
     "SamplingParams",

vllm/engine/async_llm_engine.py

Lines changed: 13 additions & 11 deletions

@@ -17,7 +17,7 @@
 from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.gpu_executor import GPUExecutorAsync
 from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.inputs import PromptType
+from vllm.inputs import PromptInputs
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput

@@ -405,7 +405,7 @@ async def stop_remote_worker_execution_loop_async(self) -> None:
     async def add_request_async(
         self,
         request_id: str,
-        prompt: PromptType,
+        inputs: PromptInputs,
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,

@@ -420,7 +420,7 @@ async def add_request_async(
             arrival_time = time.time()

         preprocessed_inputs = await self.input_preprocessor.preprocess_async(
-            prompt,
+            inputs,
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,

@@ -777,7 +777,7 @@ async def run_engine_loop(engine_ref: ReferenceType):
     async def add_request(
         self,
         request_id: str,
-        prompt: PromptType,
+        inputs: PromptInputs,
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,

@@ -797,7 +797,7 @@ async def add_request(
         stream = self._request_tracker.add_request(
             request_id,
             verbose=self.log_requests,
-            prompt=prompt,
+            inputs=inputs,
             params=params,
             arrival_time=arrival_time or time.time(),
             lora_request=lora_request,

@@ -808,7 +808,7 @@ async def add_request(

     async def generate(
         self,
-        prompt: PromptType,
+        inputs: PromptInputs,
         sampling_params: SamplingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,

@@ -822,7 +822,8 @@ async def generate(
         from the LLMEngine to the caller.

         Args:
-            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
+            inputs: The inputs to the LLM. See
+                :class:`~vllm.inputs.PromptInputs`
                 for more details about the format of each input.
             sampling_params: The sampling parameters of the request.
             request_id: The unique id of the request.

@@ -880,7 +881,7 @@ async def generate(
         """
         async for output in await self.add_request(
                 request_id,
-                prompt,
+                inputs,
                 sampling_params,
                 lora_request=lora_request,
                 trace_headers=trace_headers,

@@ -890,7 +891,7 @@ async def generate(

     async def encode(
         self,
-        prompt: PromptType,
+        inputs: PromptInputs,
         pooling_params: PoolingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,

@@ -903,7 +904,8 @@ async def encode(
         from the LLMEngine to the caller.

         Args:
-            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
+            inputs: The inputs to the LLM. See
+                :class:`~vllm.inputs.PromptInputs`
                 for more details about the format of each input.
             pooling_params: The pooling parameters of the request.
             request_id: The unique id of the request.

@@ -957,7 +959,7 @@ async def encode(
         """
         async for output in await self.add_request(
                 request_id,
-                prompt,
+                inputs,
                 pooling_params,
                 lora_request=lora_request,
                 trace_headers=trace_headers,
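
For reference, a minimal sketch of calling AsyncLLMEngine.generate with the restored inputs parameter; the model name and request id are placeholders, not taken from this diff:

import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main():
    # Placeholder model name.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))

    # After the revert, the prompt goes in through `inputs: PromptInputs`.
    async for output in engine.generate(inputs="Hello my name is",
                                        sampling_params=SamplingParams(
                                            max_tokens=16),
                                        request_id="request-0"):
        print(output.outputs[0].text)


asyncio.run(main())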

vllm/engine/llm_engine.py

Lines changed: 5 additions & 4 deletions

@@ -29,7 +29,7 @@
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs,
-                         InputRegistry, LLMInputs, PromptType)
+                         InputRegistry, LLMInputs, PromptInputs)
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest

@@ -689,7 +689,7 @@ def stop_remote_worker_execution_loop(self) -> None:
     def add_request(
         self,
         request_id: str,
-        prompt: PromptType,
+        inputs: PromptInputs,
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,

@@ -704,7 +704,8 @@ def add_request(

         Args:
             request_id: The unique ID of the request.
-            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
+            inputs: The inputs to the LLM. See
+                :class:`~vllm.inputs.PromptInputs`
                 for more details about the format of each input.
             params: Parameters for sampling or pooling.
                 :class:`~vllm.SamplingParams` for text generation.

@@ -744,7 +745,7 @@ def add_request(
             arrival_time = time.time()

         preprocessed_inputs = self.input_preprocessor.preprocess(
-            prompt,
+            inputs,
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,

vllm/engine/multiprocessing/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -3,7 +3,7 @@
 from typing import List, Mapping, Optional, Union

 from vllm import PoolingParams
-from vllm.inputs import PromptType
+from vllm.inputs import PromptInputs
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest

@@ -23,7 +23,7 @@ class MQEngineDeadError(RuntimeError):

 @dataclass
 class RPCProcessRequest:
-    prompt: PromptType
+    inputs: PromptInputs
     params: Union[SamplingParams, PoolingParams]
     request_id: str
     lora_request: Optional[LoRARequest] = None

vllm/engine/multiprocessing/client.py

Lines changed: 11 additions & 9 deletions

@@ -25,7 +25,7 @@
                                          RPCStartupResponse)
 # yapf: enable
 from vllm.envs import VLLM_RPC_TIMEOUT
-from vllm.inputs import PromptType
+from vllm.inputs import PromptInputs
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput

@@ -375,7 +375,7 @@ def dead_error(self) -> BaseException:

     def generate(
         self,
-        prompt: PromptType,
+        inputs: PromptInputs,
         sampling_params: SamplingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,

@@ -389,7 +389,8 @@ def generate(
         from the LLMEngine to the caller.

         Args:
-            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
+            inputs: The inputs to the LLM. See
+                :class:`~vllm.inputs.PromptInputs`
                 for more details about the format of each input.
             sampling_params: The sampling parameters of the request.
             request_id: The unique id of the request.

@@ -398,13 +399,13 @@ def generate(
             prompt_adapter_request: Prompt Adapter request to use
                 for generation, if any.
         """
-        return self._process_request(prompt, sampling_params, request_id,
+        return self._process_request(inputs, sampling_params, request_id,
                                      lora_request, trace_headers,
                                      prompt_adapter_request)

     def encode(
         self,
-        prompt: PromptType,
+        inputs: PromptInputs,
         pooling_params: PoolingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,

@@ -417,7 +418,8 @@ def encode(
         from the LLMEngine to the caller.

         Args:
-            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
+            inputs: The inputs to the LLM. See
+                :class:`~vllm.inputs.PromptInputs`
                 for more details about the format of each input.
             pooling_params: The pooling parameters of the request.
             request_id: The unique id of the request.

@@ -428,12 +430,12 @@ def encode(
             The output `EmbeddingRequestOutput` objects from the LLMEngine
             for the request.
         """
-        return self._process_request(prompt, pooling_params, request_id,
+        return self._process_request(inputs, pooling_params, request_id,
                                      lora_request, trace_headers)

     async def _process_request(
         self,
-        prompt: PromptType,
+        inputs: PromptInputs,
         params: Union[SamplingParams, PoolingParams],
         request_id: str,
         lora_request: Optional[LoRARequest] = None,

@@ -466,7 +468,7 @@ async def _process_request(

         request_bytes = pickle.dumps(
             RPCProcessRequest(
-                prompt=prompt,
+                inputs=inputs,
                 params=params,
                 request_id=request_id,
                 lora_request=lora_request,

vllm/engine/multiprocessing/engine.py

Lines changed: 1 addition & 1 deletion

@@ -252,7 +252,7 @@ def _handle_process_request(self, request: RPCProcessRequest):
         try:
             self.engine.add_request(
                 request_id=request_id,
-                prompt=request.prompt,
+                inputs=request.inputs,
                 params=request.params,
                 lora_request=request.lora_request,
                 trace_headers=request.trace_headers,

0 commit comments
