
Commit 5d8cec2

DarkLight1337 authored and LeiWang1999 committed
[Model] Explicit interface for vLLM models and support OOT embedding models (vllm-project#9108)
Signed-off-by: LeiWang1999 <leiwang1999@outlook.com>
1 parent 152e363 commit 5d8cec2

File tree

10 files changed: +342 -37 lines changed

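In short, this commit (a) introduces an explicit base interface for vLLM models, exposing VllmModelForTextGeneration and VllmModelForEmbedding together with the is_text_generation_model and is_embedding_model predicates, and (b) extends the out-of-tree (OOT) plugin machinery and tests so that embedding models can be registered from outside the vLLM tree. A minimal sketch of the resulting public surface, pieced together from the diffs below (running it outside vLLM's own test environment is an assumption):

from vllm.model_executor.models import (ModelRegistry, is_embedding_model,
                                        is_text_generation_model)

# Resolve a registered architecture name to its model class, then query
# the explicit interface predicates added by this commit.
model_cls, _ = ModelRegistry.resolve_model_cls("LlamaForCausalLM")
assert is_text_generation_model(model_cls)
assert not is_embedding_model(model_cls)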

tests/conftest.py

Lines changed: 20 additions & 0 deletions
@@ -871,6 +871,7 @@ def num_gpus_available():
 temp_dir = tempfile.gettempdir()
 _dummy_opt_path = os.path.join(temp_dir, "dummy_opt")
 _dummy_llava_path = os.path.join(temp_dir, "dummy_llava")
+_dummy_gemma2_embedding_path = os.path.join(temp_dir, "dummy_gemma2_embedding")
 
 
 @pytest.fixture
@@ -909,3 +910,22 @@ def dummy_llava_path():
     with open(json_path, "w") as f:
         json.dump(config, f)
     return _dummy_llava_path
+
+
+@pytest.fixture
+def dummy_gemma2_embedding_path():
+    json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json")
+    if not os.path.exists(_dummy_gemma2_embedding_path):
+        snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
+                          local_dir=_dummy_gemma2_embedding_path,
+                          ignore_patterns=[
+                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
+                              "*.msgpack"
+                          ])
+    assert os.path.exists(json_path)
+    with open(json_path, "r") as f:
+        config = json.load(f)
+    config["architectures"] = ["MyGemma2Embedding"]
+    with open(json_path, "w") as f:
+        json.dump(config, f)
+    return _dummy_gemma2_embedding_path

tests/models/test_oot_registration.py

Lines changed: 15 additions & 3 deletions
@@ -2,7 +2,7 @@
 
 import pytest
 
-from vllm import LLM, SamplingParams
+from vllm import LLM, PoolingParams, SamplingParams
 from vllm.assets.image import ImageAsset
 
 from ..utils import fork_new_process_for_each_test
@@ -17,7 +17,7 @@ def test_plugin(dummy_opt_path):
 
 
 @fork_new_process_for_each_test
-def test_oot_registration(dummy_opt_path):
+def test_oot_registration_text_generation(dummy_opt_path):
     os.environ["VLLM_PLUGINS"] = "register_dummy_model"
     prompts = ["Hello, my name is", "The text does not matter"]
     sampling_params = SamplingParams(temperature=0)
@@ -32,11 +32,23 @@ def test_oot_registration(dummy_opt_path):
     assert rest == ""
 
 
+@fork_new_process_for_each_test
+def test_oot_registration_embedding(dummy_gemma2_embedding_path):
+    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
+    prompts = ["Hello, my name is", "The text does not matter"]
+    sampling_params = PoolingParams()
+    llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
+    outputs = llm.encode(prompts, sampling_params)
+
+    for output in outputs:
+        assert all(v == 0 for v in output.outputs.embedding)
+
+
 image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
 
 
 @fork_new_process_for_each_test
-def test_oot_multimodal_registration(dummy_llava_path):
+def test_oot_registration_multimodal(dummy_llava_path):
     os.environ["VLLM_PLUGINS"] = "register_dummy_model"
     prompts = [{
         "prompt": "What's in the image?<image>",

tests/models/test_registry.py

Lines changed: 22 additions & 2 deletions
@@ -3,7 +3,14 @@
 import pytest
 import torch.cuda
 
-from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.models import (is_embedding_model,
+                                        is_text_generation_model,
+                                        supports_multimodal)
+from vllm.model_executor.models.registry import (_EMBEDDING_MODELS,
+                                                 _MULTIMODAL_MODELS,
+                                                 _SPECULATIVE_DECODING_MODELS,
+                                                 _TEXT_GENERATION_MODELS,
+                                                 ModelRegistry)
 from vllm.platforms import current_platform
 
 from ..utils import fork_new_process_for_each_test
@@ -12,7 +19,20 @@
 @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
 def test_registry_imports(model_arch):
     # Ensure all model classes can be imported successfully
-    ModelRegistry.resolve_model_cls(model_arch)
+    model_cls, _ = ModelRegistry.resolve_model_cls(model_arch)
+
+    if model_arch in _SPECULATIVE_DECODING_MODELS:
+        pass  # Ignore these models which do not have a unified format
+    else:
+        assert is_text_generation_model(model_cls) is (
+            model_arch in _TEXT_GENERATION_MODELS
+            or model_arch in _MULTIMODAL_MODELS)
+
+        assert is_embedding_model(model_cls) is (model_arch
+                                                 in _EMBEDDING_MODELS)
+
+        assert supports_multimodal(model_cls) is (model_arch
+                                                  in _MULTIMODAL_MODELS)
 
 
 @fork_new_process_for_each_test
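The assertions above pin down the invariant the new predicates are meant to satisfy: outside the speculative-decoding models, is_text_generation_model, is_embedding_model, and supports_multimodal classify a resolved class exactly according to the registry tables. As a hedged illustration (not part of the commit), the same predicates can enumerate embedding-capable architectures through the public API:

from vllm.model_executor.models import ModelRegistry, is_embedding_model

# Resolving every arch imports every model class, exactly as
# test_registry_imports does; here we reuse that to list the
# architectures whose classes implement the embedding interface.
embedding_archs = [
    arch for arch in ModelRegistry.get_supported_archs()
    if is_embedding_model(ModelRegistry.resolve_model_cls(arch)[0])
]
print(embedding_archs)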

tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -9,6 +9,12 @@ def register():
     ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)
 
     # Test passing lazy model
+    if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs():
+        ModelRegistry.register_model(
+            "MyGemma2Embedding",
+            "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding",
+        )
+
     if "MyLlava" not in ModelRegistry.get_supported_archs():
         ModelRegistry.register_model("MyLlava",
                                      "vllm_add_dummy_model.my_llava:MyLlava")
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+from typing import List, Optional, Union
+
+import torch
+
+from vllm.attention import AttentionMetadata
+from vllm.model_executor.models.gemma2_embedding import Gemma2EmbeddingModel
+from vllm.sequence import IntermediateTensors
+
+
+class MyGemma2Embedding(Gemma2EmbeddingModel):
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = super().forward(
+            input_ids,
+            positions,
+            kv_caches,
+            attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+
+        if isinstance(hidden_states, IntermediateTensors):
+            return hidden_states
+
+        # Return all-zero embeddings
+        return torch.zeros_like(hidden_states)
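Note that register() in the plugin's __init__.py can pass either a class object (as with MyOPTForCausalLM) or a lazy "module:ClassName" string, as it does for MyGemma2Embedding, which defers importing this module until the architecture is actually resolved. For vLLM to discover the plugin at all, the package must expose register() via an entry point; the following setup.py sketch shows the general shape (the vllm.general_plugins group name comes from vLLM's plugin loader, but the exact packaging shown here is an assumption, not part of this diff):

from setuptools import setup

setup(
    name="vllm_add_dummy_model",
    version="0.1",
    packages=["vllm_add_dummy_model"],
    # vLLM's plugin loader scans this entry-point group; the entry-point
    # name is what VLLM_PLUGINS selects in the tests above.
    entry_points={
        "vllm.general_plugins":
        ["register_dummy_model = vllm_add_dummy_model:register"]
    },
)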

vllm/model_executor/models/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -1,10 +1,17 @@
 from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
                          SupportsPP, has_inner_state, supports_lora,
                          supports_multimodal, supports_pp)
+from .interfaces_base import (VllmModelForEmbedding,
+                              VllmModelForTextGeneration, is_embedding_model,
+                              is_text_generation_model)
 from .registry import ModelRegistry
 
 __all__ = [
     "ModelRegistry",
+    "VllmModelForEmbedding",
+    "is_embedding_model",
+    "VllmModelForTextGeneration",
+    "is_text_generation_model",
     "HasInnerState",
     "has_inner_state",
     "SupportsLoRA",

vllm/model_executor/models/interfaces.py

Lines changed: 7 additions & 21 deletions
@@ -1,14 +1,13 @@
-import inspect
 from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
                     Protocol, Type, Union, overload, runtime_checkable)
 
 import torch
 from typing_extensions import TypeIs
 
 from vllm.logger import init_logger
+from vllm.utils import supports_kw
 
 if TYPE_CHECKING:
-    from vllm.attention import AttentionMetadata
     from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig
     from vllm.sequence import IntermediateTensors
 
@@ -142,9 +141,7 @@ def supports_lora(
     return result
 
 
-def _supports_lora(
-    model: Union[Type[object], object],
-) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]:
+def _supports_lora(model: Union[Type[object], object]) -> bool:
     if isinstance(model, type):
         return isinstance(model, _SupportsLoRAType)
 
@@ -175,10 +172,7 @@ def make_empty_intermediate_tensors(
 
     def forward(
         self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: "AttentionMetadata",
+        *,
         intermediate_tensors: Optional["IntermediateTensors"],
    ) -> Union[torch.Tensor, "IntermediateTensors"]:
         """
@@ -205,10 +199,7 @@ def make_empty_intermediate_tensors(
 
     def forward(
         self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: "AttentionMetadata",
+        *,
         intermediate_tensors: Optional["IntermediateTensors"],
     ) -> Union[torch.Tensor, "IntermediateTensors"]:
         ...
@@ -257,24 +248,19 @@ def supports_pp(
     return supports_attributes and supports_inspect
 
 
-def _supports_pp_attributes(
-    model: Union[Type[object], object],
-) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]:
+def _supports_pp_attributes(model: Union[Type[object], object]) -> bool:
    if isinstance(model, type):
         return isinstance(model, _SupportsPPType)
 
     return isinstance(model, SupportsPP)
 
 
-def _supports_pp_inspect(
-    model: Union[Type[object], object],
-) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]:
+def _supports_pp_inspect(model: Union[Type[object], object]) -> bool:
     model_forward = getattr(model, "forward", None)
     if not callable(model_forward):
         return False
 
-    forward_params = inspect.signature(model_forward).parameters
-    return "intermediate_tensors" in forward_params
+    return supports_kw(model_forward, "intermediate_tensors")
 
 
 @runtime_checkable
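The hand-rolled inspect.signature check is replaced by the vllm.utils.supports_kw helper. A standalone sketch of what such a helper plausibly covers (an illustration of the idea, not the vLLM source): a keyword counts as supported if it is a named parameter, or if the callable accepts **kwargs.

import inspect
from typing import Callable

def supports_kw_sketch(func: Callable, kw_name: str) -> bool:
    # Accept the keyword if it is a named parameter, or if the callable
    # takes **kwargs and could therefore receive it anyway.
    params = inspect.signature(func).parameters
    if kw_name in params:
        return True
    return any(p.kind is inspect.Parameter.VAR_KEYWORD
               for p in params.values())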
