
Commit 8a924d2

[Doc] Guide for adding multi-modal plugins (#6205)
1 parent 5ed3505 commit 8a924d2

File tree

7 files changed (+64, -23 lines)


docs/source/_templates/sections/header.html

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
     justify-content: center;
     align-items: center;
     font-size: 16px;
+    padding: 0 6px 0 6px;
 }
 .notification-bar p {
     margin: 0;
docs/source/dev/multimodal/adding_multimodal_plugin.rst

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+.. _adding_multimodal_plugin:
+
+Adding a Multimodal Plugin
+==========================
+
+This document teaches you how to add a new modality to vLLM.
+
+Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.
+
+The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.
+
+.. note::
+    This article is a work in progress.
+
+..
+    TODO: Add more instructions on how to add new plugins once embeddings is in.
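The new guide stops at a work-in-progress note, so here is a hedged sketch of the workflow it describes: subclass MultiModalPlugin and hand an instance to MULTIMODAL_REGISTRY.register_plugin. The AudioPlugin class, the "audio" data key, and the _default_input_mapper hook are illustrative assumptions (the exact abstract methods may differ by vLLM version); only MultiModalPlugin, MultiModalInputs, and register_plugin come from the code touched in this commit.

# Hedged sketch only: a hypothetical "audio" modality plugin. get_data_key()
# is shown in this commit (see ImagePlugin); _default_input_mapper() is an
# assumed per-plugin hook and may be named differently in your vLLM version.
import torch

from vllm.inputs.registry import InputContext
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal.base import MultiModalPlugin


class AudioPlugin(MultiModalPlugin):
    """Hypothetical plugin for raw audio waveforms."""

    def get_data_key(self) -> str:
        # Key under which users pass this modality in ``multi_modal_data``.
        return "audio"

    def _default_input_mapper(self, ctx: InputContext,
                              data: object) -> MultiModalInputs:
        # Turn the raw waveform into the keyword arguments the model expects.
        waveform = torch.as_tensor(data, dtype=torch.float32)
        return MultiModalInputs({"audio_features": waveform.unsqueeze(0)})


# Make vLLM aware of the new modality.
MULTIMODAL_REGISTRY.register_plugin(AudioPlugin())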

docs/source/dev/multimodal/multimodal_index.rst

Lines changed: 16 additions & 8 deletions
@@ -7,17 +7,21 @@ Multi-Modality
 
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
 
-Multi-modal input can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
+Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
 via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.
 
-.. note::
-    ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
-    the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
+by following :ref:`this guide <adding_multimodal_plugin>`.
 
-To implement a new multi-modal model in vLLM, please follow :ref:`this guide <enabling_multimodal_inputs>`.
+Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.
 
-..
-    TODO: Add more instructions on how to add new plugins once embeddings is in.
+Guides
+++++++
+
+.. toctree::
+   :maxdepth: 1
+
+   adding_multimodal_plugin
 
 Module Contents
 +++++++++++++++
@@ -36,10 +40,14 @@ Registry
 Base Classes
 ------------
 
-.. autoclass:: vllm.multimodal.MultiModalDataDict
+.. autodata:: vllm.multimodal.BatchedTensors
+
+.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
     :members:
     :show-inheritance:
 
+.. autodata:: vllm.multimodal.MultiModalDataDict
+
 .. autoclass:: vllm.multimodal.MultiModalInputs
     :members:
     :show-inheritance:
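Since this index page is the entry point for multi-modal support, a short usage sketch of the ``multi_modal_data`` field it documents may help; the model name, prompt template, and image path below are assumptions, not part of this commit.

# Hedged sketch: passing an image through the built-in "image" key of
# multi_modal_data. Adjust the model and prompt format to your checkpoint.
from PIL import Image
from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")  # any supported vision-language model
image = Image.open("example.jpg")

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)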

vllm/multimodal/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,5 @@
-from .base import (BatchedTensors, MultiModalDataDict, MultiModalInputs,
-                   MultiModalPlugin)
+from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict,
+                   MultiModalInputs, MultiModalPlugin)
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -13,6 +13,7 @@
 
 __all__ = [
     "BatchedTensors",
+    "MultiModalDataBuiltins",
     "MultiModalDataDict",
     "MultiModalInputs",
     "MultiModalPlugin",

vllm/multimodal/base.py

Lines changed: 13 additions & 8 deletions
@@ -43,9 +43,6 @@ def try_concat(
     *,
     device: torch.types.Device,
 ) -> BatchedTensors:
-    # Avoid initializing CUDA too early
-    import torch
-
     unbatched_shape = tensors[0].shape[1:]
 
     for tensor in tensors:
@@ -84,16 +81,21 @@ def batch(
 
 
 class MultiModalDataBuiltins(TypedDict, total=False):
+    """Modality types that are predefined by vLLM."""
+
     image: Image.Image
+    """The input image."""
 
 
 MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]]
 """
 A dictionary containing an item for each modality type to input.
 
-The data belonging to each modality is converted into keyword arguments
-to the model by the corresponding mapper. By default, the mapper of
-the corresponding plugin with the same modality key is applied.
+Note:
+    This dictionary also accepts modality keys defined outside
+    :class:`MultiModalDataBuiltins` as long as a customized plugin is registered
+    through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+    Read more on that :ref:`here <adding_multimodal_plugin>`.
 """
 
 MultiModalInputMapper = Callable[[InputContext, object], MultiModalInputs]
@@ -123,6 +125,9 @@ class MultiModalPlugin(ABC):
     process the same data differently). This registry is in turn used by
     :class:`~MultiModalRegistry` which acts at a higher level
     (i.e., the modality of the data).
+
+    See also:
+        :ref:`adding_multimodal_plugin`
     """
 
     def __init__(self) -> None:
@@ -183,8 +188,8 @@ def wrapper(model_cls: N) -> N:
     def map_input(self, model_config: ModelConfig,
                   data: object) -> MultiModalInputs:
         """
-        Apply an input mapper to a data passed
-        to the model, transforming the data into a dictionary of model inputs.
+        Transform the data into a dictionary of model inputs using the
+        input mapper registered for that model.
 
         The model is identified by ``model_config``.
 
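The rewritten MultiModalDataDict docstring above says custom modality keys are accepted once a matching plugin is registered; a small sketch of the two accepted shapes follows. The "audio" key and its waveform value are hypothetical, carried over from the plugin sketch earlier on this page.

# Hedged sketch of the two shapes MultiModalDataDict accepts. The "audio"
# entry only works if a plugin with that data key has been registered.
from PIL import Image

from vllm.multimodal import MultiModalDataDict

# Built-in modality: matches the MultiModalDataBuiltins TypedDict.
builtin_data: MultiModalDataDict = {"image": Image.open("example.jpg")}

# Custom modality: falls back to the Dict[str, Any] side of the Union and is
# dispatched to whichever plugin registered the "audio" key.
custom_data: MultiModalDataDict = {"audio": [0.0, 0.1, -0.2]}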

vllm/multimodal/image.py

Lines changed: 1 addition & 0 deletions
@@ -100,6 +100,7 @@ def repeat_and_pad_image_tokens(
 
 
 class ImagePlugin(MultiModalPlugin):
+    """Plugin for image data."""
 
     def get_data_key(self) -> str:
         return "image"

vllm/multimodal/registry.py

Lines changed: 13 additions & 5 deletions
@@ -15,10 +15,8 @@
 
 class MultiModalRegistry:
     """
-    A registry to dispatch data processing
-    according to its modality and the target model.
-
-    The registry handles both external and internal data input.
+    A registry that dispatches data processing to the
+    :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
     """
 
     DEFAULT_PLUGINS = (ImagePlugin(), )
@@ -30,6 +28,12 @@ def __init__(
         self._plugins = {p.get_data_key(): p for p in plugins}
 
     def register_plugin(self, plugin: MultiModalPlugin) -> None:
+        """
+        Register a multi-modal plugin so it can be recognized by vLLM.
+
+        See also:
+            :ref:`adding_multimodal_plugin`
+        """
         data_type_key = plugin.get_data_key()
 
         if data_type_key in self._plugins:
@@ -75,7 +79,11 @@ def map_input(self, model_config: ModelConfig,
                   data: MultiModalDataDict) -> MultiModalInputs:
         """
         Apply an input mapper to the data passed to the model.
-
+
+        The data belonging to each modality is passed to the corresponding
+        plugin, which in turn converts the data into keyword arguments
+        via the input mapper registered for that model.
+
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
         merged_dict: Dict[str, torch.Tensor] = {}
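The expanded map_input docstring describes per-modality dispatch; below is a simplified, self-contained sketch of that pattern. It is not vLLM's implementation (names and conflict handling are illustrative), just the shape of the idea.

# Standalone sketch of per-modality dispatch: each key in the input dict is
# routed to the mapper registered for that modality and the resulting kwargs
# are merged, mirroring the docstring above. Not vLLM's actual code.
from typing import Any, Callable, Dict

Mapper = Callable[[Any], Dict[str, Any]]


class ToyRegistry:
    def __init__(self) -> None:
        self._plugins: Dict[str, Mapper] = {}

    def register_plugin(self, key: str, mapper: Mapper) -> None:
        # The real registry checks for already-registered keys first
        # (see the diff above).
        self._plugins[key] = mapper

    def map_input(self, data: Dict[str, Any]) -> Dict[str, Any]:
        merged: Dict[str, Any] = {}
        for key, value in data.items():
            merged.update(self._plugins[key](value))
        return merged


registry = ToyRegistry()
registry.register_plugin("image", lambda img: {"pixel_values": img})
print(registry.map_input({"image": "<tensor placeholder>"}))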
