# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- from typing import TYPE_CHECKING, List, Union, cast
+ from typing import TYPE_CHECKING, Any, Callable, List, Union, cast

import torch
from torch import Tensor
@@ -41,6 +41,24 @@ def _download_clip_for_clip_score() -> None:
    _CLIPProcessor = None


+ class JinaProcessorWrapper:
+     """Wrapper class that converts tensors to PIL images if needed for the Jina CLIP model."""
+
+     def __init__(self, processor: _CLIPProcessor) -> None:
+         self.processor = processor
+
+     def __call__(self, *args: Any, **kwargs: Any) -> Any:
+         """Wrap the processor's __call__ method, converting tensors to PIL images if needed."""
+         # Check if 'images' is in kwargs and convert any tensors to PIL images before delegating
+         from torchvision.transforms.functional import to_pil_image
+
+         if "images" in kwargs:
+             kwargs["images"] = [
+                 to_pil_image(img.float().cpu()) if isinstance(img, Tensor) else img for img in kwargs["images"]
+             ]
+         return self.processor(*args, **kwargs)
+
+
def _detect_modality(input_data: Union[Tensor, List[Tensor], List[str], str]) -> Literal["image", "text"]:
    """Automatically detect the modality of the input data.
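As a quick illustration of the wrapper's behaviour (not part of the diff): assuming the `JinaProcessorWrapper` class added above is in scope, the dummy `_EchoProcessor` below is a purely illustrative stand-in for the real Jina processor; it only reports the types it receives, showing that tensor inputs arrive as PIL images.

```python
import torch
from torchvision.transforms.functional import to_pil_image


class _EchoProcessor:
    """Stand-in for the real Jina processor; it just reports the types of the images it receives."""

    def __call__(self, *args, **kwargs):
        return [type(img).__name__ for img in kwargs.get("images", [])]


wrapped = JinaProcessorWrapper(_EchoProcessor())
tensor_img = torch.rand(3, 224, 224)  # float tensor with values in [0, 1]
pil_img = to_pil_image(torch.rand(3, 64, 64))  # already a PIL image

print(wrapped(images=[tensor_img, pil_img]))  # ['Image', 'Image'] -- the tensor was converted to PIL
```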
@@ -110,22 +128,22 @@ def _get_features(
    """
    if modality == "image":
-         # Add type checking for images
-         image_data = [i for i in data if isinstance(i, Tensor)]
+         image_data = [i for i in data if isinstance(i, Tensor)]  # Add type checking for images
        processed = processor(images=[i.cpu() for i in image_data], return_tensors="pt", padding=True)
        return model.get_image_features(processed["pixel_values"].to(device))
    if modality == "text":
        processed = processor(text=data, return_tensors="pt", padding=True)
-         max_position_embeddings = model.config.text_config.max_position_embeddings
-         if processed["attention_mask"].shape[-1] > max_position_embeddings:
-             rank_zero_warn(
-                 f"Encountered caption longer than {max_position_embeddings=}. Will truncate captions to this length."
-                 "If longer captions are needed, initialize argument `model_name_or_path` with a model that supports"
-                 "longer sequences",
-                 UserWarning,
-             )
-             processed["attention_mask"] = processed["attention_mask"][..., :max_position_embeddings]
-             processed["input_ids"] = processed["input_ids"][..., :max_position_embeddings]
+         if hasattr(model.config, "text_config") and hasattr(model.config.text_config, "max_position_embeddings"):
+             max_position_embeddings = model.config.text_config.max_position_embeddings
+             if processed["attention_mask"].shape[-1] > max_position_embeddings:
+                 rank_zero_warn(
+                     f"Encountered caption longer than {max_position_embeddings=}. Will truncate captions to this "
+                     "length. If longer captions are needed, initialize argument `model_name_or_path` with a model "
+                     "that supports longer sequences.",
+                     UserWarning,
+                 )
+                 processed["attention_mask"] = processed["attention_mask"][..., :max_position_embeddings]
+                 processed["input_ids"] = processed["input_ids"][..., :max_position_embeddings]
        return model.get_text_features(processed["input_ids"].to(device), processed["attention_mask"].to(device))
    raise ValueError(f"invalid modality {modality}")
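For readers unfamiliar with the truncation step, here is a standalone sketch of what the slicing above does; the tensors are dummies rather than real tokenizer output, and 77 is the limit of the default OpenAI CLIP text encoder.

```python
import torch

# Dummy tokenizer output that is longer than the model's position limit
processed = {
    "input_ids": torch.randint(0, 1000, (1, 100)),
    "attention_mask": torch.ones(1, 100, dtype=torch.long),
}
max_position_embeddings = 77  # e.g. openai/clip-vit-large-patch14

if processed["attention_mask"].shape[-1] > max_position_embeddings:
    # Keep only the first `max_position_embeddings` tokens along the sequence dimension
    processed["attention_mask"] = processed["attention_mask"][..., :max_position_embeddings]
    processed["input_ids"] = processed["input_ids"][..., :max_position_embeddings]

print(processed["input_ids"].shape)  # torch.Size([1, 77])
```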
@@ -136,6 +154,7 @@ def _clip_score_update(
    model: _CLIPModel,
    processor: _CLIPProcessor,
) -> tuple[Tensor, int]:
+     """Update function for CLIP Score."""
    source_modality = _detect_modality(source)
    target_modality = _detect_modality(target)
@@ -181,19 +200,43 @@ def _clip_score_update(
def _get_clip_model_and_processor(
-     model_name_or_path: Literal[
-         "openai/clip-vit-base-patch16",
-         "openai/clip-vit-base-patch32",
-         "openai/clip-vit-large-patch14-336",
-         "openai/clip-vit-large-patch14",
-     ] = "openai/clip-vit-large-patch14",
+     model_name_or_path: Union[
+         Literal[
+             "openai/clip-vit-base-patch16",
+             "openai/clip-vit-base-patch32",
+             "openai/clip-vit-large-patch14-336",
+             "openai/clip-vit-large-patch14",
+             "jinaai/jina-clip-v2",
+             "zer0int/LongCLIP-L-Diffusers",
+             "zer0int/LongCLIP-GmP-ViT-L-14",
+         ],
+         Callable[[], tuple[_CLIPModel, _CLIPProcessor]],
+     ],
) -> tuple[_CLIPModel, _CLIPProcessor]:
+     if callable(model_name_or_path):
+         return model_name_or_path()
+
    if _TRANSFORMERS_GREATER_EQUAL_4_10:
+         from transformers import AutoModel, AutoProcessor
+         from transformers import CLIPConfig as _CLIPConfig
        from transformers import CLIPModel as _CLIPModel
        from transformers import CLIPProcessor as _CLIPProcessor

-         model = _CLIPModel.from_pretrained(model_name_or_path)
-         processor = _CLIPProcessor.from_pretrained(model_name_or_path)
+         if "openai" in model_name_or_path:
+             model = _CLIPModel.from_pretrained(model_name_or_path)
+             processor = _CLIPProcessor.from_pretrained(model_name_or_path)
+         elif "jinaai" in model_name_or_path:
+             model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
+             processor = JinaProcessorWrapper(
+                 processor=AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
+             )
+         elif "zer0int" in model_name_or_path:
+             config = _CLIPConfig.from_pretrained(model_name_or_path)
+             config.text_config.max_position_embeddings = 248
+             model = _CLIPModel.from_pretrained(model_name_or_path, config=config)
+             processor = _CLIPProcessor.from_pretrained(model_name_or_path, padding="max_length", max_length=248)
+         else:
+             raise ValueError(f"Invalid model_name_or_path {model_name_or_path}. Not supported by `clip_score` metric.")
        return model, processor

    raise ModuleNotFoundError(
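Since `model_name_or_path` may now be a callable that is invoked before any of the name-based branching above, a rough sketch of such a loader might look like the following; the checkpoint name is only an example, and any model exposing `get_image_features`/`get_text_features` with a matching processor should work.

```python
from transformers import CLIPModel, CLIPProcessor


def my_clip_loader():
    """Illustrative loader returning a (model, processor) pair compatible with the CLIP interface."""
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    return model, processor


# The callable short-circuits the name-based branching:
# model, processor = _get_clip_model_and_processor(my_clip_loader)
```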
@@ -205,11 +248,17 @@ def _get_clip_model_and_processor(
def clip_score(
    source: Union[Tensor, List[Tensor], List[str], str],
    target: Union[Tensor, List[Tensor], List[str], str],
-     model_name_or_path: Literal[
-         "openai/clip-vit-base-patch16",
-         "openai/clip-vit-base-patch32",
-         "openai/clip-vit-large-patch14-336",
-         "openai/clip-vit-large-patch14",
+     model_name_or_path: Union[
+         Literal[
+             "openai/clip-vit-base-patch16",
+             "openai/clip-vit-base-patch32",
+             "openai/clip-vit-large-patch14-336",
+             "openai/clip-vit-large-patch14",
+             "jinaai/jina-clip-v2",
+             "zer0int/LongCLIP-L-Diffusers",
+             "zer0int/LongCLIP-GmP-ViT-L-14",
+         ],
+         Callable[[], tuple[_CLIPModel, _CLIPProcessor]],
    ] = "openai/clip-vit-large-patch14",
) -> Tensor:
    r"""Calculates `CLIP Score`_ which is a text-to-image similarity metric.
@@ -239,6 +288,11 @@ def clip_score(
    .. note:: Metric is not scriptable

+     .. note::
+         The default CLIP model and processor used in this implementation have a maximum sequence length of 77 for
+         text inputs. If you need to process longer captions, you can use the `zer0int/LongCLIP-L-Diffusers` model,
+         which has a maximum sequence length of 248.
+
    Args:
        source: Source input. This can be:
            - Images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors.
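The 77-token limit mentioned in the added note can be read off a checkpoint's Hugging Face config; a small check, using the default checkpoint as an example:

```python
from transformers import CLIPConfig

# Inspect the text tower's maximum sequence length for a given checkpoint
config = CLIPConfig.from_pretrained("openai/clip-vit-large-patch14")
print(config.text_config.max_position_embeddings)  # 77
```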
@@ -251,7 +305,14 @@ def clip_score(
            - `"openai/clip-vit-base-patch32"`
            - `"openai/clip-vit-large-patch14-336"`
            - `"openai/clip-vit-large-patch14"`
-
+             - `"jinaai/jina-clip-v2"`
+             - `"zer0int/LongCLIP-L-Diffusers"`
+             - `"zer0int/LongCLIP-GmP-ViT-L-14"`
+
+             Alternatively, a callable that returns a tuple of a CLIP-compatible model and processor instance can be
+             passed in. By compatible, we mean that the processor's `__call__` method should accept a list of strings
+             and a list of images, and that the model should have `get_image_features` and `get_text_features`
+             methods.

    Raises:
        ModuleNotFoundError:
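To close, a hedged end-to-end sketch of calling the updated metric with one of the newly supported checkpoints; it assumes a torchmetrics build that includes this change, and the model weights are downloaded on first use.

```python
import torch
from torchmetrics.functional.multimodal import clip_score

# Two random "images" and their captions; in practice these would be generated or real images
images = torch.randint(0, 255, (2, 3, 224, 224), dtype=torch.uint8)
captions = ["a photo of a cat", "a photo of a dog"]

score = clip_score(images, captions, model_name_or_path="zer0int/LongCLIP-L-Diffusers")
print(score)  # scalar tensor with the average CLIP score
```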