From f9c3fbeee312db862bd10dda4e0064e24d21c27f Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sat, 5 Nov 2022 09:56:14 +0100 Subject: [PATCH 01/34] first steps --- docs/source/links.rst | 1 + src/torchmetrics/multimodal/__init__.py | 17 ++++++ src/torchmetrics/multimodal/clip_score.py | 65 +++++++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 src/torchmetrics/multimodal/__init__.py create mode 100644 src/torchmetrics/multimodal/clip_score.py diff --git a/docs/source/links.rst b/docs/source/links.rst index b305ac49fd8..28dd8dad06c 100644 --- a/docs/source/links.rst +++ b/docs/source/links.rst @@ -95,3 +95,4 @@ .. _Fisher-Rao distance: http://www.scholarpedia.org/article/Fisher-Rao_metric .. _Kendall Rank Correlation Coefficient: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient .. _The Treatment of Ties in Ranking Problems: https://www.jstor.org/stable/2332303 +.. _CLIP score: https://arxiv.org/pdf/2104.08718.pdf diff --git a/src/torchmetrics/multimodal/__init__.py b/src/torchmetrics/multimodal/__init__.py new file mode 100644 index 00000000000..e5b4ad56ce2 --- /dev/null +++ b/src/torchmetrics/multimodal/__init__.py @@ -0,0 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE + +if _TRANSFORMERS_AVAILABLE: + from torchmetrics.multimodal.clip_score import CLIPScore # noqa: F401 diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py new file mode 100644 index 00000000000..f53b5e27997 --- /dev/null +++ b/src/torchmetrics/multimodal/clip_score.py @@ -0,0 +1,65 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import List, Union + +import torch +from torch import Tensor +from transformers import CLIPFeatureExtractor as _CLIPFeatureExtractor +from transformers import CLIPModel as _CLIPModel +from transformers import CLIPTokenizer as _CLIPTokenizer + +from torchmetrics import Metric + + +class CLIPScore(Metric): + def __init__(self, version="openai/clip-vit-large-patch14", **kwargs) -> None: + super().__init__(**kwargs) + self.tokenizer = _CLIPTokenizer.from_pretrained(version) + self.model = _CLIPModel.from_pretrained(version) + self.features = _CLIPFeatureExtractor.from_pretrained(version) + self.add_state("score", torch.tensor(0.0), dist_reduce_fx="sum") + self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") + + def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: + if not isinstance(images, List): + images = [images] + if not isinstance(text, List): + text = [text] + + img_features = [ + self.model.get_image_features(self.features(i, return_tensors="pt")["pixel_values"]) for i in images + ] + img_features = torch.cat(img_features, 0) + img_features = img_features / torch.linalg.norm(img_features, axis=-1, keepdims=True) + + txt_features = [ + self.model.get_text_features(**self.tokenizer(t, padding=True, return_tensors="pt")) for t in text + ] + txt_features = torch.cat(txt_features, 0) + txt_features = txt_features / torch.linalg.norm(txt_features, axis=-1, keepdims=True) + + # cosine similarity between feature vectors + score = (img_features * txt_features).sum(axis=-1) + self.score += score.sum(0) + self.n_samples += img_features.shape[0] + + def compute(self) -> Tensor: + return self.score / self.n_samples + + +if __name__ == "__main__": + img = torch.randint(255, (3, 224, 224)) + text = "min hest er meget flot" + metric = CLIPScore() + metric.update(img, text) From aaa3265d4c0b4082c1d3a4b5bb185014a2c1973d Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sat, 5 Nov 2022 15:45:12 +0100 Subject: [PATCH 02/34] further updates --- docs/source/links.rst | 1 + docs/source/multimodal/clip_score.rst | 14 ++++++ requirements/multimodal.txt | 1 + src/torchmetrics/multimodal/clip_score.py | 60 +++++++++++++++++------ 4 files changed, 62 insertions(+), 14 deletions(-) create mode 100644 docs/source/multimodal/clip_score.rst create mode 100644 requirements/multimodal.txt diff --git a/docs/source/links.rst b/docs/source/links.rst index 28dd8dad06c..ee227f9bd6d 100644 --- a/docs/source/links.rst +++ b/docs/source/links.rst @@ -96,3 +96,4 @@ .. _Kendall Rank Correlation Coefficient: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient .. _The Treatment of Ties in Ranking Problems: https://www.jstor.org/stable/2332303 .. _CLIP score: https://arxiv.org/pdf/2104.08718.pdf +.. _Huggingface OpenAI: https://huggingface.co/openai diff --git a/docs/source/multimodal/clip_score.rst b/docs/source/multimodal/clip_score.rst new file mode 100644 index 00000000000..8653308c9fb --- /dev/null +++ b/docs/source/multimodal/clip_score.rst @@ -0,0 +1,14 @@ +.. customcarditem:: + :header: CLIP Score + :image: https://pl-flash-data.s3.amazonaws.com/assets/thumbnails/image_classification.svg + :tags: Image + +############################################ +CLIP Score +############################################ + +Module Interface +________________ + +.. 
autoclass:: torchmetrics.multimodal.clip_score.CLIPScore + :noindex: diff --git a/requirements/multimodal.txt b/requirements/multimodal.txt new file mode 100644 index 00000000000..69d510cb9bc --- /dev/null +++ b/requirements/multimodal.txt @@ -0,0 +1 @@ +transformers>=4.4.0 diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index f53b5e27997..57f8896deec 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -15,25 +15,64 @@ import torch from torch import Tensor -from transformers import CLIPFeatureExtractor as _CLIPFeatureExtractor -from transformers import CLIPModel as _CLIPModel -from transformers import CLIPTokenizer as _CLIPTokenizer + +from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE + +if _TRANSFORMERS_AVAILABLE: + from transformers import CLIPFeatureExtractor as _CLIPFeatureExtractor + from transformers import CLIPModel as _CLIPModel + from transformers import CLIPTokenizer as _CLIPTokenizer +else: + _CLIPFeatureExtractor = None + _CLIPModel = None + _CLIPTokenizer = None from torchmetrics import Metric class CLIPScore(Metric): + """`CLIP Score`_ is a reference free metric that can be used to evaluate the correlation between an generated + caption for an image and the actual content of the image. It has been found to be highly correlated with human + judgement. The metric is defined as. + + .. math:: + \text{CLIPScore(I, C)} = \\max(cos(E_I, E_C), 0) + + which corresponds to the cosine similarity between visual CLIP embedding :math:`E_i` for an image :math:`i` and + textual CLIP embedding :math:`E_C` for an caption :math:`C`. + + Args: + version: string indicating the version of the CLIP model to use. See `Huggingface OpenAI`_ for more info + kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info. + """ + + is_differentiable: bool = False + higher_is_better: bool = True + full_state_update: bool = False + def __init__(self, version="openai/clip-vit-large-patch14", **kwargs) -> None: super().__init__(**kwargs) - self.tokenizer = _CLIPTokenizer.from_pretrained(version) - self.model = _CLIPModel.from_pretrained(version) - self.features = _CLIPFeatureExtractor.from_pretrained(version) + if _TRANSFORMERS_AVAILABLE: + self.tokenizer = _CLIPTokenizer.from_pretrained(version) + self.model = _CLIPModel.from_pretrained(version) + self.features = _CLIPFeatureExtractor.from_pretrained(version) + else: + raise ModuleNotFoundError( + "`CLIPScore` metric requires `transformers` package be installed." + " Either install with `pip install transformers>=4.0` or `pip install torchmetrics[multimodal]`." 
+ ) self.add_state("score", torch.tensor(0.0), dist_reduce_fx="sum") self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: if not isinstance(images, List): - images = [images] + if images.ndim == 3: + images = [images] + else: # unwrap into list + images = [i for i in images] + if not all(i.ndim == 3 for i in images): + raise ValueError("Expected all images to be 3d but found image that has either more or less") + if not isinstance(text, List): text = [text] @@ -56,10 +95,3 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] def compute(self) -> Tensor: return self.score / self.n_samples - - -if __name__ == "__main__": - img = torch.randint(255, (3, 224, 224)) - text = "min hest er meget flot" - metric = CLIPScore() - metric.update(img, text) From 7295fefc739a2a3b1d9b6130242f80d091d3a32f Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Sat, 5 Nov 2022 16:15:20 +0100 Subject: [PATCH 03/34] add some testing --- src/torchmetrics/multimodal/clip_score.py | 14 +++- tests/unittests/multimodal/__init__.py | 0 tests/unittests/multimodal/test_clip_score.py | 72 +++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 tests/unittests/multimodal/__init__.py create mode 100644 tests/unittests/multimodal/test_clip_score.py diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index 57f8896deec..fe08c86bb0c 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -76,6 +76,11 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] if not isinstance(text, List): text = [text] + if len(text) != len(images): + raise ValueError( + f"Expected the number of images and text examples to be the same but got {len(images)} and {len(text)}" + ) + img_features = [ self.model.get_image_features(self.features(i, return_tensors="pt")["pixel_values"]) for i in images ] @@ -90,8 +95,15 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] # cosine similarity between feature vectors score = (img_features * txt_features).sum(axis=-1) - self.score += score.sum(0) + self.score += 100 * score.sum(0) self.n_samples += img_features.shape[0] def compute(self) -> Tensor: return self.score / self.n_samples + + +if __name__ == "__main__": + img = torch.randint(255, (3, 224, 224)) + text = "min hest er meget flot" + metric = CLIPScore() + metric.update(img, text) diff --git a/tests/unittests/multimodal/__init__.py b/tests/unittests/multimodal/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py new file mode 100644 index 00000000000..ebf91718193 --- /dev/null +++ b/tests/unittests/multimodal/test_clip_score.py @@ -0,0 +1,72 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch +from transformers import CLIPModel as _CLIPModel +from transformers import CLIPProcessor as _CLIPProcessor + +from torchmetrics.multimodal.clip_score import CLIPScore +from unittests.helpers import seed_all +from unittests.helpers.testers import MetricTester + +seed_all(42) + + +def _compare_fn(image): + processor = _CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + model = _CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) + outputs = model(**inputs) + logits_per_image = outputs.logits_per_image + return logits_per_image.mean() + + +preds = [ + "28-year-old chef found dead in San Francisco mall", + "A 28-year-old chef who recently moved to San Francisco was " + "found dead in the staircase of a local shopping center.", + "The victim's brother said he cannot imagine anyone who would want to harm him,\"Finally, it went uphill again at " + 'him."', +] +target = [] + + +@pytest.mark.parametrize("input", [preds, target]) +class TestCLIPScore(MetricTester): + atol = 1e-5 + + @pytest.mark.parametrize("ddp", [True, False]) + def test_clip_score(self, input, ddp): + preds, target = input + self.run_class_metric_test( + ddp=ddp, + preds=preds, + target=target, + metric_class=CLIPScore, + sk_metric=_compare_fn, + ) + + +def test_error_on_not_same_amount_of_input(): + """Test that an error is raised if the number of images and text examples does not match.""" + metric = CLIPScore() + with pytest.raises(ValueError): + metric(torch.randint(255, (2, 3, 224, 224)), "28-year-old chef found dead in San Francisco mall") + + +def test_error_on_wrong_image_format(): + """Test that an error is raised if not all images are [c, h, w] format.""" + metric = CLIPScore() + with pytest.raises(ValueError): + metric(torch.randint(255, (1, 224, 224)), "28-year-old chef found dead in San Francisco mall") From f199d7e07d8ac64e0040d96c9c9372b7bde710ee Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Sat, 5 Nov 2022 19:44:29 +0100 Subject: [PATCH 04/34] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index df6488490b6..dd96d38fa1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `KendallRankCorrCoef` to regression package ([#1271](https://github.com/Lightning-AI/metrics/pull/1271)) +- Added `CLIPScore` to new multimodal package ([#1314](https://github.com/Lightning-AI/metrics/pull/1314)) + ### Changed - Changed `MeanAveragePrecision` to vectorize `_find_best_gt_match` operation ([#1259](https://github.com/Lightning-AI/metrics/pull/1259)) From 7814d2b70996161e37ee1d09193b0efd2e36937d Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Sat, 5 Nov 2022 19:50:28 +0100 Subject: [PATCH 05/34] docstring --- src/torchmetrics/multimodal/clip_score.py | 35 ++++++++++++++++++----- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index fe08c86bb0c..a3ca3ad570f 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -44,6 +44,20 @@ class CLIPScore(Metric): Args: version: string indicating the version of the CLIP model to use. 
See `Huggingface OpenAI`_ for more info kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info. + + Raises: + ModuleNotFoundError: + If `transformers` package is not installed + + Example: + >>> import torch + >>> _ = torch.manual_seed(42) + >>> from torchmetrics.multimodal import CLIPScore + >>> metric = CLIPScore() + >>> img = torch.randint(255, (3, 224, 224)) + >>> text = "this is a random sentence" + >>> metric(img, text) + tensor([0.595]) """ is_differentiable: bool = False @@ -65,6 +79,19 @@ def __init__(self, version="openai/clip-vit-large-patch14", **kwargs) -> None: self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: + """ Updates CLIP score with current batch of images and text + + Args: + images: either a single tensor with shape `(N, C, H, W)` or an list of tensors each + with shape `(C, H, W)` + text: either a single string or a list of strings + + Raises: + ValueError: + If not all images have shape `(C, H, W)` + ValueError: + If the number of images and number of text samples are different + """ if not isinstance(images, List): if images.ndim == 3: images = [images] @@ -99,11 +126,5 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] self.n_samples += img_features.shape[0] def compute(self) -> Tensor: + """ Calculates the accumulated CLIP score over all samples """ return self.score / self.n_samples - - -if __name__ == "__main__": - img = torch.randint(255, (3, 224, 224)) - text = "min hest er meget flot" - metric = CLIPScore() - metric.update(img, text) From 7ca7854e46b26ec6be0614f7c5d48ffe0c27fcbf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 5 Nov 2022 18:50:59 +0000 Subject: [PATCH 06/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/torchmetrics/multimodal/clip_score.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index a3ca3ad570f..6729bbda512 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -79,7 +79,7 @@ def __init__(self, version="openai/clip-vit-large-patch14", **kwargs) -> None: self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: - """ Updates CLIP score with current batch of images and text + """Updates CLIP score with current batch of images and text. 
Args: images: either a single tensor with shape `(N, C, H, W)` or an list of tensors each @@ -126,5 +126,5 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] self.n_samples += img_features.shape[0] def compute(self) -> Tensor: - """ Calculates the accumulated CLIP score over all samples """ + """Calculates the accumulated CLIP score over all samples.""" return self.score / self.n_samples From 7df3cbc607b51084fd82ba7584611af4f9127c72 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Tue, 8 Nov 2022 13:52:05 +0100 Subject: [PATCH 07/34] add to index --- docs/source/index.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 1977f03e303..2cfac3bcc4c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -158,6 +158,14 @@ Or directly from conda image/* +.. toctree:: + :maxdepth: 2 + :name: multimodal + :caption: Multimodal + :glob: + + multimodal/* + .. toctree:: :maxdepth: 2 :name: detection From b9db50085f5b143f9d966554f4ebb271b61f6c11 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Tue, 8 Nov 2022 13:53:17 +0100 Subject: [PATCH 08/34] add docstrings --- src/torchmetrics/multimodal/clip_score.py | 42 ++++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index fe08c86bb0c..213dd88960b 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Union +from typing import Any, List, Union import torch from torch import Tensor @@ -33,7 +33,7 @@ class CLIPScore(Metric): """`CLIP Score`_ is a reference free metric that can be used to evaluate the correlation between an generated caption for an image and the actual content of the image. It has been found to be highly correlated with human - judgement. The metric is defined as. + judgement. The metric is defined as: .. math:: \text{CLIPScore(I, C)} = \\max(cos(E_I, E_C), 0) @@ -42,15 +42,28 @@ class CLIPScore(Metric): textual CLIP embedding :math:`E_C` for an caption :math:`C`. Args: - version: string indicating the version of the CLIP model to use. See `Huggingface OpenAI`_ for more info + version: string indicating the version of the CLIP model to use. See `Huggingface OpenAI`_ for more info on + availble CLIP models kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info. 
+ + Raises: + ModuleNotFoundError: + If transformers package is not installed + + Example: + >>> import torch + >>> _ = torch.manual_seed(42) + >>> from torchmetrics.multimodal import CLIPScore + >>> metric = CLIPScore() + >>> metric(torch.randint(255, (3, 224, 224)), "a photo of a cat") + tensor(19.4135, grad_fn=) """ is_differentiable: bool = False higher_is_better: bool = True full_state_update: bool = False - def __init__(self, version="openai/clip-vit-large-patch14", **kwargs) -> None: + def __init__(self, version: str = "openai/clip-vit-large-patch14", **kwargs: Any) -> None: super().__init__(**kwargs) if _TRANSFORMERS_AVAILABLE: self.tokenizer = _CLIPTokenizer.from_pretrained(version) @@ -65,6 +78,18 @@ def __init__(self, version="openai/clip-vit-large-patch14", **kwargs) -> None: self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: + """Updates CLIP score on a batch of images and text. + + Args: + images: Either a single [N, C, H, W] tensor or an list of [C, H, W] tensors + text: Either a single caption or a list of captions + + Raises: + ValueError: + If not all images have format [C, H, W] + ValueError: + If the number of images and captions do not match + """ if not isinstance(images, List): if images.ndim == 3: images = [images] @@ -99,11 +124,4 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] self.n_samples += img_features.shape[0] def compute(self) -> Tensor: - return self.score / self.n_samples - - -if __name__ == "__main__": - img = torch.randint(255, (3, 224, 224)) - text = "min hest er meget flot" - metric = CLIPScore() - metric.update(img, text) + return torch.max(self.score / self.n_samples, torch.zeros_like(self.score)) From ff3e62ad9ff96238033339dee4cd32be7cdc881a Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Wed, 9 Nov 2022 13:39:16 +0100 Subject: [PATCH 09/34] update --- src/torchmetrics/multimodal/clip_score.py | 36 +++++++++++-------- tests/unittests/multimodal/test_clip_score.py | 34 ++++++++++++------ 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index 213dd88960b..b62353e52e1 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -22,6 +22,7 @@ from transformers import CLIPFeatureExtractor as _CLIPFeatureExtractor from transformers import CLIPModel as _CLIPModel from transformers import CLIPTokenizer as _CLIPTokenizer + from transformers import CLIPProcessor as _CLIPProcessor else: _CLIPFeatureExtractor = None _CLIPModel = None @@ -36,10 +37,13 @@ class CLIPScore(Metric): judgement. The metric is defined as: .. math:: - \text{CLIPScore(I, C)} = \\max(cos(E_I, E_C), 0) + \text{CLIPScore(I, C)} = \max(100 * cos(E_I, E_C), 0) which corresponds to the cosine similarity between visual CLIP embedding :math:`E_i` for an image :math:`i` and - textual CLIP embedding :math:`E_C` for an caption :math:`C`. + textual CLIP embedding :math:`E_C` for an caption :math:`C`. The score is bound between 0 and 100 and the closer + to 100 the better. + + .. note:: Metric is not scriptable Args: version: string indicating the version of the CLIP model to use. 
See `Huggingface OpenAI`_ for more info on @@ -66,9 +70,10 @@ class CLIPScore(Metric): def __init__(self, version: str = "openai/clip-vit-large-patch14", **kwargs: Any) -> None: super().__init__(**kwargs) if _TRANSFORMERS_AVAILABLE: - self.tokenizer = _CLIPTokenizer.from_pretrained(version) self.model = _CLIPModel.from_pretrained(version) - self.features = _CLIPFeatureExtractor.from_pretrained(version) + self.processor = _CLIPProcessor.from_pretrained(version) + #self.tokenizer = _CLIPTokenizer.from_pretrained(version) + #self.features = _CLIPFeatureExtractor.from_pretrained(version) else: raise ModuleNotFoundError( "`CLIPScore` metric requires `transformers` package be installed." @@ -95,6 +100,7 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] images = [images] else: # unwrap into list images = [i for i in images] + if not all(i.ndim == 3 for i in images): raise ValueError("Expected all images to be 3d but found image that has either more or less") @@ -106,20 +112,22 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] f"Expected the number of images and text examples to be the same but got {len(images)} and {len(text)}" ) - img_features = [ - self.model.get_image_features(self.features(i, return_tensors="pt")["pixel_values"]) for i in images - ] - img_features = torch.cat(img_features, 0) - img_features = img_features / torch.linalg.norm(img_features, axis=-1, keepdims=True) + processed_input = self.processor(text=text, images=[i.cpu() for i in images], return_tensors="pt", padding=True) + output = self.model(**processed_input) + + img_features = self.model.get_image_features(processed_input['pixel_values'].to(self.device)) + img_features = img_features / img_features.norm(p=2, dim=-1, keepdim=True) - txt_features = [ - self.model.get_text_features(**self.tokenizer(t, padding=True, return_tensors="pt")) for t in text - ] - txt_features = torch.cat(txt_features, 0) - txt_features = txt_features / torch.linalg.norm(txt_features, axis=-1, keepdims=True) + txt_features = self.model.get_text_features( + processed_input['input_ids'].to(self.device), processed_input['attention_mask'].to(self.device) + ) + txt_features = txt_features / txt_features.norm(p=2, dim=-1, keepdim=True) # cosine similarity between feature vectors + import pdb + pdb.set_trace() score = (img_features * txt_features).sum(axis=-1) + print(score) self.score += 100 * score.sum(0) self.n_samples += img_features.shape[0] diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py index ebf91718193..5c22cab684c 100644 --- a/tests/unittests/multimodal/test_clip_score.py +++ b/tests/unittests/multimodal/test_clip_score.py @@ -22,32 +22,43 @@ seed_all(42) +from collections import namedtuple -def _compare_fn(image): - processor = _CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - model = _CLIPModel.from_pretrained("openai/clip-vit-base-patch32") - inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) - outputs = model(**inputs) - logits_per_image = outputs.logits_per_image - return logits_per_image.mean() + +Input = namedtuple("Input", ["images", "captions"]) -preds = [ +captions = [ "28-year-old chef found dead in San Francisco mall", "A 28-year-old chef who recently moved to San Francisco was " "found dead in the staircase of a local shopping center.", "The victim's brother said he cannot imagine anyone who would want to harm him,\"Finally, it 
went uphill again at " 'him."', ] -target = [] + +_random_input = Input( + images=torch.randint(255, (2, 2, 3, 224, 224)), + captions=[captions[0:2], captions[2:]] +) + + +def _compare_fn(preds, target): + processor = _CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + model = _CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + inputs = processor(text=target, images=[p.cpu() for p in preds], return_tensors="pt", padding=True) + outputs = model(**inputs) + logits_per_image = outputs.logits_per_image + print(logits_per_image) + return logits_per_image.diag().mean().detach() -@pytest.mark.parametrize("input", [preds, target]) +@pytest.mark.parametrize("input", [_random_input,]) class TestCLIPScore(MetricTester): atol = 1e-5 @pytest.mark.parametrize("ddp", [True, False]) def test_clip_score(self, input, ddp): + # images are preds and targets are captions preds, target = input self.run_class_metric_test( ddp=ddp, @@ -55,6 +66,7 @@ def test_clip_score(self, input, ddp): target=target, metric_class=CLIPScore, sk_metric=_compare_fn, + check_scriptable=False, ) @@ -69,4 +81,4 @@ def test_error_on_wrong_image_format(): """Test that an error is raised if not all images are [c, h, w] format.""" metric = CLIPScore() with pytest.raises(ValueError): - metric(torch.randint(255, (1, 224, 224)), "28-year-old chef found dead in San Francisco mall") + metric(torch.randint(255, (224, 224)), "28-year-old chef found dead in San Francisco mall") From 7e1c8d115d6617c54d07f019c440c457724cd2e2 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Thu, 10 Nov 2022 15:53:03 +0100 Subject: [PATCH 10/34] fix tests --- src/torchmetrics/multimodal/clip_score.py | 18 ++---- tests/unittests/helpers/testers.py | 4 +- tests/unittests/multimodal/test_clip_score.py | 56 ++++++++++--------- 3 files changed, 38 insertions(+), 40 deletions(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index b62353e52e1..05aa3f93cd5 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -19,14 +19,11 @@ from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE if _TRANSFORMERS_AVAILABLE: - from transformers import CLIPFeatureExtractor as _CLIPFeatureExtractor from transformers import CLIPModel as _CLIPModel - from transformers import CLIPTokenizer as _CLIPTokenizer from transformers import CLIPProcessor as _CLIPProcessor else: - _CLIPFeatureExtractor = None _CLIPModel = None - _CLIPTokenizer = None + _CLIPProcessor = None from torchmetrics import Metric @@ -37,7 +34,7 @@ class CLIPScore(Metric): judgement. The metric is defined as: .. math:: - \text{CLIPScore(I, C)} = \max(100 * cos(E_I, E_C), 0) + \text{CLIPScore(I, C)} = max(100 * cos(E_I, E_C), 0) which corresponds to the cosine similarity between visual CLIP embedding :math:`E_i` for an image :math:`i` and textual CLIP embedding :math:`E_C` for an caption :math:`C`. The score is bound between 0 and 100 and the closer @@ -72,8 +69,6 @@ def __init__(self, version: str = "openai/clip-vit-large-patch14", **kwargs: Any if _TRANSFORMERS_AVAILABLE: self.model = _CLIPModel.from_pretrained(version) self.processor = _CLIPProcessor.from_pretrained(version) - #self.tokenizer = _CLIPTokenizer.from_pretrained(version) - #self.features = _CLIPFeatureExtractor.from_pretrained(version) else: raise ModuleNotFoundError( "`CLIPScore` metric requires `transformers` package be installed." 
@@ -113,23 +108,20 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] ) processed_input = self.processor(text=text, images=[i.cpu() for i in images], return_tensors="pt", padding=True) - output = self.model(**processed_input) - img_features = self.model.get_image_features(processed_input['pixel_values'].to(self.device)) + img_features = self.model.get_image_features(processed_input["pixel_values"].to(self.device)) img_features = img_features / img_features.norm(p=2, dim=-1, keepdim=True) txt_features = self.model.get_text_features( - processed_input['input_ids'].to(self.device), processed_input['attention_mask'].to(self.device) + processed_input["input_ids"].to(self.device), processed_input["attention_mask"].to(self.device) ) txt_features = txt_features / txt_features.norm(p=2, dim=-1, keepdim=True) # cosine similarity between feature vectors - import pdb - pdb.set_trace() score = (img_features * txt_features).sum(axis=-1) - print(score) self.score += 100 * score.sum(0) self.n_samples += img_features.shape[0] def compute(self) -> Tensor: + """Computes accumulated clip score.""" return torch.max(self.score / self.n_samples, torch.zeros_like(self.score)) diff --git a/tests/unittests/helpers/testers.py b/tests/unittests/helpers/testers.py index 167a78f868a..abe7b0bda6c 100644 --- a/tests/unittests/helpers/testers.py +++ b/tests/unittests/helpers/testers.py @@ -237,9 +237,11 @@ def _class_test( if isinstance(preds, Tensor): total_preds = torch.cat([preds[i] for i in range(num_batches)]).cpu() - total_target = torch.cat([target[i] for i in range(num_batches)]).cpu() else: total_preds = [item for sublist in preds for item in sublist] + if isinstance(target, Tensor): + total_target = torch.cat([target[i] for i in range(num_batches)]).cpu() + else: total_target = [item for sublist in target for item in sublist] total_kwargs_update = { diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py index 5c22cab684c..7327a8f70ac 100644 --- a/tests/unittests/multimodal/test_clip_score.py +++ b/tests/unittests/multimodal/test_clip_score.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from collections import namedtuple +from functools import partial + import pytest import torch from transformers import CLIPModel as _CLIPModel @@ -22,8 +25,6 @@ seed_all(42) -from collections import namedtuple - Input = namedtuple("Input", ["images", "captions"]) @@ -34,30 +35,33 @@ "found dead in the staircase of a local shopping center.", "The victim's brother said he cannot imagine anyone who would want to harm him,\"Finally, it went uphill again at " 'him."', + "A lawyer says him .\nMoschetto, 54 and prosecutors say .\nAuthority abc Moschetto .", ] -_random_input = Input( - images=torch.randint(255, (2, 2, 3, 224, 224)), - captions=[captions[0:2], captions[2:]] -) +_random_input = Input(images=torch.randint(255, (2, 2, 3, 224, 224)), captions=[captions[0:2], captions[2:]]) -def _compare_fn(preds, target): - processor = _CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - model = _CLIPModel.from_pretrained("openai/clip-vit-base-patch32") +def _compare_fn(preds, target, version): + processor = _CLIPProcessor.from_pretrained(version) + model = _CLIPModel.from_pretrained(version) inputs = processor(text=target, images=[p.cpu() for p in preds], return_tensors="pt", padding=True) outputs = model(**inputs) logits_per_image = outputs.logits_per_image - print(logits_per_image) return logits_per_image.diag().mean().detach() -@pytest.mark.parametrize("input", [_random_input,]) +@pytest.mark.parametrize("version", ["openai/clip-vit-base-patch32"]) +@pytest.mark.parametrize( + "input", + [ + _random_input, + ], +) class TestCLIPScore(MetricTester): atol = 1e-5 @pytest.mark.parametrize("ddp", [True, False]) - def test_clip_score(self, input, ddp): + def test_clip_score(self, input, version, ddp): # images are preds and targets are captions preds, target = input self.run_class_metric_test( @@ -65,20 +69,20 @@ def test_clip_score(self, input, ddp): preds=preds, target=target, metric_class=CLIPScore, - sk_metric=_compare_fn, + sk_metric=partial(_compare_fn, version=version), + metric_args={"version": version}, check_scriptable=False, + check_state_dict=False, ) - -def test_error_on_not_same_amount_of_input(): - """Test that an error is raised if the number of images and text examples does not match.""" - metric = CLIPScore() - with pytest.raises(ValueError): - metric(torch.randint(255, (2, 3, 224, 224)), "28-year-old chef found dead in San Francisco mall") - - -def test_error_on_wrong_image_format(): - """Test that an error is raised if not all images are [c, h, w] format.""" - metric = CLIPScore() - with pytest.raises(ValueError): - metric(torch.randint(255, (224, 224)), "28-year-old chef found dead in San Francisco mall") + def test_error_on_not_same_amount_of_input(self, input, version): + """Test that an error is raised if the number of images and text examples does not match.""" + metric = CLIPScore(version=version) + with pytest.raises(ValueError): + metric(torch.randint(255, (2, 3, 224, 224)), "28-year-old chef found dead in San Francisco mall") + + def test_error_on_wrong_image_format(self, input, version): + """Test that an error is raised if not all images are [c, h, w] format.""" + metric = CLIPScore(version=version) + with pytest.raises(ValueError): + metric(torch.randint(255, (224, 224)), "28-year-old chef found dead in San Francisco mall") From b1c8b27ce7f8a9fc87ab053ad97dce8317679b10 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Fri, 11 Nov 2022 08:51:12 +0100 Subject: [PATCH 11/34] add requirement --- requirements/devel.txt | 1 + 1 file changed, 1 insertion(+) diff 
--git a/requirements/devel.txt b/requirements/devel.txt index 05aa5e61b9a..cc1b8722aad 100644 --- a/requirements/devel.txt +++ b/requirements/devel.txt @@ -9,6 +9,7 @@ -r text.txt # -r detection.txt # version collision with min versio of PyTorch -r audio.txt +-r multimodal.txt # add extra testing -r image_test.txt From c354fe01f60466f94d7530703dac1b74f43de7a1 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Fri, 11 Nov 2022 09:28:44 +0100 Subject: [PATCH 12/34] try fixing mypy and docs --- src/torchmetrics/multimodal/clip_score.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index 05aa3f93cd5..10e8a1e9e70 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, List, Union +from typing import Any, List, Literal, Union import torch from torch import Tensor @@ -21,9 +21,6 @@ if _TRANSFORMERS_AVAILABLE: from transformers import CLIPModel as _CLIPModel from transformers import CLIPProcessor as _CLIPProcessor -else: - _CLIPModel = None - _CLIPProcessor = None from torchmetrics import Metric @@ -43,8 +40,10 @@ class CLIPScore(Metric): .. note:: Metric is not scriptable Args: - version: string indicating the version of the CLIP model to use. See `Huggingface OpenAI`_ for more info on - availble CLIP models + version: string indicating the version of the CLIP model to use. Available models are + `"openai/clip-vit-base-patch16"`, `"openai/clip-vit-base-patch32"`, `"openai/clip-vit-large-patch14-336"` + and `"openai/clip-vit-large-patch14"`, + kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info. Raises: @@ -64,7 +63,16 @@ class CLIPScore(Metric): higher_is_better: bool = True full_state_update: bool = False - def __init__(self, version: str = "openai/clip-vit-large-patch14", **kwargs: Any) -> None: + def __init__( + self, + version: Literal[ + "openai/clip-vit-base-patch16", + "openai/clip-vit-base-patch32", + "openai/clip-vit-large-patch14-336", + "openai/clip-vit-large-patch14", + ] = "openai/clip-vit-large-patch14", + **kwargs: Any, + ) -> None: super().__init__(**kwargs) if _TRANSFORMERS_AVAILABLE: self.model = _CLIPModel.from_pretrained(version) From 711e34384c7cceeef05349756603506cafe8b660 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Fri, 11 Nov 2022 09:34:44 +0100 Subject: [PATCH 13/34] fix --- src/torchmetrics/multimodal/clip_score.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index 10e8a1e9e70..3a9d5bdceee 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, Literal, Union +from typing import Any, List, Union import torch from torch import Tensor +from typing_extensions import Literal from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE From 95fbff7be0d9ff182672056b6d129f438b13bc91 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Fri, 11 Nov 2022 09:41:25 +0100 Subject: [PATCH 14/34] skip on no transformer --- src/torchmetrics/multimodal/clip_score.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index 3a9d5bdceee..d911c355f1c 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -22,6 +22,8 @@ if _TRANSFORMERS_AVAILABLE: from transformers import CLIPModel as _CLIPModel from transformers import CLIPProcessor as _CLIPProcessor +else: + __doctest_skip__ = ["CLIPScore"] from torchmetrics import Metric From 2111d091483bfab00582d9d65a060eb083d9df54 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Fri, 11 Nov 2022 09:50:59 +0100 Subject: [PATCH 15/34] fix typing --- src/torchmetrics/multimodal/clip_score.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index d911c355f1c..affd3784efe 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -65,6 +65,8 @@ class CLIPScore(Metric): is_differentiable: bool = False higher_is_better: bool = True full_state_update: bool = False + score: Tensor + n_samples: Tensor def __init__( self, From 37026104ba20fcce2a86e9807169b8f1415c148a Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Sun, 13 Nov 2022 16:31:33 +0100 Subject: [PATCH 16/34] Apply suggestions from code review Co-authored-by: Daniel Stancl <46073029+stancld@users.noreply.github.com> --- src/torchmetrics/multimodal/clip_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index affd3784efe..e138a6ee9ee 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -29,7 +29,7 @@ class CLIPScore(Metric): - """`CLIP Score`_ is a reference free metric that can be used to evaluate the correlation between an generated + """`CLIP Score`_ is a reference free metric that can be used to evaluate the correlation between a generated caption for an image and the actual content of the image. It has been found to be highly correlated with human judgement. The metric is defined as: @@ -94,7 +94,7 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] """Updates CLIP score on a batch of images and text. 
Args: - images: Either a single [N, C, H, W] tensor or an list of [C, H, W] tensors + images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors text: Either a single caption or a list of captions Raises: @@ -103,7 +103,7 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] ValueError: If the number of images and captions do not match """ - if not isinstance(images, List): + if not isinstance(images, list): if images.ndim == 3: images = [images] else: # unwrap into list @@ -112,7 +112,7 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] if not all(i.ndim == 3 for i in images): raise ValueError("Expected all images to be 3d but found image that has either more or less") - if not isinstance(text, List): + if not isinstance(text, list): text = [text] if len(text) != len(images): From 1df18bec1c20abab1a02329b4e5f20c038acb580 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Mon, 14 Nov 2022 08:51:18 +0100 Subject: [PATCH 17/34] add functional and refactor --- docs/source/multimodal/clip_score.rst | 6 + .../functional/multimodal/__init__.py | 17 +++ .../functional/multimodal/clip_score.py | 130 ++++++++++++++++++ src/torchmetrics/multimodal/clip_score.py | 48 +------ tests/unittests/multimodal/test_clip_score.py | 29 +++- 5 files changed, 185 insertions(+), 45 deletions(-) create mode 100644 src/torchmetrics/functional/multimodal/__init__.py create mode 100644 src/torchmetrics/functional/multimodal/clip_score.py diff --git a/docs/source/multimodal/clip_score.rst b/docs/source/multimodal/clip_score.rst index 8653308c9fb..3f45ed441b6 100644 --- a/docs/source/multimodal/clip_score.rst +++ b/docs/source/multimodal/clip_score.rst @@ -12,3 +12,9 @@ ________________ .. autoclass:: torchmetrics.multimodal.clip_score.CLIPScore :noindex: + +Functional Interface +____________________ + +.. autofunction:: torchmetrics.functional.multimodal.clip_score.clip_score + :noindex: diff --git a/src/torchmetrics/functional/multimodal/__init__.py b/src/torchmetrics/functional/multimodal/__init__.py new file mode 100644 index 00000000000..42b82326afe --- /dev/null +++ b/src/torchmetrics/functional/multimodal/__init__.py @@ -0,0 +1,17 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE + +if _TRANSFORMERS_AVAILABLE: + from torchmetrics.functional.multimodal.clip_score import clip_score # noqa: F401 diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py new file mode 100644 index 00000000000..e8d5da651eb --- /dev/null +++ b/src/torchmetrics/functional/multimodal/clip_score.py @@ -0,0 +1,130 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Tuple, Union + +import torch +from torch import Tensor +from typing_extensions import Literal + +from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE + +if _TRANSFORMERS_AVAILABLE: + from transformers import CLIPModel as _CLIPModel + from transformers import CLIPProcessor as _CLIPProcessor +else: + __doctest_skip__ = ["clip_score"] + + +def _clip_score_update( + images: Union[Tensor, List[Tensor]], text: Union[str, List[str]], processor, model +) -> Tuple[Tensor, int]: + if not isinstance(images, list): + if images.ndim == 3: + images = [images] + else: # unwrap into list + images = [i for i in images] + + if not all(i.ndim == 3 for i in images): + raise ValueError("Expected all images to be 3d but found image that has either more or less") + + if not isinstance(text, list): + text = [text] + + if len(text) != len(images): + raise ValueError( + f"Expected the number of images and text examples to be the same but got {len(images)} and {len(text)}" + ) + device = images[0].device + processed_input = processor(text=text, images=[i.cpu() for i in images], return_tensors="pt", padding=True) + + img_features = model.get_image_features(processed_input["pixel_values"].to(device)) + img_features = img_features / img_features.norm(p=2, dim=-1, keepdim=True) + + txt_features = model.get_text_features( + processed_input["input_ids"].to(device), processed_input["attention_mask"].to(device) + ) + txt_features = txt_features / txt_features.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity between feature vectors + score = 100 * (img_features * txt_features).sum(axis=-1) + return score, len(text) + + +def _get_model_and_processor( + version: Literal[ + "openai/clip-vit-base-patch16", + "openai/clip-vit-base-patch32", + "openai/clip-vit-large-patch14-336", + "openai/clip-vit-large-patch14", + ] = "openai/clip-vit-large-patch14", +): + if _TRANSFORMERS_AVAILABLE: + model = _CLIPModel.from_pretrained(version) + processor = _CLIPProcessor.from_pretrained(version) + return model, processor + else: + raise ModuleNotFoundError( + "`clip_score` metric requires `transformers` package be installed." + " Either install with `pip install transformers>=4.0` or `pip install torchmetrics[multimodal]`." + ) + + +def clip_score( + images: Union[Tensor, List[Tensor]], + text: Union[str, List[str]], + version: Literal[ + "openai/clip-vit-base-patch16", + "openai/clip-vit-base-patch32", + "openai/clip-vit-large-patch14-336", + "openai/clip-vit-large-patch14", + ] = "openai/clip-vit-large-patch14", +) -> Tensor: + """`CLIP Score`_ is a reference free metric that can be used to evaluate the correlation between a generated + caption for an image and the actual content of the image. It has been found to be highly correlated with human + judgement. The metric is defined as: + + .. math:: + \text{CLIPScore(I, C)} = max(100 * cos(E_I, E_C), 0) + + which corresponds to the cosine similarity between visual CLIP embedding :math:`E_i` for an image :math:`i` and + textual CLIP embedding :math:`E_C` for an caption :math:`C`. 
The score is bound between 0 and 100 and the closer + to 100 the better. + + .. note:: Metric is not scriptable + + Args: + images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors + text: Either a single caption or a list of captions + version: string indicating the version of the CLIP model to use. Available models are + `"openai/clip-vit-base-patch16"`, `"openai/clip-vit-base-patch32"`, `"openai/clip-vit-large-patch14-336"` + and `"openai/clip-vit-large-patch14"`, + + Raises: + ModuleNotFoundError: + If transformers package is not installed + ValueError: + If not all images have format [C, H, W] + ValueError: + If the number of images and captions do not match + + Example: + >>> import torch + >>> _ = torch.manual_seed(42) + >>> from torchmetrics.functional.multimodal import clip_score + >>> clip_score(torch.randint(255, (3, 224, 224)), "a photo of a cat") + tensor(19.4135, grad_fn=) + """ + model, processor = _get_model_and_processor(version) + score, _ = _clip_score_update(images, text, model, processor) + return torch.max(score.mean(0), torch.zeros_like(score)) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index e138a6ee9ee..47b7bef3600 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -17,12 +17,10 @@ from torch import Tensor from typing_extensions import Literal +from torchmetrics.functional.multimodal.clip_score import _clip_score_update, _get_model_and_processor from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE -if _TRANSFORMERS_AVAILABLE: - from transformers import CLIPModel as _CLIPModel - from transformers import CLIPProcessor as _CLIPProcessor -else: +if not _TRANSFORMERS_AVAILABLE: __doctest_skip__ = ["CLIPScore"] from torchmetrics import Metric @@ -78,15 +76,9 @@ def __init__( ] = "openai/clip-vit-large-patch14", **kwargs: Any, ) -> None: + super().__init__(**kwargs) - if _TRANSFORMERS_AVAILABLE: - self.model = _CLIPModel.from_pretrained(version) - self.processor = _CLIPProcessor.from_pretrained(version) - else: - raise ModuleNotFoundError( - "`CLIPScore` metric requires `transformers` package be installed." - " Either install with `pip install transformers>=4.0` or `pip install torchmetrics[multimodal]`." 
- ) + self.model, self.processor = _get_model_and_processor(version) self.add_state("score", torch.tensor(0.0), dist_reduce_fx="sum") self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") @@ -103,37 +95,9 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] ValueError: If the number of images and captions do not match """ - if not isinstance(images, list): - if images.ndim == 3: - images = [images] - else: # unwrap into list - images = [i for i in images] - - if not all(i.ndim == 3 for i in images): - raise ValueError("Expected all images to be 3d but found image that has either more or less") - - if not isinstance(text, list): - text = [text] - - if len(text) != len(images): - raise ValueError( - f"Expected the number of images and text examples to be the same but got {len(images)} and {len(text)}" - ) - - processed_input = self.processor(text=text, images=[i.cpu() for i in images], return_tensors="pt", padding=True) - - img_features = self.model.get_image_features(processed_input["pixel_values"].to(self.device)) - img_features = img_features / img_features.norm(p=2, dim=-1, keepdim=True) - - txt_features = self.model.get_text_features( - processed_input["input_ids"].to(self.device), processed_input["attention_mask"].to(self.device) - ) - txt_features = txt_features / txt_features.norm(p=2, dim=-1, keepdim=True) - - # cosine similarity between feature vectors - score = (img_features * txt_features).sum(axis=-1) + score, n_samples = _clip_score_update(images, text, self.model, self.processor) self.score += 100 * score.sum(0) - self.n_samples += img_features.shape[0] + self.n_samples += n_samples def compute(self) -> Tensor: """Computes accumulated clip score.""" diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py index 7327a8f70ac..474931d075e 100644 --- a/tests/unittests/multimodal/test_clip_score.py +++ b/tests/unittests/multimodal/test_clip_score.py @@ -19,9 +19,11 @@ from transformers import CLIPModel as _CLIPModel from transformers import CLIPProcessor as _CLIPProcessor +from torchmetrics.functional.multimodal.clip_score import clip_score from torchmetrics.multimodal.clip_score import CLIPScore from unittests.helpers import seed_all from unittests.helpers.testers import MetricTester +from unittests.text.helpers import skip_on_connection_issues seed_all(42) @@ -50,6 +52,7 @@ def _compare_fn(preds, target, version): return logits_per_image.diag().mean().detach() +@skip_on_connection_issues() @pytest.mark.parametrize("version", ["openai/clip-vit-base-patch32"]) @pytest.mark.parametrize( "input", @@ -58,10 +61,9 @@ def _compare_fn(preds, target, version): ], ) class TestCLIPScore(MetricTester): - atol = 1e-5 - @pytest.mark.parametrize("ddp", [True, False]) - def test_clip_score(self, input, version, ddp): + @pytest.mark.parametrize("dist_sync_on_step", [True, False]) + def test_clip_score(self, input, version, ddp, dist_sync_on_step): # images are preds and targets are captions preds, target = input self.run_class_metric_test( @@ -70,11 +72,32 @@ def test_clip_score(self, input, version, ddp): target=target, metric_class=CLIPScore, sk_metric=partial(_compare_fn, version=version), + dist_sync_on_step=dist_sync_on_step, metric_args={"version": version}, check_scriptable=False, check_state_dict=False, ) + def test_clip_score_functional(self, input, version): + preds, target = input + self.run_functional_metric_test( + preds=preds, + target=target, + 
metric_functional=clip_score, + sk_metric=partial(_compare_fn, version=version), + metric_args={"version": version}, + ) + + def test_mean_error_differentiability(self, input, version): + preds, target = input + self.run_differentiability_test( + preds=preds, + target=target, + metric_module=CLIPScore, + metric_functional=clip_score, + metric_args={"version": version}, + ) + def test_error_on_not_same_amount_of_input(self, input, version): """Test that an error is raised if the number of images and text examples does not match.""" metric = CLIPScore(version=version) From 3205b91f8da92177cbea7f1a8f68568125a5a5e5 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Mon, 14 Nov 2022 08:55:03 +0100 Subject: [PATCH 18/34] change variable name --- .../functional/multimodal/clip_score.py | 12 +++---- src/torchmetrics/multimodal/clip_score.py | 6 ++-- tests/unittests/multimodal/test_clip_score.py | 32 +++++++++---------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py index e8d5da651eb..79b7bbd633c 100644 --- a/src/torchmetrics/functional/multimodal/clip_score.py +++ b/src/torchmetrics/functional/multimodal/clip_score.py @@ -62,7 +62,7 @@ def _clip_score_update( def _get_model_and_processor( - version: Literal[ + model_name_or_path: Literal[ "openai/clip-vit-base-patch16", "openai/clip-vit-base-patch32", "openai/clip-vit-large-patch14-336", @@ -70,8 +70,8 @@ def _get_model_and_processor( ] = "openai/clip-vit-large-patch14", ): if _TRANSFORMERS_AVAILABLE: - model = _CLIPModel.from_pretrained(version) - processor = _CLIPProcessor.from_pretrained(version) + model = _CLIPModel.from_pretrained(model_name_or_path) + processor = _CLIPProcessor.from_pretrained(model_name_or_path) return model, processor else: raise ModuleNotFoundError( @@ -83,7 +83,7 @@ def _get_model_and_processor( def clip_score( images: Union[Tensor, List[Tensor]], text: Union[str, List[str]], - version: Literal[ + model_name_or_path: Literal[ "openai/clip-vit-base-patch16", "openai/clip-vit-base-patch32", "openai/clip-vit-large-patch14-336", @@ -106,7 +106,7 @@ def clip_score( Args: images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors text: Either a single caption or a list of captions - version: string indicating the version of the CLIP model to use. Available models are + model_name_or_path: string indicating the version of the CLIP model to use. Available models are `"openai/clip-vit-base-patch16"`, `"openai/clip-vit-base-patch32"`, `"openai/clip-vit-large-patch14-336"` and `"openai/clip-vit-large-patch14"`, @@ -125,6 +125,6 @@ def clip_score( >>> clip_score(torch.randint(255, (3, 224, 224)), "a photo of a cat") tensor(19.4135, grad_fn=) """ - model, processor = _get_model_and_processor(version) + model, processor = _get_model_and_processor(model_name_or_path) score, _ = _clip_score_update(images, text, model, processor) return torch.max(score.mean(0), torch.zeros_like(score)) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index 47b7bef3600..d0ea903ee7a 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -41,7 +41,7 @@ class CLIPScore(Metric): .. note:: Metric is not scriptable Args: - version: string indicating the version of the CLIP model to use. Available models are + model_name_or_path: string indicating the version of the CLIP model to use. 
Available models are `"openai/clip-vit-base-patch16"`, `"openai/clip-vit-base-patch32"`, `"openai/clip-vit-large-patch14-336"` and `"openai/clip-vit-large-patch14"`, @@ -68,7 +68,7 @@ class CLIPScore(Metric): def __init__( self, - version: Literal[ + model_name_or_path: Literal[ "openai/clip-vit-base-patch16", "openai/clip-vit-base-patch32", "openai/clip-vit-large-patch14-336", @@ -78,7 +78,7 @@ def __init__( ) -> None: super().__init__(**kwargs) - self.model, self.processor = _get_model_and_processor(version) + self.model, self.processor = _get_model_and_processor(model_name_or_path) self.add_state("score", torch.tensor(0.0), dist_reduce_fx="sum") self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py index 474931d075e..9dcf1507a4c 100644 --- a/tests/unittests/multimodal/test_clip_score.py +++ b/tests/unittests/multimodal/test_clip_score.py @@ -43,9 +43,9 @@ _random_input = Input(images=torch.randint(255, (2, 2, 3, 224, 224)), captions=[captions[0:2], captions[2:]]) -def _compare_fn(preds, target, version): - processor = _CLIPProcessor.from_pretrained(version) - model = _CLIPModel.from_pretrained(version) +def _compare_fn(preds, target, model_name_or_path): + processor = _CLIPProcessor.from_pretrained(model_name_or_path) + model = _CLIPModel.from_pretrained(model_name_or_path) inputs = processor(text=target, images=[p.cpu() for p in preds], return_tensors="pt", padding=True) outputs = model(**inputs) logits_per_image = outputs.logits_per_image @@ -53,7 +53,7 @@ def _compare_fn(preds, target, version): @skip_on_connection_issues() -@pytest.mark.parametrize("version", ["openai/clip-vit-base-patch32"]) +@pytest.mark.parametrize("model_name_or_path", ["openai/clip-vit-base-patch32"]) @pytest.mark.parametrize( "input", [ @@ -63,7 +63,7 @@ def _compare_fn(preds, target, version): class TestCLIPScore(MetricTester): @pytest.mark.parametrize("ddp", [True, False]) @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_clip_score(self, input, version, ddp, dist_sync_on_step): + def test_clip_score(self, input, model_name_or_path, ddp, dist_sync_on_step): # images are preds and targets are captions preds, target = input self.run_class_metric_test( @@ -71,41 +71,41 @@ def test_clip_score(self, input, version, ddp, dist_sync_on_step): preds=preds, target=target, metric_class=CLIPScore, - sk_metric=partial(_compare_fn, version=version), + sk_metric=partial(_compare_fn, model_name_or_path=model_name_or_path), dist_sync_on_step=dist_sync_on_step, - metric_args={"version": version}, + metric_args={"model_name_or_path": model_name_or_path}, check_scriptable=False, check_state_dict=False, ) - def test_clip_score_functional(self, input, version): + def test_clip_score_functional(self, input, model_name_or_path): preds, target = input self.run_functional_metric_test( preds=preds, target=target, metric_functional=clip_score, - sk_metric=partial(_compare_fn, version=version), - metric_args={"version": version}, + sk_metric=partial(_compare_fn, model_name_or_path=model_name_or_path), + metric_args={"model_name_or_path": model_name_or_path}, ) - def test_mean_error_differentiability(self, input, version): + def test_mean_error_differentiability(self, input, model_name_or_path): preds, target = input self.run_differentiability_test( preds=preds, target=target, metric_module=CLIPScore, metric_functional=clip_score, - metric_args={"version": version}, + 
metric_args={"model_name_or_path": model_name_or_path}, ) - def test_error_on_not_same_amount_of_input(self, input, version): + def test_error_on_not_same_amount_of_input(self, input, model_name_or_path): """Test that an error is raised if the number of images and text examples does not match.""" - metric = CLIPScore(version=version) + metric = CLIPScore(model_name_or_path=model_name_or_path) with pytest.raises(ValueError): metric(torch.randint(255, (2, 3, 224, 224)), "28-year-old chef found dead in San Francisco mall") - def test_error_on_wrong_image_format(self, input, version): + def test_error_on_wrong_image_format(self, input, model_name_or_path): """Test that an error is raised if not all images are [c, h, w] format.""" - metric = CLIPScore(version=version) + metric = CLIPScore(model_name_or_path=model_name_or_path) with pytest.raises(ValueError): metric(torch.randint(255, (224, 224)), "28-year-old chef found dead in San Francisco mall") From 61cedeb2744ff8f566bf63ae49196f98836ea873 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Mon, 14 Nov 2022 12:29:07 +0100 Subject: [PATCH 19/34] fix testing --- .../functional/multimodal/clip_score.py | 15 +++++--- src/torchmetrics/multimodal/clip_score.py | 6 ++-- tests/unittests/helpers/testers.py | 35 ++++++++++++------- tests/unittests/multimodal/test_clip_score.py | 17 ++++----- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py index 79b7bbd633c..46f122c822f 100644 --- a/src/torchmetrics/functional/multimodal/clip_score.py +++ b/src/torchmetrics/functional/multimodal/clip_score.py @@ -22,12 +22,17 @@ if _TRANSFORMERS_AVAILABLE: from transformers import CLIPModel as _CLIPModel from transformers import CLIPProcessor as _CLIPProcessor + from transformers import ProcessorMixin as _ProcessorMixin + from transformers.modeling_utils import PreTrainedModel as _PreTrainedModel else: __doctest_skip__ = ["clip_score"] def _clip_score_update( - images: Union[Tensor, List[Tensor]], text: Union[str, List[str]], processor, model + images: Union[Tensor, List[Tensor]], + text: Union[str, List[str]], + model: _PreTrainedModel, + processor: _ProcessorMixin, ) -> Tuple[Tensor, int]: if not isinstance(images, list): if images.ndim == 3: @@ -68,7 +73,7 @@ def _get_model_and_processor( "openai/clip-vit-large-patch14-336", "openai/clip-vit-large-patch14", ] = "openai/clip-vit-large-patch14", -): +) -> Tuple[_PreTrainedModel, _ProcessorMixin]: if _TRANSFORMERS_AVAILABLE: model = _CLIPModel.from_pretrained(model_name_or_path) processor = _CLIPProcessor.from_pretrained(model_name_or_path) @@ -126,5 +131,7 @@ def clip_score( tensor(19.4135, grad_fn=) """ model, processor = _get_model_and_processor(model_name_or_path) - score, _ = _clip_score_update(images, text, model, processor) - return torch.max(score.mean(0), torch.zeros_like(score)) + device = images.device if isinstance(images, Tensor) else images[0].device + score, _ = _clip_score_update(images, text, model.to(device), processor) + score = score.mean(0) + return torch.max(score, torch.zeros_like(score)) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index d0ea903ee7a..223c240b4b2 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -62,7 +62,7 @@ class CLIPScore(Metric): is_differentiable: bool = False higher_is_better: bool = True - full_state_update: bool = False + full_state_update: 
bool = True score: Tensor n_samples: Tensor @@ -82,7 +82,7 @@ def __init__( self.add_state("score", torch.tensor(0.0), dist_reduce_fx="sum") self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") - def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: + def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: # type: ignore """Updates CLIP score on a batch of images and text. Args: @@ -96,7 +96,7 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str] If the number of images and captions do not match """ score, n_samples = _clip_score_update(images, text, self.model, self.processor) - self.score += 100 * score.sum(0) + self.score += score.sum(0) self.n_samples += n_samples def compute(self) -> Tensor: diff --git a/tests/unittests/helpers/testers.py b/tests/unittests/helpers/testers.py index abe7b0bda6c..ddf320c8260 100644 --- a/tests/unittests/helpers/testers.py +++ b/tests/unittests/helpers/testers.py @@ -26,7 +26,7 @@ from torchmetrics import Metric from torchmetrics.detection.mean_ap import MAPMetricResults -from torchmetrics.utilities.data import apply_to_collection +from torchmetrics.utilities.data import _flatten, apply_to_collection try: set_start_method("spawn") @@ -112,8 +112,8 @@ def _assert_requires_grad(metric: Metric, pl_result: Any, key: Optional[str] = N def _class_test( rank: int, worldsize: int, - preds: Union[Tensor, List[Dict[str, Tensor]]], - target: Union[Tensor, List[Dict[str, Tensor]]], + preds: Union[Tensor, list, List[Dict[str, Tensor]]], + target: Union[Tensor, list, List[Dict[str, Tensor]]], metric_class: Metric, sk_metric: Callable, dist_sync_on_step: bool, @@ -189,15 +189,16 @@ def _class_test( if metric.dist_sync_on_step and check_dist_sync_on_step and rank == 0: if isinstance(preds, Tensor): ddp_preds = torch.cat([preds[i + r] for r in range(worldsize)]).cpu() + else: + ddp_preds = _flatten([preds[i + r] for r in range(worldsize)]) + if isinstance(target, Tensor): ddp_target = torch.cat([target[i + r] for r in range(worldsize)]).cpu() else: - ddp_preds = [preds[i + r] for r in range(worldsize)] - ddp_target = [target[i + r] for r in range(worldsize)] + ddp_target = _flatten([target[i + r] for r in range(worldsize)]) ddp_kwargs_upd = { k: torch.cat([v[i + r] for r in range(worldsize)]).cpu() if isinstance(v, Tensor) else v for k, v in (kwargs_update if fragment_kwargs else batch_kwargs_update).items() } - sk_batch_result = sk_metric(ddp_preds, ddp_target, **ddp_kwargs_upd) if isinstance(batch_result, dict): for key in batch_result: @@ -259,8 +260,8 @@ def _class_test( def _functional_test( - preds: Tensor, - target: Tensor, + preds: Union[Tensor, list], + target: Union[Tensor, list], metric_functional: Callable, sk_metric: Callable, metric_args: dict = None, @@ -282,8 +283,10 @@ def _functional_test( kwargs_update: Additional keyword arguments that will be passed with preds and target when running update on the metric. 
""" - assert preds.shape[0] == target.shape[0] - num_batches = preds.shape[0] + p_size = preds.shape[0] if isinstance(preds, Tensor) else len(preds) + t_size = target.shape[0] if isinstance(target, Tensor) else len(target) + assert p_size == t_size + num_batches = p_size if not metric_args: metric_args = {} @@ -291,8 +294,10 @@ def _functional_test( metric = partial(metric_functional, **metric_args) # move to device - preds = preds.to(device) - target = target.to(device) + if isinstance(preds, Tensor): + preds = preds.to(device) + if isinstance(target, Tensor): + target = target.to(device) kwargs_update = {k: v.to(device) if isinstance(v, Tensor) else v for k, v in kwargs_update.items()} for i in range(num_batches): @@ -302,7 +307,11 @@ def _functional_test( k: v.cpu() if isinstance(v, Tensor) else v for k, v in (extra_kwargs if fragment_kwargs else kwargs_update).items() } - sk_result = sk_metric(preds[i].cpu(), target[i].cpu(), **extra_kwargs) + sk_result = sk_metric( + preds[i].cpu() if isinstance(preds, Tensor) else preds[i], + target[i].cpu() if isinstance(target, Tensor) else target[i], + **extra_kwargs, + ) # assert its the same _assert_allclose(tm_result, sk_result, atol=atol) diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py index 9dcf1507a4c..398118ae9f0 100644 --- a/tests/unittests/multimodal/test_clip_score.py +++ b/tests/unittests/multimodal/test_clip_score.py @@ -21,6 +21,7 @@ from torchmetrics.functional.multimodal.clip_score import clip_score from torchmetrics.multimodal.clip_score import CLIPScore +from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE from unittests.helpers import seed_all from unittests.helpers.testers import MetricTester from unittests.text.helpers import skip_on_connection_issues @@ -52,17 +53,13 @@ def _compare_fn(preds, target, model_name_or_path): return logits_per_image.diag().mean().detach() -@skip_on_connection_issues() @pytest.mark.parametrize("model_name_or_path", ["openai/clip-vit-base-patch32"]) -@pytest.mark.parametrize( - "input", - [ - _random_input, - ], -) +@pytest.mark.parametrize("input", [_random_input]) +@pytest.mark.skipif(not _TRANSFORMERS_AVAILABLE, reason="test requires bert_score") class TestCLIPScore(MetricTester): @pytest.mark.parametrize("ddp", [True, False]) @pytest.mark.parametrize("dist_sync_on_step", [True, False]) + @skip_on_connection_issues() def test_clip_score(self, input, model_name_or_path, ddp, dist_sync_on_step): # images are preds and targets are captions preds, target = input @@ -78,6 +75,7 @@ def test_clip_score(self, input, model_name_or_path, ddp, dist_sync_on_step): check_state_dict=False, ) + @skip_on_connection_issues() def test_clip_score_functional(self, input, model_name_or_path): preds, target = input self.run_functional_metric_test( @@ -88,7 +86,8 @@ def test_clip_score_functional(self, input, model_name_or_path): metric_args={"model_name_or_path": model_name_or_path}, ) - def test_mean_error_differentiability(self, input, model_name_or_path): + @skip_on_connection_issues() + def test_clip_score_differentiability(self, input, model_name_or_path): preds, target = input self.run_differentiability_test( preds=preds, @@ -98,12 +97,14 @@ def test_mean_error_differentiability(self, input, model_name_or_path): metric_args={"model_name_or_path": model_name_or_path}, ) + @skip_on_connection_issues() def test_error_on_not_same_amount_of_input(self, input, model_name_or_path): """Test that an error is raised if the number of images and text 
examples does not match.""" metric = CLIPScore(model_name_or_path=model_name_or_path) with pytest.raises(ValueError): metric(torch.randint(255, (2, 3, 224, 224)), "28-year-old chef found dead in San Francisco mall") + @skip_on_connection_issues() def test_error_on_wrong_image_format(self, input, model_name_or_path): """Test that an error is raised if not all images are [c, h, w] format.""" metric = CLIPScore(model_name_or_path=model_name_or_path) From c0cec12ea321dcf9189034366f8036ec372ee6da Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Mon, 14 Nov 2022 15:20:11 +0100 Subject: [PATCH 20/34] try fixing typing --- src/torchmetrics/functional/multimodal/clip_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py index 46f122c822f..d78070c26c2 100644 --- a/src/torchmetrics/functional/multimodal/clip_score.py +++ b/src/torchmetrics/functional/multimodal/clip_score.py @@ -22,17 +22,17 @@ if _TRANSFORMERS_AVAILABLE: from transformers import CLIPModel as _CLIPModel from transformers import CLIPProcessor as _CLIPProcessor - from transformers import ProcessorMixin as _ProcessorMixin - from transformers.modeling_utils import PreTrainedModel as _PreTrainedModel else: __doctest_skip__ = ["clip_score"] + _CLIPModel = None # type:ignore + _CLIPProcessor = None # type:ignore def _clip_score_update( images: Union[Tensor, List[Tensor]], text: Union[str, List[str]], - model: _PreTrainedModel, - processor: _ProcessorMixin, + model: _CLIPModel, + processor: _CLIPProcessor, ) -> Tuple[Tensor, int]: if not isinstance(images, list): if images.ndim == 3: @@ -73,7 +73,7 @@ def _get_model_and_processor( "openai/clip-vit-large-patch14-336", "openai/clip-vit-large-patch14", ] = "openai/clip-vit-large-patch14", -) -> Tuple[_PreTrainedModel, _ProcessorMixin]: +) -> Tuple[_CLIPModel, _CLIPProcessor]: if _TRANSFORMERS_AVAILABLE: model = _CLIPModel.from_pretrained(model_name_or_path) processor = _CLIPProcessor.from_pretrained(model_name_or_path) From 0cbc15d29df59ecfa559431a5d9023b43e721ac2 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 16 Nov 2022 09:50:28 +0100 Subject: [PATCH 21/34] 8g --- .azure/gpu-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/gpu-pipeline.yml b/.azure/gpu-pipeline.yml index dfa1d53b916..320b3000fc4 100644 --- a/.azure/gpu-pipeline.yml +++ b/.azure/gpu-pipeline.yml @@ -37,7 +37,7 @@ jobs: container: image: "$(docker-image)" - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --name ci-container -v /usr/bin/docker:/tmp/docker:ro" + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=8g --name ci-container -v /usr/bin/docker:/tmp/docker:ro" workspace: clean: all From 0a734db9c0f49b6748c1aa04e8061475bb9c13be Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Wed, 16 Nov 2022 09:54:33 +0100 Subject: [PATCH 22/34] fix requirement + testing --- requirements/multimodal.txt | 2 +- tests/unittests/multimodal/test_clip_score.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/multimodal.txt b/requirements/multimodal.txt index 69d510cb9bc..89029eeeb33 100644 --- a/requirements/multimodal.txt +++ b/requirements/multimodal.txt @@ -1 +1 @@ -transformers>=4.4.0 +transformers>=4.10.0 diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py index 398118ae9f0..5ee389ab0dc 100644 --- a/tests/unittests/multimodal/test_clip_score.py +++ 
b/tests/unittests/multimodal/test_clip_score.py @@ -56,6 +56,7 @@ def _compare_fn(preds, target, model_name_or_path): @pytest.mark.parametrize("model_name_or_path", ["openai/clip-vit-base-patch32"]) @pytest.mark.parametrize("input", [_random_input]) @pytest.mark.skipif(not _TRANSFORMERS_AVAILABLE, reason="test requires bert_score") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires cuda") class TestCLIPScore(MetricTester): @pytest.mark.parametrize("ddp", [True, False]) @pytest.mark.parametrize("dist_sync_on_step", [True, False]) From 8a876c6aef2fcb5ee49ad4fa28919541c60ab026 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Wed, 16 Nov 2022 10:37:39 +0100 Subject: [PATCH 23/34] more requirements --- requirements/text_test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/text_test.txt b/requirements/text_test.txt index a4fff8a013a..8a4c8826960 100644 --- a/requirements/text_test.txt +++ b/requirements/text_test.txt @@ -1,6 +1,6 @@ jiwer>=2.3.0 rouge-score>=0.0.4 bert_score==0.3.10 -transformers>=4.4.0 +transformers>=4.10.0 huggingface-hub<0.7 # hotfix, failing SDR for latest PT 1.11 sacrebleu>=2.0.0 From ae798458ab1e1b247a84cd6bdbe63199e99df32f Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Wed, 16 Nov 2022 10:54:35 +0100 Subject: [PATCH 24/34] fix --- src/torchmetrics/multimodal/clip_score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index 223c240b4b2..a80c4b53ddc 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -82,7 +82,7 @@ def __init__( self.add_state("score", torch.tensor(0.0), dist_reduce_fx="sum") self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum") - def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: # type: ignore + def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, List[str]]) -> None: """Updates CLIP score on a batch of images and text. 
Args: From f091e6274ee744abf0143d9a44938008fa9e2a80 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Wed, 16 Nov 2022 16:46:18 +0100 Subject: [PATCH 25/34] fix doctests --- src/torchmetrics/functional/multimodal/clip_score.py | 4 ++-- src/torchmetrics/multimodal/clip_score.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py index d78070c26c2..5460cd5182b 100644 --- a/src/torchmetrics/functional/multimodal/clip_score.py +++ b/src/torchmetrics/functional/multimodal/clip_score.py @@ -127,8 +127,8 @@ def clip_score( >>> import torch >>> _ = torch.manual_seed(42) >>> from torchmetrics.functional.multimodal import clip_score - >>> clip_score(torch.randint(255, (3, 224, 224)), "a photo of a cat") - tensor(19.4135, grad_fn=) + >>> clip_score(torch.randint(255, (3, 224, 224)), "a photo of a cat", "openai/clip-vit-base-patch16") + tensor(24.4255, grad_fn=) """ model, processor = _get_model_and_processor(model_name_or_path) device = images.device if isinstance(images, Tensor) else images[0].device diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index a80c4b53ddc..a94a2336191 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -55,9 +55,9 @@ class CLIPScore(Metric): >>> import torch >>> _ = torch.manual_seed(42) >>> from torchmetrics.multimodal import CLIPScore - >>> metric = CLIPScore() + >>> metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16") >>> metric(torch.randint(255, (3, 224, 224)), "a photo of a cat") - tensor(19.4135, grad_fn=) + tensor(25.0936, grad_fn=) """ is_differentiable: bool = False From 53ec80de86d1f8de5c8dcbf2582038db5caa5542 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Wed, 16 Nov 2022 17:09:58 +0100 Subject: [PATCH 26/34] fix --- .github/workflows/docs-check.yml | 2 +- src/torchmetrics/functional/multimodal/clip_score.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs-check.yml b/.github/workflows/docs-check.yml index 7fd14a61cdb..fb2611973f0 100644 --- a/.github/workflows/docs-check.yml +++ b/.github/workflows/docs-check.yml @@ -8,7 +8,7 @@ jobs: test-docs: runs-on: ubuntu-20.04 - timeout-minutes: 15 + timeout-minutes: 20 steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py index 5460cd5182b..2416c8936c3 100644 --- a/src/torchmetrics/functional/multimodal/clip_score.py +++ b/src/torchmetrics/functional/multimodal/clip_score.py @@ -51,7 +51,9 @@ def _clip_score_update( f"Expected the number of images and text examples to be the same but got {len(images)} and {len(text)}" ) device = images[0].device - processed_input = processor(text=text, images=[i.cpu() for i in images], return_tensors="pt", padding=True) + processed_input = processor( + text=text, images=[i.cpu() for i in images], return_tensors="pt", padding=True + ) # type:ignore img_features = model.get_image_features(processed_input["pixel_values"].to(device)) img_features = img_features / img_features.norm(p=2, dim=-1, keepdim=True) From e3a9117d2be841af7e58d3a4ba0c3057ad93be76 Mon Sep 17 00:00:00 2001 From: SkafteNicki Date: Thu, 17 Nov 2022 09:01:13 +0100 Subject: [PATCH 27/34] remove back --- src/torchmetrics/functional/multimodal/clip_score.py | 5 +++-- 
src/torchmetrics/multimodal/clip_score.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py index 2416c8936c3..01a5236c0b3 100644 --- a/src/torchmetrics/functional/multimodal/clip_score.py +++ b/src/torchmetrics/functional/multimodal/clip_score.py @@ -129,8 +129,9 @@ def clip_score( >>> import torch >>> _ = torch.manual_seed(42) >>> from torchmetrics.functional.multimodal import clip_score - >>> clip_score(torch.randint(255, (3, 224, 224)), "a photo of a cat", "openai/clip-vit-base-patch16") - tensor(24.4255, grad_fn=) + >>> score = clip_score(torch.randint(255, (3, 224, 224)), "a photo of a cat", "openai/clip-vit-base-patch16") + >>> print(score.detach()) + tensor(24.4255) """ model, processor = _get_model_and_processor(model_name_or_path) device = images.device if isinstance(images, Tensor) else images[0].device diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py index a94a2336191..0d72bda93c4 100644 --- a/src/torchmetrics/multimodal/clip_score.py +++ b/src/torchmetrics/multimodal/clip_score.py @@ -56,8 +56,9 @@ class CLIPScore(Metric): >>> _ = torch.manual_seed(42) >>> from torchmetrics.multimodal import CLIPScore >>> metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16") - >>> metric(torch.randint(255, (3, 224, 224)), "a photo of a cat") - tensor(25.0936, grad_fn=) + >>> score = metric(torch.randint(255, (3, 224, 224)), "a photo of a cat") + >>> print(score.detach()) + tensor(25.0936) """ is_differentiable: bool = False From ea4b11afbcb6f9499800ba265c4b53bad830ca11 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Nov 2022 09:39:26 +0000 Subject: [PATCH 28/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e11bb43802..df9b3ede696 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,7 +55,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- +- ## [0.10.3] - 2022-11-16 From f1595a27cf26700960007ab41c38cabedac7fb14 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Detlefsen Date: Thu, 17 Nov 2022 11:09:32 +0100 Subject: [PATCH 29/34] move section in index --- docs/source/index.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 7dd077b8dbf..896fbfb69bd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -176,19 +176,19 @@ Or directly from conda .. toctree:: :maxdepth: 2 - :name: nominal - :caption: Nominal + :name: multimodal + :caption: Multimodal :glob: - nominal/* + multimodal/* .. toctree:: :maxdepth: 2 - :name: multimodal - :caption: Multimodal + :name: nominal + :caption: Nominal :glob: - multimodal/* + nominal/* .. 
toctree::
    :maxdepth: 2

From ae925b9954f51eeece440ee7c1007898407380f5 Mon Sep 17 00:00:00 2001
From: Nicki Skafte Detlefsen
Date: Thu, 17 Nov 2022 11:15:41 +0100
Subject: [PATCH 30/34] set min version of transformers

---
 src/torchmetrics/functional/multimodal/clip_score.py | 8 ++++----
 src/torchmetrics/multimodal/clip_score.py            | 6 +++---
 src/torchmetrics/utilities/imports.py                | 1 +
 tests/unittests/multimodal/test_clip_score.py        | 4 ++--
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py
index 01a5236c0b3..4a259450978 100644
--- a/src/torchmetrics/functional/multimodal/clip_score.py
+++ b/src/torchmetrics/functional/multimodal/clip_score.py
@@ -17,9 +17,9 @@ from torch import Tensor
 from typing_extensions import Literal
 
-from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE
+from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_10
 
-if _TRANSFORMERS_AVAILABLE:
+if _TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10:
     from transformers import CLIPModel as _CLIPModel
     from transformers import CLIPProcessor as _CLIPProcessor
 else:
     __doctest_skip__ = ["clip_score"]
@@ -76,7 +76,7 @@ def _get_model_and_processor(
         "openai/clip-vit-large-patch14",
     ] = "openai/clip-vit-large-patch14",
 ) -> Tuple[_CLIPModel, _CLIPProcessor]:
-    if _TRANSFORMERS_AVAILABLE:
+    if _TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10:
         model = _CLIPModel.from_pretrained(model_name_or_path)
         processor = _CLIPProcessor.from_pretrained(model_name_or_path)
         return model, processor
@@ -119,7 +119,7 @@ def clip_score(
     Raises:
         ModuleNotFoundError:
-            If transformers package is not installed
+            If transformers package is not installed or version is lower than 4.10.0
         ValueError:
            If not all images have format [C, H, W]
         ValueError:
diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py
index 0d72bda93c4..9af06aa76b0 100644
--- a/src/torchmetrics/multimodal/clip_score.py
+++ b/src/torchmetrics/multimodal/clip_score.py
@@ -18,9 +18,9 @@ from typing_extensions import Literal
 
 from torchmetrics.functional.multimodal.clip_score import _clip_score_update, _get_model_and_processor
-from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE
+from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_10
 
-if not _TRANSFORMERS_AVAILABLE:
+if not (_TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10):
     __doctest_skip__ = ["CLIPScore"]
 
 from torchmetrics import Metric
@@ -49,7 +49,7 @@ class CLIPScore(Metric):
 
     Raises:
         ModuleNotFoundError:
-            If transformers package is not installed
+            If transformers package is not installed or version is lower than 4.10.0
 
     Example:
         >>> import torch
diff --git a/src/torchmetrics/utilities/imports.py b/src/torchmetrics/utilities/imports.py
index b05c1bbb04b..871655daab3 100644
--- a/src/torchmetrics/utilities/imports.py
+++ b/src/torchmetrics/utilities/imports.py
@@ -113,6 +113,7 @@ def _compare_version(package: str, op: Callable, version: str) -> Optional[bool]
 _TORCHVISION_GREATER_EQUAL_0_8: Optional[bool] = _compare_version("torchvision", operator.ge, "0.8.0")
 _TQDM_AVAILABLE: bool = _package_available("tqdm")
 _TRANSFORMERS_AVAILABLE: bool = _package_available("transformers")
+_TRANSFORMERS_GREATER_EQUAL_4_10: Optional[bool] = _compare_version("transformers", operator.ge, "4.10.0")
 _PESQ_AVAILABLE: bool = _package_available("pesq")
 _SACREBLEU_AVAILABLE: bool = _package_available("sacrebleu")
 _REGEX_AVAILABLE: bool = _package_available("regex")
diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py
index 5ee389ab0dc..0016a2d127f 100644
--- a/tests/unittests/multimodal/test_clip_score.py
+++ b/tests/unittests/multimodal/test_clip_score.py
@@ -21,7 +21,7 @@ from torchmetrics.functional.multimodal.clip_score import clip_score
 from torchmetrics.multimodal.clip_score import CLIPScore
-from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE
+from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_10
 from unittests.helpers import seed_all
 from unittests.helpers.testers import MetricTester
 from unittests.text.helpers import skip_on_connection_issues
@@ -55,7 +55,7 @@ def _compare_fn(preds, target, model_name_or_path):
 
 @pytest.mark.parametrize("model_name_or_path", ["openai/clip-vit-base-patch32"])
 @pytest.mark.parametrize("input", [_random_input])
-@pytest.mark.skipif(not _TRANSFORMERS_AVAILABLE, reason="test requires bert_score")
+@pytest.mark.skipif(not (_TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10), reason="test requires bert_score")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires cuda")
 class TestCLIPScore(MetricTester):
     @pytest.mark.parametrize("ddp", [True, False])

From 8feb2cb2f2130a13775647aa9fd40f271692408d Mon Sep 17 00:00:00 2001
From: SkafteNicki
Date: Thu, 17 Nov 2022 14:27:29 +0100
Subject: [PATCH 31/34] fix flake

---
 tests/unittests/multimodal/test_clip_score.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py
index 0016a2d127f..61af86e92ff 100644
--- a/tests/unittests/multimodal/test_clip_score.py
+++ b/tests/unittests/multimodal/test_clip_score.py
@@ -55,7 +55,9 @@ def _compare_fn(preds, target, model_name_or_path):
 
 @pytest.mark.parametrize("model_name_or_path", ["openai/clip-vit-base-patch32"])
 @pytest.mark.parametrize("input", [_random_input])
-@pytest.mark.skipif(not (_TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10), reason="test requires bert_score")
+@pytest.mark.skipif(
+    not (_TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10), reason="test requires bert_score"
+)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires cuda")
 class TestCLIPScore(MetricTester):
     @pytest.mark.parametrize("ddp", [True, False])

From 95bd30b5fa615952d9fb580d5f8c63132875cc33 Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Thu, 17 Nov 2022 14:37:08 +0100
Subject: [PATCH 32/34] simple

---
 src/torchmetrics/functional/multimodal/clip_score.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py
index 4a259450978..f351387dd01 100644
--- a/src/torchmetrics/functional/multimodal/clip_score.py
+++ b/src/torchmetrics/functional/multimodal/clip_score.py
@@ -17,9 +17,9 @@ from torch import Tensor
 from typing_extensions import Literal
 
-from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_10
+from torchmetrics.utilities.imports import _TRANSFORMERS_GREATER_EQUAL_4_10
 
-if _TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10:
+if _TRANSFORMERS_GREATER_EQUAL_4_10:
     from transformers import CLIPModel as _CLIPModel
     from transformers import CLIPProcessor as _CLIPProcessor
 else:
     __doctest_skip__ = ["clip_score"]
@@ -76,7 +76,7 @@ def _get_model_and_processor(
         "openai/clip-vit-large-patch14",
     ] = "openai/clip-vit-large-patch14",
 ) -> Tuple[_CLIPModel, _CLIPProcessor]:
-    if _TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10:
+    if _TRANSFORMERS_GREATER_EQUAL_4_10:
         model = _CLIPModel.from_pretrained(model_name_or_path)
         processor = _CLIPProcessor.from_pretrained(model_name_or_path)
         return model, processor

From 0debc25b61d914c04de36011803e03d72a7150f9 Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Thu, 17 Nov 2022 14:39:29 +0100
Subject: [PATCH 33/34] Apply suggestions from code review

---
 requirements/text_test.txt                | 2 +-
 src/torchmetrics/multimodal/clip_score.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements/text_test.txt b/requirements/text_test.txt
index 8a4c8826960..2046de7363c 100644
--- a/requirements/text_test.txt
+++ b/requirements/text_test.txt
@@ -1,6 +1,6 @@
 jiwer>=2.3.0
 rouge-score>=0.0.4
 bert_score==0.3.10
-transformers>=4.10.0
+transformers>4.4.0
 huggingface-hub<0.7 # hotfix, failing SDR for latest PT 1.11
 sacrebleu>=2.0.0
diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py
index 9af06aa76b0..cfbdcaa3f73 100644
--- a/src/torchmetrics/multimodal/clip_score.py
+++ b/src/torchmetrics/multimodal/clip_score.py
@@ -18,9 +18,9 @@ from typing_extensions import Literal
 
 from torchmetrics.functional.multimodal.clip_score import _clip_score_update, _get_model_and_processor
-from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_10
+from torchmetrics.utilities.imports import _TRANSFORMERS_GREATER_EQUAL_4_10
 
-if not (_TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10):
+if not _TRANSFORMERS_GREATER_EQUAL_4_10:
     __doctest_skip__ = ["CLIPScore"]
 
 from torchmetrics import Metric
@@ -55,7 +55,7 @@ class CLIPScore(Metric):
        >>> _ = torch.manual_seed(42)
        >>> from torchmetrics.multimodal import CLIPScore
        >>> metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
        >>> score = metric(torch.randint(255, (3, 224, 224)), "a photo of a cat")

From 56dc6f7eb877dda55331bafbfd659c080c0e541a Mon Sep 17 00:00:00 2001
From: Jirka
Date: Thu, 17 Nov 2022 14:44:07 +0100
Subject: [PATCH 34/34] avail

---
 src/torchmetrics/functional/multimodal/clip_score.py | 6 +++---
 src/torchmetrics/multimodal/clip_score.py            | 4 ++--
 src/torchmetrics/utilities/imports.py                | 1 -
 tests/unittests/multimodal/test_clip_score.py        | 6 ++----
 4 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/torchmetrics/functional/multimodal/clip_score.py b/src/torchmetrics/functional/multimodal/clip_score.py
index f351387dd01..b97d7c1ddf3 100644
--- a/src/torchmetrics/functional/multimodal/clip_score.py
+++ b/src/torchmetrics/functional/multimodal/clip_score.py
@@ -17,9 +17,9 @@ from torch import Tensor
 from typing_extensions import Literal
 
-from torchmetrics.utilities.imports import _TRANSFORMERS_GREATER_EQUAL_4_10
+from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE
 
-if _TRANSFORMERS_GREATER_EQUAL_4_10:
+if _TRANSFORMERS_AVAILABLE:
     from transformers import CLIPModel as _CLIPModel
     from transformers import CLIPProcessor as _CLIPProcessor
 else:
     __doctest_skip__ = ["clip_score"]
@@ -76,7 +76,7 @@ def _get_model_and_processor(
         "openai/clip-vit-large-patch14",
     ] = "openai/clip-vit-large-patch14",
 ) -> Tuple[_CLIPModel, _CLIPProcessor]:
-    if _TRANSFORMERS_GREATER_EQUAL_4_10:
+    if _TRANSFORMERS_AVAILABLE:
         model = _CLIPModel.from_pretrained(model_name_or_path)
         processor = _CLIPProcessor.from_pretrained(model_name_or_path)
         return model, processor
diff --git a/src/torchmetrics/multimodal/clip_score.py b/src/torchmetrics/multimodal/clip_score.py
index cfbdcaa3f73..234874b931b 100644
--- a/src/torchmetrics/multimodal/clip_score.py
+++ b/src/torchmetrics/multimodal/clip_score.py
@@ -18,9 +18,9 @@ from typing_extensions import Literal
 
 from torchmetrics.functional.multimodal.clip_score import _clip_score_update, _get_model_and_processor
-from torchmetrics.utilities.imports import _TRANSFORMERS_GREATER_EQUAL_4_10
+from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE
 
-if not _TRANSFORMERS_GREATER_EQUAL_4_10:
+if not _TRANSFORMERS_AVAILABLE:
     __doctest_skip__ = ["CLIPScore"]
 
 from torchmetrics import Metric
diff --git a/src/torchmetrics/utilities/imports.py b/src/torchmetrics/utilities/imports.py
index 871655daab3..b05c1bbb04b 100644
--- a/src/torchmetrics/utilities/imports.py
+++ b/src/torchmetrics/utilities/imports.py
@@ -113,7 +113,6 @@ def _compare_version(package: str, op: Callable, version: str) -> Optional[bool]
 _TORCHVISION_GREATER_EQUAL_0_8: Optional[bool] = _compare_version("torchvision", operator.ge, "0.8.0")
 _TQDM_AVAILABLE: bool = _package_available("tqdm")
 _TRANSFORMERS_AVAILABLE: bool = _package_available("transformers")
-_TRANSFORMERS_GREATER_EQUAL_4_10: Optional[bool] = _compare_version("transformers", operator.ge, "4.10.0")
 _PESQ_AVAILABLE: bool = _package_available("pesq")
 _SACREBLEU_AVAILABLE: bool = _package_available("sacrebleu")
 _REGEX_AVAILABLE: bool = _package_available("regex")
diff --git a/tests/unittests/multimodal/test_clip_score.py b/tests/unittests/multimodal/test_clip_score.py
index 61af86e92ff..5ee389ab0dc 100644
--- a/tests/unittests/multimodal/test_clip_score.py
+++ b/tests/unittests/multimodal/test_clip_score.py
@@ -21,7 +21,7 @@ from torchmetrics.functional.multimodal.clip_score import clip_score
 from torchmetrics.multimodal.clip_score import CLIPScore
-from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE, _TRANSFORMERS_GREATER_EQUAL_4_10
+from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE
 from unittests.helpers import seed_all
 from unittests.helpers.testers import MetricTester
 from unittests.text.helpers import skip_on_connection_issues
@@ -55,9 +55,7 @@ def _compare_fn(preds, target, model_name_or_path):
 
 @pytest.mark.parametrize("model_name_or_path", ["openai/clip-vit-base-patch32"])
 @pytest.mark.parametrize("input", [_random_input])
-@pytest.mark.skipif(
-    not (_TRANSFORMERS_AVAILABLE and _TRANSFORMERS_GREATER_EQUAL_4_10), reason="test requires bert_score"
-)
+@pytest.mark.skipif(not _TRANSFORMERS_AVAILABLE, reason="test requires transformers")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires cuda")
 class TestCLIPScore(MetricTester):
     @pytest.mark.parametrize("ddp", [True, False])
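After the last patch, CLIPScore is exposed both as a module metric and as a functional. A minimal usage sketch of that final interface follows; it assumes torchmetrics is installed together with the transformers package, that the chosen checkpoint ("openai/clip-vit-base-patch16", as in the doctests) can be downloaded from Huggingface on first use, and that images are integer tensors in [C, H, W] or [N, C, H, W] layout. The caption strings and batch size below are only illustrative.

    import torch
    from torchmetrics.functional.multimodal import clip_score
    from torchmetrics.multimodal import CLIPScore

    images = torch.randint(255, (4, 3, 224, 224))  # dummy batch of four RGB images
    captions = ["a photo of a cat", "a photo of a dog", "a truck on a road", "a bowl of fruit"]

    # module interface: accumulate over batches, then aggregate
    metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
    metric.update(images, captions)
    print(metric.compute())  # average CLIP score over all samples seen so far

    # functional interface: one-shot computation on a single batch
    print(clip_score(images, captions, model_name_or_path="openai/clip-vit-base-patch16"))

The exact numbers depend on the pretrained checkpoint; higher values indicate better image-caption agreement, and the functional variant clamps the result at zero from below.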