Skip to content

Commit cd4f91c

Browse files
authored
feat: add get_vector_size method to embedder interface (#587)
1 parent d6c6eac commit cd4f91c

19 files changed

+667
-6
lines changed

packages/ragbits-core/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
## Unreleased
44

5+
- feat: add get_vector_size method to all Embedders (#587)
6+
57
## 0.19.1 (2025-05-27)
68

79
## 0.19.0 (2025-05-27)

packages/ragbits-core/src/ragbits/core/embeddings/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .base import Embedder, EmbedderOptionsT, SparseVector
1+
from .base import Embedder, EmbedderOptionsT, SparseVector, VectorSize
22
from .dense import DenseEmbedder, LiteLLMEmbedder, NoopEmbedder
33
from .sparse import BagOfTokens, BagOfTokensOptions, SparseEmbedder, SparseEmbedderOptionsT
44

@@ -13,4 +13,5 @@
1313
"SparseEmbedder",
1414
"SparseEmbedderOptionsT",
1515
"SparseVector",
16+
"VectorSize",
1617
]

packages/ragbits-core/src/ragbits/core/embeddings/base.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@ def __repr__(self) -> str:
2424
return f"SparseVector(indices={self.indices}, values={self.values})"
2525

2626

27+
class VectorSize(BaseModel):
    """Describes the dimensionality of the vectors produced by an embedder."""

    size: int
    """Number of dimensions of the vector (the vocabulary size for sparse vectors)."""

    is_sparse: bool = False
    """True if the size refers to a sparse vector; False (the default) for a dense vector."""
35+
36+
2737
class Embedder(ConfigurableComponent[EmbedderOptionsT], ABC):
2838
"""
2939
Abstract class that defines a common interface for both sparse and dense embedding models.
@@ -48,6 +58,15 @@ async def embed_text(
4858
List of embeddings for the given strings.
4959
"""
5060

61+
@abstractmethod
async def get_vector_size(self) -> VectorSize:
    """
    Report the size of the vectors this embedder returns.

    Concrete embedders must implement this method.

    Returns:
        VectorSize holding the vector dimension and a flag telling whether the vectors are sparse.
    """
69+
5170
def image_support(self) -> bool: # noqa: PLR6301
5271
"""
5372
Check if the model supports image embeddings.

packages/ragbits-core/src/ragbits/core/embeddings/dense/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from abc import ABC, abstractmethod
22

3-
from ragbits.core.embeddings.base import Embedder, EmbedderOptionsT
3+
from ragbits.core.embeddings.base import Embedder, EmbedderOptionsT, VectorSize
44

55

66
class DenseEmbedder(Embedder[EmbedderOptionsT], ABC): # noqa: F821
@@ -21,6 +21,15 @@ async def embed_text(self, data: list[str], options: EmbedderOptionsT | None = N
2121
List of embeddings for the given strings.
2222
"""
2323

24+
@abstractmethod
async def get_vector_size(self) -> VectorSize:
    """
    Report the size of the dense vectors this embedder returns.

    Concrete dense embedders must implement this method.

    Returns:
        VectorSize with is_sparse=False and size set to the embedding dimension.
    """
32+
2433
async def embed_image(self, images: list[bytes], options: EmbedderOptionsT | None = None) -> list[list[float]]:
2534
"""
2635
Creates embeddings for the given images.

packages/ragbits-core/src/ragbits/core/embeddings/dense/fastembed.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from fastembed import TextEmbedding
44

55
from ragbits.core.audit.traces import trace
6+
from ragbits.core.embeddings.base import VectorSize
67
from ragbits.core.embeddings.dense.base import DenseEmbedder, EmbedderOptionsT
78
from ragbits.core.options import Options
89

@@ -43,6 +44,26 @@ def __reduce__(self) -> tuple[Callable, tuple[str, bool, FastEmbedOptions | None
4344
"""
4445
return (self.__class__, (self.model_name, self.use_gpu, self.default_options))
4546

47+
async def get_vector_size(self) -> VectorSize:
    """
    Report the embedding dimension of this FastEmbed model.

    The dimension is read from the "dim" entry of the model's record in
    FastEmbed's supported-models list. When no such record or entry is found,
    a sample text is embedded and the length of the result is used instead.

    Returns:
        VectorSize with is_sparse=False and the model's embedding dimension.
    """
    # Find the first catalogue record describing the configured model.
    catalog_entry = None
    for candidate in self._model.list_supported_models():
        if candidate["model"] == self.model_name:
            catalog_entry = candidate
            break

    if not catalog_entry or "dim" not in catalog_entry:
        # No usable metadata: measure an actual embedding instead.
        probe = await self.embed_text(["sample"])
        return VectorSize(size=len(probe[0]), is_sparse=False)

    return VectorSize(size=catalog_entry["dim"], is_sparse=False)
66+
4667
async def embed_text(self, data: list[str], options: EmbedderOptionsT | None = None) -> list[list[float]]:
4768
"""
4869
Embeds a list of strings into a list of embeddings.

packages/ragbits-core/src/ragbits/core/embeddings/dense/litellm.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
from typing import Any
1+
from typing import Any, cast
22

33
import litellm
44
from typing_extensions import Self
55

66
from ragbits.core.audit.traces import trace
7+
from ragbits.core.embeddings.base import VectorSize
78
from ragbits.core.embeddings.dense.base import DenseEmbedder
89
from ragbits.core.embeddings.exceptions import (
910
EmbeddingConnectionError,
@@ -68,6 +69,29 @@ def __init__(
6869
self.api_version = api_version
6970
self.router = router
7071

72+
async def get_vector_size(self) -> VectorSize:
    """
    Report the embedding dimension of this LiteLLM model.

    A `dimensions` value set in the default options takes precedence.
    Without one, a sample text is embedded and its length is returned.

    Returns:
        VectorSize with is_sparse=False and the model's embedding dimension.
    """
    configured = self.default_options.dimensions if self.default_options else None

    if configured is not None and configured is not NOT_GIVEN:
        # Both "unset" markers are ruled out above, so the value is treated as an int.
        return VectorSize(size=cast(int, configured), is_sparse=False)

    # No explicit dimension configured: measure an actual embedding.
    probe = await self.embed_text(["sample"])
    return VectorSize(size=len(probe[0]), is_sparse=False)
94+
7195
async def embed_text(self, data: list[str], options: LiteLLMEmbedderOptions | None = None) -> list[list[float]]:
7296
"""
7397
Creates embeddings for the given strings.

packages/ragbits-core/src/ragbits/core/embeddings/dense/local.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import Any
33

44
from ragbits.core.audit.traces import trace
5+
from ragbits.core.embeddings.base import VectorSize
56
from ragbits.core.embeddings.dense.base import DenseEmbedder
67
from ragbits.core.options import Options
78

@@ -55,6 +56,19 @@ def __init__(
5556
self.model_name = model_name
5657
self.model = SentenceTransformer(self.model_name, **model_kwargs)
5758

59+
async def get_vector_size(self) -> VectorSize:
    """
    Report the embedding dimension of the local SentenceTransformer model.

    The model is asked for its sentence embedding dimension first; if it
    returns None, a sample text is embedded and measured instead.

    Returns:
        VectorSize with is_sparse=False and the model's embedding dimension.
    """
    reported = self.model.get_sentence_embedding_dimension()
    if reported is not None:
        return VectorSize(size=reported, is_sparse=False)

    # The model did not report a dimension: measure an actual embedding.
    probe = await self.embed_text(["sample"])
    return VectorSize(size=len(probe[0]), is_sparse=False)
71+
5872
async def embed_text(self, data: list[str], options: LocalEmbedderOptions | None = None) -> list[list[float]]:
5973
"""
6074
Calls the appropriate encoder endpoint with the given data and options.

packages/ragbits-core/src/ragbits/core/embeddings/dense/noop.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from ragbits.core.audit.traces import trace
2+
from ragbits.core.embeddings.base import VectorSize
23
from ragbits.core.embeddings.dense.base import DenseEmbedder
34
from ragbits.core.options import Options, OptionsT
45

@@ -37,6 +38,15 @@ def __init__(
3738
self.return_cycle = 0
3839
self.image_return_cycle = 0
3940

41+
async def get_vector_size(self) -> VectorSize:
    """
    Get the vector size for this NoopEmbedder.

    The size is the length of the first embedding found in the first entry
    of `return_values`.

    Returns:
        VectorSize with is_sparse=False and the dimension of the first embedding vector.

    Raises:
        IndexError: If `return_values` is empty or its first entry holds no embeddings.
    """
    first_entry = self.return_values[0] if self.return_values else None
    if not first_entry:
        # Same exception type as plain indexing would raise, but with a message
        # that points at the actual misconfiguration.
        raise IndexError("Cannot determine vector size: return_values contains no embeddings")
    return VectorSize(size=len(first_entry[0]), is_sparse=False)
49+
4050
async def embed_text(self, data: list[str], options: Options | None = None) -> list[list[float]]: # noqa: PLR6301
4151
"""
4252
Embeds a list of strings into a list of vectors.

packages/ragbits-core/src/ragbits/core/embeddings/dense/vertex_multimodal.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import asyncio
22
import base64
33

4+
from ragbits.core.embeddings.base import VectorSize
45
from ragbits.core.embeddings.dense.litellm import LiteLLMEmbedderOptions
56

67
try:
@@ -69,6 +70,18 @@ def __init__(
6970
if model_name not in supported_models:
7071
raise ValueError(f"Model {model_name} is not supported by VertexAI multimodal embeddings")
7172

73+
async def get_vector_size(self) -> VectorSize:
    """
    Report the embedding dimension of this VertexAI multimodal model.

    The dimension is measured by embedding a sample text and taking the
    length of the first returned embedding.

    Returns:
        VectorSize with is_sparse=False and the model's embedding dimension.
    """
    embeddings = await self.embed_text(["sample"])
    dimension = len(embeddings[0])
    return VectorSize(size=dimension, is_sparse=False)
84+
7285
async def _embed(self, data: list[dict], options: LiteLLMEmbedderOptions | None = None) -> list[dict]:
7386
"""
7487
Creates embeddings for the given data. The format is defined in the VertexAI API:

packages/ragbits-core/src/ragbits/core/embeddings/sparse/bag_of_tokens.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import tiktoken
44

55
from ragbits.core.audit.traces import trace
6-
from ragbits.core.embeddings.base import SparseVector
6+
from ragbits.core.embeddings.base import SparseVector, VectorSize
77
from ragbits.core.embeddings.sparse.base import SparseEmbedder
88
from ragbits.core.options import Options
99
from ragbits.core.types import NOT_GIVEN, NotGiven
@@ -22,6 +22,33 @@ class BagOfTokens(SparseEmbedder[BagOfTokensOptions]):
2222

2323
options_cls = BagOfTokensOptions
2424

25+
async def get_vector_size(self) -> VectorSize:
    """
    Get the vector size for this BagOfTokens model.

    For BagOfTokens, this returns the vocabulary size of the tiktoken encoder
    selected by the default options (by encoding name or by model name).

    Returns:
        VectorSize object with is_sparse=True and the vocabulary size.

    Raises:
        ValueError: If both or neither of encoding_name and model_name are set
            in the default options.
    """
    options = self.default_options

    if options.encoding_name and options.model_name:
        raise ValueError("Please specify only one of encoding_name or model_name")

    # A single chain both validates and selects the encoder, so the
    # "neither given" case is handled once, without an unreachable branch.
    if options.encoding_name:
        encoder = tiktoken.get_encoding(encoding_name=options.encoding_name)
    elif options.model_name:
        encoder = tiktoken.encoding_for_model(model_name=options.model_name)
    else:
        raise ValueError("Either encoding_name or model_name needs to be specified")

    return VectorSize(size=encoder.n_vocab, is_sparse=True)
51+
2552
async def embed_text(self, texts: list[str], options: BagOfTokensOptions | None = None) -> list[SparseVector]:
2653
"""
2754
Transforms a list of texts into sparse vectors using bag-of-tokens representation.

packages/ragbits-core/src/ragbits/core/embeddings/sparse/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from abc import ABC, abstractmethod
22
from typing import TypeVar
33

4-
from ragbits.core.embeddings.base import Embedder, SparseVector
4+
from ragbits.core.embeddings.base import Embedder, SparseVector, VectorSize
55
from ragbits.core.options import Options
66

77
SparseEmbedderOptionsT = TypeVar("SparseEmbedderOptionsT", bound=Options)
@@ -23,6 +23,15 @@ async def embed_text(self, texts: list[str], options: SparseEmbedderOptionsT | N
2323
list of sparse embeddings.
2424
"""
2525

26+
@abstractmethod
async def get_vector_size(self) -> VectorSize:
    """
    Report the size of the sparse vectors this embedder returns.

    Concrete sparse embedders must implement this method.

    Returns:
        VectorSize with is_sparse=True and size set to the vocabulary size.
    """
34+
2635
async def embed_image(
2736
self, images: list[bytes], options: SparseEmbedderOptionsT | None = None
2837
) -> list[SparseVector]:

packages/ragbits-core/src/ragbits/core/embeddings/sparse/fastembed.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from fastembed import SparseTextEmbedding
44

55
from ragbits.core.audit.traces import trace
6-
from ragbits.core.embeddings.base import EmbedderOptionsT, SparseVector
6+
from ragbits.core.embeddings.base import EmbedderOptionsT, SparseVector, VectorSize
77
from ragbits.core.embeddings.dense.fastembed import FastEmbedOptions
88
from ragbits.core.embeddings.sparse.base import SparseEmbedder
99

@@ -35,6 +35,29 @@ def __reduce__(self) -> tuple[Callable, tuple[str, bool, FastEmbedOptions | None
3535
"""
3636
return (self.__class__, (self.model_name, self.use_gpu, self.default_options))
3737

38+
async def get_vector_size(self) -> VectorSize:
    """
    Report the vocabulary size of this FastEmbed sparse model.

    The value is read from the "vocab_size" entry of the model's record in
    FastEmbed's supported-models list. When that is unavailable, it is
    estimated from a sample embedding, defaulting to 30000 if the sample
    yields no indices.

    Returns:
        VectorSize with is_sparse=True and the (possibly estimated) vocabulary size.
    """
    catalog_entry = next(
        (entry for entry in self._model.list_supported_models() if entry["model"] == self.model_name),
        None,
    )
    if catalog_entry and "vocab_size" in catalog_entry:
        return VectorSize(size=catalog_entry["vocab_size"], is_sparse=True)

    # NOTE(review): this fallback is only an estimate. It uses the highest token
    # index seen in one sample, which presumably undershoots the real vocabulary
    # size. Confirm that callers can tolerate that.
    probe = await self.embed_text(["sample text with various tokens"])
    if probe and probe[0].indices:
        estimated = max(probe[0].indices) + 1
    else:
        estimated = 30000

    return VectorSize(size=estimated, is_sparse=True)
60+
3861
async def embed_text(self, data: list[str], options: EmbedderOptionsT | None = None) -> list[SparseVector]:
3962
"""
4063
Embeds a list of strings into a list of sparse embeddings.

0 commit comments

Comments
 (0)