2
2
3
3
from ragbits .core .embeddings .base import VectorSize
4
4
from ragbits .core .embeddings .sparse .bag_of_tokens import BagOfTokens , BagOfTokensOptions
5
- from ragbits .core .types import NOT_GIVEN
6
5
7
6
8
7
async def test_bag_of_tokens_get_vector_size_with_encoding ():
9
8
"""Test BagOfTokens get_vector_size method with encoding_name."""
10
- options = BagOfTokensOptions (encoding_name = "cl100k_base" , model_name = NOT_GIVEN )
11
- embedder = BagOfTokens (default_options = options )
9
+ embedder = BagOfTokens (encoding_name = "cl100k_base" )
12
10
13
11
vector_size = await embedder .get_vector_size ()
14
12
@@ -20,8 +18,7 @@ async def test_bag_of_tokens_get_vector_size_with_encoding():
20
18
21
19
async def test_bag_of_tokens_get_vector_size_with_model ():
22
20
"""Test BagOfTokens get_vector_size method with model_name."""
23
- options = BagOfTokensOptions (model_name = "gpt-3.5-turbo" )
24
- embedder = BagOfTokens (default_options = options )
21
+ embedder = BagOfTokens (model_name = "gpt-3.5-turbo" )
25
22
26
23
vector_size = await embedder .get_vector_size ()
27
24
@@ -44,26 +41,22 @@ async def test_bag_of_tokens_get_vector_size_default():
44
41
45
42
async def test_bag_of_tokens_get_vector_size_error_both_specified ():
46
43
"""Test BagOfTokens constructor raises error when both encoding_name and model_name are specified."""
47
- options = BagOfTokensOptions (encoding_name = "cl100k_base" , model_name = "gpt-3.5-turbo" )
48
- embedder = BagOfTokens (default_options = options )
49
-
50
44
with pytest .raises (ValueError , match = "Please specify only one of encoding_name or model_name" ):
51
- await embedder . get_vector_size ( )
45
+ BagOfTokens ( encoding_name = "cl100k_base" , model_name = "gpt-3.5-turbo" )
52
46
53
47
54
48
async def test_bag_of_tokens_get_vector_size_error_none_specified ():
55
49
"""Test BagOfTokens get_vector_size succeeds with the default model when neither encoding_name nor model_name is specified."""
56
- options = BagOfTokensOptions ( encoding_name = NOT_GIVEN , model_name = NOT_GIVEN )
57
- embedder = BagOfTokens ( default_options = options )
58
-
59
- with pytest . raises ( ValueError , match = "Either encoding_name or model_name needs to be specified" ):
60
- await embedder . get_vector_size ()
50
+ # Previously this case raised ValueError; with no arguments the constructor
51
+ # now falls back to gpt-4o, so get_vector_size is expected to succeed.
52
+ embedder = BagOfTokens ()
53
+ vector_size = await embedder . get_vector_size ()
54
+ assert vector_size . size > 0 # Should succeed with default gpt-4o
61
55
62
56
63
57
async def test_bag_of_tokens_embed_text_consistency ():
64
58
"""Test that BagOfTokens embeddings are consistent with vector size."""
65
- options = BagOfTokensOptions (encoding_name = "cl100k_base" , model_name = NOT_GIVEN )
66
- embedder = BagOfTokens (default_options = options )
59
+ embedder = BagOfTokens (encoding_name = "cl100k_base" )
67
60
68
61
# Get vector size
69
62
vector_size = await embedder .get_vector_size ()
@@ -79,15 +72,26 @@ async def test_bag_of_tokens_embed_text_consistency():
79
72
80
73
async def test_bag_of_tokens_different_encodings ():
81
74
"""Test BagOfTokens with different encodings have different vocabulary sizes."""
82
- options1 = BagOfTokensOptions (encoding_name = "cl100k_base" , model_name = NOT_GIVEN )
83
- embedder1 = BagOfTokens (default_options = options1 )
84
-
85
- options2 = BagOfTokensOptions (encoding_name = "p50k_base" , model_name = NOT_GIVEN )
86
- embedder2 = BagOfTokens (default_options = options2 )
75
+ embedder1 = BagOfTokens (encoding_name = "cl100k_base" )
76
+ embedder2 = BagOfTokens (encoding_name = "p50k_base" )
87
77
88
78
vector_size1 = await embedder1 .get_vector_size ()
89
79
vector_size2 = await embedder2 .get_vector_size ()
90
80
91
81
assert vector_size1 .size != vector_size2 .size
92
82
assert vector_size1 .is_sparse is True
93
83
assert vector_size2 .is_sparse is True
84
+
85
+
86
+ async def test_bag_of_tokens_min_token_count_option ():
87
+ """Test BagOfTokens with min_token_count option."""
88
+ embedder = BagOfTokens (encoding_name = "cl100k_base" )
89
+ options = BagOfTokensOptions (min_token_count = 2 )
90
+
91
+ # Test with text that has some repeated tokens
92
+ embeddings = await embedder .embed_text (["test test test" ], options = options )
93
+
94
+ # Should have embeddings (non-empty vectors)
95
+ assert len (embeddings ) == 1
96
+ assert len (embeddings [0 ].indices ) > 0
97
+ assert len (embeddings [0 ].values ) > 0
0 commit comments