
Commit 8684770

[CI] Add mteb testing to test the accuracy of the embedding model (#17175)
1 parent d6c86d0

File tree

6 files changed: +64 -5 lines changed

requirements/test.in

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
+mteb>=1.38.11, <2 # required for mteb test
 transformers==4.51.3
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
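
The new pin pulls in the MTEB (Massive Text Embedding Benchmark) harness that the test added below drives. For orientation, a minimal sketch of driving mteb's 1.x API directly looks roughly like this; the task and model here are illustrative choices, not values taken from this commit:

# A minimal sketch of running one MTEB task with mteb 1.x; task name and
# model are illustrative, not the exact configuration this commit uses.
import mteb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-m3")  # any object exposing encode()
tasks = mteb.get_tasks(tasks=["STS12"])     # a single STS task as an example
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model)             # returns one result per task
print(results[0].scores)                    # per-split scores incl. main_score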

requirements/test.txt

Lines changed: 21 additions & 1 deletion
@@ -99,6 +99,7 @@ datasets==3.0.2
    # via
    #   evaluate
    #   lm-eval
+    #   mteb
decorator==5.1.1
    # via librosa
dill==0.3.8

@@ -124,6 +125,8 @@ email-validator==2.2.0
    # via pydantic
encodec==0.1.1
    # via vocos
+eval-type-backport==0.2.2
+    # via mteb
evaluate==0.4.3
    # via lm-eval
fastparquet==2024.11.0

@@ -291,6 +294,8 @@ msgpack==1.1.0
    # via
    #   librosa
    #   ray
+mteb==1.38.11
+    # via -r requirements/test.in
multidict==6.1.0
    # via
    #   aiohttp

@@ -331,6 +336,7 @@ numpy==1.26.4
    #   librosa
    #   matplotlib
    #   mistral-common
+    #   mteb
    #   numba
    #   numexpr
    #   opencv-python-headless

@@ -443,6 +449,8 @@ plotly==5.24.1
    # via genai-perf
pluggy==1.5.0
    # via pytest
+polars==1.29.0
+    # via mteb
pooch==1.8.2
    # via librosa
portalocker==2.10.1

@@ -476,6 +484,7 @@ pydantic==2.9.2
    # via
    #   datamodel-code-generator
    #   mistral-common
+    #   mteb
pydantic-core==2.23.4
    # via pydantic
pygments==2.18.0

@@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0
    #   typepy
python-rapidjson==1.20
    # via tritonclient
+pytrec-eval-terrier==0.5.7
+    # via mteb
pytz==2024.2
    # via
    #   pandas

@@ -564,6 +575,7 @@ requests==2.32.3
    #   huggingface-hub
    #   lm-eval
    #   mistral-common
+    #   mteb
    #   pooch
    #   ray
    #   responses

@@ -580,6 +592,7 @@ rfc3987==1.3.8
rich==13.9.4
    # via
    #   genai-perf
+    #   mteb
    #   typer
rouge-score==0.1.2
    # via lm-eval

@@ -607,16 +620,20 @@ scikit-learn==1.5.2
    # via
    #   librosa
    #   lm-eval
+    #   mteb
    #   sentence-transformers
scipy==1.13.1
    # via
    #   librosa
+    #   mteb
    #   scikit-learn
    #   sentence-transformers
    #   statsmodels
    #   vocos
sentence-transformers==3.2.1
-    # via -r requirements/test.in
+    # via
+    #   -r requirements/test.in
+    #   mteb
sentencepiece==0.2.0
    # via mistral-common
setuptools==77.0.3

@@ -696,6 +713,7 @@ torch==2.7.0+cu128
    #   fastsafetensors
    #   lm-eval
    #   mamba-ssm
+    #   mteb
    #   peft
    #   runai-model-streamer
    #   sentence-transformers

@@ -720,6 +738,7 @@ tqdm==4.66.6
    #   evaluate
    #   huggingface-hub
    #   lm-eval
+    #   mteb
    #   nltk
    #   peft
    #   pqdm

@@ -759,6 +778,7 @@ typing-extensions==4.12.2
    #   huggingface-hub
    #   librosa
    #   mistral-common
+    #   mteb
    #   pqdm
    #   pydantic
    #   pydantic-core
Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+import math
+import os
+
+import pytest
+
+from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
+                                                      OpenAIClientMtebEncoder,
+                                                      run_mteb_embed_task,
+                                                      run_mteb_embed_task_st)
+from tests.utils import RemoteOpenAIServer
+
+os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
+
+MODEL_NAME = "BAAI/bge-m3"
+DTYPE = "float16"
+MAIN_SCORE = 0.7873427091972599
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task", "embed", "--dtype", DTYPE, "--enforce-eager",
+        "--max-model-len", "512"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+def test_mteb(server):
+    client = server.get_client()
+    encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
+    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
+    st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
+        MODEL_NAME, MTEB_EMBED_TASKS)
+
+    print("VLLM main score: ", vllm_main_score)
+    print("SentenceTransformer main score: ", st_main_score)
+    print("Difference: ", st_main_score - vllm_main_score)
+
+    assert math.isclose(st_main_score, vllm_main_score, rel_tol=1e-4)
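
This new test boots a vLLM OpenAI-compatible server, wraps its embeddings endpoint in an MTEB encoder adapter, and compares the resulting MTEB main score against a SentenceTransformer reference. Note the short-circuit: because MAIN_SCORE is a non-zero constant, the expression MAIN_SCORE or run_mteb_embed_task_st(...) never recomputes the baseline in CI; the cached score is compared directly. The adapter itself lives in tests/models/language/pooling/mteb_utils.py and is not shown in this diff; a hypothetical sketch of such an adapter (names and wiring are assumptions, not taken from the repo):

# Hypothetical sketch of an adapter like OpenAIClientMtebEncoder; the real
# implementation lives in tests/models/language/pooling/mteb_utils.py and is
# not shown in this diff. MTEB duck-types on an encode() method that maps a
# list of sentences to one embedding row per sentence.
import numpy as np
from openai import OpenAI


class OpenAIClientEncoderSketch:

    def __init__(self, model_name: str, client: OpenAI):
        self.model_name = model_name
        self.client = client

    def encode(self, sentences: list[str], **kwargs) -> np.ndarray:
        # One /v1/embeddings request per batch; the API returns embeddings
        # in the same order as the inputs.
        resp = self.client.embeddings.create(model=self.model_name,
                                             input=sentences)
        return np.array([item.embedding for item in resp.data])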

tests/models/language/pooling/test_gte.py

Lines changed: 0 additions & 2 deletions
@@ -58,8 +58,6 @@
 @pytest.mark.parametrize("model_info", MODELS)
 def test_models_mteb(hf_runner, vllm_runner,
                      model_info: EmbedModelInfo) -> None:
-    pytest.skip("Skipping mteb test.")
-
     from .mteb_utils import mteb_test_embed_models

     vllm_extra_kwargs: dict[str, Any] = {}

tests/models/language/pooling/test_nomic.py

Lines changed: 0 additions & 1 deletion
@@ -23,7 +23,6 @@
 @pytest.mark.parametrize("model_info", MODELS)
 def test_models_mteb(hf_runner, vllm_runner,
                      model_info: EmbedModelInfo) -> None:
-    pytest.skip("Skipping mteb test.")
     from .mteb_utils import mteb_test_embed_models
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)

tests/models/language/pooling/test_snowflake_arctic_embed.py

Lines changed: 0 additions & 1 deletion
@@ -46,7 +46,6 @@ def test_models_mteb(
     vllm_runner,
     model_info: EmbedModelInfo,
 ) -> None:
-    pytest.skip("Skipping mteb test.")
     from .mteb_utils import mteb_test_embed_models
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
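
With the skips removed, the per-model MTEB tests for the GTE, Nomic, and Snowflake Arctic embedding models run again in CI, each delegating to mteb_test_embed_models. That helper is defined in mteb_utils.py and is not part of this diff; as a hypothetical sketch, the core check it performs presumably resembles the following (the encoder wiring and tolerance are assumptions mirroring the server test above):

# Hypothetical sketch of the check mteb_test_embed_models delegates to; the
# real helper lives in tests/models/language/pooling/mteb_utils.py and is not
# part of this diff, so the wiring below is an assumption.
import math

from tests.models.language.pooling.mteb_utils import (
    MTEB_EMBED_TASKS, run_mteb_embed_task, run_mteb_embed_task_st)


def mteb_embed_check_sketch(encoder, model_name: str,
                            rel_tol: float = 1e-4) -> None:
    # Score the vLLM-backed encoder and the SentenceTransformer baseline on
    # the same MTEB tasks, then require the main scores to nearly agree.
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
    st_main_score = run_mteb_embed_task_st(model_name, MTEB_EMBED_TASKS)
    assert math.isclose(st_main_score, vllm_main_score, rel_tol=rel_tol)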
