Add support for FinMTEB benchmark #1379

Open
wants to merge 12 commits into base: v2.0.0
22 changes: 14 additions & 8 deletions mteb/abstasks/AbsTaskSTS.py
@@ -36,6 +36,8 @@ class AbsTaskSTS(AbsTask):
"""

abstask_prompt = "Retrieve semantically similar text."
reference_summaries_column: str = "sentence1"
generated_summaries_column: str = "sentence2"

def __init__(self, **kwargs):
super().__init__(**kwargs)
@@ -56,8 +58,8 @@ def normalize(x):

normalized_scores = list(map(normalize, data_split["score"]))
evaluator = STSEvaluator(
data_split["sentence1"],
data_split["sentence2"],
data_split[self.reference_summaries_column],
data_split[self.generated_summaries_column],
normalized_scores,
task_name=self.metadata.name,
**kwargs,
@@ -74,20 +76,24 @@ def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> STSDescriptiveStatistics:
if hf_subset:
sentence1 = self.dataset[hf_subset][split]["sentence1"]
sentence2 = self.dataset[hf_subset][split]["sentence2"]
sentence1 = self.dataset[hf_subset][split][self.reference_summaries_column]
sentence2 = self.dataset[hf_subset][split][self.generated_summaries_column]
score = self.dataset[hf_subset][split]["score"]
elif compute_overall:
sentence1 = []
sentence2 = []
score = []
for hf_subset in self.metadata.eval_langs:
sentence1.extend(self.dataset[hf_subset][split]["sentence1"])
sentence2.extend(self.dataset[hf_subset][split]["sentence2"])
sentence1.extend(
self.dataset[hf_subset][split][self.reference_summaries_column]
)
sentence2.extend(
self.dataset[hf_subset][split][self.generated_summaries_column]
)
score.extend(self.dataset[hf_subset][split]["score"])
else:
sentence1 = self.dataset[split]["sentence1"]
sentence2 = self.dataset[split]["sentence2"]
sentence1 = self.dataset[split][self.reference_summaries_column]
sentence2 = self.dataset[split][self.generated_summaries_column]
score = self.dataset[split]["score"]

total_sentence1_len = sum([len(s) for s in sentence1])
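With these two class attributes, an STS-style task whose dataset does not use the default "sentence1"/"sentence2" columns can remap them declaratively instead of overriding _evaluate_subset. A minimal sketch of the pattern (the task and column names below are hypothetical, and a real task would also define metadata):

from mteb.abstasks.AbsTaskSTS import AbsTaskSTS


class ExampleFinSTS(AbsTaskSTS):
    # Hypothetical task: the scored pair lives in "text_a"/"text_b"
    # instead of the default "sentence1"/"sentence2"; overriding the
    # two column attributes is all that is needed.
    reference_summaries_column = "text_a"
    generated_summaries_column = "text_b"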
23 changes: 15 additions & 8 deletions mteb/abstasks/AbsTaskSummarization.py
@@ -47,6 +47,9 @@ class AbsTaskSummarization(AbsTask):
"Given a news summary, retrieve other semantically similar summaries."
)

reference_summaries_column: str = "human_summaries"
generated_summaries_column: str = "machine_summaries"

def __init__(self, **kwargs):
super().__init__(**kwargs)

@@ -66,8 +69,8 @@ def _evaluate_subset(
for x in data_split["relevance"]
]
evaluator = self.evalutor(
machine_summaries=data_split["machine_summaries"],
human_summaries=data_split["human_summaries"],
machine_summaries=data_split[self.generated_summaries_column],
human_summaries=data_split[self.reference_summaries_column],
texts=data_split["text"],
gold_scores=normalized_scores,
task_name=self.metadata.name,
@@ -85,8 +88,12 @@ def _calculate_metrics_from_split(
) -> SummarizationDescriptiveStatistics:
if hf_subset:
text = self.dataset[hf_subset][split]["text"]
human_summaries = self.dataset[hf_subset][split]["human_summaries"]
machine_summaries = self.dataset[hf_subset][split]["machine_summaries"]
human_summaries = self.dataset[hf_subset][split][
self.reference_summaries_column
]
machine_summaries = self.dataset[hf_subset][split][
self.generated_summaries_column
]
relevance = self.dataset[hf_subset][split]["relevance"]
elif compute_overall:
text = []
@@ -97,16 +104,16 @@ def _calculate_metrics_from_split(
for hf_subset in self.metadata.eval_langs:
text.extend(self.dataset[hf_subset][split]["text"])
human_summaries.extend(
self.dataset[hf_subset][split]["human_summaries"]
self.dataset[hf_subset][split][self.reference_summaries_column]
)
machine_summaries.extend(
self.dataset[hf_subset][split]["machine_summaries"]
self.dataset[hf_subset][split][self.generated_summaries_column]
)
relevance.extend(self.dataset[hf_subset][split]["relevance"])
else:
text = self.dataset[split]["text"]
human_summaries = self.dataset[split]["human_summaries"]
machine_summaries = self.dataset[split]["machine_summaries"]
human_summaries = self.dataset[split][self.reference_summaries_column]
machine_summaries = self.dataset[split][self.generated_summaries_column]
relevance = self.dataset[split]["relevance"]

total_text_len = sum(len(x) for x in text)
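The summarization abstraction gets the same override pattern; a sketch under the same assumptions (hypothetical column names, metadata omitted):

from mteb.abstasks.AbsTaskSummarization import AbsTaskSummarization


class ExampleFinSummarization(AbsTaskSummarization):
    # Hypothetical task: the defaults stay "human_summaries" and
    # "machine_summaries"; a dataset with a different schema only
    # overrides these two attributes.
    reference_summaries_column = "gold_summaries"
    generated_summaries_column = "model_summaries"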
107 changes: 95 additions & 12 deletions mteb/benchmarks/benchmarks.py
@@ -2,7 +2,6 @@

from collections.abc import Sequence
from dataclasses import dataclass
from functools import lru_cache
from typing import Annotated

from pydantic import AnyUrl, BeforeValidator, TypeAdapter
@@ -14,7 +13,8 @@

http_url_adapter = TypeAdapter(AnyUrl)
UrlString = Annotated[
str, BeforeValidator(lambda value: str(http_url_adapter.validate_python(value)))
str,
BeforeValidator(lambda value: str(http_url_adapter.validate_python(value))),
] # Allows the type to be a string, but ensures that the string is a URL


@@ -258,13 +258,13 @@ def load_results(
description="Main Russian benchmarks from MTEB",
reference="https://aclanthology.org/2023.eacl-main.148/",
citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb,
title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
author={Artem Snegirev and Maria Tikhonova and Anna Maksimova and Alena Fenogenova and Alexander Abramov},
year={2024},
eprint={2408.12503},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2408.12503},
}
""",
)
@@ -281,7 +281,7 @@ def load_results(
description="Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions.",
reference="https://arxiv.org/abs/2403.15246",
citation="""@misc{weller2024followir,
title={FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions},
author={Orion Weller and Benjamin Chang and Sean MacAvaney and Kyle Lo and Arman Cohan and Benjamin Van Durme and Dawn Lawrie and Luca Soldaini},
year={2024},
eprint={2403.15246},
@@ -376,7 +376,7 @@ def load_results(
description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.",
reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/",
citation="""@misc{enevoldsen2024scandinavian,
title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding},
author={Kenneth Enevoldsen and Márton Kardos and Niklas Muennighoff and Kristoffer Laigaard Nielbo},
year={2024},
eprint={2406.02396},
@@ -404,13 +404,13 @@ def load_results(
description="CoIR: A Comprehensive Benchmark for Code Information Retrieval Models",
reference="https://github.com/CoIR-team/coir",
citation="""@misc{li2024coircomprehensivebenchmarkcode,
title={CoIR: A Comprehensive Benchmark for Code Information Retrieval Models},
author={Xiangyang Li and Kuicai Dong and Yi Quan Lee and Wei Xia and Yichun Yin and Hao Zhang and Yong Liu and Yasheng Wang and Ruiming Tang},
year={2024},
eprint={2407.02883},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2407.02883},
}""",
)

@@ -456,13 +456,13 @@ def load_results(
description="Main French benchmarks from MTEB",
reference="https://arxiv.org/abs/2405.20468",
citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence,
title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis},
author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini},
year={2024},
eprint={2405.20468},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.20468},
}""",
)

@@ -502,13 +502,13 @@ def load_results(
description="Main German benchmarks from MTEB",
reference="https://arxiv.org/html/2401.02709v1",
citation="""@misc{wehrli2024germantextembeddingclustering,
title={German Text Embedding Clustering Benchmark},
author={Silvan Wehrli and Bert Arnrich and Christopher Irrgang},
year={2024},
eprint={2401.02709},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2401.02709},
}""",
)

@@ -919,3 +919,86 @@ def load_results(
reference=None,
citation=None,
)

FinMTEB = Benchmark(
name="MTEB(Finance)",
tasks=get_tasks(
languages=["eng", "zho"],
tasks=[
"FinancialPhraseBankClassification",
"FinSentClassification",
"FiQAClassification",
"SemEva2017Classification",
"FLSClassification",
"ESGClassification",
"FOMCClassification",
"FinancialFraudClassification",
"FinNSPClassification",
"FinChinaSentimentClassification",
"FinFEClassification",
"OpenFinDataSentimentClassification",
"Weibo21Classification",
"MInDS14EnClustering",
"ComplaintsClustering",
"PiiClustering",
"FinanceArxivS2SClustering",
"FinanceArxivP2PClustering",
"WikiCompany2IndustryClustering",
"MInDS14ZhClustering",
"FinNLClustering",
"CCKS2022Clustering",
"CCKS2020Clustering",
"CCKS2019Clustering",
"HeadlineACPairClassification",
"HeadlinePDDPairClassification",
"HeadlinePDUPairClassification",
"AFQMCPairClassification",
"FinFactReranking",
"FiQA2018Reranking",
"HC3Reranking",
"FinEvaReranking",
"DISCFinLLMReranking",
"FiQA2018",
"FinanceBenchRetrieval",
"HC3Retrieval",
"Apple10KRetrieval",
"FinQARetrieval",
"TATQARetrieval",
"USNewsRetrieval",
"TradeTheEventEncyclopediaRetrieval",
"TradeTheEventNewsRetrieval",
"TheGoldmanEnRetrieval",
"FinTruthQARetrieval",
"FinEvaRetrieval",
"AlphaFinRetrieval",
"DISCFinLLMRetrieval",
"DISCFinLLMComputingRetrieval",
"DuEEFinRetrieval",
"SmoothNLPRetrieval",
"THUCNewsRetrieval",
"FinEvaEncyclopediaRetrieval",
"TheGoldmanZhRetrieval",
"FINAL",
"FinSTS",
"AFQMC",
"BQCorpus",
"Ectsum",
"FINDsum",
"FNS2022sum",
"FiNNAsum",
"FinEvaHeadlinesum",
"FinEvasum",
],
),
description="FinMTEB is an embedding benchmark consists of 64 financial domain-specific text datasets, across English and Chinese.",
reference="https://arxiv.org/abs/2409.18511v1",
citation="""@misc{tang2024needdomainspecificembeddingmodels,
title={Do We Need Domain-Specific Embedding Models? An Empirical Investigation},
author={Yixuan Tang and Yi Yang},
year={2024},
eprint={2409.18511},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2409.18511},
}""",
)
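Once merged, the benchmark should be selectable by its registered name like the existing ones. A usage sketch, assuming the public get_benchmark/MTEB.run API is unchanged on the v2.0.0 base branch; the model here is an arbitrary choice:

import mteb
from sentence_transformers import SentenceTransformer

# Look up the new benchmark by name and run all of its tasks.
benchmark = mteb.get_benchmark("MTEB(Finance)")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
evaluation = mteb.MTEB(tasks=benchmark.tasks)
evaluation.run(model, output_folder="results")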
13 changes: 13 additions & 0 deletions mteb/tasks/Classification/__init__.py
@@ -26,13 +26,21 @@
from .eng.Banking77Classification import *
from .eng.DBpediaClassification import *
from .eng.EmotionClassification import *
from .eng.ESGClassification import *
from .eng.FinancialFraudClassification import *
from .eng.FinancialPhraseBankClassification import *
from .eng.FinancialPhrasebankClassification import *
from .eng.FinSentClassification import *
from .eng.FiQAClassification import *
from .eng.FLSClassification import *
from .eng.FOMCClassification import *
from .eng.FrenkEnClassification import *
from .eng.ImdbClassification import *
from .eng.LegalBenchClassification import *
from .eng.NewsClassification import *
from .eng.PatentClassification import *
from .eng.PoemSentimentClassification import *
from .eng.SemEva2017Classification import *
from .eng.ToxicChatClassification import *
from .eng.ToxicConversationsClassification import *
from .eng.TweetSentimentExtractionClassification import *
@@ -138,6 +146,11 @@
from .urd.UrduRomanSentimentClassification import *
from .vie.VieStudentFeedbackClassification import *
from .zho.CMTEBClassification import *
from .zho.FinChinaSentimentClassification import *
from .zho.FinFEClassification import *
from .zho.FinNSPClassification import *
from .zho.OpenFinDataSentimentClassification import *
from .zho.Weibo21Classification import *
from .zho.YueOpenriceReviewClassification import (
YueOpenriceReviewClassification, # noqa: F401
)
21 changes: 21 additions & 0 deletions mteb/tasks/Classification/eng/ESGClassification.py
@@ -0,0 +1,21 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class ESGClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="ESGClassification",
description="A finance dataset performs sentence classification under the environmental, social, and corporate governance (ESG) framework.",
# reference="",
dataset={
"path": "FinanceMTEB/ESG",
"revision": "521d56feabadda80b11d6adcc6b335d4c5ad8285",
},
type="Classification",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="accuracy",
)
21 changes: 21 additions & 0 deletions mteb/tasks/Classification/eng/FLSClassification.py
@@ -0,0 +1,21 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class FLSClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="FLSClassification",
description="A finance dataset detects whether the sentence is a forward-looking statement.",
# reference="",
dataset={
"path": "FinanceMTEB/FLS",
"revision": "39b6719f1d7197df4498fea9fce20d4ad782a083",
},
type="Classification",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="accuracy",
)
24 changes: 24 additions & 0 deletions mteb/tasks/Classification/eng/FOMCClassification.py
@@ -0,0 +1,24 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class FOMCClassification(AbsTaskClassification):
[Review comment, Contributor] General comment: Meta data is required to be filled out.
[Reply, Author] Ah understood. Working on it.

metadata = TaskMetadata(
name="FOMCClassification",
description="A task of hawkish-dovish classification in finance domain.",
reference="https://github.com/gtfintechlab/fomc-hawkish-dovish",
dataset={
"path": "FinanceMTEB/FOMC",
"revision": "cdaf1306a24bc5e7441c7c871343efdf4c721bc2",
},
type="Classification",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="accuracy",
)

def dataset_transform(self):
self.dataset = self.dataset.rename_column("sentence", "text")
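AbsTaskClassification expects the input column to be named "text", hence the rename in dataset_transform. To spot-check a single new task locally, something like the following should work (arbitrary model and output folder, assuming the standard mteb task-selection API):

import mteb
from sentence_transformers import SentenceTransformer

# Run only the FOMC task to confirm the column rename and metadata load.
tasks = mteb.get_tasks(tasks=["FOMCClassification"])
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
mteb.MTEB(tasks=tasks).run(model, output_folder="results")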