Commit dace285

feat: extend eval dataloaders (#576)
1 parent 90b4727 commit dace285

File tree

5 files changed: 60 additions (+), 29 deletions (−)


packages/ragbits-evaluate/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@
 
 ## Unreleased
 
+- Add support for slicing dataset (#576)
+- Separate load and map ops in data loaders (#576)
+
 ## 0.18.0 (2025-05-22)
 
 ### Changed
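
The slicing entry refers to Hugging Face's split-slicing syntax, which the loaders now pass through via their `split` argument. A minimal, ragbits-independent sketch of that syntax (the `qa.json` file name is hypothetical):

```python
from datasets import load_dataset

# Register a local file under a split named "data", then keep only the
# first 100 rows of that split.
subset = load_dataset(
    "json",
    data_files={"data": "qa.json"},
    split="data[:100]",
)

# Other slice forms the API accepts: "data[10:20]", "data[50%:]", "data[:25%]".
print(len(subset))
```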

packages/ragbits-evaluate/src/ragbits/evaluate/dataloaders/base.py

Lines changed: 36 additions & 3 deletions
@@ -3,12 +3,14 @@
 from types import ModuleType
 from typing import ClassVar, Generic
 
+from datasets import load_dataset
 from pydantic import BaseModel
 from typing_extensions import Self
 
 from ragbits.core.sources.base import Source
 from ragbits.core.utils.config_handling import ObjectConstructionConfig, WithConstructionConfig
 from ragbits.evaluate import dataloaders
+from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
 from ragbits.evaluate.pipelines.base import EvaluationDataT
 
 
@@ -28,14 +30,19 @@ class DataLoader(WithConstructionConfig, Generic[EvaluationDataT], ABC):
     default_module: ClassVar[ModuleType | None] = dataloaders
     configuration_key: ClassVar[str] = "dataloader"
 
-    def __init__(self, source: Source) -> None:
+    def __init__(self, source: Source, *, split: str = "data", required_keys: set[str] | None = None) -> None:
         """
         Initialize the data loader.
 
         Args:
             source: The source to load the evaluation data from.
+            split: The split to load the data from. Split is fixed for data loaders to "data",
+                but you can slice it using the [Hugging Face API](https://huggingface.co/docs/datasets/v1.11.0/splits.html#slicing-api).
+            required_keys: The required columns for the evaluation data.
         """
         self.source = source
+        self.split = split
+        self.required_keys = required_keys or set()
 
     @classmethod
     def from_config(cls, config: dict) -> Self:
@@ -52,11 +59,37 @@ def from_config(cls, config: dict) -> Self:
         config["source"] = Source.subclass_from_config(dataloader_config.source)
         return super().from_config(config)
 
-    @abstractmethod
     async def load(self) -> Iterable[EvaluationDataT]:
         """
         Load the data.
 
         Returns:
-            The loaded data.
+            The loaded evaluation data.
+
+        Raises:
+            DataLoaderIncorrectFormatDataError: If evaluation dataset is incorrectly formatted.
+        """
+        data_path = await self.source.fetch()
+        dataset = load_dataset(
+            path=str(data_path.parent),
+            data_files={"data": str(data_path.name)},
+            split=self.split,
+        )
+        if not self.required_keys.issubset(dataset.features):
+            raise DataLoaderIncorrectFormatDataError(
+                required_features=list(self.required_keys),
+                data_path=data_path,
+            )
+        return await self.map(dataset.to_list())
+
+    @abstractmethod
+    async def map(self, dataset: Iterable[dict]) -> Iterable[EvaluationDataT]:
+        """
+        Map the dataset to the evaluation data.
+
+        Args:
+            dataset: The dataset to map.
+
+        Returns:
+            The evaluation data.
         """

packages/ragbits-evaluate/src/ragbits/evaluate/dataloaders/document_search.py

Lines changed: 13 additions & 25 deletions
@@ -1,24 +1,22 @@
 from collections.abc import Iterable
 
-from datasets import load_dataset
-
 from ragbits.core.sources.base import Source
 from ragbits.evaluate.dataloaders.base import DataLoader
-from ragbits.evaluate.dataloaders.exceptions import DataLoaderIncorrectFormatDataError
 from ragbits.evaluate.pipelines.document_search import DocumentSearchData
 
 
 class DocumentSearchDataLoader(DataLoader[DocumentSearchData]):
     """
     Document search evaluation data loader.
 
-    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files)
-    and contain the following features: "question, "passages".
+    The source used for this data loader should point to a file that can be loaded by [Hugging Face](https://huggingface.co/docs/datasets/loading#local-and-remote-files).
     """
 
     def __init__(
         self,
         source: Source,
+        *,
+        split: str = "data",
         question_key: str = "question",
         document_ids_key: str = "document_ids",
         passages_key: str = "passages",
@@ -29,42 +27,32 @@ def __init__(
 
         Args:
             source: The source to load the data from.
+            split: The split to load the data from. Split is fixed for data loaders to "data",
+                but you can slice it using the [Hugging Face API](https://huggingface.co/docs/datasets/v1.11.0/splits.html#slicing-api).
             question_key: The dataset column name that contains the question.
             document_ids_key: The dataset column name that contains the document ids. Document ids are optional.
             passages_key: The dataset column name that contains the passages. Passages are optional.
             page_numbers_key: The dataset column name that contains the page numbers. Page numbers are optional.
         """
-        super().__init__(source)
+        super().__init__(source=source, split=split, required_keys={question_key})
         self.question_key = question_key
         self.document_ids_key = document_ids_key
         self.passages_key = passages_key
         self.page_numbers_key = page_numbers_key
 
-    async def load(self) -> Iterable[DocumentSearchData]:
+    async def map(self, dataset: Iterable[dict]) -> Iterable[DocumentSearchData]:
         """
-        Load the data from source and format them.
+        Map the dataset to the document search data schema.
 
-        Returns:
-            The document search evaluation data.
+        Args:
+            dataset: The dataset to map.
 
-        Raises:
-            DataLoaderIncorrectFormatDataError: If evaluation dataset is incorrectly formatted.
+        Returns:
+            The document search data.
         """
-        data_path = await self.source.fetch()
-        dataset = load_dataset(
-            path=str(data_path.parent),
-            split="train",
-            data_files={"train": str(data_path.name)},
-        )
-        if self.question_key not in dataset.features:
-            raise DataLoaderIncorrectFormatDataError(
-                required_features=[self.question_key],
-                data_path=data_path,
-            )
-
         return [
             DocumentSearchData(
-                question=data.get(self.question_key),
+                question=data.get(self.question_key, ""),
                 reference_document_ids=data.get(self.document_ids_key),
                 reference_passages=data.get(self.passages_key),
                 reference_page_numbers=data.get(self.page_numbers_key),
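
Combined, the changes let a caller pick columns and evaluate on a slice in one call. A hedged usage sketch (LocalFileSource and its module path are guesses at a concrete Source implementation, and the "query" column is invented; only DocumentSearchDataLoader's signature comes from this diff):

```python
import asyncio

# Hypothetical concrete Source; the actual class and module are not shown in this diff.
from ragbits.core.sources.local import LocalFileSource
from ragbits.evaluate.dataloaders.document_search import DocumentSearchDataLoader

loader = DocumentSearchDataLoader(
    source=LocalFileSource(path="eval/questions.json"),  # hypothetical
    split="data[:50]",      # evaluate on the first 50 rows only
    question_key="query",   # this dataset stores questions under "query"
)

# load() fetches the file, checks that "query" exists, and delegates to map().
eval_data = asyncio.run(loader.load())
```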

packages/ragbits-evaluate/tests/unit/test_evaluator.py

Lines changed: 4 additions & 1 deletion
@@ -1,6 +1,6 @@
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import cast
+from typing import Any, cast
 from unittest.mock import Mock
 
 import pytest
@@ -58,6 +58,9 @@ def __init__(self, dataset_size: int = 4) -> None:
         self.dataset_size = dataset_size
 
     async def load(self) -> Iterable[MockEvaluationData]:
+        return await self.map()
+
+    async def map(self, *args: Any, **kwargs: Any) -> Iterable[MockEvaluationData]:  # noqa: ANN401
         return [MockEvaluationData(input_data=i) for i in range(1, self.dataset_size + 1)]
 
     @classmethod

packages/ragbits-evaluate/tests/unit/test_optimizer.py

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,6 @@
 from collections.abc import Iterable
 from dataclasses import dataclass
+from typing import Any
 from unittest.mock import Mock
 
 import pytest
@@ -55,6 +56,9 @@ def __init__(self, dataset_size: int = 4) -> None:
         self.dataset_size = dataset_size
 
     async def load(self) -> Iterable[MockEvaluationData]:
+        return await self.map()
+
+    async def map(self, *args: Any, **kwargs: Any) -> Iterable[MockEvaluationData]:  # noqa: ANN401
         return [MockEvaluationData(input_data=i) for i in range(1, self.dataset_size + 1)]
 
     @classmethod
