Skip to content

Commit ed9de76

Browse files
fineguy
authored and
The TensorFlow Datasets Authors
committed
Fix feature names in Croissant builder.
PiperOrigin-RevId: 693628650
1 parent 3e5515f commit ed9de76

File tree

3 files changed

+43
-23
lines changed

3 files changed

+43
-23
lines changed

tensorflow_datasets/core/dataset_builders/croissant_builder.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@
3636

3737
from __future__ import annotations
3838

39-
from collections.abc import Mapping
40-
from typing import Any, Dict, Optional, Sequence
39+
from collections.abc import Mapping, Sequence
40+
from typing import Any
4141

4242
from etils import epath
4343
import numpy as np
@@ -61,10 +61,23 @@
6161
from tensorflow_datasets.core.utils.lazy_imports_utils import pandas as pd
6262

6363

64+
_RecordOrFeature = Mapping[str, Any]
65+
66+
67+
def _strip_record_set_prefix(
68+
record_or_feature: _RecordOrFeature, record_set_id: str
69+
) -> _RecordOrFeature:
70+
"""Removes the record set prefix from the field ids of a record or feature."""
71+
return {
72+
field_id.removeprefix(f'{record_set_id}/'): value
73+
for field_id, value in record_or_feature.items()
74+
}
75+
76+
6477
def datatype_converter(
6578
field: mlc.Field,
66-
int_dtype: Optional[type_utils.TfdsDType] = np.int64,
67-
float_dtype: Optional[type_utils.TfdsDType] = np.float32,
79+
int_dtype: type_utils.TfdsDType = np.int64,
80+
float_dtype: type_utils.TfdsDType = np.float32,
6881
):
6982
"""Converts a Croissant field to a TFDS-compatible feature.
7083
@@ -162,8 +175,8 @@ def __init__(
162175
jsonld: epath.PathLike | Mapping[str, Any],
163176
record_set_ids: Sequence[str] | None = None,
164177
disable_shuffling: bool | None = False,
165-
int_dtype: type_utils.TfdsDType | None = np.int64,
166-
float_dtype: type_utils.TfdsDType | None = np.float32,
178+
int_dtype: type_utils.TfdsDType = np.int64,
179+
float_dtype: type_utils.TfdsDType = np.float32,
167180
mapping: Mapping[str, epath.PathLike] | None = None,
168181
overwrite_version: version_lib.VersionOrStr | None = None,
169182
filters: Mapping[str, Any] | None = None,
@@ -214,7 +227,7 @@ def __init__(
214227
conversion_utils.to_tfds_name(record_set_id)
215228
for record_set_id in record_set_ids
216229
]
217-
self.BUILDER_CONFIGS: Sequence[dataset_builder.BuilderConfig] = [ # pylint: disable=invalid-name
230+
self.BUILDER_CONFIGS: list[dataset_builder.BuilderConfig] = [ # pylint: disable=invalid-name
218231
dataset_builder.BuilderConfig(name=config_name)
219232
for config_name in config_names
220233
]
@@ -261,13 +274,14 @@ def get_features(self) -> features_dict.FeaturesDict:
261274
if field.repeated:
262275
feature = sequence_feature.Sequence(feature)
263276
features[field.id] = feature
277+
features = _strip_record_set_prefix(features, record_set.id)
264278
return features_dict.FeaturesDict(features)
265279

266280
def _split_generators(
267281
self,
268282
dl_manager: download.DownloadManager,
269283
pipeline: beam.Pipeline,
270-
) -> Dict[splits_lib.Split, split_builder_lib.SplitGenerator]:
284+
) -> dict[splits_lib.Split, split_builder_lib.SplitGenerator]:
271285
# If a split recordset is joined for the required record set, we generate
272286
# splits accordingly. Otherwise, it generates a single `default` split with
273287
# all the records.
@@ -317,11 +331,15 @@ def _generate_examples(
317331

318332
def convert_to_tfds_format(
319333
global_index: int,
320-
record: Any,
334+
record: _RecordOrFeature,
321335
features: feature_lib.FeatureConnector | None = None,
322-
) -> tuple[int, Any]:
336+
record_set_id: str | None = None,
337+
) -> tuple[int, _RecordOrFeature]:
323338
if not features:
324339
raise ValueError('features should not be None.')
340+
if not record_set_id:
341+
raise ValueError('record_set_id should not be None.')
342+
record = _strip_record_set_prefix(record, record_set_id)
325343
return (
326344
global_index,
327345
conversion_utils.to_tfds_value(record, features),
@@ -330,5 +348,7 @@ def convert_to_tfds_format(
330348
return records.beam_reader(
331349
pipeline=pipeline
332350
) | 'Convert to TFDS format' >> beam.MapTuple(
333-
convert_to_tfds_format, features=self.info.features
351+
convert_to_tfds_format,
352+
features=self.info.features,
353+
record_set_id=record_set.id,
334354
)

tensorflow_datasets/core/dataset_builders/croissant_builder_test.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,8 @@
9191
def test_simple_datatype_converter(field, feature_type, int_dtype, float_dtype):
9292
actual_feature = croissant_builder.datatype_converter(
9393
field,
94-
int_dtype=int_dtype if int_dtype else np.int64,
95-
float_dtype=float_dtype if float_dtype else np.float32,
94+
int_dtype=int_dtype or np.int64,
95+
float_dtype=float_dtype or np.float32,
9696
)
9797
assert actual_feature == feature_type
9898

@@ -221,6 +221,6 @@ def test_download_and_prepare(crs_builder, expected_entries, split_name):
221221
crs_builder.download_and_prepare()
222222
data_source = crs_builder.as_data_source(split=split_name)
223223
assert len(data_source) == 2
224-
for i in range(2):
225-
assert data_source[i]["jsonl/index"] == expected_entries[i]["index"]
226-
assert data_source[i]["jsonl/text"].decode() == expected_entries[i]["text"]
224+
for entry, expected_entry in zip(data_source, expected_entries):
225+
assert entry["index"] == expected_entry["index"]
226+
assert entry["text"].decode() == expected_entry["text"]

tensorflow_datasets/core/features/features_dict.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717

1818
from __future__ import annotations
1919

20+
from collections.abc import Mapping
2021
import concurrent.futures
21-
from typing import Dict, List, Union
2222

2323
from tensorflow_datasets.core import utils
2424
from tensorflow_datasets.core.features import feature as feature_lib
@@ -33,7 +33,7 @@
3333
WORKER_COUNT = 16
3434

3535

36-
class _DictGetCounter(object):
36+
class _DictGetCounter:
3737
"""Wraps dict.get and counts successful key accesses."""
3838

3939
def __init__(self, d):
@@ -114,15 +114,15 @@ class FeaturesDict(top_level_feature.TopLevelFeature):
114114

115115
def __init__(
116116
self,
117-
feature_dict: Dict[str, feature_lib.FeatureConnectorArg],
117+
feature_dict: Mapping[str, feature_lib.FeatureConnectorArg],
118118
*,
119119
doc: feature_lib.DocArg = None,
120120
):
121121
"""Initialize the features.
122122
123123
Args:
124-
feature_dict (dict): Dictionary containing the feature connectors of a
125-
example. The keys should correspond to the data dict as returned by
124+
feature_dict: Mapping containing the feature connectors of a example. The
125+
keys should correspond to the data dict as returned by
126126
tf.data.Dataset(). Types (np.int32,...) and dicts will automatically be
127127
converted into FeatureConnector.
128128
doc: Documentation of this feature (e.g. description).
@@ -173,7 +173,7 @@ def __repr__(self):
173173

174174
def catalog_documentation(
175175
self,
176-
) -> List[feature_lib.CatalogFeatureDocumentation]:
176+
) -> list[feature_lib.CatalogFeatureDocumentation]:
177177
feature_docs = [
178178
feature_lib.CatalogFeatureDocumentation(
179179
name='',
@@ -210,7 +210,7 @@ def get_serialized_info(self):
210210

211211
@classmethod
212212
def from_json_content(
213-
cls, value: Union[Json, feature_pb2.FeaturesDict]
213+
cls, value: Json | feature_pb2.FeaturesDict
214214
) -> 'FeaturesDict':
215215
if isinstance(value, dict):
216216
features = {

0 commit comments

Comments (0)