[data] disable on_exit hook (#53249)

raulchen · lk-chen · web-flow · commit 2c1b283d5de6 · 2025-05-22T23:05:45.000Z
## Why are these changes needed? * The on_exit hook was introduced to allow users to perform cleanup. * However, it triggers a race condition in fault tolerance: after on_exit is called and the UDF is deleted, but before the actor actually exits, another retry task may be submitted to the actor, and that task will fail. * This PR disables the on_exit hook by default. Eventually this should be fixed in Ray Core (#53169). --------- Signed-off-by: Hao Chen <chenh1024@gmail.com> Signed-off-by: lkchen <github@lkchen.net> Co-authored-by: lkchen <github@lkchen.net>
diff --git a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
@@ -127,7 +127,10 @@ def __init__(
             gpu=self._ray_remote_args.get("num_gpus", 0),
         )
         self._actor_pool = _ActorPool(
-            compute_strategy, self._start_actor, per_actor_resource_usage
+            compute_strategy,
+            self._start_actor,
+            per_actor_resource_usage,
+            self.data_context._enable_actor_pool_on_exit_hook,
         )
         # A queue of bundles awaiting dispatch to actors.
         self._bundle_queue = create_bundle_queue()
@@ -507,6 +510,7 @@ def __init__(
         compute_strategy: ActorPoolStrategy,
         create_actor_fn: Callable[[], Tuple[ActorHandle, ObjectRef[Any]]],
         per_actor_resource_usage: ExecutionResources,
+        _enable_actor_pool_on_exit_hook: bool = False,
     ):
         self._min_size: int = compute_strategy.min_size
         self._max_size: int = compute_strategy.max_size
@@ -530,6 +534,7 @@ def __init__(
         # Track locality matching stats.
         self._locality_hits: int = 0
         self._locality_misses: int = 0
+        self._enable_actor_pool_on_exit_hook = _enable_actor_pool_on_exit_hook
 
     # === Overriding methods of AutoscalingActorPool ===
 
@@ -856,7 +861,9 @@ def _release_running_actors(self, force: bool):
 
         # First release actors and collect their shutdown hook object-refs
         for actor in running:
-            on_exit_refs.append(self._release_running_actor(actor))
+            ref = self._release_running_actor(actor)
+            if ref:
+                on_exit_refs.append(ref)
 
         # Wait for all actors to shutdown gracefully before killing them
         ray.wait(on_exit_refs, timeout=self._ACTOR_POOL_GRACEFUL_SHUTDOWN_TIMEOUT_S)
@@ -882,9 +889,12 @@ def _release_running_actor(
         if actor not in self._running_actors:
             return None
 
-        # Call `on_exit` to trigger `UDF.__del__` which may perform
-        # cleanup operations.
-        ref = actor.on_exit.remote()
+        if self._enable_actor_pool_on_exit_hook:
+            # Call `on_exit` to trigger `UDF.__del__` which may perform
+            # cleanup operations.
+            ref = actor.on_exit.remote()
+        else:
+            ref = None
         del self._running_actors[actor]
 
         return ref
diff --git a/python/ray/data/context.py b/python/ray/data/context.py
@@ -438,6 +438,12 @@ class DataContext:
     override_object_store_memory_limit_fraction: float = None
     memory_usage_poll_interval_s: Optional[float] = 1
     dataset_logger_id: Optional[str] = None
+    # Temporary opt-in flag that lets actors run the `on_exit` cleanup hook
+    # until https://github.com/ray-project/ray/issues/53169 is fixed.
+    # It is disabled by default because the hook has a known race condition
+    # in fault tolerance: after the hook is triggered and the UDF is deleted,
+    # a retry task may still be scheduled to this actor, and it will fail.
+    _enable_actor_pool_on_exit_hook: bool = False
 
     def __post_init__(self):
         # The additonal ray remote args that should be added to
diff --git a/python/ray/data/tests/test_map.py b/python/ray/data/tests/test_map.py
@@ -1648,8 +1648,11 @@ def generate_data(n_per_block: int, n_blocks: int):
     assert set(ds.to_pandas()["item"].to_list()) == set(expected.tolist())
 
 
-def test_actor_udf_cleanup(ray_start_regular_shared, tmp_path):
+def test_actor_udf_cleanup(ray_start_regular_shared, tmp_path, restore_data_context):
     """Test that for the actor map operator, the UDF object is deleted properly."""
+    ctx = DataContext.get_current()
+    ctx._enable_actor_pool_on_exit_hook = True
+
     test_file = tmp_path / "test.txt"
 
     # Simulate the case that the UDF depends on some external resources that
diff --git a/python/ray/llm/_internal/batch/processor/base.py b/python/ray/llm/_internal/batch/processor/base.py
@@ -135,8 +135,12 @@ def __init__(
         self.stages: OrderedDict[str, StatefulStage] = OrderedDict()
 
         # FIXES: https://github.com/ray-project/ray/issues/53124
-        # TODO (Kourosh): Remove this once the issue is fixed.
-        ray.data.DataContext.get_current().wait_for_min_actors_s = 600
+        # TODO (Kourosh): Remove this once the issue is fixed
+        data_context = ray.data.DataContext.get_current()
+        data_context.wait_for_min_actors_s = 600
+        # TODO: Remove this when https://github.com/ray-project/ray/issues/53169
+        # is fixed.
+        data_context._enable_actor_pool_on_exit_hook = True
 
         # NOTE (Kourosh): If pre/postprocess is not provided, use the identity function.
         # Wrapping is required even if they are identity functions, b/c data_column