Giving up on RL training; at some point this worked with rllib=2.2

micahcarroll · micahcarroll · commit 9f15bf00429a · 2025-03-22T11:35:16.000-07:00
diff --git a/src/human_aware_rl/imitation/behavior_cloning_tf2.py b/src/human_aware_rl/imitation/behavior_cloning_tf2.py
@@ -6,7 +6,6 @@
 import tensorflow as tf
 from ray.rllib.policy import Policy as RllibPolicy
 from tensorflow import keras
-from tensorflow.compat.v1.keras.backend import get_session
 
 from human_aware_rl.data_dir import DATA_DIR
 from human_aware_rl.human.process_dataframes import get_human_human_trajectories
@@ -176,6 +175,9 @@ def build_bc_model(use_lstm=True, eager=False, **kwargs):
 def train_bc_model(model_dir, bc_params, verbose=False):
     inputs, seq_lens, targets = load_data(bc_params, verbose)
 
+    # Ensure targets are int32 for SparseCategoricalCrossentropy
+    targets = tf.cast(targets, tf.int32)
+
     training_params = bc_params["training_params"]
 
     if training_params["use_class_weights"]:
@@ -220,7 +222,7 @@ def train_bc_model(model_dir, bc_params, verbose=False):
         ),
         # Save checkpoints of the models at the end of every epoch (saving only the best one so far)
         keras.callbacks.ModelCheckpoint(
-            filepath=os.path.join(model_dir, "checkpoints"),
+            filepath=os.path.join(model_dir, "checkpoints", "model.keras"),
             monitor="loss",
             save_best_only=True,
         ),
@@ -235,15 +237,25 @@ def train_bc_model(model_dir, bc_params, verbose=False):
 
     # Inputs unique to lstm model
     if bc_params["use_lstm"]:
-        inputs["seq_in"] = seq_lens
-        inputs["hidden_in"] = np.zeros((N, bc_params["cell_size"]))
-        inputs["memory_in"] = np.zeros((N, bc_params["cell_size"]))
+        inputs["seq_in"] = tf.cast(seq_lens, tf.int32)
+        inputs["hidden_in"] = tf.zeros((N, bc_params["cell_size"]), dtype=tf.float32)
+        inputs["memory_in"] = tf.zeros((N, bc_params["cell_size"]), dtype=tf.float32)
 
     # Batch size doesn't include time dimension (seq_len) so it should be smaller for rnn model
     batch_size = 1 if bc_params["use_lstm"] else training_params["batch_size"]
+    model_inputs = (
+        inputs
+        if not bc_params["use_lstm"]
+        else {
+            "Overcooked_observation": inputs,
+            "seq_in": seq_lens,
+            "hidden_in": np.zeros((N, bc_params["cell_size"])),
+            "memory_in": np.zeros((N, bc_params["cell_size"])),
+        }
+    )
     model.fit(
-        inputs,
-        targets,
+        model_inputs,
+        targets["logits"],
         callbacks=callbacks,
         batch_size=batch_size,
         epochs=training_params["epochs"],
@@ -260,18 +272,20 @@ def train_bc_model(model_dir, bc_params, verbose=False):
 
 def save_bc_model(model_dir, model, bc_params, verbose=False):
     """
-    Saves the specified model under the directory model_dir. This creates three items
-
-        assets/         stores information essential to reconstructing the context and tf graph
-        variables/      stores the model's trainable weights
-        saved_model.pd  the saved state of the model object
+    Saves the specified model under the directory model_dir. This creates a .keras file
+    containing the model's architecture, weights, and optimizer state.
 
     Additionally, saves a pickled dictionary containing all the parameters used to construct this model
     at model_dir/metadata.pickle
     """
     if verbose:
         print("Saving bc model at ", model_dir)
-    model.save(model_dir, save_format="tf")
+
+    # Save model with .keras extension
+    model_path = os.path.join(model_dir, "model.keras")
+    model.save(model_path)
+
+    # Save metadata
     with open(os.path.join(model_dir, "metadata.pickle"), "wb") as f:
         pickle.dump(bc_params, f)
 
@@ -283,7 +297,12 @@ def load_bc_model(model_dir, verbose=False):
     """
     if verbose:
         print("Loading bc model from ", model_dir)
-    model = keras.models.load_model(model_dir, custom_objects={"tf": tf})
+
+    # Load model from .keras file
+    model_path = os.path.join(model_dir, "model.keras")
+    model = keras.models.load_model(model_path, custom_objects={"tf": tf})
+
+    # Load metadata
     with open(os.path.join(model_dir, "metadata.pickle"), "rb") as f:
         bc_params = pickle.load(f)
     return model, bc_params
@@ -406,40 +425,6 @@ def _build_lstm_model(
 ################
 
 
-class NullContextManager:
-    """
-    No-op context manager that does nothing
-    """
-
-    def __init__(self):
-        pass
-
-    def __enter__(self):
-        pass
-
-    def __exit__(self, *args):
-        pass
-
-
-class TfContextManager:
-    """
-    Properly sets the execution graph and session of the keras backend given a "session" object as input
-
-    Used for isolating tf execution in graph mode. Do not use with eager models or with eager mode on
-    """
-
-    def __init__(self, session):
-        self.session = session
-
-    def __enter__(self):
-        self.ctx = self.session.graph.as_default()
-        self.ctx.__enter__()
-        set_session(self.session)
-
-    def __exit__(self, *args):
-        self.ctx.__exit__(*args)
-
-
 class BehaviorCloningPolicy(RllibPolicy):
     def __init__(self, observation_space, action_space, config):
         """
@@ -470,8 +455,6 @@ def __init__(self, observation_space, action_space, config):
             )
             model, bc_params = load_bc_model(config["model_dir"])
 
-        # Save the session that the model was loaded into so it is available at inference time if necessary
-        self._sess = get_session()
         self._setup_shapes()
 
         # Basic check to make sure model dimensions match
@@ -482,8 +465,6 @@ def __init__(self, observation_space, action_space, config):
         self.stochastic = config["stochastic"]
         self.use_lstm = bc_params["use_lstm"]
         self.cell_size = bc_params["cell_size"]
-        self.eager = config["eager"] if "eager" in config else bc_params["eager"]
-        self.context = self._create_execution_context()
 
     def _setup_shapes(self):
         # This is here to make the class compatible with both tuples or gymnasium.Space objs for the spaces
@@ -542,11 +523,8 @@ def compute_actions(
         # Cast to np.array if list (no-op if already np.array)
         obs_batch = np.array(obs_batch)
 
-        # Run the model
-        with self.context:
-            action_logits, states = self._forward(obs_batch, state_batches)
+        action_logits, states = self._forward(obs_batch, state_batches)
 
-        # Softmax in numpy to convert logits to probabilities
         action_probs = softmax(action_logits)
         if self.stochastic:
             # Sample according to action_probs for each row in the output
@@ -611,16 +589,6 @@ def _forward(self, obs_batch, state_batches):
         else:
             return self.model.predict(obs_batch, verbose=0), []
 
-    def _create_execution_context(self):
-        """
-        Creates a private execution context for the model
-
-        Necessary if using with rllib in order to isolate this policy model from others
-        """
-        if self.eager:
-            return NullContextManager()
-        return TfContextManager(self._sess)
-
 
 if __name__ == "__main__":
     params = get_bc_params()
diff --git a/src/human_aware_rl/imitation/behavior_cloning_tf2_test.py b/src/human_aware_rl/imitation/behavior_cloning_tf2_test.py
@@ -1,8 +1,10 @@
 import argparse
+import gc
 import os
 import pickle
 import shutil
 import sys
+import time
 import unittest
 import warnings
 
@@ -32,7 +34,6 @@ def _clear_pickle():
 
 
 class TestBCTraining(unittest.TestCase):
-
     """
     Unittests for behavior cloning training and utilities
 
@@ -48,9 +49,9 @@ def __init__(self, test_name):
         self.compute_pickle = False
         self.strict = False
         self.min_performance = 0
-        assert not (
-            self.compute_pickle and self.strict
-        ), "Cannot compute pickle and run strict reproducibility tests at same time"
+        assert not (self.compute_pickle and self.strict), (
+            "Cannot compute pickle and run strict reproducibility tests at same time"
+        )
         if self.compute_pickle:
             _clear_pickle()
 
@@ -94,7 +95,17 @@ def tearDown(self):
             with open(BC_EXPECTED_DATA_PATH, "wb") as f:
                 pickle.dump(self.expected, f)
 
-        shutil.rmtree(self.model_dir)
+        # Force garbage collection to close any open files
+        gc.collect()
+
+        # Add a small delay to ensure files are released
+        time.sleep(0.1)
+
+        try:
+            # Use ignore_errors=True to force removal even if some files are still locked
+            shutil.rmtree(self.model_dir, ignore_errors=True)
+        except Exception as e:
+            print(f"Warning: Could not fully remove directory {self.model_dir}: {e}")
 
     def test_model_construction(self):
         model = build_bc_model(**self.bc_params)
@@ -115,9 +126,7 @@ def test_save_and_load(self):
         loaded_model, loaded_params = load_bc_model(self.model_dir)
         self.assertDictEqual(self.bc_params, loaded_params)
         self.assertTrue(
-            np.allclose(
-                model(self.dummy_input), loaded_model(self.dummy_input)
-            )
+            np.allclose(model(self.dummy_input), loaded_model(self.dummy_input))
         )
 
     def test_training(self):
@@ -127,9 +136,7 @@ def test_training(self):
             self.expected["test_training"] = model(self.dummy_input)
         if self.strict:
             self.assertTrue(
-                np.allclose(
-                    model(self.dummy_input), self.expected["test_training"]
-                )
+                np.allclose(model(self.dummy_input), self.expected["test_training"])
             )
 
     def test_agent_evaluation(self):
@@ -143,9 +150,7 @@ def test_agent_evaluation(self):
         if self.compute_pickle:
             self.expected["test_agent_evaluation"] = results
         if self.strict:
-            self.assertAlmostEqual(
-                results, self.expected["test_agent_evaluation"]
-            )
+            self.assertAlmostEqual(results, self.expected["test_agent_evaluation"])
 
 
 class TestBCTrainingLSTM(TestBCTraining):
@@ -190,9 +195,7 @@ def test_lstm_evaluation(self):
         if self.compute_pickle:
             self.expected["test_lstm_evaluation"] = results
         if self.strict:
-            self.assertAlmostEqual(
-                results, self.expected["test_lstm_evaluation"]
-            )
+            self.assertAlmostEqual(results, self.expected["test_lstm_evaluation"])
 
     def test_lstm_save_and_load(self):
         self.bc_params["use_lstm"] = True