[Bug]: model.policy.evaluate_actions() does not work in RecurrentPPO #270
Labels: bug (Something isn't working), check the checklist (You have checked the required items in the checklist but you didn't do what is written...), more information needed (Please fill the issue template completely)
Comments
# Importing libraries
from collections import namedtuple
from sb3_contrib import RecurrentPPO
import numpy as np
import torch as th

# !!! Set to True to SHOW ERROR !!!
SHOW_ERROR = False  # True, False

# Initialize environment and model
model = RecurrentPPO("MlpLstmPolicy", "CartPole-v1").learn(200)
env = model.get_env()

# Reset environment
observation_lst = env.reset()
observation_lst = observation_lst[0]
state = None
done = "True"
reward_sum = 0

# Run simulation for 100 steps
for step in range(0, 100, 1):
    # Predict actions and update LSTM state
    actions, state = model.predict(
        observation=np.array(object=observation_lst, dtype=np.float32),
        state=state,
        episode_start=np.array(object=[done == "True"], dtype=np.bool_),
        deterministic=True,
    )
    actions = actions.item()

    # Prepare tensors for policy evaluation
    obs = th.tensor(data=np.array(object=[observation_lst], dtype=np.float32), dtype=th.float32, device=model.policy.device)
    act = th.tensor(data=np.array(object=[actions], dtype=np.int64), dtype=th.int64, device=model.policy.device)
    net_arch = namedtuple("net_arch", ["pi", "vf"])
    pi = th.tensor(data=state[0], dtype=th.float32, device=model.policy.device)
    vf = th.tensor(data=state[1], dtype=th.float32, device=model.policy.device)
    lstm_states = net_arch(pi=pi, vf=vf)
    episode_starts = th.tensor(data=np.array(object=[done == "True"], dtype=np.bool_), dtype=th.int64, device=model.policy.device)

    # Evaluate policy outputs
    action_probability_distribution = model.policy.get_distribution(obs=obs, lstm_states=lstm_states, episode_starts=episode_starts)
    state_value_estimates_a = model.policy.predict_values(obs=obs, lstm_states=lstm_states, episode_starts=episode_starts)
    if SHOW_ERROR == True:
        state_value_estimates_b, log_probabilities_of_actions, action_distribution_entropy = model.policy.evaluate_actions(obs=obs, actions=act, lstm_states=lstm_states, episode_starts=episode_starts)

    # Extract key metrics
    _action_probability_distribution_0 = action_probability_distribution[0].distribution.probs.detach().cpu().numpy()[0][0]
    _action_probability_distribution_1 = action_probability_distribution[0].distribution.probs.detach().cpu().numpy()[0][1]
    _state_value_estimates_a = state_value_estimates_a.detach().cpu().numpy()[0][0]
    if SHOW_ERROR == True:
        _state_value_estimates_b = state_value_estimates_b.detach().cpu().numpy()[0][0]
        _log_probabilities_of_actions = log_probabilities_of_actions.detach().cpu().numpy()[0]
        _action_distribution_entropy = action_distribution_entropy.detach().cpu().numpy()[0]

    # Print information for current step
    print("----------------------------------------------------------------------------------------------------------")
    print(f"step: {step} | actions: {actions} | done: {done}")
    print("----------------------------------")
    print("ACTION_PROBABILITY_DISTRIBUTION :", action_probability_distribution[0].distribution.probs)
    print("ACTION_PROBABILITY_DISTRIBUTION_0:", _action_probability_distribution_0)
    print("ACTION_PROBABILITY_DISTRIBUTION_1:", _action_probability_distribution_1)
    print("----------------------------------")
    print("STATE_VALUE_ESTIMATES_A :", state_value_estimates_a)
    print("STATE_VALUE_ESTIMATES_A :", _state_value_estimates_a)
    if SHOW_ERROR == True:
        print("----------------------------------")
        print("STATE_VALUE_ESTIMATES_B :", state_value_estimates_b)
        print("STATE_VALUE_ESTIMATES_B :", _state_value_estimates_b)
        print("LOG_PROBABILITIES_OF_ACTIONS :", log_probabilities_of_actions)
        print("LOG_PROBABILITIES_OF_ACTIONS :", _log_probabilities_of_actions)
        print("ACTION_DISTRIBUTION_ENTROPY :", action_distribution_entropy)
        print("ACTION_DISTRIBUTION_ENTROPY :", _action_distribution_entropy)
    print("----------------------------------------------------------------------------------------------------------")

    # Perform action in environment and retrieve results
    observation_lst, reward, done, info = env.step(actions=np.array(object=[actions], dtype=np.int64))
    observation_lst = observation_lst[0]
    reward_sum += reward[0]
    done = str(done[0])

# Print final step and total reward
print(f"final step: {step} | total reward: {reward_sum}")
# Traceback (most recent call last):
# File "/home/drulye/vscode/project/error_rppo_evaluate_actions.py", line 48, in <module>
# state_value_estimates_b, log_probabilities_of_actions, action_distribution_entropy = model.policy.evaluate_actions(obs=obs, actions=act, lstm_states=lstm_states, episode_starts=episode_starts)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# File "/home/drulye/anaconda3/lib/python3.12/site-packages/sb3_contrib/common/recurrent/policies.py", line 331, in evaluate_actions
# latent_pi, _ = self._process_sequence(pi_features, lstm_states.pi, episode_starts, self.lstm_actor)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# File "/home/drulye/anaconda3/lib/python3.12/site-packages/sb3_contrib/common/recurrent/policies.py", line 186, in _process_sequence
# features_sequence = features.reshape((n_seq, -1, lstm.input_size)).swapaxes(0, 1)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# RuntimeError: shape '[256, -1, 4]' is invalid for input of size 4
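A note on the error (my own reading of the traceback, not something stated in the thread): state returned by model.predict() appears to be the actor LSTM's (hidden, cell) pair, each shaped (n_lstm_layers, n_envs, lstm_hidden_size), e.g. (1, 1, 256). get_distribution() and predict_values() accept such a pair directly, so the net_arch(pi=hidden, vf=cell) namedtuple happens to unpack correctly for them. evaluate_actions(), however, reads lstm_states.pi and expects that field to be a (hidden, cell) pair itself; with pi bound to the hidden tensor alone, _process_sequence appears to take lstm_states.pi[0].shape[1] (the hidden size, 256) as the number of sequences and then fails to reshape the single 4-feature observation into [256, -1, 4]. The shape check below, which reuses the variable names from the script above, makes the mismatch visible.

# Shape check sketch (illustrative; reuses names from the reproduction script above)
print("state[0].shape:", np.asarray(state[0]).shape)  # e.g. (1, 1, 256): (n_lstm_layers, n_envs, hidden_size)
print("state[1].shape:", np.asarray(state[1]).shape)  # same shape as the hidden state
print("obs.shape:", tuple(obs.shape))                 # (1, 4) for CartPole-v1
# evaluate_actions() derives n_seq from lstm_states.pi[0].shape[1], so passing the hidden
# tensor alone as .pi yields n_seq = 256, far too many sequences for a 1 x 4 feature tensor.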
Hello, see stable-baselines3-contrib/sb3_contrib/ppo_recurrent/ppo_recurrent.py, lines 345 to 350 in c070fc2.
So I would recommend setting a debugger, analyzing the shape of the tensors, and comparing them to your script.
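For anyone hitting the same error, here is a minimal sketch of how the call could be assembled so that evaluate_actions() receives a (hidden, cell) pair per network. It assumes the RNNStates namedtuple from sb3_contrib.common.recurrent.type_aliases and reuses the actor states for the critic purely as an illustration (the critic normally carries its own LSTM state); verify the field layout against your installed sb3_contrib version.

# Hedged sketch: build lstm_states with a (hidden, cell) pair per network.
# Assumption: RNNStates from sb3_contrib.common.recurrent.type_aliases has .pi and .vf fields.
from sb3_contrib.common.recurrent.type_aliases import RNNStates

hidden = th.tensor(data=state[0], dtype=th.float32, device=model.policy.device)
cell = th.tensor(data=state[1], dtype=th.float32, device=model.policy.device)
# Reusing the actor states for the critic is only for illustration here.
lstm_states = RNNStates(pi=(hidden, cell), vf=(hidden, cell))
values, log_prob, entropy = model.policy.evaluate_actions(
    obs=obs, actions=act, lstm_states=lstm_states, episode_starts=episode_starts
)

This is presumably also why the maintainer points to the training loop in ppo_recurrent.py, where evaluate_actions is called with the states stored in the rollout buffer rather than with the tuple returned by predict.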
🐛 Bug
in "RecurrentPPO" not work "model.policy.evaluate_actions()"