The comment in the config file says:

```yaml
# xuance/configs/matd3/mpe/simple_push_v3.yaml
start_training: 1000  # start training after n episodes
```
However, taking the run loop in `off_policy_marl` as an example:
```python
for _ in tqdm(range(n_steps)):
    step_info = {}
    policy_out = self.action(obs_dict=obs_dict, avail_actions_dict=avail_actions, test_mode=False)
    actions_dict = policy_out['actions']
    next_obs_dict, rewards_dict, terminated_dict, truncated, info = self.envs.step(actions_dict)
    next_state = self.envs.buf_state.copy() if self.use_global_state else None
    next_avail_actions = self.envs.buf_avail_actions if self.use_actions_mask else None
    self.store_experience(obs_dict, avail_actions, actions_dict, next_obs_dict, next_avail_actions,
                          rewards_dict, terminated_dict, info,
                          **{'state': state, 'next_state': next_state})
+   if self.current_step >= self.start_training and self.current_step % self.training_frequency == 0:
        train_info = self.train_epochs(n_epochs=self.n_epochs)
        self.log_infos(train_info, self.current_step)
        return_info.update(train_info)
    obs_dict = deepcopy(next_obs_dict)
    if self.use_global_state:
        state = deepcopy(next_state)
    if self.use_actions_mask:
        avail_actions = deepcopy(next_avail_actions)
    for i in range(self.n_envs):
        if all(terminated_dict[i].values()) or truncated[i]:
            obs_dict[i] = info[i]["reset_obs"]
            self.envs.buf_obs[i] = info[i]["reset_obs"]
            if self.use_global_state:
                state = info[i]["reset_state"]
                self.envs.buf_state[i] = info[i]["reset_state"]
            if self.use_actions_mask:
                avail_actions[i] = info[i]["reset_avail_actions"]
                self.envs.buf_avail_actions[i] = info[i]["reset_avail_actions"]
            if self.use_wandb:
                step_info[f"Train-Results/Episode-Steps/rank_{self.rank}/env-%d" % i] = info[i]["episode_step"]
                step_info[f"Train-Results/Episode-Rewards/rank_{self.rank}/env-%d" % i] = info[i]["episode_score"]
            else:
                step_info[f"Train-Results/Episode-Steps/rank_{self.rank}"] = {
                    "env-%d" % i: info[i]["episode_step"]}
                step_info[f"Train-Results/Episode-Rewards/rank_{self.rank}"] = {
                    "env-%d" % i: np.mean(itemgetter(*self.agent_keys)(info[i]["episode_score"]))}
            self.log_infos(step_info, self.current_step)
            return_info.update(step_info)
+   self.current_step += self.n_envs
    self._update_explore_factor()
```
It is evident that `start_training` should be configured based on timesteps rather than episodes.
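To make the behaviour concrete, here is a minimal standalone sketch of that gate (the values `n_envs = 4` and the loop bound are made up for illustration, not taken from xuance): since `current_step` grows by `n_envs` every iteration, `start_training: 1000` triggers the first update after roughly 1000 environment timesteps, regardless of how many episodes that corresponds to.

```python
# Minimal sketch (hypothetical numbers) of the timestep-based gate above.
n_envs = 4
start_training = 1000       # effectively interpreted as timesteps
training_frequency = 1
current_step = 0

for iteration in range(1000):
    # ... one vectorized environment step would be collected here ...
    if current_step >= start_training and current_step % training_frequency == 0:
        print(f"first update at timestep {current_step} (iteration {iteration})")
        break
    current_step += n_envs  # mirrors `self.current_step += self.n_envs`
```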
—
When defining environments, there is a parameter `self.max_episode_steps`. If `start_training` is really meant to be an episode count, perhaps the threshold should instead be calculated as `self.envs[1].max_episode_steps * self.start_training`.
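If the episode-based interpretation is the intended one, the conversion could look roughly like the sketch below (the helper name and the 25-step episode limit for simple_push_v3 are my assumptions, not code from xuance):

```python
# Sketch only: translate an episode-based start_training value into a timestep
# threshold so the existing `current_step >= threshold` comparison keeps working.
def start_training_in_steps(start_training_episodes: int, max_episode_steps: int) -> int:
    return start_training_episodes * max_episode_steps

# e.g. with start_training: 1000 and an assumed 25-step episode limit
threshold = start_training_in_steps(1000, 25)
print(threshold)  # 25000 timesteps before the first training update
```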