# check_env.py
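"""Sanity check: roll out FlexibleArmEnv and compare, step by step, the
actions of an MPC expert with those of a behavior-cloned policy."""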
from copy import deepcopy

import numpy as np
from stable_baselines3.ppo import MlpPolicy

from envs.flexible_arm_3dof import FlexibleArm3DOF, SymbolicFlexibleArm3DOF
from envs.flexible_arm_env import FlexibleArmEnv
from mpc_3dof import Mpc3dofOptions, Mpc3Dof
from utils.gym_utils import CallableMPCExpert
from utils.utils import ControlMode

control_mode = ControlMode.SET_POINT
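# n_seg presumably sets how many virtual elastic segments discretize each flexible link.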
n_seg = 3
fa_ld = FlexibleArm3DOF(n_seg)
fa_sym_ld = SymbolicFlexibleArm3DOF(n_seg)

# Create the initial state of the simulated system
angle0_0 = 0.0
angle0_1 = 0.0
angle0_2 = np.pi / 20
q0 = np.zeros((fa_ld.nq, 1))
q0[0] += angle0_0
q0[1] += angle0_1
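# 1 + n_seg + 1 presumably skips the n_seg passive (elastic) joints of the
# preceding flexible link to reach the next actuated joint.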
q0[1 + n_seg + 1] += angle0_2
# Compute reference
q_ref = deepcopy(q0)
q_ref[0] += np.pi / 2
q_ref[1] += 0
q_ref[1 + n_seg + 1] += -np.pi / 20
dq0 = np.zeros_like(q0)
x0 = np.vstack((q0, dq0))
dq_ref = np.zeros_like(q_ref)
x_ref = np.vstack((q_ref, dq_ref))
_, x_ee_ref = fa_ld.fk_ee(q_ref)
_, xee0 = fa_ld.fk_ee(q0)

# Create the expert MPC controller
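# n presumably sets the number of shooting nodes and tf the horizon length in
# seconds; scaling r_diag up weights control effort more heavily.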
mpc_options = Mpc3dofOptions(n_seg=n_seg)
mpc_options.n = 30
mpc_options.tf = 1
mpc_options.r_diag *= 10
controller = Mpc3Dof(model=fa_sym_ld, x0=x0, x0_ee=xee0, options=mpc_options)
u_ref = np.zeros(
    (fa_sym_ld.nu, 1)
)  # u_ref could be changed to some known value along the trajectory

# Choose one of the two control modes: set-point regulation (used here) or
# reference tracking, which uses a spline planner.
controller.set_reference_point(p_ee_ref=x_ee_ref, x_ref=x_ref, u_ref=u_ref)
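
# The env presumably integrates the arm dynamics at dt = 0.05 s per step and
# uses xee_final as the end-effector goal.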
env = FlexibleArmEnv(n_seg=n_seg, dt=0.05, q0=q0[:, 0], xee_final=x_ee_ref)

# Wrap the MPC controller as an expert policy
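# CallableMPCExpert presumably exposes the controller through a policy-like
# interface (predict_np below) that matches the env's spaces.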
expert = CallableMPCExpert(
    controller, observation_space=env.observation_space, action_space=env.action_space
)

# Load the trained behavior-cloning policy; lr_schedule is a dummy since the
# policy is only used for inference here.
learned_policy = MlpPolicy(
    observation_space=env.observation_space,
    action_space=env.action_space,
    lr_schedule=lambda x: 1,
)
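# load() is a classmethod on SB3 policies: it returns a new instance read from
# disk, replacing the one constructed above.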
learned_policy = learned_policy.load("bc_policy_1")

# Run tests
n_eps = 3
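# use_trained selects whether the BC policy (True) or the MPC expert (False)
# drives the environment.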
use_trained = True
for i in range(n_eps):
    sum_reward = 0
    print("Episode {}".format(i))
    done = False
    obs = env.reset()
    while not done:
        action_exp = expert.predict_np(obs)
        action_bc = learned_policy.predict(obs, deterministic=True)[0]
        print("Action exp: {}, Action bc: {}".format(action_exp, action_bc))
        if use_trained:
            action = action_bc
        else:
            action = action_exp
        obs, reward, done, info = env.step(action)
        sum_reward += reward
        if done:
            print("N steps: {}".format(env.no_intg_steps))
            print("Sum of rewards: {}".format(sum_reward))