ApproximateRL.py
from Environment import *
import random
import numpy as np


class ApproxRL():  # base class for learning with a linear function approximation over the state space
    def __init__(self, config, initializer, step, lamda, featurizer):
        self.e = config.e
        self.alpha = config.alpha
        self.m = len(config.actions)
        self.step = step  # simulator generating the next state and reward for a given state-action pair
        self.init = initializer  # initializes the state at the start of an episode
        self.featurizer = featurizer
        S = self.init()
        s = self.featurizer(S).shape
        self.W = np.array([np.zeros(s)] * self.m)  # Q(s, a) = sum(phi(s) * W[a])
        self.E = np.zeros_like(self.W)  # eligibility trace
        self.lamda = lamda
        self.trained = False  # set to True once Apply_SARSA has run
    def max_action_value(self, s):  # returns argmax_a Q(s, a) and max_a Q(s, a)
        q = np.sum(self.featurizer(s) * self.W, (1, 2))
        a_max = np.argmax(q)
        v_max = np.max(q)
        return a_max, v_max

    def Qvalue(self, s, a):  # returns Q(s, a)
        w = self.W[a, :, :]
        return np.sum(w * self.featurizer(s))

    def next_action(self, s):  # returns an action drawn from the e-greedy policy
        probs = [self.e / self.m] * self.m
        a_max, _ = self.max_action_value(s)
        probs[a_max] += 1 - self.e
        assert sum(probs) > 0.999, "probabilities don't sum to 1"
        return np.argmax(np.random.multinomial(1, probs))

    def getVtable(self):  # returns a 10 x 21 table of max action values
        V = np.zeros((10, 21))
        for x in range(10):
            for y in range(21):
                s = (x + 1, y + 1)
                _, V[x, y] = self.max_action_value(s)
        return V

    def getQtable(self):  # returns a 2 x 10 x 21 table of Q(s, a) values
        Q = np.zeros((2, 10, 21))
        for x in range(10):
            for y in range(21):
                for a in range(2):
                    s = (x + 1, y + 1)
                    Q[a, x, y] = self.Qvalue(s, a)
        return Q
    def apply_control(self):  # SARSA(lambda), backward view, for one episode (discount factor assumed to be 1)
        S = self.init()
        A = self.next_action(S)
        i = 1  # switch used to reset the eligibility trace at the end of the episode
        while S != "terminal":
            grad = self.featurizer(S)  # gradient of Q(S, A) w.r.t. W[A] for a linear approximator
            self.E[A, :, :] += grad  # accumulating trace
            S_, R = self.step(S, A)
            delta = R - self.Qvalue(S, A)  # TD error
            if S_ != "terminal":
                A_ = self.next_action(S_)
                delta += self.Qvalue(S_, A_)
                A = A_
            else:
                i = 0  # to ensure E is reset to 0 before the start of a new episode
            S = S_
            self.W += (self.alpha * delta) * self.E
            self.E = (self.lamda * i) * self.E
    def Apply_SARSA(self, episodes=1000):
        for i in range(episodes):
            self.apply_control()
        self.trained = True
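
A minimal usage sketch follows, assuming an Easy21-style card game (dealer 1-10, player 1-21), which the 10 x 21 value tables above suggest. The TinyConfig, tiny_init, tiny_step and tiny_featurizer names are hypothetical stand-ins for whatever Environment actually provides; they only illustrate the interfaces ApproxRL expects: a config exposing e, alpha and actions, an initializer returning a start state, a step(state, action) simulator returning (next_state, reward), and a featurizer returning a 2-D feature array.

# Hypothetical stand-ins for the Environment module, for illustration only.
import random
import numpy as np


class TinyConfig:
    e = 0.05          # exploration probability for the e-greedy policy
    alpha = 0.01      # learning rate / step size
    actions = [0, 1]  # two actions, matching the 2-action Q-table above


def tiny_init():
    # start every episode in a random (dealer, player) state
    return (random.randint(1, 10), random.randint(1, 21))


def tiny_step(state, action):
    # toy dynamics: action 1 ends the episode with a random reward,
    # action 0 adds a card value to the player total and may bust
    if action == 1:
        return "terminal", random.choice([-1, 0, 1])
    x, y = state
    y += random.randint(1, 10)
    if y > 21:
        return "terminal", -1
    return (x, y), 0


def tiny_featurizer(state):
    # coarse-coded binary features over overlapping (dealer, player) intervals,
    # returned as a 2-D array so that W has shape (num_actions, 3, 6)
    x, y = state
    dealer_bins = [(1, 4), (4, 7), (7, 10)]
    player_bins = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]
    phi = np.zeros((len(dealer_bins), len(player_bins)))
    for i, (dl, dh) in enumerate(dealer_bins):
        for j, (pl, ph) in enumerate(player_bins):
            if dl <= x <= dh and pl <= y <= ph:
                phi[i, j] = 1.0
    return phi


agent = ApproxRL(TinyConfig(), tiny_init, tiny_step, lamda=0.5,
                 featurizer=tiny_featurizer)
agent.Apply_SARSA(episodes=1000)
print(agent.getVtable())  # 10 x 21 table of max_a Q(s, a)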