@@ -3,21 +3,23 @@ program quadratic_fit
! stochastic gradient descent, batch gradient descent, and minibatch gradient
! descent.
use nf, only: dense, input, network
+ use nf_dense_layer, only: dense_layer

implicit none
- type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd
+ type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop

! Training parameters
integer, parameter :: num_epochs = 1000
integer, parameter :: train_size = 1000
integer, parameter :: test_size = 30
integer, parameter :: batch_size = 10
real, parameter :: learning_rate = 0.01
+ real, parameter :: decay_rate = 0.9

! Input and output data
real, allocatable :: x(:), y(:) ! training data
real, allocatable :: xtest(:), ytest(:) ! testing data
- real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:)
+ real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:), ypred_rms_prop(:)

integer :: i, n

@@ -42,6 +44,7 @@ program quadratic_fit
net_sgd = network([input(1), dense(3), dense(1)])
net_batch_sgd = network([input(1), dense(3), dense(1)])
net_minibatch_sgd = network([input(1), dense(3), dense(1)])
+ net_rms_prop = network([input(1), dense(3), dense(1)])

! Print network info to stdout; this will be the same for all three networks.
call net_sgd % print_info()
@@ -55,15 +58,20 @@ program quadratic_fit
! Mini-batch SGD optimizer
call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size)

+ ! RMSProp optimizer
+ call rmsprop_optimizer(net_rms_prop, x, y, learning_rate, num_epochs, decay_rate)
+
! Calculate predictions on the test set
ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)]
+ ypred_rms_prop = [(net_rms_prop % predict([xtest(i)]), i = 1, test_size)]

! Print the mean squared error
print '("Stochastic gradient descent MSE:", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest)
print '(" Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest)
print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest)
+ print '(" RMSProp MSE: ", f9.6)', sum((ypred_rms_prop - ytest)**2) / size(ytest)

contains

@@ -161,6 +169,77 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si
end do
end subroutine minibatch_gd_optimizer

+ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)
+   ! RMSprop optimizer for updating weights using a running root mean square of the gradients.
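+   ! It keeps an exponentially decaying average of the squared gradients for
+   ! each dense layer and scales every update by the inverse square root of
+   ! that average, so parameters with consistently large gradients take
+   ! smaller steps.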
+   type(network), intent(inout) :: net
+   real, intent(in) :: x(:), y(:)
+   real, intent(in) :: learning_rate, decay_rate
+   integer, intent(in) :: num_epochs
+   integer :: i, j, n
+   real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero
+
+   ! Define a dedicated type to store the RMSprop gradients.
+   ! This is needed because array sizes vary between layers and we need to
+   ! keep track of gradients for each layer over time.
+   ! For now this works only for dense layers.
+   ! We will need to define a similar type for conv2d layers.
+   type :: rms_gradient_dense
+     real, allocatable :: dw(:,:)
+     real, allocatable :: db(:)
+   end type rms_gradient_dense
+
+   type(rms_gradient_dense), allocatable :: rms(:)
+
+   print *, "Running RMSprop optimizer..."
+
+   ! Here we allocate the array of RMS gradient derived types.
+   ! We need one for each dense layer; however, we allocate it to the
+   ! length of all layers because that makes the housekeeping easier.
+   allocate(rms(size(net % layers)))
+
+   do n = 1, num_epochs
+
+     do i = 1, size(x)
+       call net % forward([x(i)])
+       call net % backward([y(i)])
+     end do
+
+     ! RMSprop update rule
+     do j = 1, size(net % layers)
+       select type (this_layer => net % layers(j) % p)
+         type is (dense_layer)
+
+           ! If this is our first time here for this layer, allocate the
+           ! internal RMS gradient arrays and initialize them to zero.
+           if (.not. allocated(rms(j) % dw)) then
+             allocate(rms(j) % dw, mold=this_layer % dw)
+             allocate(rms(j) % db, mold=this_layer % db)
+             rms(j) % dw = 0
+             rms(j) % db = 0
+           end if
+
+           ! Update the RMS gradients using the RMSprop moving average rule:
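+           !   v <- decay_rate * v + (1 - decay_rate) * g**2
+           ! where v is the running mean square (rms) and g is the gradient
+           ! (dw or db) accumulated over this epoch.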
+           rms(j) % dw = decay_rate * rms(j) % dw + (1 - decay_rate) * this_layer % dw**2
+           rms(j) % db = decay_rate * rms(j) % db + (1 - decay_rate) * this_layer % db**2
+
+           ! Update weights and biases using the RMSprop update rule:
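+           !   w <- w - learning_rate / sqrt(v + epsilon) * g
+           ! where epsilon guards against division by zero when v is tiny.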
+           this_layer % weights = this_layer % weights - learning_rate &
+             / sqrt(rms(j) % dw + epsilon) * this_layer % dw
+           this_layer % biases = this_layer % biases - learning_rate &
+             / sqrt(rms(j) % db + epsilon) * this_layer % db
+
+           ! We have updated the weights and biases, so we need to reset the
+           ! gradients to zero for the next epoch.
+           this_layer % dw = 0
+           this_layer % db = 0
+
+       end select
+     end do
+
+   end do
+
+ end subroutine rmsprop_optimizer
+
subroutine shuffle(arr)
! Shuffle an array using the Fisher-Yates algorithm.
integer, intent(inout) :: arr(:)