@@ -3,21 +3,23 @@ program quadratic_fit
! stochastic gradient descent, batch gradient descent, and minibatch gradient
! descent.
use nf, only: dense, input, network
+ use nf_dense_layer, only: dense_layer

implicit none
- type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd
+ type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop

! Training parameters
integer, parameter :: num_epochs = 1000
integer, parameter :: train_size = 1000
integer, parameter :: test_size = 30
integer, parameter :: batch_size = 10
real, parameter :: learning_rate = 0.01
+ real, parameter :: decay_rate = 0.9

! Input and output data
real, allocatable :: x(:), y(:) ! training data
real, allocatable :: xtest(:), ytest(:) ! testing data
- real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:)
+ real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:), ypred_rms_prop(:)

integer :: i, n

@@ -42,6 +44,7 @@ program quadratic_fit
net_sgd = network([input(1), dense(3), dense(1)])
net_batch_sgd = network([input(1), dense(3), dense(1)])
net_minibatch_sgd = network([input(1), dense(3), dense(1)])
+ net_rms_prop = network([input(1), dense(3), dense(1)])

! Print network info to stdout; this will be the same for all three networks.
call net_sgd % print_info()
@@ -55,15 +58,20 @@ program quadratic_fit
! Mini-batch SGD optimizer
call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size)

+ ! RMSProp optimizer
+ call rmsprop_optimizer(net_rms_prop, x, y, learning_rate, num_epochs, decay_rate)
+
! Calculate predictions on the test set
ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)]
+ ypred_rms_prop = [(net_rms_prop % predict([xtest(i)]), i = 1, test_size)]

! Print the mean squared error
print '("Stochastic gradient descent MSE:", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest)
print '(" Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest)
print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest)
+ print '(" RMSProp MSE: ", f9.6)', sum((ypred_rms_prop - ytest)**2) / size(ytest)

contains

@@ -161,6 +169,77 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si
end do
end subroutine minibatch_gd_optimizer

+ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)
+   ! RMSprop optimizer for updating weights using a running root mean square of the gradients.
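+   ! It keeps an exponentially decaying average of the squared gradients for
+   ! each dense layer and scales every update by the inverse square root of
+   ! that average, so parameters with consistently large gradients take
+   ! smaller steps.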
+   type(network), intent(inout) :: net
+   real, intent(in) :: x(:), y(:)
+   real, intent(in) :: learning_rate, decay_rate
+   integer, intent(in) :: num_epochs
+   integer :: i, j, n
+   real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero
+
+   ! Define a dedicated type to store the RMSprop gradients.
+   ! This is needed because array sizes vary between layers and we need to
+   ! keep track of gradients for each layer over time.
+   ! For now this works only for dense layers.
+   ! We will need to define a similar type for conv2d layers.
+   type :: rms_gradient_dense
+     real, allocatable :: dw(:,:)
+     real, allocatable :: db(:)
+   end type rms_gradient_dense
+
+   type(rms_gradient_dense), allocatable :: rms(:)
+
+   print *, "Running RMSprop optimizer..."
+
+   ! Here we allocate the array of RMS gradient derived types.
+   ! We need one for each dense layer; however, we allocate it to the
+   ! length of all layers because that makes the housekeeping easier.
+   allocate(rms(size(net % layers)))
+
+   do n = 1, num_epochs
+
+     do i = 1, size(x)
+       call net % forward([x(i)])
+       call net % backward([y(i)])
+     end do
+
+     ! RMSprop update rule
+     do j = 1, size(net % layers)
+       select type (this_layer => net % layers(j) % p)
+         type is (dense_layer)
+
+           ! If this is our first time here for this layer, allocate the
+           ! internal RMS gradient arrays and initialize them to zero.
+           if (.not. allocated(rms(j) % dw)) then
+             allocate(rms(j) % dw, mold=this_layer % dw)
+             allocate(rms(j) % db, mold=this_layer % db)
+             rms(j) % dw = 0
+             rms(j) % db = 0
+           end if
+
+           ! Update the RMS gradients using the RMSprop moving average rule:
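+           !   v <- decay_rate * v + (1 - decay_rate) * g**2
+           ! where v is the running mean square (rms) and g is the gradient
+           ! (dw or db) accumulated over this epoch.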
+           rms(j) % dw = decay_rate * rms(j) % dw + (1 - decay_rate) * this_layer % dw**2
+           rms(j) % db = decay_rate * rms(j) % db + (1 - decay_rate) * this_layer % db**2
+
+           ! Update weights and biases using the RMSprop update rule:
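+           !   w <- w - learning_rate / sqrt(v + epsilon) * g
+           ! where epsilon guards against division by zero when v is tiny.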
+           this_layer % weights = this_layer % weights - learning_rate &
+             / sqrt(rms(j) % dw + epsilon) * this_layer % dw
+           this_layer % biases = this_layer % biases - learning_rate &
+             / sqrt(rms(j) % db + epsilon) * this_layer % db
+
+           ! We have updated the weights and biases, so we need to reset the
+           ! gradients to zero for the next epoch.
+           this_layer % dw = 0
+           this_layer % db = 0
+
+       end select
+     end do
+
+   end do
+
+ end subroutine rmsprop_optimizer
+
subroutine shuffle(arr)
! Shuffle an array using the Fisher-Yates algorithm.
integer, intent(inout) :: arr(:)