Commit 31fc061

SGD optimizer stub (#139)

* Defining the SGD minimization step in the optimizer type
* Add note about refactor needed
* Pass optimizer instance down to layer % update()
* Apply the optimizer update step in layer % update
* Changes in tests and examples to account for the API change in network % update()
* Make optimizer optional; default to SGD with learning rate of 1
* Apply optimizer to conv2d layer

1 parent 44833c2, commit 31fc061

14 files changed: +138 / -81 lines
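
The net effect of this commit: `network % update` now takes an optimizer instance instead of a bare learning rate, and the argument is optional. A minimal usage sketch of the new API follows; the network shape and data are made up for illustration and are not part of the commit:

program update_api_sketch
  ! Sketch of the update() API after this commit; `net`, `x`, and `y`
  ! are hypothetical stand-ins.
  use nf, only: dense, input, network
  use nf_optimizers, only: sgd
  implicit none
  type(network) :: net
  real :: x(3), y(1)

  x = [0.1, 0.2, 0.3]
  y = [0.5]
  net = network([input(3), dense(1)])

  call net % forward(x)
  call net % backward(y)

  ! Before this commit: call net % update(1.)
  ! Now: pass an optimizer instance...
  call net % update(sgd(learning_rate=1.))
  ! ...or omit it to get the default, sgd() with learning_rate = 1.
  call net % update()
end program update_api_sketch

The per-file diffs below apply this same change throughout the examples and the library internals.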

example/get_set_network_params.f90
Lines changed: 2 additions & 1 deletion

@@ -1,5 +1,6 @@
 program get_set_network_params
   use nf, only: dense, input, network
+  use nf_optimizers, only: sgd
   implicit none
   type(network) :: net1, net2
   real :: x(1), y(1)
@@ -37,7 +38,7 @@ program get_set_network_params

     call net1 % forward(x)
     call net1 % backward(y)
-    call net1 % update(1.)
+    call net1 % update(sgd(learning_rate=1.))

     if (mod(n, 10000) == 0) then
       ypred1 = [(net1 % predict([xtest(i)]), i=1, test_size)]

example/quadratic.f90
Lines changed: 4 additions & 3 deletions

@@ -4,6 +4,7 @@ program quadratic_fit
! descent.
  use nf, only: dense, input, network
  use nf_dense_layer, only: dense_layer
+  use nf_optimizers, only: sgd

  implicit none
  type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop
@@ -97,7 +98,7 @@ subroutine sgd_optimizer(net, x, y, learning_rate, num_epochs)
      do i = 1, size(x)
        call net % forward([x(i)])
        call net % backward([y(i)])
-        call net % update(learning_rate)
+        call net % update(sgd(learning_rate=learning_rate))
      end do
    end do

@@ -120,7 +121,7 @@ subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs)
        call net % forward([x(i)])
        call net % backward([y(i)])
      end do
-      call net % update(learning_rate / size(x))
+      call net % update(sgd(learning_rate=learning_rate / size(x)))
    end do

  end subroutine batch_gd_optimizer
@@ -164,7 +165,7 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si
          call net % backward([y(i)])
        end do

-        call net % update(learning_rate / batch_size)
+        call net % update(sgd(learning_rate=learning_rate / batch_size))
      end do
    end do
  end subroutine minibatch_gd_optimizer

example/simple.f90
Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ program simple

    call net % forward(x)
    call net % backward(y)
-    call net % update(1.)
+    call net % update()

    if (mod(n, 50) == 0) &
      print '(i4,2(3x,f8.6))', n, net % predict(x)

example/sine.f90
Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ program sine

    call net % forward(x)
    call net % backward(y)
-    call net % update(1.)
+    call net % update()

    if (mod(n, 10000) == 0) then
      ypred = [(net % predict([xtest(i)]), i = 1, test_size)]

src/nf/nf_conv2d_layer.f90
Lines changed: 0 additions & 9 deletions

@@ -36,7 +36,6 @@ module nf_conv2d_layer
    procedure :: get_num_params
    procedure :: get_params
    procedure :: set_params
-    procedure :: update

  end type conv2d_layer

@@ -105,14 +104,6 @@ module subroutine set_params(self, params)
        !! Parameters to set
    end subroutine set_params

-    module subroutine update(self, learning_rate)
-      !! Update the weights and biases.
-      class(conv2d_layer), intent(in out) :: self
-        !! Dense layer instance
-      real, intent(in) :: learning_rate
-        !! Learning rate (must be > 0)
-    end subroutine update
-
  end interface

end module nf_conv2d_layer

src/nf/nf_conv2d_layer_submodule.f90
Lines changed: 0 additions & 16 deletions

@@ -225,20 +225,4 @@ module subroutine set_params(self, params)

  end subroutine set_params

-
-  module subroutine update(self, learning_rate)
-    class(conv2d_layer), intent(in out) :: self
-    real, intent(in) :: learning_rate
-
-    ! Sum weight and bias gradients across images, if any
-    call co_sum(self % dw)
-    call co_sum(self % db)
-
-    self % kernel = self % kernel - learning_rate * self % dw
-    self % biases = self % biases - learning_rate * self % db
-    self % dw = 0
-    self % db = 0
-
-  end subroutine update
-
end submodule nf_conv2d_layer_submodule

src/nf/nf_dense_layer.f90
Lines changed: 0 additions & 9 deletions

@@ -37,7 +37,6 @@ module nf_dense_layer
    procedure :: get_params
    procedure :: set_params
    procedure :: init
-    procedure :: update

  end type dense_layer

@@ -115,14 +114,6 @@ module subroutine init(self, input_shape)
        !! Shape of the input layer
    end subroutine init

-    module subroutine update(self, learning_rate)
-      !! Update the weights and biases.
-      class(dense_layer), intent(in out) :: self
-        !! Dense layer instance
-      real, intent(in) :: learning_rate
-        !! Learning rate (must be > 0)
-    end subroutine update
-
  end interface

end module nf_dense_layer

src/nf/nf_dense_layer_submodule.f90
Lines changed: 0 additions & 15 deletions

@@ -128,19 +128,4 @@ module subroutine init(self, input_shape)

  end subroutine init

-  module subroutine update(self, learning_rate)
-    class(dense_layer), intent(in out) :: self
-    real, intent(in) :: learning_rate
-
-    ! Sum weight and bias gradients across images, if any
-    call co_sum(self % dw)
-    call co_sum(self % db)
-
-    self % weights = self % weights - learning_rate * self % dw
-    self % biases = self % biases - learning_rate * self % db
-    self % dw = 0
-    self % db = 0
-
-  end subroutine update
-
end submodule nf_dense_layer_submodule

src/nf/nf_layer.f90
Lines changed: 6 additions & 3 deletions

@@ -4,6 +4,7 @@ module nf_layer
  !! user-facing API.

  use nf_base_layer, only: base_layer
+  use nf_optimizers, only: optimizer_base_type

  implicit none

@@ -144,16 +145,18 @@ module subroutine set_params(self, params)
        !! Parameters of this layer
    end subroutine set_params

-    impure elemental module subroutine update(self, learning_rate)
+    impure elemental module subroutine update(self, optimizer, batch_size)
      !! Update the weights and biases on the layer using the stored
      !! gradients (from backward passes), and flush those same stored
      !! gradients to zero.
      !! This changes the state of the layer.
      !! Typically used only internally from the `network % update` method.
      class(layer), intent(in out) :: self
        !! Layer instance
-      real, intent(in) :: learning_rate
-        !! Learning rate to use; must be > 0.
+      class(optimizer_base_type), intent(in) :: optimizer
+        !! Optimizer instance to use
+      integer, intent(in), optional :: batch_size
+        !! Batch size (default 1)
    end subroutine update

  end interface

src/nf/nf_layer_submodule.f90
Lines changed: 47 additions & 7 deletions

@@ -8,6 +8,7 @@
  use nf_input3d_layer, only: input3d_layer
  use nf_maxpool2d_layer, only: maxpool2d_layer
  use nf_reshape_layer, only: reshape3d_layer
+  use nf_optimizers, only: optimizer_base_type

contains

@@ -382,15 +383,54 @@ module subroutine set_params(self, params)
  end subroutine set_params


-  impure elemental module subroutine update(self, learning_rate)
+  impure elemental module subroutine update(self, optimizer, batch_size)
    class(layer), intent(in out) :: self
-    real, intent(in) :: learning_rate
+    class(optimizer_base_type), intent(in) :: optimizer
+    integer, intent(in), optional :: batch_size
+    integer :: batch_size_
+
+    batch_size_ = 1
+    if (present(batch_size)) batch_size_ = batch_size
+
+    select type (this_layer => self % p)
+      type is (dense_layer)
+
+        ! Sum weight and bias gradients across images, if any
+        call co_sum(this_layer % dw)
+        call co_sum(this_layer % db)
+
+        call optimizer % minimize( &
+          this_layer % weights, &
+          this_layer % dw / batch_size_ &
+        )
+        call optimizer % minimize( &
+          this_layer % biases, &
+          this_layer % db / batch_size_ &
+        )
+
+        ! Reset gradients.
+        this_layer % dw = 0
+        this_layer % db = 0
+
+      type is (conv2d_layer)
+
+        ! Sum weight and bias gradients across images, if any
+        call co_sum(this_layer % dw)
+        call co_sum(this_layer % db)
+
+        call optimizer % minimize( &
+          this_layer % kernel, &
+          this_layer % dw / batch_size_ &
+        )
+        call optimizer % minimize( &
+          this_layer % biases, &
+          this_layer % db / batch_size_ &
+        )
+
+        ! Reset gradients.
+        this_layer % dw = 0
+        this_layer % db = 0

-    select type(this_layer => self % p)
-      type is(dense_layer)
-        call this_layer % update(learning_rate)
-      type is(conv2d_layer)
-        call this_layer % update(learning_rate)
    end select

  end subroutine update
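
The `minimize` calls above dispatch into the `nf_optimizers` module, whose diff does not appear in this excerpt. A rough sketch of the interface these call sites imply follows; the deferred binding, argument intents, and `elemental` broadcast over whole weight and kernel arrays are inferred from the calls above, not confirmed by this page:

module nf_optimizers_sketch
  ! Hypothetical reconstruction of the optimizer type that
  ! layer % update calls into; names mirror the use statements above.
  implicit none

  private
  public :: optimizer_base_type, sgd

  type, abstract :: optimizer_base_type
    ! Default of 1 matches "default to SGD with learning rate of 1"
    ! in the commit message.
    real :: learning_rate = 1
  contains
    procedure(minimize_interface), deferred :: minimize
  end type optimizer_base_type

  abstract interface
    elemental subroutine minimize_interface(self, param, gradient)
      import :: optimizer_base_type
      class(optimizer_base_type), intent(in) :: self
      real, intent(in out) :: param
      real, intent(in) :: gradient
    end subroutine minimize_interface
  end interface

  type, extends(optimizer_base_type) :: sgd
    ! Plain stochastic gradient descent stub; no extra state yet.
  contains
    procedure :: minimize => sgd_minimize
  end type sgd

contains

  elemental subroutine sgd_minimize(self, param, gradient)
    ! One SGD step: move the parameter against its gradient.
    ! Being elemental, this broadcasts over arrays of any rank,
    ! which is how the rank-2 weights and rank-4 kernels above
    ! could be passed directly.
    class(sgd), intent(in) :: self
    real, intent(in out) :: param
    real, intent(in) :: gradient
    param = param - self % learning_rate * gradient
  end subroutine sgd_minimize

end module nf_optimizers_sketch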

src/nf/nf_network.f90
Lines changed: 9 additions & 6 deletions

@@ -193,12 +193,11 @@ module subroutine train(self, input_data, output_data, batch_size, &
        !! Set to `size(input_data, dim=2)` for a batch gradient descent.
      integer, intent(in) :: epochs
        !! Number of epochs to run
-      class(optimizer_base_type), intent(in) :: optimizer
-        !! Optimizer instance; currently this is an `sgd` optimizer type
-        !! and it will be made to be a more general optimizer type.
+      class(optimizer_base_type), intent(in), optional :: optimizer
+        !! Optimizer instance to use. If not provided, the default is sgd().
    end subroutine train

-    module subroutine update(self, learning_rate)
+    module subroutine update(self, optimizer, batch_size)
      !! Update the weights and biases on all layers using the stored
      !! gradients (from backward passes) on those layers, and flush those
      !! same stored gradients to zero.
@@ -207,8 +206,12 @@ module subroutine update(self, learning_rate)
      !! but can be invoked by the user when creating custom optimizers.
      class(network), intent(in out) :: self
        !! Network instance
-      real, intent(in) :: learning_rate
-        !! Learning rate to use; must be > 0.
+      class(optimizer_base_type), intent(in), optional :: optimizer
+        !! Optimizer instance to use
+      integer, intent(in), optional :: batch_size
+        !! Batch size to use.
+        !! Set to 1 for a pure stochastic gradient descent (default).
+        !! Set to `size(input_data, dim=2)` for a batch gradient descent.
    end subroutine update

  end interface

src/nf/nf_network_submodule.f90
Lines changed: 27 additions & 6 deletions

@@ -520,14 +520,23 @@ module subroutine train(self, input_data, output_data, batch_size, &
    real, intent(in) :: output_data(:,:)
    integer, intent(in) :: batch_size
    integer, intent(in) :: epochs
-    class(optimizer_base_type), intent(in) :: optimizer
+    class(optimizer_base_type), intent(in), optional :: optimizer
+    class(optimizer_base_type), allocatable :: optimizer_

    real :: pos
    integer :: dataset_size
    integer :: batch_start, batch_end
    integer :: i, j, n
    integer :: istart, iend, indices(2)

+    ! Passing the optimizer instance is optional.
+    ! If not provided, we default to SGD with its default settings.
+    if (present(optimizer)) then
+      optimizer_ = optimizer
+    else
+      optimizer_ = sgd()
+    end if
+
    dataset_size = size(output_data, dim=2)

    epoch_loop: do n = 1, epochs
@@ -552,9 +561,9 @@ module subroutine train(self, input_data, output_data, batch_size, &
        call self % backward(output_data(:,j))
      end do

-      select type (optimizer)
+      select type (optimizer_)
      type is (sgd)
-        call self % update(optimizer % learning_rate / batch_size)
+        call self % update(optimizer_, batch_size)
      class default
        error stop 'Unsupported optimizer'
      end select
@@ -565,10 +574,22 @@ module subroutine train(self, input_data, output_data, batch_size, &
  end subroutine train


-  module subroutine update(self, learning_rate)
+  module subroutine update(self, optimizer, batch_size)
    class(network), intent(in out) :: self
-    real, intent(in) :: learning_rate
-    call self % layers % update(learning_rate)
+    class(optimizer_base_type), intent(in), optional :: optimizer
+    integer, intent(in), optional :: batch_size
+    class(optimizer_base_type), allocatable :: optimizer_
+
+    ! Passing the optimizer instance is optional.
+    ! If not provided, we default to SGD with its default settings.
+    if (present(optimizer)) then
+      optimizer_ = optimizer
+    else
+      optimizer_ = sgd()
+    end if
+
+    call self % layers % update(optimizer_, batch_size)
+
  end subroutine update

end submodule nf_network_submodule
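
With `optimizer` now optional in `train` as well, both call forms below are valid. A usage sketch under stated assumptions: the network architecture, data, and synthetic target are invented for illustration and not part of the commit:

program train_api_sketch
  ! Sketch of train() after this commit; everything here is a
  ! hypothetical example, not code from the repository.
  use nf, only: dense, input, network
  use nf_optimizers, only: sgd
  implicit none
  type(network) :: net
  real :: x(3, 100), y(1, 100)

  ! Synthetic dataset: 3 inputs, 1 output, 100 samples.
  call random_number(x)
  y(1,:) = sum(x, dim=1) / 3

  net = network([input(3), dense(5), dense(1)])

  ! Explicit optimizer instance:
  call net % train(x, y, batch_size=10, epochs=5, &
    optimizer=sgd(learning_rate=0.1))

  ! Optimizer omitted: train() falls back to sgd() with its defaults.
  call net % train(x, y, batch_size=10, epochs=5)
end program train_api_sketch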
