
"""Adam Optimizer for Neural Networks.
"""

from pyml.neural_network.layer.transformation import _Transformation
from pyml.neural_network.optimizer import _Optimizer

import numpy as np

class Adam(_Optimizer):
    """Adam Optimizer for Neural Networks.

    This optimizer uses the Adam (Adaptive Moment Estimation) algorithm to
    adapt the learning rate of each parameter based on both the first and
    second moments of the gradients.

    Parameters
    ----------
    learning_rate : float, optional
        The initial learning rate, by default 0.001
    decay : float, optional
        Learning rate decay factor, by default 0.
    epsilon : float, optional
        Small value added to the denominator to prevent division by zero,
        by default 1e-7
    beta_1 : float, optional
        Exponential moving average factor for the first moment (mean) of the
        gradients, by default 0.9
    beta_2 : float, optional
        Exponential moving average factor for the second moment (uncentered
        variance) of the gradients, by default 0.999
    """

    def __init__(
        self,
        learning_rate: float = 0.001,
        decay: float = 0.,
        epsilon: float = 1e-7,
        beta_1: float = 0.9,
        beta_2: float = 0.999
    ) -> None:
        super().__init__(learning_rate, decay)
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2
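
    # Adam update rule applied to each parameter ``theta`` with gradient ``g``
    # at step ``t = iterations + 1`` (see ``update_parameters`` below):
    #
    #   m_t   = beta_1 * m_{t-1} + (1 - beta_1) * g
    #   v_t   = beta_2 * v_{t-1} + (1 - beta_2) * g**2
    #   m_hat = m_t / (1 - beta_1**t)
    #   v_hat = v_t / (1 - beta_2**t)
    #   theta += -learning_rate * m_hat / (sqrt(v_hat) + epsilon)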

    def update_parameters(self, layer: _Transformation) -> None:
        """Update the parameters of the given layer using the Adam
        optimization algorithm.

        Parameters
        ----------
        layer : _Transformation
            The layer to update.

        Notes
        -----
        If the layer does not yet have momentum and cache arrays for its
        weights and biases, this method initializes them with zeros. It then
        updates the momentums and caches from the current gradients using the
        Adam algorithm and performs the parameter updates accordingly.
        """
        # If the layer has no momentum and cache arrays for weight and bias
        # updates yet, initialize them with zeros.
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update the momentum arrays with the current gradients.
        layer.weight_momentums = self.beta_1 * \
            layer.weight_momentums + \
            (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * \
            layer.bias_momentums + \
            (1 - self.beta_1) * layer.dbiases

        # Calculate the bias-corrected momentum values.
        # self.iterations is 0 on the first pass, so we add 1 here.
        weight_momentums_corrected = layer.weight_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))

        # Update the cache arrays with the squared current gradients.
        layer.weight_cache = self.beta_2 * layer.weight_cache + \
            (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + \
            (1 - self.beta_2) * layer.dbiases**2

        # Calculate the bias-corrected cache values.
        weight_cache_corrected = layer.weight_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))

        # Perform the parameter update: a vanilla SGD step with the
        # bias-corrected momentums, normalized by the square root of the
        # bias-corrected caches.
        layer.weights += -self.current_learning_rate * \
            weight_momentums_corrected / \
            (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * \
            bias_momentums_corrected / \
            (np.sqrt(bias_cache_corrected) + self.epsilon)
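

if __name__ == '__main__':
    # Minimal, self-contained sketch (not part of the pyml API): one Adam
    # step on toy NumPy arrays, mirroring the arithmetic in
    # ``Adam.update_parameters`` above. All names and values below are
    # illustrative assumptions rather than library objects.
    rng = np.random.default_rng(seed=0)
    weights = rng.normal(size=(2, 3))
    dweights = rng.normal(size=(2, 3))   # stand-in gradient from backpropagation

    learning_rate, beta_1, beta_2, epsilon = 0.001, 0.9, 0.999, 1e-7
    momentums = np.zeros_like(weights)   # first-moment (mean) estimate
    cache = np.zeros_like(weights)       # second-moment (uncentered variance) estimate
    t = 1                                # first update step

    # Exponential moving averages of the gradient and its square.
    momentums = beta_1 * momentums + (1 - beta_1) * dweights
    cache = beta_2 * cache + (1 - beta_2) * dweights ** 2

    # Bias-corrected estimates, then the parameter update.
    momentums_corrected = momentums / (1 - beta_1 ** t)
    cache_corrected = cache / (1 - beta_2 ** t)
    weights += -learning_rate * momentums_corrected / \
        (np.sqrt(cache_corrected) + epsilon)

    print(weights)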