Skip to content

Optimizers API

Optimizers update the model's parameters based on the computed gradients to minimize the loss function.

Base Optimizer

mpneuralnetwork.optimizers.Optimizer

Base class for all optimization algorithms.

Optimizers update the weights of the network layers to minimize the loss function. They also handle regularization (L1/L2).

Source code in src/mpneuralnetwork/optimizers.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
class Optimizer:
    """Base class for all optimization algorithms.

    Optimizers update the weights of the network layers to minimize the loss function.
    They also handle regularization (L1/L2).
    """

    def __init__(self, learning_rate: float, regularization: Lit_R, weight_decay: float) -> None:
        """Initializes the optimizer.

        Args:
            learning_rate (float): The step size for parameter updates.
            regularization (Lit_R): Type of regularization ('L1' or 'L2').
            weight_decay (float): The strength of the regularization (lambda).
        """
        self.learning_rate: float = learning_rate
        self.regularization: Lit_R = regularization
        self.weight_decay: float = weight_decay

    @abstractmethod
    def step(self, layers: list[Layer]) -> None:
        """Performs a single optimization step.

        Iterates over all layers and updates their parameters based on stored gradients.

        Args:
            layers (list[Layer]): List of layers containing parameters to update.
        """
        pass

    def get_config(self) -> dict:
        return {
            "type": self.__class__.__name__,
            "learning_rate": self.learning_rate,
            "regularization": self.regularization,
            "weight_decay": self.weight_decay,
        }

    def apply_regularization(self, param_name: str, param: ArrayType) -> ArrayType | int:
        """Computes the regularization gradient term.

        Args:
            param_name (str): Name of the parameter (e.g., 'weights', 'bias').
            param (ArrayType): The parameter value.

        Returns:
            ArrayType | int: The gradient contribution from regularization.
        """
        regularization: ArrayType
        if "bias" in param_name or "beta" in param_name or "gamma" in param_name:
            return 0

        if self.regularization == "L2":
            regularization = self.weight_decay * param
        else:
            regularization = self.weight_decay * xp.sign(param)

        return regularization

    @property
    def params(self) -> dict:
        """Returns the optimizer's internal state (velocities, moments)."""
        return {}

params property

Returns the optimizer's internal state (velocities, moments).

__init__(learning_rate, regularization, weight_decay)

Initializes the optimizer.

Parameters:

Name Type Description Default
learning_rate float

The step size for parameter updates.

required
regularization Lit_R

Type of regularization ('L1' or 'L2').

required
weight_decay float

The strength of the regularization (lambda).

required
Source code in src/mpneuralnetwork/optimizers.py
18
19
20
21
22
23
24
25
26
27
28
def __init__(self, learning_rate: float, regularization: Lit_R, weight_decay: float) -> None:
    """Stores the hyperparameters common to all optimizers.

    Args:
        learning_rate (float): Step size applied to parameter updates.
        regularization (Lit_R): Regularization flavour ('L1' or 'L2').
        weight_decay (float): Regularization strength (lambda).
    """
    self.learning_rate, self.regularization, self.weight_decay = (
        learning_rate,
        regularization,
        weight_decay,
    )

apply_regularization(param_name, param)

Computes the regularization gradient term.

Parameters:

Name Type Description Default
param_name str

Name of the parameter (e.g., 'weights', 'bias').

required
param ArrayType

The parameter value.

required

Returns:

Type Description
ArrayType | int

The gradient contribution from regularization (0 for parameters whose name contains 'bias', 'beta' or 'gamma').

Source code in src/mpneuralnetwork/optimizers.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def apply_regularization(self, param_name: str, param: ArrayType) -> ArrayType | int:
    """Computes the regularization gradient term.

    Args:
        param_name (str): Name of the parameter (e.g., 'weights', 'bias').
        param (ArrayType): The parameter value.

    Returns:
        ArrayType | int: The gradient contribution from regularization.
    """
    regularization: ArrayType
    if "bias" in param_name or "beta" in param_name or "gamma" in param_name:
        return 0

    if self.regularization == "L2":
        regularization = self.weight_decay * param
    else:
        regularization = self.weight_decay * xp.sign(param)

    return regularization

step(layers) abstractmethod

Performs a single optimization step.

Iterates over all layers and updates their parameters based on stored gradients.

Parameters:

Name Type Description Default
layers list[Layer]

List of layers containing parameters to update.

required
Source code in src/mpneuralnetwork/optimizers.py
30
31
32
33
34
35
36
37
38
39
@abstractmethod
def step(self, layers: list[Layer]) -> None:
    """Applies one parameter update to the given layers.

    Implementations walk through the layers and adjust each parameter using
    the gradient stored alongside it.

    Args:
        layers (list[Layer]): Layers whose parameters should be updated.
    """

Algorithms

mpneuralnetwork.optimizers.SGD

Bases: Optimizer

Stochastic Gradient Descent (SGD) with Momentum.

Update rule
  1. v = momentum * v - lr * gradient
  2. w = w + v

Parameters:

Name Type Description Default
learning_rate float

Step size. Defaults to 0.01.

0.01
regularization Lit_R

'L1' or 'L2'. Defaults to 'L2'.

'L2'
weight_decay float

Regularization strength. Defaults to 0.001.

0.001
momentum float

Momentum factor (0 to 1). Defaults to 0.1.

0.1
Source code in src/mpneuralnetwork/optimizers.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
class SGD(Optimizer):
    """Stochastic Gradient Descent (SGD) with Momentum.

    Update rule:
        1. `v = momentum * v - lr * gradient`
        2. `w = w + v`

    The regularization term returned by `apply_regularization` is added to the
    gradient before the velocity update.

    Args:
        learning_rate (float, optional): Step size. Defaults to 0.01.
        regularization (Lit_R, optional): 'L1' or 'L2'. Defaults to 'L2'.
        weight_decay (float, optional): Regularization strength. Defaults to 0.001.
        momentum (float, optional): Momentum factor (0 to 1). Defaults to 0.1.
    """

    def __init__(
        self,
        learning_rate: float = 0.01,
        regularization: Lit_R = "L2",
        weight_decay: float = 0.001,
        momentum: float = 0.1,
    ) -> None:
        super().__init__(learning_rate, regularization, weight_decay)
        self.momentum: float = momentum

        # Per-parameter velocity buffers, keyed by id(param).
        self.velocities: T = {}

    def step(self, layers: list[Layer]) -> None:
        """Performs one momentum-SGD update on every parameter of `layers`.

        Layers without a `params` attribute are skipped. Parameters are updated
        in place; the gradient arrays held by the layers are left unmodified.

        Args:
            layers (list[Layer]): Layers whose `params` mapping yields
                `name -> (param, grad)` pairs.
        """
        for layer in layers:
            if not hasattr(layer, "params"):
                continue

            for param_name, (param, grad) in layer.params.items():
                # Build the regularized gradient out-of-place. An in-place `+=`
                # would write the penalty into the layer's own gradient buffer,
                # so a second step() on the same gradients would add it twice.
                grad = grad + self.apply_regularization(param_name, param)

                # State is keyed by object identity, which stays stable because
                # parameters are only ever updated in place below.
                p_id: int = id(param)

                if p_id not in self.velocities:
                    self.velocities[p_id] = xp.zeros_like(param, dtype=DTYPE)

                velocity = self.velocities[p_id]

                # Velocity Update: v = momentum * v - lr * grad

                # 1. v *= momentum (in-place)
                xp.multiply(velocity, self.momentum, out=velocity)

                # 2. v -= lr * grad
                velocity -= self.learning_rate * grad

                # Parameter Update: w += v
                param += velocity

    def get_config(self) -> dict:
        """Returns the base configuration extended with `momentum`."""
        config = super().get_config()
        config.update({"momentum": self.momentum})
        return config

    @property
    def params(self) -> dict:
        """Returns the optimizer state: the per-parameter velocity buffers."""
        return {"velocities": self.velocities}

mpneuralnetwork.optimizers.RMSprop

Bases: Optimizer

RMSprop optimizer.

Adapts learning rates by dividing the gradient by a running average of its recent magnitude.

Update rule
  1. cache = decay * cache + (1 - decay) * grad^2
  2. w = w - lr * grad / (sqrt(cache) + epsilon)

Parameters:

Name Type Description Default
learning_rate float

Step size. Defaults to 0.001.

0.001
regularization Lit_R

'L1' or 'L2'. Defaults to 'L2'.

'L2'
weight_decay float

Regularization strength. Defaults to 0.001.

0.001
decay_rate float

Discounting factor. Defaults to 0.9.

0.9
epsilon float

Small value for numerical stability. Defaults to 1e-8.

1e-08
Source code in src/mpneuralnetwork/optimizers.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
class RMSprop(Optimizer):
    """RMSprop optimizer.

    Adapts learning rates by dividing the gradient by a running average of its recent magnitude.

    Update rule:
        1. `cache = decay * cache + (1 - decay) * grad^2`
        2. `w = w - lr * grad / (sqrt(cache) + epsilon)`

    The regularization term returned by `apply_regularization` is added to the
    gradient before the cache update.

    Args:
        learning_rate (float, optional): Step size. Defaults to 0.001.
        regularization (Lit_R, optional): 'L1' or 'L2'. Defaults to 'L2'.
        weight_decay (float, optional): Regularization strength. Defaults to 0.001.
        decay_rate (float, optional): Discounting factor. Defaults to 0.9.
        epsilon (float, optional): Small value for numerical stability. Defaults to 1e-8.
    """

    def __init__(
        self,
        learning_rate: float = 0.001,
        regularization: Lit_R = "L2",
        weight_decay: float = 0.001,
        decay_rate: float = 0.9,
        epsilon: float = 1e-8,
    ) -> None:
        super().__init__(learning_rate, regularization, weight_decay)
        self.decay_rate: float = decay_rate
        self.epsilon: float = epsilon

        # Per-parameter running averages of squared gradients, keyed by id(param).
        self.cache: T = {}

    def step(self, layers: list[Layer]) -> None:
        """Performs one RMSprop update on every parameter of `layers`.

        Layers without a `params` attribute are skipped. Parameters are updated
        in place; the gradient arrays held by the layers are left unmodified.

        Args:
            layers (list[Layer]): Layers whose `params` mapping yields
                `name -> (param, grad)` pairs.
        """
        for layer in layers:
            if not hasattr(layer, "params"):
                continue

            for param_name, (param, grad) in layer.params.items():
                # Build the regularized gradient out-of-place. An in-place `+=`
                # would write the penalty into the layer's own gradient buffer,
                # so a second step() on the same gradients would add it twice.
                grad = grad + self.apply_regularization(param_name, param)

                # State is keyed by object identity, which stays stable because
                # parameters are only ever updated in place below.
                p_id: int = id(param)

                if p_id not in self.cache:
                    self.cache[p_id] = xp.zeros_like(param, dtype=DTYPE)

                cache = self.cache[p_id]

                # Cache Update: cache = decay * cache + (1 - decay) * grad^2

                # 1. cache *= decay (in-place)
                xp.multiply(cache, self.decay_rate, out=cache)

                # 2. cache += (1 - decay) * grad^2
                cache += (1 - self.decay_rate) * xp.square(grad)

                # Parameter Update: w -= lr * grad / (sqrt(cache) + epsilon)

                # 1. Denominator = sqrt(cache) + epsilon (sqrt allocates, add reuses it)
                denom = xp.sqrt(cache)
                xp.add(denom, self.epsilon, out=denom)

                # 2. w -= lr * grad / denom
                param -= self.learning_rate * grad / denom

    def get_config(self) -> dict:
        """Returns the base configuration extended with `decay_rate` and `epsilon`."""
        config = super().get_config()
        config.update({"decay_rate": self.decay_rate, "epsilon": self.epsilon})
        return config

    @property
    def params(self) -> dict:
        """Returns the optimizer state: the per-parameter squared-gradient cache."""
        return {"cache": self.cache}

mpneuralnetwork.optimizers.Adam

Bases: Optimizer

Adam Optimizer (Adaptive Moment Estimation).

Combines Momentum and RMSprop. Implements Decoupled Weight Decay (AdamW) when regularization='L2'.

Update rule
  1. m = beta1 * m + (1 - beta1) * g
  2. v = beta2 * v + (1 - beta2) * g^2
  3. m_hat = m / (1 - beta1^t)
  4. v_hat = v / (1 - beta2^t)
  5. w = w - lr * m_hat / (sqrt(v_hat) + eps)
  6. If L2: w = w - lr * decay * w (Decoupled)

Parameters:

Name Type Description Default
beta1 float

Decay rate for first moment. Defaults to 0.9.

0.9
beta2 float

Decay rate for second moment. Defaults to 0.999.

0.999
epsilon float

Stability term. Defaults to 1e-8.

1e-08
Source code in src/mpneuralnetwork/optimizers.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
class Adam(Optimizer):
    """Adam Optimizer (Adaptive Moment Estimation).

    Combines Momentum and RMSprop.
    Implements **Decoupled Weight Decay (AdamW)** when `regularization='L2'`.
    With `regularization='L1'` the penalty is added to the gradient before the
    moment updates; any other value applies no regularization.

    Update rule:
        1. `m = beta1 * m + (1 - beta1) * g`
        2. `v = beta2 * v + (1 - beta2) * g^2`
        3. `m_hat = m / (1 - beta1^t)`
        4. `v_hat = v / (1 - beta2^t)`
        5. `w = w - lr * m_hat / (sqrt(v_hat) + eps)`
        6. If L2: `w = w - lr * decay * w` (Decoupled)

    Args:
        learning_rate (float, optional): Step size. Defaults to 0.001.
        regularization (Lit_R, optional): 'L1' or 'L2'. Defaults to 'L2'.
        weight_decay (float, optional): Regularization strength. Defaults to 0.001.
        beta1 (float, optional): Decay rate for first moment. Defaults to 0.9.
        beta2 (float, optional): Decay rate for second moment. Defaults to 0.999.
        epsilon (float, optional): Stability term. Defaults to 1e-8.
    """

    def __init__(
        self,
        learning_rate: float = 0.001,
        regularization: Lit_R = "L2",
        weight_decay: float = 0.001,
        beta1: float = 0.9,
        beta2: float = 0.999,
        epsilon: float = 1e-8,
    ) -> None:
        super().__init__(learning_rate, regularization, weight_decay)
        self.beta1: float = beta1
        self.beta2: float = beta2
        self.epsilon: float = epsilon

        # Number of step() calls so far; drives the bias correction.
        self.t: int = 0
        # Per-parameter first and second moment buffers, keyed by id(param).
        self.momentums: T = {}
        self.velocities: T = {}

    def step(self, layers: list[Layer]) -> None:
        """Performs one Adam update on every parameter of `layers`.

        Increments the step counter, then updates each parameter in place.
        Layers without a `params` attribute are skipped. The gradient arrays
        held by the layers are left unmodified.

        Args:
            layers (list[Layer]): Layers whose `params` mapping yields
                `name -> (param, grad)` pairs.
        """
        self.t += 1

        # --- Bias Correction ---
        # m_hat = m / (1 - beta1^t)
        # v_hat = v / (1 - beta2^t)
        # These factors depend only on the step counter, so they are computed
        # once per step instead of once per parameter.
        bias_correction1 = 1 - self.beta1**self.t
        bias_correction2 = 1 - self.beta2**self.t
        sqrt_bias_correction2 = xp.sqrt(bias_correction2)

        # Efficient Update Formula:
        # w -= lr * m_hat / (sqrt(v_hat) + epsilon)
        # w -= (lr / bias_correction1) * m / (sqrt(v / bias_correction2) + epsilon)
        step_size = self.learning_rate / bias_correction1

        for layer in layers:
            if not hasattr(layer, "params"):
                continue

            for param_name, (param, grad) in layer.params.items():
                if self.regularization == "L1":
                    # Out-of-place on purpose: an in-place `+=` would write the
                    # penalty into the layer's own gradient buffer.
                    grad = grad + self.apply_regularization(param_name, param)

                # State is keyed by object identity, which stays stable because
                # parameters are only ever updated in place below.
                p_id: int = id(param)

                if p_id not in self.momentums:
                    self.momentums[p_id] = xp.zeros_like(param, dtype=DTYPE)
                    self.velocities[p_id] = xp.zeros_like(param, dtype=DTYPE)

                momentum = self.momentums[p_id]
                velocity = self.velocities[p_id]

                # --- 1. Update Momentum (First Moment) ---
                # m = beta1 * m + (1 - beta1) * grad
                xp.multiply(momentum, self.beta1, out=momentum)
                momentum += (1 - self.beta1) * grad

                # --- 2. Update Velocity (Second Moment) ---
                # v = beta2 * v + (1 - beta2) * grad^2
                xp.multiply(velocity, self.beta2, out=velocity)
                velocity += (1 - self.beta2) * xp.square(grad)

                # --- 3. Denominator: sqrt(v) / sqrt(bias_correction2) + epsilon ---
                denom = xp.sqrt(velocity)
                xp.divide(denom, sqrt_bias_correction2, out=denom)
                xp.add(denom, self.epsilon, out=denom)

                # --- 4. Parameter update: w -= step_size * m / denom ---
                param -= step_size * momentum / denom

                # --- 5. Decoupled weight decay, applied to the updated weights ---
                if self.regularization == "L2":
                    param -= self.learning_rate * self.apply_regularization(param_name, param)

    def get_config(self) -> dict:
        """Returns the base configuration extended with `beta1`, `beta2` and `epsilon`."""
        config = super().get_config()
        config.update(
            {
                "beta1": self.beta1,
                "beta2": self.beta2,
                "epsilon": self.epsilon,
            }
        )
        return config

    @property
    def params(self) -> dict:
        """Returns the optimizer state: step counter and both moment buffers."""
        return {
            "t": self.t,
            "momentums": self.momentums,
            "velocities": self.velocities,
        }