# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/06_optimizers/optimizers_dev.ipynb.

# %% auto 0
__all__ = ['Optimizer', 'SGD', 'Adam', 'AdamW']

# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 1
import numpy as np
from typing import List, Union, Optional, Dict, Any

# Import Tensor from Module 01 (now with gradient support from Module 05)
from .tensor import Tensor

# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 5
class Optimizer:
    """
    Base class for all optimizers.

    This class defines the common interface that all optimizers must implement:
    - zero_grad(): Clear gradients from parameters
    - step(): Update parameters based on gradients
    """

    def __init__(self, params: List[Tensor]):
        """
        Initialize optimizer with parameters to optimize.

        TODO: Set up the parameter list for optimization

        APPROACH:
        1. Store parameters as a list for iteration
        2. Validate that all parameters require gradients
        3. Initialize step counter for algorithms that need it

        EXAMPLE:
        >>> linear = Linear(784, 128)
        >>> optimizer = SGD(linear.parameters(), lr=0.01)

        HINT: Check that each parameter has requires_grad=True
        """
        ### BEGIN SOLUTION
        # Validate and store parameters
        if not isinstance(params, list):
            params = list(params)

        # Check that parameters require gradients
        for i, param in enumerate(params):
            if not isinstance(param, Tensor):
                raise TypeError(f"Parameter {i} must be a Tensor, got {type(param)}")
            if not param.requires_grad:
                raise ValueError(f"Parameter {i} does not require gradients. Set requires_grad=True.")

        self.params = params
        self.step_count = 0  # For algorithms that need step counting
        ### END SOLUTION

    def zero_grad(self):
        """
        Clear gradients from all parameters.

        TODO: Reset all parameter gradients to None

        APPROACH:
        1. Iterate through all parameters
        2. Set each parameter's grad to None

        EXAMPLE:
        >>> optimizer.zero_grad()  # Clears all gradients
        >>> assert all(param.grad is None for param in optimizer.params)

        WHY: Gradients accumulate by default, so we need to clear them between batches
        """
        ### BEGIN SOLUTION
        for param in self.params:
            param.grad = None
        ### END SOLUTION

    def step(self):
        """
        Update parameters based on gradients.

        This is abstract - each optimizer implements its own update rule.
        """
        raise NotImplementedError("Subclasses must implement step()")
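
# --- Illustrative sketch (not part of the exported API) ---
# Shows how the Optimizer interface is meant to be used in a training loop:
# zero_grad() before computing gradients, step() after. The Tensor constructor
# signature (data, requires_grad=True) is assumed from how parameters are used
# elsewhere in this file; the gradient is set by hand as a stand-in for backward().
def _example_optimizer_interface():
    class _PlainSGD(Optimizer):
        """Minimal subclass: plain gradient descent, no momentum."""
        def __init__(self, params, lr=0.1):
            super().__init__(params)
            self.lr = lr

        def step(self):
            for param in self.params:
                if param.grad is not None:
                    param.data = param.data - self.lr * param.grad
            self.step_count += 1

    w = Tensor(np.zeros((3, 1)), requires_grad=True)  # assumed constructor signature
    optimizer = _PlainSGD([w], lr=0.1)
    for _ in range(5):
        optimizer.zero_grad()          # clear any stale gradients
        w.grad = np.ones_like(w.data)  # stand-in for loss.backward()
        optimizer.step()               # apply the update rule
    return w.data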

# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 9
class SGD(Optimizer):
    """
    Stochastic Gradient Descent with momentum.

    SGD is the foundational optimization algorithm that moves parameters
    in the direction opposite to gradients. With momentum, it remembers
    previous updates to reduce oscillations and accelerate convergence.
    """

    def __init__(self, params: List[Tensor], lr: float = 0.01, momentum: float = 0.0, weight_decay: float = 0.0):
        """
        Initialize SGD optimizer.

        TODO: Set up SGD with momentum and weight decay

        APPROACH:
        1. Call parent constructor to set up parameters
        2. Store learning rate, momentum, and weight decay
        3. Initialize momentum buffers for each parameter

        EXAMPLE:
        >>> optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)

        HINTS:
        - Momentum buffers should be initialized as None
        - They'll be created lazily on first step
        """
        ### BEGIN SOLUTION
        super().__init__(params)

        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay

        # Initialize momentum buffers (created lazily)
        self.momentum_buffers = [None for _ in self.params]
        ### END SOLUTION

    def step(self):
        """
        Perform SGD update step with momentum.

        TODO: Implement SGD parameter update with momentum

        APPROACH:
        1. For each parameter with gradients:
           a. Apply weight decay if specified
           b. Update momentum buffer
           c. Update parameter using momentum

        FORMULA:
        - With weight decay: grad = grad + weight_decay * param
        - Momentum: v = momentum * v_prev + grad
        - Update: param = param - lr * v

        HINTS:
        - Skip parameters without gradients
        - Initialize momentum buffers on first use
        - Use in-place operations to save memory
        """
        ### BEGIN SOLUTION
        for i, param in enumerate(self.params):
            if param.grad is None:
                continue

            # Get gradient (param.grad is already a numpy array)
            grad = param.grad

            # Apply weight decay
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param.data

            # Update momentum buffer
            if self.momentum != 0:
                if self.momentum_buffers[i] is None:
                    # Initialize momentum buffer
                    self.momentum_buffers[i] = np.zeros_like(param.data)

                # Update momentum: v = momentum * v_prev + grad
                self.momentum_buffers[i] = self.momentum * self.momentum_buffers[i] + grad
                grad = self.momentum_buffers[i]

            # Update parameter: param = param - lr * grad
            param.data = param.data - self.lr * grad

        # Increment step counter
        self.step_count += 1
        ### END SOLUTION
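
# --- Illustrative sketch: one SGD-with-momentum step ---
# Verifies the formulas from the docstring by hand: on the first step the
# buffer is zero, so v = grad and param moves by -lr * grad. Assumes the same
# Tensor constructor signature as the sketch above.
def _example_sgd_momentum_step():
    w = Tensor(np.array([1.0, 2.0]), requires_grad=True)  # assumed constructor signature
    opt = SGD([w], lr=0.1, momentum=0.9)
    w.grad = np.array([0.5, 0.5])  # stand-in for a computed gradient
    opt.step()
    # First step: v = 0.9 * 0 + 0.5 = 0.5, so w = [1.0, 2.0] - 0.1 * 0.5 = [0.95, 1.95]
    return w.data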

# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 13
class Adam(Optimizer):
    """
    Adam optimizer with adaptive learning rates.

    Adam computes individual adaptive learning rates for different parameters
    from estimates of first and second moments of the gradients.
    This makes it effective for problems with sparse gradients or noisy data.
    """

    def __init__(self, params: List[Tensor], lr: float = 0.001, betas: tuple = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.0):
        """
        Initialize Adam optimizer.

        TODO: Set up Adam with adaptive learning rates

        APPROACH:
        1. Call parent constructor
        2. Store hyperparameters (lr, betas, eps, weight_decay)
        3. Initialize first and second moment buffers

        PARAMETERS:
        - lr: Learning rate (default: 0.001)
        - betas: Coefficients for computing running averages (default: (0.9, 0.999))
        - eps: Small constant for numerical stability (default: 1e-8)
        - weight_decay: L2 penalty coefficient (default: 0.0)

        EXAMPLE:
        >>> optimizer = Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
        """
        ### BEGIN SOLUTION
        super().__init__(params)

        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay

        # Initialize moment buffers (created lazily)
        self.m_buffers = [None for _ in self.params]  # First moment (mean)
        self.v_buffers = [None for _ in self.params]  # Second moment (variance)
        ### END SOLUTION

    def step(self):
        """
        Perform Adam update step.

        TODO: Implement Adam parameter update with adaptive learning rates

        APPROACH:
        1. For each parameter with gradients:
           a. Apply weight decay if specified
           b. Update first moment estimate (momentum of gradient)
           c. Update second moment estimate (momentum of squared gradient)
           d. Compute bias-corrected moments
           e. Update parameter using adaptive learning rate

        FORMULAS:
        - m_t = β₁ * m_{t-1} + (1-β₁) * g_t
        - v_t = β₂ * v_{t-1} + (1-β₂) * g_t²
        - m̂_t = m_t / (1-β₁^t)
        - v̂_t = v_t / (1-β₂^t)
        - θ_t = θ_{t-1} - lr * m̂_t / (√v̂_t + ε)

        HINTS:
        - Initialize buffers as zeros on first use
        - Use step_count for bias correction
        - Square gradients element-wise for second moment
        """
        ### BEGIN SOLUTION
        # Increment step counter first (needed for bias correction)
        self.step_count += 1

        for i, param in enumerate(self.params):
            if param.grad is None:
                continue

            # Get gradient (param.grad is already a numpy array)
            grad = param.grad

            # Apply weight decay
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param.data

            # Initialize buffers if needed
            if self.m_buffers[i] is None:
                self.m_buffers[i] = np.zeros_like(param.data)
                self.v_buffers[i] = np.zeros_like(param.data)

            # Update biased first moment estimate
            self.m_buffers[i] = self.beta1 * self.m_buffers[i] + (1 - self.beta1) * grad

            # Update biased second moment estimate
            self.v_buffers[i] = self.beta2 * self.v_buffers[i] + (1 - self.beta2) * (grad ** 2)

            # Compute bias correction
            bias_correction1 = 1 - self.beta1 ** self.step_count
            bias_correction2 = 1 - self.beta2 ** self.step_count

            # Compute bias-corrected moments
            m_hat = self.m_buffers[i] / bias_correction1
            v_hat = self.v_buffers[i] / bias_correction2

            # Update parameter
            param.data = param.data - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
        ### END SOLUTION
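
# --- Illustrative sketch: Adam's bias correction on the first step ---
# On step 1, m = (1-β₁)·g and v = (1-β₂)·g², so after bias correction
# m̂ = g and v̂ = g², and the update is roughly lr * sign(g) regardless of the
# gradient's magnitude. Assumes the same Tensor constructor signature as above.
def _example_adam_first_step():
    w = Tensor(np.array([1.0]), requires_grad=True)  # assumed constructor signature
    opt = Adam([w], lr=0.001)
    w.grad = np.array([10.0])  # stand-in for a computed gradient
    opt.step()
    # w ≈ 1.0 - 0.001 * 10.0 / (10.0 + 1e-8) ≈ 0.999
    return w.data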

# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 17
class AdamW(Optimizer):
    """
    AdamW optimizer with decoupled weight decay.

    AdamW decouples weight decay from the gradient-based update. Adam folds the
    L2 penalty into the gradient, so the penalty gets rescaled by the adaptive
    learning rate; decoupling avoids this, gives better regularization, and is
    the preferred variant for most applications.
    """

    def __init__(self, params: List[Tensor], lr: float = 0.001, betas: tuple = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01):
        """
        Initialize AdamW optimizer.

        TODO: Set up AdamW with decoupled weight decay

        APPROACH:
        1. Call parent constructor
        2. Store hyperparameters (note higher default weight_decay)
        3. Initialize moment buffers like Adam

        KEY DIFFERENCE from Adam:
        - Weight decay is applied directly to parameters, not added to gradients
        - This provides better regularization behavior

        EXAMPLE:
        >>> optimizer = AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
        """
        ### BEGIN SOLUTION
        super().__init__(params)

        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay

        # Initialize moment buffers (same as Adam)
        self.m_buffers = [None for _ in self.params]
        self.v_buffers = [None for _ in self.params]
        ### END SOLUTION

    def step(self):
        """
        Perform AdamW update step with decoupled weight decay.

        TODO: Implement AdamW parameter update

        APPROACH:
        1. For each parameter with gradients:
           a. Update moments using gradients (NOT modified by weight decay)
           b. Compute bias-corrected moments
           c. Apply gradient-based update
           d. Apply weight decay directly to parameters

        KEY DIFFERENCE from Adam:
        - Weight decay: θ_t = θ_t - lr * weight_decay * θ_t (applied after the gradient update)
        - NOT: grad = grad + weight_decay * param (Adam's coupled L2 approach)

        FORMULAS:
        - Same moment updates as Adam (using unmodified gradients)
        - Gradient update: θ_t = θ_{t-1} - lr * m̂_t / (√v̂_t + ε)
        - Weight decay: θ_t = θ_t * (1 - lr * weight_decay)

        HINT: Apply weight decay after the gradient update for proper decoupling
        """
        ### BEGIN SOLUTION
        # Increment step counter first
        self.step_count += 1

        for i, param in enumerate(self.params):
            if param.grad is None:
                continue

            # Get gradient (NOT modified by weight decay) - param.grad is already a numpy array
            grad = param.grad

            # Initialize buffers if needed
            if self.m_buffers[i] is None:
                self.m_buffers[i] = np.zeros_like(param.data)
                self.v_buffers[i] = np.zeros_like(param.data)

            # Update moments using pure gradients
            self.m_buffers[i] = self.beta1 * self.m_buffers[i] + (1 - self.beta1) * grad
            self.v_buffers[i] = self.beta2 * self.v_buffers[i] + (1 - self.beta2) * (grad ** 2)

            # Compute bias correction
            bias_correction1 = 1 - self.beta1 ** self.step_count
            bias_correction2 = 1 - self.beta2 ** self.step_count

            # Compute bias-corrected moments
            m_hat = self.m_buffers[i] / bias_correction1
            v_hat = self.v_buffers[i] / bias_correction2

            # Apply gradient-based update
            param.data = param.data - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

            # Apply decoupled weight decay
            if self.weight_decay != 0:
                param.data = param.data * (1 - self.lr * self.weight_decay)
        ### END SOLUTION
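
# --- Illustrative sketch: decoupled weight decay in AdamW ---
# The gradient-based update uses the raw gradient; the shrinkage
# param *= (1 - lr * weight_decay) is applied separately, so the penalty is
# not rescaled by the adaptive 1/√v̂ factor as it would be in Adam.
# Assumes the same Tensor constructor signature as the sketches above.
def _example_adamw_step():
    w = Tensor(np.array([1.0]), requires_grad=True)  # assumed constructor signature
    opt = AdamW([w], lr=0.001, weight_decay=0.01)
    w.grad = np.array([10.0])  # stand-in for a computed gradient
    opt.step()
    # Gradient step: ≈ 1.0 - 0.001 = 0.999; then shrinkage: 0.999 * (1 - 0.001 * 0.01)
    return w.data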