# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/06_optimizers/optimizers_dev.ipynb.
# %% auto 0
__all__ = ['Optimizer', 'SGD', 'Adam', 'AdamW']
# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 1
import numpy as np
from typing import List, Union, Optional, Dict, Any
# Import Tensor from Module 01 (now with gradient support from Module 05)
from .tensor import Tensor
# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 5
class Optimizer:
    """
    Base class for all optimizers.

    This class defines the common interface that all optimizers must implement:
    - zero_grad(): Clear gradients from parameters
    - step(): Update parameters based on gradients
    """
    def __init__(self, params: List[Tensor]):
        """
        Initialize optimizer with parameters to optimize.

        TODO: Set up the parameter list for optimization

        APPROACH:
        1. Store parameters as a list for iteration
        2. Validate that all parameters require gradients
        3. Initialize step counter for algorithms that need it

        EXAMPLE:
        >>> linear = Linear(784, 128)
        >>> optimizer = SGD(linear.parameters(), lr=0.01)

        HINT: Check that each parameter has requires_grad=True
        """
        ### BEGIN SOLUTION
        # Validate and store parameters
        if not isinstance(params, list):
            params = list(params)
        # Check that parameters require gradients
        for i, param in enumerate(params):
            if not isinstance(param, Tensor):
                raise TypeError(f"Parameter {i} must be a Tensor, got {type(param)}")
            if not param.requires_grad:
                raise ValueError(f"Parameter {i} does not require gradients. Set requires_grad=True.")
        self.params = params
        self.step_count = 0  # For algorithms that need step counting
        ### END SOLUTION

    def zero_grad(self):
        """
        Clear gradients from all parameters.

        TODO: Reset all parameter gradients to None

        APPROACH:
        1. Iterate through all parameters
        2. Set each parameter's grad to None

        EXAMPLE:
        >>> optimizer.zero_grad()  # Clears all gradients
        >>> assert all(param.grad is None for param in optimizer.params)

        WHY: Gradients accumulate by default, so we need to clear them between batches
        """
        ### BEGIN SOLUTION
        for param in self.params:
            param.grad = None
        ### END SOLUTION

    def step(self):
        """
        Update parameters based on gradients.

        This is abstract - each optimizer implements its own update rule.
        """
        raise NotImplementedError("Subclasses must implement step()")
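
# Illustrative sketch of the Optimizer contract (runs only when this file is
# executed directly). It assumes the Tensor constructor accepts
# Tensor(data, requires_grad=...) and that .grad holds a plain numpy array, as
# the update code in this module does; adjust if your Tensor API differs.
if __name__ == "__main__":
    w = Tensor(np.array([1.0, 2.0, 3.0]), requires_grad=True)
    opt = Optimizer([w])
    w.grad = np.array([0.1, 0.1, 0.1])  # pretend backward() populated this
    opt.zero_grad()                      # gradients must be cleared between batches
    assert all(p.grad is None for p in opt.params)
    try:
        opt.step()                       # the base class never updates parameters
    except NotImplementedError:
        pass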
# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 9
class SGD(Optimizer):
    """
    Stochastic Gradient Descent with momentum.

    SGD is the foundational optimization algorithm that moves parameters
    in the direction opposite to gradients. With momentum, it remembers
    previous updates to reduce oscillations and accelerate convergence.
    """
    def __init__(self, params: List[Tensor], lr: float = 0.01, momentum: float = 0.0, weight_decay: float = 0.0):
        """
        Initialize SGD optimizer.

        TODO: Set up SGD with momentum and weight decay

        APPROACH:
        1. Call parent constructor to set up parameters
        2. Store learning rate, momentum, and weight decay
        3. Initialize momentum buffers for each parameter

        EXAMPLE:
        >>> optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)

        HINTS:
        - Momentum buffers should be initialized as None
        - They'll be created lazily on first step
        """
        ### BEGIN SOLUTION
        super().__init__(params)
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        # Initialize momentum buffers (created lazily)
        self.momentum_buffers = [None for _ in self.params]
        ### END SOLUTION

    def step(self):
        """
        Perform SGD update step with momentum.

        TODO: Implement SGD parameter update with momentum

        APPROACH:
        1. For each parameter with gradients:
           a. Apply weight decay if specified
           b. Update momentum buffer
           c. Update parameter using momentum

        FORMULA:
        - With weight decay: grad = grad + weight_decay * param
        - Momentum: v = momentum * v_prev + grad
        - Update: param = param - lr * v

        HINTS:
        - Skip parameters without gradients
        - Initialize momentum buffers on first use
        - Use in-place operations to save memory
        """
        ### BEGIN SOLUTION
        for i, param in enumerate(self.params):
            if param.grad is None:
                continue
            # Get gradient (param.grad is already a numpy array)
            grad = param.grad
            # Apply weight decay
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param.data
            # Update momentum buffer
            if self.momentum != 0:
                if self.momentum_buffers[i] is None:
                    # Initialize momentum buffer
                    self.momentum_buffers[i] = np.zeros_like(param.data)
                # Update momentum: v = momentum * v_prev + grad
                self.momentum_buffers[i] = self.momentum * self.momentum_buffers[i] + grad
                grad = self.momentum_buffers[i]
            # Update parameter: param = param - lr * grad
            param.data = param.data - self.lr * grad
        # Increment step counter
        self.step_count += 1
        ### END SOLUTION
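
# Worked check of the SGD-with-momentum formulas above (runs only when this
# file is executed directly; same hedged Tensor(data, requires_grad=...)
# assumption as the sketch after the Optimizer class).
if __name__ == "__main__":
    p = Tensor(np.array([1.0]), requires_grad=True)
    sgd = SGD([p], lr=0.1, momentum=0.9)
    # Two steps with a constant gradient g = 1.0
    p.grad = np.array([1.0])
    sgd.step()   # v1 = 1.0             -> p = 1.0 - 0.1 * 1.0 = 0.9
    p.grad = np.array([1.0])
    sgd.step()   # v2 = 0.9 * 1.0 + 1.0 -> p = 0.9 - 0.1 * 1.9 = 0.71
    assert np.allclose(p.data, [0.71])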
# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 13
class Adam(Optimizer):
    """
    Adam optimizer with adaptive learning rates.

    Adam computes individual adaptive learning rates for different parameters
    from estimates of first and second moments of the gradients.
    This makes it effective for problems with sparse gradients or noisy data.
    """
    def __init__(self, params: List[Tensor], lr: float = 0.001, betas: tuple = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.0):
        """
        Initialize Adam optimizer.

        TODO: Set up Adam with adaptive learning rates

        APPROACH:
        1. Call parent constructor
        2. Store hyperparameters (lr, betas, eps, weight_decay)
        3. Initialize first and second moment buffers

        PARAMETERS:
        - lr: Learning rate (default: 0.001)
        - betas: Coefficients for computing running averages (default: (0.9, 0.999))
        - eps: Small constant for numerical stability (default: 1e-8)
        - weight_decay: L2 penalty coefficient (default: 0.0)

        EXAMPLE:
        >>> optimizer = Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
        """
        ### BEGIN SOLUTION
        super().__init__(params)
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        # Initialize moment buffers (created lazily)
        self.m_buffers = [None for _ in self.params]  # First moment (mean)
        self.v_buffers = [None for _ in self.params]  # Second moment (variance)
        ### END SOLUTION

    def step(self):
        """
        Perform Adam update step.

        TODO: Implement Adam parameter update with adaptive learning rates

        APPROACH:
        1. For each parameter with gradients:
           a. Apply weight decay if specified
           b. Update first moment estimate (momentum of gradient)
           c. Update second moment estimate (momentum of squared gradient)
           d. Compute bias-corrected moments
           e. Update parameter using adaptive learning rate

        FORMULAS:
        - m_t = β₁ * m_{t-1} + (1-β₁) * g_t
        - v_t = β₂ * v_{t-1} + (1-β₂) * g_t²
        - m̂_t = m_t / (1-β₁^t)
        - v̂_t = v_t / (1-β₂^t)
        - θ_t = θ_{t-1} - lr * m̂_t / (√v̂_t + ε)

        HINTS:
        - Initialize buffers as zeros on first use
        - Use step_count for bias correction
        - Square gradients element-wise for second moment
        """
        ### BEGIN SOLUTION
        # Increment step counter first (needed for bias correction)
        self.step_count += 1
        for i, param in enumerate(self.params):
            if param.grad is None:
                continue
            # Get gradient (param.grad is already a numpy array)
            grad = param.grad
            # Apply weight decay
            if self.weight_decay != 0:
                grad = grad + self.weight_decay * param.data
            # Initialize buffers if needed
            if self.m_buffers[i] is None:
                self.m_buffers[i] = np.zeros_like(param.data)
                self.v_buffers[i] = np.zeros_like(param.data)
            # Update biased first moment estimate
            self.m_buffers[i] = self.beta1 * self.m_buffers[i] + (1 - self.beta1) * grad
            # Update biased second moment estimate
            self.v_buffers[i] = self.beta2 * self.v_buffers[i] + (1 - self.beta2) * (grad ** 2)
            # Compute bias correction
            bias_correction1 = 1 - self.beta1 ** self.step_count
            bias_correction2 = 1 - self.beta2 ** self.step_count
            # Compute bias-corrected moments
            m_hat = self.m_buffers[i] / bias_correction1
            v_hat = self.v_buffers[i] / bias_correction2
            # Update parameter
            param.data = param.data - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
        ### END SOLUTION
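
# Worked check of Adam's bias correction on the very first step: m_1 = (1-β₁)·g
# and the correction term 1-β₁ cancel, so m̂ = g and v̂ = g², and the first
# update is lr·g/(|g|+ε) ≈ lr·sign(g) regardless of the gradient's magnitude.
# Runs only when executed directly; same Tensor-constructor assumption as the
# earlier sketches.
if __name__ == "__main__":
    p = Tensor(np.array([1.0, -1.0]), requires_grad=True)
    adam = Adam([p], lr=0.001)
    p.grad = np.array([0.5, -0.25])
    adam.step()
    # Each weight moved by roughly lr, opposite to its gradient's sign.
    assert np.allclose(p.data, [1.0 - 0.001, -1.0 + 0.001], atol=1e-6)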
# %% ../../modules/source/06_optimizers/optimizers_dev.ipynb 17
class AdamW(Optimizer):
    """
    AdamW optimizer with decoupled weight decay.

    AdamW decouples weight decay from the gradient-based update, unlike Adam,
    which folds the decay into the gradient as an L2 penalty. This gives
    better-behaved regularization and is the preferred variant for most
    applications.
    """
    def __init__(self, params: List[Tensor], lr: float = 0.001, betas: tuple = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01):
        """
        Initialize AdamW optimizer.

        TODO: Set up AdamW with decoupled weight decay

        APPROACH:
        1. Call parent constructor
        2. Store hyperparameters (note higher default weight_decay)
        3. Initialize moment buffers like Adam

        KEY DIFFERENCE from Adam:
        - Weight decay is applied directly to parameters, not added to gradients
        - This provides better regularization behavior

        EXAMPLE:
        >>> optimizer = AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
        """
        ### BEGIN SOLUTION
        super().__init__(params)
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        # Initialize moment buffers (same as Adam)
        self.m_buffers = [None for _ in self.params]
        self.v_buffers = [None for _ in self.params]
        ### END SOLUTION

    def step(self):
        """
        Perform AdamW update step with decoupled weight decay.

        TODO: Implement AdamW parameter update

        APPROACH:
        1. For each parameter with gradients:
           a. Update moments using gradients (NOT modified by weight decay)
           b. Compute bias-corrected moments
           c. Apply gradient-based update
           d. Apply weight decay directly to parameters

        KEY DIFFERENCE from Adam:
        - Weight decay: θ_t = θ_t - lr * weight_decay * θ_t (applied after gradient update)
        - NOT: grad = grad + weight_decay * param (Adam's coupled L2 approach)

        FORMULAS:
        - Same moment updates as Adam (using unmodified gradients)
        - Gradient update: θ_t = θ_{t-1} - lr * m̂_t / (√v̂_t + ε)
        - Weight decay: θ_t = θ_t * (1 - lr * weight_decay)

        HINT: Apply weight decay after gradient update for proper decoupling
        """
        ### BEGIN SOLUTION
        # Increment step counter first
        self.step_count += 1
        for i, param in enumerate(self.params):
            if param.grad is None:
                continue
            # Get gradient (NOT modified by weight decay) - param.grad is already a numpy array
            grad = param.grad
            # Initialize buffers if needed
            if self.m_buffers[i] is None:
                self.m_buffers[i] = np.zeros_like(param.data)
                self.v_buffers[i] = np.zeros_like(param.data)
            # Update moments using pure gradients
            self.m_buffers[i] = self.beta1 * self.m_buffers[i] + (1 - self.beta1) * grad
            self.v_buffers[i] = self.beta2 * self.v_buffers[i] + (1 - self.beta2) * (grad ** 2)
            # Compute bias correction
            bias_correction1 = 1 - self.beta1 ** self.step_count
            bias_correction2 = 1 - self.beta2 ** self.step_count
            # Compute bias-corrected moments
            m_hat = self.m_buffers[i] / bias_correction1
            v_hat = self.v_buffers[i] / bias_correction2
            # Apply gradient-based update
            param.data = param.data - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
            # Apply decoupled weight decay
            if self.weight_decay != 0:
                param.data = param.data * (1 - self.lr * self.weight_decay)
        ### END SOLUTION
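
# Worked check of AdamW's decoupling: the moments see only the raw gradient,
# and the shrink factor (1 - lr*weight_decay) is applied to the parameter after
# the gradient-based update. Runs only when executed directly; same
# Tensor-constructor assumption as the earlier sketches.
if __name__ == "__main__":
    p = Tensor(np.array([2.0]), requires_grad=True)
    adamw = AdamW([p], lr=0.001, weight_decay=0.01)
    p.grad = np.array([0.5])
    adamw.step()
    # First step: Adam-style move of ~lr (bias correction cancels, as above),
    # then multiplication by the decoupled decay factor.
    expected = (2.0 - 0.001 * 0.5 / (np.sqrt(0.5 ** 2) + 1e-8)) * (1 - 0.001 * 0.01)
    assert np.allclose(p.data, [expected])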