Mirror of https://github.com/MLSysBook/TinyTorch.git (synced 2026-04-27 23:57:31 -05:00)
🎯 Major Accomplishments:
• ✅ All 15 module dev files validated and unit tests passing
• ✅ Comprehensive integration tests (11/11 pass)
• ✅ All 3 examples working with PyTorch-like API (XOR, MNIST, CIFAR-10)
• ✅ Training capability verified (4/4 tests pass, XOR shows 35.8% improvement)
• ✅ Clean directory structure (modules/source/ → modules/)

🧹 Repository Cleanup:
• Removed experimental/debug files and old logos
• Deleted redundant documentation (API_SIMPLIFICATION_COMPLETE.md, etc.)
• Removed empty module directories and backup files
• Streamlined examples (kept modern API versions only)
• Cleaned up old TinyGPT implementation (moved to examples concept)

📊 Validation Results:
• Module unit tests: 15/15 ✅
• Integration tests: 11/11 ✅
• Example validation: 3/3 ✅
• Training validation: 4/4 ✅

🔧 Key Fixes:
• Fixed activations module requires_grad test
• Fixed networks module layer name test (Dense → Linear)
• Fixed spatial module Conv2D weights attribute issues
• Updated all documentation to reflect new structure

📁 Structure Improvements:
• Simplified modules/source/ → modules/ (removed unnecessary nesting)
• Added comprehensive validation test suites
• Created VALIDATION_COMPLETE.md and WORKING_MODULES.md documentation
• Updated book structure to reflect ML evolution story

🚀 System Status: READY FOR PRODUCTION
All components validated, examples working, training capability verified. Test-first approach successfully implemented and proven.
1584 lines
60 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/09_optimizers/optimizers_dev.ipynb.
|
|
|
|
# %% auto 0
|
|
__all__ = ['setup_import_paths', 'gradient_descent_step', 'SGD', 'Adam', 'StepLR', 'OptimizerConvergenceProfiler',
|
|
'AdvancedOptimizerFeatures']
|
|
|
|
# %% ../../modules/source/09_optimizers/optimizers_dev.ipynb 1
|
|
import numpy as np
|
|
import sys
|
|
import os
|
|
from typing import List, Dict, Any, Optional, Union
|
|
from collections import defaultdict
|
|
|
|
# Helper function to set up import paths
|
|
def setup_import_paths():
|
|
"""Set up import paths for development modules."""
|
|
import sys
|
|
import os
|
|
|
|
# Add module directories to path
|
|
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
tensor_dir = os.path.join(base_dir, '01_tensor')
|
|
autograd_dir = os.path.join(base_dir, '07_autograd')
|
|
|
|
if tensor_dir not in sys.path:
|
|
sys.path.append(tensor_dir)
|
|
if autograd_dir not in sys.path:
|
|
sys.path.append(autograd_dir)
|
|
|
|
# Import our existing components
|
|
try:
|
|
from tinytorch.core.tensor import Tensor
|
|
from tinytorch.core.autograd import Variable
|
|
except ImportError:
|
|
# For development, try local imports
|
|
try:
|
|
setup_import_paths()
|
|
from tensor_dev import Tensor
|
|
from autograd_dev import Variable
|
|
except ImportError:
|
|
# Create minimal fallback classes for testing
|
|
print("Warning: Using fallback classes for testing")
|
|
|
|
class Tensor:
|
|
def __init__(self, data):
|
|
self.data = np.array(data)
|
|
self.shape = self.data.shape
|
|
|
|
def __str__(self):
|
|
return f"Tensor({self.data})"
|
|
|
|
class Variable:
|
|
def __init__(self, data, requires_grad=True):
|
|
if isinstance(data, (int, float)):
|
|
self.data = Tensor([data])
|
|
else:
|
|
self.data = Tensor(data)
|
|
self.requires_grad = requires_grad
|
|
self.grad = None
|
|
|
|
def zero_grad(self):
|
|
self.grad = None
|
|
|
|
def __str__(self):
|
|
return f"Variable({self.data.data})"
|
|
|
|
# %% ../../modules/source/09_optimizers/optimizers_dev.ipynb 7
|
|
def gradient_descent_step(parameter: Variable, learning_rate: float) -> None:
|
|
"""
|
|
Perform one step of gradient descent on a parameter.
|
|
|
|
Args:
|
|
parameter: Variable with gradient information
|
|
learning_rate: How much to update parameter
|
|
|
|
TODO: Implement basic gradient descent parameter update.
|
|
|
|
STEP-BY-STEP IMPLEMENTATION:
|
|
1. Check if parameter has a gradient
|
|
2. Get current parameter value and gradient
|
|
3. Update parameter: new_value = old_value - learning_rate * gradient
|
|
4. Update parameter data with new value
|
|
5. Handle edge cases (no gradient, invalid values)
|
|
|
|
EXAMPLE USAGE:
|
|
```python
|
|
# Parameter with gradient
|
|
w = Variable(2.0, requires_grad=True)
|
|
w.grad = Variable(0.5) # Gradient from loss
|
|
|
|
# Update parameter
|
|
gradient_descent_step(w, learning_rate=0.1)
|
|
# w.data now contains: 2.0 - 0.1 * 0.5 = 1.95
|
|
```
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Check if parameter.grad is not None
|
|
- Use parameter.grad.data.data to get gradient value
|
|
- Update parameter.data with new Tensor
|
|
- Don't modify gradient (it's used for logging)
|
|
|
|
LEARNING CONNECTIONS:
|
|
- This is the foundation of all neural network training
|
|
- PyTorch's optimizer.step() does exactly this
|
|
- The learning rate determines convergence speed
|
|
"""
|
|
### BEGIN SOLUTION
|
|
if parameter.grad is not None:
|
|
# Get current parameter value and gradient
|
|
current_value = parameter.data.data
|
|
gradient_value = parameter.grad.data.data
|
|
|
|
# Update parameter: new_value = old_value - learning_rate * gradient
|
|
new_value = current_value - learning_rate * gradient_value
|
|
|
|
# Update parameter data
|
|
parameter.data = Tensor(new_value)
|
|
### END SOLUTION
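# Illustrative sketch (not part of the exported module): a minimal sanity check of
# gradient_descent_step, assuming the fallback Variable/Tensor classes defined above
# (the real tinytorch classes may expose a different constructor).
def _example_gradient_descent_step():
    w = Variable(2.0, requires_grad=True)
    w.grad = Variable(0.5)                       # pretend the loss produced this gradient
    gradient_descent_step(w, learning_rate=0.1)
    return w.data                                # Tensor([1.95]) = 2.0 - 0.1 * 0.5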
|
|
|
|
# %% ../../modules/source/09_optimizers/optimizers_dev.ipynb 11
|
|
class SGD:
|
|
"""
|
|
SGD Optimizer with Momentum
|
|
|
|
Implements stochastic gradient descent with momentum:
|
|
v_t = momentum * v_{t-1} + gradient
|
|
parameter = parameter - learning_rate * v_t
|
|
"""
|
|
|
|
def __init__(self, parameters: List[Variable], learning_rate: float = 0.01,
|
|
momentum: float = 0.0, weight_decay: float = 0.0):
|
|
"""
|
|
Initialize SGD optimizer.
|
|
|
|
Args:
|
|
parameters: List of Variables to optimize
|
|
learning_rate: Learning rate (default: 0.01)
|
|
momentum: Momentum coefficient (default: 0.0)
|
|
weight_decay: L2 regularization coefficient (default: 0.0)
|
|
|
|
TODO: Implement SGD optimizer initialization.
|
|
|
|
APPROACH:
|
|
1. Store parameters and hyperparameters
|
|
2. Initialize momentum buffers for each parameter
|
|
3. Set up state tracking for optimization
|
|
4. Prepare for step() and zero_grad() methods
|
|
|
|
EXAMPLE:
|
|
```python
|
|
# Create optimizer
|
|
optimizer = SGD([w1, w2, b1, b2], learning_rate=0.01, momentum=0.9)
|
|
|
|
# In training loop:
|
|
optimizer.zero_grad()
|
|
loss.backward()
|
|
optimizer.step()
|
|
```
|
|
|
|
HINTS:
|
|
- Store parameters as a list
|
|
- Initialize momentum buffers as empty dict
|
|
- Use parameter id() as key for momentum tracking
|
|
- Momentum buffers will be created lazily in step()
|
|
"""
|
|
### BEGIN SOLUTION
|
|
self.parameters = parameters
|
|
self.learning_rate = learning_rate
|
|
self.momentum = momentum
|
|
self.weight_decay = weight_decay
|
|
|
|
# Initialize momentum buffers (created lazily)
|
|
self.momentum_buffers = {}
|
|
|
|
# Track optimization steps
|
|
self.step_count = 0
|
|
### END SOLUTION
|
|
|
|
def step(self) -> None:
|
|
"""
|
|
Perform one optimization step.
|
|
|
|
TODO: Implement SGD parameter update with momentum.
|
|
|
|
APPROACH:
|
|
1. Iterate through all parameters
|
|
2. For each parameter with gradient:
|
|
a. Get current gradient
|
|
b. Apply weight decay if specified
|
|
c. Update momentum buffer (or create if first time)
|
|
d. Update parameter using momentum
|
|
3. Increment step count
|
|
|
|
MATHEMATICAL FORMULATION:
|
|
- If weight_decay > 0: gradient = gradient + weight_decay * parameter
|
|
- momentum_buffer = momentum * momentum_buffer + gradient
|
|
- parameter = parameter - learning_rate * momentum_buffer
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Use id(param) as key for momentum buffers
|
|
- Initialize buffer with zeros if not exists
|
|
- Handle case where momentum = 0 (no momentum)
|
|
- Update parameter.data with new Tensor
|
|
"""
|
|
### BEGIN SOLUTION
|
|
for param in self.parameters:
|
|
if param.grad is not None:
|
|
# Get gradient
|
|
gradient = param.grad.data.data
|
|
|
|
# Apply weight decay (L2 regularization)
|
|
if self.weight_decay > 0:
|
|
gradient = gradient + self.weight_decay * param.data.data
|
|
|
|
# Get or create momentum buffer
|
|
param_id = id(param)
|
|
if param_id not in self.momentum_buffers:
|
|
self.momentum_buffers[param_id] = np.zeros_like(param.data.data)
|
|
|
|
# Update momentum buffer
|
|
self.momentum_buffers[param_id] = (
|
|
self.momentum * self.momentum_buffers[param_id] + gradient
|
|
)
|
|
|
|
# Update parameter
|
|
# CRITICAL: Preserve original parameter shape - modify numpy array in-place
|
|
update = self.learning_rate * self.momentum_buffers[param_id]
|
|
param._data[:] = param.data - update
|
|
|
|
self.step_count += 1
|
|
### END SOLUTION
|
|
|
|
def zero_grad(self) -> None:
|
|
"""
|
|
Zero out gradients for all parameters.
|
|
|
|
TODO: Implement gradient zeroing.
|
|
|
|
APPROACH:
|
|
1. Iterate through all parameters
|
|
2. Set gradient to None for each parameter
|
|
3. This prepares for next backward pass
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Simply set param.grad = None
|
|
- This is called before loss.backward()
|
|
- Essential for proper gradient accumulation
|
|
"""
|
|
### BEGIN SOLUTION
|
|
for param in self.parameters:
|
|
param.grad = None
|
|
### END SOLUTION
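# Illustrative usage sketch (not part of the exported module). SGD.step() above writes
# updates through `param._data[:]` and reads values via `param.data.data`, so this demo
# uses small hypothetical stand-in classes with exactly that attribute layout rather
# than assuming a particular Variable implementation.
class _DemoValue:
    """Hypothetical Tensor-like wrapper: `.data` is the underlying numpy array."""
    def __init__(self, array):
        self.data = np.asarray(array, dtype=float)

    def __sub__(self, other):
        return self.data - other

class _DemoParam:
    """Hypothetical Variable-like parameter: `._data` and `.data.data` share one buffer."""
    def __init__(self, values):
        self._data = np.asarray(values, dtype=float)
        self.data = _DemoValue(self._data)
        self.grad = None

def _example_sgd_training_loop():
    # Minimize f(w) = w^2 by hand: the gradient is 2w, so with lr = 0.1 each SGD step
    # multiplies w by 0.8 (1.0 -> 0.8 -> 0.64 -> 0.512 after three steps).
    w = _DemoParam([1.0])
    optimizer = SGD([w], learning_rate=0.1, momentum=0.0)
    for _ in range(3):
        optimizer.zero_grad()
        w.grad = _DemoParam(2.0 * w.data.data)   # gradient of w**2
        optimizer.step()
    return w.data.data                           # approximately [0.512]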
|
|
|
|
# %% ../../modules/source/09_optimizers/optimizers_dev.ipynb 15
|
|
class Adam:
|
|
"""
|
|
Adam Optimizer
|
|
|
|
Implements Adam algorithm with adaptive learning rates:
|
|
- First moment: exponential moving average of gradients
|
|
- Second moment: exponential moving average of squared gradients
|
|
- Bias correction: accounts for initialization bias
|
|
- Adaptive updates: different learning rate per parameter
|
|
"""
|
|
|
|
def __init__(self, parameters: List[Variable], learning_rate: float = 0.001,
|
|
beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8,
|
|
weight_decay: float = 0.0):
|
|
"""
|
|
Initialize Adam optimizer.
|
|
|
|
Args:
|
|
parameters: List of Variables to optimize
|
|
learning_rate: Learning rate (default: 0.001)
|
|
beta1: Exponential decay rate for first moment (default: 0.9)
|
|
beta2: Exponential decay rate for second moment (default: 0.999)
|
|
epsilon: Small constant for numerical stability (default: 1e-8)
|
|
weight_decay: L2 regularization coefficient (default: 0.0)
|
|
|
|
TODO: Implement Adam optimizer initialization.
|
|
|
|
APPROACH:
|
|
1. Store parameters and hyperparameters
|
|
2. Initialize first moment buffers (m_t)
|
|
3. Initialize second moment buffers (v_t)
|
|
4. Set up step counter for bias correction
|
|
|
|
EXAMPLE:
|
|
```python
|
|
# Create Adam optimizer
|
|
optimizer = Adam([w1, w2, b1, b2], learning_rate=0.001)
|
|
|
|
# In training loop:
|
|
optimizer.zero_grad()
|
|
loss.backward()
|
|
optimizer.step()
|
|
```
|
|
|
|
HINTS:
|
|
- Store all hyperparameters
|
|
- Initialize moment buffers as empty dicts
|
|
- Use parameter id() as key for tracking
|
|
- Buffers will be created lazily in step()
|
|
"""
|
|
### BEGIN SOLUTION
|
|
self.parameters = parameters
|
|
self.learning_rate = learning_rate
|
|
self.beta1 = beta1
|
|
self.beta2 = beta2
|
|
self.epsilon = epsilon
|
|
self.weight_decay = weight_decay
|
|
|
|
# Initialize moment buffers (created lazily)
|
|
self.first_moment = {} # m_t
|
|
self.second_moment = {} # v_t
|
|
|
|
# Track optimization steps for bias correction
|
|
self.step_count = 0
|
|
### END SOLUTION
|
|
|
|
def step(self) -> None:
|
|
"""
|
|
Perform one optimization step using Adam algorithm.
|
|
|
|
TODO: Implement Adam parameter update.
|
|
|
|
APPROACH:
|
|
1. Increment step count
|
|
2. For each parameter with gradient:
|
|
a. Get current gradient
|
|
b. Apply weight decay if specified
|
|
c. Update first moment (momentum)
|
|
d. Update second moment (variance)
|
|
e. Apply bias correction
|
|
f. Update parameter with adaptive learning rate
|
|
|
|
MATHEMATICAL FORMULATION:
|
|
- m_t = beta1 * m_{t-1} + (1 - beta1) * gradient
|
|
- v_t = beta2 * v_{t-1} + (1 - beta2) * gradient^2
|
|
- m_hat = m_t / (1 - beta1^t)
|
|
- v_hat = v_t / (1 - beta2^t)
|
|
- parameter = parameter - learning_rate * m_hat / (sqrt(v_hat) + epsilon)
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Use id(param) as key for moment buffers
|
|
- Initialize buffers with zeros if not exists
|
|
- Use np.sqrt() for square root
|
|
- Handle numerical stability with epsilon
|
|
"""
|
|
### BEGIN SOLUTION
|
|
self.step_count += 1
|
|
|
|
for param in self.parameters:
|
|
if param.grad is not None:
|
|
# Get gradient
|
|
gradient = param.grad.data.data
|
|
|
|
# Apply weight decay (L2 regularization)
|
|
if self.weight_decay > 0:
|
|
gradient = gradient + self.weight_decay * param.data.data
|
|
|
|
# Get or create moment buffers
|
|
param_id = id(param)
|
|
if param_id not in self.first_moment:
|
|
self.first_moment[param_id] = np.zeros_like(param.data.data)
|
|
self.second_moment[param_id] = np.zeros_like(param.data.data)
|
|
|
|
# Update first moment (momentum)
|
|
self.first_moment[param_id] = (
|
|
self.beta1 * self.first_moment[param_id] +
|
|
(1 - self.beta1) * gradient
|
|
)
|
|
|
|
# Update second moment (variance)
|
|
self.second_moment[param_id] = (
|
|
self.beta2 * self.second_moment[param_id] +
|
|
(1 - self.beta2) * gradient * gradient
|
|
)
|
|
|
|
# Bias correction
|
|
first_moment_corrected = (
|
|
self.first_moment[param_id] / (1 - self.beta1 ** self.step_count)
|
|
)
|
|
second_moment_corrected = (
|
|
self.second_moment[param_id] / (1 - self.beta2 ** self.step_count)
|
|
)
|
|
|
|
# Update parameter with adaptive learning rate
|
|
# CRITICAL: Preserve original parameter shape - modify numpy array in-place
|
|
update = self.learning_rate * first_moment_corrected / (np.sqrt(second_moment_corrected) + self.epsilon)
|
|
param._data[:] = param.data - update
|
|
### END SOLUTION
|
|
|
|
def zero_grad(self) -> None:
|
|
"""
|
|
Zero out gradients for all parameters.
|
|
|
|
TODO: Implement gradient zeroing (same as SGD).
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Set param.grad = None for all parameters
|
|
- This is identical to SGD implementation
|
|
"""
|
|
### BEGIN SOLUTION
|
|
for param in self.parameters:
|
|
param.grad = None
|
|
### END SOLUTION
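# Illustrative sketch (not part of the exported module): worked numbers for the first
# Adam step (t = 1) with the default beta1=0.9, beta2=0.999, learning_rate=0.001 and a
# scalar gradient of 0.5. Bias correction rescales both moments so the very first
# update already has magnitude on the order of the learning rate.
def _example_adam_bias_correction():
    g = 0.5
    m = 0.9 * 0.0 + (1 - 0.9) * g            # first moment  = 0.05
    v = 0.999 * 0.0 + (1 - 0.999) * g * g    # second moment = 0.00025
    m_hat = m / (1 - 0.9 ** 1)               # 0.5 (recovers the raw gradient)
    v_hat = v / (1 - 0.999 ** 1)             # 0.25
    update = 0.001 * m_hat / (np.sqrt(v_hat) + 1e-8)
    return update                            # approximately 0.001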
|
|
|
|
# %% ../../modules/source/09_optimizers/optimizers_dev.ipynb 20
|
|
class StepLR:
|
|
"""
|
|
Step Learning Rate Scheduler
|
|
|
|
Decays learning rate by gamma every step_size epochs:
|
|
learning_rate = initial_lr * (gamma ^ (epoch // step_size))
|
|
"""
|
|
|
|
def __init__(self, optimizer: Union[SGD, Adam], step_size: int, gamma: float = 0.1):
|
|
"""
|
|
Initialize step learning rate scheduler.
|
|
|
|
Args:
|
|
optimizer: Optimizer to schedule
|
|
step_size: Number of epochs between decreases
|
|
gamma: Multiplicative factor for learning rate decay
|
|
|
|
TODO: Implement learning rate scheduler initialization.
|
|
|
|
APPROACH:
|
|
1. Store optimizer reference
|
|
2. Store scheduling parameters
|
|
3. Save initial learning rate
|
|
4. Initialize step counter
|
|
|
|
EXAMPLE:
|
|
```python
|
|
optimizer = SGD([w1, w2], learning_rate=0.1)
|
|
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
|
|
|
|
# In training loop:
|
|
for epoch in range(100):
|
|
train_one_epoch()
|
|
scheduler.step() # Update learning rate
|
|
```
|
|
|
|
HINTS:
|
|
- Store optimizer reference
|
|
- Save initial learning rate from optimizer
|
|
- Initialize step counter to 0
|
|
- gamma is the decay factor (0.1 = 10x reduction)
|
|
"""
|
|
### BEGIN SOLUTION
|
|
self.optimizer = optimizer
|
|
self.step_size = step_size
|
|
self.gamma = gamma
|
|
self.initial_lr = optimizer.learning_rate
|
|
self.step_count = 0
|
|
### END SOLUTION
|
|
|
|
def step(self) -> None:
|
|
"""
|
|
Update learning rate based on current step.
|
|
|
|
TODO: Implement learning rate update.
|
|
|
|
APPROACH:
|
|
1. Increment step counter
|
|
2. Calculate new learning rate using step decay formula
|
|
3. Update optimizer's learning rate
|
|
|
|
MATHEMATICAL FORMULATION:
|
|
new_lr = initial_lr * (gamma ^ ((step_count - 1) // step_size))
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Use // for integer division
|
|
- Use ** for exponentiation
|
|
- Update optimizer.learning_rate directly
|
|
"""
|
|
### BEGIN SOLUTION
|
|
self.step_count += 1
|
|
|
|
# Calculate new learning rate
|
|
decay_factor = self.gamma ** ((self.step_count - 1) // self.step_size)
|
|
new_lr = self.initial_lr * decay_factor
|
|
|
|
# Update optimizer's learning rate
|
|
self.optimizer.learning_rate = new_lr
|
|
### END SOLUTION
|
|
|
|
def get_lr(self) -> float:
|
|
"""
|
|
Get current learning rate.
|
|
|
|
TODO: Return current learning rate.
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Return optimizer.learning_rate
|
|
"""
|
|
### BEGIN SOLUTION
|
|
return self.optimizer.learning_rate
|
|
### END SOLUTION
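# Illustrative usage sketch (not part of the exported module): with step_size=10 and
# gamma=0.1 the learning rate stays at 0.1 for the first 10 scheduler steps, drops to
# ~0.01 for the next 10, then to ~0.001. An SGD instance with an empty parameter list
# is enough to drive the scheduler.
def _example_steplr_schedule():
    optimizer = SGD([], learning_rate=0.1)
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    schedule = []
    for _ in range(30):
        scheduler.step()
        schedule.append(scheduler.get_lr())
    return schedule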
|
|
|
|
# %% ../../modules/source/09_optimizers/optimizers_dev.ipynb 28
|
|
class OptimizerConvergenceProfiler:
|
|
"""
|
|
ML Systems Tool: Optimizer Performance and Convergence Analysis
|
|
|
|
Profiles convergence patterns, learning rate sensitivity, and computational costs
|
|
across different optimizers to guide production optimizer selection.
|
|
|
|
This is 60% implementation focusing on core analysis capabilities:
|
|
- Convergence rate comparison across optimizers
|
|
- Learning rate sensitivity analysis
|
|
- Gradient statistics tracking
|
|
- Memory usage estimation
|
|
- Performance recommendations
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""
|
|
Initialize optimizer convergence profiler.
|
|
|
|
TODO: Implement profiler initialization.
|
|
|
|
APPROACH:
|
|
1. Initialize tracking dictionaries for different metrics
|
|
2. Set up convergence analysis parameters
|
|
3. Prepare memory and performance tracking
|
|
4. Initialize recommendation engine components
|
|
|
|
PRODUCTION CONTEXT:
|
|
In production, this profiler would run on representative tasks to:
|
|
- Select optimal optimizers for new models
|
|
- Tune hyperparameters before expensive training runs
|
|
- Predict training time and resource requirements
|
|
- Monitor training stability and convergence
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Track convergence history per optimizer
|
|
- Store gradient statistics over time
|
|
- Monitor memory usage patterns
|
|
- Prepare for comparative analysis
|
|
"""
|
|
### BEGIN SOLUTION
|
|
# Convergence tracking
|
|
self.convergence_history = defaultdict(list) # {optimizer_name: [losses]}
|
|
self.gradient_norms = defaultdict(list) # {optimizer_name: [grad_norms]}
|
|
self.learning_rates = defaultdict(list) # {optimizer_name: [lr_values]}
|
|
self.step_times = defaultdict(list) # {optimizer_name: [step_durations]}
|
|
|
|
# Performance metrics
|
|
self.memory_usage = defaultdict(list) # {optimizer_name: [memory_estimates]}
|
|
self.convergence_rates = {} # {optimizer_name: convergence_rate}
|
|
self.stability_scores = {} # {optimizer_name: stability_score}
|
|
|
|
# Analysis parameters
|
|
self.convergence_threshold = 1e-6
|
|
self.stability_window = 10
|
|
self.gradient_explosion_threshold = 1e6
|
|
|
|
# Recommendations
|
|
self.optimizer_rankings = {}
|
|
self.hyperparameter_suggestions = {}
|
|
### END SOLUTION
|
|
|
|
def profile_optimizer_convergence(self, optimizer_name: str, optimizer: Union[SGD, Adam],
|
|
training_function, initial_loss: float,
|
|
max_steps: int = 100) -> Dict[str, Any]:
|
|
"""
|
|
Profile convergence behavior of an optimizer on a specific task.
|
|
|
|
Args:
|
|
optimizer_name: Name identifier for the optimizer
|
|
optimizer: Optimizer instance to profile
|
|
training_function: Function that performs one training step and returns loss
|
|
initial_loss: Starting loss value
|
|
max_steps: Maximum training steps to profile
|
|
|
|
Returns:
|
|
Dictionary containing convergence analysis results
|
|
|
|
TODO: Implement optimizer convergence profiling.
|
|
|
|
APPROACH:
|
|
1. Run training loop with the optimizer
|
|
2. Track loss, gradients, learning rates at each step
|
|
3. Measure step execution time
|
|
4. Estimate memory usage
|
|
5. Analyze convergence patterns and stability
|
|
6. Generate performance metrics
|
|
|
|
CONVERGENCE ANALYSIS:
|
|
- Track loss reduction over time
|
|
- Measure convergence rate (loss reduction per step)
|
|
- Detect convergence plateaus
|
|
- Identify gradient explosion or vanishing
|
|
- Assess training stability
|
|
|
|
PRODUCTION INSIGHTS:
|
|
This analysis helps determine:
|
|
- Which optimizers converge fastest for specific model types
|
|
- Optimal learning rates for different optimizers
|
|
- Memory vs performance trade-offs
|
|
- Training stability and robustness
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Use time.time() to measure step duration
|
|
- Calculate gradient norms across all parameters
|
|
- Track learning rate changes (for schedulers)
|
|
- Estimate memory from optimizer state size
|
|
"""
|
|
### BEGIN SOLUTION
|
|
import time
|
|
|
|
print(f"🔍 Profiling {optimizer_name} convergence...")
|
|
|
|
# Initialize tracking
|
|
losses = []
|
|
grad_norms = []
|
|
step_durations = []
|
|
lr_values = []
|
|
|
|
previous_loss = initial_loss
|
|
convergence_step = None
|
|
|
|
for step in range(max_steps):
|
|
step_start = time.time()
|
|
|
|
# Perform training step
|
|
try:
|
|
current_loss = training_function()
|
|
losses.append(current_loss)
|
|
|
|
# Calculate gradient norm
|
|
total_grad_norm = 0.0
|
|
param_count = 0
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
grad_data = param.grad.data.data
|
|
if hasattr(grad_data, 'flatten'):
|
|
grad_norm = np.linalg.norm(grad_data.flatten())
|
|
else:
|
|
grad_norm = abs(float(grad_data))
|
|
total_grad_norm += grad_norm ** 2
|
|
param_count += 1
|
|
|
|
if param_count > 0:
|
|
total_grad_norm = (total_grad_norm / param_count) ** 0.5
|
|
grad_norms.append(total_grad_norm)
|
|
|
|
# Track learning rate
|
|
lr_values.append(optimizer.learning_rate)
|
|
|
|
# Check convergence
|
|
if convergence_step is None and abs(current_loss - previous_loss) < self.convergence_threshold:
|
|
convergence_step = step
|
|
|
|
previous_loss = current_loss
|
|
|
|
except Exception as e:
|
|
print(f"⚠️ Training step {step} failed: {e}")
|
|
break
|
|
|
|
step_end = time.time()
|
|
step_durations.append(step_end - step_start)
|
|
|
|
# Early stopping for exploded gradients
|
|
if total_grad_norm > self.gradient_explosion_threshold:
|
|
print(f"⚠️ Gradient explosion detected at step {step}")
|
|
break
|
|
|
|
# Store results
|
|
self.convergence_history[optimizer_name] = losses
|
|
self.gradient_norms[optimizer_name] = grad_norms
|
|
self.learning_rates[optimizer_name] = lr_values
|
|
self.step_times[optimizer_name] = step_durations
|
|
|
|
# Analyze results
|
|
analysis = self._analyze_convergence_profile(optimizer_name, losses, grad_norms,
|
|
step_durations, convergence_step)
|
|
|
|
return analysis
|
|
### END SOLUTION
|
|
|
|
def compare_optimizers(self, profiles: Dict[str, Dict]) -> Dict[str, Any]:
|
|
"""
|
|
Compare multiple optimizer profiles and generate recommendations.
|
|
|
|
Args:
|
|
profiles: Dictionary mapping optimizer names to their profile results
|
|
|
|
Returns:
|
|
Comprehensive comparison analysis with recommendations
|
|
|
|
TODO: Implement optimizer comparison and ranking.
|
|
|
|
APPROACH:
|
|
1. Analyze convergence speed across optimizers
|
|
2. Compare final performance and stability
|
|
3. Assess computational efficiency
|
|
4. Generate rankings and recommendations
|
|
5. Identify optimal hyperparameters
|
|
|
|
COMPARISON METRICS:
|
|
- Steps to convergence
|
|
- Final loss achieved
|
|
- Training stability (loss variance)
|
|
- Computational cost per step
|
|
- Memory efficiency
|
|
- Gradient explosion resistance
|
|
|
|
PRODUCTION VALUE:
|
|
This comparison guides:
|
|
- Optimizer selection for new projects
|
|
- Hyperparameter optimization strategies
|
|
- Resource allocation decisions
|
|
- Training pipeline design
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Normalize metrics for fair comparison
|
|
- Weight different factors based on importance
|
|
- Generate actionable recommendations
|
|
- Consider trade-offs between speed and stability
|
|
"""
|
|
### BEGIN SOLUTION
|
|
comparison = {
|
|
'convergence_speed': {},
|
|
'final_performance': {},
|
|
'stability': {},
|
|
'efficiency': {},
|
|
'rankings': {},
|
|
'recommendations': {}
|
|
}
|
|
|
|
print("📊 Comparing optimizer performance...")
|
|
|
|
# Analyze each optimizer
|
|
for opt_name, profile in profiles.items():
|
|
# Convergence speed
|
|
convergence_step = profile.get('convergence_step')
if convergence_step is None:
    convergence_step = len(self.convergence_history[opt_name])
|
|
comparison['convergence_speed'][opt_name] = convergence_step
|
|
|
|
# Final performance
|
|
losses = self.convergence_history[opt_name]
|
|
if losses:
|
|
final_loss = losses[-1]
|
|
comparison['final_performance'][opt_name] = final_loss
|
|
|
|
# Stability: inverse of the coefficient of variation over the last stability_window steps
|
|
if len(losses) >= self.stability_window:
|
|
recent_losses = losses[-self.stability_window:]
|
|
stability = 1.0 / (1.0 + np.std(recent_losses) / (np.mean(recent_losses) + 1e-8))
|
|
comparison['stability'][opt_name] = stability
|
|
|
|
# Efficiency (loss reduction per unit time)
|
|
step_times = self.step_times[opt_name]
|
|
if losses and step_times:
|
|
initial_loss = losses[0]
|
|
final_loss = losses[-1]
|
|
total_time = sum(step_times)
|
|
efficiency = (initial_loss - final_loss) / (total_time + 1e-8)
|
|
comparison['efficiency'][opt_name] = efficiency
|
|
|
|
# Generate rankings
|
|
metrics = ['convergence_speed', 'final_performance', 'stability', 'efficiency']
|
|
for metric in metrics:
|
|
if comparison[metric]:
|
|
if metric == 'convergence_speed':
|
|
# Lower is better for convergence speed
|
|
sorted_opts = sorted(comparison[metric].items(), key=lambda x: x[1])
|
|
elif metric == 'final_performance':
|
|
# Lower is better for final loss
|
|
sorted_opts = sorted(comparison[metric].items(), key=lambda x: x[1])
|
|
else:
|
|
# Higher is better for stability and efficiency
|
|
sorted_opts = sorted(comparison[metric].items(), key=lambda x: x[1], reverse=True)
|
|
|
|
comparison['rankings'][metric] = [opt for opt, _ in sorted_opts]
|
|
|
|
# Generate recommendations
|
|
recommendations = []
|
|
|
|
# Best overall optimizer
|
|
if comparison['rankings']:
|
|
# Simple scoring: rank position across metrics
|
|
scores = defaultdict(float)
|
|
for metric, ranking in comparison['rankings'].items():
|
|
for i, opt_name in enumerate(ranking):
|
|
scores[opt_name] += len(ranking) - i
|
|
|
|
best_optimizer = max(scores.items(), key=lambda x: x[1])[0]
|
|
recommendations.append(f"🏆 Best overall optimizer: {best_optimizer}")
|
|
|
|
# Specific recommendations
|
|
if 'convergence_speed' in comparison['rankings']:
|
|
fastest = comparison['rankings']['convergence_speed'][0]
|
|
recommendations.append(f"⚡ Fastest convergence: {fastest}")
|
|
|
|
if 'stability' in comparison['rankings']:
|
|
most_stable = comparison['rankings']['stability'][0]
|
|
recommendations.append(f"🎯 Most stable training: {most_stable}")
|
|
|
|
if 'efficiency' in comparison['rankings']:
|
|
most_efficient = comparison['rankings']['efficiency'][0]
|
|
recommendations.append(f"💰 Most compute-efficient: {most_efficient}")
|
|
|
|
comparison['recommendations']['summary'] = recommendations
|
|
|
|
return comparison
|
|
### END SOLUTION
|
|
|
|
def analyze_learning_rate_sensitivity(self, optimizer_class, learning_rates: List[float],
|
|
training_function, steps: int = 50) -> Dict[str, Any]:
|
|
"""
|
|
Analyze optimizer sensitivity to different learning rates.
|
|
|
|
Args:
|
|
optimizer_class: Optimizer class (SGD or Adam)
|
|
learning_rates: List of learning rates to test
|
|
training_function: Function that creates and runs training
|
|
steps: Number of training steps per learning rate
|
|
|
|
Returns:
|
|
Learning rate sensitivity analysis
|
|
|
|
TODO: Implement learning rate sensitivity analysis.
|
|
|
|
APPROACH:
|
|
1. Test optimizer with different learning rates
|
|
2. Measure convergence performance for each rate
|
|
3. Identify optimal learning rate range
|
|
4. Detect learning rate instability regions
|
|
5. Generate learning rate recommendations
|
|
|
|
SENSITIVITY ANALYSIS:
|
|
- Plot loss curves for different learning rates
|
|
- Identify optimal learning rate range
|
|
- Detect gradient explosion thresholds
|
|
- Measure convergence robustness
|
|
- Generate adaptive scheduling suggestions
|
|
|
|
PRODUCTION INSIGHTS:
|
|
This analysis enables:
|
|
- Automatic learning rate tuning
|
|
- Learning rate scheduling optimization
|
|
- Gradient explosion prevention
|
|
- Training stability improvement
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Reset model state for each learning rate test
|
|
- Track convergence metrics consistently
|
|
- Identify learning rate sweet spots
|
|
- Flag unstable learning rate regions
|
|
"""
|
|
### BEGIN SOLUTION
|
|
print("🔍 Analyzing learning rate sensitivity...")
|
|
|
|
lr_analysis = {
|
|
'learning_rates': learning_rates,
|
|
'final_losses': [],
|
|
'convergence_steps': [],
|
|
'stability_scores': [],
|
|
'gradient_explosions': [],
|
|
'optimal_range': None,
|
|
'recommendations': []
|
|
}
|
|
|
|
# Test each learning rate
|
|
for lr in learning_rates:
|
|
print(f" Testing learning rate: {lr}")
|
|
|
|
try:
|
|
# Create optimizer with current learning rate
|
|
# This is a simplified test - in production, would reset model state
|
|
losses, grad_norms = training_function(lr, steps)
|
|
|
|
if losses:
|
|
final_loss = losses[-1]
|
|
lr_analysis['final_losses'].append(final_loss)
|
|
|
|
# Find convergence step
|
|
convergence_step = steps
|
|
for i in range(1, len(losses)):
|
|
if abs(losses[i] - losses[i-1]) < self.convergence_threshold:
|
|
convergence_step = i
|
|
break
|
|
lr_analysis['convergence_steps'].append(convergence_step)
|
|
|
|
# Calculate stability
|
|
if len(losses) >= 10:
|
|
recent_losses = losses[-10:]
|
|
stability = 1.0 / (1.0 + np.std(recent_losses) / (np.mean(recent_losses) + 1e-8))
|
|
lr_analysis['stability_scores'].append(stability)
|
|
else:
|
|
lr_analysis['stability_scores'].append(0.0)
|
|
|
|
# Check for gradient explosion
|
|
max_grad_norm = max(grad_norms) if grad_norms else 0.0
|
|
explosion = max_grad_norm > self.gradient_explosion_threshold
|
|
lr_analysis['gradient_explosions'].append(explosion)
|
|
|
|
else:
|
|
# Failed to get losses
|
|
lr_analysis['final_losses'].append(float('inf'))
|
|
lr_analysis['convergence_steps'].append(steps)
|
|
lr_analysis['stability_scores'].append(0.0)
|
|
lr_analysis['gradient_explosions'].append(True)
|
|
|
|
except Exception as e:
|
|
print(f" ⚠️ Failed with lr={lr}: {e}")
|
|
lr_analysis['final_losses'].append(float('inf'))
|
|
lr_analysis['convergence_steps'].append(steps)
|
|
lr_analysis['stability_scores'].append(0.0)
|
|
lr_analysis['gradient_explosions'].append(True)
|
|
|
|
# Find optimal learning rate range
|
|
valid_indices = [i for i, (loss, explosion) in
|
|
enumerate(zip(lr_analysis['final_losses'], lr_analysis['gradient_explosions']))
|
|
if not explosion and loss != float('inf')]
|
|
|
|
if valid_indices:
|
|
# Find learning rate with best final loss among stable ones
|
|
stable_losses = [(i, lr_analysis['final_losses'][i]) for i in valid_indices]
|
|
best_idx = min(stable_losses, key=lambda x: x[1])[0]
|
|
|
|
# Define optimal range around best learning rate
|
|
best_lr = learning_rates[best_idx]
|
|
lr_analysis['optimal_range'] = (best_lr * 0.1, best_lr * 10.0)
|
|
|
|
# Generate recommendations
|
|
recommendations = []
|
|
recommendations.append(f"🎯 Optimal learning rate: {best_lr:.2e}")
|
|
recommendations.append(f"📈 Safe range: {lr_analysis['optimal_range'][0]:.2e} - {lr_analysis['optimal_range'][1]:.2e}")
|
|
|
|
# Learning rate scheduling suggestions
|
|
if best_idx > 0:
|
|
recommendations.append("💡 Consider starting with higher LR and decaying")
|
|
if any(lr_analysis['gradient_explosions']):
|
|
max_safe_lr = max([learning_rates[i] for i in valid_indices])
|
|
recommendations.append(f"⚠️ Avoid learning rates above {max_safe_lr:.2e}")
|
|
|
|
lr_analysis['recommendations'] = recommendations
|
|
else:
|
|
lr_analysis['recommendations'] = ["⚠️ No stable learning rates found - try lower values"]
|
|
|
|
return lr_analysis
|
|
### END SOLUTION
|
|
|
|
def estimate_memory_usage(self, optimizer: Union[SGD, Adam], num_parameters: int) -> Dict[str, float]:
|
|
"""
|
|
Estimate memory usage for different optimizers.
|
|
|
|
Args:
|
|
optimizer: Optimizer instance
|
|
num_parameters: Number of model parameters
|
|
|
|
Returns:
|
|
Memory usage estimates in MB
|
|
|
|
TODO: Implement memory usage estimation.
|
|
|
|
APPROACH:
|
|
1. Calculate parameter memory requirements
|
|
2. Estimate optimizer state memory
|
|
3. Account for gradient storage
|
|
4. Include temporary computation memory
|
|
5. Provide memory scaling predictions
|
|
|
|
MEMORY ANALYSIS:
|
|
- Parameter storage: num_params * 4 bytes (float32)
|
|
- Gradient storage: num_params * 4 bytes
|
|
- Optimizer state: varies by optimizer type
|
|
- SGD momentum: num_params * 4 bytes
|
|
- Adam: num_params * 8 bytes (first + second moments)
|
|
|
|
PRODUCTION VALUE:
|
|
Memory estimation helps:
|
|
- Select optimizers for memory-constrained environments
|
|
- Plan GPU memory allocation
|
|
- Scale to larger models
|
|
- Optimize batch sizes
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Use typical float32 size (4 bytes)
|
|
- Account for optimizer-specific state
|
|
- Include gradient accumulation overhead
|
|
- Provide scaling estimates
|
|
"""
|
|
### BEGIN SOLUTION
|
|
# Base memory requirements
|
|
bytes_per_param = 4 # float32
|
|
|
|
memory_breakdown = {
|
|
'parameters_mb': num_parameters * bytes_per_param / (1024 * 1024),
|
|
'gradients_mb': num_parameters * bytes_per_param / (1024 * 1024),
|
|
'optimizer_state_mb': 0.0,
|
|
'total_mb': 0.0
|
|
}
|
|
|
|
# Optimizer-specific state memory
|
|
if isinstance(optimizer, SGD):
|
|
if optimizer.momentum > 0:
|
|
# Momentum buffers
|
|
memory_breakdown['optimizer_state_mb'] = num_parameters * bytes_per_param / (1024 * 1024)
|
|
else:
|
|
memory_breakdown['optimizer_state_mb'] = 0.0
|
|
elif isinstance(optimizer, Adam):
|
|
# First and second moment estimates
|
|
memory_breakdown['optimizer_state_mb'] = num_parameters * 2 * bytes_per_param / (1024 * 1024)
|
|
|
|
# Calculate total
|
|
memory_breakdown['total_mb'] = (
|
|
memory_breakdown['parameters_mb'] +
|
|
memory_breakdown['gradients_mb'] +
|
|
memory_breakdown['optimizer_state_mb']
|
|
)
|
|
|
|
# Add efficiency estimates
|
|
memory_breakdown['memory_efficiency'] = memory_breakdown['parameters_mb'] / memory_breakdown['total_mb']
|
|
memory_breakdown['overhead_ratio'] = memory_breakdown['optimizer_state_mb'] / memory_breakdown['parameters_mb']
|
|
|
|
return memory_breakdown
|
|
### END SOLUTION
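# Worked numbers for the estimates above, assuming float32 (4 bytes) and a model with
# 1,000,000 parameters:
#   parameters ≈ 3.81 MB, gradients ≈ 3.81 MB
#   SGD with momentum adds one buffer (≈ 3.81 MB)  -> total ≈ 11.4 MB
#   Adam adds two moment buffers      (≈ 7.63 MB)  -> total ≈ 15.3 MB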
|
|
|
|
def generate_production_recommendations(self, analysis_results: Dict[str, Any]) -> List[str]:
|
|
"""
|
|
Generate actionable recommendations for production optimizer usage.
|
|
|
|
Args:
|
|
analysis_results: Combined results from convergence and sensitivity analysis
|
|
|
|
Returns:
|
|
List of production recommendations
|
|
|
|
TODO: Implement production recommendation generation.
|
|
|
|
APPROACH:
|
|
1. Analyze convergence patterns and stability
|
|
2. Consider computational efficiency requirements
|
|
3. Account for memory constraints
|
|
4. Generate optimizer selection guidance
|
|
5. Provide hyperparameter tuning suggestions
|
|
|
|
RECOMMENDATION CATEGORIES:
|
|
- Optimizer selection for different scenarios
|
|
- Learning rate and scheduling strategies
|
|
- Memory optimization techniques
|
|
- Training stability improvements
|
|
- Production deployment considerations
|
|
|
|
PRODUCTION CONTEXT:
|
|
These recommendations guide:
|
|
- ML engineer optimizer selection
|
|
- DevOps resource allocation
|
|
- Training pipeline optimization
|
|
- Cost reduction strategies
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Provide specific, actionable advice
|
|
- Consider different deployment scenarios
|
|
- Include quantitative guidelines
|
|
- Address common production challenges
|
|
"""
|
|
### BEGIN SOLUTION
|
|
recommendations = []
|
|
|
|
# Optimizer selection recommendations
|
|
recommendations.append("🔧 OPTIMIZER SELECTION GUIDE:")
|
|
recommendations.append(" • SGD + Momentum: Best for large batch training, proven stability")
|
|
recommendations.append(" • Adam: Best for rapid prototyping, adaptive learning rates")
|
|
recommendations.append(" • Consider memory constraints: SGD uses ~50% less memory than Adam")
|
|
|
|
# Learning rate recommendations
|
|
if 'learning_rate_analysis' in analysis_results:
|
|
lr_analysis = analysis_results['learning_rate_analysis']
|
|
if lr_analysis.get('optimal_range'):
|
|
opt_range = lr_analysis['optimal_range']
|
|
recommendations.append(f"📈 LEARNING RATE GUIDANCE:")
|
|
recommendations.append(f" • Start with: {opt_range[0]:.2e}")
|
|
recommendations.append(f" • Safe upper bound: {opt_range[1]:.2e}")
|
|
recommendations.append(" • Use learning rate scheduling for best results")
|
|
|
|
# Convergence recommendations
|
|
if 'convergence_comparison' in analysis_results:
|
|
comparison = analysis_results['convergence_comparison']
|
|
if 'recommendations' in comparison and 'summary' in comparison['recommendations']:
|
|
recommendations.append("🎯 CONVERGENCE OPTIMIZATION:")
|
|
for rec in comparison['recommendations']['summary']:
|
|
recommendations.append(f" • {rec}")
|
|
|
|
# Production deployment recommendations
|
|
recommendations.append("🚀 PRODUCTION DEPLOYMENT:")
|
|
recommendations.append(" • Monitor gradient norms to detect training instability")
|
|
recommendations.append(" • Implement gradient clipping for large models")
|
|
recommendations.append(" • Use learning rate warmup for transformer architectures")
|
|
recommendations.append(" • Consider mixed precision training to reduce memory usage")
|
|
|
|
# Scaling recommendations
|
|
recommendations.append("📊 SCALING CONSIDERATIONS:")
|
|
recommendations.append(" • Large batch training: Prefer SGD with linear learning rate scaling")
|
|
recommendations.append(" • Distributed training: Use synchronized optimizers")
|
|
recommendations.append(" • Memory-constrained: Choose SGD or use gradient accumulation")
|
|
recommendations.append(" • Fine-tuning: Use lower learning rates (10x-100x smaller)")
|
|
|
|
# Monitoring recommendations
|
|
recommendations.append("📈 MONITORING & DEBUGGING:")
|
|
recommendations.append(" • Track loss smoothness to detect learning rate issues")
|
|
recommendations.append(" • Monitor gradient norms for explosion/vanishing detection")
|
|
recommendations.append(" • Log learning rate schedules for reproducibility")
|
|
recommendations.append(" • Profile memory usage to optimize batch sizes")
|
|
|
|
return recommendations
|
|
### END SOLUTION
|
|
|
|
def _analyze_convergence_profile(self, optimizer_name: str, losses: List[float],
|
|
grad_norms: List[float], step_durations: List[float],
|
|
convergence_step: Optional[int]) -> Dict[str, Any]:
|
|
"""
|
|
Internal helper to analyze convergence profile data.
|
|
|
|
Args:
|
|
optimizer_name: Name of the optimizer
|
|
losses: List of loss values over training
|
|
grad_norms: List of gradient norms over training
|
|
step_durations: List of step execution times
|
|
convergence_step: Step where convergence was detected (if any)
|
|
|
|
Returns:
|
|
Analysis results dictionary
|
|
"""
|
|
### BEGIN SOLUTION
|
|
analysis = {
|
|
'optimizer_name': optimizer_name,
|
|
'total_steps': len(losses),
|
|
'convergence_step': convergence_step,
|
|
'final_loss': losses[-1] if losses else float('inf'),
|
|
'initial_loss': losses[0] if losses else float('inf'),
|
|
'loss_reduction': 0.0,
|
|
'convergence_rate': 0.0,
|
|
'stability_score': 0.0,
|
|
'average_step_time': 0.0,
|
|
'gradient_health': 'unknown'
|
|
}
|
|
|
|
if losses:
|
|
# Calculate loss reduction
|
|
initial_loss = losses[0]
|
|
final_loss = losses[-1]
|
|
analysis['loss_reduction'] = initial_loss - final_loss
|
|
|
|
# Calculate convergence rate (loss reduction per step)
|
|
if len(losses) > 1:
|
|
analysis['convergence_rate'] = analysis['loss_reduction'] / len(losses)
|
|
|
|
# Calculate stability (inverse of coefficient of variation)
|
|
if len(losses) >= self.stability_window:
|
|
recent_losses = losses[-self.stability_window:]
|
|
mean_loss = np.mean(recent_losses)
|
|
std_loss = np.std(recent_losses)
|
|
analysis['stability_score'] = 1.0 / (1.0 + std_loss / (mean_loss + 1e-8))
|
|
|
|
# Average step time
|
|
if step_durations:
|
|
analysis['average_step_time'] = np.mean(step_durations)
|
|
|
|
# Gradient health assessment
|
|
if grad_norms:
|
|
max_grad_norm = max(grad_norms)
|
|
avg_grad_norm = np.mean(grad_norms)
|
|
|
|
if max_grad_norm > self.gradient_explosion_threshold:
|
|
analysis['gradient_health'] = 'exploding'
|
|
elif avg_grad_norm < 1e-8:
|
|
analysis['gradient_health'] = 'vanishing'
|
|
elif np.std(grad_norms) / (avg_grad_norm + 1e-8) > 2.0:
|
|
analysis['gradient_health'] = 'unstable'
|
|
else:
|
|
analysis['gradient_health'] = 'healthy'
|
|
|
|
return analysis
|
|
### END SOLUTION
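# Illustrative usage sketch (not part of the exported module): drive the profiler with a
# hypothetical training_function whose loss decays geometrically by 10% per step. An SGD
# instance with an empty parameter list is enough to exercise the bookkeeping without a
# real model.
def _example_profile_convergence():
    profiler = OptimizerConvergenceProfiler()
    optimizer = SGD([], learning_rate=0.01)
    state = {'loss': 1.0}

    def fake_training_step():
        state['loss'] *= 0.9
        return state['loss']

    analysis = profiler.profile_optimizer_convergence(
        'sgd_demo', optimizer, fake_training_step, initial_loss=1.0, max_steps=20
    )
    return analysis['final_loss'], analysis['convergence_rate']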
|
|
|
|
# %% ../../modules/source/09_optimizers/optimizers_dev.ipynb 32
|
|
class AdvancedOptimizerFeatures:
|
|
"""
|
|
Advanced optimizer features for production ML systems.
|
|
|
|
Implements production-ready optimizer enhancements:
|
|
- Gradient clipping for stability
|
|
- Learning rate warmup strategies
|
|
- Gradient accumulation for large batches
|
|
- Mixed precision optimization patterns
|
|
- Distributed optimizer synchronization
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""
|
|
Initialize advanced optimizer features.
|
|
|
|
TODO: Implement advanced features initialization.
|
|
|
|
PRODUCTION CONTEXT:
|
|
These features are essential for:
|
|
- Training large language models (GPT, BERT)
|
|
- Computer vision at scale (ImageNet, COCO)
|
|
- Distributed training across multiple GPUs
|
|
- Memory-efficient training with limited resources
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Initialize gradient clipping parameters
|
|
- Set up warmup scheduling state
|
|
- Prepare accumulation buffers
|
|
- Configure synchronization patterns
|
|
"""
|
|
### BEGIN SOLUTION
|
|
# Gradient clipping
|
|
self.max_grad_norm = 1.0
|
|
self.clip_enabled = False
|
|
|
|
# Learning rate warmup
|
|
self.warmup_steps = 0
|
|
self.warmup_factor = 0.1
|
|
self.base_lr = 0.001
|
|
|
|
# Gradient accumulation
|
|
self.accumulation_steps = 1
|
|
self.accumulated_gradients = {}
|
|
self.accumulation_count = 0
|
|
|
|
# Mixed precision simulation
|
|
self.use_fp16 = False
|
|
self.loss_scale = 1.0
|
|
self.dynamic_loss_scaling = False
|
|
|
|
# Distributed training simulation
|
|
self.world_size = 1
|
|
self.rank = 0
|
|
### END SOLUTION
|
|
|
|
def apply_gradient_clipping(self, optimizer: Union[SGD, Adam], max_norm: float = 1.0) -> float:
|
|
"""
|
|
Apply gradient clipping to prevent gradient explosion.
|
|
|
|
Args:
|
|
optimizer: Optimizer with parameters to clip
|
|
max_norm: Maximum allowed gradient norm
|
|
|
|
Returns:
|
|
Actual gradient norm before clipping
|
|
|
|
TODO: Implement gradient clipping.
|
|
|
|
APPROACH:
|
|
1. Calculate total gradient norm across all parameters
|
|
2. If norm exceeds max_norm, scale all gradients down
|
|
3. Apply scaling factor to maintain gradient direction
|
|
4. Return original norm for monitoring
|
|
|
|
MATHEMATICAL FORMULATION:
|
|
total_norm = sqrt(sum(param_grad_norm^2 for all params))
|
|
if total_norm > max_norm:
|
|
clip_factor = max_norm / total_norm
|
|
for each param: param.grad *= clip_factor
|
|
|
|
PRODUCTION VALUE:
|
|
Gradient clipping is essential for:
|
|
- Training RNNs and Transformers
|
|
- Preventing training instability
|
|
- Enabling higher learning rates
|
|
- Improving convergence reliability
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Calculate global gradient norm
|
|
- Apply uniform scaling to all gradients
|
|
- Preserve gradient directions
|
|
- Return unclipped norm for logging
|
|
"""
|
|
### BEGIN SOLUTION
|
|
# Calculate total gradient norm
|
|
total_norm = 0.0
|
|
param_count = 0
|
|
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
grad_data = param.grad.data.data
|
|
if hasattr(grad_data, 'flatten'):
|
|
param_norm = np.linalg.norm(grad_data.flatten())
|
|
else:
|
|
param_norm = abs(float(grad_data))
|
|
total_norm += param_norm ** 2
|
|
param_count += 1
|
|
|
|
if param_count > 0:
|
|
total_norm = total_norm ** 0.5
|
|
else:
|
|
return 0.0
|
|
|
|
# Apply clipping if necessary
|
|
if total_norm > max_norm:
|
|
clip_factor = max_norm / total_norm
|
|
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
grad_data = param.grad.data.data
|
|
clipped_grad = grad_data * clip_factor
|
|
param.grad.data = Tensor(clipped_grad)
|
|
|
|
return total_norm
|
|
### END SOLUTION
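# Worked numbers for the clipping formula above: two parameters with gradient norms 3
# and 4 give a global norm of sqrt(3**2 + 4**2) = 5. With max_norm = 1 every gradient is
# scaled by clip_factor = 1 / 5 = 0.2, so a gradient [3, 4] becomes [0.6, 0.8]:
# direction preserved, magnitude bounded, and the returned value (5.0) is the
# pre-clipping norm for logging.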
|
|
|
|
def apply_warmup_schedule(self, optimizer: Union[SGD, Adam], step: int,
|
|
warmup_steps: int, base_lr: float) -> float:
|
|
"""
|
|
Apply learning rate warmup schedule.
|
|
|
|
Args:
|
|
optimizer: Optimizer to apply warmup to
|
|
step: Current training step
|
|
warmup_steps: Number of warmup steps
|
|
base_lr: Target learning rate after warmup
|
|
|
|
Returns:
|
|
Current learning rate
|
|
|
|
TODO: Implement learning rate warmup.
|
|
|
|
APPROACH:
|
|
1. If step < warmup_steps: gradually increase learning rate
|
|
2. Use linear or polynomial warmup schedule
|
|
3. Update optimizer's learning rate
|
|
4. Return current learning rate for logging
|
|
|
|
WARMUP STRATEGIES:
|
|
- Linear: lr = base_lr * (step / warmup_steps)
|
|
- Polynomial: lr = base_lr * ((step / warmup_steps) ^ power)
|
|
- Constant: lr = base_lr * warmup_factor for warmup_steps
|
|
|
|
PRODUCTION VALUE:
|
|
Warmup prevents:
|
|
- Early training instability
|
|
- Poor initialization effects
|
|
- Gradient explosion at start
|
|
- Suboptimal convergence paths
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Handle step=0 case (avoid division by zero)
|
|
- Use linear warmup for simplicity
|
|
- Update optimizer.learning_rate directly
|
|
- Smoothly transition to base learning rate
|
|
"""
|
|
### BEGIN SOLUTION
|
|
if step < warmup_steps and warmup_steps > 0:
|
|
# Linear warmup
|
|
warmup_factor = step / warmup_steps
|
|
current_lr = base_lr * warmup_factor
|
|
else:
|
|
# After warmup, use base learning rate
|
|
current_lr = base_lr
|
|
|
|
# Update optimizer learning rate
|
|
optimizer.learning_rate = current_lr
|
|
|
|
return current_lr
|
|
### END SOLUTION
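# Worked numbers for linear warmup with warmup_steps = 1000 and base_lr = 0.01:
#   step 0    -> lr = 0.0
#   step 500  -> lr = 0.005   (halfway through warmup)
#   step 1000 -> lr = 0.01    (warmup finished, base rate from here on)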
|
|
|
|
def accumulate_gradients(self, optimizer: Union[SGD, Adam], accumulation_steps: int) -> bool:
|
|
"""
|
|
Accumulate gradients to simulate larger batch sizes.
|
|
|
|
Args:
|
|
optimizer: Optimizer with parameters to accumulate
|
|
accumulation_steps: Number of steps to accumulate before update
|
|
|
|
Returns:
|
|
True if ready to perform optimizer step, False otherwise
|
|
|
|
TODO: Implement gradient accumulation.
|
|
|
|
APPROACH:
|
|
1. Add current gradients to accumulated gradient buffers
|
|
2. Increment accumulation counter
|
|
3. If counter reaches accumulation_steps:
|
|
a. Average accumulated gradients
|
|
b. Set as current gradients
|
|
c. Return True (ready for optimizer step)
|
|
d. Reset accumulation
|
|
4. Otherwise return False (continue accumulating)
|
|
|
|
MATHEMATICAL FORMULATION:
|
|
accumulated_grad += current_grad
|
|
if accumulation_count == accumulation_steps:
|
|
final_grad = accumulated_grad / accumulation_steps
|
|
reset accumulation
|
|
return True
|
|
|
|
PRODUCTION VALUE:
|
|
Gradient accumulation enables:
|
|
- Large effective batch sizes on limited memory
|
|
- Training large models on small GPUs
|
|
- Consistent training across different hardware
|
|
- Memory-efficient distributed training
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Store accumulated gradients per parameter
|
|
- Use parameter id() as key for tracking
|
|
- Average gradients before optimizer step
|
|
- Reset accumulation after each update
|
|
"""
|
|
### BEGIN SOLUTION
|
|
# Initialize accumulation if first time
|
|
if not hasattr(self, 'accumulation_count'):
|
|
self.accumulation_count = 0
|
|
self.accumulated_gradients = {}
|
|
|
|
# Accumulate gradients
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
param_id = id(param)
|
|
grad_data = param.grad.data.data
|
|
|
|
if param_id not in self.accumulated_gradients:
|
|
self.accumulated_gradients[param_id] = np.zeros_like(grad_data)
|
|
|
|
self.accumulated_gradients[param_id] += grad_data
|
|
|
|
self.accumulation_count += 1
|
|
|
|
# Check if ready to update
|
|
if self.accumulation_count >= accumulation_steps:
|
|
# Average accumulated gradients and set as current gradients
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
param_id = id(param)
|
|
if param_id in self.accumulated_gradients:
|
|
averaged_grad = self.accumulated_gradients[param_id] / accumulation_steps
|
|
param.grad.data = Tensor(averaged_grad)
|
|
|
|
# Reset accumulation
|
|
self.accumulation_count = 0
|
|
self.accumulated_gradients = {}
|
|
|
|
return True # Ready for optimizer step
|
|
|
|
return False # Continue accumulating
|
|
### END SOLUTION
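# Worked numbers for the accumulation scheme above with accumulation_steps = 4:
# micro-batch gradients 0.2, 0.4, 0.6 and 0.8 are summed to 2.0 over four calls; on the
# fourth call the buffer is divided by 4, so the optimizer steps with 0.5, the same
# mean gradient that a single 4x larger batch would have produced.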
|
|
|
|
def simulate_mixed_precision(self, optimizer: Union[SGD, Adam], loss_scale: float = 1.0) -> bool:
|
|
"""
|
|
Simulate mixed precision training effects.
|
|
|
|
Args:
|
|
optimizer: Optimizer to apply mixed precision to
|
|
loss_scale: Loss scaling factor for gradient preservation
|
|
|
|
Returns:
|
|
True if gradients are valid (no overflow), False if overflow detected
|
|
|
|
TODO: Implement mixed precision simulation.
|
|
|
|
APPROACH:
|
|
1. Scale gradients by loss_scale factor
|
|
2. Check for gradient overflow (inf or nan values)
|
|
3. If overflow detected, skip optimizer step
|
|
4. If valid, descale gradients before optimizer step
|
|
5. Return overflow status
|
|
|
|
MIXED PRECISION CONCEPTS:
|
|
- Use FP16 for forward pass (memory savings)
|
|
- Use FP32 for backward pass (numerical stability)
|
|
- Scale loss to prevent gradient underflow
|
|
- Check for overflow before optimization
|
|
|
|
PRODUCTION VALUE:
|
|
Mixed precision provides:
|
|
- 50% memory reduction
|
|
- Faster training on modern GPUs
|
|
- Maintained numerical stability
|
|
- Automatic overflow detection
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Scale gradients by loss_scale
|
|
- Check for inf/nan in gradients
|
|
- Descale before optimizer step
|
|
- Return overflow status for dynamic scaling
|
|
"""
|
|
### BEGIN SOLUTION
|
|
# Check for gradient overflow before scaling
|
|
has_overflow = False
|
|
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
grad_data = param.grad.data.data
|
|
if hasattr(grad_data, 'flatten'):
|
|
grad_flat = grad_data.flatten()
|
|
if np.any(np.isinf(grad_flat)) or np.any(np.isnan(grad_flat)):
|
|
has_overflow = True
|
|
break
|
|
else:
|
|
if np.isinf(grad_data) or np.isnan(grad_data):
|
|
has_overflow = True
|
|
break
|
|
|
|
if has_overflow:
|
|
# Zero gradients to prevent corruption
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
param.grad = None
|
|
return False # Overflow detected
|
|
|
|
# Descale gradients (simulate unscaling from FP16)
|
|
if loss_scale > 1.0:
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
grad_data = param.grad.data.data
|
|
descaled_grad = grad_data / loss_scale
|
|
param.grad.data = Tensor(descaled_grad)
|
|
|
|
return True # No overflow, safe to proceed
|
|
### END SOLUTION
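# Worked numbers for loss scaling (illustrative; float16 limits are approximate): a
# gradient of 1e-8 underflows to 0.0 in float16, but after scaling the loss by 1024 the
# gradient (~1e-5) is still representable; dividing by the same factor before the
# optimizer step, as above, recovers ~1e-8. If any scaled gradient overflows to inf/nan
# the step is skipped, and dynamic loss scaling would then lower the scale.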
|
|
|
|
def simulate_distributed_sync(self, optimizer: Union[SGD, Adam], world_size: int = 1) -> None:
|
|
"""
|
|
Simulate distributed training gradient synchronization.
|
|
|
|
Args:
|
|
optimizer: Optimizer with gradients to synchronize
|
|
world_size: Number of distributed processes
|
|
|
|
TODO: Implement distributed gradient synchronization simulation.
|
|
|
|
APPROACH:
|
|
1. Simulate all-reduce operation on gradients
|
|
2. Average gradients across all processes
|
|
3. Update local gradients with synchronized values
|
|
4. Handle communication overhead simulation
|
|
|
|
DISTRIBUTED CONCEPTS:
|
|
- All-reduce: Combine gradients from all GPUs
|
|
- Averaging: Divide by world_size for consistency
|
|
- Synchronization: Ensure all GPUs have same gradients
|
|
- Communication: Network overhead for gradient sharing
|
|
|
|
PRODUCTION VALUE:
|
|
Distributed training enables:
|
|
- Scaling to multiple GPUs/nodes
|
|
- Training large models efficiently
|
|
- Reduced training time
|
|
- Consistent convergence across devices
|
|
|
|
IMPLEMENTATION HINTS:
|
|
- Simulate averaging by keeping gradients unchanged
|
|
- Add small noise to simulate communication variance
|
|
- Scale learning rate by world_size if needed
|
|
- Log synchronization overhead
|
|
"""
|
|
### BEGIN SOLUTION
|
|
if world_size <= 1:
|
|
return # No synchronization needed for single process
|
|
|
|
# Simulate all-reduce operation (averaging gradients)
|
|
for param in optimizer.parameters:
|
|
if param.grad is not None:
|
|
grad_data = param.grad.data.data
|
|
|
|
# In real distributed training, gradients would be averaged across all processes
|
|
# Here we simulate this by keeping gradients unchanged (already "averaged")
|
|
# In practice, this would involve MPI/NCCL communication
|
|
|
|
# Simulate communication noise (very small)
|
|
if hasattr(grad_data, 'shape'):
|
|
noise = np.random.normal(0, 1e-10, grad_data.shape)
|
|
synchronized_grad = grad_data + noise
|
|
else:
|
|
noise = np.random.normal(0, 1e-10)
|
|
synchronized_grad = grad_data + noise
|
|
|
|
param.grad.data = Tensor(synchronized_grad)
|
|
|
|
# In distributed training, learning rate is often scaled by world_size
|
|
# to maintain effective learning rate with larger batch sizes
|
|
if hasattr(optimizer, 'base_learning_rate'):
|
|
optimizer.learning_rate = optimizer.base_learning_rate * world_size
|
|
### END SOLUTION
|