# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/08_optimizers/optimizers_dev.ipynb. # %% auto 0 __all__ = ['setup_import_paths', 'gradient_descent_step', 'SGD', 'Adam', 'StepLR', 'OptimizerConvergenceProfiler', 'AdvancedOptimizerFeatures'] # %% ../../modules/08_optimizers/optimizers_dev.ipynb 1 import numpy as np import sys import os from typing import List, Dict, Any, Optional, Union from collections import defaultdict # Helper function to set up import paths def setup_import_paths(): """Set up import paths for development modules.""" import sys import os # Add module directories to path base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) tensor_dir = os.path.join(base_dir, '01_tensor') autograd_dir = os.path.join(base_dir, '06_autograd') # Fixed: Module 6, not 7 if tensor_dir not in sys.path: sys.path.append(tensor_dir) if autograd_dir not in sys.path: sys.path.append(autograd_dir) # Import our existing components try: from tinytorch.core.tensor import Tensor from tinytorch.core.autograd import Variable except ImportError: # For development, try local imports try: setup_import_paths() from tensor_dev import Tensor from autograd_dev import Variable except ImportError: # Create simplified fallback classes for basic gradient operations print("Warning: Using simplified classes for basic gradient operations") class Tensor: def __init__(self, data): self.data = np.array(data) self.shape = self.data.shape def __str__(self): return f"Tensor({self.data})" class Variable: def __init__(self, data, requires_grad=True): if isinstance(data, (int, float)): self.data = Tensor([data]) else: self.data = Tensor(data) self.requires_grad = requires_grad self.grad = None # Simple gradient storage def zero_grad(self): """Reset gradients to None (basic operation from Module 6)""" self.grad = None def __str__(self): return f"Variable({self.data.data})" # %% ../../modules/08_optimizers/optimizers_dev.ipynb 7 def gradient_descent_step(parameter: Variable, 
learning_rate: float) -> None: """ Perform one step of gradient descent on a parameter. Args: parameter: Variable with gradient information learning_rate: How much to update parameter TODO: Implement basic gradient descent parameter update. STEP-BY-STEP IMPLEMENTATION: 1. Check if parameter has a gradient 2. Get current parameter value and gradient 3. Update parameter: new_value = old_value - learning_rate * gradient 4. Update parameter data with new value 5. Handle edge cases (no gradient, invalid values) EXAMPLE USAGE: ```python # Parameter with gradient w = Variable(2.0, requires_grad=True) w.grad = Variable(0.5) # Gradient from loss # Update parameter gradient_descent_step(w, learning_rate=0.1) # w.data now contains: 2.0 - 0.1 * 0.5 = 1.95 ``` IMPLEMENTATION HINTS: - Check if parameter.grad is not None - Use parameter.grad.data.data to get gradient value - Update parameter.data with new Tensor - Don't modify gradient (it's used for logging) LEARNING CONNECTIONS: - This is the foundation of all neural network training - PyTorch's optimizer.step() does exactly this - The learning rate determines convergence speed """ ### BEGIN SOLUTION if parameter.grad is not None: # Get current parameter value and gradient current_value = parameter.data.data gradient_value = parameter.grad.data.data # Update parameter: new_value = old_value - learning_rate * gradient new_value = current_value - learning_rate * gradient_value # Update parameter data parameter.data = Tensor(new_value) ### END SOLUTION # %% ../../modules/08_optimizers/optimizers_dev.ipynb 11 class SGD: """ Simplified SGD Optimizer Implements basic stochastic gradient descent with optional momentum. Uses simple gradient operations from Module 6. 
Mathematical Update Rule: parameter = parameter - learning_rate * gradient With momentum: velocity = momentum * velocity + gradient parameter = parameter - learning_rate * velocity """ def __init__(self, parameters: List[Variable], learning_rate: float = 0.01, momentum: float = 0.0): """ Initialize SGD optimizer with basic parameters. Args: parameters: List of Variables to optimize (from Module 6) learning_rate: Learning rate (default: 0.01) momentum: Momentum coefficient (default: 0.0) TODO: Implement basic SGD optimizer initialization. APPROACH: 1. Store parameters and learning rate 2. Store momentum coefficient 3. Initialize simple momentum buffers EXAMPLE: ```python # Basic optimizer setup w = Variable(1.0, requires_grad=True) b = Variable(0.0, requires_grad=True) optimizer = SGD([w, b], learning_rate=0.01) # In training: optimizer.zero_grad() # ... compute gradients ... optimizer.step() ``` """ ### BEGIN SOLUTION self.parameters = parameters self.learning_rate = learning_rate self.momentum = momentum # Simple momentum storage (using basic dict) self.velocity = {} for i, param in enumerate(parameters): if self.momentum > 0: self.velocity[i] = 0.0 # Initialize velocity to zero ### END SOLUTION def step(self) -> None: """ Perform one optimization step using basic gradient operations. TODO: Implement simplified SGD parameter update. APPROACH: 1. Iterate through all parameters 2. For each parameter with gradient (from Module 6): a. Get gradient using simple param.grad access b. Apply momentum if specified c. 
Update parameter with learning rate SIMPLIFIED MATHEMATICAL FORMULATION: - Without momentum: parameter = parameter - learning_rate * gradient - With momentum: velocity = momentum * velocity + gradient parameter = parameter - learning_rate * velocity IMPLEMENTATION HINTS: - Use basic param.grad access (from Module 6) - Simple momentum using self.velocity dict - Basic parameter update using scalar operations """ ### BEGIN SOLUTION for i, param in enumerate(self.parameters): if param.grad is not None: # Get gradient data (works for both Tensor and Variable) # In modern PyTorch style, grad.data gives us the numpy array gradient = param.grad.data # Ensure gradient is numpy array (fix for memoryview issue) if hasattr(gradient, 'data'): gradient_data = gradient.data # Check if the inner data is memoryview and convert if isinstance(gradient_data, memoryview): gradient_data = np.array(gradient_data) elif isinstance(gradient, memoryview): gradient_data = np.array(gradient) else: gradient_data = np.array(gradient) if self.momentum > 0: # Apply momentum (simplified) using numpy arrays if i in self.velocity: self.velocity[i] = self.momentum * self.velocity[i] + gradient_data else: self.velocity[i] = gradient_data update = self.velocity[i] else: # Simple gradient descent (no momentum) update = gradient_data # Clean parameter update - PyTorch style # NOTE: In production PyTorch, this is an in-place operation (param.data.sub_()) # for memory efficiency. We create a new Tensor here for clarity, but real # systems modify the existing memory to avoid allocation overhead. from tinytorch.core.tensor import Tensor new_value = param.data - self.learning_rate * update param.data = Tensor(new_value) ### END SOLUTION def zero_grad(self) -> None: """ Zero out gradients for all parameters. TODO: Implement gradient zeroing. APPROACH: 1. Iterate through all parameters 2. Set gradient to None for each parameter 3. 
This prepares for next backward pass IMPLEMENTATION HINTS: - Simply set param.grad = None - This is called before loss.backward() - Essential for proper gradient accumulation """ ### BEGIN SOLUTION for param in self.parameters: param.grad = None ### END SOLUTION # %% ../../modules/08_optimizers/optimizers_dev.ipynb 15 class Adam: """ Simplified Adam Optimizer Implements a simplified version of Adam algorithm with adaptive learning rates. Educational focus on understanding optimization concepts rather than complex implementation. Key concepts: - Momentum: Running average of gradients (first moment) - Adaptive learning: Running average of squared gradients (second moment) - Bias correction: Adjust for initialization bias """ def __init__(self, parameters: List[Variable], learning_rate: float = 0.001, beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8): """ Initialize simplified Adam optimizer. Args: parameters: List of Variables to optimize (from Module 6) learning_rate: Learning rate (default: 0.001) beta1: Decay rate for momentum (default: 0.9) beta2: Decay rate for squared gradients (default: 0.999) epsilon: Small constant for numerical stability (default: 1e-8) TODO: Implement simplified Adam optimizer initialization. APPROACH: 1. Store parameters and learning rate 2. Store Adam hyperparameters (beta1, beta2, epsilon) 3. Initialize simple moment storage EDUCATIONAL FOCUS: - Understand Adam concepts: momentum + adaptive learning - Learn why Adam uses running averages - See how bias correction helps early training EXAMPLE: ```python # Simple Adam setup w = Variable(1.0, requires_grad=True) b = Variable(0.0, requires_grad=True) optimizer = Adam([w, b], learning_rate=0.001) ``` """ ### BEGIN SOLUTION self.parameters = parameters self.learning_rate = learning_rate self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon # Simple moment storage (using basic dict with indices) # MEMORY INSIGHT: Adam uses 3x memory of SGD because it stores: # 1. 
Parameters (1x memory) # 2. First moment estimates m[i] (1x memory) # 3. Second moment estimates v[i] (1x memory) # This is why Adam can be problematic for very large models! self.m = {} # First moment (momentum) self.v = {} # Second moment (squared gradients) # Initialize moments for each parameter for i, param in enumerate(parameters): self.m[i] = 0.0 self.v[i] = 0.0 # Step counter for bias correction self.t = 0 ### END SOLUTION def step(self) -> None: """ Perform one optimization step using simplified Adam algorithm. TODO: Implement simplified Adam parameter update. APPROACH: 1. Increment step counter 2. For each parameter with gradient: a. Get gradient (basic operation from Module 6) b. Update momentum (first moment) c. Update squared gradient average (second moment) d. Apply bias correction e. Update parameter with adaptive learning rate SIMPLIFIED MATHEMATICAL FORMULATION: - m = beta1 * m + (1 - beta1) * gradient (momentum) - v = beta2 * v + (1 - beta2) * gradientΒ² (squared gradients) - m_corrected = m / (1 - beta1^t) (bias correction) - v_corrected = v / (1 - beta2^t) (bias correction) - parameter = parameter - lr * m_corrected / (√v_corrected + Ξ΅) EDUCATIONAL INSIGHTS: - Momentum helps accelerate learning - Squared gradients adapt learning rate per parameter - Bias correction prevents slow start """ ### BEGIN SOLUTION self.t += 1 # Increment step counter for i, param in enumerate(self.parameters): if param.grad is not None: # Get gradient data - clean PyTorch style gradient = param.grad.data # Ensure gradient is numpy array (fix for memoryview issue) if hasattr(gradient, 'data'): gradient_data = gradient.data # Check if the inner data is memoryview and convert if isinstance(gradient_data, memoryview): gradient_data = np.array(gradient_data) elif isinstance(gradient, memoryview): gradient_data = np.array(gradient) else: gradient_data = np.array(gradient) # Update first moment (momentum) - use numpy arrays self.m[i] = self.beta1 * self.m[i] + (1 - 
self.beta1) * gradient_data # Update second moment (squared gradients) - use numpy arrays self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * gradient_data * gradient_data # Bias correction m_corrected = self.m[i] / (1 - self.beta1 ** self.t) v_corrected = self.v[i] / (1 - self.beta2 ** self.t) # Clean adaptive parameter update - PyTorch style # NOTE: In production PyTorch, parameters are updated in-place for efficiency. # We create a new Tensor for educational clarity, but real systems use # param.data.add_(-update) to modify memory directly without allocation. update = self.learning_rate * m_corrected / (np.sqrt(v_corrected) + self.epsilon) from tinytorch.core.tensor import Tensor new_value = param.data - update param.data = Tensor(new_value) ### END SOLUTION def zero_grad(self) -> None: """ Zero out gradients for all parameters. TODO: Implement gradient zeroing (same as SGD). IMPLEMENTATION HINTS: - Set param.grad = None for all parameters - This is identical to SGD implementation """ ### BEGIN SOLUTION for param in self.parameters: param.grad = None ### END SOLUTION # %% ../../modules/08_optimizers/optimizers_dev.ipynb 20 class StepLR: """ Step Learning Rate Scheduler Decays learning rate by gamma every step_size epochs: learning_rate = initial_lr * (gamma ^ (epoch // step_size)) """ def __init__(self, optimizer: Union[SGD, Adam], step_size: int, gamma: float = 0.1): """ Initialize step learning rate scheduler. Args: optimizer: Optimizer to schedule step_size: Number of epochs between decreases gamma: Multiplicative factor for learning rate decay TODO: Implement learning rate scheduler initialization. APPROACH: 1. Store optimizer reference 2. Store scheduling parameters 3. Save initial learning rate 4. 
Initialize step counter EXAMPLE: ```python optimizer = SGD([w1, w2], learning_rate=0.1) scheduler = StepLR(optimizer, step_size=10, gamma=0.1) # In training loop: for epoch in range(100): train_one_epoch() scheduler.step() # Update learning rate ``` HINTS: - Store optimizer reference - Save initial learning rate from optimizer - Initialize step counter to 0 - gamma is the decay factor (0.1 = 10x reduction) """ ### BEGIN SOLUTION self.optimizer = optimizer self.step_size = step_size self.gamma = gamma self.initial_lr = optimizer.learning_rate self.step_count = 0 ### END SOLUTION def step(self) -> None: """ Update learning rate based on current step. TODO: Implement learning rate update. APPROACH: 1. Increment step counter 2. Calculate new learning rate using step decay formula 3. Update optimizer's learning rate MATHEMATICAL FORMULATION: new_lr = initial_lr * (gamma ^ ((step_count - 1) // step_size)) IMPLEMENTATION HINTS: - Use // for integer division - Use ** for exponentiation - Update optimizer.learning_rate directly """ ### BEGIN SOLUTION self.step_count += 1 # Calculate new learning rate decay_factor = self.gamma ** ((self.step_count - 1) // self.step_size) new_lr = self.initial_lr * decay_factor # Update optimizer's learning rate self.optimizer.learning_rate = new_lr ### END SOLUTION def get_lr(self) -> float: """ Get current learning rate. TODO: Return current learning rate. IMPLEMENTATION HINTS: - Return optimizer.learning_rate """ ### BEGIN SOLUTION return self.optimizer.learning_rate ### END SOLUTION # %% ../../modules/08_optimizers/optimizers_dev.ipynb 28 class OptimizerConvergenceProfiler: """ ML Systems Tool: Optimizer Performance and Convergence Analysis Profiles convergence patterns, learning rate sensitivity, and computational costs across different optimizers to guide production optimizer selection. 
This is 60% implementation focusing on core analysis capabilities: - Convergence rate comparison across optimizers - Learning rate sensitivity analysis - Gradient statistics tracking - Memory usage estimation - Performance recommendations """ def __init__(self): """ Initialize optimizer convergence profiler. TODO: Implement profiler initialization. APPROACH: 1. Initialize tracking dictionaries for different metrics 2. Set up convergence analysis parameters 3. Prepare memory and performance tracking 4. Initialize recommendation engine components PRODUCTION CONTEXT: In production, this profiler would run on representative tasks to: - Select optimal optimizers for new models - Tune hyperparameters before expensive training runs - Predict training time and resource requirements - Monitor training stability and convergence IMPLEMENTATION HINTS: - Track convergence history per optimizer - Store gradient statistics over time - Monitor memory usage patterns - Prepare for comparative analysis """ ### BEGIN SOLUTION # Convergence tracking self.convergence_history = defaultdict(list) # {optimizer_name: [losses]} self.gradient_norms = defaultdict(list) # {optimizer_name: [grad_norms]} self.learning_rates = defaultdict(list) # {optimizer_name: [lr_values]} self.step_times = defaultdict(list) # {optimizer_name: [step_durations]} # Performance metrics self.memory_usage = defaultdict(list) # {optimizer_name: [memory_estimates]} self.convergence_rates = {} # {optimizer_name: convergence_rate} self.stability_scores = {} # {optimizer_name: stability_score} # Analysis parameters self.convergence_threshold = 1e-6 self.stability_window = 10 self.gradient_explosion_threshold = 1e6 # Recommendations self.optimizer_rankings = {} self.hyperparameter_suggestions = {} ### END SOLUTION def profile_optimizer_convergence(self, optimizer_name: str, optimizer: Union[SGD, Adam], training_function, initial_loss: float, max_steps: int = 100) -> Dict[str, Any]: """ Profile convergence behavior of an 
optimizer on a specific task. Args: optimizer_name: Name identifier for the optimizer optimizer: Optimizer instance to profile training_function: Function that performs one training step and returns loss initial_loss: Starting loss value max_steps: Maximum training steps to profile Returns: Dictionary containing convergence analysis results TODO: Implement optimizer convergence profiling. APPROACH: 1. Run training loop with the optimizer 2. Track loss, gradients, learning rates at each step 3. Measure step execution time 4. Estimate memory usage 5. Analyze convergence patterns and stability 6. Generate performance metrics CONVERGENCE ANALYSIS: - Track loss reduction over time - Measure convergence rate (loss reduction per step) - Detect convergence plateaus - Identify gradient explosion or vanishing - Assess training stability PRODUCTION INSIGHTS: This analysis helps determine: - Which optimizers converge fastest for specific model types - Optimal learning rates for different optimizers - Memory vs performance trade-offs - Training stability and robustness IMPLEMENTATION HINTS: - Use time.time() to measure step duration - Calculate gradient norms across all parameters - Track learning rate changes (for schedulers) - Estimate memory from optimizer state size """ ### BEGIN SOLUTION import time print(f"πŸ” Profiling {optimizer_name} convergence...") # Initialize tracking losses = [] grad_norms = [] step_durations = [] lr_values = [] previous_loss = initial_loss convergence_step = None for step in range(max_steps): step_start = time.time() # Perform training step try: current_loss = training_function() losses.append(current_loss) # Calculate gradient norm total_grad_norm = 0.0 param_count = 0 for param in optimizer.parameters: if param.grad is not None: grad_data = param.grad.data.data if hasattr(grad_data, 'flatten'): grad_norm = np.linalg.norm(grad_data.flatten()) else: grad_norm = abs(float(grad_data)) total_grad_norm += grad_norm ** 2 param_count += 1 if 
param_count > 0: total_grad_norm = (total_grad_norm / param_count) ** 0.5 grad_norms.append(total_grad_norm) # Track learning rate lr_values.append(optimizer.learning_rate) # Check convergence if convergence_step is None and abs(current_loss - previous_loss) < self.convergence_threshold: convergence_step = step previous_loss = current_loss except Exception as e: print(f"⚠️ Training step {step} failed: {e}") break step_end = time.time() step_durations.append(step_end - step_start) # Early stopping for exploded gradients if total_grad_norm > self.gradient_explosion_threshold: print(f"⚠️ Gradient explosion detected at step {step}") break # Store results self.convergence_history[optimizer_name] = losses self.gradient_norms[optimizer_name] = grad_norms self.learning_rates[optimizer_name] = lr_values self.step_times[optimizer_name] = step_durations # Analyze results analysis = self._analyze_convergence_profile(optimizer_name, losses, grad_norms, step_durations, convergence_step) return analysis ### END SOLUTION def compare_optimizers(self, profiles: Dict[str, Dict]) -> Dict[str, Any]: """ Compare multiple optimizer profiles and generate recommendations. Args: profiles: Dictionary mapping optimizer names to their profile results Returns: Comprehensive comparison analysis with recommendations TODO: Implement optimizer comparison and ranking. APPROACH: 1. Analyze convergence speed across optimizers 2. Compare final performance and stability 3. Assess computational efficiency 4. Generate rankings and recommendations 5. 
Identify optimal hyperparameters COMPARISON METRICS: - Steps to convergence - Final loss achieved - Training stability (loss variance) - Computational cost per step - Memory efficiency - Gradient explosion resistance PRODUCTION VALUE: This comparison guides: - Optimizer selection for new projects - Hyperparameter optimization strategies - Resource allocation decisions - Training pipeline design IMPLEMENTATION HINTS: - Normalize metrics for fair comparison - Weight different factors based on importance - Generate actionable recommendations - Consider trade-offs between speed and stability """ ### BEGIN SOLUTION comparison = { 'convergence_speed': {}, 'final_performance': {}, 'stability': {}, 'efficiency': {}, 'rankings': {}, 'recommendations': {} } print("πŸ“Š Comparing optimizer performance...") # Analyze each optimizer for opt_name, profile in profiles.items(): # Convergence speed convergence_step = profile.get('convergence_step', len(self.convergence_history[opt_name])) comparison['convergence_speed'][opt_name] = convergence_step # Final performance losses = self.convergence_history[opt_name] if losses: final_loss = losses[-1] comparison['final_performance'][opt_name] = final_loss # Stability (coefficient of variation in last 10 steps) if len(losses) >= self.stability_window: recent_losses = losses[-self.stability_window:] stability = 1.0 / (1.0 + np.std(recent_losses) / (np.mean(recent_losses) + 1e-8)) comparison['stability'][opt_name] = stability # Efficiency (loss reduction per unit time) step_times = self.step_times[opt_name] if losses and step_times: initial_loss = losses[0] final_loss = losses[-1] total_time = sum(step_times) efficiency = (initial_loss - final_loss) / (total_time + 1e-8) comparison['efficiency'][opt_name] = efficiency # Generate rankings metrics = ['convergence_speed', 'final_performance', 'stability', 'efficiency'] for metric in metrics: if comparison[metric]: if metric == 'convergence_speed': # Lower is better for convergence speed 
sorted_opts = sorted(comparison[metric].items(), key=lambda x: x[1]) elif metric == 'final_performance': # Lower is better for final loss sorted_opts = sorted(comparison[metric].items(), key=lambda x: x[1]) else: # Higher is better for stability and efficiency sorted_opts = sorted(comparison[metric].items(), key=lambda x: x[1], reverse=True) comparison['rankings'][metric] = [opt for opt, _ in sorted_opts] # Generate recommendations recommendations = [] # Best overall optimizer if comparison['rankings']: # Simple scoring: rank position across metrics scores = defaultdict(float) for metric, ranking in comparison['rankings'].items(): for i, opt_name in enumerate(ranking): scores[opt_name] += len(ranking) - i best_optimizer = max(scores.items(), key=lambda x: x[1])[0] recommendations.append(f"πŸ† Best overall optimizer: {best_optimizer}") # Specific recommendations if 'convergence_speed' in comparison['rankings']: fastest = comparison['rankings']['convergence_speed'][0] recommendations.append(f"⚑ Fastest convergence: {fastest}") if 'stability' in comparison['rankings']: most_stable = comparison['rankings']['stability'][0] recommendations.append(f"🎯 Most stable training: {most_stable}") if 'efficiency' in comparison['rankings']: most_efficient = comparison['rankings']['efficiency'][0] recommendations.append(f"πŸ’° Most compute-efficient: {most_efficient}") comparison['recommendations']['summary'] = recommendations return comparison ### END SOLUTION def analyze_learning_rate_sensitivity(self, optimizer_class, learning_rates: List[float], training_function, steps: int = 50) -> Dict[str, Any]: """ Analyze optimizer sensitivity to different learning rates. Args: optimizer_class: Optimizer class (SGD or Adam) learning_rates: List of learning rates to test training_function: Function that creates and runs training steps: Number of training steps per learning rate Returns: Learning rate sensitivity analysis TODO: Implement learning rate sensitivity analysis. APPROACH: 1. 
Test optimizer with different learning rates 2. Measure convergence performance for each rate 3. Identify optimal learning rate range 4. Detect learning rate instability regions 5. Generate learning rate recommendations SENSITIVITY ANALYSIS: - Plot loss curves for different learning rates - Identify optimal learning rate range - Detect gradient explosion thresholds - Measure convergence robustness - Generate adaptive scheduling suggestions PRODUCTION INSIGHTS: This analysis enables: - Automatic learning rate tuning - Learning rate scheduling optimization - Gradient explosion prevention - Training stability improvement IMPLEMENTATION HINTS: - Reset model state for each learning rate test - Track convergence metrics consistently - Identify learning rate sweet spots - Flag unstable learning rate regions """ ### BEGIN SOLUTION print("πŸ” Analyzing learning rate sensitivity...") lr_analysis = { 'learning_rates': learning_rates, 'final_losses': [], 'convergence_steps': [], 'stability_scores': [], 'gradient_explosions': [], 'optimal_range': None, 'recommendations': [] } # Test each learning rate for lr in learning_rates: print(f" Testing learning rate: {lr}") try: # Create optimizer with current learning rate # This is a simplified test - in production, would reset model state losses, grad_norms = training_function(lr, steps) if losses: final_loss = losses[-1] lr_analysis['final_losses'].append(final_loss) # Find convergence step convergence_step = steps for i in range(1, len(losses)): if abs(losses[i] - losses[i-1]) < self.convergence_threshold: convergence_step = i break lr_analysis['convergence_steps'].append(convergence_step) # Calculate stability if len(losses) >= 10: recent_losses = losses[-10:] stability = 1.0 / (1.0 + np.std(recent_losses) / (np.mean(recent_losses) + 1e-8)) lr_analysis['stability_scores'].append(stability) else: lr_analysis['stability_scores'].append(0.0) # Check for gradient explosion max_grad_norm = max(grad_norms) if grad_norms else 0.0 
explosion = max_grad_norm > self.gradient_explosion_threshold lr_analysis['gradient_explosions'].append(explosion) else: # Failed to get losses lr_analysis['final_losses'].append(float('inf')) lr_analysis['convergence_steps'].append(steps) lr_analysis['stability_scores'].append(0.0) lr_analysis['gradient_explosions'].append(True) except Exception as e: print(f" ⚠️ Failed with lr={lr}: {e}") lr_analysis['final_losses'].append(float('inf')) lr_analysis['convergence_steps'].append(steps) lr_analysis['stability_scores'].append(0.0) lr_analysis['gradient_explosions'].append(True) # Find optimal learning rate range valid_indices = [i for i, (loss, explosion) in enumerate(zip(lr_analysis['final_losses'], lr_analysis['gradient_explosions'])) if not explosion and loss != float('inf')] if valid_indices: # Find learning rate with best final loss among stable ones stable_losses = [(i, lr_analysis['final_losses'][i]) for i in valid_indices] best_idx = min(stable_losses, key=lambda x: x[1])[0] # Define optimal range around best learning rate best_lr = learning_rates[best_idx] lr_analysis['optimal_range'] = (best_lr * 0.1, best_lr * 10.0) # Generate recommendations recommendations = [] recommendations.append(f"🎯 Optimal learning rate: {best_lr:.2e}") recommendations.append(f"πŸ“ˆ Safe range: {lr_analysis['optimal_range'][0]:.2e} - {lr_analysis['optimal_range'][1]:.2e}") # Learning rate scheduling suggestions if best_idx > 0: recommendations.append("πŸ’‘ Consider starting with higher LR and decaying") if any(lr_analysis['gradient_explosions']): max_safe_lr = max([learning_rates[i] for i in valid_indices]) recommendations.append(f"⚠️ Avoid learning rates above {max_safe_lr:.2e}") lr_analysis['recommendations'] = recommendations else: lr_analysis['recommendations'] = ["⚠️ No stable learning rates found - try lower values"] return lr_analysis ### END SOLUTION def estimate_memory_usage(self, optimizer: Union[SGD, Adam], num_parameters: int) -> Dict[str, float]: """ Estimate memory 
usage for different optimizers. Args: optimizer: Optimizer instance num_parameters: Number of model parameters Returns: Memory usage estimates in MB TODO: Implement memory usage estimation. APPROACH: 1. Calculate parameter memory requirements 2. Estimate optimizer state memory 3. Account for gradient storage 4. Include temporary computation memory 5. Provide memory scaling predictions MEMORY ANALYSIS: - Parameter storage: num_params * 4 bytes (float32) - Gradient storage: num_params * 4 bytes - Optimizer state: varies by optimizer type - SGD momentum: num_params * 4 bytes - Adam: num_params * 8 bytes (first + second moments) PRODUCTION VALUE: Memory estimation helps: - Select optimizers for memory-constrained environments - Plan GPU memory allocation - Scale to larger models - Optimize batch sizes IMPLEMENTATION HINTS: - Use typical float32 size (4 bytes) - Account for optimizer-specific state - Include gradient accumulation overhead - Provide scaling estimates """ ### BEGIN SOLUTION # Base memory requirements bytes_per_param = 4 # float32 memory_breakdown = { 'parameters_mb': num_parameters * bytes_per_param / (1024 * 1024), 'gradients_mb': num_parameters * bytes_per_param / (1024 * 1024), 'optimizer_state_mb': 0.0, 'total_mb': 0.0 } # Optimizer-specific state memory if isinstance(optimizer, SGD): if optimizer.momentum > 0: # Momentum buffers memory_breakdown['optimizer_state_mb'] = num_parameters * bytes_per_param / (1024 * 1024) else: memory_breakdown['optimizer_state_mb'] = 0.0 elif isinstance(optimizer, Adam): # First and second moment estimates memory_breakdown['optimizer_state_mb'] = num_parameters * 2 * bytes_per_param / (1024 * 1024) # Calculate total memory_breakdown['total_mb'] = ( memory_breakdown['parameters_mb'] + memory_breakdown['gradients_mb'] + memory_breakdown['optimizer_state_mb'] ) # Add efficiency estimates memory_breakdown['memory_efficiency'] = memory_breakdown['parameters_mb'] / memory_breakdown['total_mb'] 
def generate_production_recommendations(self, analysis_results: Dict[str, Any]) -> List[str]:
    """
    Generate actionable recommendations for production optimizer usage.

    The optimizer-selection, deployment, scaling and monitoring sections are
    always emitted. Two sections are data-driven and only appear when the
    matching analysis is present and well-formed:

    - Learning rate guidance: requires
      ``analysis_results['learning_rate_analysis']['optimal_range']`` to hold
      at least two numbers (lower bound, upper bound).
    - Convergence optimization: requires
      ``analysis_results['convergence_comparison']['recommendations']`` to be
      a dict with a ``'summary'`` entry (an iterable of messages).

    Missing or malformed entries are skipped instead of raising.

    Args:
        analysis_results: Combined results from convergence and sensitivity analysis

    Returns:
        List of production recommendations (section headers followed by
        bullet lines), in display order
    """
    ### BEGIN SOLUTION
    # Optimizer selection recommendations
    recommendations = [
        "🔧 OPTIMIZER SELECTION GUIDE:",
        " • SGD + Momentum: Best for large batch training, proven stability",
        " • Adam: Best for rapid prototyping, adaptive learning rates",
        " • Consider memory constraints: SGD uses ~50% less memory than Adam",
    ]

    # Learning rate recommendations (only with a usable [low, high] range)
    lr_analysis = analysis_results.get('learning_rate_analysis') or {}
    opt_range = lr_analysis.get('optimal_range') if isinstance(lr_analysis, dict) else None
    if opt_range and len(opt_range) >= 2:
        recommendations.extend([
            "📈 LEARNING RATE GUIDANCE:",
            f" • Start with: {opt_range[0]:.2e}",
            f" • Safe upper bound: {opt_range[1]:.2e}",
            " • Use learning rate scheduling for best results",
        ])

    # Convergence recommendations (only when a summary mapping is provided)
    comparison = analysis_results.get('convergence_comparison') or {}
    convergence_recs = comparison.get('recommendations') if isinstance(comparison, dict) else None
    if isinstance(convergence_recs, dict) and 'summary' in convergence_recs:
        recommendations.append("🎯 CONVERGENCE OPTIMIZATION:")
        recommendations.extend(f" • {rec}" for rec in (convergence_recs['summary'] or ()))

    # Production deployment recommendations
    recommendations.extend([
        "🚀 PRODUCTION DEPLOYMENT:",
        " • Monitor gradient norms to detect training instability",
        " • Implement gradient clipping for large models",
        " • Use learning rate warmup for transformer architectures",
        " • Consider mixed precision training to reduce memory usage",
    ])

    # Scaling recommendations
    recommendations.extend([
        "📊 SCALING CONSIDERATIONS:",
        " • Large batch training: Prefer SGD with linear learning rate scaling",
        " • Distributed training: Use synchronized optimizers",
        " • Memory-constrained: Choose SGD or use gradient accumulation",
        " • Fine-tuning: Use lower learning rates (10x-100x smaller)",
    ])

    # Monitoring recommendations
    recommendations.extend([
        "📈 MONITORING & DEBUGGING:",
        " • Track loss smoothness to detect learning rate issues",
        " • Monitor gradient norms for explosion/vanishing detection",
        " • Log learning rate schedules for reproducibility",
        " • Profile memory usage to optimize batch sizes",
    ])

    return recommendations
    ### END SOLUTION
def _analyze_convergence_profile(self, optimizer_name: str, losses: List[float],
                                 grad_norms: List[float], step_durations: List[float],
                                 convergence_step: Optional[int]) -> Dict[str, Any]:
    """
    Internal helper to analyze convergence profile data.

    Reads ``self.stability_window`` (how many trailing losses feed the
    stability score) and ``self.gradient_explosion_threshold`` (largest
    gradient norm that is not classified as exploding).

    Args:
        optimizer_name: Name of the optimizer
        losses: List of loss values over training
        grad_norms: List of gradient norms over training
        step_durations: List of step execution times
        convergence_step: Step where convergence was detected (if any)

    Returns:
        Analysis results dictionary with keys ``optimizer_name``,
        ``total_steps``, ``convergence_step``, ``final_loss``,
        ``initial_loss``, ``loss_reduction``, ``convergence_rate``,
        ``stability_score``, ``average_step_time`` and ``gradient_health``
        (one of 'unknown', 'exploding', 'vanishing', 'unstable', 'healthy').
        Empty inputs leave the corresponding entries at their defaults
        (``inf`` losses, zero metrics, 'unknown' gradient health).
    """
    ### BEGIN SOLUTION
    analysis = {
        'optimizer_name': optimizer_name,
        'total_steps': len(losses),
        'convergence_step': convergence_step,
        'final_loss': losses[-1] if losses else float('inf'),
        'initial_loss': losses[0] if losses else float('inf'),
        'loss_reduction': 0.0,
        'convergence_rate': 0.0,
        'stability_score': 0.0,
        'average_step_time': 0.0,
        'gradient_health': 'unknown'
    }

    if losses:
        # Calculate loss reduction
        analysis['loss_reduction'] = losses[0] - losses[-1]

        # Calculate convergence rate (loss reduction per recorded step)
        if len(losses) > 1:
            analysis['convergence_rate'] = analysis['loss_reduction'] / len(losses)

        # Calculate stability (inverse of coefficient of variation)
        if len(losses) >= self.stability_window:
            recent_losses = losses[-self.stability_window:]
            mean_loss = np.mean(recent_losses)
            std_loss = np.std(recent_losses)
            # The coefficient of variation is defined on |mean|; using the
            # signed mean would push the score outside (0, 1] (or divide by
            # ~0) whenever the loss is negative.
            analysis['stability_score'] = 1.0 / (1.0 + std_loss / (abs(mean_loss) + 1e-8))

    # Average step time
    if step_durations:
        analysis['average_step_time'] = np.mean(step_durations)

    # Gradient health assessment
    if grad_norms:
        norms = np.asarray(grad_norms, dtype=float)
        avg_grad_norm = norms.mean()

        # NaN compares False against every threshold below, so a diverged
        # run must be caught explicitly or it would be labelled 'healthy'.
        if not np.all(np.isfinite(norms)) or norms.max() > self.gradient_explosion_threshold:
            analysis['gradient_health'] = 'exploding'
        elif avg_grad_norm < 1e-8:
            analysis['gradient_health'] = 'vanishing'
        elif norms.std() / (avg_grad_norm + 1e-8) > 2.0:
            analysis['gradient_health'] = 'unstable'
        else:
            analysis['gradient_health'] = 'healthy'

    return analysis
    ### END SOLUTION

# %% ../../modules/08_optimizers/optimizers_dev.ipynb 32
class AdvancedOptimizerFeatures:
    """
    Training utilities that operate on an optimizer's parameters.

    Each helper walks ``optimizer.parameters`` and works on the gradients
    stored at ``param.grad.data.data``:
    - Gradient clipping by global norm
    - Linear learning rate warmup
    - Gradient accumulation across several calls
    - Mixed precision overflow check and gradient descaling (simulated)
    - Distributed gradient synchronization (simulated)
    """

    def __init__(self):
        """
        Set up default configuration and bookkeeping state.

        Among these attributes, only ``accumulated_gradients`` and
        ``accumulation_count`` are read by the methods of this class; the
        others are stored defaults.
        """
        ### BEGIN SOLUTION
        # Clipping defaults
        self.max_grad_norm, self.clip_enabled = 1.0, False

        # Warmup defaults
        self.warmup_steps, self.warmup_factor, self.base_lr = 0, 0.1, 0.001

        # Accumulation state: per-parameter running sums keyed by id(param),
        # plus the number of calls folded into them so far
        self.accumulation_steps = 1
        self.accumulated_gradients = {}
        self.accumulation_count = 0

        # Mixed precision defaults
        self.use_fp16, self.loss_scale, self.dynamic_loss_scaling = False, 1.0, False

        # Distributed defaults
        self.world_size, self.rank = 1, 0
        ### END SOLUTION

    def apply_gradient_clipping(self, optimizer: Union[SGD, Adam], max_norm: float = 1.0) -> float:
        """
        Clip gradients by their global L2 norm.

        The global norm is the square root of the summed squared norms of
        every parameter gradient. When it exceeds ``max_norm``, each gradient
        is multiplied by ``max_norm / global_norm`` and stored back as a new
        ``Tensor``, so directions are preserved.

        Args:
            optimizer: Optimizer with parameters to clip
            max_norm: Maximum allowed gradient norm

        Returns:
            Global gradient norm measured before clipping, or 0.0 when no
            parameter has a gradient
        """
        ### BEGIN SOLUTION
        squared_norms = []
        for p in optimizer.parameters:
            if p.grad is None:
                continue
            values = p.grad.data.data
            if hasattr(values, 'flatten'):
                norm = np.linalg.norm(values.flatten())
            else:
                # Plain scalar gradient: its norm is its magnitude
                norm = abs(float(values))
            squared_norms.append(norm ** 2)

        if not squared_norms:
            return 0.0

        global_norm = sum(squared_norms, 0.0) ** 0.5

        if global_norm > max_norm:
            shrink = max_norm / global_norm
            for p in optimizer.parameters:
                if p.grad is None:
                    continue
                p.grad.data = Tensor(p.grad.data.data * shrink)

        return global_norm
        ### END SOLUTION

    def apply_warmup_schedule(self, optimizer: Union[SGD, Adam], step: int, warmup_steps: int, base_lr: float) -> float:
        """
        Set the optimizer's learning rate according to a linear warmup.

        While ``step < warmup_steps`` (and ``warmup_steps > 0``) the rate is
        ``base_lr * step / warmup_steps``, which is 0.0 at step 0. Otherwise
        it is ``base_lr``. The result is written to
        ``optimizer.learning_rate``.

        Args:
            optimizer: Optimizer to apply warmup to
            step: Current training step
            warmup_steps: Number of warmup steps
            base_lr: Target learning rate after warmup

        Returns:
            Current learning rate
        """
        ### BEGIN SOLUTION
        still_warming = warmup_steps > 0 and step < warmup_steps
        lr_now = base_lr * (step / warmup_steps) if still_warming else base_lr

        optimizer.learning_rate = lr_now
        return lr_now
        ### END SOLUTION

    def accumulate_gradients(self, optimizer: Union[SGD, Adam], accumulation_steps: int) -> bool:
        """
        Fold the current gradients into running sums; average when full.

        Every call adds each available gradient to a per-parameter buffer
        (keyed by ``id(param)``) and counts as one accumulation step. Once
        the count reaches ``accumulation_steps``, each parameter that
        currently has a gradient and a buffer gets its gradient replaced by
        ``buffer / accumulation_steps``, and the buffers and counter are
        reset.

        Args:
            optimizer: Optimizer with parameters to accumulate
            accumulation_steps: Number of steps to accumulate before update

        Returns:
            True if ready to perform optimizer step, False otherwise
        """
        ### BEGIN SOLUTION
        # Lazily create state for instances that never had it initialised
        if not hasattr(self, 'accumulation_count'):
            self.accumulation_count = 0
            self.accumulated_gradients = {}

        buffers = self.accumulated_gradients
        for p in optimizer.parameters:
            if p.grad is None:
                continue
            key = id(p)
            current = p.grad.data.data
            if key not in buffers:
                buffers[key] = np.zeros_like(current)
            buffers[key] += current

        self.accumulation_count += 1

        if self.accumulation_count < accumulation_steps:
            return False  # Continue accumulating

        # Replace each live gradient with the mean over the window
        for p in optimizer.parameters:
            if p.grad is None:
                continue
            key = id(p)
            if key in self.accumulated_gradients:
                p.grad.data = Tensor(self.accumulated_gradients[key] / accumulation_steps)

        # Start a fresh window
        self.accumulation_count = 0
        self.accumulated_gradients = {}
        return True  # Ready for optimizer step
        ### END SOLUTION

    def simulate_mixed_precision(self, optimizer: Union[SGD, Adam], loss_scale: float = 1.0) -> bool:
        """
        Check gradients for overflow and descale them if they are valid.

        If any gradient contains inf or NaN, the gradient of every parameter
        that has one is set to ``None`` and False is returned. Otherwise,
        when ``loss_scale > 1.0``, each gradient is divided by
        ``loss_scale`` and stored back as a new ``Tensor``.

        Args:
            optimizer: Optimizer to apply mixed precision to
            loss_scale: Loss scaling factor for gradient preservation

        Returns:
            True if gradients are valid (no overflow), False if overflow detected
        """
        ### BEGIN SOLUTION
        def _has_inf_or_nan(values):
            # Array-like gradients are checked element-wise, scalars directly
            if hasattr(values, 'flatten'):
                flat = values.flatten()
                return bool(np.any(np.isinf(flat)) or np.any(np.isnan(flat)))
            return bool(np.isinf(values) or np.isnan(values))

        # any() stops at the first offending gradient
        overflow = any(
            _has_inf_or_nan(p.grad.data.data)
            for p in optimizer.parameters
            if p.grad is not None
        )

        if overflow:
            # Drop every gradient so a corrupted step cannot be applied
            for p in optimizer.parameters:
                if p.grad is not None:
                    p.grad = None
            return False  # Overflow detected

        if loss_scale > 1.0:
            # Undo the loss scaling before the optimizer consumes gradients
            for p in optimizer.parameters:
                if p.grad is None:
                    continue
                p.grad.data = Tensor(p.grad.data.data / loss_scale)

        return True  # No overflow, safe to proceed
        ### END SOLUTION

    def simulate_distributed_sync(self, optimizer: Union[SGD, Adam], world_size: int = 1) -> None:
        """
        Simulate an all-reduce over gradients for ``world_size`` processes.

        Does nothing when ``world_size <= 1``. Otherwise each gradient is
        treated as already averaged and only perturbed with Gaussian noise
        (mean 0, standard deviation 1e-10) drawn from ``np.random.normal``.
        If the optimizer exposes ``base_learning_rate``, its
        ``learning_rate`` is then set to ``base_learning_rate * world_size``.

        Args:
            optimizer: Optimizer with gradients to synchronize
            world_size: Number of distributed processes
        """
        ### BEGIN SOLUTION
        if world_size <= 1:
            return  # No synchronization needed for single process

        for p in optimizer.parameters:
            if p.grad is None:
                continue
            local_grad = p.grad.data.data

            # Real systems would average over MPI/NCCL here; the simulation
            # keeps the values and adds negligible communication noise.
            if hasattr(local_grad, 'shape'):
                jitter = np.random.normal(0, 1e-10, local_grad.shape)
            else:
                jitter = np.random.normal(0, 1e-10)

            p.grad.data = Tensor(local_grad + jitter)

        # Linear scaling rule: grow the learning rate with the process count
        if hasattr(optimizer, 'base_learning_rate'):
            optimizer.learning_rate = optimizer.base_learning_rate * world_size
        ### END SOLUTION