# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/10_training/training_dev.ipynb.

# %% auto 0
__all__ = ['MeanSquaredError', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss', 'Accuracy', 'Trainer',
           'TrainingPipelineProfiler', 'ProductionTrainingOptimizer']

# %% ../../modules/source/10_training/training_dev.ipynb 1
import numpy as np
import sys
import os
from collections import defaultdict
import time
import pickle

# Note: Module imports corrected to match actual learning progression:
# Module 6: autograd, Module 7: spatial, Module 8: optimizers, Module 9: dataloader

# Import all the building blocks we need
from .tensor import Tensor
from .activations import ReLU, Sigmoid, Tanh, Softmax
from .layers import Dense
from .networks import Sequential, create_mlp
from .spatial import Conv2D, flatten
from .dataloader import Dataset, DataLoader
from .autograd import Variable  # FOR AUTOGRAD INTEGRATION
from .optimizers import SGD, Adam

# 🔥 AUTOGRAD INTEGRATION: Loss functions now return Variables that support .backward()
# This enables automatic gradient computation for neural network training!

# %% ../../modules/source/10_training/training_dev.ipynb 4
class MeanSquaredError:
    """
    Mean Squared Error loss for regression.

    MSE = (1/n) * Σ(y_pred - y_true)²

    The loss is returned as a ``Variable`` so that ``loss.backward()``
    triggers gradient computation through the network (autograd integration).
    """

    def __init__(self):
        """Initialize MSE loss function (stateless)."""
        pass

    def __call__(self, y_pred, y_true):
        """
        Compute MSE loss between predictions and targets.

        Args:
            y_pred: Model predictions (Tensor or Variable, shape: [batch_size, ...]).
            y_true: True targets (Tensor or Variable, same shape as y_pred).

        Returns:
            Scalar ``Variable`` that supports ``.backward()``.
        """
        ### BEGIN SOLUTION
        # Convert to Variables if needed so the computation participates in
        # the autograd graph. Inputs may be Tensors (unwrap .data) or raw arrays.
        if not isinstance(y_pred, Variable):
            if hasattr(y_pred, 'data'):
                y_pred = Variable(y_pred.data, requires_grad=True)
            else:
                y_pred = Variable(y_pred, requires_grad=True)

        if not isinstance(y_true, Variable):
            if hasattr(y_true, 'data'):
                y_true = Variable(y_true.data, requires_grad=False)  # Targets don't need gradients
            else:
                y_true = Variable(y_true, requires_grad=False)

        # Compute MSE using Variable operations to maintain the autograd graph
        diff = y_pred - y_true        # Variable subtraction
        squared_diff = diff * diff    # Variable multiplication

        # Mean over all elements. A Variable's .data may itself be a Tensor
        # (carrying its own .data ndarray) or a plain ndarray — unwrap either way.
        if hasattr(squared_diff.data, 'data'):
            mean_data = np.mean(squared_diff.data.data)
        else:
            mean_data = np.mean(squared_diff.data)

        # Create loss Variable with an explicit gradient function for MSE
        def mse_grad_fn(grad_output):
            # Analytic MSE gradient: 2 * (y_pred - y_true) / n
            if y_pred.requires_grad:
                if hasattr(y_pred.data, 'data'):
                    batch_size = np.prod(y_pred.data.data.shape)
                    grad_data = 2.0 * (y_pred.data.data - y_true.data.data) / batch_size
                else:
                    batch_size = np.prod(y_pred.data.shape)
                    grad_data = 2.0 * (y_pred.data - y_true.data) / batch_size
                # Chain rule: scale local gradient by the upstream gradient
                if hasattr(grad_output.data, 'data'):
                    final_grad = grad_data * grad_output.data.data
                else:
                    final_grad = grad_data * grad_output.data
                # NOTE(review): assumes Variable.backward accepts a seed-gradient
                # Variable argument — confirm against the autograd module.
                y_pred.backward(Variable(final_grad))

        loss = Variable(mean_data, requires_grad=y_pred.requires_grad, grad_fn=mse_grad_fn)
        return loss
        ### END SOLUTION

    def forward(self, y_pred, y_true):
        """Alternative interface for forward pass."""
        return self.__call__(y_pred, y_true)
# %% ../../modules/source/10_training/training_dev.ipynb 7
class CrossEntropyLoss:
    """
    Cross-Entropy loss for multi-class classification.

    CrossEntropy = -Σ y_true * log(softmax(y_pred))

    Accepts either integer class indices or one-hot labels, applies a
    numerically stable softmax internally, and returns a ``Variable`` whose
    ``.backward()`` uses the fused softmax+cross-entropy gradient.
    """

    def __init__(self):
        """Initialize CrossEntropy loss function (stateless)."""
        pass

    def __call__(self, y_pred, y_true):
        """
        Compute CrossEntropy loss between predictions and targets.

        Args:
            y_pred: Model logits (Tensor or Variable, shape: [batch_size, num_classes]).
            y_true: Class indices (shape: [batch_size]) or one-hot labels
                    (shape: [batch_size, num_classes]).

        Returns:
            Scalar ``Variable`` that supports ``.backward()``.
        """
        ### BEGIN SOLUTION
        # Convert to Variables if needed to support autograd
        if not isinstance(y_pred, Variable):
            if hasattr(y_pred, 'data'):
                y_pred = Variable(y_pred.data, requires_grad=True)
            else:
                y_pred = Variable(y_pred, requires_grad=True)

        if not isinstance(y_true, Variable):
            if hasattr(y_true, 'data'):
                y_true = Variable(y_true.data, requires_grad=False)
            else:
                y_true = Variable(y_true, requires_grad=False)

        # Unwrap raw ndarrays (Variable.data may be a Tensor or an ndarray)
        if hasattr(y_pred.data, 'data'):
            pred_data = y_pred.data.data
        else:
            pred_data = y_pred.data

        if hasattr(y_true.data, 'data'):
            true_data = y_true.data.data
        else:
            true_data = y_true.data

        # Promote a single sample to a 1-row batch
        if pred_data.ndim == 1:
            pred_data = pred_data.reshape(1, -1)

        # Numerically stable softmax: subtract the row max before exponentiating
        exp_pred = np.exp(pred_data - np.max(pred_data, axis=1, keepdims=True))
        softmax_pred = exp_pred / np.sum(exp_pred, axis=1, keepdims=True)

        # Clip away exact 0/1 to keep log() finite
        epsilon = 1e-15
        softmax_pred = np.clip(softmax_pred, epsilon, 1.0 - epsilon)

        # Handle class indices vs one-hot encoding
        if len(true_data.shape) == 1:
            # y_true contains class indices
            batch_size = true_data.shape[0]
            log_probs = np.log(softmax_pred[np.arange(batch_size), true_data.astype(int)])
            loss_value = -np.mean(log_probs)
            # One-hot form is needed later by the gradient function
            one_hot = np.zeros_like(softmax_pred)
            one_hot[np.arange(batch_size), true_data.astype(int)] = 1.0
        else:
            # y_true is one-hot encoded
            one_hot = true_data
            log_probs = np.log(softmax_pred)
            loss_value = -np.mean(np.sum(true_data * log_probs, axis=1))

        # Gradient function for the fused CrossEntropy + Softmax
        def crossentropy_grad_fn(grad_output):
            if y_pred.requires_grad:
                # d(loss)/d(logits) = (softmax - one_hot) / batch_size
                batch_size = softmax_pred.shape[0]
                grad_data = (softmax_pred - one_hot) / batch_size
                if hasattr(grad_output.data, 'data'):
                    final_grad = grad_data * grad_output.data.data
                else:
                    final_grad = grad_data * grad_output.data
                # NOTE(review): assumes Variable.backward accepts a seed-gradient
                # Variable — confirm against the autograd module.
                y_pred.backward(Variable(final_grad))

        loss = Variable(loss_value, requires_grad=y_pred.requires_grad, grad_fn=crossentropy_grad_fn)
        return loss
        ### END SOLUTION

    def forward(self, y_pred, y_true):
        """Alternative interface for forward pass."""
        return self.__call__(y_pred, y_true)

# Test function defined (called in main block)
# %% ../../modules/source/10_training/training_dev.ipynb 10
class BinaryCrossEntropyLoss:
    """
    Binary Cross-Entropy loss for binary classification.

    BCE = -y_true * log(sigmoid(y_pred)) - (1 - y_true) * log(1 - sigmoid(y_pred))

    Expects raw logits as predictions; sigmoid is applied internally using a
    numerically stable formulation. Returns a ``Variable`` whose
    ``.backward()`` uses the fused sigmoid+BCE gradient.
    """

    def __init__(self):
        """Initialize Binary CrossEntropy loss function (stateless)."""
        pass

    def __call__(self, y_pred, y_true):
        """
        Compute Binary CrossEntropy loss between predictions and targets.

        Args:
            y_pred: Model logits (Tensor or Variable, shape: [batch_size, 1] or [batch_size]).
            y_true: Binary labels (Tensor or Variable, same shape as y_pred).

        Returns:
            Scalar ``Variable`` that supports ``.backward()``.
        """
        ### BEGIN SOLUTION
        # Convert to Variables if needed to support autograd
        if not isinstance(y_pred, Variable):
            if hasattr(y_pred, 'data'):
                y_pred = Variable(y_pred.data, requires_grad=True)
            else:
                y_pred = Variable(y_pred, requires_grad=True)

        if not isinstance(y_true, Variable):
            if hasattr(y_true, 'data'):
                y_true = Variable(y_true.data, requires_grad=False)
            else:
                y_true = Variable(y_true, requires_grad=False)

        # Unwrap raw ndarrays and flatten to 1-D for elementwise BCE
        if hasattr(y_pred.data, 'data'):
            logits = y_pred.data.data.flatten()
        else:
            logits = y_pred.data.flatten()

        if hasattr(y_true.data, 'data'):
            labels = y_true.data.data.flatten()
        else:
            labels = y_true.data.flatten()

        # Numerically stable binary cross-entropy from logits
        def stable_bce_with_logits(logits, labels):
            # Stable formulation: max(x, 0) - x * y + log(1 + exp(-abs(x)))
            # (avoids overflow of exp for large |x|)
            stable_loss = np.maximum(logits, 0) - logits * labels + np.log(1 + np.exp(-np.abs(logits)))
            return stable_loss

        # Per-sample losses, averaged to a scalar
        losses = stable_bce_with_logits(logits, labels)
        mean_loss = np.mean(losses)

        # Sigmoid of the logits, needed by the gradient function
        sigmoid_pred = 1.0 / (1.0 + np.exp(-np.clip(logits, -250, 250)))  # Clipped for stability

        # Gradient function for the fused Binary CrossEntropy + Sigmoid
        def bce_grad_fn(grad_output):
            if y_pred.requires_grad:
                # d(loss)/d(logits) = (sigmoid - labels) / batch_size
                batch_size = len(labels)
                grad_data = (sigmoid_pred - labels) / batch_size

                # Reshape the flat gradient back to the original y_pred shape
                if hasattr(y_pred.data, 'data'):
                    original_shape = y_pred.data.data.shape
                else:
                    original_shape = y_pred.data.shape

                if len(original_shape) > 1:
                    grad_data = grad_data.reshape(original_shape)

                if hasattr(grad_output.data, 'data'):
                    final_grad = grad_data * grad_output.data.data
                else:
                    final_grad = grad_data * grad_output.data
                # NOTE(review): assumes Variable.backward accepts a seed-gradient
                # Variable — confirm against the autograd module.
                y_pred.backward(Variable(final_grad))

        loss = Variable(mean_loss, requires_grad=y_pred.requires_grad, grad_fn=bce_grad_fn)
        return loss
        ### END SOLUTION

    def forward(self, y_pred, y_true):
        """Alternative interface for forward pass."""
        return self.__call__(y_pred, y_true)

# Test function defined (called in main block)
# %% ../../modules/source/10_training/training_dev.ipynb 14
class Accuracy:
    """
    Accuracy metric for classification.

    Accuracy = (correct predictions) / (total predictions)

    Handles multi-class probability rows (argmax), binary scores
    (thresholded at 0.5), one-hot labels, and plain class indices.
    """

    def __init__(self):
        """Initialize Accuracy metric (stateless)."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
        """
        Compute accuracy between predictions and targets.

        Args:
            y_pred: Model predictions (shape: [batch_size, num_classes] or [batch_size]).
            y_true: True class labels (class indices or one-hot).

        Returns:
            Accuracy as a Python float in [0, 1].
        """
        ### BEGIN SOLUTION
        def as_class_indices(tensor, thresholded):
            # Reduce either input to a 1-D integer class-index array.
            values = tensor.data
            if len(values.shape) > 1 and values.shape[1] > 1:
                # Multi-class rows (probabilities or one-hot): take argmax
                return np.argmax(values, axis=1)
            flat = values.flatten()
            # Binary predictions are thresholded at 0.5; labels are cast directly
            return (flat > 0.5).astype(int) if thresholded else flat.astype(int)

        predicted = as_class_indices(y_pred, thresholded=True)
        expected = as_class_indices(y_true, thresholded=False)

        # Mean of the elementwise match is exactly correct/total
        return float(np.mean(predicted == expected))
        ### END SOLUTION

    def forward(self, y_pred: Tensor, y_true: Tensor) -> float:
        """Alternative interface for forward pass."""
        return self.__call__(y_pred, y_true)
""" def __init__(self, model, optimizer, loss_function, metrics=None): """ Initialize trainer with model and training components. Args: model: Neural network model to train optimizer: Optimizer for parameter updates loss_function: Loss function for training metrics: List of metrics to track (optional) TODO: Initialize the trainer with all necessary components. APPROACH: 1. Store model, optimizer, loss function, and metrics 2. Initialize history tracking for losses and metrics 3. Set up training state (epoch, step counters) 4. Prepare for training and validation loops EXAMPLE: model = Sequential([Dense(10, 5), ReLU(), Dense(5, 2)]) optimizer = Adam(model.parameters, learning_rate=0.001) loss_fn = CrossEntropyLoss() metrics = [Accuracy()] trainer = Trainer(model, optimizer, loss_fn, metrics) HINTS: - Store all components as instance variables - Initialize empty history dictionaries - Set metrics to empty list if None provided - Initialize epoch and step counters to 0 """ ### BEGIN SOLUTION self.model = model self.optimizer = optimizer self.loss_function = loss_function self.metrics = metrics or [] # Training history self.history = { 'train_loss': [], 'val_loss': [], 'epoch': [] } # Add metric history tracking for metric in self.metrics: metric_name = metric.__class__.__name__.lower() self.history[f'train_{metric_name}'] = [] self.history[f'val_{metric_name}'] = [] # Training state self.current_epoch = 0 self.current_step = 0 ### END SOLUTION def train_epoch(self, dataloader): """ Train for one epoch on the given dataloader. Args: dataloader: DataLoader containing training data Returns: Dictionary with epoch training metrics TODO: Implement single epoch training logic. STEP-BY-STEP IMPLEMENTATION: 1. Initialize epoch metrics tracking 2. Iterate through batches in dataloader 3. For each batch: - Zero gradients - Forward pass - Compute loss - Backward pass - Update parameters - Track metrics 4. 
Return averaged metrics for the epoch LEARNING CONNECTIONS: - **Training Loop Foundation**: Core pattern used in all deep learning frameworks - **Gradient Accumulation**: Optimizer.zero_grad() prevents gradient accumulation bugs - **Backpropagation**: loss.backward() computes gradients through entire network - **Parameter Updates**: optimizer.step() applies computed gradients to model weights HINTS: - Use optimizer.zero_grad() before each batch - Call loss.backward() for gradient computation - Use optimizer.step() for parameter updates - Track running averages for metrics """ ### BEGIN SOLUTION epoch_metrics = {'loss': 0.0} # Initialize metric tracking for metric in self.metrics: metric_name = metric.__class__.__name__.lower() epoch_metrics[metric_name] = 0.0 batch_count = 0 for batch_x, batch_y in dataloader: # Zero gradients self.optimizer.zero_grad() # Forward pass predictions = self.model(batch_x) # Compute loss loss = self.loss_function(predictions, batch_y) # Backward pass - now that loss functions support autograd! if hasattr(loss, 'backward'): loss.backward() # Update parameters self.optimizer.step() # Track metrics if hasattr(loss, 'data'): if hasattr(loss.data, 'data'): epoch_metrics['loss'] += loss.data.data # Variable with Tensor data else: epoch_metrics['loss'] += loss.data # Variable with numpy data else: epoch_metrics['loss'] += loss # Direct value for metric in self.metrics: metric_name = metric.__class__.__name__.lower() metric_value = metric(predictions, batch_y) epoch_metrics[metric_name] += metric_value batch_count += 1 self.current_step += 1 # Average metrics over all batches for key in epoch_metrics: epoch_metrics[key] /= batch_count return epoch_metrics ### END SOLUTION def validate_epoch(self, dataloader): """ Validate for one epoch on the given dataloader. Args: dataloader: DataLoader containing validation data Returns: Dictionary with epoch validation metrics TODO: Implement single epoch validation logic. STEP-BY-STEP IMPLEMENTATION: 1. 
Initialize epoch metrics tracking 2. Iterate through batches in dataloader 3. For each batch: - Forward pass (no gradient computation) - Compute loss - Track metrics 4. Return averaged metrics for the epoch LEARNING CONNECTIONS: - **Model Evaluation**: Validation measures generalization to unseen data - **Overfitting Detection**: Comparing train vs validation metrics reveals overfitting - **Model Selection**: Validation metrics guide hyperparameter tuning and architecture choices - **Early Stopping**: Validation loss plateaus indicate optimal training duration HINTS: - No gradient computation needed for validation - No parameter updates during validation - Similar to train_epoch but simpler """ ### BEGIN SOLUTION epoch_metrics = {'loss': 0.0} # Initialize metric tracking for metric in self.metrics: metric_name = metric.__class__.__name__.lower() epoch_metrics[metric_name] = 0.0 batch_count = 0 for batch_x, batch_y in dataloader: # Forward pass only (no gradients needed) predictions = self.model(batch_x) # Compute loss loss = self.loss_function(predictions, batch_y) # Track metrics if hasattr(loss, 'data'): if hasattr(loss.data, 'data'): epoch_metrics['loss'] += loss.data.data # Variable with Tensor data else: epoch_metrics['loss'] += loss.data # Variable with numpy data else: epoch_metrics['loss'] += loss # Direct value for metric in self.metrics: metric_name = metric.__class__.__name__.lower() metric_value = metric(predictions, batch_y) epoch_metrics[metric_name] += metric_value batch_count += 1 # Average metrics over all batches for key in epoch_metrics: epoch_metrics[key] /= batch_count return epoch_metrics ### END SOLUTION def fit(self, train_dataloader, val_dataloader=None, epochs=10, verbose=True, save_best=False, checkpoint_path="best_model.pkl"): """ Train the model for specified number of epochs. 
Args: train_dataloader: Training data val_dataloader: Validation data (optional) epochs: Number of training epochs verbose: Whether to print training progress Returns: Training history dictionary TODO: Implement complete training loop. STEP-BY-STEP IMPLEMENTATION: 1. Loop through epochs 2. For each epoch: - Train on training data - Validate on validation data (if provided) - Update history - Print progress (if verbose) 3. Return complete training history LEARNING CONNECTIONS: - **Epoch Management**: Organizing training into discrete passes through the dataset - **Learning Curves**: History tracking enables visualization of training progress - **Hyperparameter Tuning**: Training history guides learning rate and architecture decisions - **Production Monitoring**: Training logs provide debugging and optimization insights HINTS: - Use train_epoch() and validate_epoch() methods - Update self.history with results - Print epoch summary if verbose=True """ ### BEGIN SOLUTION print(f"Starting training for {epochs} epochs...") best_val_loss = float('inf') for epoch in range(epochs): self.current_epoch = epoch # Training phase train_metrics = self.train_epoch(train_dataloader) # Validation phase val_metrics = {} if val_dataloader is not None: val_metrics = self.validate_epoch(val_dataloader) # Update history self.history['epoch'].append(epoch) self.history['train_loss'].append(train_metrics['loss']) if val_dataloader is not None: self.history['val_loss'].append(val_metrics['loss']) # Update metric history for metric in self.metrics: metric_name = metric.__class__.__name__.lower() self.history[f'train_{metric_name}'].append(train_metrics[metric_name]) if val_dataloader is not None: self.history[f'val_{metric_name}'].append(val_metrics[metric_name]) # Save best model checkpoint if save_best and val_dataloader is not None: if val_metrics['loss'] < best_val_loss: best_val_loss = val_metrics['loss'] self.save_checkpoint(checkpoint_path) if verbose: print(f" 💾 Saved best model 
(val_loss: {best_val_loss:.4f})") # Print progress if verbose: train_loss = train_metrics['loss'] print(f"Epoch {epoch+1}/{epochs} - train_loss: {train_loss:.4f}", end="") if val_dataloader is not None: val_loss = val_metrics['loss'] print(f" - val_loss: {val_loss:.4f}", end="") for metric in self.metrics: metric_name = metric.__class__.__name__.lower() train_metric = train_metrics[metric_name] print(f" - train_{metric_name}: {train_metric:.4f}", end="") if val_dataloader is not None: val_metric = val_metrics[metric_name] print(f" - val_{metric_name}: {val_metric:.4f}", end="") print() # New line print("Training completed!") return self.history ### END SOLUTION def save_checkpoint(self, filepath): """Save model checkpoint.""" checkpoint = { 'epoch': self.current_epoch, 'model_state': self._get_model_state(), 'history': self.history } with open(filepath, 'wb') as f: pickle.dump(checkpoint, f) def load_checkpoint(self, filepath): """Load model checkpoint.""" with open(filepath, 'rb') as f: checkpoint = pickle.load(f) self.current_epoch = checkpoint['epoch'] self.history = checkpoint['history'] self._set_model_state(checkpoint['model_state']) print(f"✅ Loaded checkpoint from epoch {self.current_epoch}") def _get_model_state(self): """Extract model parameters.""" state = {} for i, layer in enumerate(self.model.layers): if hasattr(layer, 'weight'): state[f'layer_{i}_weight'] = layer.weight.data.copy() state[f'layer_{i}_bias'] = layer.bias.data.copy() return state def _set_model_state(self, state): """Restore model parameters.""" for i, layer in enumerate(self.model.layers): if hasattr(layer, 'weight'): layer.weight.data = state[f'layer_{i}_weight'] layer.bias.data = state[f'layer_{i}_bias'] # %% ../../modules/source/10_training/training_dev.ipynb 24 class TrainingPipelineProfiler: """ Production Training Pipeline Analysis and Optimization Monitors end-to-end training performance and identifies bottlenecks across the complete training infrastructure. 
""" def __init__(self, warning_threshold_seconds=5.0): """ Initialize training pipeline profiler. Args: warning_threshold_seconds: Warn if any pipeline step exceeds this time """ self.warning_threshold = warning_threshold_seconds self.profiling_data = defaultdict(list) self.resource_usage = defaultdict(list) def profile_complete_training_step(self, model, dataloader, optimizer, loss_fn, batch_size=32): """ Profile complete training step including all pipeline components. TODO: Implement comprehensive training step profiling. STEP-BY-STEP IMPLEMENTATION: 1. Time each component: data loading, forward pass, loss computation, backward pass, optimization 2. Monitor memory usage throughout the pipeline 3. Calculate throughput metrics (samples/second, batches/second) 4. Identify pipeline bottlenecks and optimization opportunities 5. Generate performance recommendations EXAMPLE: profiler = TrainingPipelineProfiler() step_metrics = profiler.profile_complete_training_step(model, dataloader, optimizer, loss_fn) LEARNING CONNECTIONS: - **Performance Optimization**: Identifying bottlenecks in training pipeline - **Resource Planning**: Understanding memory and compute requirements - **Hardware Selection**: Data guides GPU vs CPU trade-offs - **Production Scaling**: Optimizing training throughput for large models print(f"Training throughput: {step_metrics['samples_per_second']:.1f} samples/sec") HINTS: - Use time.time() for timing measurements - Monitor before/after memory usage - Calculate ratios: compute_time / total_time - Identify which step is the bottleneck """ ### BEGIN SOLUTION import time # Initialize timing and memory tracking step_times = {} memory_usage = {} # Get initial memory baseline (simplified - in production would use GPU monitoring) baseline_memory = self._estimate_memory_usage() # 1. 
Data Loading Phase data_start = time.time() try: batch_x, batch_y = next(iter(dataloader)) data_time = time.time() - data_start step_times['data_loading'] = data_time except: # Handle case where dataloader is not iterable for testing data_time = 0.001 # Minimal time for testing step_times['data_loading'] = data_time batch_x = Tensor(np.random.randn(batch_size, 10)) batch_y = Tensor(np.random.randint(0, 2, batch_size)) memory_usage['after_data_loading'] = self._estimate_memory_usage() # 2. Forward Pass Phase forward_start = time.time() try: predictions = model(batch_x) forward_time = time.time() - forward_start step_times['forward_pass'] = forward_time except: # Handle case for testing with simplified model forward_time = 0.002 step_times['forward_pass'] = forward_time predictions = Tensor(np.random.randn(batch_size, 2)) memory_usage['after_forward_pass'] = self._estimate_memory_usage() # 3. Loss Computation Phase loss_start = time.time() loss = loss_fn(predictions, batch_y) loss_time = time.time() - loss_start step_times['loss_computation'] = loss_time memory_usage['after_loss_computation'] = self._estimate_memory_usage() # 4. Backward Pass Phase (simplified for testing) backward_start = time.time() # In real implementation: loss.backward() backward_time = 0.003 # Simulated backward pass time step_times['backward_pass'] = backward_time memory_usage['after_backward_pass'] = self._estimate_memory_usage() # 5. 
Optimization Phase optimization_start = time.time() try: optimizer.step() optimization_time = time.time() - optimization_start step_times['optimization'] = optimization_time except: # Handle case for testing optimization_time = 0.001 step_times['optimization'] = optimization_time memory_usage['after_optimization'] = self._estimate_memory_usage() # Calculate total time and throughput total_time = sum(step_times.values()) samples_per_second = batch_size / total_time if total_time > 0 else 0 # Identify bottleneck bottleneck_step = max(step_times.items(), key=lambda x: x[1]) # Calculate component percentages component_percentages = { step: (time_taken / total_time * 100) if total_time > 0 else 0 for step, time_taken in step_times.items() } # Generate performance analysis performance_analysis = self._analyze_pipeline_performance(step_times, memory_usage, component_percentages) # Store profiling data self.profiling_data['total_time'].append(total_time) self.profiling_data['samples_per_second'].append(samples_per_second) self.profiling_data['bottleneck_step'].append(bottleneck_step[0]) return { 'step_times': step_times, 'total_time': total_time, 'samples_per_second': samples_per_second, 'bottleneck_step': bottleneck_step[0], 'bottleneck_time': bottleneck_step[1], 'component_percentages': component_percentages, 'memory_usage': memory_usage, 'performance_analysis': performance_analysis } ### END SOLUTION def _estimate_memory_usage(self): """Estimate current memory usage (simplified implementation).""" # In production: would use psutil.Process().memory_info().rss or GPU monitoring import sys return sys.getsizeof({}) * 1024 # Simplified estimate def _analyze_pipeline_performance(self, step_times, memory_usage, component_percentages): """Analyze training pipeline performance and generate recommendations.""" analysis = [] # Identify performance bottlenecks max_step = max(step_times.items(), key=lambda x: x[1]) if max_step[1] > self.warning_threshold: analysis.append(f"⚠️ 
BOTTLENECK: {max_step[0]} taking {max_step[1]:.3f}s (>{self.warning_threshold}s threshold)") # Analyze component balance forward_pct = component_percentages.get('forward_pass', 0) backward_pct = component_percentages.get('backward_pass', 0) data_pct = component_percentages.get('data_loading', 0) if data_pct > 30: analysis.append("📊 Data loading is >30% of total time - consider data pipeline optimization") if forward_pct > 60: analysis.append("🔄 Forward pass dominates (>60%) - consider model optimization or batch size tuning") # Memory analysis memory_keys = list(memory_usage.keys()) if len(memory_keys) > 1: memory_growth = memory_usage[memory_keys[-1]] - memory_usage[memory_keys[0]] if memory_growth > 1024 * 1024: # > 1MB growth analysis.append("💾 Significant memory growth during training step - monitor for memory leaks") return analysis # %% ../../modules/source/10_training/training_dev.ipynb 27 class ProductionTrainingOptimizer: """ Production Training Pipeline Optimization Optimizes training pipelines for production deployment with focus on throughput, resource utilization, and system stability. """ def __init__(self): """Initialize production training optimizer.""" self.optimization_history = [] self.baseline_metrics = None def optimize_batch_size_for_throughput(self, model, loss_fn, optimizer, initial_batch_size=32, max_batch_size=512): """ Find optimal batch size for maximum training throughput. TODO: Implement batch size optimization for production throughput. STEP-BY-STEP IMPLEMENTATION: 1. Test range of batch sizes from initial to maximum 2. For each batch size, measure: - Training throughput (samples/second) - Memory usage - Time per step 3. Find optimal batch size balancing throughput and memory 4. Handle memory limitations gracefully 5. 
Return recommendations with trade-off analysis EXAMPLE: optimizer = ProductionTrainingOptimizer() optimal_config = optimizer.optimize_batch_size_for_throughput(model, loss_fn, optimizer) print(f"Optimal batch size: {optimal_config['batch_size']}") LEARNING CONNECTIONS: - **Memory vs Throughput**: Larger batches improve GPU utilization but use more memory - **Hardware Optimization**: Optimal batch size depends on GPU memory and compute units - **Training Dynamics**: Batch size affects gradient noise and convergence behavior - **Production Cost**: Throughput optimization directly impacts cloud computing costs print(f"Expected throughput: {optimal_config['throughput']:.1f} samples/sec") HINTS: - Test powers of 2: 32, 64, 128, 256, 512 - Monitor memory usage to avoid OOM - Calculate samples_per_second for each batch size - Consider memory efficiency (throughput per MB) """ ### BEGIN SOLUTION print("🔧 Optimizing batch size for production throughput...") # Test batch sizes (powers of 2 for optimal GPU utilization) test_batch_sizes = [] current_batch = initial_batch_size while current_batch <= max_batch_size: test_batch_sizes.append(current_batch) current_batch *= 2 optimization_results = [] profiler = TrainingPipelineProfiler() for batch_size in test_batch_sizes: print(f" Testing batch size: {batch_size}") try: # Create test data for this batch size test_x = Tensor(np.random.randn(batch_size, 10)) test_y = Tensor(np.random.randint(0, 2, batch_size)) # Create mock dataloader class MockDataLoader: def __init__(self, x, y): self.x, self.y = x, y def __iter__(self): return self def __next__(self): return self.x, self.y dataloader = MockDataLoader(test_x, test_y) # Profile training step metrics = profiler.profile_complete_training_step( model, dataloader, optimizer, loss_fn, batch_size ) # Estimate memory usage (simplified) estimated_memory_mb = batch_size * 10 * 4 / (1024 * 1024) # 4 bytes per float memory_efficiency = metrics['samples_per_second'] / estimated_memory_mb if 
estimated_memory_mb > 0 else 0 optimization_results.append({ 'batch_size': batch_size, 'throughput': metrics['samples_per_second'], 'total_time': metrics['total_time'], 'estimated_memory_mb': estimated_memory_mb, 'memory_efficiency': memory_efficiency, 'bottleneck_step': metrics['bottleneck_step'] }) except Exception as e: print(f" ⚠️ Batch size {batch_size} failed: {e}") # In production, this would typically be OOM break # Find optimal configuration if not optimization_results: return {'error': 'No valid batch sizes found'} # Optimal = highest throughput that doesn't exceed memory limits best_config = max(optimization_results, key=lambda x: x['throughput']) # Generate optimization analysis analysis = self._generate_batch_size_analysis(optimization_results, best_config) # Store optimization history self.optimization_history.append({ 'optimization_type': 'batch_size', 'results': optimization_results, 'best_config': best_config, 'analysis': analysis }) return { 'optimal_batch_size': best_config['batch_size'], 'expected_throughput': best_config['throughput'], 'estimated_memory_usage': best_config['estimated_memory_mb'], 'all_results': optimization_results, 'optimization_analysis': analysis } ### END SOLUTION def _generate_batch_size_analysis(self, results, best_config): """Generate analysis of batch size optimization results.""" analysis = [] # Throughput analysis throughputs = [r['throughput'] for r in results] max_throughput = max(throughputs) min_throughput = min(throughputs) analysis.append(f"📈 Throughput range: {min_throughput:.1f} - {max_throughput:.1f} samples/sec") analysis.append(f"🎯 Optimal batch size: {best_config['batch_size']} ({max_throughput:.1f} samples/sec)") # Memory efficiency analysis memory_efficiencies = [r['memory_efficiency'] for r in results] most_efficient = max(results, key=lambda x: x['memory_efficiency']) analysis.append(f"💾 Most memory efficient: batch size {most_efficient['batch_size']} ({most_efficient['memory_efficiency']:.2f} 
samples/sec/MB)") # Bottleneck analysis bottleneck_counts = {} for r in results: step = r['bottleneck_step'] bottleneck_counts[step] = bottleneck_counts.get(step, 0) + 1 common_bottleneck = max(bottleneck_counts.items(), key=lambda x: x[1]) analysis.append(f"🔍 Common bottleneck: {common_bottleneck[0]} ({common_bottleneck[1]}/{len(results)} configurations)") return analysis