""" Checkpoint 11: Regularization (After Module 12 - Regularization) Question: "Can I prevent overfitting and build robust models?" """ import numpy as np import pytest def test_checkpoint_11_regularization(): """ Checkpoint 11: Regularization Validates that students can apply regularization techniques to prevent overfitting and build models that generalize well to unseen data - essential for practical machine learning applications. """ print("\nšŸ›”ļø Checkpoint 11: Regularization") print("=" * 50) try: from tinytorch.core.tensor import Tensor from tinytorch.core.layers import Dense from tinytorch.core.activations import ReLU from tinytorch.core.regularization import Dropout, L1Regularization, L2Regularization from tinytorch.core.losses import MeanSquaredError from tinytorch.core.optimizers import Adam except ImportError as e: pytest.fail(f"āŒ Cannot import required classes - complete Modules 2-12 first: {e}") # Test 1: Dropout for generalization print("šŸŽ­ Testing dropout...") dropout = Dropout(p=0.5) # Create test data input_data = Tensor(np.ones((10, 20))) # All ones for predictable testing # Training mode (should drop some neurons) if hasattr(dropout, 'training'): dropout.training = True dropped_output = dropout(input_data) # Check that some values are zeroed num_zeros = np.sum(dropped_output.data == 0) total_elements = dropped_output.data.size dropout_rate = num_zeros / total_elements # Should drop approximately 50% (with some variance) assert dropout_rate > 0.3 and dropout_rate < 0.7, f"Dropout rate should be ~0.5, got {dropout_rate:.3f}" print(f"āœ… Dropout training: {dropout_rate:.3f} dropout rate") # Inference mode (should keep all values) if hasattr(dropout, 'training'): dropout.training = False inference_output = dropout(input_data) # In inference, should scale but not drop if hasattr(dropout, 'training'): # Proper dropout scales by 1/(1-p) in training or keeps values in inference assert not np.any(inference_output.data == 0), "Inference mode should not drop neurons" print(f"āœ… Dropout inference: no neurons dropped") else: print(f"āš ļø Dropout mode switching not implemented") # Test 2: L2 Regularization (Weight Decay) print("āš–ļø Testing L2 regularization...") # Create model with large weights model = Dense(5, 3) model.weights.data = np.random.randn(5, 3) * 2 # Larger weights model.bias.data = np.random.randn(3) * 2 model.weights.requires_grad = True model.bias.requires_grad = True l2_reg = L2Regularization(lambda_reg=0.01) loss_fn = MeanSquaredError() # Test data X = Tensor(np.random.randn(4, 5)) y = Tensor(np.random.randn(4, 3)) # Forward pass with regularization pred = model(X) base_loss = loss_fn(pred, y) reg_loss = l2_reg(model.weights) total_loss = base_loss + reg_loss # L2 regularization should add penalty for large weights assert reg_loss.data > 0, f"L2 regularization should add positive penalty, got {reg_loss.data}" assert total_loss.data > base_loss.data, "Total loss should be larger than base loss" print(f"āœ… L2 regularization: base={base_loss.data:.4f}, penalty={reg_loss.data:.4f}") # Test 3: L1 Regularization (Sparsity) print("šŸ“‰ Testing L1 regularization...") l1_reg = L1Regularization(lambda_reg=0.01) l1_penalty = l1_reg(model.weights) # L1 should encourage sparsity assert l1_penalty.data > 0, f"L1 regularization should add positive penalty, got {l1_penalty.data}" print(f"āœ… L1 regularization: sparsity penalty={l1_penalty.data:.4f}") # Test 4: Regularized training print("šŸŽÆ Testing regularized training...") # Create overfitting scenario (small dataset, complex model) np.random.seed(42) X_small = np.random.randn(20, 10) # Only 20 samples y_small = np.random.randn(20, 1) # Complex model (prone to overfitting) model_reg = [ Dense(10, 50), ReLU(), Dropout(p=0.3), Dense(50, 50), ReLU(), Dropout(p=0.3), Dense(50, 1) ] # Set requires_grad for all layers for layer in model_reg: if hasattr(layer, 'weights'): layer.weights.requires_grad = True layer.bias.requires_grad = True if hasattr(layer, 'training'): layer.training = True # Collect parameters params = [] for layer in model_reg: if hasattr(layer, 'weights'): params.extend([layer.weights, layer.bias]) optimizer = Adam(params, lr=0.01) l2_regularizer = L2Regularization(lambda_reg=0.001) # Training with regularization reg_losses = [] for epoch in range(5): X_tensor = Tensor(X_small) y_tensor = Tensor(y_small) # Forward pass x = X_tensor for layer in model_reg: x = layer(x) # Loss with regularization base_loss = loss_fn(x, y_tensor) reg_penalty = sum(l2_regularizer(layer.weights) for layer in model_reg if hasattr(layer, 'weights')) total_loss = base_loss + reg_penalty reg_losses.append(total_loss.data.item() if hasattr(total_loss.data, 'item') else float(total_loss.data)) total_loss.backward() optimizer.step() optimizer.zero_grad() print(f"āœ… Regularized training: {len(reg_losses)} epochs with dropout + L2") # Test 5: Generalization gap print("šŸ“Š Testing generalization...") # Create train/test split np.random.seed(123) X_full = np.random.randn(100, 8) y_full = X_full[:, 0] + 0.5 * X_full[:, 1] + 0.1 * np.random.randn(100) y_full = y_full.reshape(-1, 1) split = 70 X_train, X_test = X_full[:split], X_full[split:] y_train, y_test = y_full[:split], y_full[split:] # Train regularized model gen_model = Dense(8, 1) gen_model.weights.requires_grad = True gen_model.bias.requires_grad = True gen_optimizer = Adam([gen_model.weights, gen_model.bias], lr=0.01) gen_l2 = L2Regularization(lambda_reg=0.01) train_losses = [] test_losses = [] for epoch in range(10): # Training X_train_tensor = Tensor(X_train) y_train_tensor = Tensor(y_train) pred_train = gen_model(X_train_tensor) loss_train = loss_fn(pred_train, y_train_tensor) + gen_l2(gen_model.weights) loss_train.backward() gen_optimizer.step() gen_optimizer.zero_grad() train_losses.append(loss_train.data.item() if hasattr(loss_train.data, 'item') else float(loss_train.data)) # Testing (no regularization in evaluation) X_test_tensor = Tensor(X_test) y_test_tensor = Tensor(y_test) pred_test = gen_model(X_test_tensor) loss_test = loss_fn(pred_test, y_test_tensor) test_losses.append(loss_test.data.item() if hasattr(loss_test.data, 'item') else float(loss_test.data)) # Check generalization final_gap = test_losses[-1] - train_losses[-1] print(f"āœ… Generalization: train={train_losses[-1]:.4f}, test={test_losses[-1]:.4f}, gap={final_gap:.4f}") # Test 6: Early stopping concept print("ā° Testing early stopping concept...") # Simulate early stopping by tracking validation loss val_losses = test_losses # Use test as validation for this demo # Find best epoch (lowest validation loss) best_epoch = np.argmin(val_losses) best_val_loss = val_losses[best_epoch] # Check if we can detect optimal stopping point if best_epoch < len(val_losses) - 2: # Not the last epoch print(f"āœ… Early stopping: optimal at epoch {best_epoch}, val_loss={best_val_loss:.4f}") else: print(f"āœ… Early stopping: training could continue, best val_loss={best_val_loss:.4f}") # Test 7: Model complexity vs performance print("šŸ—ļø Testing model complexity trade-offs...") # Compare simple vs complex models simple_model = Dense(8, 1) complex_model = [ Dense(8, 32), ReLU(), Dense(32, 16), ReLU(), Dense(16, 1) ] # Set requires_grad simple_model.weights.requires_grad = True simple_model.bias.requires_grad = True for layer in complex_model: if hasattr(layer, 'weights'): layer.weights.requires_grad = True layer.bias.requires_grad = True # Train simple model simple_opt = Adam([simple_model.weights, simple_model.bias], lr=0.01) X_tensor = Tensor(X_train) y_tensor = Tensor(y_train) for _ in range(5): pred = simple_model(X_tensor) loss = loss_fn(pred, y_tensor) loss.backward() simple_opt.step() simple_opt.zero_grad() # Evaluate simple model simple_test_pred = simple_model(Tensor(X_test)) simple_test_loss = loss_fn(simple_test_pred, Tensor(y_test)) print(f"āœ… Complexity: simple model test_loss={simple_test_loss.data:.4f}") # Test 8: Regularization strength effects print("šŸ’Ŗ Testing regularization strength...") # Test different L2 strengths strengths = [0.001, 0.01, 0.1] strength_results = [] for strength in strengths: temp_model = Dense(5, 1) temp_model.weights.requires_grad = True temp_model.bias.requires_grad = True temp_opt = Adam([temp_model.weights, temp_model.bias], lr=0.01) temp_l2 = L2Regularization(lambda_reg=strength) # Quick training X_temp = Tensor(np.random.randn(10, 5)) y_temp = Tensor(np.random.randn(10, 1)) for _ in range(3): pred = temp_model(X_temp) loss = loss_fn(pred, y_temp) + temp_l2(temp_model.weights) loss.backward() temp_opt.step() temp_opt.zero_grad() # Check weight magnitude weight_norm = np.linalg.norm(temp_model.weights.data) strength_results.append(weight_norm) # Higher regularization should lead to smaller weights assert strength_results[2] < strength_results[0], "Higher L2 should produce smaller weights" print(f"āœ… Regularization strength: {strengths} → weight norms {[f'{r:.3f}' for r in strength_results]}") print("\nšŸŽ‰ Regularization Complete!") print("šŸ“ You can now prevent overfitting and build robust models") print("šŸ”§ Built capabilities: Dropout, L1/L2 regularization, early stopping, complexity control") print("🧠 Breakthrough: You can now build models that generalize to real-world data!") print("šŸŽÆ Next: Add high-performance computational kernels") if __name__ == "__main__": test_checkpoint_11_regularization()