"""
|
|
Checkpoint 11: Regularization (After Module 12 - Regularization)
|
|
Question: "Can I prevent overfitting and build robust models?"
|
|
"""
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
def test_checkpoint_11_regularization():
    """
    Checkpoint 11: Regularization

    Validates that students can apply regularization techniques to prevent
    overfitting and build models that generalize well to unseen data -
    essential for practical machine learning applications.
    """
    print("\n🛡️ Checkpoint 11: Regularization")
    print("=" * 50)

    try:
        from tinytorch.core.tensor import Tensor
        from tinytorch.core.layers import Dense
        from tinytorch.core.activations import ReLU
        from tinytorch.core.regularization import Dropout, L1Regularization, L2Regularization
        from tinytorch.core.losses import MeanSquaredError
        from tinytorch.core.optimizers import Adam
    except ImportError as e:
        pytest.fail(f"❌ Cannot import required classes - complete Modules 2-12 first: {e}")

    # Test 1: Dropout for generalization
    print("🎭 Testing dropout...")

    dropout = Dropout(p=0.5)

    # Create test data
    input_data = Tensor(np.ones((10, 20)))  # All ones for predictable testing

    # Training mode (should drop some neurons)
    if hasattr(dropout, 'training'):
        dropout.training = True

    dropped_output = dropout(input_data)

    # Check that some values are zeroed
    num_zeros = np.sum(dropped_output.data == 0)
    total_elements = dropped_output.data.size
    dropout_rate = num_zeros / total_elements

    # Should drop approximately 50% (with some variance)
    assert 0.3 < dropout_rate < 0.7, f"Dropout rate should be ~0.5, got {dropout_rate:.3f}"
    print(f"✅ Dropout training: {dropout_rate:.3f} dropout rate")

    # Inference mode (should keep all values)
    if hasattr(dropout, 'training'):
        dropout.training = False

    inference_output = dropout(input_data)

    # In inference, values should pass through without being dropped
    if hasattr(dropout, 'training'):
        # Proper dropout scales surviving values by 1/(1-p) during training and keeps all values at inference
        assert not np.any(inference_output.data == 0), "Inference mode should not drop neurons"
        print("✅ Dropout inference: no neurons dropped")
    else:
        print("⚠️ Dropout mode switching not implemented")

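    # Note: under the common "inverted dropout" convention, surviving activations
    # are scaled by 1/(1 - p) during training so that their expectation matches
    # inference, where nothing is dropped or rescaled. With p=0.5 and an all-ones
    # input, surviving training entries would then appear as ~2.0. Whether
    # TinyTorch's Dropout follows this convention is an implementation detail,
    # which is why the checks above only look at zero/non-zero patterns.
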
    # Test 2: L2 Regularization (Weight Decay)
    print("⚖️ Testing L2 regularization...")

    # Create model with large weights
    model = Dense(5, 3)
    model.weights.data = np.random.randn(5, 3) * 2  # Larger weights
    model.bias.data = np.random.randn(3) * 2
    model.weights.requires_grad = True
    model.bias.requires_grad = True

    l2_reg = L2Regularization(lambda_reg=0.01)
    loss_fn = MeanSquaredError()

    # Test data
    X = Tensor(np.random.randn(4, 5))
    y = Tensor(np.random.randn(4, 3))

    # Forward pass with regularization
    pred = model(X)
    base_loss = loss_fn(pred, y)
    reg_loss = l2_reg(model.weights)
    total_loss = base_loss + reg_loss

    # L2 regularization should add penalty for large weights
    assert reg_loss.data > 0, f"L2 regularization should add positive penalty, got {reg_loss.data}"
    assert total_loss.data > base_loss.data, "Total loss should be larger than base loss"
    print(f"✅ L2 regularization: base={base_loss.data:.4f}, penalty={reg_loss.data:.4f}")

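    # Note: the usual convention is penalty = lambda * sum(w**2); assuming
    # TinyTorch's L2Regularization follows it, the value printed above should be
    # close to 0.01 * np.sum(model.weights.data ** 2). The exact scaling (a 1/2
    # factor, or a mean instead of a sum) depends on the module's implementation,
    # so this is a reference point rather than something the test asserts.
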
    # Test 3: L1 Regularization (Sparsity)
    print("📉 Testing L1 regularization...")

    l1_reg = L1Regularization(lambda_reg=0.01)
    l1_penalty = l1_reg(model.weights)

    # L1 should encourage sparsity
    assert l1_penalty.data > 0, f"L1 regularization should add positive penalty, got {l1_penalty.data}"
    print(f"✅ L1 regularization: sparsity penalty={l1_penalty.data:.4f}")

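    # Note: L1 typically penalizes lambda * sum(|w|). Its gradient has constant
    # magnitude (lambda * sign(w)) no matter how small a weight is, so it keeps
    # pushing small weights all the way to exactly zero; this is why L1 yields
    # sparse solutions, while the L2 gradient (proportional to w) only shrinks
    # weights toward zero without zeroing them out.
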
    # Test 4: Regularized training
    print("🎯 Testing regularized training...")

    # Create overfitting scenario (small dataset, complex model)
    np.random.seed(42)
    X_small = np.random.randn(20, 10)  # Only 20 samples
    y_small = np.random.randn(20, 1)

    # Complex model (prone to overfitting)
    model_reg = [
        Dense(10, 50),
        ReLU(),
        Dropout(p=0.3),
        Dense(50, 50),
        ReLU(),
        Dropout(p=0.3),
        Dense(50, 1)
    ]

    # Set requires_grad for all layers
    for layer in model_reg:
        if hasattr(layer, 'weights'):
            layer.weights.requires_grad = True
            layer.bias.requires_grad = True
        if hasattr(layer, 'training'):
            layer.training = True

    # Collect parameters
    params = []
    for layer in model_reg:
        if hasattr(layer, 'weights'):
            params.extend([layer.weights, layer.bias])

    optimizer = Adam(params, lr=0.01)
    l2_regularizer = L2Regularization(lambda_reg=0.001)

    # Training with regularization
    reg_losses = []
    for epoch in range(5):
        X_tensor = Tensor(X_small)
        y_tensor = Tensor(y_small)

        # Forward pass
        x = X_tensor
        for layer in model_reg:
            x = layer(x)

        # Loss with regularization
        base_loss = loss_fn(x, y_tensor)
        reg_penalty = sum(l2_regularizer(layer.weights) for layer in model_reg if hasattr(layer, 'weights'))
        total_loss = base_loss + reg_penalty

        reg_losses.append(total_loss.data.item() if hasattr(total_loss.data, 'item') else float(total_loss.data))

        total_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"✅ Regularized training: {len(reg_losses)} epochs with dropout + L2")

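    # Note: in this loop regularization enters training only through the loss the
    # optimizer differentiates (total_loss = data loss + summed L2 penalties), so
    # weight decay is applied implicitly via the penalty's gradient at each Adam
    # step; no separate decay term is passed to the optimizer here.
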
    # Test 5: Generalization gap
    print("📊 Testing generalization...")

    # Create train/test split
    np.random.seed(123)
    X_full = np.random.randn(100, 8)
    y_full = X_full[:, 0] + 0.5 * X_full[:, 1] + 0.1 * np.random.randn(100)
    y_full = y_full.reshape(-1, 1)

    split = 70
    X_train, X_test = X_full[:split], X_full[split:]
    y_train, y_test = y_full[:split], y_full[split:]

    # Train regularized model
    gen_model = Dense(8, 1)
    gen_model.weights.requires_grad = True
    gen_model.bias.requires_grad = True

    gen_optimizer = Adam([gen_model.weights, gen_model.bias], lr=0.01)
    gen_l2 = L2Regularization(lambda_reg=0.01)

    train_losses = []
    test_losses = []

    for epoch in range(10):
        # Training
        X_train_tensor = Tensor(X_train)
        y_train_tensor = Tensor(y_train)
        pred_train = gen_model(X_train_tensor)
        loss_train = loss_fn(pred_train, y_train_tensor) + gen_l2(gen_model.weights)

        loss_train.backward()
        gen_optimizer.step()
        gen_optimizer.zero_grad()

        train_losses.append(loss_train.data.item() if hasattr(loss_train.data, 'item') else float(loss_train.data))

        # Testing (no regularization in evaluation)
        X_test_tensor = Tensor(X_test)
        y_test_tensor = Tensor(y_test)
        pred_test = gen_model(X_test_tensor)
        loss_test = loss_fn(pred_test, y_test_tensor)

        test_losses.append(loss_test.data.item() if hasattr(loss_test.data, 'item') else float(loss_test.data))

    # Check generalization
    final_gap = test_losses[-1] - train_losses[-1]
    print(f"✅ Generalization: train={train_losses[-1]:.4f}, test={test_losses[-1]:.4f}, gap={final_gap:.4f}")

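    # Note: the generalization gap is simply test loss minus train loss. A small,
    # stable gap suggests the model captures the underlying pattern; a gap that
    # keeps growing while training loss falls is the classic signature of
    # overfitting. Here the recorded train loss also includes the L2 penalty
    # while the test loss does not, so the reported gap can even be negative.
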
    # Test 6: Early stopping concept
    print("⏰ Testing early stopping concept...")

    # Simulate early stopping by tracking validation loss
    val_losses = test_losses  # Use test as validation for this demo

    # Find best epoch (lowest validation loss)
    best_epoch = np.argmin(val_losses)
    best_val_loss = val_losses[best_epoch]

    # Check whether the optimal stopping point appeared before the final two epochs
    if best_epoch < len(val_losses) - 2:
        print(f"✅ Early stopping: optimal at epoch {best_epoch}, val_loss={best_val_loss:.4f}")
    else:
        print(f"✅ Early stopping: training could continue, best val_loss={best_val_loss:.4f}")

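    # Sketch (illustrative plain Python, not a TinyTorch API): patience-based
    # early stopping tracks the best validation loss seen so far and halts once
    # it has not improved for `patience` consecutive epochs:
    #
    #   best, wait, patience = float("inf"), 0, 3
    #   for epoch, v in enumerate(val_losses):
    #       if v < best:
    #           best, wait = v, 0      # improvement: reset the counter
    #       else:
    #           wait += 1              # no improvement this epoch
    #       if wait >= patience:
    #           break                  # stop and keep weights from the best epoch
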
    # Test 7: Model complexity vs performance
    print("🏗️ Testing model complexity trade-offs...")

    # Compare simple vs complex models
    simple_model = Dense(8, 1)
    complex_model = [
        Dense(8, 32),
        ReLU(),
        Dense(32, 16),
        ReLU(),
        Dense(16, 1)
    ]

    # Set requires_grad
    simple_model.weights.requires_grad = True
    simple_model.bias.requires_grad = True

    for layer in complex_model:
        if hasattr(layer, 'weights'):
            layer.weights.requires_grad = True
            layer.bias.requires_grad = True

    # Train simple model
    simple_opt = Adam([simple_model.weights, simple_model.bias], lr=0.01)

    X_tensor = Tensor(X_train)
    y_tensor = Tensor(y_train)

    for _ in range(5):
        pred = simple_model(X_tensor)
        loss = loss_fn(pred, y_tensor)
        loss.backward()
        simple_opt.step()
        simple_opt.zero_grad()

    # Evaluate simple model
    simple_test_pred = simple_model(Tensor(X_test))
    simple_test_loss = loss_fn(simple_test_pred, Tensor(y_test))

    print(f"✅ Complexity: simple model test_loss={simple_test_loss.data:.4f}")

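    # Note: only the simple model is actually trained here; complex_model is
    # constructed to make the capacity contrast concrete. The deeper stack has on
    # the order of 800 parameters versus 9 for the linear model, so with only 70
    # training samples it would be far more prone to memorizing the data unless
    # dropout or weight decay were applied.
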
    # Test 8: Regularization strength effects
    print("💪 Testing regularization strength...")

    # Test different L2 strengths
    strengths = [0.001, 0.01, 0.1]
    strength_results = []

    for strength in strengths:
        temp_model = Dense(5, 1)
        temp_model.weights.requires_grad = True
        temp_model.bias.requires_grad = True

        temp_opt = Adam([temp_model.weights, temp_model.bias], lr=0.01)
        temp_l2 = L2Regularization(lambda_reg=strength)

        # Quick training
        X_temp = Tensor(np.random.randn(10, 5))
        y_temp = Tensor(np.random.randn(10, 1))

        for _ in range(3):
            pred = temp_model(X_temp)
            loss = loss_fn(pred, y_temp) + temp_l2(temp_model.weights)
            loss.backward()
            temp_opt.step()
            temp_opt.zero_grad()

        # Check weight magnitude
        weight_norm = np.linalg.norm(temp_model.weights.data)
        strength_results.append(weight_norm)

    # Higher regularization should lead to smaller weights
    assert strength_results[2] < strength_results[0], "Higher L2 should produce smaller weights"
    print(f"✅ Regularization strength: {strengths} → weight norms {[f'{r:.3f}' for r in strength_results]}")

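    # Note: the L2 gradient is proportional to lambda * w, so every optimizer step
    # pulls the weights toward zero with a force that grows with lambda. After the
    # same number of steps, the lambda=0.1 model should therefore end with a
    # smaller weight norm than the lambda=0.001 model, which is what the assertion
    # checks. With only 3 steps and random inits the margin can be small, so the
    # check compares the two extremes rather than requiring strict monotonicity.
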
print("\n🎉 Regularization Complete!")
|
|
print("📝 You can now prevent overfitting and build robust models")
|
|
print("🔧 Built capabilities: Dropout, L1/L2 regularization, early stopping, complexity control")
|
|
print("🧠 Breakthrough: You can now build models that generalize to real-world data!")
|
|
print("🎯 Next: Add high-performance computational kernels")
|
|
|
|
if __name__ == "__main__":
|
|
test_checkpoint_11_regularization() |