cs249r_book/tinytorch/tests/integration/test_gradients.py

#!/usr/bin/env python
"""
Gradient Flow Validation Tests for TinyTorch
=============================================
Ensures gradients propagate correctly through all architectures.
Critical for verifying that models can actually learn.

Test Categories:
- Gradient existence through deep networks
- Gradient magnitude (not vanishing/exploding)
- Chain rule validation
- Gradient accumulation
- Optimizer parameter updates
"""

import sys
import os
import numpy as np
import pytest

# Add project root to path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
sys.path.insert(0, project_root)

from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Linear
from tinytorch.core.activations import ReLU, Sigmoid, Tanh
from tinytorch.core.losses import MSELoss, CrossEntropyLoss
from tinytorch.core.optimizers import SGD, Adam
from tinytorch.core.spatial import Conv2d
from tinytorch.core.transformers import TransformerBlock

class Sequential:
    """Simple sequential container for testing."""
    def __init__(self, layers):
        self.layers = layers
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    def parameters(self):
        params = []
        for layer in self.layers:
            if hasattr(layer, 'parameters'):
                params.extend(layer.parameters())
        return params

class F:
    """Functional interface for testing."""
    @staticmethod
    def relu(x):
        from tinytorch.core.activations import ReLU
        return ReLU()(x)
    @staticmethod
    def max_pool2d(x, kernel_size):
        from tinytorch.core.spatial import MaxPool2d
        return MaxPool2d(kernel_size)(x)
    @staticmethod
    def flatten(x, start_dim=1):
        import numpy as np
        shape = x.shape
        new_shape = shape[:start_dim] + (np.prod(shape[start_dim:]),)
        return x.reshape(*new_shape)


# ============== Gradient Existence Tests ==============

def test_gradient_exists_single_layer():
    """Gradients exist after backward through single layer."""
    layer = Linear(10, 5)
    # Create optimizer to enable requires_grad on layer parameters (real usage pattern)
    optimizer = SGD(layer.parameters(), lr=0.01)

    x = Tensor(np.random.randn(3, 10))
    y_true = Tensor(np.random.randn(3, 5))

    y_pred = layer(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    assert layer.weight.grad is not None, "No gradient for weights"
    assert layer.bias.grad is not None, "No gradient for bias"


def test_gradient_exists_deep_network():
    """Gradients flow through deep network (5 layers)."""
    model = Sequential([
        Linear(10, 20),
        ReLU(),
        Linear(20, 20),
        ReLU(),
        Linear(20, 20),
        ReLU(),
        Linear(20, 20),
        ReLU(),
        Linear(20, 5)
    ])

    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(model.parameters(), lr=0.01)

    x = Tensor(np.random.randn(4, 10))
    y_true = Tensor(np.random.randn(4, 5))

    y_pred = model(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    # Check first and last layers have gradients
    first_layer = model.layers[0]
    last_layer = model.layers[-1]
    assert first_layer.weight.grad is not None, "No gradient in first layer"
    assert last_layer.weight.grad is not None, "No gradient in last layer"


def test_gradient_exists_cnn():
    """Gradients flow through CNN architecture."""
    class SimpleCNN:
        def __init__(self):
            self.conv1 = Conv2d(1, 16, kernel_size=3)
            self.conv2 = Conv2d(16, 32, kernel_size=3)
            self.fc = Linear(32 * 5 * 5, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.max_pool2d(x, 2)
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2)
            x = F.flatten(x, start_dim=1)
            return self.fc(x)

        def parameters(self):
            params = []
            for layer in [self.conv1, self.conv2, self.fc]:
                if hasattr(layer, 'parameters'):
                    params.extend(layer.parameters())
            return params

    model = SimpleCNN()
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(model.parameters(), lr=0.01)

    x = Tensor(np.random.randn(2, 1, 28, 28))
    y_true = Tensor(np.random.randn(2, 10))

    y_pred = model.forward(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    assert model.conv1.weight.grad is not None, "No gradient in conv1"
    assert model.fc.weight.grad is not None, "No gradient in fc layer"


# ============== Gradient Magnitude Tests ==============

def test_gradient_not_vanishing():
    """Gradients don't vanish in deep network."""
    # Build deep network prone to vanishing gradients
    layers = []
    for i in range(10):
        layers.append(Linear(20, 20))
        layers.append(Sigmoid())  # Sigmoid can cause vanishing gradients
    layers.append(Linear(20, 1))

    model = Sequential(layers)
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(model.parameters(), lr=0.01)

    x = Tensor(np.random.randn(5, 20))
    y_true = Tensor(np.random.randn(5, 1))

    y_pred = model(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    first_layer = model.layers[0]
    assert first_layer.weight.grad is not None, "No gradient in first layer"
    grad_magnitude = np.abs(first_layer.weight.grad.data).mean()
    assert grad_magnitude > 1e-8, f"Gradient vanished: {grad_magnitude}"


def test_gradient_not_exploding():
    """Gradients don't explode in deep network."""
    # Use fixed seed for reproducibility
    np.random.seed(123)

    # Build network that could have exploding gradients
    layers = []
    for i in range(5):
        layers.append(Linear(20, 20))
        layers.append(ReLU())
    layers.append(Linear(20, 1))

    model = Sequential(layers)
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(model.parameters(), lr=0.01)

    # Use standard initialization (Xavier scale)
    # Note: Previous test used * 2.0 which could cause explosion
    for layer in model.layers:
        if hasattr(layer, 'weight'):
            scale = np.sqrt(2.0 / layer.weight.shape[0])  # He initialization
            layer.weight.data = np.random.randn(*layer.weight.shape) * scale

    x = Tensor(np.random.randn(5, 20))
    y_true = Tensor(np.random.randn(5, 1))

    y_pred = model(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    last_layer = model.layers[-1]
    assert last_layer.weight.grad is not None, "No gradient in last layer"
    grad_magnitude = np.abs(last_layer.weight.grad.data).mean()
    assert grad_magnitude < 1000, f"Gradient exploded: {grad_magnitude}"


def test_gradient_reasonable_magnitude():
    """Gradients have reasonable magnitude for learning."""
    model = Sequential([
        Linear(10, 20),
        ReLU(),
        Linear(20, 5)
    ])
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(model.parameters(), lr=0.01)

    x = Tensor(np.random.randn(8, 10))
    y_true = Tensor(np.random.randn(8, 5))

    y_pred = model(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    for layer in model.layers:
        if hasattr(layer, 'weight'):
            assert layer.weight.grad is not None, f"No gradient for layer {layer}"
            grad_mag = np.abs(layer.weight.grad.data).mean()
            # Reasonable range for gradients
            assert 1e-6 < grad_mag < 100, f"Gradient magnitude out of range: {grad_mag}"


# ============== Chain Rule Tests ==============

def test_chain_rule_linear_relu():
    """Chain rule works correctly through Linear→ReLU."""
    linear = Linear(5, 3)
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(linear.parameters(), lr=0.01)

    x = Tensor(np.random.randn(2, 5))
    y_true = Tensor(np.random.randn(2, 3))

    # Forward
    z = linear(x)
    y = F.relu(z)
    loss = MSELoss()(y, y_true)

    loss.backward()
    # ReLU should only backprop where input > 0
    assert hasattr(z, 'data'), "z should have data attribute"
    relu_mask = z.data > 0
    # Gradient should exist
    assert linear.weight.grad is not None, "Chain rule broken - no gradient"


def test_chain_rule_multiple_paths():
    """Chain rule handles multiple paths (residual connection)."""
    linear1 = Linear(10, 10)
    linear2 = Linear(10, 10)
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(linear1.parameters() + linear2.parameters(), lr=0.01)

    x = Tensor(np.random.randn(4, 10))
    y_true = Tensor(np.random.randn(4, 10))

    # Forward with residual connection
    z1 = linear1(x)
    z2 = linear2(F.relu(z1))
    y = z1 + z2  # Residual connection

    loss = MSELoss()(y, y_true)

    loss.backward()
    # Both paths should contribute to gradient
    assert linear1.weight.grad is not None, "No gradient through residual path"
    assert linear2.weight.grad is not None, "No gradient through main path"


# ============== Gradient Accumulation Tests ==============

def test_gradient_accumulation():
    """Gradients accumulate correctly over multiple backward passes."""
    model = Linear(5, 3)
    optimizer = SGD(model.parameters(), lr=0.01)

    x1 = Tensor(np.random.randn(2, 5))
    y1 = Tensor(np.random.randn(2, 3))

    x2 = Tensor(np.random.randn(2, 5))
    y2 = Tensor(np.random.randn(2, 3))

    # First backward
    loss1 = MSELoss()(model(x1), y1)
    loss1.backward()

    assert model.weight.grad is not None, "No gradient after first backward"
    grad1 = np.array(model.weight.grad).copy()

    # Second backward (should accumulate)
    loss2 = MSELoss()(model(x2), y2)
    loss2.backward()

    grad2 = np.array(model.weight.grad)
    # Gradient should have changed (accumulated)
    assert not np.allclose(grad1, grad2), "Gradients didn't accumulate"


def test_zero_grad():
    """zero_grad() correctly resets gradients."""
    model = Linear(5, 3)
    optimizer = SGD(model.parameters(), lr=0.01)

    x = Tensor(np.random.randn(2, 5))
    y = Tensor(np.random.randn(2, 3))

    # Accumulate gradient
    loss = MSELoss()(model(x), y)
    loss.backward()

    assert model.weight.grad is not None, "No gradient after backward"

    # Clear gradients
    optimizer.zero_grad()

    # Check gradients are reset (implementation sets to None)
    # Note: Some implementations zero the array, ours sets to None
    assert model.weight.grad is None or np.allclose(model.weight.grad, 0), \
        "Gradients not cleared by zero_grad()"


# ============== Optimizer Update Tests ==============

def test_sgd_updates_parameters():
    """SGD optimizer updates parameters in correct direction."""
    model = Linear(5, 3)
    optimizer = SGD(model.parameters(), lr=0.1)

    # Save initial weights
    initial_weights = np.array(model.weight.data).copy()

    x = Tensor(np.random.randn(4, 5))
    y_true = Tensor(np.random.randn(4, 3))

    # Forward and backward
    y_pred = model(x)
    loss = MSELoss()(y_pred, y_true)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Weights should have changed
    assert not np.allclose(initial_weights, model.weight.data), "Weights didn't update"

    # Check update direction (gradient descent)
    assert model.weight.grad is not None, "No gradient after backward"
    expected_update = initial_weights - 0.1 * np.array(model.weight.grad)
    assert np.allclose(model.weight.data, expected_update, rtol=1e-5), \
        "SGD update incorrect"


def test_adam_updates_parameters():
    """Adam optimizer updates parameters with momentum."""
    model = Linear(5, 3)
    optimizer = Adam(model.parameters(), lr=0.01)

    initial_weights = np.array(model.weight.data).copy()

    x = Tensor(np.random.randn(4, 5))
    y_true = Tensor(np.random.randn(4, 3))

    # Multiple steps to see momentum effect
    for _ in range(3):
        y_pred = model(x)
        loss = MSELoss()(y_pred, y_true)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Weights should have changed
    assert not np.allclose(initial_weights, model.weight.data), \
        "Adam didn't update weights"


# ============== Special Architecture Tests ==============

def test_transformer_gradient_flow():
    """Gradients flow through transformer architecture."""
    block = TransformerBlock(embed_dim=64, num_heads=4)
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(block.parameters(), lr=0.01)

    x = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, embed)
    y_true = Tensor(np.random.randn(2, 10, 64))

    y_pred = block(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    # Check key components have gradients
    params = block.parameters()
    gradients_exist = any(
        p.grad is not None for p in params
        if hasattr(p, 'grad')
    )
    assert gradients_exist, "No gradients in transformer block"


def test_loss_gradient_correctness():
    """Loss functions produce correct gradients."""
    # Simple case where we can verify gradient analytically
    model = Linear(2, 1, bias=False)
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(model.parameters(), lr=0.01)
    model.weight.data = np.array([[1.0], [1.0]])  # Known weights

    x = Tensor(np.array([[1.0, 0.0], [0.0, 1.0]]))
    y_true = Tensor(np.array([[2.0], [3.0]]))

    y_pred = model(x)
    # y_pred should be [[1.0], [1.0]]
    # MSE loss = mean((1-2)^2 + (1-3)^2) = mean(1 + 4) = 2.5
    # Gradient w.r.t. predictions: [[-1], [-2]]

    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    assert model.weight.grad is not None, "No gradient from loss"


# ============== Common Issues Detection ==============

def test_dead_relu_detection():
    """Detect dead ReLU problem (all gradients blocked)."""
    model = Sequential([
        Linear(10, 20),
        ReLU(),
        Linear(20, 5)
    ])
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(model.parameters(), lr=0.01)

    # Set very negative bias to kill ReLU
    first_layer = model.layers[0]
    if hasattr(first_layer, 'bias'):
        first_layer.bias.data = np.ones(20) * -10

    x = Tensor(np.random.randn(4, 10) * 0.1)  # Small inputs
    y_true = Tensor(np.random.randn(4, 5))

    y_pred = model(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()
    # With dead ReLUs, gradients might be very small or zero
    assert first_layer.weight.grad is not None, "No gradient for first layer"
    grad_mag = np.abs(first_layer.weight.grad.data).mean()
    # Dead ReLU should still produce a gradient (even if small)
    # This test validates gradient flow, not dead ReLU detection
    assert grad_mag >= 0, "Invalid gradient magnitude"


def test_gradient_clipping():
    """Test gradient clipping prevents explosion."""
    model = Linear(10, 10)
    # Create optimizer to enable requires_grad on layer parameters
    optimizer = SGD(model.parameters(), lr=0.01)

    # Create artificially large gradient scenario
    x = Tensor(np.random.randn(2, 10) * 100)
    y_true = Tensor(np.random.randn(2, 10) * 100)

    y_pred = model(x)
    loss = MSELoss()(y_pred, y_true)

    loss.backward()

    # Clip gradients
    max_norm = 1.0
    for param in model.parameters():
        assert hasattr(param, 'grad'), "Parameter missing grad attribute"
        assert param.grad is not None, "Parameter has no gradient"
        grad_norm = np.linalg.norm(param.grad)
        if grad_norm > max_norm:
            param.grad = param.grad * (max_norm / grad_norm)

        # Verify clipping worked
        new_norm = np.linalg.norm(param.grad)
        assert new_norm <= max_norm * 1.01, "Gradient clipping failed"


if __name__ == "__main__":
    # When run directly, use pytest
    import subprocess
    result = subprocess.run(["pytest", __file__, "-v"], capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print(result.stderr)
    sys.exit(result.returncode)