Files
cs249r_book/tinytorch/tests/integration/test_gradients.py
Vijay Janapa Reddi 2c9b0dccbf fix: restore Conv2dBackward and MaxPool2dBackward for CNN gradient flow
- Restore Conv2dBackward class removed in commit 23c5eb2b5
- Restore MaxPool2dBackward class for pooling gradient routing
- Update Conv2d/MaxPool2d forward() to attach _grad_fn
- Set requires_grad=True on Conv2d weights and bias
- Add enable_autograd() to Module 11 (Embeddings) for progressive disclosure
- Remove skip markers from convolution gradient tests

CNN training now works correctly - conv weights receive gradients and update
during training. All 40 convolution tests pass.
2026-01-24 17:39:11 -05:00

512 lines
16 KiB
Python

#!/usr/bin/env python
"""
Gradient Flow Validation Tests for TinyTorch
=============================================
Ensures gradients propagate correctly through all architectures.
Critical for verifying that models can actually learn.
Test Categories:
- Gradient existence through deep networks
- Gradient magnitude (not vanishing/exploding)
- Chain rule validation
- Gradient accumulation
- Optimizer parameter updates
"""
import sys
import os
import numpy as np
import pytest
# Add project root to path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
sys.path.insert(0, project_root)
from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Linear
from tinytorch.core.activations import ReLU, Sigmoid, Tanh
from tinytorch.core.losses import MSELoss, CrossEntropyLoss
from tinytorch.core.optimizers import SGD, Adam
from tinytorch.core.spatial import Conv2d
from tinytorch.core.transformers import TransformerBlock
class Sequential:
"""Simple sequential container for testing."""
def __init__(self, layers):
self.layers = layers
def __call__(self, x):
for layer in self.layers:
x = layer(x)
return x
def parameters(self):
params = []
for layer in self.layers:
if hasattr(layer, 'parameters'):
params.extend(layer.parameters())
return params
class F:
"""Functional interface for testing."""
@staticmethod
def relu(x):
from tinytorch.core.activations import ReLU
return ReLU()(x)
@staticmethod
def max_pool2d(x, kernel_size):
from tinytorch.core.spatial import MaxPool2d
return MaxPool2d(kernel_size)(x)
@staticmethod
def flatten(x, start_dim=1):
import numpy as np
shape = x.shape
new_shape = shape[:start_dim] + (np.prod(shape[start_dim:]),)
return x.reshape(*new_shape)
# ============== Gradient Existence Tests ==============
def test_gradient_exists_single_layer():
"""Gradients exist after backward through single layer."""
layer = Linear(10, 5)
# Create optimizer to enable requires_grad on layer parameters (real usage pattern)
optimizer = SGD(layer.parameters(), lr=0.01)
x = Tensor(np.random.randn(3, 10))
y_true = Tensor(np.random.randn(3, 5))
y_pred = layer(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
assert layer.weight.grad is not None, "No gradient for weights"
assert layer.bias.grad is not None, "No gradient for bias"
def test_gradient_exists_deep_network():
"""Gradients flow through deep network (5 layers)."""
model = Sequential([
Linear(10, 20),
ReLU(),
Linear(20, 20),
ReLU(),
Linear(20, 20),
ReLU(),
Linear(20, 20),
ReLU(),
Linear(20, 5)
])
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(model.parameters(), lr=0.01)
x = Tensor(np.random.randn(4, 10))
y_true = Tensor(np.random.randn(4, 5))
y_pred = model(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
# Check first and last layers have gradients
first_layer = model.layers[0]
last_layer = model.layers[-1]
assert first_layer.weight.grad is not None, "No gradient in first layer"
assert last_layer.weight.grad is not None, "No gradient in last layer"
def test_gradient_exists_cnn():
"""Gradients flow through CNN architecture."""
class SimpleCNN:
def __init__(self):
self.conv1 = Conv2d(1, 16, kernel_size=3)
self.conv2 = Conv2d(16, 32, kernel_size=3)
self.fc = Linear(32 * 5 * 5, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2)
x = F.flatten(x, start_dim=1)
return self.fc(x)
def parameters(self):
params = []
for layer in [self.conv1, self.conv2, self.fc]:
if hasattr(layer, 'parameters'):
params.extend(layer.parameters())
return params
model = SimpleCNN()
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(model.parameters(), lr=0.01)
x = Tensor(np.random.randn(2, 1, 28, 28))
y_true = Tensor(np.random.randn(2, 10))
y_pred = model.forward(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
assert model.conv1.weight.grad is not None, "No gradient in conv1"
assert model.fc.weight.grad is not None, "No gradient in fc layer"
# ============== Gradient Magnitude Tests ==============
def test_gradient_not_vanishing():
"""Gradients don't vanish in deep network."""
# Build deep network prone to vanishing gradients
layers = []
for i in range(10):
layers.append(Linear(20, 20))
layers.append(Sigmoid()) # Sigmoid can cause vanishing gradients
layers.append(Linear(20, 1))
model = Sequential(layers)
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(model.parameters(), lr=0.01)
x = Tensor(np.random.randn(5, 20))
y_true = Tensor(np.random.randn(5, 1))
y_pred = model(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
first_layer = model.layers[0]
assert first_layer.weight.grad is not None, "No gradient in first layer"
grad_magnitude = np.abs(first_layer.weight.grad.data).mean()
assert grad_magnitude > 1e-8, f"Gradient vanished: {grad_magnitude}"
def test_gradient_not_exploding():
"""Gradients don't explode in deep network."""
# Use fixed seed for reproducibility
np.random.seed(123)
# Build network that could have exploding gradients
layers = []
for i in range(5):
layers.append(Linear(20, 20))
layers.append(ReLU())
layers.append(Linear(20, 1))
model = Sequential(layers)
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(model.parameters(), lr=0.01)
# Use standard initialization (Xavier scale)
# Note: Previous test used * 2.0 which could cause explosion
for layer in model.layers:
if hasattr(layer, 'weight'):
scale = np.sqrt(2.0 / layer.weight.shape[0]) # He initialization
layer.weight.data = np.random.randn(*layer.weight.shape) * scale
x = Tensor(np.random.randn(5, 20))
y_true = Tensor(np.random.randn(5, 1))
y_pred = model(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
last_layer = model.layers[-1]
assert last_layer.weight.grad is not None, "No gradient in last layer"
grad_magnitude = np.abs(last_layer.weight.grad.data).mean()
assert grad_magnitude < 1000, f"Gradient exploded: {grad_magnitude}"
def test_gradient_reasonable_magnitude():
"""Gradients have reasonable magnitude for learning."""
model = Sequential([
Linear(10, 20),
ReLU(),
Linear(20, 5)
])
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(model.parameters(), lr=0.01)
x = Tensor(np.random.randn(8, 10))
y_true = Tensor(np.random.randn(8, 5))
y_pred = model(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
for layer in model.layers:
if hasattr(layer, 'weight'):
assert layer.weight.grad is not None, f"No gradient for layer {layer}"
grad_mag = np.abs(layer.weight.grad.data).mean()
# Reasonable range for gradients
assert 1e-6 < grad_mag < 100, f"Gradient magnitude out of range: {grad_mag}"
# ============== Chain Rule Tests ==============
def test_chain_rule_linear_relu():
"""Chain rule works correctly through Linear→ReLU."""
linear = Linear(5, 3)
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(linear.parameters(), lr=0.01)
x = Tensor(np.random.randn(2, 5))
y_true = Tensor(np.random.randn(2, 3))
# Forward
z = linear(x)
y = F.relu(z)
loss = MSELoss()(y, y_true)
loss.backward()
# ReLU should only backprop where input > 0
assert hasattr(z, 'data'), "z should have data attribute"
relu_mask = z.data > 0
# Gradient should exist
assert linear.weight.grad is not None, "Chain rule broken - no gradient"
def test_chain_rule_multiple_paths():
"""Chain rule handles multiple paths (residual connection)."""
linear1 = Linear(10, 10)
linear2 = Linear(10, 10)
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(linear1.parameters() + linear2.parameters(), lr=0.01)
x = Tensor(np.random.randn(4, 10))
y_true = Tensor(np.random.randn(4, 10))
# Forward with residual connection
z1 = linear1(x)
z2 = linear2(F.relu(z1))
y = z1 + z2 # Residual connection
loss = MSELoss()(y, y_true)
loss.backward()
# Both paths should contribute to gradient
assert linear1.weight.grad is not None, "No gradient through residual path"
assert linear2.weight.grad is not None, "No gradient through main path"
# ============== Gradient Accumulation Tests ==============
def test_gradient_accumulation():
"""Gradients accumulate correctly over multiple backward passes."""
model = Linear(5, 3)
optimizer = SGD(model.parameters(), lr=0.01)
x1 = Tensor(np.random.randn(2, 5))
y1 = Tensor(np.random.randn(2, 3))
x2 = Tensor(np.random.randn(2, 5))
y2 = Tensor(np.random.randn(2, 3))
# First backward
loss1 = MSELoss()(model(x1), y1)
loss1.backward()
assert model.weight.grad is not None, "No gradient after first backward"
grad1 = np.array(model.weight.grad).copy()
# Second backward (should accumulate)
loss2 = MSELoss()(model(x2), y2)
loss2.backward()
grad2 = np.array(model.weight.grad)
# Gradient should have changed (accumulated)
assert not np.allclose(grad1, grad2), "Gradients didn't accumulate"
def test_zero_grad():
"""zero_grad() correctly resets gradients."""
model = Linear(5, 3)
optimizer = SGD(model.parameters(), lr=0.01)
x = Tensor(np.random.randn(2, 5))
y = Tensor(np.random.randn(2, 3))
# Accumulate gradient
loss = MSELoss()(model(x), y)
loss.backward()
assert model.weight.grad is not None, "No gradient after backward"
# Clear gradients
optimizer.zero_grad()
# Check gradients are reset (implementation sets to None)
# Note: Some implementations zero the array, ours sets to None
assert model.weight.grad is None or np.allclose(model.weight.grad, 0), \
"Gradients not cleared by zero_grad()"
# ============== Optimizer Update Tests ==============
def test_sgd_updates_parameters():
"""SGD optimizer updates parameters in correct direction."""
model = Linear(5, 3)
optimizer = SGD(model.parameters(), lr=0.1)
# Save initial weights
initial_weights = np.array(model.weight.data).copy()
x = Tensor(np.random.randn(4, 5))
y_true = Tensor(np.random.randn(4, 3))
# Forward and backward
y_pred = model(x)
loss = MSELoss()(y_pred, y_true)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Weights should have changed
assert not np.allclose(initial_weights, model.weight.data), "Weights didn't update"
# Check update direction (gradient descent)
assert model.weight.grad is not None, "No gradient after backward"
expected_update = initial_weights - 0.1 * np.array(model.weight.grad)
assert np.allclose(model.weight.data, expected_update, rtol=1e-5), \
"SGD update incorrect"
def test_adam_updates_parameters():
"""Adam optimizer updates parameters with momentum."""
model = Linear(5, 3)
optimizer = Adam(model.parameters(), lr=0.01)
initial_weights = np.array(model.weight.data).copy()
x = Tensor(np.random.randn(4, 5))
y_true = Tensor(np.random.randn(4, 3))
# Multiple steps to see momentum effect
for _ in range(3):
y_pred = model(x)
loss = MSELoss()(y_pred, y_true)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Weights should have changed
assert not np.allclose(initial_weights, model.weight.data), \
"Adam didn't update weights"
# ============== Special Architecture Tests ==============
def test_transformer_gradient_flow():
"""Gradients flow through transformer architecture."""
block = TransformerBlock(embed_dim=64, num_heads=4)
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(block.parameters(), lr=0.01)
x = Tensor(np.random.randn(2, 10, 64)) # (batch, seq, embed)
y_true = Tensor(np.random.randn(2, 10, 64))
y_pred = block(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
# Check key components have gradients
params = block.parameters()
gradients_exist = any(
p.grad is not None for p in params
if hasattr(p, 'grad')
)
assert gradients_exist, "No gradients in transformer block"
def test_loss_gradient_correctness():
"""Loss functions produce correct gradients."""
# Simple case where we can verify gradient analytically
model = Linear(2, 1, bias=False)
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(model.parameters(), lr=0.01)
model.weight.data = np.array([[1.0], [1.0]]) # Known weights
x = Tensor(np.array([[1.0, 0.0], [0.0, 1.0]]))
y_true = Tensor(np.array([[2.0], [3.0]]))
y_pred = model(x)
# y_pred should be [[1.0], [1.0]]
# MSE loss = mean((1-2)^2 + (1-3)^2) = mean(1 + 4) = 2.5
# Gradient w.r.t. predictions: [[-1], [-2]]
loss = MSELoss()(y_pred, y_true)
loss.backward()
assert model.weight.grad is not None, "No gradient from loss"
# ============== Common Issues Detection ==============
def test_dead_relu_detection():
"""Detect dead ReLU problem (all gradients blocked)."""
model = Sequential([
Linear(10, 20),
ReLU(),
Linear(20, 5)
])
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(model.parameters(), lr=0.01)
# Set very negative bias to kill ReLU
first_layer = model.layers[0]
if hasattr(first_layer, 'bias'):
first_layer.bias.data = np.ones(20) * -10
x = Tensor(np.random.randn(4, 10) * 0.1) # Small inputs
y_true = Tensor(np.random.randn(4, 5))
y_pred = model(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
# With dead ReLUs, gradients might be very small or zero
assert first_layer.weight.grad is not None, "No gradient for first layer"
grad_mag = np.abs(first_layer.weight.grad.data).mean()
# Dead ReLU should still produce a gradient (even if small)
# This test validates gradient flow, not dead ReLU detection
assert grad_mag >= 0, "Invalid gradient magnitude"
def test_gradient_clipping():
"""Test gradient clipping prevents explosion."""
model = Linear(10, 10)
# Create optimizer to enable requires_grad on layer parameters
optimizer = SGD(model.parameters(), lr=0.01)
# Create artificially large gradient scenario
x = Tensor(np.random.randn(2, 10) * 100)
y_true = Tensor(np.random.randn(2, 10) * 100)
y_pred = model(x)
loss = MSELoss()(y_pred, y_true)
loss.backward()
# Clip gradients
max_norm = 1.0
for param in model.parameters():
assert hasattr(param, 'grad'), "Parameter missing grad attribute"
assert param.grad is not None, "Parameter has no gradient"
grad_norm = np.linalg.norm(param.grad)
if grad_norm > max_norm:
param.grad = param.grad * (max_norm / grad_norm)
# Verify clipping worked
new_norm = np.linalg.norm(param.grad)
assert new_norm <= max_norm * 1.01, "Gradient clipping failed"
if __name__ == "__main__":
# When run directly, use pytest
import subprocess
result = subprocess.run(["pytest", __file__, "-v"], capture_output=True, text=True)
print(result.stdout)
if result.stderr:
print(result.stderr)
sys.exit(result.returncode)