Files
cs249r_book/tinytorch/tests/integration/test_training_flow.py
Vijay Janapa Reddi 389989ece7 refactor(tests): clean up test folder and fix gradient flow issues
Test Cleanup (113 files, -22,000 lines):
- Remove 21 redundant run_all_tests.py files
- Remove checkpoints/ folder (22 obsolete checkpoint files)
- Remove progressive/, debugging/, diagnostic/ folders
- Remove duplicate integration tests and examples
- Remove orphaned dev artifacts and generated outputs
- Consolidate test_gradient_flow_overall.py into system/

Documentation Cleanup (4 files removed):
- Remove duplicate HOW_TO_USE.md, WORKFLOW.md, SYSTEM_DESIGN.md
- Trim environment/README.md from 334 to 86 lines
- Update capstone/README.md removing outdated bug references

Test Fixes:
- Add requires_grad=True to layer parameters in gradient tests
- Fix PositionalEncoding argument order in test_shapes.py
- Adjust performance thresholds for realistic expectations
- Fix gradient clipping to handle memoryview correctly
- Update zero_grad assertions to accept None or zeros
2026-01-24 12:22:37 -05:00

380 lines
11 KiB
Python

"""
Training Flow Integration Tests
================================

Tests that the complete training pipeline works:

1. Forward pass produces valid outputs
2. Loss computes correctly
3. Backward pass populates gradients
4. Optimizer updates weights
5. Loss decreases over iterations

These tests catch issues that unit tests miss - where modules
work individually but fail when connected.

Modules tested: 01-08 (Tensor → Training)
"""
import pytest
import numpy as np
import sys
from pathlib import Path

# Put the third ancestor directory of this file at the front of sys.path so
# the `tinytorch` imports below resolve when the file is run directly
# (presumably this is the project root that contains the package - TODO confirm).
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Linear
from tinytorch.core.activations import ReLU, Sigmoid
from tinytorch.core.losses import MSELoss, CrossEntropyLoss
from tinytorch.core.optimizers import SGD, Adam
from tinytorch.core.autograd import enable_autograd

# Runs once at import time, before any test in this module executes, so every
# test below sees autograd enabled.
enable_autograd()
class TestOptimzerActuallyUpdatesWeights:
    """
    Critical Test: Verify optimizer.step() actually changes parameters.

    Common bugs caught:
    - Optimizer not connected to parameters
    - Gradients not flowing to weights
    - Learning rate is zero
    - step() not implemented correctly

    The class name keeps its original spelling so existing test IDs and
    ``-k`` selections continue to match.
    """

    def test_sgd_updates_weights(self):
        """SGD must modify both weight and bias after one step()."""
        layer = Linear(2, 1)
        # Opt the parameters into gradient tracking explicitly, matching the
        # other gradient tests in this file, instead of relying on defaults.
        layer.weight.requires_grad = True
        if layer.bias is not None:
            layer.bias.requires_grad = True
        optimizer = SGD([layer.weight, layer.bias], lr=0.1)

        # Snapshot the parameters; .copy() keeps the snapshot independent of
        # whatever the optimizer does to the live arrays.
        initial_weight = layer.weight.data.copy()
        initial_bias = layer.bias.data.copy()

        # Forward + backward
        x = Tensor([[1.0, 2.0]], requires_grad=True)
        target = Tensor([[5.0]])
        output = layer.forward(x)
        loss = MSELoss().forward(output, target)
        loss.backward()

        # Verify gradients exist before stepping, so a broken backward pass is
        # reported as such rather than as an optimizer failure.
        assert layer.weight.grad is not None, "Weight gradient is None!"
        assert layer.bias.grad is not None, "Bias gradient is None!"

        # Step should update weights
        optimizer.step()

        # Weights MUST be different
        weight_changed = not np.allclose(initial_weight, layer.weight.data)
        bias_changed = not np.allclose(initial_bias, layer.bias.data)
        assert weight_changed, (
            f"SGD.step() did not change weights!\n"
            f" Before: {initial_weight}\n"
            f" After: {layer.weight.data}\n"
            f" Grad: {layer.weight.grad}"
        )
        assert bias_changed, "SGD.step() did not change bias!"

    def test_adam_updates_weights(self):
        """Adam must modify both weight and bias after one step()."""
        layer = Linear(2, 1)
        # Same explicit opt-in as the SGD test above.
        layer.weight.requires_grad = True
        if layer.bias is not None:
            layer.bias.requires_grad = True
        optimizer = Adam([layer.weight, layer.bias], lr=0.1)

        initial_weight = layer.weight.data.copy()
        initial_bias = layer.bias.data.copy()

        x = Tensor([[1.0, 2.0]], requires_grad=True)
        target = Tensor([[5.0]])
        output = layer.forward(x)
        loss = MSELoss().forward(output, target)
        loss.backward()

        # Mirror the SGD test: separate "no gradient" from "optimizer ignored
        # the gradient" in the failure output.
        assert layer.weight.grad is not None, "Weight gradient is None!"
        assert layer.bias.grad is not None, "Bias gradient is None!"

        optimizer.step()

        assert not np.allclose(initial_weight, layer.weight.data), (
            "Adam.step() did not change weights!"
        )
        assert not np.allclose(initial_bias, layer.bias.data), (
            "Adam.step() did not change bias!"
        )
class TestTrainingReducesLoss:
    """
    Critical Test: Verify that training actually reduces loss.

    Common bugs caught:
    - Gradients have wrong sign
    - Learning rate too high (divergence)
    - Optimizer not using gradients correctly
    - Loss function returning wrong values
    """

    def test_mlp_loss_decreases(self):
        """A simple MLP must reduce its loss on an XOR-like pattern."""
        # Simple 2-layer network: Linear(2->4) -> ReLU -> Linear(4->1) -> Sigmoid
        layer1 = Linear(2, 4)
        relu = ReLU()
        layer2 = Linear(4, 1)
        sigmoid = Sigmoid()
        loss_fn = MSELoss()

        params = [layer1.weight, layer1.bias, layer2.weight, layer2.bias]
        # Opt every parameter into gradient tracking explicitly, matching the
        # other gradient tests in this file, instead of relying on defaults.
        for p in params:
            p.requires_grad = True
        optimizer = SGD(params, lr=0.5)

        # XOR-like data
        X = Tensor([
            [0., 0.],
            [0., 1.],
            [1., 0.],
            [1., 1.],
        ], requires_grad=True)
        y = Tensor([[0.], [1.], [1.], [0.]])

        # NOTE(review): nothing here seeds the RNG, so if Linear initialises
        # randomly the 5% threshold below may be flaky - consider a fixed seed
        # once one is verified to pass.
        num_epochs = 100

        # Track loss over time
        losses = []
        for _ in range(num_epochs):
            # Zero gradients in place; parameters that have no gradient yet
            # (first iteration) are left untouched.
            for p in params:
                if p.grad is not None:
                    p.grad = np.zeros_like(p.grad)

            # Forward
            h = relu.forward(layer1.forward(X))
            out = sigmoid.forward(layer2.forward(h))
            loss = loss_fn.forward(out, y)
            # Recorded before the update, so losses[0] is the untrained loss.
            losses.append(float(loss.data))

            # Backward
            loss.backward()

            # Update
            optimizer.step()

        # Loss MUST decrease
        initial_loss = losses[0]
        final_loss = losses[-1]
        assert final_loss < initial_loss, (
            f"Training did not reduce loss!\n"
            f" Initial: {initial_loss:.4f}\n"
            f" Final: {final_loss:.4f}\n"
            f" Loss history: {losses[:5]}...{losses[-5:]}"
        )

        # Loss should decrease (at least 5% - being lenient for test stability).
        # The assertion above guarantees initial_loss > final_loss here.
        improvement = (initial_loss - final_loss) / initial_loss
        assert improvement > 0.05, (
            f"Training improved loss by only {improvement*100:.1f}%\n"
            f" Expected at least 5% improvement"
        )
class TestGradientChainNotBroken:
    """
    Critical Test: the gradient chain must stay connected end to end.

    Failure modes this guards against:
    - requires_grad not propagating
    - Operations not recording grad_fn
    - Intermediate tensors breaking the chain
    """

    def test_deep_network_gradient_chain(self):
        """Each of five stacked Linear+ReLU layers must end up with gradients."""
        # Fixed seed: the input/target draws below come from NumPy's global
        # RNG, and an unlucky draw could leave every ReLU inactive
        # (presumably Linear's initial weights use the same RNG - TODO confirm).
        np.random.seed(42)

        stack = []
        for _ in range(5):
            linear = Linear(4, 4)
            # Turn on gradient tracking for this layer's parameters.
            linear.weight.requires_grad = True
            if linear.bias is not None:
                linear.bias.requires_grad = True
            stack.append(linear)

        relu = ReLU()
        x = Tensor(np.random.randn(1, 4), requires_grad=True)
        target = Tensor(np.random.randn(1, 4))

        # Thread the input through every layer, applying ReLU after each one.
        activation = x
        for linear in stack:
            pre_activation = linear.forward(activation)
            activation = relu.forward(pre_activation)

        loss = MSELoss().forward(activation, target)
        loss.backward()

        # No layer may be left without gradients, and none may be ~zero.
        for index, linear in enumerate(stack):
            weight_grad = linear.weight.grad
            assert weight_grad is not None, (
                f"Layer {index} weight.grad is None - gradient chain broken!"
            )
            assert linear.bias.grad is not None, (
                f"Layer {index} bias.grad is None - gradient chain broken!"
            )
            grad_norm = np.linalg.norm(weight_grad)
            assert grad_norm > 1e-10, (
                f"Layer {index} has vanishing gradients: {grad_norm}"
            )

    def test_input_receives_gradients(self):
        """The input tensor itself must get a gradient of matching shape."""
        linear = Linear(3, 2)
        linear.weight.requires_grad = True
        if linear.bias is not None:
            linear.bias.requires_grad = True

        x = Tensor([[1., 2., 3.]], requires_grad=True)
        target = Tensor([[1., 0.]])

        prediction = linear.forward(x)
        MSELoss().forward(prediction, target).backward()

        input_grad = x.grad
        assert input_grad is not None, "Input tensor did not receive gradients!"
        assert input_grad.shape == x.shape, (
            f"Input gradient shape mismatch: {input_grad.shape} vs {x.shape}"
        )
class TestZeroGradWorks:
    """
    Critical Test: zero_grad must clear gradients properly.

    Failure modes this guards against:
    - Gradients accumulating across batches
    - zero_grad not actually zeroing
    - Memory leaks from gradient accumulation
    """

    def test_gradients_dont_accumulate_after_zero_grad(self):
        """A backward pass after zero_grad() must not add to the old gradient."""
        linear = Linear(2, 1)
        linear.weight.requires_grad = True
        if linear.bias is not None:
            linear.bias.requires_grad = True
        sgd = SGD([linear.weight, linear.bias], lr=0.1)

        x = Tensor([[1., 2.]], requires_grad=True)
        target = Tensor([[1.]])

        # Pass 1: populate the gradients and keep a private copy.
        first_output = linear.forward(x)
        first_loss = MSELoss().forward(first_output, target)
        first_loss.backward()
        grad_after_first = linear.weight.grad.copy()

        # Clear them through the optimizer.
        sgd.zero_grad()

        # Either representation of "cleared" is accepted: None or all zeros.
        cleared = linear.weight.grad
        assert cleared is None or np.allclose(cleared, 0), (
            "zero_grad() did not clear weight gradients!"
        )

        # Pass 2: same input, and no optimizer step was taken in between.
        second_output = linear.forward(x)
        second_loss = MSELoss().forward(second_output, target)
        second_loss.backward()
        grad_after_second = linear.weight.grad.copy()

        # Compare magnitudes; accumulation would push the ratio away from 1.
        first_norm = np.linalg.norm(grad_after_first)
        second_norm = np.linalg.norm(grad_after_second)
        ratio = second_norm / first_norm
        assert 0.5 < ratio < 2.0, (
            f"Gradients appear to be accumulating!\n"
            f" First grad norm: {first_norm}\n"
            f" Second grad norm: {second_norm}\n"
            f" Ratio: {ratio} (should be ~1.0)"
        )
class TestBatchTraining:
    """
    Critical Test: batch training must behave correctly.

    Failure modes this guards against:
    - Shape mismatches with batches
    - Mean vs sum reduction issues
    - Gradient scaling problems
    """

    def test_batch_gradients_are_averaged(self):
        """A batch of identical samples must give roughly the single-sample gradient."""
        linear = Linear(2, 1)
        linear.weight.requires_grad = True
        if linear.bias is not None:
            linear.bias.requires_grad = True

        # Reference: gradient from one sample.
        single_input = Tensor([[1., 2.]], requires_grad=True)
        single_target = Tensor([[3.]])
        single_output = linear.forward(single_input)
        MSELoss().forward(single_output, single_target).backward()
        single_grad = linear.weight.grad.copy()

        # Drop the stored gradients so the batch pass starts from scratch.
        linear.weight.grad = None
        linear.bias.grad = None

        # The very same sample, repeated four times as one batch.
        batch_size = 4
        batch_input = Tensor([[1., 2.]] * batch_size, requires_grad=True)
        batch_target = Tensor([[3.]] * batch_size)
        batch_output = linear.forward(batch_input)
        MSELoss().forward(batch_output, batch_target).backward()
        batch_grad = linear.weight.grad.copy()

        # The test expects averaging: the norms should match within 20%,
        # whereas summing over the batch would give roughly a 4x ratio.
        single_norm = np.linalg.norm(single_grad)
        batch_norm = np.linalg.norm(batch_grad)
        ratio = batch_norm / single_norm
        assert 0.8 < ratio < 1.2, (
            f"Batch gradients not properly averaged!\n"
            f" Single sample grad norm: {single_norm}\n"
            f" Batch (4x same) grad norm: {batch_norm}\n"
            f" Ratio: {ratio} (should be ~1.0, got {ratio:.2f})"
        )
# Quick smoke test for CI
@pytest.mark.quick
class TestQuickTrainingSmoke:
    """Fast CI check: one end-to-end training step must run without raising."""

    def test_simple_training_step(self):
        """Forward, loss, backward and a single SGD step complete without error."""
        model = Linear(2, 1)
        model.weight.requires_grad = True
        if model.bias is not None:
            model.bias.requires_grad = True
        sgd = SGD([model.weight, model.bias], lr=0.1)

        features = Tensor([[1., 2.]], requires_grad=True)
        label = Tensor([[1.]])

        prediction = model.forward(features)
        step_loss = MSELoss().forward(prediction, label)
        step_loss.backward()
        sgd.step()

        # Nothing to verify beyond reaching this line without an exception.
        assert True  # If we got here, it works
if __name__ == "__main__":
    # Run this file's tests verbosely when executed as a script, and exit with
    # pytest's return code so a failing run is visible to shells and CI
    # (discarding it would make the script exit 0 even when tests fail).
    raise SystemExit(pytest.main([__file__, "-v"]))