mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-30 23:47:32 -05:00
516 lines
19 KiB
Python
516 lines
19 KiB
Python
"""
|
|
Module 07 Training - Critical Integration Tests Template
|
|
|
|
This file contains the TOP 3 CRITICAL tests that MUST be implemented immediately
|
|
to establish basic confidence that Module 07 (Training) works correctly.
|
|
|
|
These tests catch the most common and severe bugs in training systems.
|
|
|
|
PRIORITY: P0 - IMPLEMENT IMMEDIATELY
|
|
ESTIMATED TIME: 2-3 hours
|
|
BUG-CATCHING VALUE: CRITICAL
|
|
"""
|
|
|
|
import pytest
|
|
import numpy as np
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|
|
|
# Import from TinyTorch
|
|
from tinytorch.core.tensor import Tensor
|
|
from tinytorch.core.layers import Linear
|
|
from tinytorch.core.activations import ReLU
|
|
from tinytorch.core.losses import MSELoss, CrossEntropyLoss
|
|
from tinytorch.core.optimizers import SGD, AdamW
|
|
from tinytorch.core.training import Trainer, CosineSchedule, clip_grad_norm
|
|
|
|
|
|
# =============================================================================
|
|
# CRITICAL TEST 1: Missing zero_grad() Detection
|
|
# =============================================================================
|
|
# BUG-CATCHING VALUE: CRITICAL
|
|
# COMMON STUDENT MISTAKE: Forgetting optimizer.zero_grad()
|
|
# SYMPTOM: Training appears to run but gradients accumulate incorrectly
|
|
# =============================================================================
|
|
|
|
class TestMissingZeroGrad:
|
|
"""Test that missing zero_grad() is caught and causes visible failure."""
|
|
|
|
def test_zero_grad_required_for_correct_training(self):
|
|
"""
|
|
Test that zero_grad() is essential for correct gradient computation.
|
|
|
|
This test validates that:
|
|
1. Without zero_grad(), gradients accumulate across batches
|
|
2. Accumulated gradients cause incorrect parameter updates
|
|
3. Training with accumulated gradients behaves differently than correct training
|
|
"""
|
|
# Create simple linear model: y = Wx + b
|
|
layer_correct = Linear(1, 1)
|
|
layer_broken = Linear(1, 1)
|
|
|
|
# Make weights identical to start
|
|
layer_broken.weights.data = layer_correct.weights.data.copy()
|
|
if hasattr(layer_correct, 'bias') and layer_correct.bias is not None:
|
|
layer_broken.bias.data = layer_correct.bias.data.copy()
|
|
|
|
# Create optimizers
|
|
optimizer_correct = SGD(layer_correct.parameters(), lr=0.1)
|
|
optimizer_broken = SGD(layer_broken.parameters(), lr=0.1)
|
|
|
|
loss_fn = MSELoss()
|
|
|
|
# Training data: 5 identical samples
|
|
x_data = Tensor([[1.0]])
|
|
y_data = Tensor([[2.0]])
|
|
|
|
# === CORRECT TRAINING (with zero_grad) ===
|
|
correct_grad_norms = []
|
|
for step in range(5):
|
|
optimizer_correct.zero_grad() # ✅ CRITICAL: Clear gradients
|
|
|
|
output = layer_correct.forward(x_data)
|
|
loss = loss_fn.forward(output, y_data)
|
|
loss.backward()
|
|
|
|
# Record gradient norm
|
|
grad_norm = np.linalg.norm(layer_correct.weights.grad.data)
|
|
correct_grad_norms.append(grad_norm)
|
|
|
|
optimizer_correct.step()
|
|
|
|
# === BROKEN TRAINING (without zero_grad) ===
|
|
broken_grad_norms = []
|
|
for step in range(5):
|
|
# ❌ BUG: Missing optimizer_broken.zero_grad()
|
|
|
|
output = layer_broken.forward(x_data)
|
|
loss = loss_fn.forward(output, y_data)
|
|
loss.backward()
|
|
|
|
# Record gradient norm (should accumulate!)
|
|
grad_norm = np.linalg.norm(layer_broken.weights.grad.data)
|
|
broken_grad_norms.append(grad_norm)
|
|
|
|
optimizer_broken.step()
|
|
|
|
# === VALIDATION ===
|
|
print("\n🔬 Testing zero_grad() requirement:")
|
|
print(f"Correct gradient norms (with zero_grad): {correct_grad_norms}")
|
|
print(f"Broken gradient norms (without zero_grad): {broken_grad_norms}")
|
|
|
|
# Test 1: Gradients should accumulate without zero_grad()
|
|
assert broken_grad_norms[-1] > broken_grad_norms[0] * 2.0, \
|
|
"Gradients should accumulate when zero_grad() is missing"
|
|
|
|
# Test 2: Correct gradients should be relatively stable
|
|
correct_variation = max(correct_grad_norms) / (min(correct_grad_norms) + 1e-8)
|
|
assert correct_variation < 5.0, \
|
|
"Correct gradients shouldn't grow excessively"
|
|
|
|
# Test 3: Broken gradients grow much larger than correct ones
|
|
assert broken_grad_norms[-1] > correct_grad_norms[-1] * 2.0, \
|
|
"Missing zero_grad() should cause noticeably larger gradients"
|
|
|
|
print("✅ zero_grad() requirement correctly enforced!")
|
|
|
|
def test_trainer_calls_zero_grad(self):
|
|
"""
|
|
Test that Trainer class properly calls zero_grad() during training.
|
|
|
|
This validates the Trainer implementation includes the critical zero_grad() call.
|
|
"""
|
|
# Create simple model
|
|
class SimpleModel:
|
|
def __init__(self):
|
|
self.layer = Linear(2, 1)
|
|
self.training = True
|
|
|
|
def forward(self, x):
|
|
return self.layer.forward(x)
|
|
|
|
def parameters(self):
|
|
return self.layer.parameters()
|
|
|
|
model = SimpleModel()
|
|
optimizer = SGD(model.parameters(), lr=0.01)
|
|
loss_fn = MSELoss()
|
|
trainer = Trainer(model, optimizer, loss_fn)
|
|
|
|
# Create simple dataset
|
|
class SimpleDataset:
|
|
def __iter__(self):
|
|
for _ in range(3):
|
|
x = Tensor(np.random.randn(2, 2))
|
|
y = Tensor(np.random.randn(2, 1))
|
|
yield x, y
|
|
|
|
# Train for 2 epochs
|
|
for epoch in range(2):
|
|
trainer.train_epoch(SimpleDataset())
|
|
|
|
# After training, gradients should be zeroed (from last zero_grad() call)
|
|
# OR they should exist from last backward (depends on implementation)
|
|
# Key test: Training should have called zero_grad() internally
|
|
# (This is validated by training not diverging)
|
|
|
|
print("✅ Trainer correctly manages gradient clearing!")
|
|
|
|
|
|
# =============================================================================
|
|
# CRITICAL TEST 2: Loss Convergence Validation
|
|
# =============================================================================
|
|
# BUG-CATCHING VALUE: CRITICAL
|
|
# PURPOSE: Validate entire training pipeline produces learning
|
|
# SYMPTOM: Training runs but model doesn't improve
|
|
# =============================================================================
|
|
|
|
class TestLossConvergence:
|
|
"""Test that training actually produces learning on simple problems."""
|
|
|
|
def test_linear_regression_convergence(self):
|
|
"""
|
|
Test training converges on simple linear regression problem.
|
|
|
|
Problem: Learn y = 2x + 1
|
|
Model: Linear(1, 1) with weights and bias
|
|
Success criteria: Loss decreases, learned weights ≈ [2.0], bias ≈ [1.0]
|
|
"""
|
|
# Create model
|
|
class LinearModel:
|
|
def __init__(self):
|
|
self.layer = Linear(1, 1)
|
|
self.training = True
|
|
|
|
def forward(self, x):
|
|
return self.layer.forward(x)
|
|
|
|
def parameters(self):
|
|
return self.layer.parameters()
|
|
|
|
model = LinearModel()
|
|
optimizer = SGD(model.parameters(), lr=0.01)
|
|
loss_fn = MSELoss()
|
|
trainer = Trainer(model, optimizer, loss_fn)
|
|
|
|
# Generate training data: y = 2x + 1
|
|
np.random.seed(42)
|
|
X_train = np.random.randn(100, 1).astype(np.float32)
|
|
y_train = (2.0 * X_train + 1.0).astype(np.float32)
|
|
|
|
# Create dataset
|
|
class RegressionDataset:
|
|
def __init__(self, X, y, batch_size=10):
|
|
self.X = X
|
|
self.y = y
|
|
self.batch_size = batch_size
|
|
|
|
def __iter__(self):
|
|
indices = np.arange(len(self.X))
|
|
np.random.shuffle(indices)
|
|
for i in range(0, len(self.X), self.batch_size):
|
|
batch_indices = indices[i:i+self.batch_size]
|
|
yield Tensor(self.X[batch_indices]), Tensor(self.y[batch_indices])
|
|
|
|
dataset = RegressionDataset(X_train, y_train, batch_size=10)
|
|
|
|
# Train for 100 epochs
|
|
print("\n🔬 Testing loss convergence on y = 2x + 1:")
|
|
losses = []
|
|
for epoch in range(100):
|
|
loss = trainer.train_epoch(dataset)
|
|
losses.append(loss)
|
|
|
|
if epoch % 20 == 0:
|
|
print(f"Epoch {epoch:3d}: Loss = {loss:.6f}")
|
|
|
|
initial_loss = losses[0]
|
|
final_loss = losses[-1]
|
|
|
|
print(f"\nInitial loss: {initial_loss:.6f}")
|
|
print(f"Final loss: {final_loss:.6f}")
|
|
print(f"Reduction: {(1 - final_loss/initial_loss)*100:.1f}%")
|
|
|
|
# Test 1: Loss should decrease significantly
|
|
assert final_loss < initial_loss * 0.1, \
|
|
f"Loss should decrease to < 10% of initial. Got {final_loss/initial_loss*100:.1f}%"
|
|
|
|
# Test 2: Loss should be near zero (good fit)
|
|
assert final_loss < 0.1, \
|
|
f"Final loss should be < 0.1 for simple problem. Got {final_loss:.6f}"
|
|
|
|
# Test 3: Learned weights should approximate true values
|
|
learned_weight = model.layer.weights.data[0, 0]
|
|
learned_bias = model.layer.bias.data[0] if model.layer.bias is not None else 0.0
|
|
|
|
print(f"\nTrue parameters: weight=2.0, bias=1.0")
|
|
print(f"Learned parameters: weight={learned_weight:.3f}, bias={learned_bias:.3f}")
|
|
|
|
# Allow some tolerance for learning
|
|
assert abs(learned_weight - 2.0) < 0.5, \
|
|
f"Weight should be close to 2.0, got {learned_weight:.3f}"
|
|
|
|
if model.layer.bias is not None:
|
|
assert abs(learned_bias - 1.0) < 0.5, \
|
|
f"Bias should be close to 1.0, got {learned_bias:.3f}"
|
|
|
|
print("✅ Training successfully converged to correct solution!")
|
|
|
|
def test_classification_convergence(self):
|
|
"""
|
|
Test training converges on simple classification problem.
|
|
|
|
Problem: Learn XOR-like pattern with 2-layer network
|
|
Success criteria: Loss decreases, accuracy improves
|
|
"""
|
|
# Create 2-layer model for XOR
|
|
class XORModel:
|
|
def __init__(self):
|
|
self.layer1 = Linear(2, 4)
|
|
self.relu = ReLU()
|
|
self.layer2 = Linear(4, 2)
|
|
self.training = True
|
|
|
|
def forward(self, x):
|
|
x = self.layer1.forward(x)
|
|
x = self.relu.forward(x)
|
|
x = self.layer2.forward(x)
|
|
return x
|
|
|
|
def parameters(self):
|
|
return self.layer1.parameters() + self.layer2.parameters()
|
|
|
|
model = XORModel()
|
|
optimizer = AdamW(model.parameters(), lr=0.01)
|
|
loss_fn = CrossEntropyLoss()
|
|
trainer = Trainer(model, optimizer, loss_fn)
|
|
|
|
# Generate XOR-like data
|
|
np.random.seed(42)
|
|
X_train = np.array([
|
|
[0, 0], [0, 1], [1, 0], [1, 1],
|
|
[0, 0], [0, 1], [1, 0], [1, 1],
|
|
[0, 0], [0, 1], [1, 0], [1, 1],
|
|
], dtype=np.float32)
|
|
|
|
y_train = np.array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0], dtype=np.int64)
|
|
|
|
# Create dataset
|
|
class XORDataset:
|
|
def __iter__(self):
|
|
for i in range(len(X_train)):
|
|
yield Tensor(X_train[i:i+1]), Tensor(y_train[i:i+1])
|
|
|
|
dataset = XORDataset()
|
|
|
|
# Train for 200 epochs
|
|
print("\n🔬 Testing classification convergence on XOR pattern:")
|
|
losses = []
|
|
for epoch in range(200):
|
|
loss = trainer.train_epoch(dataset)
|
|
losses.append(loss)
|
|
|
|
if epoch % 40 == 0:
|
|
print(f"Epoch {epoch:3d}: Loss = {loss:.6f}")
|
|
|
|
initial_loss = losses[0]
|
|
final_loss = losses[-1]
|
|
|
|
print(f"\nInitial loss: {initial_loss:.6f}")
|
|
print(f"Final loss: {final_loss:.6f}")
|
|
print(f"Reduction: {(1 - final_loss/initial_loss)*100:.1f}%")
|
|
|
|
# Test: Loss should decrease significantly
|
|
assert final_loss < initial_loss * 0.5, \
|
|
f"Loss should decrease to < 50% of initial. Got {final_loss/initial_loss*100:.1f}%"
|
|
|
|
print("✅ Classification training successfully converged!")
|
|
|
|
|
|
# =============================================================================
|
|
# CRITICAL TEST 3: Scheduler Integration
|
|
# =============================================================================
|
|
# BUG-CATCHING VALUE: HIGH
|
|
# COMMON BUG: Scheduler exists but doesn't actually update learning rate
|
|
# SYMPTOM: Learning rate stays constant despite scheduler
|
|
# =============================================================================
|
|
|
|
class TestSchedulerIntegration:
|
|
"""Test that learning rate scheduler actually updates optimizer learning rate."""
|
|
|
|
def test_scheduler_updates_learning_rate(self):
|
|
"""
|
|
Test that CosineSchedule integrates with Trainer and updates LR each epoch.
|
|
|
|
This validates:
|
|
1. Scheduler computes correct learning rates
|
|
2. Trainer applies scheduler updates to optimizer
|
|
3. Learning rate actually changes during training
|
|
"""
|
|
# Create simple model
|
|
class SimpleModel:
|
|
def __init__(self):
|
|
self.layer = Linear(2, 1)
|
|
self.training = True
|
|
|
|
def forward(self, x):
|
|
return self.layer.forward(x)
|
|
|
|
def parameters(self):
|
|
return self.layer.parameters()
|
|
|
|
model = SimpleModel()
|
|
optimizer = SGD(model.parameters(), lr=0.1) # Initial LR (will be overridden)
|
|
|
|
# Create scheduler: 0.1 → 0.01 over 10 epochs
|
|
scheduler = CosineSchedule(max_lr=0.1, min_lr=0.01, total_epochs=10)
|
|
|
|
loss_fn = MSELoss()
|
|
trainer = Trainer(model, optimizer, loss_fn, scheduler=scheduler)
|
|
|
|
# Create simple dataset
|
|
class SimpleDataset:
|
|
def __iter__(self):
|
|
for _ in range(5):
|
|
x = Tensor(np.random.randn(4, 2))
|
|
y = Tensor(np.random.randn(4, 1))
|
|
yield x, y
|
|
|
|
print("\n🔬 Testing learning rate scheduling:")
|
|
|
|
# Train for 10 epochs and track learning rate
|
|
learning_rates = []
|
|
for epoch in range(10):
|
|
# Record LR before training
|
|
lr_before = optimizer.lr
|
|
|
|
# Train one epoch
|
|
trainer.train_epoch(SimpleDataset())
|
|
|
|
# Record LR after training (scheduler should have updated it)
|
|
lr_after = optimizer.lr
|
|
learning_rates.append(lr_after)
|
|
|
|
print(f"Epoch {epoch}: LR = {lr_after:.6f}")
|
|
|
|
print(f"\nLearning rates: {[f'{lr:.4f}' for lr in learning_rates]}")
|
|
|
|
# Test 1: Learning rate should start at max_lr
|
|
assert abs(learning_rates[0] - 0.1) < 1e-6, \
|
|
f"Initial LR should be 0.1, got {learning_rates[0]:.6f}"
|
|
|
|
# Test 2: Learning rate should end at min_lr
|
|
assert abs(learning_rates[-1] - 0.01) < 1e-6, \
|
|
f"Final LR should be 0.01, got {learning_rates[-1]:.6f}"
|
|
|
|
# Test 3: Learning rate should decrease monotonically
|
|
for i in range(len(learning_rates) - 1):
|
|
assert learning_rates[i] >= learning_rates[i+1], \
|
|
f"LR should decrease monotonically. Epoch {i}: {learning_rates[i]:.6f} > Epoch {i+1}: {learning_rates[i+1]:.6f}"
|
|
|
|
# Test 4: Learning rate should actually change (not stuck)
|
|
unique_lrs = len(set([round(lr, 6) for lr in learning_rates]))
|
|
assert unique_lrs >= 5, \
|
|
f"LR should change across epochs. Only {unique_lrs} unique values found."
|
|
|
|
# Test 5: History should track learning rates
|
|
assert len(trainer.history['learning_rates']) == 10, \
|
|
"Trainer should record learning rate for each epoch"
|
|
|
|
print("✅ Learning rate scheduling works correctly!")
|
|
|
|
def test_training_without_scheduler(self):
|
|
"""
|
|
Test that training works correctly when scheduler=None.
|
|
|
|
This validates that scheduler is truly optional.
|
|
"""
|
|
# Create simple model
|
|
class SimpleModel:
|
|
def __init__(self):
|
|
self.layer = Linear(1, 1)
|
|
self.training = True
|
|
|
|
def forward(self, x):
|
|
return self.layer.forward(x)
|
|
|
|
def parameters(self):
|
|
return self.layer.parameters()
|
|
|
|
model = SimpleModel()
|
|
optimizer = SGD(model.parameters(), lr=0.05)
|
|
loss_fn = MSELoss()
|
|
|
|
# Create trainer WITHOUT scheduler
|
|
trainer = Trainer(model, optimizer, loss_fn, scheduler=None)
|
|
|
|
# Create simple dataset
|
|
class SimpleDataset:
|
|
def __iter__(self):
|
|
for _ in range(3):
|
|
x = Tensor(np.random.randn(2, 1))
|
|
y = Tensor(np.random.randn(2, 1))
|
|
yield x, y
|
|
|
|
print("\n🔬 Testing training without scheduler:")
|
|
|
|
# Train for 5 epochs
|
|
initial_lr = optimizer.lr
|
|
for epoch in range(5):
|
|
trainer.train_epoch(SimpleDataset())
|
|
current_lr = optimizer.lr
|
|
|
|
print(f"Epoch {epoch}: LR = {current_lr:.6f}")
|
|
|
|
# Learning rate should stay constant
|
|
assert abs(current_lr - initial_lr) < 1e-9, \
|
|
f"LR should remain constant without scheduler. Expected {initial_lr}, got {current_lr}"
|
|
|
|
print("✅ Training without scheduler works correctly!")
|
|
|
|
|
|
# =============================================================================
|
|
# Test Execution
|
|
# =============================================================================
|
|
|
|
if __name__ == "__main__":
|
|
print("=" * 70)
|
|
print("Module 07 - CRITICAL Integration Tests")
|
|
print("=" * 70)
|
|
|
|
# Test 1: Missing zero_grad()
|
|
print("\n" + "=" * 70)
|
|
print("TEST 1: Missing zero_grad() Detection")
|
|
print("=" * 70)
|
|
test_zero_grad = TestMissingZeroGrad()
|
|
test_zero_grad.test_zero_grad_required_for_correct_training()
|
|
test_zero_grad.test_trainer_calls_zero_grad()
|
|
|
|
# Test 2: Loss Convergence
|
|
print("\n" + "=" * 70)
|
|
print("TEST 2: Loss Convergence Validation")
|
|
print("=" * 70)
|
|
test_convergence = TestLossConvergence()
|
|
test_convergence.test_linear_regression_convergence()
|
|
test_convergence.test_classification_convergence()
|
|
|
|
# Test 3: Scheduler Integration
|
|
print("\n" + "=" * 70)
|
|
print("TEST 3: Scheduler Integration")
|
|
print("=" * 70)
|
|
test_scheduler = TestSchedulerIntegration()
|
|
test_scheduler.test_scheduler_updates_learning_rate()
|
|
test_scheduler.test_training_without_scheduler()
|
|
|
|
print("\n" + "=" * 70)
|
|
print("ALL CRITICAL TESTS PASSED! ✅")
|
|
print("=" * 70)
|
|
print("\nModule 07 Training has passed critical integration validation.")
|
|
print("These tests verify:")
|
|
print(" ✅ Gradients are managed correctly (zero_grad)")
|
|
print(" ✅ Training produces learning (convergence)")
|
|
print(" ✅ Learning rate scheduling works (scheduler integration)")
|