"""
Checkpoint 8: Differentiation (After Module 9 - Autograd)
Question: "Can I automatically compute gradients for learning?"
"""
import numpy as np
import pytest


def test_checkpoint_08_differentiation():
"""
Checkpoint 8: Differentiation
Validates that students can automatically compute gradients through
computational graphs - the foundation that makes neural network learning
possible and practical.
"""
print("\n∇ Checkpoint 8: Differentiation")
print("=" * 50)
try:
from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Dense
from tinytorch.core.activations import ReLU, Sigmoid
from tinytorch.core.losses import MeanSquaredError
except ImportError as e:
pytest.fail(f"❌ Cannot import required classes - complete Modules 2-9 first: {e}")
    # Test 1: Basic gradient computation
    print("📐 Testing basic gradient computation...")

    # Create tensor that requires gradients
    x = Tensor([[2.0, 3.0]], requires_grad=True)

    # Simple computation: y = x^2 + 2x + 1
    y = x * x + 2 * x + 1

    # Compute gradients
    y.backward()
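    # backward() on this non-scalar output is assumed to seed the output gradient
    # with ones; PyTorch would require an explicit gradient argument here.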
    # Check that gradients were computed
    assert hasattr(x, 'grad'), "Tensor should have gradient after backward()"
    assert x.grad is not None, "Gradient should not be None"

    # Expected gradient: dy/dx = 2x + 2 = [6, 8] for x = [2, 3]
    expected_grad = np.array([[6.0, 8.0]])
    assert np.allclose(x.grad.data, expected_grad, atol=1e-5), f"Expected gradient {expected_grad}, got {x.grad.data}"
    print(f"✅ Basic gradients: y = x² + 2x + 1 → dy/dx = {x.grad.data}")
    # Test 2: Neural network gradient computation
    print("🧠 Testing neural network gradients...")

    # Create simple network
    layer = Dense(input_size=2, output_size=1)
    activation = Sigmoid()
    loss_fn = MeanSquaredError()

    # Set network to require gradients
    layer.weights.requires_grad = True
    layer.bias.requires_grad = True
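    # Gradient tracking is switched on explicitly above; the Dense layer is not
    # assumed to enable requires_grad on its parameters by default.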
    # Forward pass
    input_data = Tensor([[1.0, 2.0]], requires_grad=True)
    target = Tensor([[0.5]])

    hidden = layer(input_data)
    output = activation(hidden)
    loss = loss_fn(output, target)

    # Backward pass
    loss.backward()

    # Check that all parameters have gradients
    assert layer.weights.grad is not None, "Weights should have gradients"
    assert layer.bias.grad is not None, "Bias should have gradients"
    assert input_data.grad is not None, "Input should have gradients"
    print(f"✅ Network gradients: weights{layer.weights.grad.shape}, bias{layer.bias.grad.shape}, input{input_data.grad.shape}")
    # Test 3: Chain rule verification
    print("🔗 Testing chain rule...")

    # Multi-layer computation to test chain rule
    x = Tensor([[1.0]], requires_grad=True)

    # z = (x * 2)^2 = 4x^2, dz/dx = 8x = 8 for x=1
    intermediate = x * 2             # u = 2x, du/dx = 2
    z = intermediate * intermediate  # z = u^2, dz/du = 2u = 4x
    z.backward()

    expected_chain_grad = 8.0  # dz/dx = dz/du * du/dx = 4x * 2 = 8x = 8
    assert np.allclose(x.grad.data, expected_chain_grad, atol=1e-5), f"Chain rule: expected {expected_chain_grad}, got {x.grad.data}"
    print(f"✅ Chain rule: z = (2x)² → dz/dx = {x.grad.data[0, 0]}")
    # Test 4: Multi-layer network gradients
    print("🏗️ Testing multi-layer network gradients...")

    # Build deeper network
    layer1 = Dense(3, 5)
    layer2 = Dense(5, 2)
    layer3 = Dense(2, 1)
    relu = ReLU()

    # Enable gradients for all parameters
    for layer in [layer1, layer2, layer3]:
        layer.weights.requires_grad = True
        layer.bias.requires_grad = True

    # Forward and backward pass
    batch_input = Tensor(np.random.randn(2, 3), requires_grad=True)
    batch_target = Tensor(np.random.randn(2, 1))

    h1 = relu(layer1(batch_input))
    h2 = relu(layer2(h1))
    prediction = layer3(h2)
    batch_loss = loss_fn(prediction, batch_target)
    batch_loss.backward()

    # Verify all layers have gradients
    gradient_shapes = []
    for i, layer in enumerate([layer1, layer2, layer3], 1):
        assert layer.weights.grad is not None, f"Layer {i} weights should have gradients"
        assert layer.bias.grad is not None, f"Layer {i} bias should have gradients"
        gradient_shapes.append(f"L{i}_w{layer.weights.grad.shape}")
    print(f"✅ Multi-layer gradients: {', '.join(gradient_shapes)}")
    # Test 5: Gradient accumulation
    print("📈 Testing gradient accumulation...")

    # Create parameter for accumulation test
    param = Tensor([[1.0, 2.0]], requires_grad=True)

    # First computation
    loss1 = (param * 2).sum()
    loss1.backward()
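    # d/dparam of (param * 2).sum() is 2 at every element, so first_grad should be [[2., 2.]]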
    first_grad = param.grad.data.copy()

    # Second computation (without zeroing gradients)
    loss2 = (param * 3).sum()
    loss2.backward()
    accumulated_grad = param.grad.data

    # Gradients should accumulate: grad = 2 + 3 = 5 for each element
    expected_accumulated = first_grad + np.array([[3.0, 3.0]])
    assert np.allclose(accumulated_grad, expected_accumulated), f"Gradients should accumulate: {accumulated_grad} vs {expected_accumulated}"
    print(f"✅ Gradient accumulation: {first_grad} + [3, 3] = {accumulated_grad}")
    # Test 6: Gradient zeroing
    print("🔄 Testing gradient zeroing...")

    # Zero gradients and recompute
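    # Clearing .grad between steps is what an optimizer's zero_grad() does each
    # training iteration; without it, the accumulated gradient from Test 5 would persist.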
    if hasattr(param, 'zero_grad'):
        param.zero_grad()
    else:
        param.grad = None

    loss3 = (param * 4).sum()
    loss3.backward()
    zeroed_grad = param.grad.data

    expected_fresh = np.array([[4.0, 4.0]])
    assert np.allclose(zeroed_grad, expected_fresh), f"Zeroed gradients should be fresh: {zeroed_grad} vs {expected_fresh}"
    print(f"✅ Gradient zeroing: fresh computation → {zeroed_grad}")
    # Test 7: Computational graph complexity
    print("🕸️ Testing complex computational graph...")

    # Complex computation with multiple paths
    a = Tensor([[2.0]], requires_grad=True)
    b = Tensor([[3.0]], requires_grad=True)

    # Multiple paths: c = a*b + a^2 + b^2
    path1 = a * b  # ab, da = b, db = a
    path2 = a * a  # a^2, da = 2a
    path3 = b * b  # b^2, db = 2b
    c = path1 + path2 + path3
    c.backward()

    # Expected gradients:
    # dc/da = b + 2a = 3 + 4 = 7
    # dc/db = a + 2b = 2 + 6 = 8
    expected_a_grad = 7.0
    expected_b_grad = 8.0
    assert np.allclose(a.grad.data, expected_a_grad), f"Complex graph grad_a: expected {expected_a_grad}, got {a.grad.data}"
    assert np.allclose(b.grad.data, expected_b_grad), f"Complex graph grad_b: expected {expected_b_grad}, got {b.grad.data}"
    print(f"✅ Complex graph: c = ab + a² + b² → da = {a.grad.data[0,0]}, db = {b.grad.data[0,0]}")
    # Test 8: Memory efficiency
    print("💾 Testing gradient computation efficiency...")

    # Test that intermediate computations don't leak memory
    large_param = Tensor(np.random.randn(100, 100), requires_grad=True)

    # Multiple forward-backward cycles
    for i in range(3):
        output = (large_param * (i + 1)).sum()
        output.backward()
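        # d/dparam of (param * (i + 1)).sum() is (i + 1) at every position, so the
        # gradient must exist and match the parameter's shape; values aren't checked here.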
        # Check gradient exists and has correct shape
        assert large_param.grad is not None, f"Gradient should exist in cycle {i}"
        assert large_param.grad.shape == large_param.shape, "Gradient shape should match parameter shape"

        # Zero gradients for next iteration
        if hasattr(large_param, 'zero_grad'):
            large_param.zero_grad()
        else:
            large_param.grad = None
    print(f"✅ Memory efficiency: multiple cycles on {large_param.shape} tensor")
print("\n🎉 Differentiation Complete!")
print("📝 You can now automatically compute gradients for learning")
print("🔧 Built capabilities: Autograd, chain rule, gradient accumulation, complex graphs")
print("🧠 Breakthrough: You have the foundation for all neural network learning!")
print("🎯 Next: Build optimizers to update parameters using gradients")
if __name__ == "__main__":
test_checkpoint_08_differentiation()