mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-05 01:54:47 -05:00
Merge transformer-training into dev
Complete Milestone 05 - 2017 Transformer implementation Major Features: - TinyTalks interactive dashboard with rich CLI - Complete gradient flow fixes (13 tests passing) - Multiple training examples (5-min, 10-min, levels 1-2) - Milestone celebration card (perceptron style) - Comprehensive documentation Gradient Flow Fixes: - Fixed reshape, matmul (3D), embedding, sqrt, mean, sub, div, GELU - All transformer components now fully differentiable - Hybrid attention approach for educational clarity + gradients Training Results: - 10-min training: 96.6% loss improvement, 62.5% accuracy - 5-min training: 97.8% loss improvement, 66.7% accuracy - Working chatbot with coherent responses Files Added: - tinytalks_dashboard.py (main demo) - tinytalks_chatbot.py, tinytalks_dataset.py - level1_memorization.py, level2_patterns.py - Comprehensive docs and test suites Ready for student use 2>&1
This commit is contained in:
180
tests/05_autograd/test_gradient_flow.py
Normal file
180
tests/05_autograd/test_gradient_flow.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""
|
||||
Test gradient flow through all autograd operations.
|
||||
|
||||
This test suite validates that all arithmetic operations and activations
|
||||
properly preserve gradient tracking and enable backpropagation.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.activations import GELU
|
||||
# Import transformer to ensure mean/sqrt monkey-patches are applied
|
||||
from tinytorch.models import transformer
|
||||
|
||||
|
||||
def test_arithmetic_gradient_flow():
    """Check that +, -, * and / keep their results attached to the autograd graph.

    Each binary operator is applied to the same pair of ``requires_grad=True``
    tensors; the result must report ``requires_grad`` and expose a
    ``_grad_fn`` attribute.
    """
    print("Testing arithmetic gradient flow...")

    lhs = Tensor(np.array([2.0, 3.0]), requires_grad=True)
    rhs = Tensor(np.array([4.0, 5.0]), requires_grad=True)

    # (label used in the failure messages, operation under test)
    operations = (
        ("Addition", lambda a, b: a + b),
        ("Subtraction", lambda a, b: a - b),
        ("Multiplication", lambda a, b: a * b),
        ("Division", lambda a, b: a / b),
    )

    for label, apply_op in operations:
        result = apply_op(lhs, rhs)
        assert result.requires_grad, f"{label} should preserve requires_grad"
        # NOTE(review): hasattr only proves the attribute exists; it does not
        # prove a backward function was actually attached.
        assert hasattr(result, '_grad_fn'), f"{label} should set _grad_fn"

    print("✅ All arithmetic operations preserve gradient tracking")
|
||||
|
||||
|
||||
def test_subtraction_backward():
    """Verify the gradients of ``loss = sum(a - b)`` with respect to both operands.

    Every element of ``a`` must receive a gradient of +1 and every element of
    ``b`` a gradient of -1.
    """
    print("Testing subtraction backward pass...")

    minuend = Tensor(np.array([5.0, 10.0]), requires_grad=True)
    subtrahend = Tensor(np.array([2.0, 3.0]), requires_grad=True)

    # Forward: difference = minuend - subtrahend, reduced to a scalar loss
    difference = minuend - subtrahend
    scalar_loss = difference.sum()

    # Backward
    scalar_loss.backward()

    # Both leaves must have been reached by backpropagation
    assert minuend.grad is not None, "Gradient should flow to a"
    assert subtrahend.grad is not None, "Gradient should flow to b"

    # ∂loss/∂a = 1, ∂loss/∂b = -1
    expected_minuend_grad = np.array([1.0, 1.0])
    expected_subtrahend_grad = np.array([-1.0, -1.0])
    assert np.allclose(minuend.grad, expected_minuend_grad), "Gradient wrt a should be 1"
    assert np.allclose(subtrahend.grad, expected_subtrahend_grad), "Gradient wrt b should be -1"

    print("✅ Subtraction backward pass correct")
|
||||
|
||||
|
||||
def test_division_backward():
    """Verify the gradients of ``loss = sum(a / b)`` with respect to both operands.

    The analytic gradients are ``1 / b`` for the numerator and
    ``-a / b**2`` for the denominator.
    """
    print("Testing division backward pass...")

    numerator = Tensor(np.array([6.0, 12.0]), requires_grad=True)
    denominator = Tensor(np.array([2.0, 3.0]), requires_grad=True)

    # Forward: quotient = numerator / denominator, reduced to a scalar loss
    quotient = numerator / denominator
    scalar_loss = quotient.sum()

    # Backward
    scalar_loss.backward()

    # Both leaves must have been reached by backpropagation
    assert numerator.grad is not None, "Gradient should flow to a"
    assert denominator.grad is not None, "Gradient should flow to b"

    # ∂(a/b)/∂a = 1/b
    expected_numerator_grad = 1.0 / denominator.data
    assert np.allclose(numerator.grad, expected_numerator_grad), "Gradient wrt a should be 1/b"

    # ∂(a/b)/∂b = -a/b²
    expected_denominator_grad = -numerator.data / (denominator.data ** 2)
    assert np.allclose(denominator.grad, expected_denominator_grad), "Gradient wrt b should be -a/b²"

    print("✅ Division backward pass correct")
|
||||
|
||||
|
||||
def test_gelu_gradient_flow():
    """Check that a GELU activation stays differentiable end to end.

    The activation output must track gradients, and backpropagating a summed
    loss must leave a non-zero gradient on the input tensor.
    """
    print("Testing GELU gradient flow...")

    inputs = Tensor(np.array([1.0, 2.0, 3.0]), requires_grad=True)
    activation = GELU()

    # Forward
    activated = activation(inputs)
    assert activated.requires_grad, "GELU output should have requires_grad=True"
    assert hasattr(activated, '_grad_fn'), "GELU should set _grad_fn"

    # Backward
    scalar_loss = activated.sum()
    scalar_loss.backward()

    assert inputs.grad is not None, "Gradient should flow through GELU"
    largest_magnitude = np.abs(inputs.grad).max()
    assert largest_magnitude > 1e-10, "GELU gradient should be non-zero"

    print("✅ GELU gradient flow works correctly")
|
||||
|
||||
|
||||
def test_layernorm_operations():
    """Test gradient flow and gradient values for the sqrt and mean operations.

    These are the tensor operations exercised here on behalf of LayerNorm
    (the file's comments describe them as monkey-patched by the transformer
    module). Besides checking that a gradient reaches the input, the test
    compares it against the analytic result so that a backward pass that
    produces *some* gradient with the wrong values is also caught:

    - ``loss = sum(sqrt(x))``            ->  ``dloss/dx = 1 / (2 * sqrt(x))``
    - ``loss = sum(mean(x, axis=-1))``   ->  ``dloss/dx = 1 / n`` per element,
      where ``n`` is the size of the reduced axis (3 here)
    """
    print("Testing LayerNorm operations gradient flow...")

    # Test sqrt (monkey-patched in transformer module)
    x = Tensor(np.array([4.0, 9.0, 16.0]), requires_grad=True)
    sqrt_x = x.sqrt()
    assert sqrt_x.requires_grad, "Sqrt should preserve requires_grad"
    loss = sqrt_x.sum()
    loss.backward()
    assert x.grad is not None, "Gradient should flow through sqrt"
    # 1 / (2 * sqrt(x)) for x = 4, 9, 16
    expected_sqrt_grad = np.array([0.25, 1.0 / 6.0, 0.125])
    assert np.allclose(x.grad, expected_sqrt_grad), "Gradient wrt x should be 1/(2*sqrt(x))"

    # Test mean (monkey-patched in transformer module)
    x2 = Tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), requires_grad=True)
    row_mean = x2.mean(axis=-1, keepdims=True)
    # Mean uses monkey-patched version in transformer context
    assert row_mean.requires_grad, "Mean should preserve requires_grad"
    loss2 = row_mean.sum()
    loss2.backward()
    assert x2.grad is not None, "Gradient should flow through mean"
    # The gradient must be laid out like the input; checked explicitly because
    # np.allclose would silently broadcast a (2, 1) gradient.
    assert x2.grad.shape == x2.shape, "Mean gradient shape should match input shape"
    # Each of the 3 entries in a row contributes 1/3 to that row's mean
    expected_mean_grad = np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) / 3.0
    assert np.allclose(x2.grad, expected_mean_grad), "Gradient wrt x should be 1/n"

    print("✅ LayerNorm operations gradient flow works")
|
||||
|
||||
|
||||
def test_reshape_gradient_flow():
    """Check that reshaping a tensor does not detach it from the autograd graph.

    A 2x2 tensor is flattened to length 4; after backpropagating a summed loss
    the original tensor must hold a gradient with its own (2x2) shape.
    """
    print("Testing reshape gradient flow...")

    matrix = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]]), requires_grad=True)
    flattened = matrix.reshape(4)

    assert flattened.requires_grad, "Reshape should preserve requires_grad"
    assert hasattr(flattened, '_grad_fn'), "Reshape should set _grad_fn"

    # Backward
    scalar_loss = flattened.sum()
    scalar_loss.backward()

    assert matrix.grad is not None, "Gradient should flow through reshape"
    # The gradient has to come back in the pre-reshape layout
    assert matrix.grad.shape == matrix.shape, "Gradient shape should match input shape"

    print("✅ Reshape gradient flow works correctly")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run every gradient-flow test in order with a banner.
    rule = "=" * 70

    print("\n" + rule)
    print("GRADIENT FLOW TEST SUITE")
    print(rule + "\n")

    for run_test in (
        test_arithmetic_gradient_flow,
        test_subtraction_backward,
        test_division_backward,
        test_gelu_gradient_flow,
        test_layernorm_operations,
        test_reshape_gradient_flow,
    ):
        run_test()

    print("\n" + rule)
    print("✅ ALL GRADIENT FLOW TESTS PASSED")
    print(rule + "\n")
|
||||
|
||||
238
tests/13_transformers/test_training_simple.py
Normal file
238
tests/13_transformers/test_training_simple.py
Normal file
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
Simple end-to-end training test for transformers.
|
||||
|
||||
This test validates that a transformer can successfully learn from a tiny dataset,
|
||||
demonstrating that the entire training pipeline (forward, loss, backward, update) works.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.core.optimizers import Adam
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
from tinytorch.models.transformer import GPT
|
||||
from tinytorch.text.tokenization import CharTokenizer
|
||||
|
||||
|
||||
def test_transformer_memorization():
    """
    Train a tiny GPT on five short code snippets and check that it memorizes them.

    The test passes when:
    - every per-step loss is finite (no NaN/Inf)
    - every parameter has a non-zero gradient after the first backward pass
    - the 500-step training loop finishes in under 120 seconds
    - the mean loss over the last 100 steps is more than 80% below the loss
      of the first step, and below 0.5

    Returns True on success (failures surface as AssertionError).
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST: Transformer Memorization Capability")
    print(rule)

    # Tiny dataset (5 patterns)
    patterns = [
        "def add(a, b):\n return a + b",
        "def sub(a, b):\n return a - b",
        "for i in range(10):\n print(i)",
        "if x > 0:\n print('positive')",
        "numbers = [1, 2, 3, 4, 5]",
    ]

    # Character-level tokenizer built from the patterns themselves
    tokenizer = CharTokenizer()
    tokenizer.build_vocab(patterns)
    print(f" Vocabulary size: {tokenizer.vocab_size}")

    # Small model so the test stays fast
    model = GPT(
        vocab_size=tokenizer.vocab_size,
        embed_dim=32,
        num_layers=1,
        num_heads=4,
        max_seq_len=64,
    )

    num_params = sum(np.prod(p.shape) for p in model.parameters())
    print(f" Model parameters: {num_params:,}")

    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Encode each pattern and force it to exactly max_len ids:
    # truncate long sequences, right-pad short ones with id 0.
    max_len = 64
    encoded = []
    for text in patterns:
        ids = tokenizer.encode(text)
        if len(ids) > max_len:
            ids = ids[:max_len]
        else:
            ids = ids + [0] * (max_len - len(ids))
        encoded.append(ids)

    print(" Training for 500 steps...")
    losses = []
    start_time = time.time()

    for step in range(500):
        # One randomly chosen pattern per step; targets are inputs shifted by one
        sample = encoded[np.random.randint(len(encoded))]
        target_len = len(sample) - 1
        x = Tensor(np.array([sample[:-1]], dtype=np.int32))
        y = Tensor(np.array([sample[1:]], dtype=np.int32))

        # Forward pass (batch dimension flattened away before the loss)
        logits = model.forward(x)
        loss = loss_fn(
            logits.reshape(target_len, tokenizer.vocab_size),
            y.reshape(target_len),
        )

        # Abort immediately on a numerically broken loss
        assert not np.isnan(loss.data).any(), f"NaN loss at step {step}"
        assert not np.isinf(loss.data).any(), f"Inf loss at step {step}"

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # On the very first step, every parameter must have a non-zero gradient
        if step == 0:
            params_with_grad = len([
                p for p in model.parameters()
                if p.grad is not None and np.abs(p.grad).max() > 1e-10
            ])
            total_params = len(model.parameters())
            assert params_with_grad == total_params, \
                f"Only {params_with_grad}/{total_params} parameters have gradients"

        # Element-wise gradient clipping to [-1, 1]
        for p in model.parameters():
            if p.grad is None:
                continue
            p.grad = np.clip(p.grad, -1.0, 1.0)

        optimizer.step()
        losses.append(loss.data.item())

    elapsed = time.time() - start_time

    # First-step loss versus the average of the last 100 steps
    initial_loss = losses[0]
    final_loss = np.mean(losses[-100:])
    loss_decrease_pct = ((initial_loss - final_loss) / initial_loss) * 100

    print("\n Results:")
    print(f" ├─ Initial loss: {initial_loss:.3f}")
    print(f" ├─ Final loss: {final_loss:.3f}")
    print(f" ├─ Loss decrease: {loss_decrease_pct:.1f}%")
    print(f" └─ Training time: {elapsed:.1f}s")

    assert elapsed < 120, f"Training too slow: {elapsed:.1f}s > 120s"
    assert loss_decrease_pct > 80, \
        f"Insufficient learning: loss decreased only {loss_decrease_pct:.1f}% (expected >80%)"
    assert final_loss < 0.5, \
        f"Final loss too high: {final_loss:.3f} (expected <0.5 for memorization)"

    print("\n✅ Transformer successfully memorized dataset!")
    print(f" Loss decreased {loss_decrease_pct:.1f}% in {elapsed:.1f}s")
    return True
|
||||
|
||||
|
||||
def test_transformer_convergence_rate():
    """
    Test that the transformer converges at the expected rate.

    This is a regression test to catch training instabilities. A small GPT is
    trained on two patterns, one randomly chosen pattern per step, until the
    per-step loss drops to 0.1 or below, or 1000 steps have run. The test
    fails if 500 or more steps were needed, or if the loss ever becomes
    NaN/Inf.

    Returns True on success (failures surface as AssertionError).
    """
    print("\n" + "="*70)
    print("TEST: Transformer Convergence Rate")
    print("="*70)

    # Setup mirrors the memorization test, but with only two patterns
    patterns = [
        "def add(a, b):\n return a + b",
        "def sub(a, b):\n return a - b",
    ]

    tokenizer = CharTokenizer()
    tokenizer.build_vocab(patterns)

    model = GPT(
        vocab_size=tokenizer.vocab_size,
        embed_dim=32,
        num_layers=1,
        num_heads=4,
        max_seq_len=64
    )

    optimizer = Adam(model.parameters(), lr=0.001)
    loss_fn = CrossEntropyLoss()

    # Encode patterns: truncate to max_len or right-pad with id 0
    max_len = 64
    encoded = []
    for p in patterns:
        tokens = tokenizer.encode(p)
        if len(tokens) > max_len:
            tokens = tokens[:max_len]
        else:
            tokens = tokens + [0] * (max_len - len(tokens))
        encoded.append(tokens)

    # Train until loss < 0.1
    loss_target = 0.1
    max_steps = 1000
    step = 0
    loss_val = float('inf')

    print(f" Training until loss < 0.1...")

    while loss_val > loss_target and step < max_steps:
        tokens = encoded[np.random.randint(len(encoded))]
        x = Tensor(np.array([tokens[:-1]], dtype=np.int32))
        y = Tensor(np.array([tokens[1:]], dtype=np.int32))

        logits = model.forward(x)
        logits_flat = logits.reshape(len(tokens)-1, tokenizer.vocab_size)
        y_flat = y.reshape(len(tokens)-1)
        loss = loss_fn(logits_flat, y_flat)

        loss_val = loss.data.item()
        # A NaN loss compares False against the target, which would end the
        # loop early and let a diverged run pass as "converged". Fail loudly.
        assert np.isfinite(loss_val), f"Non-finite loss ({loss_val}) at step {step}"

        optimizer.zero_grad()
        loss.backward()

        # Element-wise gradient clipping to [-1, 1]
        for p in model.parameters():
            if p.grad is not None:
                p.grad = np.clip(p.grad, -1.0, 1.0)

        optimizer.step()
        step += 1

    # Report what actually happened: the loop also ends when the step cap is hit
    if loss_val > loss_target:
        print(f" Stopped after {step} steps without reaching loss < 0.1 (last loss: {loss_val:.3f})")
    else:
        print(f" Reached loss < 0.1 in {step} steps")

    # Regression check: should converge in < 500 steps for 2 patterns
    assert step < 500, \
        f"Convergence too slow: {step} steps (expected <500). Training may be unstable."

    print(f"✅ Convergence rate is acceptable ({step} steps)")
    return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run both training tests in order with a banner.
    rule = "=" * 70

    print("\n" + rule)
    print("TRANSFORMER TRAINING TEST SUITE")
    print(rule)

    for run_test in (
        test_transformer_memorization,
        test_transformer_convergence_rate,
    ):
        run_test()

    print("\n" + rule)
    print("✅ ALL TRAINING TESTS PASSED")
    print(rule + "\n")
|
||||
|
||||
239
tests/13_transformers/test_transformer_gradient_flow.py
Normal file
239
tests/13_transformers/test_transformer_gradient_flow.py
Normal file
@@ -0,0 +1,239 @@
|
||||
"""
|
||||
Test gradient flow through complete transformer architecture.
|
||||
|
||||
This test validates that all transformer components (embeddings, attention,
|
||||
LayerNorm, MLP) properly propagate gradients during backpropagation.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from tinytorch.core.tensor import Tensor
|
||||
from tinytorch.core.autograd import enable_autograd
|
||||
from tinytorch.models.transformer import GPT, MultiHeadAttention, LayerNorm, MLP
|
||||
from tinytorch.core.losses import CrossEntropyLoss
|
||||
|
||||
|
||||
def test_multihead_attention_gradient_flow():
    """Check that backpropagation reaches every MultiHeadAttention parameter.

    A random input is pushed through the module, the output is summed into a
    scalar loss, and after ``backward()`` each parameter must hold a gradient
    whose largest absolute entry exceeds 1e-10.
    """
    print("Testing MultiHeadAttention gradient flow...")

    batch_size, seq_len, embed_dim = 2, 8, 16
    num_heads = 4

    mha = MultiHeadAttention(embed_dim, num_heads)

    # Forward pass on random activations
    x = Tensor(np.random.randn(batch_size, seq_len, embed_dim))
    output = mha.forward(x)

    # Backward pass from a scalar loss
    scalar_loss = output.sum()
    scalar_loss.backward()

    # Collect the indices of parameters with a missing or all-zero gradient
    params = mha.parameters()
    params_without_grad = [
        i for i, param in enumerate(params)
        if not (param.grad is not None and np.abs(param.grad).max() > 1e-10)
    ]
    params_with_grad = len(params) - len(params_without_grad)

    assert params_with_grad == len(params), \
        f"All {len(params)} MHA parameters should have gradients, but only {params_with_grad} do. Missing: {params_without_grad}"

    print(f"✅ All {len(params)} MultiHeadAttention parameters receive gradients")
|
||||
|
||||
|
||||
def test_layernorm_gradient_flow():
    """Check that both LayerNorm parameters receive non-zero gradients.

    A random input is normalized, the output is summed into a scalar loss,
    and after ``backward()`` each of the two parameters must hold a gradient
    whose largest absolute entry exceeds 1e-10.
    """
    print("Testing LayerNorm gradient flow...")

    batch_size, seq_len, embed_dim = 2, 8, 16

    ln = LayerNorm(embed_dim)

    # Forward pass on random activations
    x = Tensor(np.random.randn(batch_size, seq_len, embed_dim))
    normalized = ln.forward(x)

    # Backward pass from a scalar loss
    scalar_loss = normalized.sum()
    scalar_loss.backward()

    params = ln.parameters()
    assert len(params) == 2, "LayerNorm should have 2 parameters (gamma, beta)"

    for i, param in enumerate(params):
        grad = param.grad
        assert grad is not None, f"Parameter {i} should have gradient"
        assert np.abs(grad).max() > 1e-10, f"Parameter {i} gradient should be non-zero"

    print("✅ LayerNorm gradient flow works correctly")
|
||||
|
||||
|
||||
def test_mlp_gradient_flow():
    """Check that every MLP parameter receives a non-zero gradient.

    A random input is pushed through the MLP, the output is summed into a
    scalar loss, and after ``backward()`` each parameter must hold a gradient
    whose largest absolute entry exceeds 1e-10.
    """
    print("Testing MLP gradient flow...")

    batch_size, seq_len, embed_dim = 2, 8, 16

    mlp = MLP(embed_dim)

    # Forward pass on random activations
    x = Tensor(np.random.randn(batch_size, seq_len, embed_dim))
    projected = mlp.forward(x)

    # Backward pass from a scalar loss
    scalar_loss = projected.sum()
    scalar_loss.backward()

    params = mlp.parameters()
    for i, param in enumerate(params):
        grad = param.grad
        assert grad is not None, f"MLP parameter {i} should have gradient"
        assert np.abs(grad).max() > 1e-10, f"MLP parameter {i} gradient should be non-zero"

    print(f"✅ All {len(params)} MLP parameters receive gradients")
|
||||
|
||||
|
||||
def test_full_gpt_gradient_flow():
    """Check end to end that every parameter of a small GPT receives a gradient.

    Random token ids are fed through a 2-layer model, a cross-entropy loss is
    computed against random targets, and after ``backward()`` each parameter
    must hold a gradient whose largest absolute entry exceeds 1e-10. When some
    parameters are missing gradients, an index-to-component map is printed
    before the test fails.
    """
    print("Testing full GPT gradient flow...")

    # Small model configuration
    vocab_size = 20
    embed_dim = 16
    num_layers = 2
    num_heads = 2
    max_seq_len = 32

    model = GPT(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        num_layers=num_layers,
        num_heads=num_heads,
        max_seq_len=max_seq_len,
    )

    # Random inputs and targets
    batch_size = 2
    seq_len = 8
    tokens = Tensor(np.random.randint(0, vocab_size, (batch_size, seq_len)))
    targets = Tensor(np.random.randint(0, vocab_size, (batch_size, seq_len)))

    # Forward pass
    logits = model.forward(tokens)

    # Flatten batch and sequence dimensions before computing the loss
    flat_len = batch_size * seq_len
    logits_flat = logits.reshape(flat_len, vocab_size)
    targets_flat = targets.reshape(flat_len)
    loss_fn = CrossEntropyLoss()
    loss = loss_fn.forward(logits_flat, targets_flat)

    print(f" Loss: {loss.data:.3f}")

    # Backward pass
    loss.backward()

    # Indices of parameters with a missing or all-zero gradient
    params = model.parameters()
    params_without_grad = [
        i for i, param in enumerate(params)
        if not (param.grad is not None and np.abs(param.grad).max() > 1e-10)
    ]
    params_with_grad = len(params) - len(params_without_grad)

    print(f" Parameters with gradients: {params_with_grad}/{len(params)}")

    if params_without_grad:
        print(f" ⚠️ Parameters WITHOUT gradients: {params_without_grad}")

        # Index-to-component map to help locate the failing parameters.
        # NOTE(review): the layout below is hard-coded here, not read from the
        # model — confirm it matches GPT.parameters() ordering.
        print("\n Parameter breakdown:")
        cursor = 0
        for label in ("Token embedding weight", "Position embedding weight"):
            print(f" {cursor}: {label}")
            cursor += 1

        # (number of parameters, description) for each component of one block
        block_layout = (
            (8, "Attention (Q/K/V/out + biases)"),
            (2, "LayerNorm 1 (gamma, beta)"),
            (2, "LayerNorm 2 (gamma, beta)"),
            (4, "MLP (2 linears + biases)"),
        )
        for block_idx in range(num_layers):
            print(f" Block {block_idx}:")
            for width, label in block_layout:
                print(f" {cursor}-{cursor + width - 1}: {label}")
                cursor += width

        print(f" {cursor}-{cursor + 1}: Final LayerNorm (gamma, beta)")
        cursor += 2
        print(f" {cursor}: LM head weight")

        raise AssertionError(f"Expected all {len(params)} parameters to have gradients, but {len(params_without_grad)} don't")

    print(f"✅ All {len(params)} GPT parameters receive gradients")
|
||||
|
||||
|
||||
def test_attention_mask_gradient_flow():
    """Check that passing a causal mask does not cut off any attention gradients.

    The mask holds -1e9 strictly above the diagonal and 0 elsewhere. After a
    forward pass with the mask and a backward pass from a summed loss, every
    attention parameter must hold a gradient whose largest absolute entry
    exceeds 1e-10.
    """
    print("Testing attention with causal mask gradient flow...")

    batch_size, seq_len, embed_dim = 2, 4, 16
    num_heads = 4

    mha = MultiHeadAttention(embed_dim, num_heads)

    # Causal mask: large negative values strictly above the diagonal
    upper_triangle = np.triu(np.ones((seq_len, seq_len)), k=1)
    mask = Tensor(-1e9 * upper_triangle)

    # Forward pass on random activations
    x = Tensor(np.random.randn(batch_size, seq_len, embed_dim))
    output = mha.forward(x, mask)

    # Backward pass from a scalar loss
    scalar_loss = output.sum()
    scalar_loss.backward()

    # Count the parameters that received a non-zero gradient
    params = mha.parameters()
    params_with_grad = len([
        p for p in params
        if p.grad is not None and np.abs(p.grad).max() > 1e-10
    ])

    assert params_with_grad == len(params), \
        f"Masking should not break gradient flow. Expected {len(params)} params with grads, got {params_with_grad}"

    print("✅ Attention with masking preserves gradient flow")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run every transformer gradient-flow test in order.
    rule = "=" * 70

    print("\n" + rule)
    print("TRANSFORMER GRADIENT FLOW TEST SUITE")
    print(rule + "\n")

    for run_test in (
        test_multihead_attention_gradient_flow,
        test_layernorm_gradient_flow,
        test_mlp_gradient_flow,
        test_attention_mask_gradient_flow,
        test_full_gpt_gradient_flow,
    ):
        run_test()

    print("\n" + rule)
    print("✅ ALL TRANSFORMER GRADIENT FLOW TESTS PASSED")
    print(rule + "\n")
|
||||
|
||||
235
tests/TRANSFORMER_LEARNING_TEST_PLAN.md
Normal file
235
tests/TRANSFORMER_LEARNING_TEST_PLAN.md
Normal file
@@ -0,0 +1,235 @@
|
||||
# Transformer Learning Test Plan
|
||||
|
||||
## Overview
|
||||
This document outlines a systematic approach to testing and validating that TinyTorch transformers learn properly across all components and training scenarios.
|
||||
|
||||
## Test Status: ✅ PASSING
|
||||
|
||||
**Quick Validation Results** (2025-10-30):
|
||||
- Initial loss: 3.555
|
||||
- Final loss: 0.031
|
||||
- Loss decrease: 99.1%
|
||||
- Training time: 52.1s (500 steps)
|
||||
- Gradient flow: 21/21 parameters ✅
|
||||
|
||||
---
|
||||
|
||||
## Layer 1: Component-Level Tests
|
||||
|
||||
### 1.1 Autograd Operations
|
||||
**Purpose**: Verify all arithmetic operations preserve gradients
|
||||
|
||||
**Tests**:
|
||||
- ✅ `tests/05_autograd/test_gradient_flow.py`
|
||||
- Addition, subtraction, multiplication, division
|
||||
- Backward pass correctness
|
||||
- GELU activation gradient flow
|
||||
- LayerNorm operations (mean, sqrt, div)
|
||||
- Reshape gradient preservation
|
||||
|
||||
**Coverage**: 6/6 tests passing
|
||||
|
||||
### 1.2 Transformer Components
|
||||
**Purpose**: Verify gradient flow through transformer building blocks
|
||||
|
||||
**Tests**:
|
||||
- ✅ `tests/13_transformers/test_transformer_gradient_flow.py`
|
||||
- MultiHeadAttention (8 parameters)
|
||||
- LayerNorm (2 parameters)
|
||||
- MLP (4 parameters)
|
||||
- Masked attention
|
||||
- Full GPT end-to-end (37 parameters)
|
||||
|
||||
**Coverage**: 5/5 tests passing
|
||||
|
||||
---
|
||||
|
||||
## Layer 2: Training Validation Tests
|
||||
|
||||
### 2.1 Memorization Test
|
||||
**Purpose**: Can the model memorize a tiny dataset?
|
||||
|
||||
**Setup**:
|
||||
```python
|
||||
# 5 patterns, train for 500 steps
|
||||
patterns = [
    "def add(a, b):\n return a + b",
    "def sub(a, b):\n return a - b",
    "for i in range(10):\n print(i)",
    "if x > 0:\n print('positive')",
    "numbers = [1, 2, 3, 4, 5]",
]
|
||||
```
|
||||
|
||||
**Expected**: Loss should decrease > 80% in 500 steps
|
||||
**Result**: ✅ 99.1% decrease (3.555 → 0.031)
|
||||
|
||||
### 2.2 Pattern Learning Test
|
||||
**Purpose**: Can the model learn systematic patterns?
|
||||
|
||||
**Setup**:
|
||||
- Train on arithmetic functions with various names
|
||||
- Test if model can complete similar patterns
|
||||
|
||||
**Expected**: Model should predict correct structure even with new variable names
|
||||
|
||||
### 2.3 Generalization Test
|
||||
**Purpose**: Does the model generalize or just memorize?
|
||||
|
||||
**Setup**:
|
||||
- Train/test split (45/5 patterns)
|
||||
- Measure loss on held-out patterns
|
||||
|
||||
**Expected**: Test loss should be within 2x of train loss
|
||||
|
||||
---
|
||||
|
||||
## Layer 3: Regression Tests
|
||||
|
||||
### 3.1 Gradient Flow Regression
|
||||
**File**: `tests/13_transformers/test_transformer_gradient_flow.py`
|
||||
|
||||
**What it tests**:
|
||||
- All attention Q/K/V projections receive gradients
|
||||
- LayerNorm parameters (gamma, beta) receive gradients
|
||||
- MLP parameters receive gradients
|
||||
- Embedding layers receive gradients
|
||||
|
||||
**Why it matters**: Previous bugs broke gradient flow to attention parameters
|
||||
|
||||
### 3.2 Loss Decrease Regression
|
||||
**File**: `tests/13_transformers/test_training_simple.py`
|
||||
|
||||
**What it tests**:
|
||||
- Loss decreases on simple dataset
|
||||
- Loss decrease rate > threshold
|
||||
- Training completes without errors
|
||||
|
||||
**Why it matters**: Ensures the entire training loop works end-to-end
|
||||
|
||||
---
|
||||
|
||||
## Layer 4: Performance Benchmarks
|
||||
|
||||
### 4.1 Training Speed
|
||||
**Metric**: Steps per second
|
||||
**Baseline**: ~10 steps/sec for 1-layer, 32d model
|
||||
**Test**: Monitor for regressions
|
||||
|
||||
### 4.2 Memory Usage
|
||||
**Metric**: Peak memory during training
|
||||
**Baseline**: <500MB for small models
|
||||
**Test**: Detect memory leaks
|
||||
|
||||
### 4.3 Convergence Rate
|
||||
**Metric**: Steps to reach 0.1 loss
|
||||
**Baseline**: ~300 steps on 5-pattern dataset
|
||||
**Test**: Detect training instabilities
|
||||
|
||||
---
|
||||
|
||||
## Layer 5: Integration Tests
|
||||
|
||||
### 5.1 Full Pipeline Test
|
||||
**Components**: Tokenizer → Model → Loss → Optimizer → Backward → Update
|
||||
|
||||
**Test**:
|
||||
```bash
|
||||
python milestones/05_2017_transformer/vaswani_copilot.py --train-only
|
||||
```
|
||||
|
||||
**Expected**: Completes training in < 3 minutes with loss decrease > 80%
|
||||
|
||||
### 5.2 Checkpoint Save/Load
|
||||
**Test**: Save model mid-training, load, continue training
|
||||
|
||||
**Expected**: Loss continues decreasing from checkpoint
|
||||
|
||||
### 5.3 Generation Quality
|
||||
**Test**: Generate code completions after training
|
||||
|
||||
**Expected**: Completions should be syntactically valid Python
|
||||
|
||||
---
|
||||
|
||||
## Debugging Checklist
|
||||
|
||||
When a model isn't learning:
|
||||
|
||||
1. **Check Gradient Flow**
|
||||
```bash
|
||||
python tests/13_transformers/test_transformer_gradient_flow.py
|
||||
```
|
||||
- Verify all parameters receive non-zero gradients
|
||||
|
||||
2. **Check Loss Computation**
|
||||
- Print initial loss (should be ~ln(vocab_size))
|
||||
- Verify loss decreases over time
|
||||
- Check for NaN/Inf values
|
||||
|
||||
3. **Check Data Processing**
|
||||
- Verify tokenization produces correct IDs
|
||||
- Check padding/masking is correct
|
||||
- Ensure targets are shifted by 1
|
||||
|
||||
4. **Check Hyperparameters**
|
||||
- Learning rate not too high (>0.01) or too low (<0.0001)
|
||||
- Batch size appropriate
|
||||
- Gradient clipping prevents explosions
|
||||
|
||||
5. **Check Architecture**
|
||||
- Embedding dimension divisible by num_heads
|
||||
- Sequence length < max_seq_len
|
||||
- Vocabulary size matches tokenizer
|
||||
|
||||
---
|
||||
|
||||
## Test Execution
|
||||
|
||||
### Run All Tests
|
||||
```bash
|
||||
# Component tests
|
||||
pytest tests/05_autograd/test_gradient_flow.py -v
|
||||
pytest tests/13_transformers/test_transformer_gradient_flow.py -v
|
||||
|
||||
# Integration test
|
||||
python milestones/05_2017_transformer/vaswani_copilot.py --train-only
|
||||
|
||||
# Quick validation
|
||||
python tests/13_transformers/test_training_simple.py
|
||||
```
|
||||
|
||||
### Expected Output
|
||||
```
|
||||
tests/05_autograd/test_gradient_flow.py ...... [ 54%]
|
||||
tests/13_transformers/test_transformer_gradient_flow.py ..... [100%]
|
||||
|
||||
====== 11 passed in 3.2s ======
|
||||
|
||||
Transformer learning: ✅ VERIFIED
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Maintenance
|
||||
|
||||
### When to Run Tests
|
||||
1. **After any autograd changes**: Run gradient flow tests
|
||||
2. **After transformer architecture changes**: Run full pipeline test
|
||||
3. **Before releases**: Run all tests + visual inspection of generations
|
||||
|
||||
### Adding New Tests
|
||||
1. Follow existing test structure
|
||||
2. Include clear docstrings explaining what's tested
|
||||
3. Use meaningful assertions with error messages
|
||||
4. Add to this test plan document
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- Gradient Flow Tests: `tests/05_autograd/test_gradient_flow.py`
|
||||
- Transformer Tests: `tests/13_transformers/test_transformer_gradient_flow.py`
|
||||
- Training Validation: `tests/13_transformers/test_training_simple.py` (quick 500-step memorization test)
|
||||
- Integration: `milestones/05_2017_transformer/vaswani_copilot.py`
|
||||
|
||||
Reference in New Issue
Block a user