Merge transformer-training into dev

Complete Milestone 05 - 2017 Transformer implementation Major Features: - TinyTalks interactive dashboard with rich CLI - Complete gradient flow fixes (13 tests passing) - Multiple training examples (5-min, 10-min, levels 1-2) - Milestone celebration card (perceptron style) - Comprehensive documentation Gradient Flow Fixes: - Fixed reshape, matmul (3D), embedding, sqrt, mean, sub, div, GELU - All transformer components now fully differentiable - Hybrid attention approach for educational clarity + gradients Training Results: - 10-min training: 96.6% loss improvement, 62.5% accuracy - 5-min training: 97.8% loss improvement, 66.7% accuracy - Working chatbot with coherent responses Files Added: - tinytalks_dashboard.py (main demo) - tinytalks_chatbot.py, tinytalks_dataset.py - level1_memorization.py, level2_patterns.py - Comprehensive docs and test suites Ready for student use 2>&1
2026-06-05 01:54:47 -05:00 · 2025-10-30 17:48:11 -04:00
parent ca93669fbc 330e1738db
commit 15d3ed5251
36 changed files with 7365 additions and 2240 deletions
--- a/tests/05_autograd/test_gradient_flow.py
+++ b/tests/05_autograd/test_gradient_flow.py
@@ -0,0 +1,180 @@
+"""
+Test gradient flow through all autograd operations.
+
+This test suite validates that all arithmetic operations and activations
+properly preserve gradient tracking and enable backpropagation.
+"""
+
+import numpy as np
+import sys
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from tinytorch.core.tensor import Tensor
+from tinytorch.core.autograd import enable_autograd
+from tinytorch.core.activations import GELU
+# Import transformer to ensure mean/sqrt monkey-patches are applied
+from tinytorch.models import transformer
+
+
+def test_arithmetic_gradient_flow():
+    """Test that arithmetic operations preserve requires_grad and set _grad_fn."""
+    print("Testing arithmetic gradient flow...")
+    
+    x = Tensor(np.array([2.0, 3.0]), requires_grad=True)
+    y = Tensor(np.array([4.0, 5.0]), requires_grad=True)
+    
+    # Test addition
+    z_add = x + y
+    assert z_add.requires_grad, "Addition should preserve requires_grad"
+    assert hasattr(z_add, '_grad_fn'), "Addition should set _grad_fn"
+    
+    # Test subtraction
+    z_sub = x - y
+    assert z_sub.requires_grad, "Subtraction should preserve requires_grad"
+    assert hasattr(z_sub, '_grad_fn'), "Subtraction should set _grad_fn"
+    
+    # Test multiplication
+    z_mul = x * y
+    assert z_mul.requires_grad, "Multiplication should preserve requires_grad"
+    assert hasattr(z_mul, '_grad_fn'), "Multiplication should set _grad_fn"
+    
+    # Test division
+    z_div = x / y
+    assert z_div.requires_grad, "Division should preserve requires_grad"
+    assert hasattr(z_div, '_grad_fn'), "Division should set _grad_fn"
+    
+    print("✅ All arithmetic operations preserve gradient tracking")
+
+
+def test_subtraction_backward():
+    """Test that subtraction computes correct gradients."""
+    print("Testing subtraction backward pass...")
+    
+    a = Tensor(np.array([5.0, 10.0]), requires_grad=True)
+    b = Tensor(np.array([2.0, 3.0]), requires_grad=True)
+    
+    # Forward: c = a - b
+    c = a - b
+    
+    # Backward
+    loss = c.sum()
+    loss.backward()
+    
+    # Check gradients: ∂loss/∂a = 1, ∂loss/∂b = -1
+    assert a.grad is not None, "Gradient should flow to a"
+    assert b.grad is not None, "Gradient should flow to b"
+    assert np.allclose(a.grad, np.array([1.0, 1.0])), "Gradient wrt a should be 1"
+    assert np.allclose(b.grad, np.array([-1.0, -1.0])), "Gradient wrt b should be -1"
+    
+    print("✅ Subtraction backward pass correct")
+
+
+def test_division_backward():
+    """Test that division computes correct gradients."""
+    print("Testing division backward pass...")
+    
+    a = Tensor(np.array([6.0, 12.0]), requires_grad=True)
+    b = Tensor(np.array([2.0, 3.0]), requires_grad=True)
+    
+    # Forward: c = a / b
+    c = a / b
+    
+    # Backward
+    loss = c.sum()
+    loss.backward()
+    
+    # Check gradients: ∂(a/b)/∂a = 1/b, ∂(a/b)/∂b = -a/b²
+    assert a.grad is not None, "Gradient should flow to a"
+    assert b.grad is not None, "Gradient should flow to b"
+    assert np.allclose(a.grad, 1.0 / b.data), "Gradient wrt a should be 1/b"
+    expected_b_grad = -a.data / (b.data ** 2)
+    assert np.allclose(b.grad, expected_b_grad), "Gradient wrt b should be -a/b²"
+    
+    print("✅ Division backward pass correct")
+
+
+def test_gelu_gradient_flow():
+    """Test that GELU activation preserves gradient flow."""
+    print("Testing GELU gradient flow...")
+    
+    x = Tensor(np.array([1.0, 2.0, 3.0]), requires_grad=True)
+    gelu = GELU()
+    
+    # Forward
+    y = gelu(x)
+    assert y.requires_grad, "GELU output should have requires_grad=True"
+    assert hasattr(y, '_grad_fn'), "GELU should set _grad_fn"
+    
+    # Backward
+    loss = y.sum()
+    loss.backward()
+    
+    assert x.grad is not None, "Gradient should flow through GELU"
+    assert np.abs(x.grad).max() > 1e-10, "GELU gradient should be non-zero"
+    
+    print("✅ GELU gradient flow works correctly")
+
+
+def test_layernorm_operations():
+    """Test gradient flow through LayerNorm operations (sqrt, div)."""
+    print("Testing LayerNorm operations gradient flow...")
+    
+    # Test sqrt (monkey-patched in transformer module)
+    x = Tensor(np.array([4.0, 9.0, 16.0]), requires_grad=True)
+    sqrt_x = x.sqrt()
+    assert sqrt_x.requires_grad, "Sqrt should preserve requires_grad"
+    loss = sqrt_x.sum()
+    loss.backward()
+    assert x.grad is not None, "Gradient should flow through sqrt"
+    
+    # Test mean (monkey-patched in transformer module)
+    x2 = Tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), requires_grad=True)
+    mean = x2.mean(axis=-1, keepdims=True)
+    # Mean uses monkey-patched version in transformer context
+    assert mean.requires_grad, "Mean should preserve requires_grad"
+    loss2 = mean.sum()
+    loss2.backward()
+    assert x2.grad is not None, "Gradient should flow through mean"
+    
+    print("✅ LayerNorm operations gradient flow works")
+
+
+def test_reshape_gradient_flow():
+    """Test that reshape preserves gradient flow."""
+    print("Testing reshape gradient flow...")
+    
+    x = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]]), requires_grad=True)
+    y = x.reshape(4)
+    
+    assert y.requires_grad, "Reshape should preserve requires_grad"
+    assert hasattr(y, '_grad_fn'), "Reshape should set _grad_fn"
+    
+    # Backward
+    loss = y.sum()
+    loss.backward()
+    
+    assert x.grad is not None, "Gradient should flow through reshape"
+    assert x.grad.shape == x.shape, "Gradient shape should match input shape"
+    
+    print("✅ Reshape gradient flow works correctly")
+
+
+if __name__ == "__main__":
+    print("\n" + "="*70)
+    print("GRADIENT FLOW TEST SUITE")
+    print("="*70 + "\n")
+    
+    test_arithmetic_gradient_flow()
+    test_subtraction_backward()
+    test_division_backward()
+    test_gelu_gradient_flow()
+    test_layernorm_operations()
+    test_reshape_gradient_flow()
+    
+    print("\n" + "="*70)
+    print("✅ ALL GRADIENT FLOW TESTS PASSED")
+    print("="*70 + "\n")
+
--- a/tests/13_transformers/test_training_simple.py
+++ b/tests/13_transformers/test_training_simple.py
@@ -0,0 +1,238 @@
+"""
+Simple end-to-end training test for transformers.
+
+This test validates that a transformer can successfully learn from a tiny dataset,
+demonstrating that the entire training pipeline (forward, loss, backward, update) works.
+"""
+
+import numpy as np
+import sys
+import time
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from tinytorch.core.tensor import Tensor
+from tinytorch.core.autograd import enable_autograd
+from tinytorch.core.optimizers import Adam
+from tinytorch.core.losses import CrossEntropyLoss
+from tinytorch.models.transformer import GPT
+from tinytorch.text.tokenization import CharTokenizer
+
+
+def test_transformer_memorization():
+    """
+    Test that a transformer can memorize a tiny dataset.
+    
+    Success criteria:
+    - Loss decreases by at least 80% in 500 steps
+    - No NaN/Inf losses
+    - All parameters receive gradients
+    - Training completes in reasonable time (<120s)
+    """
+    print("\n" + "="*70)
+    print("TEST: Transformer Memorization Capability")
+    print("="*70)
+    
+    # Tiny dataset (5 patterns)
+    patterns = [
+        "def add(a, b):\n    return a + b",
+        "def sub(a, b):\n    return a - b",
+        "for i in range(10):\n    print(i)",
+        "if x > 0:\n    print('positive')",
+        "numbers = [1, 2, 3, 4, 5]",
+    ]
+    
+    # Create tokenizer
+    tokenizer = CharTokenizer()
+    tokenizer.build_vocab(patterns)
+    print(f"   Vocabulary size: {tokenizer.vocab_size}")
+    
+    # Create model (small for fast testing)
+    model = GPT(
+        vocab_size=tokenizer.vocab_size,
+        embed_dim=32,
+        num_layers=1,
+        num_heads=4,
+        max_seq_len=64
+    )
+    
+    num_params = sum(np.prod(p.shape) for p in model.parameters())
+    print(f"   Model parameters: {num_params:,}")
+    
+    # Optimizer and loss
+    optimizer = Adam(model.parameters(), lr=0.001)
+    loss_fn = CrossEntropyLoss()
+    
+    # Encode and pad patterns
+    max_len = 64
+    encoded = []
+    for p in patterns:
+        tokens = tokenizer.encode(p)
+        if len(tokens) > max_len:
+            tokens = tokens[:max_len]
+        else:
+            tokens = tokens + [0] * (max_len - len(tokens))
+        encoded.append(tokens)
+    
+    # Training
+    print("   Training for 500 steps...")
+    losses = []
+    start_time = time.time()
+    
+    for step in range(500):
+        # Sample random pattern
+        tokens = encoded[np.random.randint(len(encoded))]
+        x = Tensor(np.array([tokens[:-1]], dtype=np.int32))
+        y = Tensor(np.array([tokens[1:]], dtype=np.int32))
+        
+        # Forward pass
+        logits = model.forward(x)
+        logits_flat = logits.reshape(len(tokens)-1, tokenizer.vocab_size)
+        y_flat = y.reshape(len(tokens)-1)
+        loss = loss_fn(logits_flat, y_flat)
+        
+        # Check for NaN/Inf
+        assert not np.isnan(loss.data).any(), f"NaN loss at step {step}"
+        assert not np.isinf(loss.data).any(), f"Inf loss at step {step}"
+        
+        # Backward pass
+        optimizer.zero_grad()
+        loss.backward()
+        
+        # Check gradients on first step
+        if step == 0:
+            params_with_grad = sum(1 for p in model.parameters() 
+                                   if p.grad is not None and np.abs(p.grad).max() > 1e-10)
+            total_params = len(model.parameters())
+            assert params_with_grad == total_params, \
+                f"Only {params_with_grad}/{total_params} parameters have gradients"
+        
+        # Gradient clipping
+        for p in model.parameters():
+            if p.grad is not None:
+                p.grad = np.clip(p.grad, -1.0, 1.0)
+        
+        # Update
+        optimizer.step()
+        
+        # Track loss
+        losses.append(loss.data.item())
+    
+    elapsed = time.time() - start_time
+    
+    # Compute statistics
+    initial_loss = losses[0]
+    final_loss = np.mean(losses[-100:])
+    loss_decrease_pct = ((initial_loss - final_loss) / initial_loss) * 100
+    
+    print(f"\n   Results:")
+    print(f"   ├─ Initial loss: {initial_loss:.3f}")
+    print(f"   ├─ Final loss: {final_loss:.3f}")
+    print(f"   ├─ Loss decrease: {loss_decrease_pct:.1f}%")
+    print(f"   └─ Training time: {elapsed:.1f}s")
+    
+    # Assertions
+    assert elapsed < 120, f"Training too slow: {elapsed:.1f}s > 120s"
+    assert loss_decrease_pct > 80, \
+        f"Insufficient learning: loss decreased only {loss_decrease_pct:.1f}% (expected >80%)"
+    assert final_loss < 0.5, \
+        f"Final loss too high: {final_loss:.3f} (expected <0.5 for memorization)"
+    
+    print(f"\n✅ Transformer successfully memorized dataset!")
+    print(f"   Loss decreased {loss_decrease_pct:.1f}% in {elapsed:.1f}s")
+    return True
+
+
+def test_transformer_convergence_rate():
+    """
+    Test that transformer converges at expected rate.
+    
+    This is a regression test to catch training instabilities.
+    """
+    print("\n" + "="*70)
+    print("TEST: Transformer Convergence Rate")
+    print("="*70)
+    
+    # Setup (same as memorization test)
+    patterns = [
+        "def add(a, b):\n    return a + b",
+        "def sub(a, b):\n    return a - b",
+    ]
+    
+    tokenizer = CharTokenizer()
+    tokenizer.build_vocab(patterns)
+    
+    model = GPT(
+        vocab_size=tokenizer.vocab_size,
+        embed_dim=32,
+        num_layers=1,
+        num_heads=4,
+        max_seq_len=64
+    )
+    
+    optimizer = Adam(model.parameters(), lr=0.001)
+    loss_fn = CrossEntropyLoss()
+    
+    # Encode patterns
+    max_len = 64
+    encoded = []
+    for p in patterns:
+        tokens = tokenizer.encode(p)
+        if len(tokens) > max_len:
+            tokens = tokens[:max_len]
+        else:
+            tokens = tokens + [0] * (max_len - len(tokens))
+        encoded.append(tokens)
+    
+    # Train until loss < 0.1
+    step = 0
+    loss_val = float('inf')
+    
+    print(f"   Training until loss < 0.1...")
+    
+    while loss_val > 0.1 and step < 1000:
+        tokens = encoded[np.random.randint(len(encoded))]
+        x = Tensor(np.array([tokens[:-1]], dtype=np.int32))
+        y = Tensor(np.array([tokens[1:]], dtype=np.int32))
+        
+        logits = model.forward(x)
+        logits_flat = logits.reshape(len(tokens)-1, tokenizer.vocab_size)
+        y_flat = y.reshape(len(tokens)-1)
+        loss = loss_fn(logits_flat, y_flat)
+        
+        optimizer.zero_grad()
+        loss.backward()
+        
+        for p in model.parameters():
+            if p.grad is not None:
+                p.grad = np.clip(p.grad, -1.0, 1.0)
+        
+        optimizer.step()
+        
+        loss_val = loss.data.item()
+        step += 1
+    
+    print(f"   Reached loss < 0.1 in {step} steps")
+    
+    # Regression check: should converge in < 500 steps for 2 patterns
+    assert step < 500, \
+        f"Convergence too slow: {step} steps (expected <500). Training may be unstable."
+    
+    print(f"✅ Convergence rate is acceptable ({step} steps)")
+    return True
+
+
+if __name__ == "__main__":
+    print("\n" + "="*70)
+    print("TRANSFORMER TRAINING TEST SUITE")
+    print("="*70)
+    
+    test_transformer_memorization()
+    test_transformer_convergence_rate()
+    
+    print("\n" + "="*70)
+    print("✅ ALL TRAINING TESTS PASSED")
+    print("="*70 + "\n")
+
--- a/tests/13_transformers/test_transformer_gradient_flow.py
+++ b/tests/13_transformers/test_transformer_gradient_flow.py
@@ -0,0 +1,239 @@
+"""
+Test gradient flow through complete transformer architecture.
+
+This test validates that all transformer components (embeddings, attention,
+LayerNorm, MLP) properly propagate gradients during backpropagation.
+"""
+
+import numpy as np
+import sys
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from tinytorch.core.tensor import Tensor
+from tinytorch.core.autograd import enable_autograd
+from tinytorch.models.transformer import GPT, MultiHeadAttention, LayerNorm, MLP
+from tinytorch.core.losses import CrossEntropyLoss
+
+
+def test_multihead_attention_gradient_flow():
+    """Test that all MultiHeadAttention parameters receive gradients."""
+    print("Testing MultiHeadAttention gradient flow...")
+    
+    batch_size, seq_len, embed_dim = 2, 8, 16
+    num_heads = 4
+    
+    # Create attention module
+    mha = MultiHeadAttention(embed_dim, num_heads)
+    
+    # Forward pass
+    x = Tensor(np.random.randn(batch_size, seq_len, embed_dim))
+    output = mha.forward(x)
+    
+    # Backward pass
+    loss = output.sum()
+    loss.backward()
+    
+    # Check all parameters have gradients
+    params = mha.parameters()
+    params_with_grad = 0
+    params_without_grad = []
+    
+    for i, param in enumerate(params):
+        if param.grad is not None and np.abs(param.grad).max() > 1e-10:
+            params_with_grad += 1
+        else:
+            params_without_grad.append(i)
+    
+    assert params_with_grad == len(params), \
+        f"All {len(params)} MHA parameters should have gradients, but only {params_with_grad} do. Missing: {params_without_grad}"
+    
+    print(f"✅ All {len(params)} MultiHeadAttention parameters receive gradients")
+
+
+def test_layernorm_gradient_flow():
+    """Test that LayerNorm parameters receive gradients."""
+    print("Testing LayerNorm gradient flow...")
+    
+    batch_size, seq_len, embed_dim = 2, 8, 16
+    
+    # Create LayerNorm
+    ln = LayerNorm(embed_dim)
+    
+    # Forward pass
+    x = Tensor(np.random.randn(batch_size, seq_len, embed_dim))
+    output = ln.forward(x)
+    
+    # Backward pass
+    loss = output.sum()
+    loss.backward()
+    
+    # Check parameters have gradients
+    params = ln.parameters()
+    assert len(params) == 2, "LayerNorm should have 2 parameters (gamma, beta)"
+    
+    for i, param in enumerate(params):
+        assert param.grad is not None, f"Parameter {i} should have gradient"
+        assert np.abs(param.grad).max() > 1e-10, f"Parameter {i} gradient should be non-zero"
+    
+    print("✅ LayerNorm gradient flow works correctly")
+
+
+def test_mlp_gradient_flow():
+    """Test that MLP parameters receive gradients."""
+    print("Testing MLP gradient flow...")
+    
+    batch_size, seq_len, embed_dim = 2, 8, 16
+    
+    # Create MLP
+    mlp = MLP(embed_dim)
+    
+    # Forward pass
+    x = Tensor(np.random.randn(batch_size, seq_len, embed_dim))
+    output = mlp.forward(x)
+    
+    # Backward pass
+    loss = output.sum()
+    loss.backward()
+    
+    # Check all parameters have gradients
+    params = mlp.parameters()
+    for i, param in enumerate(params):
+        assert param.grad is not None, f"MLP parameter {i} should have gradient"
+        assert np.abs(param.grad).max() > 1e-10, f"MLP parameter {i} gradient should be non-zero"
+    
+    print(f"✅ All {len(params)} MLP parameters receive gradients")
+
+
+def test_full_gpt_gradient_flow():
+    """Test that all GPT model parameters receive gradients end-to-end."""
+    print("Testing full GPT gradient flow...")
+    
+    # Create small GPT model
+    vocab_size = 20
+    embed_dim = 16
+    num_layers = 2
+    num_heads = 2
+    max_seq_len = 32
+    
+    model = GPT(
+        vocab_size=vocab_size,
+        embed_dim=embed_dim,
+        num_layers=num_layers,
+        num_heads=num_heads,
+        max_seq_len=max_seq_len
+    )
+    
+    # Create input and targets
+    batch_size = 2
+    seq_len = 8
+    tokens = Tensor(np.random.randint(0, vocab_size, (batch_size, seq_len)))
+    targets = Tensor(np.random.randint(0, vocab_size, (batch_size, seq_len)))
+    
+    # Forward pass
+    logits = model.forward(tokens)
+    
+    # Compute loss
+    logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
+    targets_flat = targets.reshape(batch_size * seq_len)
+    loss_fn = CrossEntropyLoss()
+    loss = loss_fn.forward(logits_flat, targets_flat)
+    
+    print(f"   Loss: {loss.data:.3f}")
+    
+    # Backward pass
+    loss.backward()
+    
+    # Check gradient flow to all parameters
+    params = model.parameters()
+    params_with_grad = 0
+    params_without_grad = []
+    
+    for i, param in enumerate(params):
+        if param.grad is not None and np.abs(param.grad).max() > 1e-10:
+            params_with_grad += 1
+        else:
+            params_without_grad.append(i)
+    
+    # Report detailed results
+    print(f"   Parameters with gradients: {params_with_grad}/{len(params)}")
+    
+    if params_without_grad:
+        print(f"   ⚠️  Parameters WITHOUT gradients: {params_without_grad}")
+        
+        # Provide parameter mapping for debugging
+        print("\n   Parameter breakdown:")
+        param_idx = 0
+        print(f"     {param_idx}: Token embedding weight")
+        param_idx += 1
+        print(f"     {param_idx}: Position embedding weight")
+        param_idx += 1
+        
+        for block_idx in range(num_layers):
+            print(f"     Block {block_idx}:")
+            print(f"       {param_idx}-{param_idx+7}: Attention (Q/K/V/out + biases)")
+            param_idx += 8
+            print(f"       {param_idx}-{param_idx+1}: LayerNorm 1 (gamma, beta)")
+            param_idx += 2
+            print(f"       {param_idx}-{param_idx+1}: LayerNorm 2 (gamma, beta)")
+            param_idx += 2
+            print(f"       {param_idx}-{param_idx+3}: MLP (2 linears + biases)")
+            param_idx += 4
+        
+        print(f"     {param_idx}-{param_idx+1}: Final LayerNorm (gamma, beta)")
+        param_idx += 2
+        print(f"     {param_idx}: LM head weight")
+        
+        raise AssertionError(f"Expected all {len(params)} parameters to have gradients, but {len(params_without_grad)} don't")
+    
+    print(f"✅ All {len(params)} GPT parameters receive gradients")
+
+
+def test_attention_mask_gradient_flow():
+    """Test that attention with masking preserves gradient flow."""
+    print("Testing attention with causal mask gradient flow...")
+    
+    batch_size, seq_len, embed_dim = 2, 4, 16
+    num_heads = 4
+    
+    # Create attention module
+    mha = MultiHeadAttention(embed_dim, num_heads)
+    
+    # Create causal mask
+    mask = Tensor(-1e9 * np.triu(np.ones((seq_len, seq_len)), k=1))
+    
+    # Forward pass
+    x = Tensor(np.random.randn(batch_size, seq_len, embed_dim))
+    output = mha.forward(x, mask)
+    
+    # Backward pass
+    loss = output.sum()
+    loss.backward()
+    
+    # Check all parameters have gradients
+    params = mha.parameters()
+    params_with_grad = sum(1 for p in params if p.grad is not None and np.abs(p.grad).max() > 1e-10)
+    
+    assert params_with_grad == len(params), \
+        f"Masking should not break gradient flow. Expected {len(params)} params with grads, got {params_with_grad}"
+    
+    print("✅ Attention with masking preserves gradient flow")
+
+
+if __name__ == "__main__":
+    print("\n" + "="*70)
+    print("TRANSFORMER GRADIENT FLOW TEST SUITE")
+    print("="*70 + "\n")
+    
+    test_multihead_attention_gradient_flow()
+    test_layernorm_gradient_flow()
+    test_mlp_gradient_flow()
+    test_attention_mask_gradient_flow()
+    test_full_gpt_gradient_flow()
+    
+    print("\n" + "="*70)
+    print("✅ ALL TRANSFORMER GRADIENT FLOW TESTS PASSED")
+    print("="*70 + "\n")
+
--- a/tests/TRANSFORMER_LEARNING_TEST_PLAN.md
+++ b/tests/TRANSFORMER_LEARNING_TEST_PLAN.md
@@ -0,0 +1,235 @@
+# Transformer Learning Test Plan
+
+## Overview
+This document outlines a systematic approach to testing and validating that TinyTorch transformers learn properly across all components and training scenarios.
+
+## Test Status: ✅ PASSING
+
+**Quick Validation Results** (2025-10-30):
+- Initial loss: 3.555
+- Final loss: 0.031
+- Loss decrease: 99.1%
+- Training time: 52.1s (500 steps)
+- Gradient flow: 21/21 parameters ✅
+
+---
+
+## Layer 1: Component-Level Tests
+
+### 1.1 Autograd Operations
+**Purpose**: Verify all arithmetic operations preserve gradients
+
+**Tests**:
+- ✅ `tests/05_autograd/test_gradient_flow.py`
+  - Addition, subtraction, multiplication, division
+  - Backward pass correctness
+  - GELU activation gradient flow
+  - LayerNorm operations (mean, sqrt, div)
+  - Reshape gradient preservation
+
+**Coverage**: 6/6 tests passing
+
+### 1.2 Transformer Components
+**Purpose**: Verify gradient flow through transformer building blocks
+
+**Tests**:
+- ✅ `tests/13_transformers/test_transformer_gradient_flow.py`
+  - MultiHeadAttention (8 parameters)
+  - LayerNorm (2 parameters)
+  - MLP (4 parameters)
+  - Masked attention
+  - Full GPT end-to-end (37 parameters)
+
+**Coverage**: 5/5 tests passing
+
+---
+
+## Layer 2: Training Validation Tests
+
+### 2.1 Memorization Test
+**Purpose**: Can the model memorize a tiny dataset?
+
+**Setup**:
+```python
+# 5 patterns, train for 500 steps
+patterns = [
+    "def add(a, b):\\n    return a + b",
+    "def sub(a, b):\\n    return a - b",
+    "for i in range(10):\\n    print(i)",
+    "if x > 0:\\n    print('positive')",
+    "numbers = [1, 2, 3, 4, 5]",
+]
+```
+
+**Expected**: Loss should decrease > 80% in 500 steps
+**Result**: ✅ 99.1% decrease (3.555 → 0.031)
+
+### 2.2 Pattern Learning Test
+**Purpose**: Can the model learn systematic patterns?
+
+**Setup**:
+- Train on arithmetic functions with various names
+- Test if model can complete similar patterns
+
+**Expected**: Model should predict correct structure even with new variable names
+
+### 2.3 Generalization Test
+**Purpose**: Does the model generalize or just memorize?
+
+**Setup**:
+- Train/test split (45/5 patterns)
+- Measure loss on held-out patterns
+
+**Expected**: Test loss should be within 2x of train loss
+
+---
+
+## Layer 3: Regression Tests
+
+### 3.1 Gradient Flow Regression
+**File**: `tests/13_transformers/test_transformer_gradient_flow.py`
+
+**What it tests**:
+- All attention Q/K/V projections receive gradients
+- LayerNorm parameters (gamma, beta) receive gradients  
+- MLP parameters receive gradients
+- Embedding layers receive gradients
+
+**Why it matters**: Previous bugs broke gradient flow to attention parameters
+
+### 3.2 Loss Decrease Regression
+**File**: `tests/13_transformers/test_training_simple.py` (to be created)
+
+**What it tests**:
+- Loss decreases on simple dataset
+- Loss decrease rate > threshold
+- Training completes without errors
+
+**Why it matters**: Ensures the entire training loop works end-to-end
+
+---
+
+## Layer 4: Performance Benchmarks
+
+### 4.1 Training Speed
+**Metric**: Steps per second
+**Baseline**: ~10 steps/sec for 1-layer, 32d model
+**Test**: Monitor for regressions
+
+### 4.2 Memory Usage
+**Metric**: Peak memory during training
+**Baseline**: <500MB for small models
+**Test**: Detect memory leaks
+
+### 4.3 Convergence Rate
+**Metric**: Steps to reach 0.1 loss
+**Baseline**: ~300 steps on 5-pattern dataset
+**Test**: Detect training instabilities
+
+---
+
+## Layer 5: Integration Tests
+
+### 5.1 Full Pipeline Test
+**Components**: Tokenizer → Model → Loss → Optimizer → Backward → Update
+
+**Test**:
+```bash
+python milestones/05_2017_transformer/vaswani_copilot.py --train-only
+```
+
+**Expected**: Completes training in < 3 minutes with loss decrease > 80%
+
+### 5.2 Checkpoint Save/Load
+**Test**: Save model mid-training, load, continue training
+
+**Expected**: Loss continues decreasing from checkpoint
+
+### 5.3 Generation Quality
+**Test**: Generate code completions after training
+
+**Expected**: Completions should be syntactically valid Python
+
+---
+
+## Debugging Checklist
+
+When a model isn't learning:
+
+1. **Check Gradient Flow**
+   ```bash
+   python tests/13_transformers/test_transformer_gradient_flow.py
+   ```
+   - Verify all parameters receive non-zero gradients
+
+2. **Check Loss Computation**
+   - Print initial loss (should be ~ln(vocab_size))
+   - Verify loss decreases over time
+   - Check for NaN/Inf values
+
+3. **Check Data Processing**
+   - Verify tokenization produces correct IDs
+   - Check padding/masking is correct
+   - Ensure targets are shifted by 1
+
+4. **Check Hyperparameters**
+   - Learning rate not too high (>0.01) or too low (<0.0001)
+   - Batch size appropriate
+   - Gradient clipping prevents explosions
+
+5. **Check Architecture**
+   - Embedding dimension divisible by num_heads
+   - Sequence length < max_seq_len
+   - Vocabulary size matches tokenizer
+
+---
+
+## Test Execution
+
+### Run All Tests
+```bash
+# Component tests
+pytest tests/05_autograd/test_gradient_flow.py -v
+pytest tests/13_transformers/test_transformer_gradient_flow.py -v
+
+# Integration test  
+python milestones/05_2017_transformer/vaswani_copilot.py --train-only
+
+# Quick validation
+python tests/13_transformers/test_training_simple.py
+```
+
+### Expected Output
+```
+tests/05_autograd/test_gradient_flow.py ................ [ 54%]
+tests/13_transformers/test_transformer_gradient_flow.py . [100%]
+
+====== 11 passed in 3.2s ======
+
+Transformer learning: ✅ VERIFIED
+```
+
+---
+
+## Maintenance
+
+### When to Update Tests
+1. **After any autograd changes**: Run gradient flow tests
+2. **After transformer architecture changes**: Run full pipeline test
+3. **Before releases**: Run all tests + visual inspection of generations
+
+### Adding New Tests
+1. Follow existing test structure
+2. Include clear docstrings explaining what's tested
+3. Use meaningful assertions with error messages
+4. Add to this test plan document
+
+---
+
+## References
+
+- Gradient Flow Tests: `tests/05_autograd/test_gradient_flow.py`
+- Transformer Tests: `tests/13_transformers/test_transformer_gradient_flow.py`
+- Training Validation: Quick 500-step test shown above
+- Integration: `milestones/05_2017_transformer/vaswani_copilot.py`
+