Remove non-Vaswani transformer examples

Keep only the three Vaswani examples that reference the 2017 "Attention Is All You Need" paper:
- vaswani_chatgpt.py (Q&A generation)
- vaswani_copilot.py (Python autocomplete)
- vaswani_shakespeare.py (text generation)

Removed 14 redundant example files.
2026-06-03 13:59:36 -05:00 · 2025-11-05 09:15:17 -05:00
parent a49d4c3810
commit aa36fef9df
14 changed files with 0 additions and 5251 deletions
--- a/milestones/05_2017_transformer/download_tinystories.py
+++ b/milestones/05_2017_transformer/download_tinystories.py
@@ -1,75 +0,0 @@
-#!/usr/bin/env python3
-"""
-Download and prepare TinyStories dataset for TinyTorch training.
-
-TinyStories is a dataset of simple, synthetic stories designed for
-training small language models. It's much easier than Shakespeare!
-"""
-
-import os
-import urllib.request
-
-def download_tinystories():
-    """Download TinyStories dataset."""
-    
-    # Create data directory
-    data_dir = os.path.join(os.path.dirname(__file__), '../datasets/tinystories')
-    os.makedirs(data_dir, exist_ok=True)
-    
-    # TinyStories validation set (smaller, good for testing)
-    urls = {
-        'tiny_val': 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-valid.txt',
-        'tiny_train_small': 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories-train.txt'
-    }
-    
-    print("📥 Downloading TinyStories dataset...")
-    print("="*70)
-    
-    # Start with validation set (much smaller for testing)
-    filename = 'tinystories_val.txt'
-    filepath = os.path.join(data_dir, filename)
-    
-    if os.path.exists(filepath):
-        print(f"✅ {filename} already exists")
-        size = os.path.getsize(filepath) / (1024 * 1024)
-        print(f"   Size: {size:.2f} MB")
-    else:
-        print(f"⬇️  Downloading {filename}...")
-        try:
-            urllib.request.urlretrieve(urls['tiny_val'], filepath)
-            size = os.path.getsize(filepath) / (1024 * 1024)
-            print(f"✅ Downloaded! Size: {size:.2f} MB")
-        except Exception as e:
-            print(f"❌ Error downloading: {e}")
-            print("\n💡 Alternative: Download manually from:")
-            print(f"   {urls['tiny_val']}")
-            print(f"   Save to: {filepath}")
-            return None
-    
-    # Read and show sample
-    with open(filepath, 'r', encoding='utf-8') as f:
-        text = f.read()
-    
-    print(f"\n📊 Dataset Stats:")
-    print(f"   Total characters: {len(text):,}")
-    print(f"   Total words: {len(text.split()):,}")
-    print(f"   Unique characters: {len(set(text))}")
-    
-    # Show first story
-    stories = text.split('<|endoftext|>')
-    if len(stories) > 0:
-        first_story = stories[0].strip()
-        print(f"\n📖 Sample Story:")
-        print("   " + "-"*66)
-        print("   " + first_story[:300].replace('\n', '\n   '))
-        if len(first_story) > 300:
-            print("   ...")
-        print("   " + "-"*66)
-    
-    print(f"\n✅ TinyStories ready for training!")
-    print(f"   Location: {filepath}")
-    
-    return filepath
-
-if __name__ == '__main__':
-    download_tinystories()
--- a/milestones/05_2017_transformer/level1_memorization.py
+++ b/milestones/05_2017_transformer/level1_memorization.py
@@ -1,338 +0,0 @@
-"""
-Milestone 05 - Level 1: Transformer Memorization Test
-======================================================
-
-SIMPLEST POSSIBLE TRANSFORMER TEST:
-Can the transformer memorize and reproduce simple sequences?
-
-Task: Given "ABCD", predict "BCDE"
-      Given "1234", predict "2345"
-
-Expected: 
- Train in < 2 minutes
- Loss should drop from ~3.0 to < 0.1
- Should perfectly predict next character
-
-This validates:
-✓ Transformer architecture works
-✓ Attention mechanism works
-✓ Gradient flow works
-✓ Training loop works
-"""
-
-import sys
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-
-import numpy as np
-import time
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.autograd import enable_autograd
-from tinytorch.core.optimizers import Adam
-from tinytorch.core.losses import CrossEntropyLoss
-from tinytorch.models.transformer import GPT
-
-enable_autograd()
-
-# ============================================================================
-# Level 1: Simple Memorization Dataset
-# ============================================================================
-
-def create_memorization_dataset():
-    """
-    Create ultra-simple sequences to memorize:
-    - Alphabet sequences: ABCD, EFGH, etc.
-    - Number sequences: 1234, 5678, etc.
-    - Pattern sequences: AAAA, BBBB, etc.
-    """
-    sequences = [
-        # Alphabet
-        "ABCDE",
-        "FGHIJ",
-        "KLMNO",
-        "PQRST",
-        "UVWXY",
-        # Numbers
-        "12345",
-        "67890",
-        # Patterns
-        "AAAAA",
-        "BBBBB",
-        "CCCCC",
-        # Mixed
-        "A1B2C",
-        "X9Y8Z",
-    ]
-    return sequences
-
-
-def create_simple_tokenizer(sequences):
-    """Create character-level tokenizer for sequences."""
-    # Get all unique characters
-    all_chars = sorted(set(''.join(sequences)))
-    
-    # Create mappings (0 is reserved for padding)
-    char_to_idx = {char: idx + 1 for idx, char in enumerate(all_chars)}
-    idx_to_char = {idx + 1: char for idx, char in enumerate(all_chars)}
-    char_to_idx['<PAD>'] = 0
-    idx_to_char[0] = '<PAD>'
-    
-    return char_to_idx, idx_to_char
-
-
-def encode_sequence(seq, char_to_idx, max_len=8):
-    """Encode sequence to token IDs."""
-    tokens = [char_to_idx.get(c, 0) for c in seq]
-    # Pad to max_len
-    if len(tokens) < max_len:
-        tokens = tokens + [0] * (max_len - len(tokens))
-    else:
-        tokens = tokens[:max_len]
-    return tokens
-
-
-def decode_sequence(tokens, idx_to_char):
-    """Decode token IDs to string."""
-    chars = [idx_to_char.get(t, '') for t in tokens if t != 0]
-    return ''.join(chars)
-
-
-# ============================================================================
-# Training
-# ============================================================================
-
-def train_memorization(model, optimizer, loss_fn, train_data, vocab_size, max_steps=200):
-    """
-    Train transformer to memorize sequences.
-    Target: < 2 minutes, loss < 0.1
-    """
-    print("=" * 70)
-    print("TRAINING LEVEL 1: MEMORIZATION")
-    print("=" * 70)
-    print(f"Dataset: {len(train_data)} sequences")
-    print(f"Vocab size: {vocab_size}")
-    print(f"Max steps: {max_steps}")
-    print(f"Target: Loss < 0.1 in < 2 minutes")
-    print()
-    
-    start_time = time.time()
-    losses = []
-    
-    for step in range(max_steps):
-        # Sample random sequence
-        tokens = train_data[np.random.randint(len(train_data))]
-        
-        # Input: all but last token
-        # Target: all but first token (next token prediction)
-        input_seq = tokens[:-1]
-        target_seq = tokens[1:]
-        
-        # Convert to tensors
-        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
-        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)
-        
-        # Forward pass
-        logits = model.forward(x)
-        
-        # Compute loss
-        batch_size, seq_len, vocab_size_out = logits.shape
-        logits_flat = logits.reshape(batch_size * seq_len, vocab_size_out)
-        targets_flat = y_true.reshape(batch_size * seq_len)
-        loss = loss_fn.forward(logits_flat, targets_flat)
-        
-        # Backward pass
-        optimizer.zero_grad()
-        loss.backward()
-        
-        # Clip gradients
-        for param in model.parameters():
-            if param.grad is not None:
-                np.clip(param.grad, -1.0, 1.0, out=param.grad)
-        
-        # Update
-        optimizer.step()
-        
-        losses.append(loss.data.item())
-        
-        # Progress every 50 steps
-        if step % 50 == 0:
-            avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-            elapsed = time.time() - start_time
-            print(f"Step {step:4d}/{max_steps} | Loss: {avg_loss:.4f} | Time: {elapsed:.1f}s")
-            
-            # Early stopping
-            if avg_loss < 0.2:
-                print(f"\n✓ Target reached! Loss < 0.2 at step {step}")
-                break
-    
-    elapsed = time.time() - start_time
-    final_loss = np.mean(losses[-100:])
-    initial_loss = np.mean(losses[:10])
-    improvement = (1 - final_loss / initial_loss) * 100
-    
-    print()
-    print("=" * 70)
-    print("TRAINING COMPLETE")
-    print("=" * 70)
-    print(f"Time: {elapsed:.1f} seconds")
-    print(f"Initial loss: {initial_loss:.4f}")
-    print(f"Final loss: {final_loss:.4f}")
-    print(f"Improvement: {improvement:.1f}%")
-    print()
-    
-    return losses
-
-
-# ============================================================================
-# Testing
-# ============================================================================
-
-def test_memorization(model, test_sequences, char_to_idx, idx_to_char):
-    """
-    Test if model can reproduce memorized sequences.
-    """
-    print("=" * 70)
-    print("TESTING LEVEL 1: MEMORIZATION")
-    print("=" * 70)
-    print()
-    
-    correct = 0
-    total = len(test_sequences)
-    
-    for seq in test_sequences:
-        # Encode
-        tokens = encode_sequence(seq, char_to_idx, max_len=8)
-        
-        # Get model predictions
-        x = Tensor(np.array([tokens[:-1]], dtype=np.int32), requires_grad=False)
-        logits = model.forward(x)
-        
-        # Decode predictions (greedy)
-        predicted_tokens = []
-        for i in range(logits.shape[1]):
-            next_token = int(np.argmax(logits.data[0, i, :]))
-            predicted_tokens.append(next_token)
-        
-        # Compare
-        expected = tokens[1:]  # Target sequence
-        predicted = predicted_tokens
-        
-        # Check if match (ignoring padding)
-        match = True
-        for exp, pred in zip(expected, predicted):
-            if exp == 0:  # Padding, stop checking
-                break
-            if exp != pred:
-                match = False
-                break
-        
-        if match:
-            correct += 1
-            status = "✓"
-        else:
-            status = "✗"
-        
-        # Decode for display
-        expected_str = decode_sequence(expected, idx_to_char)
-        predicted_str = decode_sequence(predicted, idx_to_char)
-        
-        print(f"{status} Input: {seq[:4]:8s} → Expected: {expected_str:8s} | Got: {predicted_str:8s}")
-    
-    accuracy = (correct / total) * 100
-    print()
-    print(f"Accuracy: {correct}/{total} ({accuracy:.1f}%)")
-    print()
-    
-    if accuracy >= 90:
-        print("✓ LEVEL 1 PASSED: Transformer can memorize sequences!")
-    else:
-        print("✗ LEVEL 1 FAILED: Needs more training or debugging")
-    
-    return accuracy
-
-
-# ============================================================================
-# Main
-# ============================================================================
-
-def main():
-    print()
-    print("=" * 70)
-    print("MILESTONE 05 - LEVEL 1: TRANSFORMER MEMORIZATION TEST")
-    print("=" * 70)
-    print()
-    print("Goal: Train transformer to memorize simple sequences in < 2 minutes")
-    print()
-    
-    # Create dataset
-    sequences = create_memorization_dataset()
-    char_to_idx, idx_to_char = create_simple_tokenizer(sequences)
-    vocab_size = len(idx_to_char)
-    
-    print(f"Dataset: {len(sequences)} sequences")
-    print(f"Vocabulary: {vocab_size} tokens")
-    print(f"Example: {sequences[0]} → {encode_sequence(sequences[0], char_to_idx)}")
-    print()
-    
-    # Encode all sequences
-    train_data = [encode_sequence(seq, char_to_idx, max_len=8) for seq in sequences]
-    
-    # Create ULTRA-tiny model for speed
-    config = {
-        'vocab_size': vocab_size,
-        'embed_dim': 16,      # Super tiny!
-        'num_layers': 1,      # Just 1 layer
-        'num_heads': 2,       # 2 heads
-        'max_seq_len': 8,     # Short sequences
-    }
-    
-    print("Model configuration:")
-    for key, val in config.items():
-        print(f"  {key}: {val}")
-    print()
-    
-    model = GPT(**config)
-    num_params = sum(np.prod(p.shape) for p in model.parameters())
-    print(f"Parameters: {num_params:,}")
-    print()
-    
-    # Optimizer and loss
-    optimizer = Adam(model.parameters(), lr=0.001)
-    loss_fn = CrossEntropyLoss()
-    
-    # Train
-    print("Starting training...")
-    print()
-    losses = train_memorization(
-        model=model,
-        optimizer=optimizer,
-        loss_fn=loss_fn,
-        train_data=train_data,
-        vocab_size=vocab_size,
-        max_steps=200  # Reduced for speed (ultra-tiny model)
-    )
-    
-    # Test
-    print("Starting testing...")
-    print()
-    accuracy = test_memorization(model, sequences, char_to_idx, idx_to_char)
-    
-    # Summary
-    print("=" * 70)
-    print("LEVEL 1 SUMMARY")
-    print("=" * 70)
-    print(f"✓ Training: {len(losses)} steps")
-    print(f"✓ Loss: {np.mean(losses[:10]):.4f} → {np.mean(losses[-100:]):.4f}")
-    print(f"✓ Accuracy: {accuracy:.1f}%")
-    print()
-    
-    if accuracy >= 90:
-        print("🎉 LEVEL 1 COMPLETE! Ready for Level 2: Pattern Completion")
-    else:
-        print("⚠️  LEVEL 1 INCOMPLETE: Needs debugging")
-    print()
-
-
-if __name__ == "__main__":
-    main()
-
--- a/milestones/05_2017_transformer/level2_patterns.py
+++ b/milestones/05_2017_transformer/level2_patterns.py
@@ -1,357 +0,0 @@
-"""
-Milestone 05 - Level 2: Transformer Pattern Completion
-=======================================================
-
-SIMPLE PATTERN COMPLETION TEST:
-Can the transformer learn to complete simple patterns?
-
-Task: Given "A B C", predict "D"
-      Given "1 2 3", predict "4"
-      Given "do re mi", predict "fa"
-
-Expected: 
- Train in < 5 minutes
- Loss should drop from ~3.0 to < 0.5
- Should complete 70%+ of patterns correctly
-
-This validates:
-✓ Transformer can learn relationships
-✓ Attention mechanism captures patterns
-✓ Model generalizes beyond memorization
-"""
-
-import sys
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-
-import numpy as np
-import time
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.autograd import enable_autograd
-from tinytorch.core.optimizers import Adam
-from tinytorch.core.losses import CrossEntropyLoss
-from tinytorch.models.transformer import GPT
-
-enable_autograd()
-
-# ============================================================================
-# Level 2: Pattern Completion Dataset
-# ============================================================================
-
-def create_pattern_dataset():
-    """
-    Create simple completion patterns:
-    - Sequences: A B C → D
-    - Counting: 1 2 3 → 4
-    - Musical: do re mi → fa
-    """
-    patterns = [
-        # Alphabet sequences
-        ("A B C", "D"),
-        ("D E F", "G"),
-        ("M N O", "P"),
-        ("W X Y", "Z"),
-        # Numbers
-        ("1 2 3", "4"),
-        ("5 6 7", "8"),
-        # Words (short)
-        ("cat dog", "rat"),
-        ("up down", "left"),
-        # Repetition
-        ("A A A", "A"),
-        ("B B B", "B"),
-        ("1 1 1", "1"),
-    ]
-    return patterns
-
-
-def create_tokenizer(patterns):
-    """Create character-level tokenizer."""
-    # Get all unique characters
-    all_text = ' '.join([p[0] + ' ' + p[1] for p in patterns])
-    all_chars = sorted(set(all_text))
-    
-    # Create mappings (0 = padding, 1 = EOS)
-    char_to_idx = {char: idx + 2 for idx, char in enumerate(all_chars)}
-    idx_to_char = {idx + 2: char for idx, char in enumerate(all_chars)}
-    char_to_idx['<PAD>'] = 0
-    char_to_idx['<EOS>'] = 1
-    idx_to_char[0] = '<PAD>'
-    idx_to_char[1] = '<EOS>'
-    
-    return char_to_idx, idx_to_char
-
-
-def encode_pattern(input_str, target_str, char_to_idx, max_len=16):
-    """Encode pattern as: input + <EOS> + target + <EOS>, then pad."""
-    # Encode input
-    input_tokens = [char_to_idx.get(c, 0) for c in input_str]
-    input_tokens.append(1)  # EOS
-    
-    # Encode target
-    target_tokens = [char_to_idx.get(c, 0) for c in target_str]
-    target_tokens.append(1)  # EOS
-    
-    # Combine
-    tokens = input_tokens + target_tokens
-    
-    # Pad
-    if len(tokens) < max_len:
-        tokens = tokens + [0] * (max_len - len(tokens))
-    else:
-        tokens = tokens[:max_len]
-    
-    return tokens
-
-
-def decode_tokens(tokens, idx_to_char):
-    """Decode tokens to string."""
-    chars = []
-    for t in tokens:
-        if t == 0:  # padding
-            break
-        if t == 1:  # EOS
-            break
-        chars.append(idx_to_char.get(t, '?'))
-    return ''.join(chars)
-
-
-# ============================================================================
-# Training
-# ============================================================================
-
-def train_patterns(model, optimizer, loss_fn, train_data, vocab_size, max_steps=400):
-    """
-    Train transformer to complete patterns.
-    Target: < 5 minutes, loss < 0.5
-    """
-    print("=" * 70)
-    print("TRAINING LEVEL 2: PATTERN COMPLETION")
-    print("=" * 70)
-    print(f"Dataset: {len(train_data)} patterns")
-    print(f"Vocab size: {vocab_size}")
-    print(f"Max steps: {max_steps}")
-    print(f"Target: Loss < 0.5 in < 5 minutes")
-    print()
-    
-    start_time = time.time()
-    losses = []
-    
-    for step in range(max_steps):
-        # Sample random pattern
-        tokens = train_data[np.random.randint(len(train_data))]
-        
-        # Input: all but last
-        # Target: all but first (shifted by 1)
-        input_seq = tokens[:-1]
-        target_seq = tokens[1:]
-        
-        # Convert to tensors
-        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
-        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)
-        
-        # Forward pass
-        logits = model.forward(x)
-        
-        # Compute loss
-        batch_size, seq_len, vocab_size_out = logits.shape
-        logits_flat = logits.reshape(batch_size * seq_len, vocab_size_out)
-        targets_flat = y_true.reshape(batch_size * seq_len)
-        loss = loss_fn.forward(logits_flat, targets_flat)
-        
-        # Backward pass
-        optimizer.zero_grad()
-        loss.backward()
-        
-        # Clip gradients
-        for param in model.parameters():
-            if param.grad is not None:
-                np.clip(param.grad, -1.0, 1.0, out=param.grad)
-        
-        # Update
-        optimizer.step()
-        
-        losses.append(loss.data.item())
-        
-        # Progress every 50 steps
-        if step % 50 == 0 or step == max_steps - 1:
-            avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-            elapsed = time.time() - start_time
-            print(f"Step {step:4d}/{max_steps} | Loss: {avg_loss:.4f} | Time: {elapsed:.1f}s")
-            
-            # Early stopping
-            if avg_loss < 0.5:
-                print(f"\n✓ Target reached! Loss < 0.5 at step {step}")
-                break
-    
-    elapsed = time.time() - start_time
-    final_loss = np.mean(losses[-100:])
-    initial_loss = np.mean(losses[:10])
-    improvement = (1 - final_loss / initial_loss) * 100
-    
-    print()
-    print("=" * 70)
-    print("TRAINING COMPLETE")
-    print("=" * 70)
-    print(f"Time: {elapsed:.1f} seconds")
-    print(f"Initial loss: {initial_loss:.4f}")
-    print(f"Final loss: {final_loss:.4f}")
-    print(f"Improvement: {improvement:.1f}%")
-    print()
-    
-    return losses
-
-
-# ============================================================================
-# Testing
-# ============================================================================
-
-def test_patterns(model, test_patterns, char_to_idx, idx_to_char, max_len=16):
-    """
-    Test if model can complete patterns.
-    """
-    print("=" * 70)
-    print("TESTING LEVEL 2: PATTERN COMPLETION")
-    print("=" * 70)
-    print()
-    
-    correct = 0
-    total = len(test_patterns)
-    
-    for input_str, expected_target in test_patterns:
-        # Encode input + EOS
-        input_tokens = [char_to_idx.get(c, 0) for c in input_str]
-        input_tokens.append(1)  # EOS
-        
-        # Pad to max_len-1 (leave room for generation)
-        while len(input_tokens) < max_len - 1:
-            input_tokens.append(0)
-        input_tokens = input_tokens[:max_len-1]
-        
-        # Forward pass
-        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
-        logits = model.forward(x)
-        
-        # Get prediction for next token (after input + EOS)
-        input_len = len([c for c in input_str]) + 1  # +1 for EOS
-        if input_len < len(input_tokens):
-            next_token_logits = logits.data[0, input_len - 1, :]  # Predict position after EOS
-            predicted_token = int(np.argmax(next_token_logits))
-            
-            # Decode
-            predicted_char = idx_to_char.get(predicted_token, '?')
-            
-            # Check if correct (compare first character of target)
-            expected_first_char = expected_target[0] if len(expected_target) > 0 else ''
-            match = (predicted_char == expected_first_char)
-        else:
-            match = False
-            predicted_char = '?'
-        
-        if match:
-            correct += 1
-            status = "✓"
-        else:
-            status = "✗"
-        
-        print(f"{status} Input: \"{input_str:12s}\" → Expected: \"{expected_target:6s}\" | Got: \"{predicted_char}\"")
-    
-    accuracy = (correct / total) * 100
-    print()
-    print(f"Accuracy: {correct}/{total} ({accuracy:.1f}%)")
-    print()
-    
-    if accuracy >= 70:
-        print("✓ LEVEL 2 PASSED: Transformer can complete patterns!")
-    else:
-        print("✗ LEVEL 2 FAILED: Needs more training")
-    
-    return accuracy
-
-
-# ============================================================================
-# Main
-# ============================================================================
-
-def main():
-    print()
-    print("=" * 70)
-    print("MILESTONE 05 - LEVEL 2: TRANSFORMER PATTERN COMPLETION")
-    print("=" * 70)
-    print()
-    print("Goal: Train transformer to complete patterns in < 5 minutes")
-    print()
-    
-    # Create dataset
-    patterns = create_pattern_dataset()
-    char_to_idx, idx_to_char = create_tokenizer(patterns)
-    vocab_size = len(idx_to_char)
-    
-    print(f"Dataset: {len(patterns)} patterns")
-    print(f"Vocabulary: {vocab_size} tokens")
-    print(f"Example: \"{patterns[0][0]}\" → \"{patterns[0][1]}\"")
-    print()
-    
-    # Encode all patterns
-    max_len = 16
-    train_data = [encode_pattern(inp, out, char_to_idx, max_len) for inp, out in patterns]
-    
-    # Create small model (bigger than Level 1)
-    config = {
-        'vocab_size': vocab_size,
-        'embed_dim': 24,      # Slightly bigger
-        'num_layers': 2,      # 2 layers
-        'num_heads': 2,       # 2 heads
-        'max_seq_len': max_len,
-    }
-    
-    print("Model configuration:")
-    for key, val in config.items():
-        print(f"  {key}: {val}")
-    print()
-    
-    model = GPT(**config)
-    num_params = sum(np.prod(p.shape) for p in model.parameters())
-    print(f"Parameters: {num_params:,}")
-    print()
-    
-    # Optimizer and loss
-    optimizer = Adam(model.parameters(), lr=0.001)
-    loss_fn = CrossEntropyLoss()
-    
-    # Train
-    print("Starting training...")
-    print()
-    losses = train_patterns(
-        model=model,
-        optimizer=optimizer,
-        loss_fn=loss_fn,
-        train_data=train_data,
-        vocab_size=vocab_size,
-        max_steps=400
-    )
-    
-    # Test
-    print("Starting testing...")
-    print()
-    accuracy = test_patterns(model, patterns, char_to_idx, idx_to_char, max_len)
-    
-    # Summary
-    print("=" * 70)
-    print("LEVEL 2 SUMMARY")
-    print("=" * 70)
-    print(f"✓ Training: {len(losses)} steps")
-    print(f"✓ Loss: {np.mean(losses[:10]):.4f} → {np.mean(losses[-100:]):.4f}")
-    print(f"✓ Accuracy: {accuracy:.1f}%")
-    print()
-    
-    if accuracy >= 70:
-        print("🎉 LEVEL 2 COMPLETE! Ready for Level 3: Text Generation")
-    else:
-        print("⚠️  LEVEL 2 INCOMPLETE: Needs more training")
-    print()
-
-
-if __name__ == "__main__":
-    main()
-
--- a/milestones/05_2017_transformer/simple_gpt.py
+++ b/milestones/05_2017_transformer/simple_gpt.py
@@ -1,109 +0,0 @@
-"""
-Simple GPT model for CodeBot milestone - bypasses LayerNorm gradient bug.
-
-This is a workaround for the milestone until core Tensor operations
-(subtraction, mean) are fixed to maintain gradient flow.
-"""
-
-import numpy as np
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.layers import Linear
-from tinytorch.core.attention import MultiHeadAttention  
-from tinytorch.core.activations import GELU
-from tinytorch.text.embeddings import Embedding
-
-
-class SimpleGPT:
-    """
-    Simplified GPT without LayerNorm (workaround for gradient flow bugs).
-    
-    Architecture:
-    - Token + Position embeddings
-    - N transformer blocks (attention + MLP, NO LayerNorm)
-    - Output projection to vocabulary
-    
-    Note: This is a temporary solution for the milestone. The full GPT
-    with LayerNorm requires fixes to core Tensor subtraction/mean operations.
-    """
-    
-    def __init__(
-        self,
-        vocab_size: int,
-        embed_dim: int,
-        num_layers: int,
-        num_heads: int,
-        max_seq_len: int,
-        mlp_ratio: int = 4
-    ):
-        self.vocab_size = vocab_size
-        self.embed_dim = embed_dim
-        self.num_layers = num_layers
-        self.num_heads = num_heads
-        self.max_seq_len = max_seq_len
-        
-        # Embeddings
-        self.token_embedding = Embedding(vocab_size, embed_dim)
-        self.position_embedding = Embedding(max_seq_len, embed_dim)
-        
-        # Transformer blocks (simplified - no LayerNorm)
-        self.blocks = []
-        for _ in range(num_layers):
-            block = {
-                'attention': MultiHeadAttention(embed_dim, num_heads),
-                'mlp_fc1': Linear(embed_dim, embed_dim * mlp_ratio),
-                'mlp_gelu': GELU(),  # Use tinytorch's GELU
-                'mlp_fc2': Linear(embed_dim * mlp_ratio, embed_dim),
-            }
-            self.blocks.append(block)
-        
-        # Output projection
-        self.lm_head = Linear(embed_dim, vocab_size)
-    
-    def forward(self, tokens: Tensor) -> Tensor:
-        """
-        Forward pass through simplified GPT.
-        
-        Args:
-            tokens: Token indices, shape (batch_size, seq_len)
-            
-        Returns:
-            logits: Predictions, shape (batch_size, seq_len, vocab_size)
-        """
-        batch_size, seq_len = tokens.shape
-        
-        # Embeddings
-        token_emb = self.token_embedding.forward(tokens)
-        positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
-        pos_emb = self.position_embedding.forward(positions)
-        x = token_emb + pos_emb  # (batch, seq, embed)
-        
-        # Transformer blocks
-        for block in self.blocks:
-            # Self-attention with residual
-            attn_out = block['attention'].forward(x)
-            x = x + attn_out  # Residual connection
-            
-            # MLP with residual
-            mlp_out = block['mlp_fc1'].forward(x)
-            mlp_out = block['mlp_gelu'].forward(mlp_out)  # Activation
-            mlp_out = block['mlp_fc2'].forward(mlp_out)
-            x = x + mlp_out  # Residual connection
-        
-        # Project to vocabulary
-        logits = self.lm_head.forward(x)
-        return logits
-    
-    def parameters(self):
-        """Return all trainable parameters."""
-        params = []
-        params.extend(self.token_embedding.parameters())
-        params.extend(self.position_embedding.parameters())
-        
-        for block in self.blocks:
-            params.extend(block['attention'].parameters())
-            params.extend(block['mlp_fc1'].parameters())
-            params.extend(block['mlp_fc2'].parameters())
-        
-        params.extend(self.lm_head.parameters())
-        return params
-
--- a/milestones/05_2017_transformer/test_5min_training.py
+++ b/milestones/05_2017_transformer/test_5min_training.py
@@ -1,316 +0,0 @@
-"""
-Milestone 05 - 5-Minute Training Test
-======================================
-
-GOAL: Train the best possible transformer in exactly 5 minutes.
-
-We'll optimize for:
- Maximum learning in 5 minutes
- Clear progress visualization
- Actual generation testing
- Student-friendly output
-
-This will show what's realistically achievable in a classroom demo.
-"""
-
-import sys
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-
-import numpy as np
-import time
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.autograd import enable_autograd
-from tinytorch.core.optimizers import Adam
-from tinytorch.core.losses import CrossEntropyLoss
-from tinytorch.models.transformer import GPT
-
-enable_autograd()
-
-# ============================================================================
-# Dataset: Mix of memorization + patterns
-# ============================================================================
-
-def create_dataset():
-    """Create a diverse but simple dataset."""
-    sequences = [
-        # Easy memorization
-        "AAAA", "BBBB", "CCCC", "1111", "2222",
-        # Simple sequences
-        "ABCD", "EFGH", "IJKL", "MNOP", "QRST",
-        "1234", "5678", "9012",
-        # Patterns (with repetition for learning)
-        "AB", "CD", "EF", "GH",
-        "12", "34", "56", "78",
-    ] * 3  # Triple the dataset for better learning
-    return sequences
-
-
-def create_tokenizer(sequences):
-    """Simple character tokenizer."""
-    all_chars = sorted(set(''.join(sequences)))
-    char_to_idx = {char: idx + 1 for idx, char in enumerate(all_chars)}
-    idx_to_char = {idx + 1: char for idx, char in enumerate(all_chars)}
-    char_to_idx['<PAD>'] = 0
-    idx_to_char[0] = '<PAD>'
-    return char_to_idx, idx_to_char
-
-
-def encode(seq, char_to_idx, max_len=10):
-    """Encode and pad sequence."""
-    tokens = [char_to_idx.get(c, 0) for c in seq]
-    if len(tokens) < max_len:
-        tokens = tokens + [0] * (max_len - len(tokens))
-    else:
-        tokens = tokens[:max_len]
-    return tokens
-
-
-def decode(tokens, idx_to_char):
-    """Decode tokens to string."""
-    return ''.join([idx_to_char.get(t, '') for t in tokens if t != 0])
-
-
-# ============================================================================
-# Training with 5-minute time limit
-# ============================================================================
-
-def train_5_minutes(model, optimizer, loss_fn, train_data, max_time_seconds=300):
-    """
-    Train on randomly sampled sequences until a wall-clock budget is used up.
-
-    Each iteration samples one encoded sequence from ``train_data``, runs a
-    next-token-prediction step (inputs are ``tokens[:-1]``, targets are
-    ``tokens[1:]``), clips every gradient element to [-1, 1], and applies one
-    optimizer update. Progress lines are printed along the way and a summary
-    is printed at the end.
-
-    Args:
-        model: Object exposing ``forward(x)`` and ``parameters()``.
-        optimizer: Object exposing ``zero_grad()`` and ``step()``.
-        loss_fn: Object exposing ``forward(logits, targets)``.
-        train_data: Indexable collection of token-id lists.
-        max_time_seconds: Wall-clock budget in seconds (default 300).
-
-    Returns:
-        Tuple ``(losses, step)``: the per-step loss values and the number of
-        optimizer steps taken.
-    """
-    print("=" * 70)
-    print("TRAINING FOR 5 MINUTES")
-    print("=" * 70)
-    print(f"Dataset: {len(train_data)} sequences")
-    print(f"Time limit: {max_time_seconds}s ({max_time_seconds/60:.1f} minutes)")
-    print()
-    
-    start_time = time.time()
-    losses = []
-    step = 0
-    
-    # Progress checkpoints at 1, 2, 3, 4, 5 minutes
-    # NOTE: these are fixed and do not scale with max_time_seconds. With the
-    # default 300s budget the final (300s) checkpoint is never printed,
-    # because the loop exits as soon as elapsed reaches the budget.
-    checkpoints = [60, 120, 180, 240, 300]
-    checkpoint_idx = 0
-    
-    print("Training started...")
-    print()
-    
-    while True:
-        # Check time limit
-        # The clock is read only here, once per iteration, so the last step
-        # may finish slightly past the budget.
-        elapsed = time.time() - start_time
-        if elapsed >= max_time_seconds:
-            break
-        
-        # Sample random sequence
-        tokens = train_data[np.random.randint(len(train_data))]
-        
-        # Next token prediction
-        # Targets are the inputs shifted left by one position.
-        input_seq = tokens[:-1]
-        target_seq = tokens[1:]
-        
-        # Wrapping in a list gives a batch of size 1.
-        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
-        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)
-        
-        # Forward
-        logits = model.forward(x)
-        
-        # Loss
-        # Flatten batch and sequence axes so each position is one sample.
-        # NOTE(review): padded positions (id 0) are passed to the loss like any
-        # other target; whether the loss ignores them is not visible here.
-        batch_size, seq_len, vocab_size = logits.shape
-        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
-        targets_flat = y_true.reshape(batch_size * seq_len)
-        loss = loss_fn.forward(logits_flat, targets_flat)
-        
-        # Backward
-        optimizer.zero_grad()
-        loss.backward()
-        
-        # Clip gradients
-        # Element-wise value clipping done in place (not norm clipping);
-        # assumes param.grad is a NumPy array - TODO confirm.
-        for param in model.parameters():
-            if param.grad is not None:
-                np.clip(param.grad, -1.0, 1.0, out=param.grad)
-        
-        # Update
-        optimizer.step()
-        
-        losses.append(loss.data.item())
-        step += 1
-        
-        # Show progress at checkpoints
-        # Uses the elapsed value sampled at the top of this iteration; at most
-        # one checkpoint line is emitted per step.
-        if checkpoint_idx < len(checkpoints) and elapsed >= checkpoints[checkpoint_idx]:
-            avg_loss = np.mean(losses[-50:]) if len(losses) >= 50 else np.mean(losses)
-            steps_per_sec = step / elapsed
-            print(f"[{int(elapsed):3d}s] Step {step:4d} | Loss: {avg_loss:.4f} | Speed: {steps_per_sec:.2f} steps/sec")
-            checkpoint_idx += 1
-        
-        # Also show every 50 steps if we're going fast
-        if step % 50 == 0:
-            if checkpoint_idx == 0 or elapsed < checkpoints[0]:  # Only if we haven't hit first checkpoint
-                avg_loss = np.mean(losses[-50:]) if len(losses) >= 50 else np.mean(losses)
-                print(f"[{int(elapsed):3d}s] Step {step:4d} | Loss: {avg_loss:.4f}")
-    
-    # Summary statistics: mean of the last (up to) 100 losses versus the mean
-    # of the first (up to) 10. If no step ran, `losses` is empty and both
-    # means are taken over an empty list.
-    final_elapsed = time.time() - start_time
-    final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-    initial_loss = np.mean(losses[:10])
-    # Relative reduction in percent; negative when the loss went up.
-    improvement = (1 - final_loss / initial_loss) * 100
-    
-    print()
-    print("=" * 70)
-    print("TRAINING COMPLETE")
-    print("=" * 70)
-    print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.2f} minutes)")
-    print(f"Total steps: {step}")
-    print(f"Steps/second: {step/final_elapsed:.2f}")
-    print(f"Initial loss: {initial_loss:.4f}")
-    print(f"Final loss: {final_loss:.4f}")
-    print(f"Improvement: {improvement:.1f}%")
-    print()
-    
-    return losses, step
-
-
-# ============================================================================
-# Testing
-# ============================================================================
-
-def test_generation(model, test_sequences, char_to_idx, idx_to_char):
-    """Test generation quality."""
-    print("=" * 70)
-    print("TESTING GENERATION")
-    print("=" * 70)
-    print()
-    
-    correct = 0
-    total = len(test_sequences)
-    
-    for seq in test_sequences[:15]:  # Test first 15
-        tokens = encode(seq, char_to_idx, max_len=10)
-        
-        # Get predictions
-        x = Tensor(np.array([tokens[:-1]], dtype=np.int32), requires_grad=False)
-        logits = model.forward(x)
-        
-        # Predict each position
-        predicted_tokens = []
-        for i in range(logits.shape[1]):
-            pred = int(np.argmax(logits.data[0, i, :]))
-            predicted_tokens.append(pred)
-        
-        # Compare
-        expected = tokens[1:]
-        match = all(e == p for e, p in zip(expected, predicted_tokens) if e != 0)
-        
-        if match:
-            correct += 1
-            status = "✓"
-        else:
-            status = "✗"
-        
-        expected_str = decode(expected, idx_to_char)
-        predicted_str = decode(predicted_tokens, idx_to_char)
-        
-        print(f"{status} Input: {seq[:6]:8s} → Expected: {expected_str:8s} | Got: {predicted_str:8s}")
-    
-    accuracy = (correct / 15) * 100  # Out of 15 tested
-    print()
-    print(f"Accuracy: {correct}/15 ({accuracy:.1f}%)")
-    print()
-    
-    return accuracy
-
-
-# ============================================================================
-# Main
-# ============================================================================
-
-def main():
-    print()
-    print("=" * 70)
-    print("MILESTONE 05 - 5-MINUTE TRAINING TEST")
-    print("=" * 70)
-    print()
-    print("Let's find out what we can learn in exactly 5 minutes!")
-    print()
-    
-    # Dataset
-    sequences = create_dataset()
-    char_to_idx, idx_to_char = create_tokenizer(sequences)
-    vocab_size = len(idx_to_char)
-    
-    print(f"Dataset: {len(sequences)} sequences (with repetition)")
-    print(f"Unique sequences: {len(set(sequences))}")
-    print(f"Vocabulary: {vocab_size} tokens")
-    print()
-    
-    # Encode
-    train_data = [encode(seq, char_to_idx, max_len=10) for seq in sequences]
-    
-    # Model: Ultra-tiny for maximum steps in 5 mins
-    # Goal: <1s per step → ~300+ steps in 5 mins
-    # Strategy: Minimize params for speed
-    config = {
-        'vocab_size': vocab_size,
-        'embed_dim': 16,      # Very small
-        'num_layers': 1,      # Just 1 layer!
-        'num_heads': 2,       # 2 heads
-        'max_seq_len': 10,
-    }
-    
-    print("Model configuration:")
-    for key, val in config.items():
-        print(f"  {key}: {val}")
-    print()
-    
-    model = GPT(**config)
-    num_params = sum(np.prod(p.shape) for p in model.parameters())
-    print(f"Parameters: {num_params:,}")
-    print()
-    
-    # Optimizer
-    optimizer = Adam(model.parameters(), lr=0.001)
-    loss_fn = CrossEntropyLoss()
-    
-    # Train for 5 minutes
-    print("Starting 5-minute training run...")
-    print("(Progress will be shown every minute)")
-    print()
-    
-    losses, total_steps = train_5_minutes(
-        model=model,
-        optimizer=optimizer,
-        loss_fn=loss_fn,
-        train_data=train_data,
-        max_time_seconds=300  # 5 minutes
-    )
-    
-    # Test
-    print("Testing what the model learned...")
-    print()
-    accuracy = test_generation(model, sequences, char_to_idx, idx_to_char)
-    
-    # Final summary
-    print("=" * 70)
-    print("5-MINUTE TRAINING SUMMARY")
-    print("=" * 70)
-    print(f"✓ Model: {num_params:,} parameters")
-    print(f"✓ Steps completed: {total_steps}")
-    print(f"✓ Loss: {np.mean(losses[:10]):.4f} → {np.mean(losses[-100:]):.4f}")
-    print(f"✓ Improvement: {(1 - np.mean(losses[-100:])/np.mean(losses[:10]))*100:.1f}%")
-    print(f"✓ Accuracy: {accuracy:.1f}%")
-    print()
-    
-    if accuracy >= 60:
-        print("🎉 EXCELLENT! Model learned well in 5 minutes!")
-    elif accuracy >= 40:
-        print("✓ GOOD! Model is learning, could use more training.")
-    elif accuracy >= 20:
-        print("⚠️  FAIR: Model is learning but needs optimization.")
-    else:
-        print("⚠️  Model needs more training time or tuning.")
-    print()
-
-
-# Run the demo only when this file is executed as a script, not on import.
-if __name__ == "__main__":
-    main()
-
--- a/milestones/05_2017_transformer/test_gpt_learning.py
+++ b/milestones/05_2017_transformer/test_gpt_learning.py
@@ -1,744 +0,0 @@
-#!/usr/bin/env python3
-"""
-Progressive Test Suite for TinyGPT Learning
-
-Tests transformer learning from absolute simplest to complex:
-0. Memorize single sequence (MUST work)
-1. Pattern completion (A B A → B)
-2. Copy task (COPY: X → X)
-3. Simple arithmetic (2+3 → 5)
-4. TinyTalks greetings
-
-This helps identify EXACTLY where learning breaks down.
-"""
-
-import sys
-import os
-import numpy as np
-import time
-
-# Directory three levels above this file, appended to the import search path
-# (presumably so the `tinytorch` imports inside the tests below resolve).
-project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.append(project_root)
-
-from rich.console import Console
-from rich.panel import Panel
-from rich.table import Table
-from rich import box
-
-console = Console()
-
-
-def run_test_0_memorize_sequence():
-    """
-    TEST 0: Memorize Single Sequence
-    
-    The ABSOLUTE simplest test. Can the model memorize ONE sequence?
-    "HELLO WORLD" repeated many times.
-    
-    If this fails, there's a fundamental bug in:
-    - Forward pass
-    - Loss computation
-    - Backward pass
-    - Parameter updates
-    
-    Runs 10 training steps, printing tensor shapes and gradient diagnostics
-    for every step.
-    
-    Returns:
-        True if the loss at the 10th step is below 80% of the loss at the
-        1st step, otherwise False.
-    """
-    console.print("\n" + "=" * 70)
-    console.print("[bold cyan]TEST 0: Single Sequence Memorization[/bold cyan]")
-    console.print("=" * 70)
-    console.print("Task: Memorize 'HELLO WORLD' (repeated 100 times)")
-    console.print("Expected: Loss should drop to near 0")
-    console.print("Why: If this fails, autograd/optimizer is broken\n")
-    
-    # Imports are local to the test, so they run only after the banner above
-    # has been printed.
-    from tinytorch.core.tensor import Tensor
-    from tinytorch.core.optimizers import Adam
-    from tinytorch.core.losses import CrossEntropyLoss
-    from tinytorch.core.autograd import enable_autograd
-    from tinytorch.text.tokenization import CharTokenizer
-    from tinytorch.text.embeddings import Embedding, PositionalEncoding
-    from tinytorch.models.transformer import TransformerBlock, LayerNorm
-    from tinytorch.core.layers import Linear
-    
-    # NOTE(review): this is the only test in the visible part of the file that
-    # calls enable_autograd(); the later tests appear to rely on it having run.
-    enable_autograd()
-    
-    # Super simple data: just repeat "HELLO WORLD"
-    text = "HELLO WORLD " * 100
-    
-    # Tokenize
-    tokenizer = CharTokenizer()
-    tokenizer.build_vocab([text])
-    data = tokenizer.encode(text)
-    
-    console.print(f"Data length: {len(data)} tokens")
-    console.print(f"Vocabulary: {tokenizer.vocab_size} chars")
-    console.print(f"Unique text: '{text[:50]}...'\n")
-    
-    # Tiny model
-    vocab_size = tokenizer.vocab_size
-    embed_dim = 32
-    seq_len = 16
-    
-    # Build minimal model
-    # Pipeline: embedding -> positional encoding -> one transformer block ->
-    # layer norm -> linear projection to the vocabulary.
-    embedding = Embedding(vocab_size, embed_dim)
-    pos_enc = PositionalEncoding(seq_len, embed_dim)
-    transformer = TransformerBlock(embed_dim, num_heads=2, mlp_ratio=2, dropout_prob=0.1)
-    ln = LayerNorm(embed_dim)
-    output_proj = Linear(embed_dim, vocab_size)
-    
-    # Flat list of every parameter tensor, handed to the optimizer below
-    params = []
-    params.extend(embedding.parameters())
-    params.extend(pos_enc.parameters())
-    params.extend(transformer.parameters())
-    params.extend(ln.parameters())
-    params.extend(output_proj.parameters())
-    
-    # Force gradient tracking on regardless of each layer's default
-    for p in params:
-        p.requires_grad = True
-    
-    console.print(f"Model: {len(params)} parameter tensors")
-    console.print(f"Embed dim: {embed_dim}, Seq len: {seq_len}\n")
-    
-    # Train
-    optimizer = Adam(params, lr=0.01)
-    criterion = CrossEntropyLoss()
-    
-    console.print("[yellow]Training (10 steps)...[/yellow]")
-    console.print("[dim]Watching for: loss decrease, gradient flow, parameter updates[/dim]\n")
-    
-    initial_loss = None
-    final_loss = None
-    
-    for step in range(10):
-        # Random sequence
-        # Targets are the input window shifted right by one token.
-        start = np.random.randint(0, len(data) - seq_len - 1)
-        input_seq = data[start:start+seq_len]
-        target_seq = data[start+1:start+seq_len+1]
-        
-        console.print(f"[dim]Step {step+1}:[/dim]", end=" ")
-        
-        # Forward
-        # Wrapping each window in a list gives a batch of size 1.
-        x = Tensor(np.array([input_seq]))
-        y = Tensor(np.array([target_seq]))
-        
-        console.print(f"input shape={x.shape}", end=" ")
-        
-        # Through model
-        # `x` is rebound at every stage; the shape after each stage is printed
-        # on the same console line.
-        x = embedding(x)
-        console.print(f"embed_out={x.shape}", end=" ")
-        
-        x = pos_enc(x)
-        console.print(f"pos_out={x.shape}", end=" ")
-        
-        x = transformer(x)
-        console.print(f"trans_out={x.shape}", end=" ")
-        
-        x = ln(x)
-        console.print(f"ln_out={x.shape}", end=" ")
-        
-        # Reshape
-        # Flattened to 2-D for the linear layer, then restored to 3-D so the
-        # logits shape can be reported.
-        batch, seq, dim = x.shape
-        x_2d = x.reshape(batch * seq, dim)
-        logits_2d = output_proj(x_2d)
-        logits = logits_2d.reshape(batch, seq, vocab_size)
-        
-        console.print(f"logits={logits.shape}", end=" ")
-        
-        # Loss
-        # Flattened again so every position is one classification sample.
-        logits_flat = logits.reshape(batch * seq, vocab_size)
-        targets_flat = y.reshape(-1)
-        
-        console.print(f"logits_flat={logits_flat.shape} targets_flat={targets_flat.shape}", end=" ")
-        
-        loss = criterion(logits_flat, targets_flat)
-        
-        loss_val = float(loss.data)
-        console.print(f"loss={loss_val:.4f}", end=" ")
-        
-        # Check if loss has grad_fn
-        # Diagnostic only: probes a private attribute and does not affect training.
-        has_grad_fn = hasattr(loss, '_grad_fn') and loss._grad_fn is not None
-        console.print(f"has_grad_fn={has_grad_fn}", end=" ")
-        
-        # Backward
-        optimizer.zero_grad()
-        
-        console.print("backward...", end=" ")
-        loss.backward()
-        
-        # Check if params got gradients
-        # Counts tensors whose gradient exists and has at least one non-zero entry.
-        params_with_grad = sum(1 for p in params if p.grad is not None and np.any(p.grad != 0))
-        console.print(f"params_w_grad={params_with_grad}/{len(params)}", end=" ")
-        
-        optimizer.step()
-        console.print("updated")
-        
-        # First and last single-step losses are kept for the verdict below
-        # (step index 9 is the last iteration of range(10)).
-        if step == 0:
-            initial_loss = loss_val
-            console.print(f"  [yellow]→ Initial loss: {initial_loss:.4f}[/yellow]")
-        if step == 9:
-            final_loss = loss_val
-        
-        if step % 2 == 0 and step > 0:
-            console.print(f"  [cyan]→ Loss so far: {loss_val:.4f}[/cyan]")
-    
-    # Result
-    # Both values are single-step losses on different random windows, with no
-    # averaging, so the verdict carries some sampling noise.
-    console.print(f"\n[bold]Results:[/bold]")
-    console.print(f"  Initial loss: {initial_loss:.4f}")
-    console.print(f"  Final loss: {final_loss:.4f}")
-    console.print(f"  Decrease: {initial_loss - final_loss:.4f}")
-    
-    # Pass criterion is a 20% relative reduction, not "near 0" as the banner says.
-    if final_loss < initial_loss * 0.8:
-        console.print(f"  [green]✓ PASS: Loss decreased significantly[/green]")
-        return True
-    else:
-        console.print(f"  [red]✗ FAIL: Loss didn't decrease enough[/red]")
-        console.print(f"  [red]→ Bug in: autograd, optimizer, or forward pass[/red]")
-        return False
-
-
-def run_test_1_pattern_completion():
-    """
-    TEST 1: Pattern Completion
-    
-    Can it learn: "A B A B A B" → next is "A"
-                  "1 2 1 2 1 2" → next is "1"
-    
-    Tests: Can model learn simple repeating patterns?
-    """
-    console.print("\n" + "=" * 70)
-    console.print("[bold cyan]TEST 1: Pattern Completion[/bold cyan]")
-    console.print("=" * 70)
-    console.print("Task: Learn repeating patterns (ABAB... → A, 1212... → 1)")
-    console.print("Expected: Predict next token correctly after training")
-    console.print("Why: Tests if attention can learn simple sequences\n")
-    
-    from tinytorch.core.tensor import Tensor
-    from tinytorch.core.optimizers import Adam
-    from tinytorch.core.losses import CrossEntropyLoss
-    from tinytorch.text.embeddings import Embedding, PositionalEncoding
-    from tinytorch.models.transformer import TransformerBlock, LayerNorm
-    from tinytorch.core.layers import Linear
-    
-    # Create pattern data
-    patterns = [
-        "A B A B A B A B A B ",
-        "1 2 1 2 1 2 1 2 1 2 ",
-        "X Y X Y X Y X Y X Y ",
-    ]
-    
-    text = "".join(patterns * 50)  # Repeat 50 times
-    
-    console.print(f"Data: {len(text)} chars")
-    console.print(f"Patterns: ABAB, 1212, XYXY")
-    console.print(f"Sample: '{text[:40]}...'\n")
-    
-    # Tokenize
-    chars = sorted(set(text))
-    vocab_size = len(chars)
-    char_to_idx = {ch: i for i, ch in enumerate(chars)}
-    idx_to_char = {i: ch for i, ch in enumerate(chars)}
-    data = np.array([char_to_idx[ch] for ch in text])
-    
-    console.print(f"Vocab: {vocab_size} chars: {repr(''.join(chars))}\n")
-    
-    # Build tiny model
-    embed_dim = 32
-    num_heads = 2
-    seq_len = 8
-    
-    embedding = Embedding(vocab_size, embed_dim)
-    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
-    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
-    ln = LayerNorm(embed_dim)
-    output_proj = Linear(embed_dim, vocab_size)
-    
-    params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters()
-    
-    # Set requires_grad
-    for p in params:
-        p.requires_grad = True
-    
-    optimizer = Adam(params, lr=0.01)
-    criterion = CrossEntropyLoss()
-    
-    console.print(f"[yellow]Training (30 steps on patterns)...[/yellow]")
-    
-    initial_loss = None
-    final_loss = None
-    
-    for step in range(30):
-        start = np.random.randint(0, len(data) - seq_len - 1)
-        input_seq = data[start:start+seq_len]
-        target_seq = data[start+1:start+seq_len+1]
-        
-        x = Tensor(np.array([input_seq]))
-        y = Tensor(np.array([target_seq]))
-        
-        x = embedding(x)
-        x = pos_enc(x)
-        x = transformer(x)
-        x = ln(x)
-        
-        batch, seq, dim = x.shape
-        x_2d = x.reshape(batch * seq, dim)
-        logits_2d = output_proj(x_2d)
-        logits = logits_2d.reshape(batch, seq, vocab_size)
-        
-        logits_flat = logits.reshape(batch * seq, vocab_size)
-        targets_flat = y.reshape(-1)
-        loss = criterion(logits_flat, targets_flat)
-        
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-        
-        loss_val = float(loss.data)
-        if step == 0:
-            initial_loss = loss_val
-        if step == 29:
-            final_loss = loss_val
-        
-        if step % 10 == 0 or step == 29:
-            console.print(f"  Step {step+1}: Loss = {loss_val:.4f}")
-    
-    decrease = initial_loss - final_loss
-    console.print(f"\n[bold]Results:[/bold]")
-    console.print(f"  Initial: {initial_loss:.4f}")
-    console.print(f"  Final: {final_loss:.4f}")
-    console.print(f"  Decrease: {decrease:.4f}")
-    
-    if decrease > 0.5:
-        console.print(f"  [green]✓ PASS: Loss decreased significantly[/green]")
-        return True
-    else:
-        console.print(f"  [red]✗ FAIL: Loss didn't decrease enough[/red]")
-        return False
-
-
-def run_test_2_copy_task():
-    """
-    TEST 2: Copy Task
-    
-    Input: "COPY: hello"
-    Output: "hello"
-    
-    Classic transformer test from research papers.
-    """
-    console.print("\n" + "=" * 70)
-    console.print("[bold cyan]TEST 2: Copy Task[/bold cyan]")
-    console.print("=" * 70)
-    console.print("Task: COPY: X → X (reproduce input)")
-    console.print("Expected: Model learns to copy the input text")
-    console.print("Why: Classic test of attention mechanism\n")
-    
-    from tinytorch.core.tensor import Tensor
-    from tinytorch.core.optimizers import Adam
-    from tinytorch.core.losses import CrossEntropyLoss
-    from tinytorch.text.embeddings import Embedding, PositionalEncoding
-    from tinytorch.models.transformer import TransformerBlock, LayerNorm
-    from tinytorch.core.layers import Linear
-    
-    # Create copy task data
-    words = ["hello", "world", "test", "copy", "learn", "task"]
-    examples = []
-    for word in words:
-        examples.append(f"COPY:{word}={word} ")
-    
-    text = "".join(examples * 50)  # Repeat
-    
-    console.print(f"Data: {len(text)} chars")
-    console.print(f"Examples: COPY:hello=hello, COPY:world=world")
-    console.print(f"Sample: '{text[:50]}...'\n")
-    
-    # Tokenize
-    chars = sorted(set(text))
-    vocab_size = len(chars)
-    char_to_idx = {ch: i for i, ch in enumerate(chars)}
-    data = np.array([char_to_idx[ch] for ch in text])
-    
-    console.print(f"Vocab: {vocab_size} chars\n")
-    
-    # Build model
-    embed_dim = 32
-    num_heads = 2
-    seq_len = 16
-    
-    embedding = Embedding(vocab_size, embed_dim)
-    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
-    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
-    ln = LayerNorm(embed_dim)
-    output_proj = Linear(embed_dim, vocab_size)
-    
-    params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters()
-    for p in params:
-        p.requires_grad = True
-    
-    optimizer = Adam(params, lr=0.01)
-    criterion = CrossEntropyLoss()
-    
-    console.print(f"[yellow]Training (40 steps on copy task)...[/yellow]")
-    
-    initial_loss = None
-    final_loss = None
-    
-    for step in range(40):
-        start = np.random.randint(0, len(data) - seq_len - 1)
-        input_seq = data[start:start+seq_len]
-        target_seq = data[start+1:start+seq_len+1]
-        
-        x = Tensor(np.array([input_seq]))
-        y = Tensor(np.array([target_seq]))
-        
-        x = embedding(x)
-        x = pos_enc(x)
-        x = transformer(x)
-        x = ln(x)
-        
-        batch, seq, dim = x.shape
-        x_2d = x.reshape(batch * seq, dim)
-        logits_2d = output_proj(x_2d)
-        logits = logits_2d.reshape(batch, seq, vocab_size)
-        
-        logits_flat = logits.reshape(batch * seq, vocab_size)
-        targets_flat = y.reshape(-1)
-        loss = criterion(logits_flat, targets_flat)
-        
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-        
-        loss_val = float(loss.data)
-        if step == 0:
-            initial_loss = loss_val
-        if step == 39:
-            final_loss = loss_val
-        
-        if step % 10 == 0 or step == 39:
-            console.print(f"  Step {step+1}: Loss = {loss_val:.4f}")
-    
-    decrease = initial_loss - final_loss
-    console.print(f"\n[bold]Results:[/bold]")
-    console.print(f"  Initial: {initial_loss:.4f}")
-    console.print(f"  Final: {final_loss:.4f}")
-    console.print(f"  Decrease: {decrease:.4f}")
-    
-    if decrease > 0.5:
-        console.print(f"  [green]✓ PASS: Loss decreased[/green]")
-        return True
-    else:
-        console.print(f"  [red]✗ FAIL: Loss didn't decrease enough[/red]")
-        return False
-
-
-def run_test_3_simple_arithmetic():
-    """
-    TEST 3: Simple Arithmetic
-    
-    2+3=5
-    1+1=2
-    5-2=3
-    
-    Tests: Can model learn simple rules?
-    """
-    console.print("\n" + "=" * 70)
-    console.print("[bold cyan]TEST 3: Simple Arithmetic[/bold cyan]")
-    console.print("=" * 70)
-    console.print("Task: 2+3=5, 1+1=2, etc. (single digit)")
-    console.print("Expected: Correct answers after training")
-    console.print("Why: Tests reasoning ability\n")
-    
-    from tinytorch.core.tensor import Tensor
-    from tinytorch.core.optimizers import Adam
-    from tinytorch.core.losses import CrossEntropyLoss
-    from tinytorch.text.embeddings import Embedding, PositionalEncoding
-    from tinytorch.models.transformer import TransformerBlock, LayerNorm
-    from tinytorch.core.layers import Linear
-    
-    # Create arithmetic data
-    examples = []
-    for a in range(1, 6):
-        for b in range(1, 6):
-            examples.append(f"{a}+{b}={a+b} ")
-    
-    text = "".join(examples * 30)  # Repeat
-    
-    console.print(f"Data: {len(text)} chars")
-    console.print(f"Examples: 1+1=2, 2+3=5, 4+5=9")
-    console.print(f"Sample: '{text[:40]}...'\n")
-    
-    # Tokenize
-    chars = sorted(set(text))
-    vocab_size = len(chars)
-    char_to_idx = {ch: i for i, ch in enumerate(chars)}
-    data = np.array([char_to_idx[ch] for ch in text])
-    
-    console.print(f"Vocab: {vocab_size} chars\n")
-    
-    # Build model
-    embed_dim = 48
-    num_heads = 3
-    seq_len = 12
-    
-    embedding = Embedding(vocab_size, embed_dim)
-    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
-    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
-    ln = LayerNorm(embed_dim)
-    output_proj = Linear(embed_dim, vocab_size)
-    
-    params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters()
-    for p in params:
-        p.requires_grad = True
-    
-    optimizer = Adam(params, lr=0.01)
-    criterion = CrossEntropyLoss()
-    
-    console.print(f"[yellow]Training (50 steps on arithmetic)...[/yellow]")
-    
-    initial_loss = None
-    final_loss = None
-    
-    for step in range(50):
-        start = np.random.randint(0, len(data) - seq_len - 1)
-        input_seq = data[start:start+seq_len]
-        target_seq = data[start+1:start+seq_len+1]
-        
-        x = Tensor(np.array([input_seq]))
-        y = Tensor(np.array([target_seq]))
-        
-        x = embedding(x)
-        x = pos_enc(x)
-        x = transformer(x)
-        x = ln(x)
-        
-        batch, seq, dim = x.shape
-        x_2d = x.reshape(batch * seq, dim)
-        logits_2d = output_proj(x_2d)
-        logits = logits_2d.reshape(batch, seq, vocab_size)
-        
-        logits_flat = logits.reshape(batch * seq, vocab_size)
-        targets_flat = y.reshape(-1)
-        loss = criterion(logits_flat, targets_flat)
-        
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-        
-        loss_val = float(loss.data)
-        if step == 0:
-            initial_loss = loss_val
-        if step == 49:
-            final_loss = loss_val
-        
-        if step % 10 == 0 or step == 49:
-            console.print(f"  Step {step+1}: Loss = {loss_val:.4f}")
-    
-    decrease = initial_loss - final_loss
-    console.print(f"\n[bold]Results:[/bold]")
-    console.print(f"  Initial: {initial_loss:.4f}")
-    console.print(f"  Final: {final_loss:.4f}")
-    console.print(f"  Decrease: {decrease:.4f}")
-    
-    if decrease > 0.3:
-        console.print(f"  [green]✓ PASS: Loss decreased[/green]")
-        console.print(f"  [dim](arithmetic is harder, so lower threshold)[/dim]")
-        return True
-    else:
-        console.print(f"  [red]✗ FAIL: Loss didn't decrease enough[/red]")
-        return False
-
-
-def run_test_4_tinytalks_level1():
-    """
-    TEST 4: TinyTalks Level 1
-    
-    Q: Hello!
-    A: Hi there!
-    
-    The actual task we want to solve.
-    """
-    console.print("\n" + "=" * 70)
-    console.print("[bold cyan]TEST 4: TinyTalks Level 1[/bold cyan]")
-    console.print("=" * 70)
-    console.print("Task: Learn greeting Q&A pairs from TinyTalks")
-    console.print("Expected: Can respond to greetings")
-    console.print("Why: The actual milestone goal\n")
-    
-    from tinytorch.core.tensor import Tensor
-    from tinytorch.core.optimizers import Adam
-    from tinytorch.core.losses import CrossEntropyLoss
-    from tinytorch.text.embeddings import Embedding, PositionalEncoding
-    from tinytorch.models.transformer import TransformerBlock, LayerNorm
-    from tinytorch.core.layers import Linear
-    
-    # Load TinyTalks Level 1 data
-    try:
-        with open("datasets/tinytalks/splits/train.txt", "r") as f:
-            full_text = f.read()
-        
-        # Heuristic: Level 1 = very short Q&A (< 40 chars each)
-        lines = full_text.split('\n')
-        level_1_text = []
-        for i in range(0, len(lines) - 1, 3):  # Q, A, blank
-            if i+1 < len(lines):
-                q_line = lines[i]
-                a_line = lines[i+1]
-                if q_line.startswith('Q:') and a_line.startswith('A:'):
-                    if len(q_line) < 40 and len(a_line) < 40:
-                        level_1_text.append(q_line + '\n' + a_line + '\n\n')
-        
-        if not level_1_text:
-            console.print("[red]No Level 1 data found, using first 10 Q&A[/red]")
-            level_1_text = [full_text[:500]]
-        
-        text = "".join(level_1_text[:10])  # First 10 simple Q&A
-        
-        console.print(f"Data: {len(text)} chars (Level 1 greetings)")
-        console.print(f"Sample:\n{text[:100]}...\n")
-        
-    except FileNotFoundError:
-        console.print("[red]TinyTalks not found, skipping Test 4[/red]")
-        return None
-    
-    # Tokenize
-    chars = sorted(set(text))
-    vocab_size = len(chars)
-    char_to_idx = {ch: i for i, ch in enumerate(chars)}
-    data = np.array([char_to_idx[ch] for ch in text])
-    
-    console.print(f"Vocab: {vocab_size} chars\n")
-    
-    # Build model (slightly larger for Q&A)
-    embed_dim = 64
-    num_heads = 4
-    seq_len = 32
-    
-    embedding = Embedding(vocab_size, embed_dim)
-    pos_enc = PositionalEncoding(max_seq_len=seq_len, embed_dim=embed_dim)
-    transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=2, dropout_prob=0.1)
-    ln = LayerNorm(embed_dim)
-    output_proj = Linear(embed_dim, vocab_size)
-    
-    params = [embedding.weight] + pos_enc.parameters() + transformer.parameters() + ln.parameters() + output_proj.parameters()
-    for p in params:
-        p.requires_grad = True
-    
-    optimizer = Adam(params, lr=0.005)  # Lower LR for Q&A
-    criterion = CrossEntropyLoss()
-    
-    console.print(f"[yellow]Training (100 steps on TinyTalks Level 1)...[/yellow]")
-    
-    initial_loss = None
-    final_loss = None
-    
-    for step in range(100):
-        if len(data) < seq_len + 1:
-            console.print("[red]Dataset too small[/red]")
-            return None
-        
-        start = np.random.randint(0, len(data) - seq_len - 1)
-        input_seq = data[start:start+seq_len]
-        target_seq = data[start+1:start+seq_len+1]
-        
-        x = Tensor(np.array([input_seq]))
-        y = Tensor(np.array([target_seq]))
-        
-        x = embedding(x)
-        x = pos_enc(x)
-        x = transformer(x)
-        x = ln(x)
-        
-        batch, seq, dim = x.shape
-        x_2d = x.reshape(batch * seq, dim)
-        logits_2d = output_proj(x_2d)
-        logits = logits_2d.reshape(batch, seq, vocab_size)
-        
-        logits_flat = logits.reshape(batch * seq, vocab_size)
-        targets_flat = y.reshape(-1)
-        loss = criterion(logits_flat, targets_flat)
-        
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-        
-        loss_val = float(loss.data)
-        if step == 0:
-            initial_loss = loss_val
-        if step == 99:
-            final_loss = loss_val
-        
-        if step % 20 == 0 or step == 99:
-            console.print(f"  Step {step+1}: Loss = {loss_val:.4f}")
-    
-    decrease = initial_loss - final_loss
-    console.print(f"\n[bold]Results:[/bold]")
-    console.print(f"  Initial: {initial_loss:.4f}")
-    console.print(f"  Final: {final_loss:.4f}")
-    console.print(f"  Decrease: {decrease:.4f}")
-    
-    if decrease > 0.3:
-        console.print(f"  [green]✓ PASS: Model is learning TinyTalks![/green]")
-        console.print(f"  [cyan]→ Now train full model with tinytalks_gpt.py[/cyan]")
-        return True
-    else:
-        console.print(f"  [yellow]⚠ PARTIAL: Some learning, may need more steps[/yellow]")
-        return False
-
-
-def main():
-    """Run all tests in sequence"""
-    console.print("\n")
-    console.print(Panel(
-        "[bold cyan]TinyGPT Learning Diagnostic Suite[/bold cyan]\n\n"
-        "Progressive tests from simplest to complex:\n"
-        "  0. Single sequence memorization (MUST work)\n"
-        "  1. Pattern completion (A B A → B)\n"
-        "  2. Copy task (COPY: X → X)\n"
-        "  3. Simple arithmetic (2+3 → 5)\n"
-        "  4. TinyTalks greetings (Q&A)\n\n"
-        "[yellow]This identifies EXACTLY where learning breaks down[/yellow]",
-        title="🔬 Diagnostic Tests",
-        border_style="cyan",
-        box=box.DOUBLE
-    ))
-    
-    results = {}
-    
-    # Run tests
-    try:
-        results[0] = run_test_0_memorize_sequence()
-    except Exception as e:
-        console.print(f"\n[red]Test 0 crashed: {str(e)}[/red]")
-        results[0] = False
-    
-    # Only run next tests if previous passed
-    if results.get(0):
-        results[1] = run_test_1_pattern_completion()
-        results[2] = run_test_2_copy_task()
-        results[3] = run_test_3_simple_arithmetic()
-        results[4] = run_test_4_tinytalks_level1()
-    
-    # Summary
-    console.print("\n" + "=" * 70)
-    console.print("[bold]Test Summary:[/bold]")
-    console.print("=" * 70)
-    
-    for test_num, result in results.items():
-        if result is True:
-            console.print(f"  Test {test_num}: [green]✓ PASS[/green]")
-        elif result is False:
-            console.print(f"  Test {test_num}: [red]✗ FAIL[/red]")
-        else:
-            console.print(f"  Test {test_num}: [yellow]○ TODO[/yellow]")
-    
-    console.print("\n" + "=" * 70)
-    
-    if results.get(0) is False:
-        console.print("[bold red]CRITICAL: Test 0 failed![/bold red]")
-        console.print("The transformer cannot even memorize a single sequence.")
-        console.print("This indicates a fundamental bug in:")
-        console.print("  - Forward pass computation")
-        console.print("  - Autograd backward pass")
-        console.print("  - Optimizer parameter updates")
-        console.print("  - Loss computation")
-
-
-if __name__ == "__main__":
-    main()
-
--- a/milestones/05_2017_transformer/test_tinytalks_learning.py
+++ b/milestones/05_2017_transformer/test_tinytalks_learning.py
@@ -1,70 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quick diagnostic to test if the model can learn ANY pattern at all.
-"""
-
-import sys
-import os
-import numpy as np
-
-project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.append(project_root)
-
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.optimizers import Adam
-from tinytorch.core.losses import CrossEntropyLoss
-from tinytorch.core.autograd import enable_autograd
-from tinytorch.text.tokenization import CharTokenizer
-
-# Enable autograd
-enable_autograd()
-
-# Super simple test: Can the model learn to predict "A" after "Q:"?
-test_data = """Q: Hello!
-A: Hi there!
-
-Q: What is your name?
-A: I am TinyBot.
-
-Q: What color is the sky?
-A: The sky is blue.
-"""
-
-print("Testing if model can learn simple patterns...")
-print(f"Test data: {repr(test_data[:100])}...")
-
-# Build tokenizer
-tokenizer = CharTokenizer()
-tokenizer.build_vocab([test_data])
-tokens = tokenizer.encode(test_data)
-
-print(f"Vocabulary size: {tokenizer.vocab_size}")
-print(f"Total tokens: {len(tokens)}")
-print(f"First 20 tokens: {tokens[:20]}")
-print(f"Decoded: {repr(tokenizer.decode(tokens[:20]))}")
-
-# Check specific patterns
-q_colon_tokens = tokenizer.encode("Q:")
-print(f"\n'Q:' tokens: {q_colon_tokens}")
-print(f"'Q:' decoded: {repr(tokenizer.decode(q_colon_tokens))}")
-
-a_colon_tokens = tokenizer.encode("A:")
-print(f"'A:' tokens: {a_colon_tokens}")
-print(f"'A:' decoded: {repr(tokenizer.decode(a_colon_tokens))}")
-
-# Find all occurrences of "Q:" followed by space/newline then "A:"
-print("\nPattern analysis:")
-text_str = test_data
-q_count = text_str.count("Q:")
-a_count = text_str.count("A:")
-print(f"'Q:' appears: {q_count} times")
-print(f"'A:' appears: {a_count} times")
-
-print("\n✅ Tokenizer is working correctly!")
-print("\nConclusion: The model should be able to learn that 'A:' follows 'Q:'")
-print("If it's generating garbage, the model is either:")
-print("  1. Too small (need more parameters)")
-print("  2. Not trained enough (need more epochs)")
-print("  3. Learning rate is wrong")
-print("  4. Or there's a bug in the training loop")
-
--- a/milestones/05_2017_transformer/tinystories_gpt.py
+++ b/milestones/05_2017_transformer/tinystories_gpt.py
@@ -1,604 +0,0 @@
-#!/usr/bin/env python3
-"""
-TinyStories Text Generation (2017) - Transformer Era
-====================================================
-
-📚 HISTORICAL CONTEXT:
-In 2017, Vaswani et al. published "Attention Is All You Need", showing that
-attention mechanisms alone (no RNNs!) could achieve state-of-the-art results
-on sequence tasks. This breakthrough launched the era of GPT, BERT, and modern LLMs.
-
-🎯 WHAT YOU'RE BUILDING:
-Using YOUR TinyTorch implementations, you'll build a character-level language model  
-that generates simple stories - proving YOUR attention mechanism works!
-
-TinyStories is MUCH EASIER than Shakespeare:
- Simple vocabulary (children's stories vs archaic English)
- Clear sentence structure
- Designed specifically for small models like ours!
- Faster convergence and better results
-
-✅ REQUIRED MODULES (Run after Module 13):
-━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-  Module 02 (Tensor)       : YOUR data structure with autograd
-  Module 03 (Activations)  : YOUR ReLU in feed-forward networks
-  Module 04 (Layers)       : YOUR Linear layers
-  Module 08 (Optimizers)   : YOUR Adam optimizer
-  Module 10 (Tokenization) : YOUR CharTokenizer for text→numbers
-  Module 11 (Embeddings)   : YOUR token & positional embeddings
-  Module 12 (Attention)    : YOUR multi-head self-attention
-  Module 13 (Transformers) : YOUR LayerNorm + TransformerBlock
-━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-
-🏗️ ARCHITECTURE (Character-Level Language Model):
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                               Output Predictions                             │
-    │                         Character Probabilities (vocab_size)                 │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                            Output Projection                                 │
-    │                       Module 04: vectors → vocabulary                        │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                              Layer Norm                                      │
-    │                        Module 13: Final normalization                        │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ╔══════════════════════════════════════════════════════════════════════════════╗
-    ║                      Transformer Block × N (Repeat)                          ║
-    ║  ┌────────────────────────────────────────────────────────────────────────┐  ║
-    ║  │                       Feed Forward Network                             │  ║
-    ║  │              Module 04: Linear → ReLU → Linear                         │  ║
-    ║  └────────────────────────────────────────────────────────────────────────┘  ║
-    ║                                  ▲                                           ║
-    ║  ┌────────────────────────────────────────────────────────────────────────┐  ║
-    ║  │                    Multi-Head Self-Attention                           │  ║
-    ║  │           Module 12: Query·Key^T·Value across all positions            │  ║
-    ║  └────────────────────────────────────────────────────────────────────────┘  ║
-    ╚══════════════════════════════════════════════════════════════════════════════╝
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                          Positional Encoding                                 │
-    │                   Module 11: Add position information                        │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                         Character Embeddings                                 │
-    │                    Module 11: chars → embed_dim vectors                      │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                            Input Characters                                  │
-    │                    "To be or not to be, that is..."                          │
-    └──────────────────────────────────────────────────────────────────────────────┘
-
-📊 EXPECTED PERFORMANCE:
- Dataset: ~21MB TinyStories validation set (simple children's stories)
- Training time: 30-45 minutes (proper training, faster than Shakespeare!)
- Vocabulary: ~90 unique characters (simple English)
- Expected: Coherent simple stories with proper grammar
- Parameters: ~4.8M (perfect size for this task)
-"""
-
-import sys
-import os
-import numpy as np
-import argparse
-import time
-from rich.console import Console
-from rich.panel import Panel
-from rich.table import Table
-from rich import box
-
-# Add project root to path
-project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.append(project_root)
-
-console = Console()
-
-# Import TinyTorch components YOU BUILT!
-from tinytorch.core.tensor import Tensor                    # Module 02: YOU built this!
-from tinytorch.core.layers import Linear                    # Module 04: YOU built this!
-from tinytorch.core.activations import ReLU, Softmax        # Module 03: YOU built this!
-from tinytorch.core.optimizers import Adam                  # Module 08: YOU built this!
-from tinytorch.core.losses import CrossEntropyLoss          # Module 04: YOU built this!
-from tinytorch.text.tokenization import CharTokenizer       # Module 10: YOU built this!
-from tinytorch.text.embeddings import Embedding, PositionalEncoding   # Module 11: YOU built this!
-from tinytorch.core.attention import MultiHeadAttention     # Module 12: YOU built this!
-from tinytorch.models.transformer import LayerNorm, TransformerBlock  # Module 13: YOU built this!
-from tinytorch.data.loader import DataLoader, Dataset   # Module 08: YOU built this!
-
-# Import dataset manager
-from data_manager import DatasetManager
-
-
-class TinyStoriesDataset(Dataset):
-    """
-    Character-level TinyStories dataset using YOUR Dataset interface (Module 08)
-    and YOUR CharTokenizer (Module 10)!
-    
-    Tokenizes simple children's stories into characters for language modeling.
-    Much easier to learn than Shakespeare!
-    """
-    
-    def __init__(self, text, seq_length=64):
-        """
-        Initialize dataset with text and sequence length.
-        
-        Args:
-            text: Raw Shakespeare text
-            seq_length: Length of input sequences
-        """
-        # Use YOUR CharTokenizer from Module 10!
-        self.tokenizer = CharTokenizer()
-        self.tokenizer.build_vocab([text])  # Build vocabulary from Shakespeare corpus
-        self.vocab_size = self.tokenizer.vocab_size
-        
-        # Convert text to indices using YOUR tokenizer!
-        self.data = self.tokenizer.encode(text)
-        self.seq_length = seq_length
-        
-        # Calculate number of sequences
-        self.num_sequences = len(self.data) - seq_length
-        
-    def __getitem__(self, idx):
-        """Get a single training sequence - YOUR Dataset interface!"""
-        # Input: characters at positions [idx, idx+seq_length)
-        # Target: characters at positions [idx+1, idx+seq_length+1)
-        input_seq = self.data[idx:idx + self.seq_length]
-        target_seq = self.data[idx + 1:idx + self.seq_length + 1]
-        
-        return Tensor(np.array(input_seq, dtype=np.int32)), Tensor(np.array(target_seq, dtype=np.int32))
-    
-    def __len__(self):
-        """Return dataset size - YOUR Dataset interface!"""
-        return self.num_sequences
-    
-    def decode(self, indices):
-        """Convert indices back to text using YOUR tokenizer!"""
-        return self.tokenizer.decode(indices)
-
-
-class TinyGPT:
-    """
-    Character-level Transformer Language Model using YOUR TinyTorch!
-    
-    This architecture is what powers GPT, ChatGPT, and modern LLMs.
-    """
-    
-    def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers):
-        # Token representation
-        self.embedding = Embedding(vocab_size, embed_dim)           # Module 11!
-        self.pos_encoding = PositionalEncoding(max_length, embed_dim)  # Module 11!
-
-        # Transformer stack
-        self.layers = []
-        mlp_ratio = 4  # Standard 4x expansion in FFN (embed_dim * 4)
-        for _ in range(num_layers):
-            block = TransformerBlock(embed_dim, num_heads, mlp_ratio)  # Module 13!
-            self.layers.append(block)
-
-        # Output head
-        self.layer_norm = LayerNorm(embed_dim)          # Module 13!
-        self.output_proj = Linear(embed_dim, vocab_size)  # Module 04!
-        
-        self.vocab_size = vocab_size
-        self.embed_dim = embed_dim
-        self.num_layers = num_layers
-        self.num_heads = num_heads
-        
-        # Calculate parameters
-        self.total_params = self._count_parameters()
-    
-    def _count_parameters(self):
-        """Count total parameters in model."""
-        count = 0
-        for param in self.parameters():
-            count += param.data.size
-        return count
-
-    def parameters(self):
-        """Get all trainable parameters from YOUR model."""
-        params = []
-        # Embedding parameters
-        params.extend([self.embedding.weight])
-        params.extend(self.pos_encoding.parameters())  # Add positional encoding params!
-        # Transformer block parameters
-        for layer in self.layers:
-            if hasattr(layer, 'parameters'):
-                if callable(layer.parameters):
-                    params.extend(layer.parameters())
-                else:
-                    params.extend(layer.parameters)
-        # Output projection parameters
-        params.extend([self.layer_norm.gamma, self.layer_norm.beta])
-        params.extend([self.output_proj.weight, self.output_proj.bias])
-        
-        # Ensure all parameters have requires_grad=True
-        for param in params:
-            param.requires_grad = True
-        
-        return params
-
-    def forward(self, x):
-        """Forward pass through YOUR transformer stack."""
-        # Convert tokens to contextual vectors
-        x = self.embedding.forward(x)        # Module 11: char → vectors
-        x = self.pos_encoding.forward(x)     # Module 11: add position info
-        
-        # Process through transformer layers
-        for layer in self.layers:
-            x = layer.forward(x)  # Module 13: Attention → FFN
-        
-        # Generate predictions
-        x = self.layer_norm.forward(x)       # Module 13: final norm
-
-        # Reshape for Linear layer - KEEP COMPUTATION GRAPH!
-        batch_size, seq_len, embed_dim = x.shape
-        x_2d = x.reshape(batch_size * seq_len, embed_dim)  # Use Tensor.reshape()
-
-        # Apply output projection
-        logits_2d = self.output_proj(x_2d)   # Module 04: vocab predictions
-
-        # Reshape back - KEEP COMPUTATION GRAPH!
-        logits = logits_2d.reshape(batch_size, seq_len, self.vocab_size)  # Use Tensor.reshape()
-        
-        return logits
-
-
-def visualize_transformer():
-    """Show how transformers process text sequences."""
-    console.print("")
-    console.print(Panel.fit(
-        "[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n"
-        "[yellow]The Problem:[/yellow]\n"
-        "RNNs process sequences one step at a time\n"
-        "Can't parallelize → slow training on long sequences\n"
-        "Struggle with long-range dependencies\n\n"
-        "[green]The Innovation:[/green]\n"
-        "Transformers: Attention mechanisms process ENTIRE sequences in parallel\n"
-        "  • Self-attention: Every token attends to every other token\n"
-        "  • Multi-head attention: Learn multiple attention patterns\n"
-        "  • Positional encoding: Preserve sequence order\n\n"
-        "[bold]Can attention alone match RNN performance?[/bold]",
-        title="🎯 ACT 1: THE CHALLENGE",
-        border_style="cyan",
-        box=box.DOUBLE
-    ))
-    
-    console.print("""
-    How YOUR Transformer Sees Text:      What It Learns:
-    
-    Input: "To be or not to be"          Layer 1 (Attention):
-    ┌─────────────────────┐              • Each word attends to others
-    │ T o   b e   o r ... │              • "be" looks at "To", "or", etc.
-    └─────────────────────┘              • Captures dependencies
-            ↓                            
-    Character Embeddings                 Layer 2-4 (Deep Attention):
-    ┌─────────────────────┐              • Builds complex patterns
-    │ 128-dim vectors     │              • Grammar, style, meaning
-    │ for each character  │              • Shakespeare-specific patterns
-    └─────────────────────┘              
-            ↓                            Output Prediction:
-    Position Encoding                    "To be or not to be, that is the"
-    ┌─────────────────────┐                                         ↓
-    │ Add positional info │              Next char probabilities:
-    │ (order matters!)    │              't' → 0.85  (highest!)
-    └─────────────────────┘              'n' → 0.03
-            ↓                            'a' → 0.02
-    Transformer Layers ×4                ...
-    ┌─────────────────────┐
-    │ Self-Attention      │              Key Transformer Insight:
-    │ Feed-Forward        │              Unlike RNNs, attention lets each
-    │ Layer Norm          │              position look at ALL others
-    └─────────────────────┘              simultaneously - capturing long-range
-            ↓                            dependencies in O(1) operations!
-    Character Predictions
-    ┌─────────────────────┐
-    │ Probability for     │
-    │ each next character │
-    └─────────────────────┘
-    """)
-    print("="*70)
-
-
-def train_tinystories_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.01):
-    """Train TinyGPT using YOUR complete training system with DataLoader!"""
-    console.print("\n[bold]🚀 Training TinyStories TinyGPT with YOUR TinyTorch![/bold]")
-    console.print(f"  Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences")
-    console.print(f"  Batch size: [cyan]{train_loader.batch_size}[/cyan]")
-    console.print(f"  Learning rate: [cyan]{learning_rate}[/cyan] (1e-2, optimal for 4.8M param model)")
-    console.print(f"  YOUR DataLoader (Module 08) handles batching!")
-    console.print(f"  YOUR Adam optimizer (Module 08)")
-    console.print(f"  YOUR CrossEntropyLoss (Module 04) with autograd!")
-    
-    # YOUR optimizer and loss function
-    # Using 1e-2 learning rate (optimal for our 4.8M param model, validated by debug script)
-    # Note: Large models (100M+) use 3e-4, but smaller models need higher LR
-    optimizer = Adam(model.parameters(), lr=learning_rate)
-    loss_fn = CrossEntropyLoss()  # YOUR loss function with autograd!
-    
-    for epoch in range(epochs):
-        console.print(f"\n  [bold]Epoch {epoch+1}/{epochs}:[/bold]")
-        epoch_loss = 0
-        batch_count = 0
-        
-        # Use YOUR DataLoader to iterate through batches!
-        for batch_idx, (batch_input, batch_target) in enumerate(train_loader):
-            if batch_idx >= 500:  # Training mode - process more batches
-                break
-            
-            if batch_idx == 0:
-                console.print(f"    [dim]Processing first batch... (this may take a moment)[/dim]")
-            
-            # Forward pass with YOUR Transformer
-            logits = model(batch_input)  # YOUR attention mechanism!
-            
-            # Reshape for loss computation: (batch, seq, vocab) -> (batch*seq, vocab)
-            # IMPORTANT: Use Tensor.reshape() to preserve computation graph!
-            batch_size, seq_length, vocab_size = logits.shape
-            logits_2d = logits.reshape(batch_size * seq_length, vocab_size)
-            targets_1d = batch_target.reshape(-1)
-            
-            # Compute loss with YOUR CrossEntropyLoss (connects to autograd!)
-            loss = loss_fn.forward(logits_2d, targets_1d)  # Module 04 + Module 05!
-            loss_value = float(loss.data)
-            
-            # Backward pass with YOUR autograd
-            optimizer.zero_grad()  # Module 08!
-            loss.backward()        # Module 05: YOUR autodiff!
-            optimizer.step()       # Module 08!
-            
-            epoch_loss += loss_value
-            batch_count += 1
-            
-            # Progress - show output frequently so user sees continuous training
-            if batch_idx == 0 or (batch_idx + 1) % 10 == 0 or (batch_idx + 1) % 50 == 0:
-                avg_loss = epoch_loss / batch_count
-                console.print(f"    Batch {batch_idx+1}/500 | Loss: {loss_value:.4f} | Avg: {avg_loss:.4f}")
-        
-        # Epoch summary
-        avg_loss = epoch_loss / max(1, batch_count)
-        console.print(f"    → Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)")
-    
-    return model
-
-
-def generate_text(model, dataset, prompt="To be or not", max_length=200, temperature=0.8):
-    """
-    Generate text from a prompt - THE WOW MOMENT!
-    
-    This is autoregressive generation: predict next char, add it, repeat.
-    """
-    console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]")
-    console.print("="*70)
-    
-    # Convert prompt to indices
-    prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx]
-    generated = prompt_indices.copy()
-    
-    console.print(f"📝 Prompt: [cyan]\"{prompt}\"[/cyan]")
-    console.print(f"🎯 Generating [cyan]{max_length}[/cyan] characters...\n")
-    
-    # Generate character by character
-    for _ in range(max_length):
-        # Take last seq_length characters as input
-        input_seq = generated[-dataset.seq_length:] if len(generated) >= dataset.seq_length else generated
-        
-        # Pad if necessary
-        if len(input_seq) < dataset.seq_length:
-            input_seq = [0] * (dataset.seq_length - len(input_seq)) + input_seq
-        
-        # Forward pass
-        input_tensor = Tensor(np.array([input_seq], dtype=np.int32))
-        logits = model(input_tensor)
-        
-        # Get logits for last position
-        logits_np = np.array(logits.data.data if hasattr(logits.data, 'data') else logits.data)
-        next_logits = logits_np[0, -1, :]  # Last position predictions
-        
-        # Apply temperature and sample
-        next_logits = next_logits / temperature
-        exp_logits = np.exp(next_logits - np.max(next_logits))
-        probs = exp_logits / np.sum(exp_logits)
-        
-        # Sample from distribution
-        next_idx = np.random.choice(len(probs), p=probs)
-        generated.append(next_idx)
-    
-    # Decode to text
-    generated_text = dataset.decode(generated)
-    
-    console.print("[bold]📖 Generated Text:[/bold]")
-    console.print("─" * 70)
-    console.print(f"[green]{generated_text}[/green]")
-    console.print("─" * 70)
-    
-    return generated_text
-
-
-def analyze_transformer_systems(model):
-    """Analyze YOUR Transformer from an ML systems perspective."""
-    console.print("")
-    console.print(Panel.fit(
-        f"[bold]Model Architecture:[/bold]\n"
-        f"  • Parameters: [cyan]{model.total_params:,}[/cyan] weights\n"
-        f"  • Embedding dim: [cyan]{model.embed_dim}[/cyan]\n"
-        f"  • Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n"
-        
-        "[bold]Computational Complexity:[/bold]\n"
-        "  • Attention: O(n²·d) where n=sequence, d=dimension\n"
-        "  • Self-attention allows parallel processing (vs RNN sequential)\n"
-        "  • YOUR implementation: Pure Python + NumPy\n\n"
-        
-        f"[bold]Memory Requirements:[/bold]\n"
-        f"  • Parameters: [cyan]{model.total_params * 4 / 1024:.1f} KB[/cyan]\n"
-        "  • Attention matrices: O(n²) per layer\n"
-        "  • YOUR TinyTorch tracks gradients automatically\n\n"
-        
-        "[bold]🏛️ Transformer Evolution:[/bold]\n"
-        "  • 2017: Vaswani et al. 'Attention Is All You Need'\n"
-        "  • 2018: BERT (bidirectional), GPT (autoregressive)\n"
-        "  • 2020: GPT-3 (175B params, same architecture!)\n"
-        "  • 2022: ChatGPT (YOUR architecture at massive scale)\n"
-        "  • YOUR TinyGPT: Core principles that power them all!\n\n"
-        
-        "[bold]💡 Why Transformers Dominate:[/bold]\n"
-        "  • Parallelizable (vs sequential RNNs)\n"
-        "  • Long-range dependencies (attention sees everything)\n"
-        "  • Scalable (architecture works from 1M to 175B params)\n"
-        "  • YOUR implementation demonstrates all of these!",
-        
-        title="🔬 SYSTEMS ANALYSIS",
-        border_style="cyan",
-        box=box.DOUBLE
-    ))
-
-
-def main():
-    """Demonstrate Shakespeare text generation using YOUR TinyTorch!"""
-    
-    parser = argparse.ArgumentParser(description='Shakespeare Transformer 2017')
-    parser.add_argument('--test-only', action='store_true',
-                       help='Test architecture only')
-    parser.add_argument('--epochs', type=int, default=20,
-                       help='Training epochs')
-    parser.add_argument('--batch-size', type=int, default=32,
-                       help='Batch size')
-    parser.add_argument('--seq-length', type=int, default=128,
-                       help='Sequence length')
-    parser.add_argument('--embed-dim', type=int, default=256,
-                       help='Embedding dimension')
-    parser.add_argument('--num-layers', type=int, default=6,
-                       help='Number of transformer layers')
-    parser.add_argument('--num-heads', type=int, default=8,
-                       help='Number of attention heads')
-    parser.add_argument('--visualize', action='store_true', default=True,
-                       help='Show transformer visualization')
-    parser.add_argument('--quick-test', action='store_true',
-                       help='Use small subset for testing')
-    args = parser.parse_args()
-    
-    console.print("")
-    console.print(Panel.fit(
-        "[bold cyan]TinyStories Transformer - Simple Story Generation![/bold cyan]\n\n"
-        "[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n"
-        "[green]YOUR achievement:[/green] Generate coherent children's stories\n"
-        "[cyan]Components used:[/cyan] YOUR complete NLP pipeline (Modules 2, 3, 4, 8, 10, 11, 12, 13)\n"
-        "[dim]Note: TinyStories is much easier than Shakespeare - designed for small models![/dim]",
-        title="🎯 Milestone 05: Transformer Era (2017)",
-        border_style="cyan",
-        box=box.DOUBLE
-    ))
-    
-    # Visualization
-    if args.visualize:
-        visualize_transformer()
-    
-    # Step 1: Load TinyStories dataset
-    console.print("\n[bold]📥 Loading TinyStories dataset...[/bold]")
-    
-    # Load TinyStories from downloaded file
-    tinystories_path = os.path.join(
-        os.path.dirname(__file__), 
-        '../datasets/tinystories/tinystories_val.txt'
-    )
-    
-    if not os.path.exists(tinystories_path):
-        console.print(f"[red]❌ TinyStories not found at {tinystories_path}[/red]")
-        console.print("[yellow]Run: python milestones/05_2017_transformer/download_tinystories.py[/yellow]")
-        return
-    
-    with open(tinystories_path, 'r', encoding='utf-8') as f:
-        text = f.read()
-    
-    console.print(f"📊 Loaded: {len(text):,} characters, {len(text.split()):,} words")
-    
-    if args.quick_test:
-        text = text[:100000]  # Use small subset for testing (100K chars)
-        console.print("  [dim](Using 100K char subset for quick testing)[/dim]")
-    
-    # Step 2: Create Dataset and DataLoader using YOUR Module 08!
-    console.print(f"\n[bold]📦 Creating YOUR Dataset and DataLoader (Module 08)...[/bold]")
-    dataset = TinyStoriesDataset(text, seq_length=args.seq_length)
-    
-    # YOUR DataLoader handles batching and shuffling!
-    train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
-    
-    console.print(f"  Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters")
-    console.print(f"  Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]")
-    console.print(f"  DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]")
-    
-    # Step 3: Build Transformer
-    model = TinyGPT(
-        vocab_size=dataset.vocab_size,
-        embed_dim=args.embed_dim,
-        max_length=args.seq_length,
-        num_heads=args.num_heads,
-        num_layers=args.num_layers
-    )
-    
-    # Display model info
-    console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]")
-    console.print(f"  Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings")
-    console.print(f"  Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters")
-    console.print(f"  Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)")
-    
-    if args.test_only:
-        console.print("\n[bold yellow]🧪 ARCHITECTURE TEST MODE[/bold yellow]")
-        # Test with minimal data
-        test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32))
-        test_output = model(test_input)
-        console.print(f"[green]✅ Forward pass successful! Output shape: {test_output.data.shape}[/green]")
-        console.print(f"[green]✅ YOUR Transformer + DataLoader work together![/green]")
-        return
-    
-    # Step 4: Train using YOUR DataLoader
-    start_time = time.time()
-    model = train_tinystories_gpt(model, train_loader, dataset, epochs=args.epochs)
-    train_time = time.time() - start_time
-    
-    # Step 5: Generate text!
-    generated = generate_text(model, dataset, prompt="Once upon a time", max_length=200)
-    
-    # Additional generation examples
-    console.print("\n[bold]🎭 More Generation Examples:[/bold]")
-    console.print("─" * 70)
-    
-    prompts = ["ROMEO:", "The king", "What is"]
-    for prompt in prompts:
-        if all(ch in dataset.char_to_idx for ch in prompt):
-            console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]")
-            gen = generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8)
-    
-    # Step 6: Systems Analysis
-    analyze_transformer_systems(model)
-    
-    console.print(f"\n[bold]⏱️  Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds")
-    console.print(f"  Sequences/sec: [cyan]{len(dataset) * args.epochs / train_time:.0f}[/cyan]")
-    
-    console.print("")
-    console.print(Panel.fit(
-        "[bold green]✅ SUCCESS! Shakespeare Transformer Milestone Complete![/bold green]\n\n"
-        
-        "[bold]🎓 What YOU Accomplished:[/bold]\n"
-        "  • YOUR attention mechanism processes sequences in parallel\n"
-        "  • YOUR transformer captures long-range text dependencies\n"
-        "  • YOUR DataLoader efficiently batches character sequences\n"
-        "  • YOUR TinyGPT generates coherent text!\n"
-        "  • YOUR complete language modeling system works!\n\n"
-        
-        "[bold]🚀 Next Steps:[/bold]\n"
-        "  • Continue to Module 14 (KV-Caching) for 3x faster inference\n"
-        "  • YOUR transformer architecture scales to GPT-scale models\n"
-        "  • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!",
-        
-        title="🌟 2017 Transformer Revolution Complete",
-        border_style="green",
-        box=box.DOUBLE
-    ))
-
-if __name__ == "__main__":
-    main()
--- a/milestones/05_2017_transformer/tinytalks_chatbot.py
+++ b/milestones/05_2017_transformer/tinytalks_chatbot.py
@@ -1,375 +0,0 @@
-"""
-TinyTalks Chatbot - Train a Simple Conversational AI in 10-15 Minutes
-======================================================================
-
-A minimal but functional chatbot trained on simple Q&A pairs.
-
-Goal: Show that transformers can learn conversational patterns quickly!
-"""
-
-import sys
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-
-import numpy as np
-import time
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.autograd import enable_autograd
-from tinytorch.core.optimizers import Adam
-from tinytorch.core.losses import CrossEntropyLoss
-from tinytorch.models.transformer import GPT
-from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
-
-enable_autograd()
-
-# ============================================================================
-# Tokenization
-# ============================================================================
-
-def create_tokenizer(conversations):
-    """Create character-level tokenizer with special tokens."""
-    # Get all unique characters
-    all_text = ' '.join([q + ' ' + a for q, a in conversations])
-    all_chars = sorted(set(all_text))
-    
-    # Special tokens
-    special_tokens = {
-        '<PAD>': 0,
-        '<SOS>': 1,  # Start of sequence
-        '<SEP>': 2,  # Separator between Q and A
-        '<EOS>': 3,  # End of sequence
-    }
-    
-    # Character mappings
-    char_to_idx = {**special_tokens}
-    idx_to_char = {v: k for k, v in special_tokens.items()}
-    
-    for idx, char in enumerate(all_chars, start=len(special_tokens)):
-        char_to_idx[char] = idx
-        idx_to_char[idx] = char
-    
-    return char_to_idx, idx_to_char
-
-
-def encode_conversation(question, answer, char_to_idx, max_len=80):
-    """
-    Encode Q&A pair as: <SOS> question <SEP> answer <EOS> <PAD>...
-    
-    Example:
-    Q: "Hi"
-    A: "Hello"
-    → [<SOS>, H, i, <SEP>, H, e, l, l, o, <EOS>, <PAD>, ...]
-    """
-    # Build sequence
-    tokens = [char_to_idx['<SOS>']]
-    
-    # Add question
-    for c in question:
-        tokens.append(char_to_idx.get(c, 0))
-    
-    # Add separator
-    tokens.append(char_to_idx['<SEP>'])
-    
-    # Add answer
-    for c in answer:
-        tokens.append(char_to_idx.get(c, 0))
-    
-    # Add EOS
-    tokens.append(char_to_idx['<EOS>'])
-    
-    # Pad
-    if len(tokens) < max_len:
-        tokens = tokens + [char_to_idx['<PAD>']] * (max_len - len(tokens))
-    else:
-        tokens = tokens[:max_len]
-    
-    return tokens
-
-
-def decode_tokens(tokens, idx_to_char, stop_at_eos=True):
-    """Decode tokens to string."""
-    chars = []
-    for t in tokens:
-        if t == 0:  # PAD
-            if stop_at_eos:
-                break
-        elif t == 1:  # SOS
-            continue
-        elif t == 2:  # SEP
-            chars.append(' | ')
-        elif t == 3:  # EOS
-            if stop_at_eos:
-                break
-        else:
-            chars.append(idx_to_char.get(t, '?'))
-    return ''.join(chars)
-
-
-# ============================================================================
-# Training
-# ============================================================================
-
-def train_chatbot(model, optimizer, loss_fn, train_data, max_time_minutes=10):
-    """
-    Train TinyTalks chatbot.
-    """
-    max_time_seconds = max_time_minutes * 60
-    
-    print("=" * 70)
-    print(f"TRAINING TINYTALKS CHATBOT FOR {max_time_minutes} MINUTES")
-    print("=" * 70)
-    print(f"Dataset: {len(train_data)} conversations")
-    print(f"Time limit: {max_time_seconds}s ({max_time_minutes} minutes)")
-    print()
-    
-    start_time = time.time()
-    losses = []
-    step = 0
-    
-    # Progress checkpoints every 2 minutes
-    checkpoint_interval = 120  # 2 minutes
-    next_checkpoint = checkpoint_interval
-    
-    print("Training started...")
-    print()
-    
-    while True:
-        elapsed = time.time() - start_time
-        if elapsed >= max_time_seconds:
-            break
-        
-        # Sample random conversation
-        tokens = train_data[np.random.randint(len(train_data))]
-        
-        # Next token prediction
-        input_seq = tokens[:-1]
-        target_seq = tokens[1:]
-        
-        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
-        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)
-        
-        # Forward
-        logits = model.forward(x)
-        
-        # Loss
-        batch_size, seq_len, vocab_size = logits.shape
-        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
-        targets_flat = y_true.reshape(batch_size * seq_len)
-        loss = loss_fn.forward(logits_flat, targets_flat)
-        
-        # Backward
-        optimizer.zero_grad()
-        loss.backward()
-        
-        # Clip gradients
-        for param in model.parameters():
-            if param.grad is not None:
-                np.clip(param.grad, -1.0, 1.0, out=param.grad)
-        
-        # Update
-        optimizer.step()
-        
-        losses.append(loss.data.item())
-        step += 1
-        
-        # Show progress at checkpoints
-        if elapsed >= next_checkpoint:
-            avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-            steps_per_sec = step / elapsed
-            mins = int(elapsed / 60)
-            print(f"[{mins:2d} min] Step {step:5d} | Loss: {avg_loss:.4f} | Speed: {steps_per_sec:.1f} steps/sec")
-            next_checkpoint += checkpoint_interval
-        
-        # Also show every 500 steps for early progress
-        if step % 500 == 0 and step <= 2000:
-            avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-            print(f"[{int(elapsed):4d}s] Step {step:5d} | Loss: {avg_loss:.4f}")
-    
-    final_elapsed = time.time() - start_time
-    final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-    initial_loss = np.mean(losses[:10])
-    improvement = (1 - final_loss / initial_loss) * 100
-    
-    print()
-    print("=" * 70)
-    print("TRAINING COMPLETE")
-    print("=" * 70)
-    print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.1f} minutes)")
-    print(f"Total steps: {step:,}")
-    print(f"Steps/second: {step/final_elapsed:.1f}")
-    print(f"Initial loss: {initial_loss:.4f}")
-    print(f"Final loss: {final_loss:.4f}")
-    print(f"Improvement: {improvement:.1f}%")
-    print()
-    
-    return losses, step
-
-
-# ============================================================================
-# Generation / Chat
-# ============================================================================
-
-def generate_response(model, question, char_to_idx, idx_to_char, max_len=50):
-    """
-    Generate response to a question.
-    
-    Process:
-    1. Encode: <SOS> question <SEP>
-    2. Generate tokens until <EOS> or max_len
-    3. Decode generated tokens
-    """
-    # Encode question
-    tokens = [char_to_idx['<SOS>']]
-    for c in question:
-        tokens.append(char_to_idx.get(c, 0))
-    tokens.append(char_to_idx['<SEP>'])
-    
-    # Generate response
-    generated_tokens = []
-    for _ in range(max_len):
-        # Pad input to model's expected length
-        input_tokens = tokens + generated_tokens
-        while len(input_tokens) < 80:  # Match training max_len
-            input_tokens.append(char_to_idx['<PAD>'])
-        input_tokens = input_tokens[:80]
-        
-        # Forward pass
-        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
-        logits = model.forward(x)
-        
-        # Get next token (position after current sequence)
-        next_pos = len(tokens) + len(generated_tokens) - 1
-        if next_pos < logits.shape[1]:
-            next_logits = logits.data[0, next_pos, :]
-            next_token = int(np.argmax(next_logits))
-            
-            # Stop at EOS or PAD
-            if next_token == char_to_idx['<EOS>'] or next_token == char_to_idx['<PAD>']:
-                break
-            
-            generated_tokens.append(next_token)
-        else:
-            break
-    
-    # Decode generated response
-    response = decode_tokens(generated_tokens, idx_to_char, stop_at_eos=False)
-    return response
-
-
-def test_chatbot(model, test_questions, char_to_idx, idx_to_char):
-    """Test chatbot on sample questions."""
-    print("=" * 70)
-    print("TESTING CHATBOT")
-    print("=" * 70)
-    print()
-    
-    for question in test_questions:
-        response = generate_response(model, question, char_to_idx, idx_to_char)
-        print(f"Q: {question}")
-        print(f"A: {response}")
-        print()
-
-
-# ============================================================================
-# Main
-# ============================================================================
-
-def main():
-    print()
-    print("=" * 70)
-    print("TINYTALKS CHATBOT - 10-15 MINUTE TRAINING")
-    print("=" * 70)
-    print()
-    
-    # Load dataset
-    conversations = create_tinytalks_dataset()
-    stats = get_dataset_stats()
-    
-    print(f"Dataset: {stats['total_examples']} examples ({stats['unique_examples']} unique)")
-    print(f"Repetition: {stats['repetition_factor']:.1f}x for better learning")
-    print(f"Avg lengths: Q={stats['avg_question_len']:.1f} chars, A={stats['avg_answer_len']:.1f} chars")
-    print()
-    
-    # Create tokenizer
-    char_to_idx, idx_to_char = create_tokenizer(conversations)
-    vocab_size = len(idx_to_char)
-    print(f"Vocabulary: {vocab_size} tokens (including special tokens)")
-    print()
-    
-    # Encode dataset
-    max_seq_len = 80
-    train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations]
-    
-    # Model: Ultra-tiny for speed (learned from 5-min test!)
-    # Target: ~20-30 steps/sec with longer sequences
-    # In 10 mins (600s): ~12,000-18,000 steps
-    config = {
-        'vocab_size': vocab_size,
-        'embed_dim': 16,      # Keep it tiny!
-        'num_layers': 1,      # Just 1 layer
-        'num_heads': 2,       # 2 heads
-        'max_seq_len': max_seq_len,
-    }
-    
-    print("Model configuration:")
-    for key, val in config.items():
-        print(f"  {key}: {val}")
-    print()
-    
-    model = GPT(**config)
-    num_params = sum(np.prod(p.shape) for p in model.parameters())
-    print(f"Parameters: {num_params:,}")
-    print()
-    
-    # Optimizer
-    optimizer = Adam(model.parameters(), lr=0.001)
-    loss_fn = CrossEntropyLoss()
-    
-    # Train for 15 minutes (adjustable)
-    train_time = 15  # minutes
-    print(f"Training for {train_time} minutes...")
-    print()
-    
-    losses, total_steps = train_chatbot(
-        model=model,
-        optimizer=optimizer,
-        loss_fn=loss_fn,
-        train_data=train_data,
-        max_time_minutes=train_time
-    )
-    
-    # Test with sample questions
-    test_questions = [
-        "Hi",
-        "How are you",
-        "What is your name",
-        "What is the sky",
-        "Is grass green",
-        "What is 1 plus 1",
-        "Are you happy",
-        "Bye",
-    ]
-    
-    print("Testing chatbot responses...")
-    print()
-    test_chatbot(model, test_questions, char_to_idx, idx_to_char)
-    
-    # Summary
-    print("=" * 70)
-    print("TINYTALKS SUMMARY")
-    print("=" * 70)
-    print(f"✓ Model: {num_params:,} parameters")
-    print(f"✓ Training: {train_time} minutes, {total_steps:,} steps")
-    print(f"✓ Loss: {np.mean(losses[:10]):.4f} → {np.mean(losses[-100:]):.4f}")
-    print(f"✓ Improvement: {(1 - np.mean(losses[-100:])/np.mean(losses[:10]))*100:.1f}%")
-    print()
-    print("Try it yourself:")
-    print("  1. Ask simple questions from the training set")
-    print("  2. The model should generate learned responses")
-    print("  3. Experiment with model size and training time!")
-    print()
-
-
-if __name__ == "__main__":
-    main()
-
--- a/milestones/05_2017_transformer/tinytalks_dashboard.py
+++ b/milestones/05_2017_transformer/tinytalks_dashboard.py
@@ -1,546 +0,0 @@
-"""
-TinyTalks Interactive Dashboard - Watch Learning Happen Live!
-=============================================================
-
-A beautiful, educational dashboard showing a transformer learn to chat.
-
-Students see:
- Live training metrics
- Responses improving from gibberish to coherent
- Real-time checkpoints with before/after comparison
- Visual feedback on what's correct vs incorrect
-"""
-
-import sys
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-
-import numpy as np
-import time
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.autograd import enable_autograd
-from tinytorch.core.optimizers import Adam
-from tinytorch.core.losses import CrossEntropyLoss
-from tinytorch.models.transformer import GPT
-from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
-
-enable_autograd()
-
-# Rich CLI imports
-from rich.console import Console
-from rich.panel import Panel
-from rich.table import Table
-from rich.layout import Layout
-from rich.live import Live
-from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
-from rich import box
-from rich.text import Text
-
-console = Console()
-
-# ============================================================================
-# Tokenization (same as tinytalks_chatbot.py)
-# ============================================================================
-
-def create_tokenizer(conversations):
-    """Create character-level tokenizer with special tokens."""
-    all_text = ' '.join([q + ' ' + a for q, a in conversations])
-    all_chars = sorted(set(all_text))
-    
-    special_tokens = {
-        '<PAD>': 0,
-        '<SOS>': 1,
-        '<SEP>': 2,
-        '<EOS>': 3,
-    }
-    
-    char_to_idx = {**special_tokens}
-    idx_to_char = {v: k for k, v in special_tokens.items()}
-    
-    for idx, char in enumerate(all_chars, start=len(special_tokens)):
-        char_to_idx[char] = idx
-        idx_to_char[idx] = char
-    
-    return char_to_idx, idx_to_char
-
-
-def encode_conversation(question, answer, char_to_idx, max_len=80):
-    """Encode Q&A pair as: <SOS> question <SEP> answer <EOS> <PAD>..."""
-    tokens = [char_to_idx['<SOS>']]
-    
-    for c in question:
-        tokens.append(char_to_idx.get(c, 0))
-    
-    tokens.append(char_to_idx['<SEP>'])
-    
-    for c in answer:
-        tokens.append(char_to_idx.get(c, 0))
-    
-    tokens.append(char_to_idx['<EOS>'])
-    
-    if len(tokens) < max_len:
-        tokens = tokens + [char_to_idx['<PAD>']] * (max_len - len(tokens))
-    else:
-        tokens = tokens[:max_len]
-    
-    return tokens
-
-
-def decode_tokens(tokens, idx_to_char):
-    """Decode tokens to string."""
-    chars = []
-    for t in tokens:
-        if t == 0 or t == 1:  # PAD or SOS
-            continue
-        elif t == 2:  # SEP
-            continue
-        elif t == 3:  # EOS
-            break
-        else:
-            chars.append(idx_to_char.get(t, '?'))
-    return ''.join(chars)
-
-
-def generate_response(model, question, char_to_idx, idx_to_char, max_len=50):
-    """Generate response to a question."""
-    tokens = [char_to_idx['<SOS>']]
-    for c in question:
-        tokens.append(char_to_idx.get(c, 0))
-    tokens.append(char_to_idx['<SEP>'])
-    
-    generated_tokens = []
-    for _ in range(max_len):
-        input_tokens = tokens + generated_tokens
-        while len(input_tokens) < 80:
-            input_tokens.append(char_to_idx['<PAD>'])
-        input_tokens = input_tokens[:80]
-        
-        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
-        logits = model.forward(x)
-        
-        next_pos = len(tokens) + len(generated_tokens) - 1
-        if next_pos < logits.shape[1]:
-            next_logits = logits.data[0, next_pos, :]
-            next_token = int(np.argmax(next_logits))
-            
-            if next_token == char_to_idx['<EOS>'] or next_token == char_to_idx['<PAD>']:
-                break
-            
-            generated_tokens.append(next_token)
-        else:
-            break
-    
-    response = decode_tokens(generated_tokens, idx_to_char)
-    return response
-
-
-# ============================================================================
-# Dashboard Components
-# ============================================================================
-
-def create_welcome_panel():
-    """Create the welcome panel."""
-    return Panel.fit(
-        "[bold cyan]🤖 TINYTALKS - Watch a Transformer Learn to Chat![/bold cyan]\n\n"
-        "[dim]You're about to see AI learning happen in real-time.\n"
-        "The model starts knowing nothing - just random noise.\n"
-        "Every training step makes it slightly smarter.\n"
-        "Watch responses improve from gibberish to coherent conversation![/dim]\n\n"
-        "[bold]Training Duration:[/bold] 10-15 minutes\n"
-        "[bold]Checkpoints:[/bold] Every ~2 minutes\n"
-        "[bold]What to watch:[/bold] Loss ↓ = Better responses ✓",
-        title="🎓 Educational AI Training Demo",
-        border_style="cyan",
-        box=box.DOUBLE
-    )
-
-
-def create_metrics_table(step, loss, elapsed, steps_per_sec):
-    """Create current training metrics table."""
-    table = Table(show_header=False, box=box.SIMPLE, padding=(0, 2))
-    table.add_column("Metric", style="cyan")
-    table.add_column("Value", style="green bold")
-    
-    table.add_row("Step", f"{step:,}")
-    table.add_row("Loss", f"{loss:.4f}")
-    table.add_row("Time", f"{int(elapsed/60)}m {int(elapsed%60)}s")
-    table.add_row("Speed", f"{steps_per_sec:.1f} steps/sec")
-    
-    return table
-
-
-def create_checkpoint_comparison(checkpoint_num, step, loss, test_results, expected_answers):
-    """Create a checkpoint panel showing test results."""
-    
-    # Count correct
-    correct = 0
-    for (q, actual), expected in zip(test_results, expected_answers):
-        if actual.strip().lower() == expected.strip().lower():
-            correct += 1
-    
-    accuracy = (correct / len(test_results)) * 100
-    
-    # Create results table
-    table = Table(
-        title=f"Checkpoint {checkpoint_num} - Step {step:,} | Loss: {loss:.4f} | Accuracy: {accuracy:.0f}%",
-        box=box.ROUNDED,
-        show_header=True
-    )
-    table.add_column("Question", style="cyan", width=22)
-    table.add_column("Model Response", style="white", width=28)
-    table.add_column("Status", justify="center", width=8)
-    
-    for (question, actual), expected in zip(test_results, expected_answers):
-        # Determine if correct
-        is_correct = actual.strip().lower() == expected.strip().lower()
-        is_close = expected.strip().lower() in actual.strip().lower() or actual.strip().lower() in expected.strip().lower()
-        
-        # Color code and emoji
-        if is_correct:
-            status = "[green]✓ Perfect[/green]"
-            response_style = "green"
-        elif is_close:
-            status = "[yellow]≈ Close[/yellow]"
-            response_style = "yellow"
-        elif len(actual.strip()) > 0:
-            status = "[red]✗ Wrong[/red]"
-            response_style = "red"
-        else:
-            status = "[dim]- Empty[/dim]"
-            response_style = "dim"
-        
-        # Truncate long responses
-        display_response = actual[:26] + "..." if len(actual) > 26 else actual
-        
-        table.add_row(
-            question,
-            f"[{response_style}]{display_response}[/{response_style}]",
-            status
-        )
-    
-    return table
-
-
-def create_progress_panel(step, total_steps, checkpoint_num, total_checkpoints):
-    """Create progress indicators panel."""
-    step_progress = (step / total_steps) * 100 if total_steps > 0 else 0
-    checkpoint_progress = (checkpoint_num / total_checkpoints) * 100 if total_checkpoints > 0 else 0
-    
-    # Progress bars (ASCII style)
-    step_bar_filled = int(step_progress / 2.5)  # 40 chars max
-    step_bar = "[" + "=" * step_bar_filled + " " * (40 - step_bar_filled) + "]"
-    
-    checkpoint_bar_filled = int(checkpoint_progress / 2.5)
-    checkpoint_bar = "[" + "=" * checkpoint_bar_filled + " " * (40 - checkpoint_bar_filled) + "]"
-    
-    text = (
-        f"[bold]Training Progress:[/bold]\n"
-        f"{step_bar} {step_progress:.1f}% ({step}/{total_steps} steps)\n\n"
-        f"[bold]Checkpoints:[/bold]\n"
-        f"{checkpoint_bar} {checkpoint_progress:.1f}% ({checkpoint_num}/{total_checkpoints} completed)"
-    )
-    
-    return Panel(text, title="📊 Progress", border_style="blue")
-
-
-# ============================================================================
-# Training with Dashboard
-# ============================================================================
-
-def train_with_dashboard(model, optimizer, loss_fn, train_data, test_questions, expected_answers,
-                        char_to_idx, idx_to_char, max_time_minutes=10, checkpoint_interval_steps=1500):
-    """
-    Train with beautiful dashboard showing live progress.
-    """
-    max_time_seconds = max_time_minutes * 60
-    
-    console.clear()
-    console.print(create_welcome_panel())
-    console.print()
-    
-    input("[bold cyan]Press ENTER to start training...[/bold cyan]")
-    console.clear()
-    
-    # Training setup
-    start_time = time.time()
-    losses = []
-    step = 0
-    checkpoint_num = 0
-    
-    # Calculate expected checkpoints
-    estimated_total_steps = int(max_time_seconds * 12)  # ~12 steps/sec
-    total_checkpoints = estimated_total_steps // checkpoint_interval_steps
-    
-    # Initial evaluation
-    console.print("\n[bold]📊 CHECKPOINT 0: Initial Model (Untrained)[/bold]\n")
-    initial_results = [(q, generate_response(model, q, char_to_idx, idx_to_char)) for q in test_questions]
-    console.print(create_checkpoint_comparison(0, 0, 999.9, initial_results, expected_answers))
-    console.print()
-    
-    console.print("[dim]Starting training... Watch the responses improve![/dim]\n")
-    time.sleep(2)
-    
-    next_checkpoint = checkpoint_interval_steps
-    last_print_time = time.time()
-    
-    # Training loop
-    while True:
-        elapsed = time.time() - start_time
-        if elapsed >= max_time_seconds:
-            break
-        
-        # Training step
-        tokens = train_data[np.random.randint(len(train_data))]
-        input_seq = tokens[:-1]
-        target_seq = tokens[1:]
-        
-        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
-        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)
-        
-        logits = model.forward(x)
-        
-        batch_size, seq_len, vocab_size = logits.shape
-        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
-        targets_flat = y_true.reshape(batch_size * seq_len)
-        loss = loss_fn.forward(logits_flat, targets_flat)
-        
-        optimizer.zero_grad()
-        loss.backward()
-        
-        for param in model.parameters():
-            if param.grad is not None:
-                np.clip(param.grad, -1.0, 1.0, out=param.grad)
-        
-        optimizer.step()
-        
-        losses.append(loss.data.item())
-        step += 1
-        
-        # Print progress every 5 seconds
-        if time.time() - last_print_time >= 5.0:
-            avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-            steps_per_sec = step / elapsed
-            console.print(
-                f"[dim]Step {step:5d} | "
-                f"Loss: {avg_loss:.4f} | "
-                f"Time: {int(elapsed/60)}m{int(elapsed%60):02d}s | "
-                f"Speed: {steps_per_sec:.1f} steps/sec[/dim]"
-            )
-            last_print_time = time.time()
-        
-        # Checkpoint evaluation
-        if step >= next_checkpoint:
-            checkpoint_num += 1
-            avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-            
-            console.print("\n" + "="*70)
-            console.print(f"[bold yellow]⏸️  CHECKPOINT {checkpoint_num}[/bold yellow]")
-            console.print(f"[dim]Pausing training to evaluate... (Step {step:,})[/dim]\n")
-            
-            # Evaluate
-            current_results = [(q, generate_response(model, q, char_to_idx, idx_to_char)) for q in test_questions]
-            
-            # Show results
-            console.print(create_checkpoint_comparison(checkpoint_num, step, avg_loss, current_results, expected_answers))
-            console.print()
-            
-            # Show progress
-            console.print(create_progress_panel(step, estimated_total_steps, checkpoint_num, total_checkpoints))
-            console.print()
-            
-            console.print("[dim]Continuing training...[/dim]\n")
-            next_checkpoint += checkpoint_interval_steps
-            time.sleep(1)
-    
-    # Final results
-    final_elapsed = time.time() - start_time
-    final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-    initial_loss = np.mean(losses[:10])
-    improvement = (1 - final_loss / initial_loss) * 100
-    
-    console.print("\n" + "="*70)
-    console.print("[bold green]🎉 TRAINING COMPLETE![/bold green]\n")
-    
-    # Final evaluation
-    final_results = [(q, generate_response(model, q, char_to_idx, idx_to_char)) for q in test_questions]
-    console.print(create_checkpoint_comparison("FINAL", step, final_loss, final_results, expected_answers))
-    console.print()
-    
-    # Summary table
-    summary = Table(title="Training Summary", box=box.DOUBLE, show_header=True)
-    summary.add_column("Metric", style="cyan", width=30)
-    summary.add_column("Value", style="green bold", width=30)
-    
-    summary.add_row("Total Training Time", f"{final_elapsed/60:.1f} minutes")
-    summary.add_row("Total Steps", f"{step:,}")
-    summary.add_row("Steps/Second", f"{step/final_elapsed:.1f}")
-    summary.add_row("Initial Loss", f"{initial_loss:.4f}")
-    summary.add_row("Final Loss", f"{final_loss:.4f}")
-    summary.add_row("Improvement", f"{improvement:.1f}%")
-    summary.add_row("Checkpoints Evaluated", f"{checkpoint_num}")
-    
-    console.print(summary)
-    console.print()
-    
-    # Count perfect responses for milestone card
-    correct = sum(1 for (q, actual), expected in zip(final_results, expected_answers) 
-                  if actual.strip().lower() == expected.strip().lower())
-    accuracy = (correct / len(test_questions)) * 100
-    
-    return losses, step, accuracy
-
-
-# ============================================================================
-# Main
-# ============================================================================
-
-def main():
-    # Dataset
-    conversations = create_tinytalks_dataset()
-    char_to_idx, idx_to_char = create_tokenizer(conversations)
-    vocab_size = len(idx_to_char)
-    
-    # Encode
-    max_seq_len = 80
-    train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations]
-    
-    # Test questions and expected answers
-    test_questions = [
-        "Hi",
-        "How are you",
-        "What is your name",
-        "What is the sky",
-        "Is grass green",
-        "What is 1 plus 1",
-        "Are you happy"
-    ]
-    
-    expected_answers = [
-        "Hello! How can I help you?",
-        "I am doing well, thanks!",
-        "I am TinyBot",
-        "The sky is blue",
-        "Yes, grass is green",
-        "1 plus 1 equals 2",
-        "Yes, I am happy"
-    ]
-    
-    # Model
-    config = {
-        'vocab_size': vocab_size,
-        'embed_dim': 16,
-        'num_layers': 1,
-        'num_heads': 2,
-        'max_seq_len': max_seq_len,
-    }
-    
-    model = GPT(**config)
-    num_params = sum(np.prod(p.shape) for p in model.parameters())
-    
-    # Optimizer
-    optimizer = Adam(model.parameters(), lr=0.001)
-    loss_fn = CrossEntropyLoss()
-    
-    # Train with dashboard
-    train_time = 15  # 15 minutes for better results
-    checkpoint_interval = 2000  # Every ~2.5 minutes
-    
-    console.print(Panel.fit(
-        f"[bold]Model:[/bold] {num_params:,} parameters (ultra-tiny!)\n"
-        f"[bold]Training Time:[/bold] {train_time} minutes\n"
-        f"[bold]Checkpoints:[/bold] Every {checkpoint_interval} steps (~2 min)\n"
-        f"[bold]Test Questions:[/bold] {len(test_questions)} questions\n\n"
-        f"[dim]Watch loss decrease and responses improve![/dim]",
-        title="⚙️ Configuration",
-        border_style="blue"
-    ))
-    
-    losses, total_steps, final_accuracy = train_with_dashboard(
-        model=model,
-        optimizer=optimizer,
-        loss_fn=loss_fn,
-        train_data=train_data,
-        test_questions=test_questions,
-        expected_answers=expected_answers,
-        char_to_idx=char_to_idx,
-        idx_to_char=idx_to_char,
-        max_time_minutes=train_time,
-        checkpoint_interval_steps=checkpoint_interval
-    )
-    
-    # Calculate metrics for milestone card
-    loss_improvement = (1 - np.mean(losses[-100:]) / np.mean(losses[:10])) * 100
-    
-    # Milestone completion card
-    console.print()
-    if final_accuracy >= 50 and loss_improvement >= 80:
-        console.print(Panel.fit(
-            "[bold green]🎉 Congratulations! You've Built a Working Chatbot![/bold green]\n\n"
-            
-            f"Final accuracy: [bold]{final_accuracy:.0f}%[/bold] | "
-            f"Loss improved: [bold]{loss_improvement:.1f}%[/bold]\n\n"
-            
-            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
-            
-            "[bold]💡 What YOU Just Accomplished:[/bold]\n"
-            "  ✓ Built a TRANSFORMER (2017 Vaswani et al)\n"
-            "  ✓ Trained with attention mechanism from scratch\n"
-            "  ✓ Watched AI learn language patterns in real-time\n"
-            "  ✓ Demonstrated gradient descent on complex architectures\n"
-            f"  ✓ Trained {total_steps:,} steps in {train_time} minutes!\n\n"
-            
-            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
-            
-            "[bold]🎓 Why This Matters:[/bold]\n"
-            "  This is the SAME architecture behind ChatGPT, GPT-4, and BERT.\n"
-            "  You just witnessed the magic of:\n"
-            "  • Self-attention (learning relationships between words)\n"
-            "  • Position encoding (understanding word order)\n"
-            "  • Autoregressive generation (predicting next token)\n\n"
-            
-            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
-            
-            "[bold]📌 The Key Insight:[/bold]\n"
-            "  You saw responses evolve from gibberish to coherent:\n"
-            "    Checkpoint 0: Random noise\n"
-            "    Checkpoint 1: Recognizable words\n"
-            "    Checkpoint 2: Partial sentences\n"
-            "    Final: Perfect responses!\n"
-            "  \n"
-            "  [yellow]Scale it up:[/yellow] Same process, more data, more params →\n"
-            "  You get GPT-4 (175B params, trained for weeks)!\n\n"
-            
-            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
-            
-            "[bold]🚀 What You Can Do Now:[/bold]\n"
-            "• Experiment with different architectures (layers, heads)\n"
-            "• Try longer training (15-20 minutes for better results)\n"
-            "• Add more conversation patterns to the dataset\n"
-            "• Scale up the model (more parameters = better learning)\n\n"
-            
-            "[bold cyan]You've mastered the foundation of modern AI! 🌟[/bold cyan]",
-            
-            title="🌟 2017 Transformer Complete - Milestone 05",
-            border_style="green",
-            box=box.DOUBLE
-        ))
-    else:
-        console.print(Panel.fit(
-            "[bold yellow]⚠️  Training Complete - Needs More Time[/bold yellow]\n\n"
-            f"Current accuracy: {final_accuracy:.0f}% | Loss improved: {loss_improvement:.1f}%\n\n"
-            "Your transformer is learning but needs more training time.\n\n"
-            "[bold]What to try:[/bold]\n"
-            "• Train for 15-20 minutes instead of 10\n"
-            "• Use a slightly bigger model (2 layers, 24 dims)\n"
-            "• Add more data repetition for reinforcement\n\n"
-            "[dim]The attention mechanism is working - it just needs more steps to converge!\n"
-            "Even partial success shows the transformer learned patterns.[/dim]",
-            title="🔄 Learning in Progress",
-            border_style="yellow",
-            box=box.DOUBLE
-        ))
-
-
-if __name__ == "__main__":
-    main()
-
--- a/milestones/05_2017_transformer/tinytalks_dataset.py
+++ b/milestones/05_2017_transformer/tinytalks_dataset.py
@@ -1,208 +0,0 @@
-"""
-TinyTalks Dataset - Small Conversational Dataset for Transformer Training
-==========================================================================
-
-A carefully curated micro-dataset for training a chatbot in 10-15 minutes.
-
-Design Principles:
- Simple Q&A format
- Short responses (< 50 characters)
- Common patterns repeated for learning
- Character-level tokenization friendly
- Covers basic conversational topics
-"""
-
-def create_tinytalks_dataset():
-    """
-    Create a small but diverse conversational dataset.
-    
-    Categories:
-    1. Greetings & Politeness
-    2. Simple Facts
-    3. Yes/No Questions
-    4. Weather & Time
-    5. Feelings & States
-    6. Simple Math
-    7. Colors & Objects
-    8. Names & Identity
-    """
-    
-    conversations = [
-        # === GREETINGS (repeated 5x for strong learning) ===
-        ("Hi", "Hello! How can I help you?"),
-        ("Hello", "Hi there! How are you?"),
-        ("Good morning", "Good morning! Nice day!"),
-        ("How are you", "I am doing well, thanks!"),
-        ("Bye", "Goodbye! Have a great day!"),
-        
-        # Repeat for learning
-        ("Hi", "Hello! How can I help you?"),
-        ("Hello", "Hi there! How are you?"),
-        ("How are you", "I am doing well, thanks!"),
-        ("Bye", "Goodbye! Have a great day!"),
-        
-        ("Hi", "Hello! How can I help you?"),
-        ("Hello", "Hi there! How are you?"),
-        ("Bye", "Goodbye! Have a great day!"),
-        
-        # === SIMPLE FACTS (repeated 3x) ===
-        ("What is the sky", "The sky is blue"),
-        ("What is grass", "Grass is green"),
-        ("What is the sun", "The sun is bright and yellow"),
-        ("What is water", "Water is wet and clear"),
-        ("What is fire", "Fire is hot and red"),
-        
-        ("What is the sky", "The sky is blue"),
-        ("What is grass", "Grass is green"),
-        ("What is water", "Water is wet and clear"),
-        
-        ("What is the sky", "The sky is blue"),
-        ("What is grass", "Grass is green"),
-        
-        # === YES/NO QUESTIONS (repeated 3x) ===
-        ("Is the sky blue", "Yes, the sky is blue"),
-        ("Is grass green", "Yes, grass is green"),
-        ("Is the sun cold", "No, the sun is hot"),
-        ("Is water dry", "No, water is wet"),
-        ("Is fire safe", "No, fire is dangerous"),
-        
-        ("Is the sky blue", "Yes, the sky is blue"),
-        ("Is grass green", "Yes, grass is green"),
-        ("Is the sun cold", "No, the sun is hot"),
-        
-        ("Is the sky blue", "Yes, the sky is blue"),
-        ("Is grass green", "Yes, grass is green"),
-        
-        # === WEATHER (repeated 3x) ===
-        ("Is it sunny", "Yes, it is sunny today"),
-        ("Is it raining", "No, it is not raining"),
-        ("Is it cold", "No, it is warm today"),
-        ("What is the weather", "The weather is nice"),
-        
-        ("Is it sunny", "Yes, it is sunny today"),
-        ("What is the weather", "The weather is nice"),
-        
-        ("Is it sunny", "Yes, it is sunny today"),
-        
-        # === FEELINGS (repeated 3x) ===
-        ("Are you happy", "Yes, I am happy"),
-        ("Are you sad", "No, I am not sad"),
-        ("Are you tired", "No, I feel good"),
-        ("Do you like learning", "Yes, I love learning"),
-        
-        ("Are you happy", "Yes, I am happy"),
-        ("Do you like learning", "Yes, I love learning"),
-        
-        ("Are you happy", "Yes, I am happy"),
-        
-        # === SIMPLE MATH (repeated 3x) ===
-        ("What is 1 plus 1", "1 plus 1 equals 2"),
-        ("What is 2 plus 2", "2 plus 2 equals 4"),
-        ("What is 3 plus 3", "3 plus 3 equals 6"),
-        ("What is 5 plus 5", "5 plus 5 equals 10"),
-        
-        ("What is 1 plus 1", "1 plus 1 equals 2"),
-        ("What is 2 plus 2", "2 plus 2 equals 4"),
-        
-        ("What is 1 plus 1", "1 plus 1 equals 2"),
-        
-        # === COLORS (repeated 3x) ===
-        ("What color is the sky", "The sky is blue"),
-        ("What color is grass", "Grass is green"),
-        ("What color is the sun", "The sun is yellow"),
-        ("What color is snow", "Snow is white"),
-        
-        ("What color is the sky", "The sky is blue"),
-        ("What color is grass", "Grass is green"),
-        
-        ("What color is the sky", "The sky is blue"),
-        
-        # === IDENTITY (repeated 3x) ===
-        ("What is your name", "I am TinyBot"),
-        ("Who are you", "I am TinyBot, your helper"),
-        ("What do you do", "I help answer questions"),
-        
-        ("What is your name", "I am TinyBot"),
-        ("Who are you", "I am TinyBot, your helper"),
-        
-        ("What is your name", "I am TinyBot"),
-        
-        # === CAPABILITIES (repeated 2x) ===
-        ("Can you help me", "Yes, I can help you"),
-        ("Can you talk", "Yes, I can talk with you"),
-        ("Do you understand", "Yes, I understand you"),
-        
-        ("Can you help me", "Yes, I can help you"),
-        ("Can you talk", "Yes, I can talk with you"),
-    ]
-    
-    return conversations
-
-
-def get_dataset_stats():
-    """Get statistics about the dataset."""
-    conversations = create_tinytalks_dataset()
-    
-    unique_conversations = set(conversations)
-    total_chars = sum(len(q) + len(a) for q, a in conversations)
-    avg_question_len = sum(len(q) for q, _ in conversations) / len(conversations)
-    avg_answer_len = sum(len(a) for _, a in conversations) / len(conversations)
-    
-    return {
-        'total_examples': len(conversations),
-        'unique_examples': len(unique_conversations),
-        'repetition_factor': len(conversations) / len(unique_conversations),
-        'total_chars': total_chars,
-        'avg_question_len': avg_question_len,
-        'avg_answer_len': avg_answer_len,
-        'categories': [
-            'Greetings (5x repeat)',
-            'Simple Facts (3x repeat)',
-            'Yes/No Questions (3x repeat)',
-            'Weather (3x repeat)',
-            'Feelings (3x repeat)',
-            'Simple Math (3x repeat)',
-            'Colors (3x repeat)',
-            'Identity (3x repeat)',
-            'Capabilities (2x repeat)'
-        ]
-    }
-
-
-def print_dataset_info():
-    """Print dataset information."""
-    conversations = create_tinytalks_dataset()
-    stats = get_dataset_stats()
-    
-    print("=" * 70)
-    print("TINYTALKS DATASET")
-    print("=" * 70)
-    print()
-    print(f"Total examples: {stats['total_examples']}")
-    print(f"Unique examples: {stats['unique_examples']}")
-    print(f"Repetition factor: {stats['repetition_factor']:.1f}x")
-    print(f"Average question length: {stats['avg_question_len']:.1f} chars")
-    print(f"Average answer length: {stats['avg_answer_len']:.1f} chars")
-    print()
-    print("Categories:")
-    for cat in stats['categories']:
-        print(f"  • {cat}")
-    print()
-    print("Sample conversations:")
-    print("-" * 70)
-    
-    # Show 10 random unique examples
-    unique = list(set(conversations))
-    import random
-    random.seed(42)
-    samples = random.sample(unique, min(10, len(unique)))
-    
-    for q, a in samples:
-        print(f"Q: {q}")
-        print(f"A: {a}")
-        print()
-
-
-if __name__ == "__main__":
-    print_dataset_info()
-
--- a/milestones/05_2017_transformer/tinytalks_gpt.py
+++ b/milestones/05_2017_transformer/tinytalks_gpt.py
@@ -1,746 +0,0 @@
-#!/usr/bin/env python3
-"""
-TinyTalks Q&A Generation (2017) - Transformer Era
-==================================================
-
-📚 HISTORICAL CONTEXT:
-In 2017, Vaswani et al. published "Attention Is All You Need", showing that
-attention mechanisms alone (no RNNs!) could achieve state-of-the-art results
-on sequence tasks. This breakthrough launched the era of GPT, BERT, and modern LLMs.
-
-🎯 WHAT YOU'RE BUILDING:
-Using YOUR TinyTorch implementations, you'll build a character-level conversational
-model that learns to answer questions - proving YOUR attention mechanism works!
-
-TinyTalks is PERFECT for learning:
- Small dataset (17.5 KB) = 3-5 minute training!
- Clear Q&A format (easy to verify learning)
- Progressive difficulty (5 levels)
- Instant gratification: Watch your transformer learn to chat!
-
-✅ REQUIRED MODULES (Run after Module 13):
-━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-  Module 01 (Tensor)        : YOUR data structure with autograd
-  Module 02 (Activations)   : YOUR ReLU and GELU activations
-  Module 03 (Layers)        : YOUR Linear layers
-  Module 04 (Losses)        : YOUR CrossEntropyLoss
-  Module 05 (Autograd)      : YOUR automatic differentiation
-  Module 06 (Optimizers)    : YOUR Adam optimizer
-  Module 08 (DataLoader)    : YOUR data batching
-  Module 10 (Tokenization)  : YOUR CharTokenizer for text→numbers
-  Module 11 (Embeddings)    : YOUR token & positional embeddings
-  Module 12 (Attention)     : YOUR multi-head self-attention
-  Module 13 (Transformers)  : YOUR LayerNorm + TransformerBlock + GPT
-━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-
-🏗️ ARCHITECTURE (Character-Level Q&A Model):
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                               Output Predictions                             │
-    │                         Character Probabilities (vocab_size)                 │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                            Output Projection                                 │
-    │                       Module 03: vectors → vocabulary                        │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                              Layer Norm                                      │
-    │                        Module 13: Final normalization                        │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ╔══════════════════════════════════════════════════════════════════════════════╗
-    ║                      Transformer Block × N (Repeat)                          ║
-    ║  ┌────────────────────────────────────────────────────────────────────────┐  ║
-    ║  │                       Feed Forward Network                             │  ║
-    ║  │              Module 03: Linear → GELU → Linear                         │  ║
-    ║  └────────────────────────────────────────────────────────────────────────┘  ║
-    ║                                  ▲                                           ║
-    ║  ┌────────────────────────────────────────────────────────────────────────┐  ║
-    ║  │                    Multi-Head Self-Attention                           │  ║
-    ║  │           Module 12: Query·Key^T·Value across all positions            │  ║
-    ║  └────────────────────────────────────────────────────────────────────────┘  ║
-    ╚══════════════════════════════════════════════════════════════════════════════╝
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                          Positional Encoding                                 │
-    │                   Module 11: Add position information                        │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                         Character Embeddings                                 │
-    │                    Module 11: chars → embed_dim vectors                      │
-    └──────────────────────────────────────────────────────────────────────────────┘
-                                            ▲
-    ┌──────────────────────────────────────────────────────────────────────────────┐
-    │                            Input Characters                                  │
-    │                    "Q: What color is the sky? A:"                            │
-    └──────────────────────────────────────────────────────────────────────────────┘
-
-📊 EXPECTED PERFORMANCE:
- Dataset: 17.5 KB TinyTalks (301 Q&A pairs, 5 difficulty levels)
- Training time: 3-5 minutes (instant gratification!)
- Vocabulary: ~68 unique characters (simple English Q&A)
- Expected: 70-80% accuracy on Level 1-2 questions after training
- Parameters: ~1.2M (perfect size for fast learning on small data)
-
-💡 WHAT TO WATCH FOR:
- Epoch 1-3: Model learns Q&A structure ("A:" follows "Q:")
- Epoch 4-7: Starts giving sensible (if incorrect) answers
- Epoch 8-12: 50-60% accuracy on simple questions
- Epoch 13-20: 70-80% accuracy, proper grammar
- Success = "Wow, my transformer actually learned to answer questions!"
-"""
-
-import sys
-import os
-import numpy as np
-import argparse
-import time
-from rich.console import Console
-from rich.panel import Panel
-from rich.table import Table
-from rich import box
-
-# Add project root to path
-project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.append(project_root)
-
-console = Console()
-
-
-def print_banner():
-    """Print a beautiful banner for the milestone"""
-    banner_text = """
-╔══════════════════════════════════════════════════════════════════╗
-║                                                                  ║
-║            🤖 TinyTalks Q&A Bot Training (2017)                  ║
-║                   Transformer Architecture                       ║
-║                                                                  ║
-║  "Your first transformer learning to answer questions!"         ║
-║                                                                  ║
-╚══════════════════════════════════════════════════════════════════╝
-    """
-    console.print(Panel(banner_text, border_style="bright_blue", box=box.DOUBLE))
-
-
-def filter_by_levels(text, levels):
-    """
-    Filter TinyTalks dataset to only include specified difficulty levels.
-    
-    Levels are marked in the original generation as:
-    L1: Greetings (47 pairs)
-    L2: Facts (82 pairs)
-    L3: Math (45 pairs)
-    L4: Reasoning (87 pairs)
-    L5: Context (40 pairs)
-    
-    For simplicity, we filter by common patterns:
-    L1: Hello, Hi, What is your name, etc.
-    L2: What color, How many, etc.
-    L3: What is X plus/minus, etc.
-    """
-    if levels is None or levels == [1, 2, 3, 4, 5]:
-        return text  # Use full dataset
-    
-    # Parse Q&A pairs
-    pairs = []
-    blocks = text.strip().split('\n\n')
-    
-    for block in blocks:
-        lines = block.strip().split('\n')
-        if len(lines) == 2 and lines[0].startswith('Q:') and lines[1].startswith('A:'):
-            q = lines[0][3:].strip()
-            a = lines[1][3:].strip()
-            
-            # Classify level (heuristic)
-            level = 5  # default
-            q_lower = q.lower()
-            
-            if any(word in q_lower for word in ['hello', 'hi', 'hey', 'goodbye', 'bye', 'name', 'who are you', 'what are you']):
-                level = 1
-            elif any(word in q_lower for word in ['color', 'legs', 'days', 'months', 'sound', 'capital']):
-                level = 2
-            elif any(word in q_lower for word in ['plus', 'minus', 'times', 'divided', 'equals']):
-                level = 3
-            elif any(word in q_lower for word in ['use', 'where do', 'what do', 'happens if', 'need to']):
-                level = 4
-            
-            if level in levels:
-                pairs.append(f"Q: {q}\nA: {a}")
-    
-    filtered_text = '\n\n'.join(pairs)
-    console.print(f"[yellow]📊 Filtered to Level(s) {levels}:[/yellow]")
-    console.print(f"    Q&A pairs: {len(pairs)}")
-    console.print(f"    Characters: {len(filtered_text)}")
-    
-    return filtered_text
-
-
-class TinyTalksDataset:
-    """
-    Character-level dataset for TinyTalks Q&A.
-    
-    Creates sequences of characters for autoregressive language modeling:
-    - Input: "Q: What color is the sky? A: The sk"
-    - Target: ": What color is the sky? A: The sky"
-    
-    The model learns to predict the next character given previous characters,
-    naturally learning the Q&A pattern.
-    """
-    
-    def __init__(self, text, seq_length=64, levels=None):
-        """
-        Args:
-            text: Full text string (Q&A pairs)
-            seq_length: Length of input sequences
-            levels: List of difficulty levels to include (1-5), None = all
-        """
-        from tinytorch.text.tokenization import CharTokenizer
-        
-        self.seq_length = seq_length
-        
-        # Filter by levels if specified
-        if levels:
-            text = filter_by_levels(text, levels)
-        
-        # Store original text for testing
-        self.text = text
-        
-        # Build character vocabulary using CharTokenizer
-        self.tokenizer = CharTokenizer()
-        self.tokenizer.build_vocab([text])
-        
-        # Encode entire text
-        self.data = self.tokenizer.encode(text)
-        
-        console.print(f"[green]✓[/green] Dataset initialized:")
-        console.print(f"    Total characters: {len(text)}")
-        console.print(f"    Vocabulary size: {self.tokenizer.vocab_size}")
-        console.print(f"    Sequence length: {seq_length}")
-        console.print(f"    Total sequences: {len(self)}")
-    
-    def __len__(self):
-        """Number of possible sequences"""
-        return len(self.data) - self.seq_length
-    
-    def __getitem__(self, idx):
-        """
-        Get one training example.
-        
-        Returns:
-            input_seq: Characters [idx : idx+seq_length]
-            target_seq: Characters [idx+1 : idx+seq_length+1] (shifted by 1)
-        """
-        input_seq = self.data[idx:idx + self.seq_length]
-        target_seq = self.data[idx + 1:idx + self.seq_length + 1]
-        return input_seq, target_seq
-    
-    def decode(self, indices):
-        """Decode token indices back to text"""
-        return self.tokenizer.decode(indices)
-
-
-class TinyGPT:
-    """
-    Character-level GPT model for TinyTalks Q&A.
-    
-    This is a simplified GPT architecture:
-    1. Token embeddings (convert characters to vectors)
-    2. Positional encodings (add position information)
-    3. N transformer blocks (self-attention + feed-forward)
-    4. Output projection (vectors back to character probabilities)
-    
-    Built entirely from YOUR TinyTorch modules!
-    """
-    
-    def __init__(self, vocab_size, embed_dim=128, num_layers=4, num_heads=4, 
-                 max_seq_len=64, dropout=0.1):
-        """
-        Args:
-            vocab_size: Number of unique characters
-            embed_dim: Dimension of embeddings and hidden states
-            num_layers: Number of transformer blocks
-            num_heads: Number of attention heads per block
-            max_seq_len: Maximum sequence length
-            dropout: Dropout probability (for training)
-        """
-        from tinytorch.core.tensor import Tensor
-        from tinytorch.text.embeddings import Embedding, PositionalEncoding
-        from tinytorch.models.transformer import LayerNorm, TransformerBlock
-        from tinytorch.core.layers import Linear
-        
-        self.vocab_size = vocab_size
-        self.embed_dim = embed_dim
-        self.num_layers = num_layers
-        self.num_heads = num_heads
-        self.max_seq_len = max_seq_len
-        
-        # 1. Token embeddings: char_id → embed_dim vector
-        self.token_embedding = Embedding(vocab_size, embed_dim)
-        
-        # 2. Positional encoding: add position information
-        self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
-        
-        # 3. Transformer blocks (stacked)
-        self.blocks = []
-        for _ in range(num_layers):
-            block = TransformerBlock(
-                embed_dim=embed_dim,
-                num_heads=num_heads,
-                mlp_ratio=4,  # FFN hidden_dim = 4 * embed_dim
-                dropout_prob=dropout
-            )
-            self.blocks.append(block)
-        
-        # 4. Final layer normalization
-        self.ln_f = LayerNorm(embed_dim)
-        
-        # 5. Output projection: embed_dim → vocab_size
-        self.output_proj = Linear(embed_dim, vocab_size)
-        
-        console.print(f"[green]✓[/green] TinyGPT model initialized:")
-        console.print(f"    Vocabulary: {vocab_size}")
-        console.print(f"    Embedding dim: {embed_dim}")
-        console.print(f"    Layers: {num_layers}")
-        console.print(f"    Heads: {num_heads}")
-        console.print(f"    Max sequence: {max_seq_len}")
-        
-        # Count parameters
-        total_params = self.count_parameters()
-        console.print(f"    [bold]Total parameters: {total_params:,}[/bold]")
-    
-    def forward(self, x):
-        """
-        Forward pass through the model.
-        
-        Args:
-            x: Input tensor of shape (batch, seq_len) with token indices
-        
-        Returns:
-            logits: Output tensor of shape (batch, seq_len, vocab_size)
-        """
-        from tinytorch.core.tensor import Tensor
-        
-        # 1. Token embeddings: (batch, seq_len) → (batch, seq_len, embed_dim)
-        x = self.token_embedding.forward(x)
-        
-        # 2. Add positional encoding
-        x = self.pos_encoding.forward(x)
-        
-        # 3. Pass through transformer blocks
-        for block in self.blocks:
-            x = block.forward(x)
-        
-        # 4. Final layer norm
-        x = self.ln_f.forward(x)
-        
-        # 5. Project to vocabulary: (batch, seq_len, embed_dim) → (batch, seq_len, vocab_size)
-        logits = self.output_proj.forward(x)
-        
-        return logits
-    
-    def parameters(self):
-        """Get all trainable parameters"""
-        params = []
-        
-        # Token embeddings
-        params.extend(self.token_embedding.parameters())
-        
-        # Positional encoding (learnable parameters)
-        params.extend(self.pos_encoding.parameters())
-        
-        # Transformer blocks
-        for block in self.blocks:
-            params.extend(block.parameters())
-        
-        # Final layer norm
-        params.extend(self.ln_f.parameters())
-        
-        # Output projection
-        params.extend(self.output_proj.parameters())
-        
-        # Ensure all require gradients
-        for param in params:
-            param.requires_grad = True
-        
-        return params
-    
-    def count_parameters(self):
-        """Count total trainable parameters"""
-        total = 0
-        for param in self.parameters():
-            total += param.data.size
-        return total
-    
-    def generate(self, tokenizer, prompt="Q:", max_new_tokens=100, temperature=1.0):
-        """
-        Generate text autoregressively.
-        
-        Args:
-            tokenizer: CharTokenizer for encoding/decoding
-            prompt: Starting text
-            max_new_tokens: How many characters to generate
-            temperature: Sampling temperature (higher = more random)
-        
-        Returns:
-            Generated text string
-        """
-        from tinytorch.core.tensor import Tensor
-        
-        # Encode prompt
-        indices = tokenizer.encode(prompt)
-        
-        # Generate tokens one at a time
-        for _ in range(max_new_tokens):
-            # Get last max_seq_len tokens (context window)
-            context = indices[-self.max_seq_len:]
-            
-            # Prepare input: (1, seq_len)
-            x_input = Tensor(np.array([context]))
-            
-            # Forward pass
-            logits = self.forward(x_input)
-            
-            # Get logits for last position: (vocab_size,)
-            last_logits = logits.data[0, -1, :] / temperature
-            
-            # Apply softmax to get probabilities
-            exp_logits = np.exp(last_logits - np.max(last_logits))
-            probs = exp_logits / np.sum(exp_logits)
-            
-            # Sample from distribution
-            next_idx = np.random.choice(len(probs), p=probs)
-            
-            # Append to sequence
-            indices.append(next_idx)
-            
-            # Stop if we generate newline after "A:"
-            if len(indices) > 3 and tokenizer.decode(indices[-3:]) == "\n\nQ":
-                break
-        
-        return tokenizer.decode(indices)
-
-
-def test_model_predictions(model, dataset, test_prompts=None):
-    """Test model on specific prompts and show predictions"""
-    if test_prompts is None:
-        test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: Hi!"]
-    
-    console.print("\n[bold yellow]🧪 Testing Live Predictions:[/bold yellow]")
-    for prompt in test_prompts:
-        try:
-            full_prompt = prompt + "\nA:"
-            response = model.generate(dataset.tokenizer, prompt=full_prompt, max_new_tokens=30, temperature=0.5)
-            
-            # Extract just the answer
-            if "\nA:" in response:
-                answer = response.split("\nA:")[1].split("\n")[0].strip()
-            else:
-                answer = response[len(full_prompt):].strip()
-            
-            console.print(f"  {prompt}")
-            console.print(f"  → [cyan]{answer}[/cyan]")
-        except Exception as e:
-            console.print(f"  {prompt} → [red]Error: {str(e)[:50]}[/red]")
-
-
-def train_tinytalks_gpt(model, dataset, optimizer, criterion, epochs=20, batch_size=32, 
-                        log_interval=50, test_prompts=None):
-    """
-    Train the TinyGPT model on TinyTalks dataset.
-    
-    Training loop:
-    1. Sample random batch of sequences
-    2. Forward pass: predict next character for each position
-    3. Compute cross-entropy loss
-    4. Backward pass: compute gradients
-    5. Update parameters with Adam
-    6. Periodically test on sample questions to show learning
-    
-    Args:
-        model: TinyGPT instance
-        dataset: TinyTalksDataset instance
-        optimizer: Adam optimizer
-        criterion: CrossEntropyLoss
-        epochs: Number of training epochs
-        batch_size: Number of sequences per batch
-        log_interval: Print loss every N batches
-        test_prompts: Optional list of questions to test during training
-    """
-    from tinytorch.core.tensor import Tensor
-    from tinytorch.core.autograd import enable_autograd
-    
-    # Enable autograd
-    enable_autograd()
-    
-    console.print("\n[bold cyan]Starting Training...[/bold cyan]")
-    console.print(f"  Epochs: {epochs}")
-    console.print(f"  Batch size: {batch_size}")
-    console.print(f"  Dataset size: {len(dataset)} sequences")
-    
-    start_time = time.time()
-    
-    for epoch in range(epochs):
-        epoch_start = time.time()
-        epoch_loss = 0.0
-        num_batches = 0
-        
-        # Calculate batches per epoch
-        batches_per_epoch = min(500, len(dataset) // batch_size)
-        
-        for batch_idx in range(batches_per_epoch):
-            # Sample random batch
-            batch_indices = np.random.randint(0, len(dataset), size=batch_size)
-            
-            batch_inputs = []
-            batch_targets = []
-            
-            for idx in batch_indices:
-                input_seq, target_seq = dataset[int(idx)]
-                batch_inputs.append(input_seq)
-                batch_targets.append(target_seq)
-            
-            # Convert to tensors: (batch, seq_len)
-            batch_input = Tensor(np.array(batch_inputs))
-            batch_target = Tensor(np.array(batch_targets))
-            
-            # Forward pass
-            logits = model.forward(batch_input)
-            
-            # Reshape for loss computation: (batch, seq, vocab) → (batch*seq, vocab)
-            # IMPORTANT: Use Tensor.reshape() to preserve computation graph!
-            batch_size_actual, seq_length, vocab_size = logits.shape
-            logits_2d = logits.reshape(batch_size_actual * seq_length, vocab_size)
-            targets_1d = batch_target.reshape(-1)
-            
-            # Compute loss
-            loss = criterion.forward(logits_2d, targets_1d)
-            
-            # Backward pass
-            loss.backward()
-            
-            # Update parameters
-            optimizer.step()
-            
-            # Zero gradients
-            optimizer.zero_grad()
-            
-            # Track loss
-            batch_loss = float(loss.data)
-            epoch_loss += batch_loss
-            num_batches += 1
-            
-            # Log progress
-            if (batch_idx + 1) % log_interval == 0 or batch_idx == 0:
-                avg_loss = epoch_loss / num_batches
-                elapsed = time.time() - start_time
-                console.print(
-                    f"  Epoch {epoch+1}/{epochs} | "
-                    f"Batch {batch_idx+1}/{batches_per_epoch} | "
-                    f"Loss: {batch_loss:.4f} | "
-                    f"Avg: {avg_loss:.4f} | "
-                    f"Time: {elapsed:.1f}s"
-                )
-        
-        # Epoch summary
-        avg_epoch_loss = epoch_loss / num_batches
-        epoch_time = time.time() - epoch_start
-        console.print(
-            f"[green]✓[/green] Epoch {epoch+1}/{epochs} complete | "
-            f"Avg Loss: {avg_epoch_loss:.4f} | "
-            f"Time: {epoch_time:.1f}s"
-        )
-        
-        # Test model every 5 epochs to show learning progress
-        if (epoch + 1) % 5 == 0 or epoch == 0 or epoch == epochs - 1:
-            test_model_predictions(model, dataset, test_prompts)
-    
-    total_time = time.time() - start_time
-    console.print(f"\n[bold green]✓ Training complete![/bold green]")
-    console.print(f"  Total time: {total_time/60:.2f} minutes")
-
-
-def demo_questions(model, tokenizer):
-    """
-    Demonstrate the model answering questions.
-    
-    Shows how well the model learned from TinyTalks by asking
-    various questions from different difficulty levels.
-    """
-    console.print("\n" + "=" * 70)
-    console.print("[bold cyan]🤖 TinyBot Demo: Ask Me Questions![/bold cyan]")
-    console.print("=" * 70)
-    
-    # Test questions from different levels
-    test_questions = [
-        "Q: Hello!",
-        "Q: What is your name?",
-        "Q: What color is the sky?",
-        "Q: How many legs does a dog have?",
-        "Q: What is 2 plus 3?",
-        "Q: What do you use a pen for?",
-    ]
-    
-    for question in test_questions:
-        console.print(f"\n[yellow]{question}[/yellow]")
-        
-        # Generate answer
-        response = model.generate(tokenizer, prompt=question + "\nA:", max_new_tokens=50, temperature=0.8)
-        
-        # Extract just the answer part
-        if "\nA:" in response:
-            answer = response.split("\nA:")[1].split("\n")[0].strip()
-            console.print(f"[green]A: {answer}[/green]")
-        else:
-            console.print(f"[dim]{response}[/dim]")
-    
-    console.print("\n" + "=" * 70)
-
-
-def main():
-    """Main training pipeline"""
-    parser = argparse.ArgumentParser(description='Train TinyGPT on TinyTalks Q&A')
-    parser.add_argument('--epochs', type=int, default=30, help='Number of training epochs (default: 30)')
-    parser.add_argument('--batch-size', type=int, default=16, help='Batch size (default: 16)')
-    parser.add_argument('--lr', type=float, default=0.001, help='Learning rate (default: 0.001)')
-    parser.add_argument('--seq-length', type=int, default=64, help='Sequence length (default: 64)')
-    parser.add_argument('--embed-dim', type=int, default=96, help='Embedding dimension (default: 96, ~500K params)')
-    parser.add_argument('--num-layers', type=int, default=4, help='Number of transformer layers (default: 4)')
-    parser.add_argument('--num-heads', type=int, default=4, help='Number of attention heads (default: 4)')
-    parser.add_argument('--levels', type=str, default=None, help='Difficulty levels to train on (e.g. "1" or "1,2"). Default: all levels')
-    args = parser.parse_args()
-    
-    # Parse levels argument
-    if args.levels:
-        levels = [int(l.strip()) for l in args.levels.split(',')]
-    else:
-        levels = None
-    
-    print_banner()
-    
-    # Import TinyTorch components
-    console.print("\n[bold]Importing TinyTorch components...[/bold]")
-    try:
-        from tinytorch.core.tensor import Tensor
-        from tinytorch.core.optimizers import Adam
-        from tinytorch.core.losses import CrossEntropyLoss
-        from tinytorch.text.tokenization import CharTokenizer
-        console.print("[green]✓[/green] All modules imported successfully!")
-    except ImportError as e:
-        console.print(f"[red]✗[/red] Import error: {e}")
-        console.print("\nMake sure you have completed all required modules:")
-        console.print("  - Module 01 (Tensor)")
-        console.print("  - Module 02 (Activations)")
-        console.print("  - Module 03 (Layers)")
-        console.print("  - Module 04 (Losses)")
-        console.print("  - Module 05 (Autograd)")
-        console.print("  - Module 06 (Optimizers)")
-        console.print("  - Module 10 (Tokenization)")
-        console.print("  - Module 11 (Embeddings)")
-        console.print("  - Module 12 (Attention)")
-        console.print("  - Module 13 (Transformers)")
-        return
-    
-    # Load TinyTalks dataset
-    console.print("\n[bold]Loading TinyTalks dataset...[/bold]")
-    dataset_path = os.path.join(project_root, "datasets", "tinytalks", "splits", "train.txt")
-    
-    if not os.path.exists(dataset_path):
-        console.print(f"[red]✗[/red] Dataset not found: {dataset_path}")
-        console.print("\nPlease generate the dataset first:")
-        console.print("  python datasets/tinytalks/scripts/generate_tinytalks.py")
-        return
-    
-    with open(dataset_path, 'r', encoding='utf-8') as f:
-        text = f.read()
-    
-    console.print(f"[green]✓[/green] Loaded dataset from: {os.path.basename(dataset_path)}")
-    console.print(f"    File size: {len(text)} characters")
-    
-    # Create dataset with level filtering
-    dataset = TinyTalksDataset(text, seq_length=args.seq_length, levels=levels)
-    
-    # Set test prompts based on levels
-    if levels and 1 in levels:
-        test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: Hi!"]
-    elif levels and 2 in levels:
-        test_prompts = ["Q: What color is the sky?", "Q: How many legs does a dog have?"]
-    elif levels and 3 in levels:
-        test_prompts = ["Q: What is 2 plus 3?", "Q: What is 5 minus 2?"]
-    else:
-        test_prompts = ["Q: Hello!", "Q: What is your name?", "Q: What color is the sky?"]
-    
-    # Initialize model
-    console.print("\n[bold]Initializing TinyGPT model...[/bold]")
-    model = TinyGPT(
-        vocab_size=dataset.tokenizer.vocab_size,
-        embed_dim=args.embed_dim,
-        num_layers=args.num_layers,
-        num_heads=args.num_heads,
-        max_seq_len=args.seq_length,
-        dropout=0.1
-    )
-    
-    # Initialize optimizer and loss
-    console.print("\n[bold]Initializing training components...[/bold]")
-    optimizer = Adam(model.parameters(), lr=args.lr)
-    criterion = CrossEntropyLoss()
-    console.print(f"[green]✓[/green] Optimizer: Adam (lr={args.lr})")
-    console.print(f"[green]✓[/green] Loss: CrossEntropyLoss")
-    
-    # Print configuration
-    table = Table(title="Training Configuration", box=box.ROUNDED)
-    table.add_column("Parameter", style="cyan")
-    table.add_column("Value", style="green")
-    
-    dataset_desc = f"TinyTalks Level(s) {levels}" if levels else "TinyTalks (All Levels)"
-    table.add_row("Dataset", dataset_desc)
-    table.add_row("Vocabulary Size", str(dataset.tokenizer.vocab_size))
-    table.add_row("Model Parameters", f"{model.count_parameters():,}")
-    table.add_row("Epochs", str(args.epochs))
-    table.add_row("Batch Size", str(args.batch_size))
-    table.add_row("Learning Rate", str(args.lr))
-    table.add_row("Sequence Length", str(args.seq_length))
-    table.add_row("Embedding Dim", str(args.embed_dim))
-    table.add_row("Layers", str(args.num_layers))
-    table.add_row("Attention Heads", str(args.num_heads))
-    table.add_row("Expected Time", "3-5 minutes")
-    
-    console.print(table)
-    
-    # Train model
-    train_tinytalks_gpt(
-        model=model,
-        dataset=dataset,
-        optimizer=optimizer,
-        criterion=criterion,
-        epochs=args.epochs,
-        batch_size=args.batch_size,
-        log_interval=50,
-        test_prompts=test_prompts
-    )
-    
-    # Demo Q&A
-    demo_questions(model, dataset.tokenizer)
-    
-    # Success message
-    console.print("\n[bold green]🎉 Congratulations![/bold green]")
-    console.print("You've successfully trained a transformer to answer questions!")
-    console.print("\nYou used:")
-    console.print("  ✓ YOUR Tensor implementation (Module 01)")
-    console.print("  ✓ YOUR Activations (Module 02)")
-    console.print("  ✓ YOUR Linear layers (Module 03)")
-    console.print("  ✓ YOUR CrossEntropyLoss (Module 04)")
-    console.print("  ✓ YOUR Autograd system (Module 05)")
-    console.print("  ✓ YOUR Adam optimizer (Module 06)")
-    console.print("  ✓ YOUR CharTokenizer (Module 10)")
-    console.print("  ✓ YOUR Embeddings (Module 11)")
-    console.print("  ✓ YOUR Multi-Head Attention (Module 12)")
-    console.print("  ✓ YOUR Transformer blocks (Module 13)")
-    console.print("\n[bold]This is the foundation of ChatGPT, built by YOU from scratch![/bold]")
-
-
-if __name__ == "__main__":
-    main()
-
--- a/milestones/05_2017_transformer/tinytalks_interactive.py
+++ b/milestones/05_2017_transformer/tinytalks_interactive.py
@@ -1,427 +0,0 @@
-"""
-TinyTalks Interactive Learning Dashboard
-=========================================
-
-Watch a chatbot learn in real-time!
-
-Students can see:
- Loss decreasing over time
- Responses improving from gibberish to coherent
- Learning progress at multiple checkpoints
- Interactive control (pause/continue)
-"""
-
-import sys
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-
-import numpy as np
-import time
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.autograd import enable_autograd
-from tinytorch.core.optimizers import Adam
-from tinytorch.core.losses import CrossEntropyLoss
-from tinytorch.models.transformer import GPT
-from tinytalks_dataset import create_tinytalks_dataset, get_dataset_stats
-
-enable_autograd()
-
-try:
-    from rich.console import Console
-    from rich.panel import Panel
-    from rich.table import Table
-    from rich.live import Live
-    from rich.layout import Layout
-    from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
-    RICH_AVAILABLE = True
-except ImportError:
-    RICH_AVAILABLE = False
-    print("Note: Install 'rich' for better visualization: pip install rich")
-
-# ============================================================================
-# Tokenization (copied from tinytalks_chatbot.py)
-# ============================================================================
-
-def create_tokenizer(conversations):
-    """Create character-level tokenizer with special tokens."""
-    all_text = ' '.join([q + ' ' + a for q, a in conversations])
-    all_chars = sorted(set(all_text))
-    
-    special_tokens = {
-        '<PAD>': 0,
-        '<SOS>': 1,
-        '<SEP>': 2,
-        '<EOS>': 3,
-    }
-    
-    char_to_idx = {**special_tokens}
-    idx_to_char = {v: k for k, v in special_tokens.items()}
-    
-    for idx, char in enumerate(all_chars, start=len(special_tokens)):
-        char_to_idx[char] = idx
-        idx_to_char[idx] = char
-    
-    return char_to_idx, idx_to_char
-
-
-def encode_conversation(question, answer, char_to_idx, max_len=80):
-    """Encode Q&A pair as: <SOS> question <SEP> answer <EOS> <PAD>..."""
-    tokens = [char_to_idx['<SOS>']]
-    
-    for c in question:
-        tokens.append(char_to_idx.get(c, 0))
-    
-    tokens.append(char_to_idx['<SEP>'])
-    
-    for c in answer:
-        tokens.append(char_to_idx.get(c, 0))
-    
-    tokens.append(char_to_idx['<EOS>'])
-    
-    if len(tokens) < max_len:
-        tokens = tokens + [char_to_idx['<PAD>']] * (max_len - len(tokens))
-    else:
-        tokens = tokens[:max_len]
-    
-    return tokens
-
-
-def decode_tokens(tokens, idx_to_char):
-    """Decode tokens to string."""
-    chars = []
-    for t in tokens:
-        if t == 0 or t == 1:  # PAD or SOS
-            continue
-        elif t == 2:  # SEP
-            continue
-        elif t == 3:  # EOS
-            break
-        else:
-            chars.append(idx_to_char.get(t, '?'))
-    return ''.join(chars)
-
-
-def generate_response(model, question, char_to_idx, idx_to_char, max_len=50):
-    """Generate response to a question."""
-    tokens = [char_to_idx['<SOS>']]
-    for c in question:
-        tokens.append(char_to_idx.get(c, 0))
-    tokens.append(char_to_idx['<SEP>'])
-    
-    generated_tokens = []
-    for _ in range(max_len):
-        input_tokens = tokens + generated_tokens
-        while len(input_tokens) < 80:
-            input_tokens.append(char_to_idx['<PAD>'])
-        input_tokens = input_tokens[:80]
-        
-        x = Tensor(np.array([input_tokens], dtype=np.int32), requires_grad=False)
-        logits = model.forward(x)
-        
-        next_pos = len(tokens) + len(generated_tokens) - 1
-        if next_pos < logits.shape[1]:
-            next_logits = logits.data[0, next_pos, :]
-            next_token = int(np.argmax(next_logits))
-            
-            if next_token == char_to_idx['<EOS>'] or next_token == char_to_idx['<PAD>']:
-                break
-            
-            generated_tokens.append(next_token)
-        else:
-            break
-    
-    response = decode_tokens(generated_tokens, idx_to_char)
-    return response
-
-
-# ============================================================================
-# Interactive Training with Checkpoints
-# ============================================================================
-
-def evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char):
-    """Evaluate model on test questions."""
-    results = []
-    for question in test_questions:
-        response = generate_response(model, question, char_to_idx, idx_to_char)
-        results.append((question, response))
-    return results
-
-
-def show_checkpoint_panel(checkpoint_num, step, loss, results, prev_results=None):
-    """Show checkpoint results in a nice panel."""
-    if RICH_AVAILABLE:
-        console = Console()
-        
-        # Header
-        console.print()
-        console.print("=" * 70, style="bold cyan")
-        console.print(f"CHECKPOINT {checkpoint_num} - Step {step:,} | Loss: {loss:.4f}", 
-                     style="bold yellow", justify="center")
-        console.print("=" * 70, style="bold cyan")
-        console.print()
-        
-        # Show responses
-        table = Table(show_header=True, header_style="bold magenta")
-        table.add_column("Question", style="cyan", width=25)
-        table.add_column("Response", style="green", width=35)
-        if prev_results:
-            table.add_column("Previous", style="dim", width=10)
-        
-        for i, (question, response) in enumerate(results):
-            if prev_results and i < len(prev_results):
-                prev_response = prev_results[i][1]
-                improved = "📈" if len(response) > len(prev_response) else "📉"
-                table.add_row(question, response, improved)
-            else:
-                table.add_row(question, response)
-        
-        console.print(table)
-        console.print()
-    else:
-        # Fallback to simple print
-        print()
-        print("=" * 70)
-        print(f"CHECKPOINT {checkpoint_num} - Step {step:,} | Loss: {loss:.4f}")
-        print("=" * 70)
-        print()
-        for question, response in results:
-            print(f"Q: {question}")
-            print(f"A: {response}")
-            print()
-
-
-def train_interactive(model, optimizer, loss_fn, train_data, test_questions, 
-                     char_to_idx, idx_to_char, max_time_minutes=15, 
-                     checkpoint_steps=1000, auto_continue_seconds=10):
-    """
-    Train with interactive checkpoints.
-    
-    Args:
-        checkpoint_steps: Pause every N steps to show results
-        auto_continue_seconds: Auto-continue after N seconds (0 = wait for ENTER)
-    """
-    max_time_seconds = max_time_minutes * 60
-    
-    print("=" * 70)
-    print(f"INTERACTIVE TRAINING - {max_time_minutes} MINUTES")
-    print("=" * 70)
-    print(f"Dataset: {len(train_data)} conversations")
-    print(f"Checkpoints: Every {checkpoint_steps} steps")
-    print(f"Auto-continue: {auto_continue_seconds}s (or press ENTER)")
-    print("=" * 70)
-    print()
-    print("Watch the model learn from gibberish to coherent responses!")
-    print()
-    
-    # Initial evaluation (before training)
-    print("Evaluating initial model (untrained)...")
-    initial_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
-    show_checkpoint_panel(0, 0, 999.9, initial_results)
-    
-    if auto_continue_seconds > 0:
-        print(f"Starting training in {auto_continue_seconds} seconds (or press ENTER)...")
-        time.sleep(auto_continue_seconds)
-    elif auto_continue_seconds == 0:
-        print("Starting training immediately...")
-        time.sleep(0.5)
-    else:
-        input("Press ENTER to start training...")
-    
-    print()
-    print("Training started...")
-    print()
-    
-    start_time = time.time()
-    losses = []
-    step = 0
-    checkpoint_num = 1
-    prev_results = initial_results
-    
-    next_checkpoint = checkpoint_steps
-    
-    while True:
-        elapsed = time.time() - start_time
-        if elapsed >= max_time_seconds:
-            break
-        
-        # Training step
-        tokens = train_data[np.random.randint(len(train_data))]
-        input_seq = tokens[:-1]
-        target_seq = tokens[1:]
-        
-        x = Tensor(np.array([input_seq], dtype=np.int32), requires_grad=False)
-        y_true = Tensor(np.array([target_seq], dtype=np.int32), requires_grad=False)
-        
-        logits = model.forward(x)
-        
-        batch_size, seq_len, vocab_size = logits.shape
-        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
-        targets_flat = y_true.reshape(batch_size * seq_len)
-        loss = loss_fn.forward(logits_flat, targets_flat)
-        
-        optimizer.zero_grad()
-        loss.backward()
-        
-        for param in model.parameters():
-            if param.grad is not None:
-                np.clip(param.grad, -1.0, 1.0, out=param.grad)
-        
-        optimizer.step()
-        
-        losses.append(loss.data.item())
-        step += 1
-        
-        # Show progress every 100 steps
-        if step % 100 == 0:
-            avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-            print(f"[{int(elapsed):4d}s] Step {step:5d} | Loss: {avg_loss:.4f}")
-        
-        # Checkpoint evaluation
-        if step >= next_checkpoint:
-            avg_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-            
-            print()
-            print(f"Evaluating at step {step}...")
-            current_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
-            
-            show_checkpoint_panel(checkpoint_num, step, avg_loss, current_results, prev_results)
-            
-            prev_results = current_results
-            checkpoint_num += 1
-            next_checkpoint += checkpoint_steps
-            
-            # Interactive pause
-            if auto_continue_seconds > 0:
-                print(f"Continuing in {auto_continue_seconds}s (or press ENTER)...")
-                time.sleep(auto_continue_seconds)
-            elif auto_continue_seconds == 0:
-                print("Continuing immediately...")
-                time.sleep(0.5)
-            else:
-                input("Press ENTER to continue training...")
-            
-            print()
-            print("Training resumed...")
-            print()
-    
-    # Final results
-    final_elapsed = time.time() - start_time
-    final_loss = np.mean(losses[-100:]) if len(losses) >= 100 else np.mean(losses)
-    initial_loss = np.mean(losses[:10])
-    improvement = (1 - final_loss / initial_loss) * 100
-    
-    print()
-    print("=" * 70)
-    print("TRAINING COMPLETE!")
-    print("=" * 70)
-    print(f"Total time: {final_elapsed:.1f}s ({final_elapsed/60:.1f} minutes)")
-    print(f"Total steps: {step:,}")
-    print(f"Initial loss: {initial_loss:.4f}")
-    print(f"Final loss: {final_loss:.4f}")
-    print(f"Improvement: {improvement:.1f}%")
-    print()
-    
-    # Final evaluation
-    print("Final evaluation...")
-    final_results = evaluate_at_checkpoint(model, test_questions, char_to_idx, idx_to_char)
-    show_checkpoint_panel("FINAL", step, final_loss, final_results, prev_results)
-    
-    return losses, step
-
-
-# ============================================================================
-# Main
-# ============================================================================
-
-def main():
-    print()
-    print("=" * 70)
-    print("TINYTALKS INTERACTIVE LEARNING DASHBOARD")
-    print("=" * 70)
-    print()
-    print("Watch a transformer learn to chat in real-time!")
-    print("You'll see responses improve from gibberish to coherent answers.")
-    print()
-    
-    # Dataset
-    conversations = create_tinytalks_dataset()
-    stats = get_dataset_stats()
-    
-    print(f"Dataset: {stats['total_examples']} examples ({stats['unique_examples']} unique)")
-    print()
-    
-    # Tokenizer
-    char_to_idx, idx_to_char = create_tokenizer(conversations)
-    vocab_size = len(idx_to_char)
-    
-    # Encode
-    max_seq_len = 80
-    train_data = [encode_conversation(q, a, char_to_idx, max_seq_len) for q, a in conversations]
-    
-    # Test questions for checkpoints
-    test_questions = [
-        "Hi",
-        "How are you",
-        "What is your name",
-        "What is the sky",
-        "Is grass green",
-    ]
-    
-    # Model: Ultra-tiny for speed
-    config = {
-        'vocab_size': vocab_size,
-        'embed_dim': 16,
-        'num_layers': 1,
-        'num_heads': 2,
-        'max_seq_len': max_seq_len,
-    }
-    
-    model = GPT(**config)
-    num_params = sum(np.prod(p.shape) for p in model.parameters())
-    print(f"Model: {num_params:,} parameters")
-    print()
-    
-    # Optimizer
-    optimizer = Adam(model.parameters(), lr=0.001)
-    loss_fn = CrossEntropyLoss()
-    
-    # Settings
-    train_time = 5  # minutes (shorter for demo)
-    checkpoint_steps = 1000  # Evaluate every 1000 steps (~1-2 minutes)
-    auto_continue = 0  # Auto-continue immediately (0 = no wait for demo)
-    
-    print(f"Training for {train_time} minutes")
-    print(f"Checkpoints every {checkpoint_steps} steps")
-    print()
-    
-    # Train with interactive checkpoints
-    losses, total_steps = train_interactive(
-        model=model,
-        optimizer=optimizer,
-        loss_fn=loss_fn,
-        train_data=train_data,
-        test_questions=test_questions,
-        char_to_idx=char_to_idx,
-        idx_to_char=idx_to_char,
-        max_time_minutes=train_time,
-        checkpoint_steps=checkpoint_steps,
-        auto_continue_seconds=auto_continue
-    )
-    
-    print()
-    print("=" * 70)
-    print("DEMO COMPLETE!")
-    print("=" * 70)
-    print()
-    print("You just watched a transformer learn from scratch!")
-    print(f"✓ {total_steps:,} training steps")
-    print(f"✓ {len(losses)} loss values")
-    print(f"✓ {(1 - np.mean(losses[-100:])/np.mean(losses[:10]))*100:.1f}% improvement")
-    print()
-    print("Key takeaway: Loss decrease = Better responses!")
-    print()
-
-
-if __name__ == "__main__":
-    main()
-
--- a/milestones/05_2017_transformer/train_monitored.py
+++ b/milestones/05_2017_transformer/train_monitored.py
@@ -1,336 +0,0 @@
-#!/usr/bin/env python3
-"""
-Monitored Training Script for TinyTalks
-========================================
-
-Features:
- Early stopping if loss doesn't improve
- Continuous progress monitoring
- Automatic experiment termination for bad runs
- Clear feedback on learning progress
-
-Usage:
-    python train_monitored.py --mode test    # 10 epochs, quick validation
-    python train_monitored.py --mode full    # 30 epochs, full training
-"""
-
-import sys
-import os
-import argparse
-import time
-import numpy as np
-from pathlib import Path
-
-# Add project root to path
-project_root = Path(__file__).parent.parent.parent
-sys.path.insert(0, str(project_root))
-
-from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn
-from rich.table import Table
-from rich import box
-
-# Import TinyTorch components
-from tinytorch.core.tensor import Tensor
-from tinytorch.core.autograd import enable_autograd
-from tinytorch.core.losses import CrossEntropyLoss
-from tinytorch.core.optimizers import Adam
-from tinytorch.text.tokenization import CharTokenizer
-
-console = Console()
-
-# Import TinyGPT and dataset classes
-exec(open(project_root / "milestones/05_2017_transformer/tinytalks_gpt.py").read())
-
-
-class TrainingMonitor:
-    """Monitor training progress and implement early stopping"""
-    
-    def __init__(self, patience=5, min_delta=0.01):
-        """
-        Args:
-            patience: Number of checks without improvement before stopping
-            min_delta: Minimum change in loss to count as improvement
-        """
-        self.patience = patience
-        self.min_delta = min_delta
-        self.best_loss = float('inf')
-        self.checks_without_improvement = 0
-        self.losses = []
-        
-    def check(self, current_loss):
-        """
-        Check if training should continue
-        
-        Returns:
-            (should_continue, message)
-        """
-        self.losses.append(current_loss)
-        
-        # Calculate improvement
-        improvement = self.best_loss - current_loss
-        
-        if improvement > self.min_delta:
-            # Significant improvement
-            self.best_loss = current_loss
-            self.checks_without_improvement = 0
-            return True, f"✓ Loss improved by {improvement:.4f}"
-        else:
-            # No significant improvement
-            self.checks_without_improvement += 1
-            
-            if self.checks_without_improvement >= self.patience:
-                return False, f"✗ No improvement for {self.patience} checks. Stopping."
-            else:
-                return True, f"⚠ No improvement ({self.checks_without_improvement}/{self.patience})"
-    
-    def summary(self):
-        """Get training summary"""
-        if len(self.losses) < 2:
-            return "Not enough data"
-        
-        initial = self.losses[0]
-        final = self.losses[-1]
-        best = min(self.losses)
-        decrease = initial - final
-        decrease_pct = (decrease / initial) * 100 if initial > 0 else 0
-        
-        return {
-            'initial_loss': initial,
-            'final_loss': final,
-            'best_loss': best,
-            'total_decrease': decrease,
-            'decrease_percent': decrease_pct,
-            'num_checks': len(self.losses)
-        }
-
-
-def train_with_monitoring(model, dataset, optimizer, criterion, config, monitor):
-    """
-    Train with continuous monitoring and early stopping
-    
-    Args:
-        model: TinyGPT model
-        dataset: TinyTalksDataset
-        optimizer: Adam optimizer
-        criterion: CrossEntropyLoss
-        config: Training configuration dict
-        monitor: TrainingMonitor instance
-    
-    Returns:
-        success: True if training completed successfully
-    """
-    epochs = config['epochs']
-    batch_size = config['batch_size']
-    check_interval = config.get('check_interval', 50)  # Check every N batches
-    
-    console.print(f"\n[bold cyan]Starting Training with Monitoring[/bold cyan]")
-    console.print(f"  Check interval: Every {check_interval} batches")
-    console.print(f"  Early stopping: {monitor.patience} checks without improvement\n")
-    
-    total_batches_processed = 0
-    start_time = time.time()
-    
-    for epoch in range(epochs):
-        epoch_start = time.time()
-        epoch_loss = 0.0
-        batch_count = 0
-        
-        console.print(f"[bold]Epoch {epoch+1}/{epochs}[/bold]")
-        
-        # Create batches
-        num_sequences = len(dataset)
-        indices = np.random.permutation(num_sequences)
-        
-        for batch_start in range(0, num_sequences, batch_size):
-            batch_end = min(batch_start + batch_size, num_sequences)
-            batch_indices = indices[batch_start:batch_end]
-            
-            # Get batch data
-            batch_inputs = []
-            batch_targets = []
-            for idx in batch_indices:
-                input_seq, target_seq = dataset[idx]
-                batch_inputs.append(input_seq)
-                batch_targets.append(target_seq)
-            
-            # Convert to tensors
-            batch_input = Tensor(np.array(batch_inputs))
-            batch_target = Tensor(np.array(batch_targets))
-            
-            # Forward pass
-            logits = model.forward(batch_input)
-            
-            # Reshape for loss
-            batch_size_actual, seq_length, vocab_size = logits.shape
-            logits_2d = logits.reshape(batch_size_actual * seq_length, vocab_size)
-            targets_1d = batch_target.reshape(-1)
-            
-            # Compute loss
-            loss = criterion.forward(logits_2d, targets_1d)
-            
-            # Backward and optimize
-            loss.backward()
-            optimizer.step()
-            optimizer.zero_grad()
-            
-            # Track loss
-            loss_value = float(loss.data)
-            epoch_loss += loss_value
-            batch_count += 1
-            total_batches_processed += 1
-            
-            # Monitor progress at check intervals
-            if total_batches_processed % check_interval == 0:
-                avg_loss = epoch_loss / batch_count
-                should_continue, message = monitor.check(avg_loss)
-                
-                elapsed = time.time() - start_time
-                console.print(f"  Batch {total_batches_processed} | Loss: {avg_loss:.4f} | {message} | Time: {elapsed:.1f}s")
-                
-                if not should_continue:
-                    console.print(f"\n[yellow]Early stopping triggered at epoch {epoch+1}, batch {batch_count}[/yellow]")
-                    return False
-        
-        # Epoch summary
-        avg_epoch_loss = epoch_loss / batch_count
-        epoch_time = time.time() - epoch_start
-        console.print(f"  → Epoch {epoch+1} complete: Avg Loss = {avg_epoch_loss:.4f} | Time: {epoch_time:.1f}s\n")
-    
-    console.print(f"[green]✓ Training completed successfully![/green]\n")
-    return True
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Monitored TinyTalks Training')
-    parser.add_argument('--mode', choices=['test', 'full'], default='test',
-                       help='Training mode: test (10 epochs) or full (30 epochs)')
-    parser.add_argument('--patience', type=int, default=5,
-                       help='Early stopping patience (checks without improvement)')
-    parser.add_argument('--min-delta', type=float, default=0.01,
-                       help='Minimum loss decrease to count as improvement')
-    parser.add_argument('--check-interval', type=int, default=50,
-                       help='Check progress every N batches')
-    
-    args = parser.parse_args()
-    
-    # Enable autograd
-    enable_autograd()
-    
-    # Configuration based on mode
-    if args.mode == 'test':
-        config = {
-            'epochs': 10,
-            'batch_size': 32,
-            'lr': 0.001,
-            'embed_dim': 128,
-            'num_layers': 6,
-            'num_heads': 8,
-            'check_interval': args.check_interval,
-            'mode': 'TEST (Quick Validation)'
-        }
-    else:  # full
-        config = {
-            'epochs': 30,
-            'batch_size': 32,
-            'lr': 0.001,
-            'embed_dim': 128,
-            'num_layers': 6,
-            'num_heads': 8,
-            'check_interval': args.check_interval,
-            'mode': 'FULL (Complete Training)'
-        }
-    
-    # Display configuration
-    console.print("\n[bold cyan]═══════════════════════════════════════════════════[/bold cyan]")
-    console.print("[bold cyan]    Monitored TinyTalks Training - Option C       [/bold cyan]")
-    console.print("[bold cyan]═══════════════════════════════════════════════════[/bold cyan]\n")
-    
-    table = Table(box=box.ROUNDED)
-    table.add_column("Parameter", style="cyan")
-    table.add_column("Value", style="yellow")
-    
-    table.add_row("Mode", config['mode'])
-    table.add_row("Epochs", str(config['epochs']))
-    table.add_row("Batch Size", str(config['batch_size']))
-    table.add_row("Learning Rate", str(config['lr']))
-    table.add_row("Model Size", f"{config['embed_dim']}d, {config['num_layers']}L, {config['num_heads']}H")
-    table.add_row("Early Stopping Patience", str(args.patience))
-    table.add_row("Min Delta", str(args.min_delta))
-    table.add_row("Check Interval", f"Every {args.check_interval} batches")
-    
-    console.print(table)
-    console.print()
-    
-    # Load dataset
-    console.print("[bold]Loading TinyTalks dataset...[/bold]")
-    dataset_path = project_root / "datasets/tinytalks/splits/train.txt"
-    with open(dataset_path, 'r') as f:
-        text = f.read()
-    
-    dataset = TinyTalksDataset(text, seq_length=64)
-    console.print(f"  ✓ Loaded: {len(text):,} chars, {dataset.tokenizer.vocab_size} vocab\n")
-    
-    # Initialize model
-    console.print("[bold]Initializing model...[/bold]")
-    model = TinyGPT(
-        vocab_size=dataset.tokenizer.vocab_size,
-        embed_dim=config['embed_dim'],
-        num_layers=config['num_layers'],
-        num_heads=config['num_heads'],
-        max_seq_len=64
-    )
-    
-    params = model.parameters()
-    param_count = sum(p.data.size for p in params)
-    console.print(f"  ✓ Model initialized: {param_count:,} parameters\n")
-    
-    # Initialize training components
-    optimizer = Adam(params, lr=config['lr'])
-    criterion = CrossEntropyLoss()
-    monitor = TrainingMonitor(patience=args.patience, min_delta=args.min_delta)
-    
-    # Train
-    console.print("[bold]Starting training...[/bold]\n")
-    start_time = time.time()
-    
-    success = train_with_monitoring(model, dataset, optimizer, criterion, config, monitor)
-    
-    total_time = time.time() - start_time
-    
-    # Summary
-    console.print("\n[bold cyan]═══════════════════════════════════════════════════[/bold cyan]")
-    console.print("[bold cyan]              Training Summary                     [/bold cyan]")
-    console.print("[bold cyan]═══════════════════════════════════════════════════[/bold cyan]\n")
-    
-    summary = monitor.summary()
-    
-    result_table = Table(box=box.ROUNDED)
-    result_table.add_column("Metric", style="cyan")
-    result_table.add_column("Value", style="yellow")
-    
-    result_table.add_row("Status", "✓ SUCCESS" if success else "⚠ EARLY STOP")
-    result_table.add_row("Total Time", f"{total_time/60:.1f} minutes")
-    result_table.add_row("Initial Loss", f"{summary['initial_loss']:.4f}")
-    result_table.add_row("Final Loss", f"{summary['final_loss']:.4f}")
-    result_table.add_row("Best Loss", f"{summary['best_loss']:.4f}")
-    result_table.add_row("Total Decrease", f"{summary['total_decrease']:.4f} ({summary['decrease_percent']:.1f}%)")
-    result_table.add_row("Checks Performed", str(summary['num_checks']))
-    
-    console.print(result_table)
-    console.print()
-    
-    # Recommendation
-    if success and summary['decrease_percent'] > 50:
-        console.print("[bold green]✓ EXCELLENT: Model is learning well! Continue with full training.[/bold green]")
-    elif success and summary['decrease_percent'] > 20:
-        console.print("[bold yellow]⚠ MODERATE: Model is learning but slowly. Consider tuning hyperparameters.[/bold yellow]")
-    elif success:
-        console.print("[bold red]✗ POOR: Model not learning effectively. Needs hyperparameter adjustment.[/bold red]")
-    else:
-        console.print("[bold red]✗ FAILED: Training stopped early. Try different hyperparameters.[/bold red]")
-
-
-if __name__ == "__main__":
-    main()
-