diff --git a/milestones/05_2017_transformer/vaswani_shakespeare.py b/milestones/05_2017_transformer/vaswani_shakespeare.py
index 2903c0e6..199e09bb 100644
--- a/milestones/05_2017_transformer/vaswani_shakespeare.py
+++ b/milestones/05_2017_transformer/vaswani_shakespeare.py
@@ -80,11 +80,17 @@ import os
 import numpy as np
 import argparse
 import time
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from rich import box
 
 # Add project root to path
 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(project_root)
 
+console = Console()
+
 # Import TinyTorch components YOU BUILT!
 from tinytorch.core.tensor import Tensor                    # Module 02: YOU built this!
 from tinytorch.core.layers import Linear                    # Module 04: YOU built this!
@@ -157,8 +163,6 @@ class TinyGPT:
     """
     
     def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers):
-        print("🧠 Building TinyGPT with YOUR TinyTorch modules...")
-        
         # Token representation
         self.embedding = Embedding(vocab_size, embed_dim)           # Module 11!
         self.pos_encoding = PositionalEncoding(max_length, embed_dim)  # Module 11!
@@ -176,13 +180,11 @@ class TinyGPT:
         
         self.vocab_size = vocab_size
         self.embed_dim = embed_dim
+        self.num_layers = num_layers
+        self.num_heads = num_heads
         
         # Calculate parameters
         self.total_params = self._count_parameters()
-        
-        print(f"   Architecture: {num_layers} layers, {num_heads} heads, {embed_dim}-dim embeddings")
-        print(f"   Vocabulary: {vocab_size} characters")
-        print(f"   Total parameters: {self.total_params:,} (YOUR components!)")
     
     def _count_parameters(self):
         """Count total parameters in model."""
@@ -240,11 +242,25 @@ class TinyGPT:
 
 def visualize_transformer():
     """Show how transformers process text sequences."""
-    print("\n" + "="*70)
-    print("🤖 VISUALIZING TRANSFORMER TEXT GENERATION:")
-    print("="*70)
+    console.print("")
+    console.print(Panel.fit(
+        "[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n"
+        "[yellow]The Problem:[/yellow]\n"
+        "RNNs process sequences one step at a time\n"
+        "Can't parallelize → slow training on long sequences\n"
+        "Struggle with long-range dependencies\n\n"
+        "[green]The Innovation:[/green]\n"
+        "Transformers: Attention mechanisms process ENTIRE sequences in parallel\n"
+        "  • Self-attention: Every token attends to every other token\n"
+        "  • Multi-head attention: Learn multiple attention patterns\n"
+        "  • Positional encoding: Preserve sequence order\n\n"
+        "[bold]Can attention alone match RNN performance?[/bold]",
+        title="🎯 ACT 1: THE CHALLENGE",
+        border_style="cyan",
+        box=box.DOUBLE
+    ))
     
-    print("""
+    console.print("""
     How YOUR Transformer Sees Text:      What It Learns:
     
     Input: "To be or not to be"          Layer 1 (Attention):
@@ -282,17 +298,17 @@ def visualize_transformer():
 
 def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.001):
     """Train TinyGPT using YOUR complete training system with DataLoader!"""
-    print("\n🚀 Training Shakespeare TinyGPT with YOUR TinyTorch!")
-    print(f"   Dataset: {len(train_loader.dataset):,} character sequences")
-    print(f"   Batch size: {train_loader.batch_size}")
-    print(f"   YOUR DataLoader (Module 08) handles batching!")
-    print(f"   YOUR Adam optimizer (Module 08)")
+    console.print("\n[bold]🚀 Training Shakespeare TinyGPT with YOUR TinyTorch![/bold]")
+    console.print(f"  Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences")
+    console.print(f"  Batch size: [cyan]{train_loader.batch_size}[/cyan]")
+    console.print("  YOUR DataLoader (Module 08) handles batching!")
+    console.print("  YOUR Adam optimizer (Module 08)")
     
     # YOUR optimizer
     optimizer = Adam(model.parameters(), learning_rate=learning_rate)
     
     for epoch in range(epochs):
-        print(f"\n   Epoch {epoch+1}/{epochs}:")
+        console.print(f"\n  [bold]Epoch {epoch+1}/{epochs}:[/bold]")
         epoch_loss = 0
         batch_count = 0
         
@@ -333,11 +349,11 @@ def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=
             
             # Progress
             if (batch_idx + 1) % 20 == 0:
-                print(f"   Batch {batch_idx+1}: Loss = {loss_value:.4f}")
+                console.print(f"    Batch {batch_idx+1}: Loss = [cyan]{loss_value:.4f}[/cyan]")
         
         # Epoch summary
         avg_loss = epoch_loss / max(1, batch_count)
-        print(f"   → Epoch Complete: Avg Loss = {avg_loss:.4f} (YOUR Transformer learning!)")
+        console.print(f"    → Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)")
     
     return model
 
@@ -348,15 +364,15 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera
     
     This is autoregressive generation: predict next char, add it, repeat.
     """
-    print("\n✨ TEXT GENERATION DEMO - THE PAYOFF!")
-    print("="*70)
+    console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]")
+    console.print("="*70)
     
     # Convert prompt to indices
     prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx]
     generated = prompt_indices.copy()
     
-    print(f"📝 Prompt: \"{prompt}\"")
-    print(f"🎯 Generating {max_length} characters...\n")
+    console.print(f"📝 Prompt: [cyan]\"{prompt}\"[/cyan]")
+    console.print(f"🎯 Generating [cyan]{max_length}[/cyan] characters...\n")
     
     # Generate character by character
     for _ in range(max_length):
@@ -387,45 +403,50 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera
     # Decode to text
     generated_text = dataset.decode(generated)
     
-    print("📖 Generated Text:")
-    print("─" * 70)
-    print(generated_text)
-    print("─" * 70)
+    console.print("[bold]📖 Generated Text:[/bold]")
+    console.print("─" * 70)
+    console.print(generated_text, style="green", markup=False, emoji=False)  # model output is data, not Rich markup: print it verbatim
+    console.print("─" * 70)
     
     return generated_text
 
 
 def analyze_transformer_systems(model):
     """Analyze YOUR Transformer from an ML systems perspective."""
-    print("\n🔬 SYSTEMS ANALYSIS of YOUR Transformer Implementation:")
-    
-    print(f"\n   Model Architecture:")
-    print(f"   • Parameters: {model.total_params:,} weights")
-    print(f"   • Embedding dim: {model.embed_dim}")
-    print(f"   • Vocabulary: {model.vocab_size} characters")
-    
-    print(f"\n   Computational Complexity:")
-    print(f"   • Attention: O(n²·d) where n=sequence, d=dimension")
-    print(f"   • Self-attention allows parallel processing (vs RNN sequential)")
-    print(f"   • YOUR implementation: Pure Python + NumPy")
-    
-    print(f"\n   Memory Requirements:")
-    print(f"   • Parameters: {model.total_params * 4 / 1024:.1f} KB")
-    print(f"   • Attention matrices: O(n²) per layer")
-    print(f"   • YOUR TinyTorch tracks gradients automatically")
-    
-    print(f"\n   🏛️ Transformer Evolution:")
-    print(f"   • 2017: Vaswani et al. 'Attention Is All You Need'")
-    print(f"   • 2018: BERT (bidirectional), GPT (autoregressive)")
-    print(f"   • 2020: GPT-3 (175B params, same architecture!)")
-    print(f"   • 2022: ChatGPT (YOUR architecture at massive scale)")
-    print(f"   • YOUR TinyGPT: Core principles that power them all!")
-    
-    print(f"\n   💡 Why Transformers Dominate:")
-    print(f"   • Parallelizable (vs sequential RNNs)")
-    print(f"   • Long-range dependencies (attention sees everything)")
-    print(f"   • Scalable (architecture works from 1M to 175B params)")
-    print(f"   • YOUR implementation demonstrates all of these!")
+    console.print("")
+    console.print(Panel.fit(
+        f"[bold]Model Architecture:[/bold]\n"
+        f"  • Parameters: [cyan]{model.total_params:,}[/cyan] weights\n"
+        f"  • Embedding dim: [cyan]{model.embed_dim}[/cyan]\n"
+        f"  • Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n"
+        
+        "[bold]Computational Complexity:[/bold]\n"
+        "  • Attention: O(n²·d) where n=sequence, d=dimension\n"
+        "  • Self-attention allows parallel processing (vs RNN sequential)\n"
+        "  • YOUR implementation: Pure Python + NumPy\n\n"
+        
+        f"[bold]Memory Requirements:[/bold]\n"
+        f"  • Parameters: [cyan]{model.total_params * 4 / 1024:.1f} KB[/cyan]\n"
+        "  • Attention matrices: O(n²) per layer\n"
+        "  • YOUR TinyTorch tracks gradients automatically\n\n"
+        
+        "[bold]🏛️ Transformer Evolution:[/bold]\n"
+        "  • 2017: Vaswani et al. 'Attention Is All You Need'\n"
+        "  • 2018: BERT (bidirectional), GPT (autoregressive)\n"
+        "  • 2020: GPT-3 (175B params, same architecture!)\n"
+        "  • 2022: ChatGPT (YOUR architecture at massive scale)\n"
+        "  • YOUR TinyGPT: Core principles that power them all!\n\n"
+        
+        "[bold]💡 Why Transformers Dominate:[/bold]\n"
+        "  • Parallelizable (vs sequential RNNs)\n"
+        "  • Long-range dependencies (attention sees everything)\n"
+        "  • Scalable (architecture works from 1M to 175B params)\n"
+        "  • YOUR implementation demonstrates all of these!",
+        
+        title="🔬 SYSTEMS ANALYSIS",
+        border_style="cyan",
+        box=box.DOUBLE
+    ))
 
 
 def main():
@@ -452,17 +473,23 @@ def main():
                        help='Use small subset for testing')
     args = parser.parse_args()
     
-    print("🎯 Shakespeare Transformer - Text Generation with YOUR Attention!")
-    print("   Historical significance: Attention revolutionized sequence modeling")
-    print("   YOUR achievement: Generate Shakespeare-style text")
-    print("   Components used: YOUR complete transformer system (Modules 2-13)")
+    console.print("")
+    console.print(Panel.fit(
+        "[bold cyan]Shakespeare Transformer - Text Generation with YOUR Attention![/bold cyan]\n\n"
+        "[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n"
+        "[green]YOUR achievement:[/green] Generate Shakespeare-style text\n"
+        "[cyan]Components used:[/cyan] YOUR complete transformer system (Modules 2-13)",
+        title="🎯 Milestone 05: Transformer Era (2017)",
+        border_style="cyan",
+        box=box.DOUBLE
+    ))
     
     # Visualization
     if args.visualize:
         visualize_transformer()
     
     # Step 1: Load Shakespeare dataset
-    print("\n📥 Loading Shakespeare corpus...")
+    console.print("\n[bold]📥 Loading Shakespeare corpus...[/bold]")
     data_manager = DatasetManager()
     
     try:
@@ -470,23 +497,23 @@ def main():
         
         if args.quick_test:
             text = text[:10000]  # Use small subset for testing
-            print("   (Using subset for quick testing)")
+            console.print("  [dim](Using subset for quick testing)[/dim]")
             
     except Exception as e:
-        print(f"⚠️  Shakespeare download failed: {e}")
-        print("   Using synthetic text for demonstration...")
+        console.print(f"⚠️  Shakespeare download failed: {e}", style="yellow", markup=False)  # exception text may contain '[...]'; do not parse it as markup
+        console.print("  [dim]Using synthetic text for demonstration...[/dim]")
         text = "To be or not to be, that is the question. " * 100
     
     # Step 2: Create Dataset and DataLoader using YOUR Module 08!
-    print(f"\n📦 Creating YOUR Dataset and DataLoader (Module 08)...")
+    console.print("\n[bold]📦 Creating YOUR Dataset and DataLoader (Module 08)...[/bold]")
     dataset = ShakespeareDataset(text, seq_length=args.seq_length)
     
     # YOUR DataLoader handles batching and shuffling!
     train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
     
-    print(f"   Vocabulary: {dataset.vocab_size} unique characters")
-    print(f"   Characters: '{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'")
-    print(f"   DataLoader: {len(dataset):,} sequences, batch_size={args.batch_size}")
+    console.print(f"  Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters")
+    console.print(f"  Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]")
+    console.print(f"  DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]")
     
     # Step 3: Build Transformer
     model = TinyGPT(
@@ -497,13 +524,19 @@ def main():
         num_layers=args.num_layers
     )
     
+    # Display model info
+    console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]")
+    console.print(f"  Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings")
+    console.print(f"  Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters")
+    console.print(f"  Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)")
+    
     if args.test_only:
-        print("\n🧪 ARCHITECTURE TEST MODE")
+        console.print("\n[bold yellow]🧪 ARCHITECTURE TEST MODE[/bold yellow]")
         # Test with minimal data
         test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32))
         test_output = model.forward(test_input)
-        print(f"✅ Forward pass successful! Output shape: {test_output.data.shape}")
-        print("✅ YOUR Transformer + DataLoader work together!")
+        console.print(f"[green]✅ Forward pass successful! Output shape: {test_output.data.shape}[/green]")
+        console.print("[green]✅ YOUR Transformer + DataLoader work together![/green]")
         return
     
     # Step 4: Train using YOUR DataLoader
@@ -515,33 +548,41 @@ def main():
     generated = generate_text(model, dataset, prompt="To be or not", max_length=200)
     
     # Additional generation examples
-    print("\n🎭 More Generation Examples:")
-    print("─" * 70)
+    console.print("\n[bold]🎭 More Generation Examples:[/bold]")
+    console.print("─" * 70)
     
     prompts = ["ROMEO:", "The king", "What is"]
     for prompt in prompts:
         if all(ch in dataset.char_to_idx for ch in prompt):
-            print(f"\nPrompt: \"{prompt}\"")
+            console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]")
             gen = generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8)
     
     # Step 6: Systems Analysis
     analyze_transformer_systems(model)
     
-    print(f"\n⏱️  Training time: {train_time:.1f} seconds")
-    print(f"   Sequences/sec: {len(dataset) * args.epochs / train_time:.0f}")
+    console.print(f"\n[bold]⏱️  Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds")
+    console.print(f"  Sequences/sec: [cyan]{len(dataset) * args.epochs / train_time:.0f}[/cyan]")
     
-    print("\n✅ SUCCESS! Shakespeare Transformer Milestone Complete!")
-    print("\n🎓 What YOU Accomplished:")
-    print("   • YOUR attention mechanism processes sequences in parallel")
-    print("   • YOUR transformer captures long-range text dependencies")
-    print("   • YOUR DataLoader efficiently batches character sequences")
-    print("   • YOUR TinyGPT generates coherent text!")
-    print("   • YOUR complete language modeling system works!")
-    
-    print("\n🚀 Next Steps:")
-    print("   • Continue to Module 14 (KV-Caching) for 3x faster inference")
-    print("   • YOUR transformer architecture scales to GPT-scale models")
-    print("   • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!")
+    console.print("")
+    console.print(Panel.fit(
+        "[bold green]✅ SUCCESS! Shakespeare Transformer Milestone Complete![/bold green]\n\n"
+        
+        "[bold]🎓 What YOU Accomplished:[/bold]\n"
+        "  • YOUR attention mechanism processes sequences in parallel\n"
+        "  • YOUR transformer captures long-range text dependencies\n"
+        "  • YOUR DataLoader efficiently batches character sequences\n"
+        "  • YOUR TinyGPT generates coherent text!\n"
+        "  • YOUR complete language modeling system works!\n\n"
+        
+        "[bold]🚀 Next Steps:[/bold]\n"
+        "  • Continue to Module 14 (KV-Caching) for 3x faster inference\n"
+        "  • YOUR transformer architecture scales to GPT-scale models\n"
+        "  • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!",
+        
+        title="🌟 2017 Transformer Revolution Complete",
+        border_style="green",
+        box=box.DOUBLE
+    ))
 
 if __name__ == "__main__":
     main()