🎨 Add Rich CLI formatting to transformer milestone 05

Updates to vaswani_shakespeare.py:
- Add Rich console, Panel, Table, and box imports
- Replace all print() statements with console.print() with Rich markup
- Add beautiful Panel.fit() boxes for major sections (Act 1, Systems Analysis, Success)
- Use Rich color tags: [bold], [cyan], [green], [yellow], [dim]
- Format training progress with colored loss values
- Display generated text in green
- Add architectural visualization with Rich panels
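
For reference, a minimal sketch of the Rich pattern these bullets describe (console markup plus Panel.fit() boxes), with illustrative text and values rather than lines copied from the milestone script:

```python
from rich.console import Console
from rich.panel import Panel
from rich import box

console = Console()

# Inline markup tags replace bare print() calls.
loss_value = 1.2345
console.print(f"  Batch 20: Loss = [cyan]{loss_value:.4f}[/cyan]")

# Panel.fit() draws a bordered box sized to its content, used for the
# major sections (Act 1, Systems Analysis, Success).
console.print(Panel.fit(
    "[bold]Attention processes entire sequences in parallel[/bold]\n"
    "[dim]Self-attention, multi-head attention, positional encoding[/dim]",
    title="🎯 ACT 1: THE CHALLENGE",
    border_style="cyan",
    box=box.DOUBLE,
))
```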

Updates to transformers_dev.py:
- Remove all try/except fallback implementations
- Clean imports only (no development scaffolding)
- Use proper module imports from tinytorch package
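
For transformers_dev.py, a small before/after sketch of the import cleanup; the try/except fallback is an assumed shape of the removed scaffolding, while the direct imports match those shown in the diff below.

```python
# Assumed shape of the removed development scaffolding:
# try:
#     from tinytorch.core.tensor import Tensor
# except ImportError:
#     from tensor_dev import Tensor  # hypothetical local fallback used during development

# After this commit, only the packaged modules are imported directly:
from tinytorch.core.tensor import Tensor   # Module 02
from tinytorch.core.layers import Linear   # Module 04
```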

Milestone now matches the beautiful CLI pattern from cnn_digits.py
Author: Vijay Janapa Reddi
Date: 2025-10-27 16:50:40 -04:00
parent 4f9c352e9d
commit de826e0b9d


@@ -80,11 +80,17 @@ import os
import numpy as np
import argparse
import time
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich import box
# Add project root to path
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_root)
console = Console()
# Import TinyTorch components YOU BUILT!
from tinytorch.core.tensor import Tensor # Module 02: YOU built this!
from tinytorch.core.layers import Linear # Module 04: YOU built this!
@@ -157,8 +163,6 @@ class TinyGPT:
"""
def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers):
print("🧠 Building TinyGPT with YOUR TinyTorch modules...")
# Token representation
self.embedding = Embedding(vocab_size, embed_dim) # Module 11!
self.pos_encoding = PositionalEncoding(max_length, embed_dim) # Module 11!
@@ -176,13 +180,11 @@ class TinyGPT:
self.vocab_size = vocab_size
self.embed_dim = embed_dim
self.num_layers = num_layers
self.num_heads = num_heads
# Calculate parameters
self.total_params = self._count_parameters()
print(f" Architecture: {num_layers} layers, {num_heads} heads, {embed_dim}-dim embeddings")
print(f" Vocabulary: {vocab_size} characters")
print(f" Total parameters: {self.total_params:,} (YOUR components!)")
def _count_parameters(self):
"""Count total parameters in model."""
@@ -240,11 +242,25 @@ class TinyGPT:
def visualize_transformer():
"""Show how transformers process text sequences."""
print("\n" + "="*70)
print("🤖 VISUALIZING TRANSFORMER TEXT GENERATION:")
print("="*70)
console.print("")
console.print(Panel.fit(
"[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n"
"[yellow]The Problem:[/yellow]\n"
"RNNs process sequences one step at a time\n"
"Can't parallelize → slow training on long sequences\n"
"Struggle with long-range dependencies\n\n"
"[green]The Innovation:[/green]\n"
"Transformers: Attention mechanisms process ENTIRE sequences in parallel\n"
" • Self-attention: Every token attends to every other token\n"
" • Multi-head attention: Learn multiple attention patterns\n"
" • Positional encoding: Preserve sequence order\n\n"
"[bold]Can attention alone match RNN performance?[/bold]",
title="🎯 ACT 1: THE CHALLENGE",
border_style="cyan",
box=box.DOUBLE
))
print("""
console.print("""
How YOUR Transformer Sees Text: What It Learns:
Input: "To be or not to be" Layer 1 (Attention):
@@ -282,17 +298,17 @@ def visualize_transformer():
def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.001):
"""Train TinyGPT using YOUR complete training system with DataLoader!"""
print("\n🚀 Training Shakespeare TinyGPT with YOUR TinyTorch!")
print(f" Dataset: {len(train_loader.dataset):,} character sequences")
print(f" Batch size: {train_loader.batch_size}")
print(f" YOUR DataLoader (Module 08) handles batching!")
print(f" YOUR Adam optimizer (Module 08)")
console.print("\n[bold]🚀 Training Shakespeare TinyGPT with YOUR TinyTorch![/bold]")
console.print(f" Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences")
console.print(f" Batch size: [cyan]{train_loader.batch_size}[/cyan]")
console.print(f" YOUR DataLoader (Module 08) handles batching!")
console.print(f" YOUR Adam optimizer (Module 08)")
# YOUR optimizer
optimizer = Adam(model.parameters(), learning_rate=learning_rate)
for epoch in range(epochs):
print(f"\n Epoch {epoch+1}/{epochs}:")
console.print(f"\n [bold]Epoch {epoch+1}/{epochs}:[/bold]")
epoch_loss = 0
batch_count = 0
@@ -333,11 +349,11 @@ def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=
# Progress
if (batch_idx + 1) % 20 == 0:
print(f" Batch {batch_idx+1}: Loss = {loss_value:.4f}")
console.print(f" Batch {batch_idx+1}: Loss = [cyan]{loss_value:.4f}[/cyan]")
# Epoch summary
avg_loss = epoch_loss / max(1, batch_count)
print(f" → Epoch Complete: Avg Loss = {avg_loss:.4f} (YOUR Transformer learning!)")
console.print(f" → Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)")
return model
@@ -348,15 +364,15 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera
This is autoregressive generation: predict next char, add it, repeat.
"""
print("\n✨ TEXT GENERATION DEMO - THE PAYOFF!")
print("="*70)
console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]")
console.print("="*70)
# Convert prompt to indices
prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx]
generated = prompt_indices.copy()
print(f"📝 Prompt: \"{prompt}\"")
print(f"🎯 Generating {max_length} characters...\n")
console.print(f"📝 Prompt: [cyan]\"{prompt}\"[/cyan]")
console.print(f"🎯 Generating [cyan]{max_length}[/cyan] characters...\n")
# Generate character by character
for _ in range(max_length):
@@ -387,45 +403,50 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera
# Decode to text
generated_text = dataset.decode(generated)
print("📖 Generated Text:")
print("" * 70)
print(generated_text)
print("" * 70)
console.print("[bold]📖 Generated Text:[/bold]")
console.print("" * 70)
console.print(f"[green]{generated_text}[/green]")
console.print("" * 70)
return generated_text
def analyze_transformer_systems(model):
"""Analyze YOUR Transformer from an ML systems perspective."""
print("\n🔬 SYSTEMS ANALYSIS of YOUR Transformer Implementation:")
print(f"\n Model Architecture:")
print(f" • Parameters: {model.total_params:,} weights")
print(f" • Embedding dim: {model.embed_dim}")
print(f" • Vocabulary: {model.vocab_size} characters")
print(f"\n Computational Complexity:")
print(f" • Attention: O(n²·d) where n=sequence, d=dimension")
print(f" • Self-attention allows parallel processing (vs RNN sequential)")
print(f" • YOUR implementation: Pure Python + NumPy")
print(f"\n Memory Requirements:")
print(f" • Parameters: {model.total_params * 4 / 1024:.1f} KB")
print(f" • Attention matrices: O(n²) per layer")
print(f" • YOUR TinyTorch tracks gradients automatically")
print(f"\n 🏛️ Transformer Evolution:")
print(f" • 2017: Vaswani et al. 'Attention Is All You Need'")
print(f" • 2018: BERT (bidirectional), GPT (autoregressive)")
print(f" • 2020: GPT-3 (175B params, same architecture!)")
print(f" • 2022: ChatGPT (YOUR architecture at massive scale)")
print(f" • YOUR TinyGPT: Core principles that power them all!")
print(f"\n 💡 Why Transformers Dominate:")
print(f" • Parallelizable (vs sequential RNNs)")
print(f" • Long-range dependencies (attention sees everything)")
print(f" • Scalable (architecture works from 1M to 175B params)")
print(f" • YOUR implementation demonstrates all of these!")
console.print("")
console.print(Panel.fit(
f"[bold]Model Architecture:[/bold]\n"
f" • Parameters: [cyan]{model.total_params:,}[/cyan] weights\n"
f" • Embedding dim: [cyan]{model.embed_dim}[/cyan]\n"
f" • Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n"
"[bold]Computational Complexity:[/bold]\n"
" • Attention: O(n²·d) where n=sequence, d=dimension\n"
" • Self-attention allows parallel processing (vs RNN sequential)\n"
" • YOUR implementation: Pure Python + NumPy\n\n"
f"[bold]Memory Requirements:[/bold]\n"
f" • Parameters: [cyan]{model.total_params * 4 / 1024:.1f} KB[/cyan]\n"
" • Attention matrices: O(n²) per layer\n"
" • YOUR TinyTorch tracks gradients automatically\n\n"
"[bold]🏛️ Transformer Evolution:[/bold]\n"
" • 2017: Vaswani et al. 'Attention Is All You Need'\n"
" • 2018: BERT (bidirectional), GPT (autoregressive)\n"
" • 2020: GPT-3 (175B params, same architecture!)\n"
" • 2022: ChatGPT (YOUR architecture at massive scale)\n"
" • YOUR TinyGPT: Core principles that power them all!\n\n"
"[bold]💡 Why Transformers Dominate:[/bold]\n"
" • Parallelizable (vs sequential RNNs)\n"
" • Long-range dependencies (attention sees everything)\n"
" • Scalable (architecture works from 1M to 175B params)\n"
" • YOUR implementation demonstrates all of these!",
title="🔬 SYSTEMS ANALYSIS",
border_style="cyan",
box=box.DOUBLE
))
def main():
@@ -452,17 +473,23 @@ def main():
help='Use small subset for testing')
args = parser.parse_args()
print("🎯 Shakespeare Transformer - Text Generation with YOUR Attention!")
print(" Historical significance: Attention revolutionized sequence modeling")
print(" YOUR achievement: Generate Shakespeare-style text")
print(" Components used: YOUR complete transformer system (Modules 2-13)")
console.print("")
console.print(Panel.fit(
"[bold cyan]Shakespeare Transformer - Text Generation with YOUR Attention![/bold cyan]\n\n"
"[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n"
"[green]YOUR achievement:[/green] Generate Shakespeare-style text\n"
"[cyan]Components used:[/cyan] YOUR complete transformer system (Modules 2-13)",
title="🎯 Milestone 05: Transformer Era (2017)",
border_style="cyan",
box=box.DOUBLE
))
# Visualization
if args.visualize:
visualize_transformer()
# Step 1: Load Shakespeare dataset
print("\n📥 Loading Shakespeare corpus...")
console.print("\n[bold]📥 Loading Shakespeare corpus...[/bold]")
data_manager = DatasetManager()
try:
@@ -470,23 +497,23 @@ def main():
if args.quick_test:
text = text[:10000] # Use small subset for testing
print(" (Using subset for quick testing)")
console.print(" [dim](Using subset for quick testing)[/dim]")
except Exception as e:
print(f"⚠️ Shakespeare download failed: {e}")
print(" Using synthetic text for demonstration...")
console.print(f"[yellow]⚠️ Shakespeare download failed: {e}[/yellow]")
console.print(" [dim]Using synthetic text for demonstration...[/dim]")
text = "To be or not to be, that is the question. " * 100
# Step 2: Create Dataset and DataLoader using YOUR Module 08!
print(f"\n📦 Creating YOUR Dataset and DataLoader (Module 08)...")
console.print(f"\n[bold]📦 Creating YOUR Dataset and DataLoader (Module 08)...[/bold]")
dataset = ShakespeareDataset(text, seq_length=args.seq_length)
# YOUR DataLoader handles batching and shuffling!
train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
print(f" Vocabulary: {dataset.vocab_size} unique characters")
print(f" Characters: '{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'")
print(f" DataLoader: {len(dataset):,} sequences, batch_size={args.batch_size}")
console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters")
console.print(f" Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]")
console.print(f" DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]")
# Step 3: Build Transformer
model = TinyGPT(
@@ -497,13 +524,19 @@ def main():
num_layers=args.num_layers
)
# Display model info
console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]")
console.print(f" Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings")
console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters")
console.print(f" Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)")
if args.test_only:
print("\n🧪 ARCHITECTURE TEST MODE")
console.print("\n[bold yellow]🧪 ARCHITECTURE TEST MODE[/bold yellow]")
# Test with minimal data
test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32))
test_output = model.forward(test_input)
print(f"✅ Forward pass successful! Output shape: {test_output.data.shape}")
print("✅ YOUR Transformer + DataLoader work together!")
console.print(f"[green]✅ Forward pass successful! Output shape: {test_output.data.shape}[/green]")
console.print(f"[green]✅ YOUR Transformer + DataLoader work together![/green]")
return
# Step 4: Train using YOUR DataLoader
@@ -515,33 +548,41 @@ def main():
generated = generate_text(model, dataset, prompt="To be or not", max_length=200)
# Additional generation examples
print("\n🎭 More Generation Examples:")
print("" * 70)
console.print("\n[bold]🎭 More Generation Examples:[/bold]")
console.print("" * 70)
prompts = ["ROMEO:", "The king", "What is"]
for prompt in prompts:
if all(ch in dataset.char_to_idx for ch in prompt):
print(f"\nPrompt: \"{prompt}\"")
console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]")
gen = generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8)
# Step 6: Systems Analysis
analyze_transformer_systems(model)
print(f"\n⏱️ Training time: {train_time:.1f} seconds")
print(f" Sequences/sec: {len(dataset) * args.epochs / train_time:.0f}")
console.print(f"\n[bold]⏱️ Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds")
console.print(f" Sequences/sec: [cyan]{len(dataset) * args.epochs / train_time:.0f}[/cyan]")
print("\n✅ SUCCESS! Shakespeare Transformer Milestone Complete!")
print("\n🎓 What YOU Accomplished:")
print(" • YOUR attention mechanism processes sequences in parallel")
print(" • YOUR transformer captures long-range text dependencies")
print(" • YOUR DataLoader efficiently batches character sequences")
print(" • YOUR TinyGPT generates coherent text!")
print(" • YOUR complete language modeling system works!")
print("\n🚀 Next Steps:")
print(" • Continue to Module 14 (KV-Caching) for 3x faster inference")
print(" • YOUR transformer architecture scales to GPT-scale models")
print(" • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!")
console.print("")
console.print(Panel.fit(
"[bold green]✅ SUCCESS! Shakespeare Transformer Milestone Complete![/bold green]\n\n"
"[bold]🎓 What YOU Accomplished:[/bold]\n"
" • YOUR attention mechanism processes sequences in parallel\n"
" • YOUR transformer captures long-range text dependencies\n"
" • YOUR DataLoader efficiently batches character sequences\n"
" • YOUR TinyGPT generates coherent text!\n"
" • YOUR complete language modeling system works!\n\n"
"[bold]🚀 Next Steps:[/bold]\n"
" • Continue to Module 14 (KV-Caching) for 3x faster inference\n"
" • YOUR transformer architecture scales to GPT-scale models\n"
" • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!",
title="🌟 2017 Transformer Revolution Complete",
border_style="green",
box=box.DOUBLE
))
if __name__ == "__main__":
main()