diff --git a/milestones/05_2017_transformer/vaswani_shakespeare.py b/milestones/05_2017_transformer/vaswani_shakespeare.py index 2903c0e6..199e09bb 100644 --- a/milestones/05_2017_transformer/vaswani_shakespeare.py +++ b/milestones/05_2017_transformer/vaswani_shakespeare.py @@ -80,11 +80,17 @@ import os import numpy as np import argparse import time +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from rich import box # Add project root to path project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(project_root) +console = Console() + # Import TinyTorch components YOU BUILT! from tinytorch.core.tensor import Tensor # Module 02: YOU built this! from tinytorch.core.layers import Linear # Module 04: YOU built this! @@ -157,8 +163,6 @@ class TinyGPT: """ def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers): - print("🧠 Building TinyGPT with YOUR TinyTorch modules...") - # Token representation self.embedding = Embedding(vocab_size, embed_dim) # Module 11! self.pos_encoding = PositionalEncoding(max_length, embed_dim) # Module 11! @@ -176,13 +180,11 @@ class TinyGPT: self.vocab_size = vocab_size self.embed_dim = embed_dim + self.num_layers = num_layers + self.num_heads = num_heads # Calculate parameters self.total_params = self._count_parameters() - - print(f" Architecture: {num_layers} layers, {num_heads} heads, {embed_dim}-dim embeddings") - print(f" Vocabulary: {vocab_size} characters") - print(f" Total parameters: {self.total_params:,} (YOUR components!)") def _count_parameters(self): """Count total parameters in model.""" @@ -240,11 +242,25 @@ class TinyGPT: def visualize_transformer(): """Show how transformers process text sequences.""" - print("\n" + "="*70) - print("šŸ¤– VISUALIZING TRANSFORMER TEXT GENERATION:") - print("="*70) + console.print("") + console.print(Panel.fit( + "[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n" + "[yellow]The Problem:[/yellow]\n" + "RNNs process sequences one step at a time\n" + "Can't parallelize → slow training on long sequences\n" + "Struggle with long-range dependencies\n\n" + "[green]The Innovation:[/green]\n" + "Transformers: Attention mechanisms process ENTIRE sequences in parallel\n" + " • Self-attention: Every token attends to every other token\n" + " • Multi-head attention: Learn multiple attention patterns\n" + " • Positional encoding: Preserve sequence order\n\n" + "[bold]Can attention alone match RNN performance?[/bold]", + title="šŸŽÆ ACT 1: THE CHALLENGE", + border_style="cyan", + box=box.DOUBLE + )) - print(""" + console.print(""" How YOUR Transformer Sees Text: What It Learns: Input: "To be or not to be" Layer 1 (Attention): @@ -282,17 +298,17 @@ def visualize_transformer(): def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.001): """Train TinyGPT using YOUR complete training system with DataLoader!""" - print("\nšŸš€ Training Shakespeare TinyGPT with YOUR TinyTorch!") - print(f" Dataset: {len(train_loader.dataset):,} character sequences") - print(f" Batch size: {train_loader.batch_size}") - print(f" YOUR DataLoader (Module 08) handles batching!") - print(f" YOUR Adam optimizer (Module 08)") + console.print("\n[bold]šŸš€ Training Shakespeare TinyGPT with YOUR TinyTorch![/bold]") + console.print(f" Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences") + console.print(f" Batch size: [cyan]{train_loader.batch_size}[/cyan]") + console.print(f" YOUR DataLoader (Module 08) handles batching!") + console.print(f" YOUR Adam optimizer (Module 08)") # YOUR optimizer optimizer = Adam(model.parameters(), learning_rate=learning_rate) for epoch in range(epochs): - print(f"\n Epoch {epoch+1}/{epochs}:") + console.print(f"\n [bold]Epoch {epoch+1}/{epochs}:[/bold]") epoch_loss = 0 batch_count = 0 @@ -333,11 +349,11 @@ def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate= # Progress if (batch_idx + 1) % 20 == 0: - print(f" Batch {batch_idx+1}: Loss = {loss_value:.4f}") + console.print(f" Batch {batch_idx+1}: Loss = [cyan]{loss_value:.4f}[/cyan]") # Epoch summary avg_loss = epoch_loss / max(1, batch_count) - print(f" → Epoch Complete: Avg Loss = {avg_loss:.4f} (YOUR Transformer learning!)") + console.print(f" → Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)") return model @@ -348,15 +364,15 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera This is autoregressive generation: predict next char, add it, repeat. """ - print("\n✨ TEXT GENERATION DEMO - THE PAYOFF!") - print("="*70) + console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]") + console.print("="*70) # Convert prompt to indices prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx] generated = prompt_indices.copy() - print(f"šŸ“ Prompt: \"{prompt}\"") - print(f"šŸŽÆ Generating {max_length} characters...\n") + console.print(f"šŸ“ Prompt: [cyan]\"{prompt}\"[/cyan]") + console.print(f"šŸŽÆ Generating [cyan]{max_length}[/cyan] characters...\n") # Generate character by character for _ in range(max_length): @@ -387,45 +403,50 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera # Decode to text generated_text = dataset.decode(generated) - print("šŸ“– Generated Text:") - print("─" * 70) - print(generated_text) - print("─" * 70) + console.print("[bold]šŸ“– Generated Text:[/bold]") + console.print("─" * 70) + console.print(f"[green]{generated_text}[/green]") + console.print("─" * 70) return generated_text def analyze_transformer_systems(model): """Analyze YOUR Transformer from an ML systems perspective.""" - print("\nšŸ”¬ SYSTEMS ANALYSIS of YOUR Transformer Implementation:") - - print(f"\n Model Architecture:") - print(f" • Parameters: {model.total_params:,} weights") - print(f" • Embedding dim: {model.embed_dim}") - print(f" • Vocabulary: {model.vocab_size} characters") - - print(f"\n Computational Complexity:") - print(f" • Attention: O(n²·d) where n=sequence, d=dimension") - print(f" • Self-attention allows parallel processing (vs RNN sequential)") - print(f" • YOUR implementation: Pure Python + NumPy") - - print(f"\n Memory Requirements:") - print(f" • Parameters: {model.total_params * 4 / 1024:.1f} KB") - print(f" • Attention matrices: O(n²) per layer") - print(f" • YOUR TinyTorch tracks gradients automatically") - - print(f"\n šŸ›ļø Transformer Evolution:") - print(f" • 2017: Vaswani et al. 'Attention Is All You Need'") - print(f" • 2018: BERT (bidirectional), GPT (autoregressive)") - print(f" • 2020: GPT-3 (175B params, same architecture!)") - print(f" • 2022: ChatGPT (YOUR architecture at massive scale)") - print(f" • YOUR TinyGPT: Core principles that power them all!") - - print(f"\n šŸ’” Why Transformers Dominate:") - print(f" • Parallelizable (vs sequential RNNs)") - print(f" • Long-range dependencies (attention sees everything)") - print(f" • Scalable (architecture works from 1M to 175B params)") - print(f" • YOUR implementation demonstrates all of these!") + console.print("") + console.print(Panel.fit( + f"[bold]Model Architecture:[/bold]\n" + f" • Parameters: [cyan]{model.total_params:,}[/cyan] weights\n" + f" • Embedding dim: [cyan]{model.embed_dim}[/cyan]\n" + f" • Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n" + + "[bold]Computational Complexity:[/bold]\n" + " • Attention: O(n²·d) where n=sequence, d=dimension\n" + " • Self-attention allows parallel processing (vs RNN sequential)\n" + " • YOUR implementation: Pure Python + NumPy\n\n" + + f"[bold]Memory Requirements:[/bold]\n" + f" • Parameters: [cyan]{model.total_params * 4 / 1024:.1f} KB[/cyan]\n" + " • Attention matrices: O(n²) per layer\n" + " • YOUR TinyTorch tracks gradients automatically\n\n" + + "[bold]šŸ›ļø Transformer Evolution:[/bold]\n" + " • 2017: Vaswani et al. 'Attention Is All You Need'\n" + " • 2018: BERT (bidirectional), GPT (autoregressive)\n" + " • 2020: GPT-3 (175B params, same architecture!)\n" + " • 2022: ChatGPT (YOUR architecture at massive scale)\n" + " • YOUR TinyGPT: Core principles that power them all!\n\n" + + "[bold]šŸ’” Why Transformers Dominate:[/bold]\n" + " • Parallelizable (vs sequential RNNs)\n" + " • Long-range dependencies (attention sees everything)\n" + " • Scalable (architecture works from 1M to 175B params)\n" + " • YOUR implementation demonstrates all of these!", + + title="šŸ”¬ SYSTEMS ANALYSIS", + border_style="cyan", + box=box.DOUBLE + )) def main(): @@ -452,17 +473,23 @@ def main(): help='Use small subset for testing') args = parser.parse_args() - print("šŸŽÆ Shakespeare Transformer - Text Generation with YOUR Attention!") - print(" Historical significance: Attention revolutionized sequence modeling") - print(" YOUR achievement: Generate Shakespeare-style text") - print(" Components used: YOUR complete transformer system (Modules 2-13)") + console.print("") + console.print(Panel.fit( + "[bold cyan]Shakespeare Transformer - Text Generation with YOUR Attention![/bold cyan]\n\n" + "[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n" + "[green]YOUR achievement:[/green] Generate Shakespeare-style text\n" + "[cyan]Components used:[/cyan] YOUR complete transformer system (Modules 2-13)", + title="šŸŽÆ Milestone 05: Transformer Era (2017)", + border_style="cyan", + box=box.DOUBLE + )) # Visualization if args.visualize: visualize_transformer() # Step 1: Load Shakespeare dataset - print("\nšŸ“„ Loading Shakespeare corpus...") + console.print("\n[bold]šŸ“„ Loading Shakespeare corpus...[/bold]") data_manager = DatasetManager() try: @@ -470,23 +497,23 @@ def main(): if args.quick_test: text = text[:10000] # Use small subset for testing - print(" (Using subset for quick testing)") + console.print(" [dim](Using subset for quick testing)[/dim]") except Exception as e: - print(f"āš ļø Shakespeare download failed: {e}") - print(" Using synthetic text for demonstration...") + console.print(f"[yellow]āš ļø Shakespeare download failed: {e}[/yellow]") + console.print(" [dim]Using synthetic text for demonstration...[/dim]") text = "To be or not to be, that is the question. " * 100 # Step 2: Create Dataset and DataLoader using YOUR Module 08! - print(f"\nšŸ“¦ Creating YOUR Dataset and DataLoader (Module 08)...") + console.print(f"\n[bold]šŸ“¦ Creating YOUR Dataset and DataLoader (Module 08)...[/bold]") dataset = ShakespeareDataset(text, seq_length=args.seq_length) # YOUR DataLoader handles batching and shuffling! train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True) - print(f" Vocabulary: {dataset.vocab_size} unique characters") - print(f" Characters: '{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'") - print(f" DataLoader: {len(dataset):,} sequences, batch_size={args.batch_size}") + console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters") + console.print(f" Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]") + console.print(f" DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]") # Step 3: Build Transformer model = TinyGPT( @@ -497,13 +524,19 @@ def main(): num_layers=args.num_layers ) + # Display model info + console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]") + console.print(f" Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings") + console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters") + console.print(f" Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)") + if args.test_only: - print("\n🧪 ARCHITECTURE TEST MODE") + console.print("\n[bold yellow]🧪 ARCHITECTURE TEST MODE[/bold yellow]") # Test with minimal data test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32)) test_output = model.forward(test_input) - print(f"āœ… Forward pass successful! Output shape: {test_output.data.shape}") - print("āœ… YOUR Transformer + DataLoader work together!") + console.print(f"[green]āœ… Forward pass successful! Output shape: {test_output.data.shape}[/green]") + console.print(f"[green]āœ… YOUR Transformer + DataLoader work together![/green]") return # Step 4: Train using YOUR DataLoader @@ -515,33 +548,41 @@ def main(): generated = generate_text(model, dataset, prompt="To be or not", max_length=200) # Additional generation examples - print("\nšŸŽ­ More Generation Examples:") - print("─" * 70) + console.print("\n[bold]šŸŽ­ More Generation Examples:[/bold]") + console.print("─" * 70) prompts = ["ROMEO:", "The king", "What is"] for prompt in prompts: if all(ch in dataset.char_to_idx for ch in prompt): - print(f"\nPrompt: \"{prompt}\"") + console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]") gen = generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8) # Step 6: Systems Analysis analyze_transformer_systems(model) - print(f"\nā±ļø Training time: {train_time:.1f} seconds") - print(f" Sequences/sec: {len(dataset) * args.epochs / train_time:.0f}") + console.print(f"\n[bold]ā±ļø Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds") + console.print(f" Sequences/sec: [cyan]{len(dataset) * args.epochs / train_time:.0f}[/cyan]") - print("\nāœ… SUCCESS! Shakespeare Transformer Milestone Complete!") - print("\nšŸŽ“ What YOU Accomplished:") - print(" • YOUR attention mechanism processes sequences in parallel") - print(" • YOUR transformer captures long-range text dependencies") - print(" • YOUR DataLoader efficiently batches character sequences") - print(" • YOUR TinyGPT generates coherent text!") - print(" • YOUR complete language modeling system works!") - - print("\nšŸš€ Next Steps:") - print(" • Continue to Module 14 (KV-Caching) for 3x faster inference") - print(" • YOUR transformer architecture scales to GPT-scale models") - print(" • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!") + console.print("") + console.print(Panel.fit( + "[bold green]āœ… SUCCESS! Shakespeare Transformer Milestone Complete![/bold green]\n\n" + + "[bold]šŸŽ“ What YOU Accomplished:[/bold]\n" + " • YOUR attention mechanism processes sequences in parallel\n" + " • YOUR transformer captures long-range text dependencies\n" + " • YOUR DataLoader efficiently batches character sequences\n" + " • YOUR TinyGPT generates coherent text!\n" + " • YOUR complete language modeling system works!\n\n" + + "[bold]šŸš€ Next Steps:[/bold]\n" + " • Continue to Module 14 (KV-Caching) for 3x faster inference\n" + " • YOUR transformer architecture scales to GPT-scale models\n" + " • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!", + + title="🌟 2017 Transformer Revolution Complete", + border_style="green", + box=box.DOUBLE + )) if __name__ == "__main__": main()