Mirror of https://github.com/MLSysBook/TinyTorch.git (synced 2026-04-28 21:22:32 -05:00)
🎨 Add Rich CLI formatting to transformer milestone 05
Updates to vaswani_shakespeare.py:
- Add Rich console, Panel, Table, and box imports
- Replace all print() statements with console.print() and Rich markup
- Add Panel.fit() boxes for major sections (Act 1, Systems Analysis, Success)
- Use Rich color tags: [bold], [cyan], [green], [yellow], [dim]
- Format training progress with colored loss values
- Display generated text in green
- Add architectural visualization with Rich panels

Updates to transformers_dev.py:
- Remove all try/except fallback implementations
- Keep clean imports only (no development scaffolding)
- Use proper module imports from the tinytorch package

The milestone now matches the CLI pattern from cnn_digits.py.
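For reference, the Rich pattern this commit applies looks roughly like the sketch below. Console, Panel, and box are the actual Rich APIs imported in the diff; the panel text and loss value here are illustrative placeholders, not the milestone's real output.

# Minimal sketch of the Panel/console pattern used throughout vaswani_shakespeare.py.
# The strings shown are examples only; the real panel contents live in the diff below.
from rich.console import Console
from rich.panel import Panel
from rich import box

console = Console()

# Panel.fit() sizes the box to its content; markup such as [bold] and [cyan]
# is rendered as terminal styling by console.print().
console.print(Panel.fit(
    "[bold]Training Shakespeare TinyGPT[/bold]\n"
    "Epoch loss: [cyan]1.2345[/cyan]",
    title="🎯 Example Panel",
    border_style="cyan",
    box=box.DOUBLE,
))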
@@ -80,11 +80,17 @@ import os
 import numpy as np
 import argparse
 import time
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from rich import box
 
 # Add project root to path
 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(project_root)
 
+console = Console()
+
 # Import TinyTorch components YOU BUILT!
 from tinytorch.core.tensor import Tensor  # Module 02: YOU built this!
 from tinytorch.core.layers import Linear  # Module 04: YOU built this!
@@ -157,8 +163,6 @@ class TinyGPT:
     """
 
     def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers):
-        print("🧠 Building TinyGPT with YOUR TinyTorch modules...")
-
         # Token representation
         self.embedding = Embedding(vocab_size, embed_dim)  # Module 11!
         self.pos_encoding = PositionalEncoding(max_length, embed_dim)  # Module 11!
@@ -176,13 +180,11 @@ class TinyGPT:
 
         self.vocab_size = vocab_size
         self.embed_dim = embed_dim
         self.num_layers = num_layers
         self.num_heads = num_heads
 
         # Calculate parameters
         self.total_params = self._count_parameters()
 
-        print(f" Architecture: {num_layers} layers, {num_heads} heads, {embed_dim}-dim embeddings")
-        print(f" Vocabulary: {vocab_size} characters")
-        print(f" Total parameters: {self.total_params:,} (YOUR components!)")
-
     def _count_parameters(self):
         """Count total parameters in model."""
@@ -240,11 +242,25 @@ class TinyGPT:
 
 def visualize_transformer():
     """Show how transformers process text sequences."""
-    print("\n" + "="*70)
-    print("🤖 VISUALIZING TRANSFORMER TEXT GENERATION:")
-    print("="*70)
+    console.print("")
+    console.print(Panel.fit(
+        "[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n"
+        "[yellow]The Problem:[/yellow]\n"
+        "RNNs process sequences one step at a time\n"
+        "Can't parallelize → slow training on long sequences\n"
+        "Struggle with long-range dependencies\n\n"
+        "[green]The Innovation:[/green]\n"
+        "Transformers: Attention mechanisms process ENTIRE sequences in parallel\n"
+        " • Self-attention: Every token attends to every other token\n"
+        " • Multi-head attention: Learn multiple attention patterns\n"
+        " • Positional encoding: Preserve sequence order\n\n"
+        "[bold]Can attention alone match RNN performance?[/bold]",
+        title="🎯 ACT 1: THE CHALLENGE",
+        border_style="cyan",
+        box=box.DOUBLE
+    ))
 
-    print("""
+    console.print("""
 How YOUR Transformer Sees Text:              What It Learns:
 
 Input: "To be or not to be"                  Layer 1 (Attention):
@@ -282,17 +298,17 @@ def visualize_transformer():
 
 def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.001):
     """Train TinyGPT using YOUR complete training system with DataLoader!"""
-    print("\n🚀 Training Shakespeare TinyGPT with YOUR TinyTorch!")
-    print(f" Dataset: {len(train_loader.dataset):,} character sequences")
-    print(f" Batch size: {train_loader.batch_size}")
-    print(f" YOUR DataLoader (Module 08) handles batching!")
-    print(f" YOUR Adam optimizer (Module 08)")
+    console.print("\n[bold]🚀 Training Shakespeare TinyGPT with YOUR TinyTorch![/bold]")
+    console.print(f" Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences")
+    console.print(f" Batch size: [cyan]{train_loader.batch_size}[/cyan]")
+    console.print(f" YOUR DataLoader (Module 08) handles batching!")
+    console.print(f" YOUR Adam optimizer (Module 08)")
 
     # YOUR optimizer
     optimizer = Adam(model.parameters(), learning_rate=learning_rate)
 
     for epoch in range(epochs):
-        print(f"\n Epoch {epoch+1}/{epochs}:")
+        console.print(f"\n [bold]Epoch {epoch+1}/{epochs}:[/bold]")
         epoch_loss = 0
         batch_count = 0
 
@@ -333,11 +349,11 @@ def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=
 
             # Progress
             if (batch_idx + 1) % 20 == 0:
-                print(f" Batch {batch_idx+1}: Loss = {loss_value:.4f}")
+                console.print(f" Batch {batch_idx+1}: Loss = [cyan]{loss_value:.4f}[/cyan]")
 
         # Epoch summary
         avg_loss = epoch_loss / max(1, batch_count)
-        print(f" → Epoch Complete: Avg Loss = {avg_loss:.4f} (YOUR Transformer learning!)")
+        console.print(f" → Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)")
 
     return model
 
@@ -348,15 +364,15 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera
 
     This is autoregressive generation: predict next char, add it, repeat.
     """
-    print("\n✨ TEXT GENERATION DEMO - THE PAYOFF!")
-    print("="*70)
+    console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]")
+    console.print("="*70)
 
     # Convert prompt to indices
    prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx]
     generated = prompt_indices.copy()
 
-    print(f"📝 Prompt: \"{prompt}\"")
-    print(f"🎯 Generating {max_length} characters...\n")
+    console.print(f"📝 Prompt: [cyan]\"{prompt}\"[/cyan]")
+    console.print(f"🎯 Generating [cyan]{max_length}[/cyan] characters...\n")
 
     # Generate character by character
     for _ in range(max_length):
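The docstring kept in this hunk sums up the generation strategy: predict the next character, append it, repeat. A minimal NumPy sketch of that loop with temperature sampling follows; logits_fn, char_to_idx, and idx_to_char are hypothetical stand-ins for the model's forward pass and the ShakespeareDataset vocabulary, not the milestone's actual API.

import numpy as np

def sample_next(logits, temperature=0.8):
    """Turn raw next-character logits into a sampled vocabulary index."""
    scaled = logits / max(temperature, 1e-8)       # higher temperature => flatter distribution
    scaled -= scaled.max()                         # numerical stability for exp
    probs = np.exp(scaled) / np.exp(scaled).sum()  # softmax over the vocabulary
    return int(np.random.choice(len(probs), p=probs))

def generate(logits_fn, char_to_idx, idx_to_char, prompt="To be or not", max_length=40):
    """Autoregressive loop: predict next char, append it, repeat (as in the docstring)."""
    generated = [char_to_idx[ch] for ch in prompt if ch in char_to_idx]
    for _ in range(max_length):
        logits = logits_fn(np.array(generated))    # hypothetical forward pass -> vocab logits
        generated.append(sample_next(logits))
    return "".join(idx_to_char[i] for i in generated)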
@@ -387,45 +403,50 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera
     # Decode to text
     generated_text = dataset.decode(generated)
 
-    print("📖 Generated Text:")
-    print("─" * 70)
-    print(generated_text)
-    print("─" * 70)
+    console.print("[bold]📖 Generated Text:[/bold]")
+    console.print("─" * 70)
+    console.print(f"[green]{generated_text}[/green]")
+    console.print("─" * 70)
 
     return generated_text
 
 
 def analyze_transformer_systems(model):
     """Analyze YOUR Transformer from an ML systems perspective."""
-    print("\n🔬 SYSTEMS ANALYSIS of YOUR Transformer Implementation:")
-
-    print(f"\n Model Architecture:")
-    print(f" • Parameters: {model.total_params:,} weights")
-    print(f" • Embedding dim: {model.embed_dim}")
-    print(f" • Vocabulary: {model.vocab_size} characters")
-
-    print(f"\n Computational Complexity:")
-    print(f" • Attention: O(n²·d) where n=sequence, d=dimension")
-    print(f" • Self-attention allows parallel processing (vs RNN sequential)")
-    print(f" • YOUR implementation: Pure Python + NumPy")
-
-    print(f"\n Memory Requirements:")
-    print(f" • Parameters: {model.total_params * 4 / 1024:.1f} KB")
-    print(f" • Attention matrices: O(n²) per layer")
-    print(f" • YOUR TinyTorch tracks gradients automatically")
-
-    print(f"\n 🏛️ Transformer Evolution:")
-    print(f" • 2017: Vaswani et al. 'Attention Is All You Need'")
-    print(f" • 2018: BERT (bidirectional), GPT (autoregressive)")
-    print(f" • 2020: GPT-3 (175B params, same architecture!)")
-    print(f" • 2022: ChatGPT (YOUR architecture at massive scale)")
-    print(f" • YOUR TinyGPT: Core principles that power them all!")
-
-    print(f"\n 💡 Why Transformers Dominate:")
-    print(f" • Parallelizable (vs sequential RNNs)")
-    print(f" • Long-range dependencies (attention sees everything)")
-    print(f" • Scalable (architecture works from 1M to 175B params)")
-    print(f" • YOUR implementation demonstrates all of these!")
+    console.print("")
+    console.print(Panel.fit(
+        f"[bold]Model Architecture:[/bold]\n"
+        f" • Parameters: [cyan]{model.total_params:,}[/cyan] weights\n"
+        f" • Embedding dim: [cyan]{model.embed_dim}[/cyan]\n"
+        f" • Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n"
+
+        "[bold]Computational Complexity:[/bold]\n"
+        " • Attention: O(n²·d) where n=sequence, d=dimension\n"
+        " • Self-attention allows parallel processing (vs RNN sequential)\n"
+        " • YOUR implementation: Pure Python + NumPy\n\n"
+
+        f"[bold]Memory Requirements:[/bold]\n"
+        f" • Parameters: [cyan]{model.total_params * 4 / 1024:.1f} KB[/cyan]\n"
+        " • Attention matrices: O(n²) per layer\n"
+        " • YOUR TinyTorch tracks gradients automatically\n\n"
+
+        "[bold]🏛️ Transformer Evolution:[/bold]\n"
+        " • 2017: Vaswani et al. 'Attention Is All You Need'\n"
+        " • 2018: BERT (bidirectional), GPT (autoregressive)\n"
+        " • 2020: GPT-3 (175B params, same architecture!)\n"
+        " • 2022: ChatGPT (YOUR architecture at massive scale)\n"
+        " • YOUR TinyGPT: Core principles that power them all!\n\n"
+
+        "[bold]💡 Why Transformers Dominate:[/bold]\n"
+        " • Parallelizable (vs sequential RNNs)\n"
+        " • Long-range dependencies (attention sees everything)\n"
+        " • Scalable (architecture works from 1M to 175B params)\n"
+        " • YOUR implementation demonstrates all of these!",
+
+        title="🔬 SYSTEMS ANALYSIS",
+        border_style="cyan",
+        box=box.DOUBLE
+    ))
 
 
 def main():
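The SYSTEMS ANALYSIS panel above reports parameter memory as total_params * 4 / 1024 KB (four bytes per float32 weight) and notes the O(n²) attention matrices. A back-of-envelope check of that arithmetic, with assumed numbers since the real values depend on the configured model:

# Quick sanity check of the figures the SYSTEMS ANALYSIS panel prints.
# The parameter count, sequence length, and head count below are illustrative assumptions.
total_params = 250_000          # hypothetical TinyGPT size
seq_length = 64                 # hypothetical sequence length
num_heads = 4                   # hypothetical attention heads

param_kb = total_params * 4 / 1024          # float32 = 4 bytes per weight
attn_floats = num_heads * seq_length ** 2   # one n×n attention matrix per head, per layer

print(f"Parameters: {param_kb:.1f} KB")
print(f"Attention matrices: {attn_floats:,} floats per layer")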
@@ -452,17 +473,23 @@ def main():
                         help='Use small subset for testing')
     args = parser.parse_args()
 
-    print("🎯 Shakespeare Transformer - Text Generation with YOUR Attention!")
-    print(" Historical significance: Attention revolutionized sequence modeling")
-    print(" YOUR achievement: Generate Shakespeare-style text")
-    print(" Components used: YOUR complete transformer system (Modules 2-13)")
+    console.print("")
+    console.print(Panel.fit(
+        "[bold cyan]Shakespeare Transformer - Text Generation with YOUR Attention![/bold cyan]\n\n"
+        "[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n"
+        "[green]YOUR achievement:[/green] Generate Shakespeare-style text\n"
+        "[cyan]Components used:[/cyan] YOUR complete transformer system (Modules 2-13)",
+        title="🎯 Milestone 05: Transformer Era (2017)",
+        border_style="cyan",
+        box=box.DOUBLE
+    ))
 
     # Visualization
     if args.visualize:
         visualize_transformer()
 
     # Step 1: Load Shakespeare dataset
-    print("\n📥 Loading Shakespeare corpus...")
+    console.print("\n[bold]📥 Loading Shakespeare corpus...[/bold]")
     data_manager = DatasetManager()
 
     try:
@@ -470,23 +497,23 @@ def main():
 
         if args.quick_test:
             text = text[:10000]  # Use small subset for testing
-            print(" (Using subset for quick testing)")
+            console.print(" [dim](Using subset for quick testing)[/dim]")
 
     except Exception as e:
-        print(f"⚠️ Shakespeare download failed: {e}")
-        print(" Using synthetic text for demonstration...")
+        console.print(f"[yellow]⚠️ Shakespeare download failed: {e}[/yellow]")
+        console.print(" [dim]Using synthetic text for demonstration...[/dim]")
         text = "To be or not to be, that is the question. " * 100
 
     # Step 2: Create Dataset and DataLoader using YOUR Module 08!
-    print(f"\n📦 Creating YOUR Dataset and DataLoader (Module 08)...")
+    console.print(f"\n[bold]📦 Creating YOUR Dataset and DataLoader (Module 08)...[/bold]")
     dataset = ShakespeareDataset(text, seq_length=args.seq_length)
 
     # YOUR DataLoader handles batching and shuffling!
     train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
 
-    print(f" Vocabulary: {dataset.vocab_size} unique characters")
-    print(f" Characters: '{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'")
-    print(f" DataLoader: {len(dataset):,} sequences, batch_size={args.batch_size}")
+    console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters")
+    console.print(f" Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]")
+    console.print(f" DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]")
 
     # Step 3: Build Transformer
     model = TinyGPT(
@@ -497,13 +524,19 @@ def main():
         num_layers=args.num_layers
     )
 
+    # Display model info
+    console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]")
+    console.print(f" Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings")
+    console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters")
+    console.print(f" Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)")
+
     if args.test_only:
-        print("\n🧪 ARCHITECTURE TEST MODE")
+        console.print("\n[bold yellow]🧪 ARCHITECTURE TEST MODE[/bold yellow]")
         # Test with minimal data
         test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32))
         test_output = model.forward(test_input)
-        print(f"✅ Forward pass successful! Output shape: {test_output.data.shape}")
-        print("✅ YOUR Transformer + DataLoader work together!")
+        console.print(f"[green]✅ Forward pass successful! Output shape: {test_output.data.shape}[/green]")
+        console.print(f"[green]✅ YOUR Transformer + DataLoader work together![/green]")
         return
 
     # Step 4: Train using YOUR DataLoader
@@ -515,33 +548,41 @@ def main():
     generated = generate_text(model, dataset, prompt="To be or not", max_length=200)
 
     # Additional generation examples
-    print("\n🎭 More Generation Examples:")
-    print("─" * 70)
+    console.print("\n[bold]🎭 More Generation Examples:[/bold]")
+    console.print("─" * 70)
 
     prompts = ["ROMEO:", "The king", "What is"]
     for prompt in prompts:
         if all(ch in dataset.char_to_idx for ch in prompt):
-            print(f"\nPrompt: \"{prompt}\"")
+            console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]")
             gen = generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8)
 
     # Step 6: Systems Analysis
     analyze_transformer_systems(model)
 
-    print(f"\n⏱️ Training time: {train_time:.1f} seconds")
-    print(f" Sequences/sec: {len(dataset) * args.epochs / train_time:.0f}")
+    console.print(f"\n[bold]⏱️ Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds")
+    console.print(f" Sequences/sec: [cyan]{len(dataset) * args.epochs / train_time:.0f}[/cyan]")
 
-    print("\n✅ SUCCESS! Shakespeare Transformer Milestone Complete!")
-    print("\n🎓 What YOU Accomplished:")
-    print(" • YOUR attention mechanism processes sequences in parallel")
-    print(" • YOUR transformer captures long-range text dependencies")
-    print(" • YOUR DataLoader efficiently batches character sequences")
-    print(" • YOUR TinyGPT generates coherent text!")
-    print(" • YOUR complete language modeling system works!")
-
-    print("\n🚀 Next Steps:")
-    print(" • Continue to Module 14 (KV-Caching) for 3x faster inference")
-    print(" • YOUR transformer architecture scales to GPT-scale models")
-    print(" • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!")
+    console.print("")
+    console.print(Panel.fit(
+        "[bold green]✅ SUCCESS! Shakespeare Transformer Milestone Complete![/bold green]\n\n"
+
+        "[bold]🎓 What YOU Accomplished:[/bold]\n"
+        " • YOUR attention mechanism processes sequences in parallel\n"
+        " • YOUR transformer captures long-range text dependencies\n"
+        " • YOUR DataLoader efficiently batches character sequences\n"
+        " • YOUR TinyGPT generates coherent text!\n"
+        " • YOUR complete language modeling system works!\n\n"
+
+        "[bold]🚀 Next Steps:[/bold]\n"
+        " • Continue to Module 14 (KV-Caching) for 3x faster inference\n"
+        " • YOUR transformer architecture scales to GPT-scale models\n"
+        " • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!",
+
+        title="🌟 2017 Transformer Revolution Complete",
+        border_style="green",
+        box=box.DOUBLE
+    ))
 
 if __name__ == "__main__":
     main()