🎨 Add Rich CLI formatting to transformer milestone 05

Updates to vaswani_shakespeare.py:
- Add Rich console, Panel, Table, and box imports
- Replace all print() statements with console.print() with Rich markup
- Add beautiful Panel.fit() boxes for major sections (Act 1, Systems Analysis, Success)
- Use Rich color tags: [bold], [cyan], [green], [yellow], [dim]
- Format training progress with colored loss values
- Display generated text in green
- Add architectural visualization with Rich panels
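
For reference, a minimal sketch of the Rich pattern these bullets describe (console markup plus Panel.fit() boxes), with illustrative text and values rather than lines copied from the milestone script:

```python
from rich.console import Console
from rich.panel import Panel
from rich import box

console = Console()

# Inline markup tags replace bare print() calls.
loss_value = 1.2345
console.print(f"  Batch 20: Loss = [cyan]{loss_value:.4f}[/cyan]")

# Panel.fit() draws a bordered box sized to its content, used for the
# major sections (Act 1, Systems Analysis, Success).
console.print(Panel.fit(
    "[bold]Attention processes entire sequences in parallel[/bold]\n"
    "[dim]Self-attention, multi-head attention, positional encoding[/dim]",
    title="🎯 ACT 1: THE CHALLENGE",
    border_style="cyan",
    box=box.DOUBLE,
))
```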

Updates to transformers_dev.py:
- Remove all try/except fallback implementations
- Clean imports only (no development scaffolding)
- Use proper module imports from tinytorch package
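
For transformers_dev.py, a small before/after sketch of the import cleanup; the try/except fallback is an assumed shape of the removed scaffolding, while the direct imports match those shown in the diff below.

```python
# Assumed shape of the removed development scaffolding:
# try:
#     from tinytorch.core.tensor import Tensor
# except ImportError:
#     from tensor_dev import Tensor  # hypothetical local fallback used during development

# After this commit, only the packaged modules are imported directly:
from tinytorch.core.tensor import Tensor   # Module 02
from tinytorch.core.layers import Linear   # Module 04
```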

Milestone now matches the beautiful CLI pattern from cnn_digits.py
Author: Vijay Janapa Reddi
Date: 2025-10-27 16:50:40 -04:00
parent 4f9c352e9d
commit de826e0b9d


@@ -80,11 +80,17 @@ import os
import numpy as np
import argparse
import time
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich import box
# Add project root to path
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_root)
console = Console()
# Import TinyTorch components YOU BUILT!
from tinytorch.core.tensor import Tensor # Module 02: YOU built this!
from tinytorch.core.layers import Linear # Module 04: YOU built this!
@@ -157,8 +163,6 @@ class TinyGPT:
"""
def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers):
print("🧠 Building TinyGPT with YOUR TinyTorch modules...")
# Token representation
self.embedding = Embedding(vocab_size, embed_dim) # Module 11!
self.pos_encoding = PositionalEncoding(max_length, embed_dim) # Module 11!
@@ -176,13 +180,11 @@ class TinyGPT:
self.vocab_size = vocab_size
self.embed_dim = embed_dim
self.num_layers = num_layers
self.num_heads = num_heads
# Calculate parameters
self.total_params = self._count_parameters()
print(f" Architecture: {num_layers} layers, {num_heads} heads, {embed_dim}-dim embeddings")
print(f" Vocabulary: {vocab_size} characters")
print(f" Total parameters: {self.total_params:,} (YOUR components!)")
def _count_parameters(self):
"""Count total parameters in model."""
@@ -240,11 +242,25 @@ class TinyGPT:
def visualize_transformer():
"""Show how transformers process text sequences."""
print("\n" + "="*70)
print("🤖 VISUALIZING TRANSFORMER TEXT GENERATION:")
print("="*70)
console.print("")
console.print(Panel.fit(
"[bold]In 2017, 'Attention Is All You Need' Changed Everything[/bold]\n\n"
"[yellow]The Problem:[/yellow]\n"
"RNNs process sequences one step at a time\n"
"Can't parallelize → slow training on long sequences\n"
"Struggle with long-range dependencies\n\n"
"[green]The Innovation:[/green]\n"
"Transformers: Attention mechanisms process ENTIRE sequences in parallel\n"
" • Self-attention: Every token attends to every other token\n"
" • Multi-head attention: Learn multiple attention patterns\n"
" • Positional encoding: Preserve sequence order\n\n"
"[bold]Can attention alone match RNN performance?[/bold]",
title="🎯 ACT 1: THE CHALLENGE",
border_style="cyan",
box=box.DOUBLE
))
print("""
console.print("""
How YOUR Transformer Sees Text: What It Learns:
Input: "To be or not to be" Layer 1 (Attention):
@@ -282,17 +298,17 @@ def visualize_transformer():
def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.001):
"""Train TinyGPT using YOUR complete training system with DataLoader!"""
print("\n🚀 Training Shakespeare TinyGPT with YOUR TinyTorch!")
print(f" Dataset: {len(train_loader.dataset):,} character sequences")
print(f" Batch size: {train_loader.batch_size}")
print(f" YOUR DataLoader (Module 08) handles batching!")
print(f" YOUR Adam optimizer (Module 08)")
console.print("\n[bold]🚀 Training Shakespeare TinyGPT with YOUR TinyTorch![/bold]")
console.print(f" Dataset: [cyan]{len(train_loader.dataset):,}[/cyan] character sequences")
console.print(f" Batch size: [cyan]{train_loader.batch_size}[/cyan]")
console.print(f" YOUR DataLoader (Module 08) handles batching!")
console.print(f" YOUR Adam optimizer (Module 08)")
# YOUR optimizer
optimizer = Adam(model.parameters(), learning_rate=learning_rate)
for epoch in range(epochs):
print(f"\n Epoch {epoch+1}/{epochs}:")
console.print(f"\n [bold]Epoch {epoch+1}/{epochs}:[/bold]")
epoch_loss = 0
batch_count = 0
@@ -333,11 +349,11 @@ def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=
# Progress
if (batch_idx + 1) % 20 == 0:
print(f" Batch {batch_idx+1}: Loss = {loss_value:.4f}")
console.print(f" Batch {batch_idx+1}: Loss = [cyan]{loss_value:.4f}[/cyan]")
# Epoch summary
avg_loss = epoch_loss / max(1, batch_count)
print(f" → Epoch Complete: Avg Loss = {avg_loss:.4f} (YOUR Transformer learning!)")
console.print(f" → Epoch Complete: Avg Loss = [bold cyan]{avg_loss:.4f}[/bold cyan] (YOUR Transformer learning!)")
return model
@@ -348,15 +364,15 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera
This is autoregressive generation: predict next char, add it, repeat.
"""
print("\n✨ TEXT GENERATION DEMO - THE PAYOFF!")
print("="*70)
console.print("\n[bold]✨ TEXT GENERATION DEMO - THE PAYOFF![/bold]")
console.print("="*70)
# Convert prompt to indices
prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx]
generated = prompt_indices.copy()
print(f"📝 Prompt: \"{prompt}\"")
print(f"🎯 Generating {max_length} characters...\n")
console.print(f"📝 Prompt: [cyan]\"{prompt}\"[/cyan]")
console.print(f"🎯 Generating [cyan]{max_length}[/cyan] characters...\n")
# Generate character by character
for _ in range(max_length):
@@ -387,45 +403,50 @@ def generate_text(model, dataset, prompt="To be or not", max_length=200, tempera
# Decode to text
generated_text = dataset.decode(generated)
print("📖 Generated Text:")
print("" * 70)
print(generated_text)
print("" * 70)
console.print("[bold]📖 Generated Text:[/bold]")
console.print("" * 70)
console.print(f"[green]{generated_text}[/green]")
console.print("" * 70)
return generated_text
def analyze_transformer_systems(model):
"""Analyze YOUR Transformer from an ML systems perspective."""
print("\n🔬 SYSTEMS ANALYSIS of YOUR Transformer Implementation:")
print(f"\n Model Architecture:")
print(f" • Parameters: {model.total_params:,} weights")
print(f" • Embedding dim: {model.embed_dim}")
print(f" • Vocabulary: {model.vocab_size} characters")
print(f"\n Computational Complexity:")
print(f" • Attention: O(n²·d) where n=sequence, d=dimension")
print(f" • Self-attention allows parallel processing (vs RNN sequential)")
print(f" • YOUR implementation: Pure Python + NumPy")
print(f"\n Memory Requirements:")
print(f" • Parameters: {model.total_params * 4 / 1024:.1f} KB")
print(f" • Attention matrices: O(n²) per layer")
print(f" • YOUR TinyTorch tracks gradients automatically")
print(f"\n 🏛️ Transformer Evolution:")
print(f" • 2017: Vaswani et al. 'Attention Is All You Need'")
print(f" • 2018: BERT (bidirectional), GPT (autoregressive)")
print(f" • 2020: GPT-3 (175B params, same architecture!)")
print(f" • 2022: ChatGPT (YOUR architecture at massive scale)")
print(f" • YOUR TinyGPT: Core principles that power them all!")
print(f"\n 💡 Why Transformers Dominate:")
print(f" • Parallelizable (vs sequential RNNs)")
print(f" • Long-range dependencies (attention sees everything)")
print(f" • Scalable (architecture works from 1M to 175B params)")
print(f" • YOUR implementation demonstrates all of these!")
console.print("")
console.print(Panel.fit(
f"[bold]Model Architecture:[/bold]\n"
f" • Parameters: [cyan]{model.total_params:,}[/cyan] weights\n"
f" • Embedding dim: [cyan]{model.embed_dim}[/cyan]\n"
f" • Vocabulary: [cyan]{model.vocab_size}[/cyan] characters\n\n"
"[bold]Computational Complexity:[/bold]\n"
" • Attention: O(n²·d) where n=sequence, d=dimension\n"
" • Self-attention allows parallel processing (vs RNN sequential)\n"
" • YOUR implementation: Pure Python + NumPy\n\n"
f"[bold]Memory Requirements:[/bold]\n"
f" • Parameters: [cyan]{model.total_params * 4 / 1024:.1f} KB[/cyan]\n"
" • Attention matrices: O(n²) per layer\n"
" • YOUR TinyTorch tracks gradients automatically\n\n"
"[bold]🏛️ Transformer Evolution:[/bold]\n"
" • 2017: Vaswani et al. 'Attention Is All You Need'\n"
" • 2018: BERT (bidirectional), GPT (autoregressive)\n"
" • 2020: GPT-3 (175B params, same architecture!)\n"
" • 2022: ChatGPT (YOUR architecture at massive scale)\n"
" • YOUR TinyGPT: Core principles that power them all!\n\n"
"[bold]💡 Why Transformers Dominate:[/bold]\n"
" • Parallelizable (vs sequential RNNs)\n"
" • Long-range dependencies (attention sees everything)\n"
" • Scalable (architecture works from 1M to 175B params)\n"
" • YOUR implementation demonstrates all of these!",
title="🔬 SYSTEMS ANALYSIS",
border_style="cyan",
box=box.DOUBLE
))
def main():
@@ -452,17 +473,23 @@ def main():
help='Use small subset for testing')
args = parser.parse_args()
print("🎯 Shakespeare Transformer - Text Generation with YOUR Attention!")
print(" Historical significance: Attention revolutionized sequence modeling")
print(" YOUR achievement: Generate Shakespeare-style text")
print(" Components used: YOUR complete transformer system (Modules 2-13)")
console.print("")
console.print(Panel.fit(
"[bold cyan]Shakespeare Transformer - Text Generation with YOUR Attention![/bold cyan]\n\n"
"[yellow]Historical significance:[/yellow] Attention revolutionized sequence modeling\n"
"[green]YOUR achievement:[/green] Generate Shakespeare-style text\n"
"[cyan]Components used:[/cyan] YOUR complete transformer system (Modules 2-13)",
title="🎯 Milestone 05: Transformer Era (2017)",
border_style="cyan",
box=box.DOUBLE
))
# Visualization
if args.visualize:
visualize_transformer()
# Step 1: Load Shakespeare dataset
print("\n📥 Loading Shakespeare corpus...")
console.print("\n[bold]📥 Loading Shakespeare corpus...[/bold]")
data_manager = DatasetManager()
try:
@@ -470,23 +497,23 @@ def main():
if args.quick_test:
text = text[:10000] # Use small subset for testing
print(" (Using subset for quick testing)")
console.print(" [dim](Using subset for quick testing)[/dim]")
except Exception as e:
print(f"⚠️ Shakespeare download failed: {e}")
print(" Using synthetic text for demonstration...")
console.print(f"[yellow]⚠️ Shakespeare download failed: {e}[/yellow]")
console.print(" [dim]Using synthetic text for demonstration...[/dim]")
text = "To be or not to be, that is the question. " * 100
# Step 2: Create Dataset and DataLoader using YOUR Module 08!
print(f"\n📦 Creating YOUR Dataset and DataLoader (Module 08)...")
console.print(f"\n[bold]📦 Creating YOUR Dataset and DataLoader (Module 08)...[/bold]")
dataset = ShakespeareDataset(text, seq_length=args.seq_length)
# YOUR DataLoader handles batching and shuffling!
train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
print(f" Vocabulary: {dataset.vocab_size} unique characters")
print(f" Characters: '{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'")
print(f" DataLoader: {len(dataset):,} sequences, batch_size={args.batch_size}")
console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] unique characters")
console.print(f" Characters: [dim]'{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'[/dim]")
console.print(f" DataLoader: [cyan]{len(dataset):,}[/cyan] sequences, batch_size=[cyan]{args.batch_size}[/cyan]")
# Step 3: Build Transformer
model = TinyGPT(
@@ -497,13 +524,19 @@ def main():
num_layers=args.num_layers
)
# Display model info
console.print("\n[bold]🧠 Building TinyGPT with YOUR TinyTorch...[/bold]")
console.print(f" Architecture: [cyan]{args.num_layers}[/cyan] layers, [cyan]{args.num_heads}[/cyan] heads, [cyan]{args.embed_dim}[/cyan]-dim embeddings")
console.print(f" Vocabulary: [cyan]{dataset.vocab_size}[/cyan] characters")
console.print(f" Total parameters: [bold cyan]{model.total_params:,}[/bold cyan] (YOUR components!)")
if args.test_only:
print("\n🧪 ARCHITECTURE TEST MODE")
console.print("\n[bold yellow]🧪 ARCHITECTURE TEST MODE[/bold yellow]")
# Test with minimal data
test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32))
test_output = model.forward(test_input)
print(f"✅ Forward pass successful! Output shape: {test_output.data.shape}")
print("✅ YOUR Transformer + DataLoader work together!")
console.print(f"[green]✅ Forward pass successful! Output shape: {test_output.data.shape}[/green]")
console.print(f"[green]✅ YOUR Transformer + DataLoader work together![/green]")
return
# Step 4: Train using YOUR DataLoader
@@ -515,33 +548,41 @@ def main():
generated = generate_text(model, dataset, prompt="To be or not", max_length=200)
# Additional generation examples
print("\n🎭 More Generation Examples:")
print("" * 70)
console.print("\n[bold]🎭 More Generation Examples:[/bold]")
console.print("" * 70)
prompts = ["ROMEO:", "The king", "What is"]
for prompt in prompts:
if all(ch in dataset.char_to_idx for ch in prompt):
print(f"\nPrompt: \"{prompt}\"")
console.print(f"\n[cyan]Prompt: \"{prompt}\"[/cyan]")
gen = generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8)
# Step 6: Systems Analysis
analyze_transformer_systems(model)
print(f"\n⏱️ Training time: {train_time:.1f} seconds")
print(f" Sequences/sec: {len(dataset) * args.epochs / train_time:.0f}")
console.print(f"\n[bold]⏱️ Training time:[/bold] [cyan]{train_time:.1f}[/cyan] seconds")
console.print(f" Sequences/sec: [cyan]{len(dataset) * args.epochs / train_time:.0f}[/cyan]")
print("\n✅ SUCCESS! Shakespeare Transformer Milestone Complete!")
print("\n🎓 What YOU Accomplished:")
print(" • YOUR attention mechanism processes sequences in parallel")
print(" • YOUR transformer captures long-range text dependencies")
print(" • YOUR DataLoader efficiently batches character sequences")
print(" • YOUR TinyGPT generates coherent text!")
print(" • YOUR complete language modeling system works!")
print("\n🚀 Next Steps:")
print(" • Continue to Module 14 (KV-Caching) for 3x faster inference")
print(" • YOUR transformer architecture scales to GPT-scale models")
print(" • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!")
console.print("")
console.print(Panel.fit(
"[bold green]✅ SUCCESS! Shakespeare Transformer Milestone Complete![/bold green]\n\n"
"[bold]🎓 What YOU Accomplished:[/bold]\n"
" • YOUR attention mechanism processes sequences in parallel\n"
" • YOUR transformer captures long-range text dependencies\n"
" • YOUR DataLoader efficiently batches character sequences\n"
" • YOUR TinyGPT generates coherent text!\n"
" • YOUR complete language modeling system works!\n\n"
"[bold]🚀 Next Steps:[/bold]\n"
" • Continue to Module 14 (KV-Caching) for 3x faster inference\n"
" • YOUR transformer architecture scales to GPT-scale models\n"
" • This is the foundation of ChatGPT, GPT-4, and all modern LLMs!",
title="🌟 2017 Transformer Revolution Complete",
border_style="green",
box=box.DOUBLE
))
if __name__ == "__main__":
main()