diff --git a/milestones/05_2017_transformer/vaswani_shakespeare.py b/milestones/05_2017_transformer/vaswani_shakespeare.py index 573c5a75..6bbaf93a 100644 --- a/milestones/05_2017_transformer/vaswani_shakespeare.py +++ b/milestones/05_2017_transformer/vaswani_shakespeare.py @@ -1,127 +1,203 @@ #!/usr/bin/env python3 """ -Clean TinyGPT Example - What Students Built -========================================== +Shakespeare Text Generation (2017) - Transformer Era +=================================================== -After completing all modules 02-14, students can build complete transformer -language models. This demonstrates how attention enables contextual understanding. +πŸ“š HISTORICAL CONTEXT: +In 2017, Vaswani et al. published "Attention Is All You Need", showing that +attention mechanisms alone (no RNNs!) could achieve state-of-the-art results +on sequence tasks. This breakthrough launched the era of GPT, BERT, and modern LLMs. -MODULES EXERCISED IN THIS EXAMPLE: +🎯 WHAT YOU'RE BUILDING: +Using YOUR TinyTorch implementations, you'll build a character-level language model +that generates Shakespeare-style text - proving YOUR attention mechanism works! 
+ +βœ… REQUIRED MODULES (Run after Module 13): ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - Module 02 (Tensor) : Data structure with gradient tracking - Module 03 (Activations) : ReLU in feed-forward networks - Module 04 (Layers) : Linear layers in FFN and output projection - Module 05 (Networks) : Module base class for transformer - Module 06 (Autograd) : Backprop through attention layers - Module 08 (Optimizers) : Adam optimizer for training - Module 10 (Training) : Language modeling loss and training loop - Module 12 (Embeddings) : Token embeddings and positional encoding - Module 13 (Attention) : Multi-head self-attention mechanism - Module 14 (Transformers) : LayerNorm and complete transformer blocks + Module 02 (Tensor) : YOUR data structure with autograd + Module 03 (Activations) : YOUR ReLU in feed-forward networks + Module 04 (Layers) : YOUR Linear layers + Module 08 (Optimizers) : YOUR Adam optimizer + Module 11 (Embeddings) : YOUR token & positional embeddings + Module 12 (Attention) : YOUR multi-head self-attention + Module 13 (Transformers) : YOUR LayerNorm + TransformerBlock ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -Transformer Architecture (Bottom to Top Flow): +πŸ—οΈ ARCHITECTURE (Character-Level Language Model): + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Output Predictions β”‚ + β”‚ Character Probabilities (vocab_size) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β–² + 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Output Projection β”‚ + β”‚ Module 04: vectors β†’ vocabulary β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β–² + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Layer Norm β”‚ + β”‚ Module 13: Final normalization β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β–² + ╔══════════════════════════════════════════════════════════════════════════════╗ + β•‘ Transformer Block Γ— N (Repeat) β•‘ + β•‘ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ + β•‘ β”‚ Feed Forward Network β”‚ β•‘ + β•‘ β”‚ Module 04: Linear β†’ ReLU β†’ Linear β”‚ β•‘ + β•‘ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ + β•‘ β–² β•‘ + β•‘ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ + β•‘ β”‚ Multi-Head Self-Attention β”‚ β•‘ + β•‘ β”‚ Module 12: QueryΒ·Key^TΒ·Value across all positions β”‚ β•‘ + β•‘ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ + β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β• + β–² + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Positional Encoding β”‚ + β”‚ Module 11: Add position information β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β–² + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Character Embeddings β”‚ + β”‚ Module 11: chars β†’ embed_dim vectors β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β–² + 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Input Characters β”‚ + β”‚ "To be or not to be, that is..." β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Output Logits β”‚ - β”‚ Vocabulary Predictions (1000) β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Output Projection β”‚ - β”‚ Module 04: vectors β†’ vocabulary β”‚ - 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Layer Norm β”‚ - β”‚ Module 14: Final normalization β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - ╔══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗ - β•‘ Transformer Block Γ— 4 (Repeat) β•‘ - β•‘ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ - β•‘ β”‚ Layer Norm β”‚ β•‘ - β•‘ β”‚ Module 14: Post-FFN normalization β”‚ β•‘ - β•‘ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ - β•‘ β–² β•‘ - β•‘ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ - β•‘ β”‚ Feed Forward Network (FFN) β”‚ β•‘ - β•‘ β”‚ Module 04: Linear(128β†’512) β†’ ReLU β†’ Linear(512β†’128) β”‚ β•‘ - β•‘ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ - β•‘ β–² β•‘ - β•‘ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ - β•‘ β”‚ Layer Norm β”‚ β•‘ - β•‘ β”‚ Module 14: Post-attention normalization β”‚ β•‘ - β•‘ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ - β•‘ β–² β•‘ - β•‘ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β•‘ - β•‘ β”‚ Multi-Head Self-Attention β”‚ β•‘ - β•‘ β”‚ Module 13: 8 heads Γ— (QΒ·K^T/√d_k)Β·V β”‚ β•‘ - β•‘ β”‚ Each head: 16-dim attention on 128-dim embeddings β”‚ β•‘ - β•‘ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β•‘ - β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β• - β–² - 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Positional Encoding β”‚ - β”‚ Module 12: Add position information (sin/cos) β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Token Embeddings β”‚ - β”‚ Module 12: tokens β†’ 128-dim vectors β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β–² - 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Input Tokens β”‚ - β”‚ [token_1, token_2, ..., token_10] β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - -Key Insight: Attention allows each token to "look at" all other tokens -to understand context and meaning relationships. +πŸ“Š EXPECTED PERFORMANCE: +- Dataset: ~1MB Shakespeare corpus (40,000 lines) +- Training time: 5-10 minutes (demonstration mode) +- Vocabulary: ~65 unique characters +- Expected: Coherent (if not perfect) Shakespeare-style text +- Parameters: ~500K (small by modern standards!) """ -import numpy as np import sys import os +import numpy as np +import argparse +import time # Add project root to path project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(project_root) -from tinytorch.core.tensor import Tensor -from tinytorch.core.layers import Linear -from tinytorch.core.activations import ReLU, Softmax -from tinytorch.core.optimizers import Adam -from tinytorch.core.attention import MultiHeadAttention -from tinytorch.models.transformer import LayerNorm, TransformerBlock -from tinytorch.core.embeddings import Embedding, PositionalEncoding +# Import TinyTorch components YOU BUILT! +from tinytorch.core.tensor import Tensor # Module 02: YOU built this! 
+from tinytorch.core.layers import Linear # Module 04: YOU built this! +from tinytorch.core.activations import ReLU, Softmax # Module 03: YOU built this! +from tinytorch.core.optimizers import Adam # Module 08: YOU built this! +from tinytorch.core.attention import MultiHeadAttention # Module 12: YOU built this! +from tinytorch.models.transformer import LayerNorm, TransformerBlock # Module 13: YOU built this! +from tinytorch.text.embeddings import Embedding, PositionalEncoding # Module 11: YOU built this! +from tinytorch.data.loader import DataLoader, Dataset # Module 08: YOU built this! + +# Import dataset manager +try: + from data_manager import DatasetManager +except ImportError: + sys.path.append(os.path.join(project_root, 'milestones')) + from data_manager import DatasetManager + + +class ShakespeareDataset(Dataset): + """ + Character-level Shakespeare dataset using YOUR Dataset interface! + + Tokenizes text into characters and creates sequences for language modeling. + """ + + def __init__(self, text, seq_length=64): + """ + Initialize dataset with text and sequence length. 
+ + Args: + text: Raw Shakespeare text + seq_length: Length of input sequences + """ + # Build character vocabulary + self.chars = sorted(list(set(text))) + self.vocab_size = len(self.chars) + self.char_to_idx = {ch: i for i, ch in enumerate(self.chars)} + self.idx_to_char = {i: ch for i, ch in enumerate(self.chars)} + + # Convert text to indices + self.data = [self.char_to_idx[ch] for ch in text] + self.seq_length = seq_length + + # Calculate number of sequences + self.num_sequences = len(self.data) - seq_length + + def __getitem__(self, idx): + """Get a single training sequence - YOUR Dataset interface!""" + # Input: characters at positions [idx, idx+seq_length) + # Target: characters at positions [idx+1, idx+seq_length+1) + input_seq = self.data[idx:idx + self.seq_length] + target_seq = self.data[idx + 1:idx + self.seq_length + 1] + + return Tensor(np.array(input_seq, dtype=np.int32)), Tensor(np.array(target_seq, dtype=np.int32)) + + def __len__(self): + """Return dataset size - YOUR Dataset interface!""" + return self.num_sequences + + def decode(self, indices): + """Convert indices back to text.""" + return ''.join([self.idx_to_char[int(idx)] for idx in indices]) + class TinyGPT: + """ + Character-level Transformer Language Model using YOUR TinyTorch! + + This architecture is what powers GPT, ChatGPT, and modern LLMs. + """ + def __init__(self, vocab_size, embed_dim, max_length, num_heads, num_layers): + print("🧠 Building TinyGPT with YOUR TinyTorch modules...") + # Token representation - self.embedding = Embedding(vocab_size, embed_dim) - self.pos_encoding = PositionalEncoding(max_length, embed_dim) + self.embedding = Embedding(vocab_size, embed_dim) # Module 11! + self.pos_encoding = PositionalEncoding(max_length, embed_dim) # Module 11! 
# Transformer stack self.layers = [] hidden_dim = embed_dim * 4 # Standard 4x expansion in FFN for _ in range(num_layers): - block = TransformerBlock(embed_dim, num_heads, hidden_dim) + block = TransformerBlock(embed_dim, num_heads, hidden_dim) # Module 13! self.layers.append(block) # Output head - self.layer_norm = LayerNorm(embed_dim) - self.output_proj = Linear(embed_dim, vocab_size) - self.vocab_size = vocab_size # Store for reshaping + self.layer_norm = LayerNorm(embed_dim) # Module 13! + self.output_proj = Linear(embed_dim, vocab_size) # Module 04! + + self.vocab_size = vocab_size + self.embed_dim = embed_dim + + # Calculate parameters + self.total_params = self._count_parameters() + + print(f" Architecture: {num_layers} layers, {num_heads} heads, {embed_dim}-dim embeddings") + print(f" Vocabulary: {vocab_size} characters") + print(f" Total parameters: {self.total_params:,} (YOUR components!)") + + def _count_parameters(self): + """Count total parameters in model.""" + count = 0 + for param in self.parameters(): + count += param.data.size + return count def parameters(self): - """Get all trainable parameters from the model.""" + """Get all trainable parameters from YOUR model.""" params = [] # Embedding parameters params.extend([self.embedding.weight]) # Transformer block parameters for layer in self.layers: - # TransformerBlock has a parameters attribute (list), not a method if hasattr(layer, 'parameters'): if callable(layer.parameters): params.extend(layer.parameters()) @@ -129,92 +205,343 @@ class TinyGPT: params.extend(layer.parameters) # Output projection parameters params.extend([self.layer_norm.gamma, self.layer_norm.beta]) - params.extend([self.output_proj.weights, self.output_proj.bias]) + params.extend([self.output_proj.weight, self.output_proj.bias]) return params def forward(self, x): + """Forward pass through YOUR transformer stack.""" # Convert tokens to contextual vectors - x = self.embedding.forward(x) # tokens β†’ vectors (Module 11) - x = 
self.pos_encoding.forward(x) # add position info (Module 11) + x = self.embedding.forward(x) # Module 11: char β†’ vectors + x = self.pos_encoding.forward(x) # Module 11: add position info # Process through transformer layers for layer in self.layers: - # Each layer: Attention β†’ Norm β†’ FFN β†’ Norm (Modules 13+14) - x = layer(x) + x = layer.forward(x) # Module 13: Attention β†’ FFN # Generate predictions - x = self.layer_norm(x) # final normalization (Module 14) + x = self.layer_norm.forward(x) # Module 13: final norm - # Reshape for Linear layer: (batch, seq, embed) β†’ (batch*seq, embed) + # Reshape for Linear layer x_np = np.array(x.data.data if hasattr(x.data, 'data') else x.data) batch_size, seq_len, embed_dim = x_np.shape x_2d_np = x_np.reshape(batch_size * seq_len, embed_dim) x_2d = Tensor(x_2d_np) # Apply output projection - logits_2d = self.output_proj(x_2d) # vocab predictions (Module 04) + logits_2d = self.output_proj(x_2d) # Module 04: vocab predictions - # Reshape back: (batch*seq, vocab) β†’ (batch, seq, vocab) + # Reshape back logits_2d_np = np.array(logits_2d.data.data if hasattr(logits_2d.data, 'data') else logits_2d.data) logits_np = logits_2d_np.reshape(batch_size, seq_len, self.vocab_size) logits = Tensor(logits_np) + return logits -def main(): - # Simpler hyperparameters for validation - vocab_size = 100 # Smaller vocabulary - embed_dim = 32 # Smaller embeddings - max_length = 16 # Shorter sequences - num_heads = 4 # Fewer attention heads - num_layers = 2 # Fewer layers - - model = TinyGPT(vocab_size, embed_dim, max_length, num_heads, num_layers) - optimizer = Adam(model.parameters(), learning_rate=0.001) # Module 08 - - # Demo training data (random tokens) - batch_size, seq_length = 1, 8 # Smaller batch and sequence - input_ids = Tensor(np.random.randint(0, vocab_size, (batch_size, seq_length))) - target_ids = Tensor(np.random.randint(0, vocab_size, (batch_size, seq_length))) - - print("πŸ€– Training Transformer Language Model") - print(f" 
Architecture: Embedding β†’ Position β†’ Attention Γ— {num_layers} β†’ Output") - print(f" Parameters: {sum(p.data.size for p in model.parameters()):,} weights") - print(f" Vocabulary: {vocab_size:,} possible tokens") - print(f" Context: {max_length} token sequences") - print() - - # What students built: Complete transformer training - for step in range(5): # Fewer steps for validation - logits = model.forward(input_ids) # Forward: Full transformer stack - - # Language modeling loss (Module 10) - logits_np = np.array(logits.data.data if hasattr(logits.data, 'data') else logits.data) - targets_np = np.array(target_ids.data.data if hasattr(target_ids.data, 'data') else target_ids.data) - batch_size, seq_length = targets_np.shape - targets_one_hot = np.zeros((batch_size, seq_length, vocab_size)) - for b in range(batch_size): - for s in range(seq_length): - targets_one_hot[b, s, int(targets_np[b, s])] = 1.0 - loss_value = np.mean((logits_np - targets_one_hot) ** 2) - loss = Tensor([loss_value]) - - loss.backward() # Autodiff through transformer (Module 06) - optimizer.step() # Adam updates (Module 08) - optimizer.zero_grad() - - if step % 2 == 0: - print(f" Step {step:2d}: Loss = {loss_value:.4f}") +def visualize_transformer(): + """Show how transformers process text sequences.""" + print("\n" + "="*70) + print("πŸ€– VISUALIZING TRANSFORMER TEXT GENERATION:") + print("="*70) - print("\nβœ… Success! 
Complete transformer language model") - print("\n🎯 What You Learned by Building:") - print(" β€’ How attention creates contextual word representations") - print(" β€’ Why positional encoding is crucial for sequence understanding") - print(" β€’ How layer normalization stabilizes deep network training") - print(" β€’ Complete transformer architecture from first principles") - print("\n🏭 Production Note:") - print(" Real PyTorch uses optimized CUDA kernels for attention,") - print(" but you built and understand the core mathematics!") + print(""" + How YOUR Transformer Sees Text: What It Learns: + + Input: "To be or not to be" Layer 1 (Attention): + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β€’ Each word attends to others + β”‚ T o b e o r ... β”‚ β€’ "be" looks at "To", "or", etc. + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β€’ Captures dependencies + ↓ + Character Embeddings Layer 2-4 (Deep Attention): + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β€’ Builds complex patterns + β”‚ 128-dim vectors β”‚ β€’ Grammar, style, meaning + β”‚ for each character β”‚ β€’ Shakespeare-specific patterns + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ Output Prediction: + Position Encoding "To be or not to be, that is the" + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” ↓ + β”‚ Add positional info β”‚ Next char probabilities: + β”‚ (order matters!) β”‚ 't' β†’ 0.85 (highest!) + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ 'n' β†’ 0.03 + ↓ 'a' β†’ 0.02 + Transformer Layers Γ—4 ... 
+ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Self-Attention β”‚ Key Transformer Insight: + β”‚ Feed-Forward β”‚ Unlike RNNs, attention lets each + β”‚ Layer Norm β”‚ position look at ALL others + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ simultaneously - capturing long-range + ↓ dependencies in O(1) operations! + Character Predictions + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Probability for β”‚ + β”‚ each next character β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + """) + print("="*70) + + +def train_shakespeare_gpt(model, train_loader, dataset, epochs=5, learning_rate=0.001): + """Train TinyGPT using YOUR complete training system with DataLoader!""" + print("\nπŸš€ Training Shakespeare TinyGPT with YOUR TinyTorch!") + print(f" Dataset: {len(train_loader.dataset):,} character sequences") + print(f" Batch size: {train_loader.batch_size}") + print(f" YOUR DataLoader (Module 08) handles batching!") + print(f" YOUR Adam optimizer (Module 08)") + + # YOUR optimizer + optimizer = Adam(model.parameters(), learning_rate=learning_rate) + + for epoch in range(epochs): + print(f"\n Epoch {epoch+1}/{epochs}:") + epoch_loss = 0 + batch_count = 0 + + # Use YOUR DataLoader to iterate through batches! + for batch_idx, (batch_input, batch_target) in enumerate(train_loader): + if batch_idx >= 100: # Demo mode - limit batches + break + + # Forward pass with YOUR Transformer + logits = model.forward(batch_input) # YOUR attention mechanism! 
+ + # Language modeling loss + logits_np = np.array(logits.data.data if hasattr(logits.data, 'data') else logits.data) + targets_np = np.array(batch_target.data.data if hasattr(batch_target.data, 'data') else batch_target.data) + + batch_size, seq_length = targets_np.shape + vocab_size = logits_np.shape[-1] + + # Cross-entropy loss + targets_one_hot = np.zeros((batch_size, seq_length, vocab_size)) + for b in range(batch_size): + for s in range(seq_length): + targets_one_hot[b, s, int(targets_np[b, s])] = 1.0 + + # Softmax + cross entropy + exp_logits = np.exp(logits_np - np.max(logits_np, axis=2, keepdims=True)) + softmax = exp_logits / np.sum(exp_logits, axis=2, keepdims=True) + loss_value = -np.mean(np.sum(targets_one_hot * np.log(softmax + 1e-8), axis=2)) + loss = Tensor([loss_value]) + + # Backward pass with YOUR autograd + optimizer.zero_grad() # Module 08! + loss.backward() # Module 05: YOUR autodiff! + optimizer.step() # Module 08! + + epoch_loss += loss_value + batch_count += 1 + + # Progress + if (batch_idx + 1) % 20 == 0: + print(f" Batch {batch_idx+1}: Loss = {loss_value:.4f}") + + # Epoch summary + avg_loss = epoch_loss / max(1, batch_count) + print(f" β†’ Epoch Complete: Avg Loss = {avg_loss:.4f} (YOUR Transformer learning!)") + + return model + + +def generate_text(model, dataset, prompt="To be or not", max_length=200, temperature=0.8): + """ + Generate text from a prompt - THE WOW MOMENT! + + This is autoregressive generation: predict next char, add it, repeat. 
+ """ + print("\n✨ TEXT GENERATION DEMO - THE PAYOFF!") + print("="*70) + + # Convert prompt to indices + prompt_indices = [dataset.char_to_idx[ch] for ch in prompt if ch in dataset.char_to_idx] + generated = prompt_indices.copy() + + print(f"πŸ“ Prompt: \"{prompt}\"") + print(f"🎯 Generating {max_length} characters...\n") + + # Generate character by character + for _ in range(max_length): + # Take last seq_length characters as input + input_seq = generated[-dataset.seq_length:] if len(generated) >= dataset.seq_length else generated + + # Pad if necessary + if len(input_seq) < dataset.seq_length: + input_seq = [0] * (dataset.seq_length - len(input_seq)) + input_seq + + # Forward pass + input_tensor = Tensor(np.array([input_seq], dtype=np.int32)) + logits = model.forward(input_tensor) + + # Get logits for last position + logits_np = np.array(logits.data.data if hasattr(logits.data, 'data') else logits.data) + next_logits = logits_np[0, -1, :] # Last position predictions + + # Apply temperature and sample + next_logits = next_logits / temperature + exp_logits = np.exp(next_logits - np.max(next_logits)) + probs = exp_logits / np.sum(exp_logits) + + # Sample from distribution + next_idx = np.random.choice(len(probs), p=probs) + generated.append(next_idx) + + # Decode to text + generated_text = dataset.decode(generated) + + print("πŸ“– Generated Text:") + print("─" * 70) + print(generated_text) + print("─" * 70) + + return generated_text + + +def analyze_transformer_systems(model): + """Analyze YOUR Transformer from an ML systems perspective.""" + print("\nπŸ”¬ SYSTEMS ANALYSIS of YOUR Transformer Implementation:") + + print(f"\n Model Architecture:") + print(f" β€’ Parameters: {model.total_params:,} weights") + print(f" β€’ Embedding dim: {model.embed_dim}") + print(f" β€’ Vocabulary: {model.vocab_size} characters") + + print(f"\n Computational Complexity:") + print(f" β€’ Attention: O(nΒ²Β·d) where n=sequence, d=dimension") + print(f" β€’ Self-attention allows 
parallel processing (vs RNN sequential)") + print(f" β€’ YOUR implementation: Pure Python + NumPy") + + print(f"\n Memory Requirements:") + print(f" β€’ Parameters: {model.total_params * 4 / 1024:.1f} KB") + print(f" β€’ Attention matrices: O(nΒ²) per layer") + print(f" β€’ YOUR TinyTorch tracks gradients automatically") + + print(f"\n πŸ›οΈ Transformer Evolution:") + print(f" β€’ 2017: Vaswani et al. 'Attention Is All You Need'") + print(f" β€’ 2018: BERT (bidirectional), GPT (autoregressive)") + print(f" β€’ 2020: GPT-3 (175B params, same architecture!)") + print(f" β€’ 2022: ChatGPT (YOUR architecture at massive scale)") + print(f" β€’ YOUR TinyGPT: Core principles that power them all!") + + print(f"\n πŸ’‘ Why Transformers Dominate:") + print(f" β€’ Parallelizable (vs sequential RNNs)") + print(f" β€’ Long-range dependencies (attention sees everything)") + print(f" β€’ Scalable (architecture works from 1M to 175B params)") + print(f" β€’ YOUR implementation demonstrates all of these!") + + +def main(): + """Demonstrate Shakespeare text generation using YOUR TinyTorch!""" + + parser = argparse.ArgumentParser(description='Shakespeare Transformer 2017') + parser.add_argument('--test-only', action='store_true', + help='Test architecture only') + parser.add_argument('--epochs', type=int, default=5, + help='Training epochs (demo mode)') + parser.add_argument('--batch-size', type=int, default=32, + help='Batch size') + parser.add_argument('--seq-length', type=int, default=64, + help='Sequence length') + parser.add_argument('--embed-dim', type=int, default=128, + help='Embedding dimension') + parser.add_argument('--num-layers', type=int, default=4, + help='Number of transformer layers') + parser.add_argument('--num-heads', type=int, default=4, + help='Number of attention heads') + parser.add_argument('--visualize', action='store_true', default=True, + help='Show transformer visualization') + parser.add_argument('--quick-test', action='store_true', + help='Use small 
subset for testing') + args = parser.parse_args() + + print("🎯 Shakespeare Transformer - Text Generation with YOUR Attention!") + print(" Historical significance: Attention revolutionized sequence modeling") + print(" YOUR achievement: Generate Shakespeare-style text") + print(" Components used: YOUR complete transformer system (Modules 2-13)") + + # Visualization + if args.visualize: + visualize_transformer() + + # Step 1: Load Shakespeare dataset + print("\nπŸ“₯ Loading Shakespeare corpus...") + data_manager = DatasetManager() + + try: + text = data_manager.get_shakespeare() + + if args.quick_test: + text = text[:10000] # Use small subset for testing + print(" (Using subset for quick testing)") + + except Exception as e: + print(f"⚠️ Shakespeare download failed: {e}") + print(" Using synthetic text for demonstration...") + text = "To be or not to be, that is the question. " * 100 + + # Step 2: Create Dataset and DataLoader using YOUR Module 08! + print(f"\nπŸ“¦ Creating YOUR Dataset and DataLoader (Module 08)...") + dataset = ShakespeareDataset(text, seq_length=args.seq_length) + + # YOUR DataLoader handles batching and shuffling! + train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + + print(f" Vocabulary: {dataset.vocab_size} unique characters") + print(f" Characters: '{dataset.decode(list(range(min(20, dataset.vocab_size))))}...'") + print(f" DataLoader: {len(dataset):,} sequences, batch_size={args.batch_size}") + + # Step 3: Build Transformer + model = TinyGPT( + vocab_size=dataset.vocab_size, + embed_dim=args.embed_dim, + max_length=args.seq_length, + num_heads=args.num_heads, + num_layers=args.num_layers + ) + + if args.test_only: + print("\nπŸ§ͺ ARCHITECTURE TEST MODE") + # Test with minimal data + test_input = Tensor(np.random.randint(0, dataset.vocab_size, (1, args.seq_length), dtype=np.int32)) + test_output = model.forward(test_input) + print(f"βœ… Forward pass successful! 
Output shape: {test_output.data.shape}") + print("βœ… YOUR Transformer + DataLoader work together!") + return + + # Step 4: Train using YOUR DataLoader + start_time = time.time() + model = train_shakespeare_gpt(model, train_loader, dataset, epochs=args.epochs) + train_time = time.time() - start_time + + # Step 5: Generate text! + generated = generate_text(model, dataset, prompt="To be or not", max_length=200) + + # Additional generation examples + print("\n🎭 More Generation Examples:") + print("─" * 70) + + prompts = ["ROMEO:", "The king", "What is"] + for prompt in prompts: + if all(ch in dataset.char_to_idx for ch in prompt): + print(f"\nPrompt: \"{prompt}\"") + gen = generate_text(model, dataset, prompt=prompt, max_length=100, temperature=0.8) + + # Step 6: Systems Analysis + analyze_transformer_systems(model) + + print(f"\n⏱️ Training time: {train_time:.1f} seconds") + print(f" Sequences/sec: {len(dataset) * args.epochs / train_time:.0f}") + + print("\nβœ… SUCCESS! Shakespeare Transformer Milestone Complete!") + print("\nπŸŽ“ What YOU Accomplished:") + print(" β€’ YOUR attention mechanism processes sequences in parallel") + print(" β€’ YOUR transformer captures long-range text dependencies") + print(" β€’ YOUR DataLoader efficiently batches character sequences") + print(" β€’ YOUR TinyGPT generates coherent text!") + print(" β€’ YOUR complete language modeling system works!") + + print("\nπŸš€ Next Steps:") + print(" β€’ Continue to Module 14 (KV-Caching) for 3x faster inference") + print(" β€’ YOUR transformer architecture scales to GPT-scale models") + print(" β€’ This is the foundation of ChatGPT, GPT-4, and all modern LLMs!") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/modules/source/08_dataloader/dataloader_dev.ipynb b/modules/source/08_dataloader/dataloader_dev.ipynb index 805ed498..9de720d2 100644 --- a/modules/source/08_dataloader/dataloader_dev.ipynb +++ b/modules/source/08_dataloader/dataloader_dev.ipynb @@ 
-3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da4ec172", + "id": "68a64fae", "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "markdown", - "id": "558b9fec", + "id": "a3d0618b", "metadata": { "cell_marker": "\"\"\"" }, @@ -64,7 +64,7 @@ { "cell_type": "code", "execution_count": null, - "id": "701f0773", + "id": "88086df7", "metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,7 @@ }, { "cell_type": "markdown", - "id": "fb08bf4c", + "id": "b43901bd", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -137,7 +137,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4261c2ec", + "id": "6d6abda4", "metadata": { "nbgrader": { "grade": false, @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c72e10fc", + "id": "dc6ce67d", "metadata": { "lines_to_next_cell": 2, "nbgrader": { @@ -251,7 +251,7 @@ }, { "cell_type": "markdown", - "id": "0c18bbe8", + "id": "71c543f0", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -315,7 +315,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46d06a84", + "id": "7088cd2d", "metadata": { "nbgrader": { "grade": false, @@ -406,7 +406,7 @@ { "cell_type": "code", "execution_count": null, - "id": "457a6c4f", + "id": "002e0d79", "metadata": { "lines_to_next_cell": 2, "nbgrader": { @@ -464,7 +464,7 @@ }, { "cell_type": "markdown", - "id": "0e9c9312", + "id": "f4a52948", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -543,7 +543,7 @@ { "cell_type": "code", "execution_count": null, - "id": "251ec10a", + "id": "94032b16", "metadata": { "nbgrader": { "grade": false, @@ -657,7 +657,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0958e333", + "id": "7fcd3543", "metadata": { "lines_to_next_cell": 2, "nbgrader": { @@ -725,7 +725,7 @@ }, { "cell_type": "markdown", - "id": "76b97b7b", + "id": "ab0b6005", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 2 @@ -832,7 +832,7 @@ }, { 
"cell_type": "markdown", - "id": "f6fe6ac2", + "id": "a9a8d990", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -919,7 +919,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a3a6e403", + "id": "226b8599", "metadata": { "nbgrader": { "grade": false, @@ -1047,7 +1047,7 @@ }, { "cell_type": "markdown", - "id": "b58e6cf0", + "id": "251fd2d2", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1061,7 +1061,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eb6d475a", + "id": "57ca5aa7", "metadata": { "nbgrader": { "grade": false, @@ -1144,7 +1144,7 @@ }, { "cell_type": "markdown", - "id": "00c7bdd8", + "id": "e99790e7", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1158,7 +1158,7 @@ { "cell_type": "code", "execution_count": null, - "id": "320dfadd", + "id": "f22af370", "metadata": { "lines_to_next_cell": 1 }, @@ -1195,7 +1195,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dc99277c", + "id": "5a49ad00", "metadata": {}, "outputs": [], "source": [ @@ -1208,7 +1208,7 @@ }, { "cell_type": "markdown", - "id": "efb1f5ab", + "id": "91161fcc", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/11_embeddings/embeddings_dev.ipynb b/modules/source/11_embeddings/embeddings_dev.ipynb index ca9cf276..20d326c7 100644 --- a/modules/source/11_embeddings/embeddings_dev.ipynb +++ b/modules/source/11_embeddings/embeddings_dev.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a87209c8", + "id": "7d9a210d", "metadata": { "cell_marker": "\"\"\"" }, @@ -51,177 +51,209 @@ { "cell_type": "code", "execution_count": null, - "id": "6db98349", - "metadata": { - "lines_to_next_cell": 1, - "nbgrader": { - "grade": false, - "grade_id": "imports", - "solution": true - } - }, + "id": "9e8440ef", + "metadata": {}, "outputs": [], "source": [ - "\"\"\"\n", - "## 1. 
Essential Imports and Setup\n", - "\n", - "Setting up our embedding toolkit with tensor operations and mathematical functions.\n", - "\"\"\"\n", - "\n", - "#| default_exp text.embeddings\n", - "#| export\n", - "\n", - "import numpy as np\n", - "import math\n", - "from typing import List, Optional, Tuple\n", - "\n", - "# Core tensor operations - our foundation\n", - "### BEGIN SOLUTION\n", - "# For this educational implementation, we'll create a simple Tensor class\n", - "# In practice, this would import from tinytorch.core.tensor\n", - "\n", - "class Tensor:\n", - " \"\"\"Educational tensor for embeddings module.\"\"\"\n", - "\n", - " def __init__(self, data, requires_grad=False):\n", - " self.data = np.array(data)\n", - " self.shape = self.data.shape\n", - " self.requires_grad = requires_grad\n", - " self.grad = None\n", - "\n", - " def __repr__(self):\n", - " return f\"Tensor({self.data})\"\n", - "\n", - " def __getitem__(self, idx):\n", - " return Tensor(self.data[idx])\n", - "\n", - " def __add__(self, other):\n", - " if isinstance(other, Tensor):\n", - " return Tensor(self.data + other.data)\n", - " return Tensor(self.data + other)\n", - "\n", - " def size(self, dim=None):\n", - " if dim is None:\n", - " return self.shape\n", - " return self.shape[dim]\n", - "\n", - " def reshape(self, *shape):\n", - " return Tensor(self.data.reshape(shape))\n", - "\n", - " def expand(self, *shape):\n", - " return Tensor(np.broadcast_to(self.data, shape))\n", - "\n", - " def parameters(self):\n", - " return [self] if self.requires_grad else []\n", - "\n", - "# Simple Linear layer for this module\n", - "class Linear:\n", - " \"\"\"Educational linear layer.\"\"\"\n", - "\n", - " def __init__(self, in_features, out_features, bias=True):\n", - " # Xavier initialization\n", - " limit = math.sqrt(6.0 / (in_features + out_features))\n", - " self.weight = Tensor(\n", - " np.random.uniform(-limit, limit, (in_features, out_features)),\n", - " requires_grad=True\n", - " )\n", - " 
self.bias = Tensor(np.zeros(out_features), requires_grad=True) if bias else None\n", - "\n", - " def forward(self, x):\n", - " result = Tensor(np.dot(x.data, self.weight.data))\n", - " if self.bias is not None:\n", - " result = result + self.bias\n", - " return result\n", - "\n", - " def parameters(self):\n", - " params = [self.weight]\n", - " if self.bias is not None:\n", - " params.append(self.bias)\n", - " return params\n", - "### END SOLUTION" - ] - }, - { - "cell_type": "markdown", - "id": "432b1be2", - "metadata": { - "cell_marker": "\"\"\"" - }, - "source": [ - "## 2. Understanding Token Embeddings - From Discrete to Dense\n", - "\n", - "Before we implement embeddings, let's understand what problem they solve and how the lookup process works.\n", - "\n", - "### The Fundamental Challenge\n", - "\n", - "When dealing with text, we start with discrete symbols (words, characters, tokens) but neural networks need continuous numbers. Embeddings bridge this gap by creating a learned mapping from discrete tokens to dense vector representations.\n", - "\n", - "### Token-to-Vector Transformation Visualization\n", - "\n", - "```\n", - "Traditional One-Hot Encoding (Sparse):\n", - "Token \"cat\" (index 42) β†’ [0, 0, ..., 1, ..., 0] (50,000 elements, mostly zeros)\n", - " position 42\n", - "\n", - "Modern Embedding Lookup (Dense):\n", - "Token \"cat\" (index 42) β†’ [0.1, -0.3, 0.7, 0.2, ...] (512 dense, meaningful values)\n", - "```\n", - "\n", - "### How Embedding Lookup Works\n", - "\n", - "```\n", - "Embedding Table (vocab_size Γ— embed_dim):\n", - " Token ID Embedding Vector\n", - " β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " 0 β”‚ 0 β”‚ β†’ β”‚ [0.2, -0.1, 0.3, ...] β”‚ \"the\"\n", - " 1 β”‚ 1 β”‚ β†’ β”‚ [0.1, 0.4, -0.2, ...] β”‚ \"cat\"\n", - " 2 β”‚ 2 β”‚ β†’ β”‚ [-0.3, 0.1, 0.5, ...] β”‚ \"sat\"\n", - "... β”‚ ... β”‚ β”‚ ... β”‚ ...\n", - "42 β”‚ 42 β”‚ β†’ β”‚ [0.7, -0.2, 0.1, ...] 
β”‚ \"dog\"\n", - "... β”‚ ... β”‚ β”‚ ... β”‚ ...\n", - " β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - "\n", - "Lookup Process:\n", - "Input tokens: [1, 2, 42] β†’ Output: Matrix (3 Γ— embed_dim)\n", - "Row 0: embedding[1] β†’ [0.1, 0.4, -0.2, ...] \"cat\"\n", - "Row 1: embedding[2] β†’ [-0.3, 0.1, 0.5, ...] \"sat\"\n", - "Row 2: embedding[42] β†’ [0.7, -0.2, 0.1, ...] \"dog\"\n", - "```\n", - "\n", - "### Why Embeddings Are Powerful\n", - "\n", - "1. **Dense Representation**: Every dimension can contribute meaningful information\n", - "2. **Learnable**: Vectors adjust during training to capture semantic relationships\n", - "3. **Efficient**: O(1) lookup time regardless of vocabulary size\n", - "4. **Semantic**: Similar words learn similar vector representations\n", - "\n", - "### Memory Implications\n", - "\n", - "For a vocabulary of 50,000 tokens with 512-dimensional embeddings:\n", - "- **Storage**: 50,000 Γ— 512 Γ— 4 bytes = ~100MB (in FP32)\n", - "- **Scaling**: Memory grows linearly with vocab_size Γ— embed_dim\n", - "- **Trade-off**: Larger embeddings capture more nuance but require more memory\n", - "\n", - "This is why embedding tables often dominate memory usage in large language models!" - ] - }, - { - "cell_type": "markdown", - "id": "e5381660", - "metadata": { - "cell_marker": "\"\"\"", - "lines_to_next_cell": 1 - }, - "source": [ - "## 3. Implementing Token Embeddings\n", - "\n", - "Now let's build the core embedding layer that performs efficient token-to-vector lookups." 
+ "#| default_exp text.embeddings" ] }, { "cell_type": "code", "execution_count": null, - "id": "7be267a8", + "id": "70d81596", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import numpy as np\n", + "import math\n", + "from typing import List, Optional, Tuple\n", + "\n", + "# Import from previous modules - following dependency chain\n", + "from tinytorch.core.tensor import Tensor" + ] + }, + { + "cell_type": "markdown", + "id": "1ddbc881", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## 1. Introduction - Why Embeddings?\n", + "\n", + "Neural networks operate on dense vectors, but language consists of discrete tokens. Embeddings are the crucial bridge that converts discrete tokens into continuous, learnable vector representations that capture semantic meaning.\n", + "\n", + "### The Token-to-Vector Challenge\n", + "\n", + "Consider the tokens from our tokenizer: [1, 42, 7] - how do we turn these discrete indices into meaningful vectors that capture semantic relationships?\n", + "\n", + "```\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ EMBEDDING PIPELINE: Discrete Tokens β†’ Dense Vectors β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ Input (Token IDs): [1, 42, 7] β”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ β”œβ”€ Step 1: Lookup in embedding table β”‚\n", + "β”‚ β”‚ Each ID β†’ vector of learned features β”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ β”œβ”€ Step 2: Add positional information β”‚\n", + "β”‚ β”‚ Same word at different positions β†’ differentβ”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ β”œβ”€ Step 3: Create position-aware representations β”‚\n", + "β”‚ β”‚ Ready for attention 
mechanisms β”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ └─ Step 4: Enable semantic understanding β”‚\n", + "β”‚ Similar words β†’ similar vectors β”‚\n", + "β”‚ β”‚\n", + "β”‚ Output (Dense Vectors): [[0.1, 0.4, ...], [0.7, -0.2, ...]] β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", + "```\n", + "\n", + "### The Four-Layer Embedding System\n", + "\n", + "Modern embedding systems combine multiple components:\n", + "\n", + "**1. Token embeddings** - Learn semantic representations for each vocabulary token\n", + "**2. Positional encoding** - Add information about position in sequence\n", + "**3. Optional scaling** - Normalize embedding magnitudes (Transformer convention)\n", + "**4. Integration** - Combine everything into position-aware representations\n", + "\n", + "### Why This Matters\n", + "\n", + "The choice of embedding strategy dramatically affects:\n", + "- **Semantic understanding** - How well the model captures word meaning\n", + "- **Memory requirements** - Embedding tables can be gigabytes in size\n", + "- **Position awareness** - Whether the model understands word order\n", + "- **Extrapolation** - How well the model handles longer sequences than training" + ] + }, + { + "cell_type": "markdown", + "id": "f1e4bdc9", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## 2. 
Foundations - Embedding Strategies\n", + "\n", + "Different embedding approaches make different trade-offs between memory, semantic understanding, and computational efficiency.\n", + "\n", + "### Token Embedding Lookup Process\n", + "\n", + "**Approach**: Each token ID maps to a learned dense vector\n", + "\n", + "```\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ TOKEN EMBEDDING LOOKUP PROCESS β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ Step 1: Build Embedding Table (vocab_size Γ— embed_dim) β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Token ID β”‚ Embedding Vector (learned features) β”‚ β”‚\n", + "β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”‚\n", + "β”‚ β”‚ 0 β”‚ [0.2, -0.1, 0.3, 0.8, ...] () β”‚ β”‚\n", + "β”‚ β”‚ 1 β”‚ [0.1, 0.4, -0.2, 0.6, ...] (\"the\") β”‚ β”‚\n", + "β”‚ β”‚ 42 β”‚ [0.7, -0.2, 0.1, 0.4, ...] (\"cat\") β”‚ β”‚\n", + "β”‚ β”‚ 7 β”‚ [-0.3, 0.1, 0.5, 0.2, ...] (\"sat\") β”‚ β”‚\n", + "β”‚ β”‚ ... β”‚ ... 
β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ Step 2: Lookup Process (O(1) per token) β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Input: Token IDs [1, 42, 7] β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ ID 1 β†’ embedding[1] β†’ [0.1, 0.4, -0.2, ...] β”‚ β”‚\n", + "β”‚ β”‚ ID 42 β†’ embedding[42] β†’ [0.7, -0.2, 0.1, ...] β”‚ β”‚\n", + "β”‚ β”‚ ID 7 β†’ embedding[7] β†’ [-0.3, 0.1, 0.5, ...] β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Output: Matrix (3 Γ— embed_dim) β”‚ β”‚\n", + "β”‚ β”‚ [[0.1, 0.4, -0.2, ...], β”‚ β”‚\n", + "β”‚ β”‚ [0.7, -0.2, 0.1, ...], β”‚ β”‚\n", + "β”‚ β”‚ [-0.3, 0.1, 0.5, ...]] β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ Step 3: Training Updates Embeddings β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Gradients flow back to embedding table β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Similar words learn similar vectors: β”‚ β”‚\n", + "β”‚ β”‚ \"cat\" and \"dog\" β†’ closer in embedding space β”‚ β”‚\n", + "β”‚ β”‚ \"the\" and \"a\" β†’ closer in embedding space β”‚ β”‚\n", + "β”‚ β”‚ \"sat\" and \"run\" β†’ farther in embedding space β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + 
"β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", + "```\n", + "\n", + "**Pros**:\n", + "- Dense representation (every dimension meaningful)\n", + "- Learnable (captures semantic relationships through training)\n", + "- Efficient lookup (O(1) time complexity)\n", + "- Scales to large vocabularies\n", + "\n", + "**Cons**:\n", + "- Memory intensive (vocab_size Γ— embed_dim parameters)\n", + "- Requires training to develop semantic relationships\n", + "- Fixed vocabulary (new tokens need special handling)\n", + "\n", + "### Positional Encoding Strategies\n", + "\n", + "Since embeddings by themselves have no notion of order, we need positional information:\n", + "\n", + "```\n", + "Position-Aware Embeddings = Token Embeddings + Positional Encoding\n", + "\n", + "Learned Approach: Fixed Mathematical Approach:\n", + "Position 0 β†’ [learned] Position 0 β†’ [sin/cos pattern]\n", + "Position 1 β†’ [learned] Position 1 β†’ [sin/cos pattern]\n", + "Position 2 β†’ [learned] Position 2 β†’ [sin/cos pattern]\n", + "... ...\n", + "```\n", + "\n", + "**Learned Positional Encoding**:\n", + "- Trainable position embeddings\n", + "- Can learn task-specific patterns\n", + "- Limited to maximum training sequence length\n", + "\n", + "**Sinusoidal Positional Encoding**:\n", + "- Mathematical sine/cosine patterns\n", + "- No additional parameters\n", + "- Can extrapolate to longer sequences\n", + "\n", + "### Strategy Comparison\n", + "\n", + "```\n", + "Text: \"cat sat on mat\" β†’ Token IDs: [42, 7, 15, 99]\n", + "\n", + "Token Embeddings: [vec_42, vec_7, vec_15, vec_99] # Same vectors anywhere\n", + "Position-Aware: [vec_42+pos_0, vec_7+pos_1, vec_15+pos_2, vec_99+pos_3]\n", + " ↑ Now \"cat\" at position 0 β‰  \"cat\" at position 1\n", + "```\n", + "\n", + "The combination enables transformers to understand both meaning and order!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "b0e68de7", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## 3. Implementation - Building Embedding Systems\n", + "\n", + "Let's implement embedding systems from basic token lookup to sophisticated position-aware representations. We'll start with the core embedding layer and work up to complete systems." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19fc003f", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -316,7 +348,7 @@ { "cell_type": "code", "execution_count": null, - "id": "313ae173", + "id": "17498acf", "metadata": { "nbgrader": { "grade": true, @@ -366,89 +398,46 @@ }, { "cell_type": "markdown", - "id": "1564add7", + "id": "2aeb2910", "metadata": { "cell_marker": "\"\"\"" }, "source": [ - "## 4. Understanding Positional Encoding - Teaching Models About Order\n", + "### Learned Positional Encoding\n", "\n", - "Sequences have inherent order, but embeddings by themselves are orderless. We need to explicitly encode positional information so the model understands that \"cat chased dog\" is different from \"dog chased cat\".\n", - "\n", - "### Why Position Matters in Sequences\n", - "\n", - "Unlike images where spatial relationships are built into the 2D structure, text sequences need explicit position encoding:\n", + "Trainable position embeddings that can learn position-specific patterns. 
This approach treats each position as a learnable parameter, similar to token embeddings.\n", "\n", "```\n", - "Word Order Changes Meaning:\n", - "\"The cat chased the dog\" β‰  \"The dog chased the cat\"\n", - "\"Not good\" β‰  \"Good not\"\n", - "\"She told him\" β‰  \"Him told she\"\n", + "Learned Position Embedding Process:\n", + "\n", + "Step 1: Initialize Position Embedding Table\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ Position β”‚ Learnable Vector (trainable parameters) β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ 0 β”‚ [0.1, -0.2, 0.4, ...] ← learns \"start\" patterns β”‚\n", + "β”‚ 1 β”‚ [0.3, 0.1, -0.1, ...] ← learns \"second\" patternsβ”‚\n", + "β”‚ 2 β”‚ [-0.1, 0.5, 0.2, ...] ← learns \"third\" patterns β”‚\n", + "β”‚ ... β”‚ ... β”‚\n", + "β”‚ 511 β”‚ [0.4, -0.3, 0.1, ...] ← learns \"late\" patterns β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", + "\n", + "Step 2: Add to Token Embeddings\n", + "Input: [\"The\", \"cat\", \"sat\"] β†’ Token IDs: [1, 42, 7]\n", + "\n", + "Token embeddings: Position embeddings: Combined:\n", + "[1] β†’ [0.1, 0.4, ...] + [0.1, -0.2, ...] = [0.2, 0.2, ...]\n", + "[42] β†’ [0.7, -0.2, ...] + [0.3, 0.1, ...] = [1.0, -0.1, ...]\n", + "[7] β†’ [-0.3, 0.1, ...] + [-0.1, 0.5, ...] = [-0.4, 0.6, ...]\n", + "\n", + "Result: Position-aware embeddings that can learn task-specific patterns!\n", "```\n", "\n", - "### Two Approaches to Position Encoding\n", - "\n", - "```\n", - "1. 
Learned Positional Embeddings:\n", - " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " β”‚ Position β”‚ Learned Vector β”‚\n", - " β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", - " β”‚ 0 β”‚ [0.1, -0.2, 0.4, ...] β”‚ (trained)\n", - " β”‚ 1 β”‚ [0.3, 0.1, -0.1, ...] β”‚ (trained)\n", - " β”‚ 2 β”‚ [-0.1, 0.5, 0.2, ...] β”‚ (trained)\n", - " β”‚ ... β”‚ ... β”‚\n", - " β”‚ 511 β”‚ [0.4, -0.3, 0.1, ...] β”‚ (trained)\n", - " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " βœ“ Can learn task-specific patterns\n", - " βœ— Fixed maximum sequence length\n", - " βœ— Requires additional parameters\n", - "\n", - "2. Sinusoidal Position Encodings:\n", - " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " β”‚ Position β”‚ Mathematical Pattern β”‚\n", - " β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", - " β”‚ 0 β”‚ [0.0, 1.0, 0.0, ...] β”‚ (computed)\n", - " β”‚ 1 β”‚ [sin1, cos1, sin2, ...] β”‚ (computed)\n", - " β”‚ 2 β”‚ [sin2, cos2, sin4, ...] β”‚ (computed)\n", - " β”‚ ... β”‚ ... β”‚\n", - " β”‚ N β”‚ [sinN, cosN, sin2N,...] 
β”‚ (computed)\n", - " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " βœ“ No additional parameters\n", - " βœ“ Can extrapolate to longer sequences\n", - " βœ— Cannot adapt to specific patterns\n", - "```\n", - "\n", - "### How Positional Information Gets Added\n", - "\n", - "```\n", - "Token Embeddings + Positional Encodings = Position-Aware Representations\n", - "\n", - "Input Sequence: [\"The\", \"cat\", \"sat\"]\n", - "Token IDs: [ 1, 42, 7 ]\n", - "\n", - "Step 1: Token Embeddings\n", - "[1] β†’ [0.1, 0.4, -0.2, ...]\n", - "[42]β†’ [0.7, -0.2, 0.1, ...]\n", - "[7] β†’ [-0.3, 0.1, 0.5, ...]\n", - "\n", - "Step 2: Position Encodings\n", - "pos 0 β†’ [0.0, 1.0, 0.0, ...]\n", - "pos 1 β†’ [0.8, 0.6, 0.1, ...]\n", - "pos 2 β†’ [0.9, -0.4, 0.2, ...]\n", - "\n", - "Step 3: Addition (element-wise)\n", - "Result:\n", - "[0.1+0.0, 0.4+1.0, -0.2+0.0, ...] = [0.1, 1.4, -0.2, ...] \"The\" at position 0\n", - "[0.7+0.8, -0.2+0.6, 0.1+0.1, ...] = [1.5, 0.4, 0.2, ...] \"cat\" at position 1\n", - "[-0.3+0.9, 0.1-0.4, 0.5+0.2, ...] = [0.6, -0.3, 0.7, ...] \"sat\" at position 2\n", - "```\n", - "\n", - "This way, the same word gets different representations based on its position in the sentence!" + "**Why learned positions work**: The model can discover that certain positions have special meaning (like sentence beginnings, question words, etc.) and learn specific representations for those patterns." 
] }, { "cell_type": "markdown", - "id": "62e1f2d8", + "id": "2d67a811", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -462,7 +451,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78065712", + "id": "b82481b4", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -571,7 +560,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ff5acebc", + "id": "c9f01de4", "metadata": { "nbgrader": { "grade": true, @@ -627,72 +616,75 @@ }, { "cell_type": "markdown", - "id": "e16ad002", + "id": "809099dc", "metadata": { "cell_marker": "\"\"\"" }, "source": [ - "## 6. Understanding Sinusoidal Position Encodings\n", + "### Sinusoidal Positional Encoding\n", "\n", - "Now let's explore the elegant mathematical approach to position encoding used in the original Transformer paper. Instead of learning position patterns, we'll use trigonometric functions to create unique, continuous position signatures.\n", - "\n", - "### The Mathematical Intuition\n", - "\n", - "Sinusoidal encodings use sine and cosine functions at different frequencies to create unique position signatures:\n", + "Mathematical position encoding that creates unique signatures for each position using trigonometric functions. 
This approach requires no additional parameters and can extrapolate to sequences longer than seen during training.\n", "\n", "```\n", - "PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) # Even dimensions\n", - "PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)) # Odd dimensions\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ SINUSOIDAL POSITION ENCODING: Mathematical Position Signatures β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ MATHEMATICAL FORMULA: β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ PE(pos, 2i) = sin(pos / 10000^(2i/embed_dim)) # Even dims β”‚ β”‚\n", + "β”‚ β”‚ PE(pos, 2i+1) = cos(pos / 10000^(2i/embed_dim)) # Odd dims β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Where: β”‚ β”‚\n", + "β”‚ β”‚ pos = position in sequence (0, 1, 2, ...) β”‚ β”‚\n", + "β”‚ β”‚ i = dimension pair index (0, 1, 2, ...) 
β”‚ β”‚\n", + "β”‚ β”‚ 10000 = base frequency (creates different wavelengths) β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ FREQUENCY PATTERN ACROSS DIMENSIONS: β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Dimension: 0 1 2 3 4 5 6 7 β”‚ β”‚\n", + "β”‚ β”‚ Frequency: High High Med Med Low Low VLow VLow β”‚ β”‚\n", + "β”‚ β”‚ Function: sin cos sin cos sin cos sin cos β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ pos=0: [0.00, 1.00, 0.00, 1.00, 0.00, 1.00, 0.00, 1.00] β”‚ β”‚\n", + "β”‚ β”‚ pos=1: [0.84, 0.54, 0.01, 1.00, 0.00, 1.00, 0.00, 1.00] β”‚ β”‚\n", + "β”‚ β”‚ pos=2: [0.91,-0.42, 0.02, 1.00, 0.00, 1.00, 0.00, 1.00] β”‚ β”‚\n", + "β”‚ β”‚ pos=3: [0.14,-0.99, 0.03, 1.00, 0.00, 1.00, 0.00, 1.00] β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Each position gets a unique mathematical \"fingerprint\"! 
β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ WHY THIS WORKS: β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Wave Pattern Visualization: β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Dim 0: ∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿ (rapid oscillation) β”‚ β”‚\n", + "β”‚ β”‚ Dim 2: ∿---∿---∿---∿---∿---∿ (medium frequency) β”‚ β”‚\n", + "β”‚ β”‚ Dim 4: ∿-----∿-----∿-----∿-- (low frequency) β”‚ β”‚\n", + "β”‚ β”‚ Dim 6: ∿----------∿---------- (very slow changes) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β€’ High frequency dims change rapidly between positions β”‚ β”‚\n", + "β”‚ β”‚ β€’ Low frequency dims change slowly β”‚ β”‚\n", + "β”‚ β”‚ β€’ Combination creates unique signature for each position β”‚ β”‚\n", + "β”‚ β”‚ β€’ Similar positions have similar (but distinct) encodings β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ KEY ADVANTAGES: β”‚\n", + "β”‚ β€’ Zero parameters (no memory overhead) β”‚\n", + "β”‚ β€’ Infinite sequence length (can extrapolate) β”‚\n", + "β”‚ β€’ Smooth transitions (nearby positions are similar) β”‚\n", + "β”‚ β€’ Mathematical elegance (interpretable patterns) β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "```\n", "\n", - "### Why This Works - Frequency Visualization\n", - "\n", - 
"```\n", - "Position Encoding Pattern (embed_dim=8, showing 4 positions):\n", - "\n", - "Dimension: 0 1 2 3 4 5 6 7\n", - "Frequency: High High Med Med Low Low VLow VLow\n", - "Function: sin cos sin cos sin cos sin cos\n", - "\n", - "pos=0: [0.00, 1.00, 0.00, 1.00, 0.00, 1.00, 0.00, 1.00]\n", - "pos=1: [0.84, 0.54, 0.01, 1.00, 0.00, 1.00, 0.00, 1.00]\n", - "pos=2: [0.91, -0.42, 0.02, 1.00, 0.00, 1.00, 0.00, 1.00]\n", - "pos=3: [0.14, -0.99, 0.03, 1.00, 0.00, 1.00, 0.00, 1.00]\n", - "\n", - "Notice how:\n", - "- High frequency dimensions (0,1) change quickly between positions\n", - "- Low frequency dimensions (6,7) change slowly\n", - "- Each position gets a unique \"fingerprint\"\n", - "```\n", - "\n", - "### Visual Pattern of Sinusoidal Encodings\n", - "\n", - "```\n", - "Frequency Spectrum Across Dimensions:\n", - "High Freq ← - - - - - - - - - - - - - - - - - - - - - β†’ Low Freq\n", - "Dim: 0 1 2 3 4 5 6 7 8 9 ... 510 511\n", - "\n", - "Wave Pattern for Position Progression:\n", - "Dim 0: ∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿∿ (rapid oscillation)\n", - "Dim 2: ∿---∿---∿---∿---∿---∿ (medium frequency)\n", - "Dim 4: ∿-----∿-----∿-----∿-- (low frequency)\n", - "Dim 6: ∿----------∿---------- (very slow changes)\n", - "\n", - "This creates a unique \"barcode\" for each position!\n", - "```\n", - "\n", - "### Advantages of Sinusoidal Encodings\n", - "\n", - "1. **No Parameters**: Zero additional memory overhead\n", - "2. **Extrapolation**: Can handle sequences longer than training data\n", - "3. **Unique Signatures**: Each position gets a distinct encoding\n", - "4. **Smooth Transitions**: Similar positions have similar encodings\n", - "5. **Mathematical Elegance**: Clean, interpretable patterns" + "**Why transformers use this**: The mathematical structure allows the model to learn relative positions (how far apart tokens are) through simple vector operations, which is crucial for attention mechanisms!" 
] }, { "cell_type": "markdown", - "id": "c22aab07", + "id": "6fa0a064", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -706,7 +698,7 @@ { "cell_type": "code", "execution_count": null, - "id": "260ddaa3", + "id": "825a0299", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -781,7 +773,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2b69d044", + "id": "440952af", "metadata": { "nbgrader": { "grade": true, @@ -838,62 +830,104 @@ }, { "cell_type": "markdown", - "id": "9dc5b483", + "id": "6ffbaced", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "## 4. Integration - Bringing It Together\n", + "\n", + "Now let's build the complete embedding system that combines token and positional embeddings into a production-ready component used in modern transformers and language models.\n", + "\n", + "```\n", + "Complete Embedding Pipeline:\n", + "\n", + "1. Token Lookup β†’ 2. Position Encoding β†’ 3. Combination β†’ 4. Ready for Attention\n", + " ↓ ↓ ↓ ↓\n", + " sparse IDs position info dense vectors context-aware\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "e25bf119", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 }, "source": [ - "## 8. 
Building the Complete Embedding System\n", + "### Complete Embedding System Architecture\n", "\n", - "Now let's integrate everything into a production-ready embedding system that handles both token and positional embeddings, supports multiple encoding types, and manages the full embedding pipeline used in modern NLP models.\n", - "\n", - "### Complete Embedding Pipeline Visualization\n", + "The production embedding layer that powers modern transformers combines multiple components into an efficient, flexible pipeline.\n", "\n", "```\n", - "Complete Embedding System Architecture:\n", - "\n", - "Input: Token IDs [1, 42, 7, 99]\n", - " ↓\n", - " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " β”‚ Token Embedding β”‚ vocab_size Γ— embed_dim table\n", - " β”‚ Lookup Table β”‚\n", - " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " ↓\n", - " Token Vectors (4 Γ— embed_dim)\n", - " [0.1, 0.4, -0.2, ...] ← token 1\n", - " [0.7, -0.2, 0.1, ...] ← token 42\n", - " [-0.3, 0.1, 0.5, ...] ← token 7\n", - " [0.9, -0.1, 0.3, ...] ← token 99\n", - " ↓\n", - " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " β”‚ Positional Encoding β”‚ Choose: Learned, Sinusoidal, or None\n", - " β”‚ (Add position info) β”‚\n", - " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " ↓\n", - " Position-Aware Embeddings (4 Γ— embed_dim)\n", - " [0.1+pos0, 0.4+pos0, ...] ← token 1 at position 0\n", - " [0.7+pos1, -0.2+pos1, ...] ← token 42 at position 1\n", - " [-0.3+pos2, 0.1+pos2, ...] ← token 7 at position 2\n", - " [0.9+pos3, -0.1+pos3, ...] 
← token 99 at position 3\n", - " ↓\n", - " Optional: Scale by √embed_dim (Transformer convention)\n", - " ↓\n", - " Ready for Attention Mechanisms!\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ COMPLETE EMBEDDING SYSTEM: Token + Position β†’ Attention-Ready β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ INPUT: Token IDs [1, 42, 7, 99] β”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ β”œβ”€ STEP 1: TOKEN EMBEDDING LOOKUP β”‚\n", + "β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ β”‚ Token Embedding Table (vocab_size Γ— embed_dim) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ ID 1 β†’ [0.1, 0.4, -0.2, ...] (semantic features) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ ID 42 β†’ [0.7, -0.2, 0.1, ...] (learned meaning) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ ID 7 β†’ [-0.3, 0.1, 0.5, ...] (dense vector) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ ID 99 β†’ [0.9, -0.1, 0.3, ...] 
(context-free) β”‚ β”‚\n", + "β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ β”œβ”€ STEP 2: POSITIONAL ENCODING (Choose Strategy) β”‚\n", + "β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ β”‚ Strategy A: Learned PE β”‚ β”‚\n", + "β”‚ β”‚ β”‚ pos 0 β†’ [trainable vector] (learns patterns) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ pos 1 β†’ [trainable vector] (task-specific) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ pos 2 β†’ [trainable vector] (fixed max length) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ Strategy B: Sinusoidal PE β”‚ β”‚\n", + "β”‚ β”‚ β”‚ pos 0 β†’ [sin/cos pattern] (mathematical) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ pos 1 β†’ [sin/cos pattern] (no parameters) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ pos 2 β†’ [sin/cos pattern] (infinite length) β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ Strategy C: No PE β”‚ β”‚\n", + "β”‚ β”‚ β”‚ positions ignored (order-agnostic) β”‚ β”‚\n", + "β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ β”œβ”€ STEP 3: ELEMENT-WISE ADDITION β”‚\n", + "β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ β”‚ Token + Position = Position-Aware Representation β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ [0.1, 0.4, -0.2] + [pos0] = [0.1+p0, 0.4+p0, ...] β”‚ β”‚\n", + "β”‚ β”‚ β”‚ [0.7, -0.2, 0.1] + [pos1] = [0.7+p1, -0.2+p1, ...] β”‚ β”‚\n", + "β”‚ β”‚ β”‚ [-0.3, 0.1, 0.5] + [pos2] = [-0.3+p2, 0.1+p2, ...] 
β”‚ β”‚\n", + "β”‚ β”‚ β”‚ [0.9, -0.1, 0.3] + [pos3] = [0.9+p3, -0.1+p3, ...] β”‚ β”‚\n", + "β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ β”œβ”€ STEP 4: OPTIONAL SCALING (Transformer Convention) β”‚\n", + "β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ β”‚ Scale by √embed_dim for gradient stability β”‚ β”‚\n", + "β”‚ β”‚ β”‚ Helps balance token and position magnitudes β”‚ β”‚\n", + "β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚ β”‚\n", + "β”‚ └─ OUTPUT: Position-Aware Dense Vectors β”‚\n", + "β”‚ Ready for attention mechanisms and transformers! 
β”‚\n", + "β”‚ β”‚\n", + "β”‚ INTEGRATION FEATURES: β”‚\n", + "β”‚ β€’ Flexible position encoding (learned/sinusoidal/none) β”‚\n", + "β”‚ β€’ Efficient batch processing with variable sequence lengths β”‚\n", + "β”‚ β€’ Memory optimization (shared position encodings) β”‚\n", + "β”‚ β€’ Production patterns (matches PyTorch/HuggingFace) β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "```\n", "\n", - "### Integration Features\n", - "\n", - "- **Flexible Position Encoding**: Support learned, sinusoidal, or no positional encoding\n", - "- **Batch Processing**: Handle variable-length sequences with padding\n", - "- **Memory Efficiency**: Reuse position encodings across batches\n", - "- **Production Ready**: Matches PyTorch patterns and conventions" + "**Why this architecture works**: By separating token semantics from positional information, the model can learn meaning and order independently, then combine them optimally for the specific task." ] }, { "cell_type": "code", "execution_count": null, - "id": "c54ac003", + "id": "2f033b57", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1041,7 +1075,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c72c168", + "id": "72670535", "metadata": { "nbgrader": { "grade": true, @@ -1130,51 +1164,21 @@ }, { "cell_type": "markdown", - "id": "77e517a3", + "id": "27c9db3b", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 }, "source": [ - "## 9. Systems Analysis - Embedding Memory and Performance\n", + "## 5. Systems Analysis - Embedding Trade-offs\n", "\n", - "Understanding the systems implications of embedding layers is crucial for building scalable NLP models. 
Let's analyze memory usage, lookup performance, and trade-offs between different approaches.\n", - "\n", - "### Memory Usage Analysis\n", - "\n", - "```\n", - "Embedding Memory Scaling:\n", - "Vocabulary Size vs Memory Usage (embed_dim=512, FP32):\n", - "\n", - " 10K vocab: 10,000 Γ— 512 Γ— 4 bytes = 20 MB\n", - " 50K vocab: 50,000 Γ— 512 Γ— 4 bytes = 100 MB\n", - "100K vocab: 100,000 Γ— 512 Γ— 4 bytes = 200 MB\n", - " 1M vocab: 1,000,000 Γ— 512 Γ— 4 bytes = 2 GB\n", - "\n", - "GPT-3 Scale: 50,257 Γ— 12,288 Γ— 4 bytes β‰ˆ 2.4 GB just for embeddings!\n", - "\n", - "Memory Formula: vocab_size Γ— embed_dim Γ— 4 bytes (FP32)\n", - "```\n", - "\n", - "### Performance Characteristics\n", - "\n", - "```\n", - "Embedding Lookup Performance:\n", - "- Time Complexity: O(1) per token (hash table lookup)\n", - "- Memory Access: Random access pattern\n", - "- Bottleneck: Memory bandwidth, not computation\n", - "- Batching: Improves throughput via vectorization\n", - "\n", - "Cache Efficiency:\n", - "Repeated tokens β†’ Cache hits β†’ Faster access\n", - "Diverse vocab β†’ Cache misses β†’ Slower access\n", - "```" + "Understanding the performance implications of different embedding strategies is crucial for building efficient NLP systems that scale to production workloads." 
] }, { "cell_type": "code", "execution_count": null, - "id": "b8bf22b4", + "id": "d05c51c1", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1185,8 +1189,8 @@ }, "outputs": [], "source": [ - "def analyze_embedding_memory():\n", - " \"\"\"πŸ“Š Analyze embedding memory requirements and scaling behavior.\"\"\"\n", + "def analyze_embedding_memory_scaling():\n", + " \"\"\"πŸ“Š Compare embedding memory requirements across different model scales.\"\"\"\n", " print(\"πŸ“Š Analyzing Embedding Memory Requirements...\")\n", "\n", " # Vocabulary and embedding dimension scenarios\n", @@ -1228,13 +1232,13 @@ " print(\"β€’ Learned PE adds memory but may improve task-specific performance\")\n", " print(\"β€’ Sinusoidal PE saves memory and allows longer sequences\")\n", "\n", - "analyze_embedding_memory()" + "analyze_embedding_memory_scaling()" ] }, { "cell_type": "code", "execution_count": null, - "id": "b0592745", + "id": "1b96f57b", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1245,8 +1249,8 @@ }, "outputs": [], "source": [ - "def analyze_lookup_performance():\n", - " \"\"\"πŸ“Š Analyze embedding lookup performance characteristics.\"\"\"\n", + "def analyze_embedding_performance():\n", + " \"\"\"πŸ“Š Compare embedding lookup performance across different configurations.\"\"\"\n", " print(\"\\nπŸ“Š Analyzing Embedding Lookup Performance...\")\n", "\n", " import time\n", @@ -1295,13 +1299,13 @@ " print(\"β€’ Memory bandwidth becomes bottleneck for large embedding dimensions\")\n", " print(\"β€’ Cache locality important for repeated token patterns\")\n", "\n", - "analyze_lookup_performance()" + "analyze_embedding_performance()" ] }, { "cell_type": "code", "execution_count": null, - "id": "8df93b2c", + "id": "62edc85a", "metadata": { "nbgrader": { "grade": false, @@ -1311,8 +1315,8 @@ }, "outputs": [], "source": [ - "def analyze_positional_encoding_trade_offs():\n", - " \"\"\"πŸ“Š Compare learned vs sinusoidal positional encodings.\"\"\"\n", + "def 
analyze_positional_encoding_strategies():\n", + " \"\"\"πŸ“Š Compare different positional encoding approaches and trade-offs.\"\"\"\n", " print(\"\\nπŸ“Š Analyzing Positional Encoding Trade-offs...\")\n", "\n", " max_seq_len = 512\n", @@ -1379,26 +1383,26 @@ " print(f\" - Cannot adapt to task-specific position patterns\")\n", " print(f\" - May be suboptimal for highly position-dependent tasks\")\n", "\n", - "analyze_positional_encoding_trade_offs()" + "analyze_positional_encoding_strategies()" ] }, { "cell_type": "markdown", - "id": "44d806f3", + "id": "ffcbbfe8", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 }, "source": [ - "## 10. Module Integration Test\n", + "## 6. Module Integration Test\n", "\n", - "Final validation that our complete embedding system works correctly and integrates with the TinyTorch ecosystem." + "Let's test our complete embedding system to ensure everything works together correctly." ] }, { "cell_type": "code", "execution_count": null, - "id": "6350b42c", + "id": "9a0587e9", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1538,7 +1542,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b60f9636", + "id": "0ed96c77", "metadata": { "nbgrader": { "grade": false, @@ -1557,7 +1561,7 @@ }, { "cell_type": "markdown", - "id": "1627abd1", + "id": "a6aff95f", "metadata": { "cell_marker": "\"\"\"" }, @@ -1591,7 +1595,7 @@ }, { "cell_type": "markdown", - "id": "e1e226ca", + "id": "6b073262", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/12_attention/attention_dev.ipynb b/modules/source/12_attention/attention_dev.ipynb index d6995f70..ed437ec6 100644 --- a/modules/source/12_attention/attention_dev.ipynb +++ b/modules/source/12_attention/attention_dev.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a2138437", + "id": "d94b5da2", "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "markdown", - "id": "2e26d6f6", + "id": "9306f576", 
"metadata": { "cell_marker": "\"\"\"" }, @@ -45,7 +45,7 @@ "\n", "## πŸ“¦ Where This Code Lives in the Final Package\n", "\n", - "**Learning Side:** You work in `modules/12_attention/attention_dev.py` \n", + "**Learning Side:** You work in `modules/12_attention/attention_dev.py`\n", "**Building Side:** Code exports to `tinytorch.core.attention`\n", "\n", "```python\n", @@ -63,96 +63,24 @@ { "cell_type": "code", "execution_count": null, - "id": "15910289", - "metadata": { - "lines_to_next_cell": 1, - "nbgrader": { - "grade": false, - "grade_id": "imports", - "locked": false, - "solution": true - } - }, + "id": "2eaafa86", + "metadata": {}, "outputs": [], "source": [ + "#| export\n", "import numpy as np\n", "import math\n", "import time\n", - "import sys\n", - "import os\n", "from typing import Optional, Tuple, List\n", "\n", - "# Import dependencies from other modules\n", - "sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))\n", + "# Import dependencies from previous modules - following TinyTorch dependency chain\n", "from tinytorch.core.tensor import Tensor\n", - "\n", - "sys.path.append(os.path.join(os.path.dirname(__file__), '..', '03_layers'))\n", - "from tinytorch.core.layers import Linear\n", - "\n", - "# Note: Keeping simplified implementations for reference during development\n", - "class _SimplifiedTensor:\n", - " \"\"\"Simplified tensor for attention operations development.\"\"\"\n", - "\n", - " def __init__(self, data, requires_grad=False):\n", - " self.data = np.array(data, dtype=np.float32)\n", - " self.shape = self.data.shape\n", - " self.requires_grad = requires_grad\n", - " self.grad = None\n", - "\n", - " def __repr__(self):\n", - " return f\"Tensor(shape={self.shape}, data=\\n{self.data})\"\n", - "\n", - " def __add__(self, other):\n", - " if isinstance(other, Tensor):\n", - " return Tensor(self.data + other.data)\n", - " return Tensor(self.data + other)\n", - "\n", - " def __mul__(self, other):\n", - " if isinstance(other, 
Tensor):\n", - " return Tensor(self.data * other.data)\n", - " return Tensor(self.data * other)\n", - "\n", - " def sum(self, axis=None):\n", - " return Tensor(np.sum(self.data, axis=axis))\n", - "\n", - " def mean(self, axis=None):\n", - " return Tensor(np.mean(self.data, axis=axis))\n", - "\n", - " def matmul(self, other):\n", - " return Tensor(np.matmul(self.data, other.data))\n", - "\n", - " def softmax(self, axis=-1):\n", - " \"\"\"Apply softmax along specified axis.\"\"\"\n", - " # Subtract max for numerical stability\n", - " shifted = self.data - np.max(self.data, axis=axis, keepdims=True)\n", - " exp_values = np.exp(shifted)\n", - " return Tensor(exp_values / np.sum(exp_values, axis=axis, keepdims=True))\n", - "\n", - " # Simplified Linear layer for development\n", - " class Linear:\n", - " \"\"\"Simplified linear layer for attention projections.\"\"\"\n", - "\n", - " def __init__(self, in_features, out_features):\n", - " self.in_features = in_features\n", - " self.out_features = out_features\n", - " # Initialize weights and bias (simplified Xavier initialization)\n", - " self.weight = Tensor(np.random.randn(in_features, out_features) * np.sqrt(2.0 / in_features))\n", - " self.bias = Tensor(np.zeros(out_features))\n", - "\n", - " def forward(self, x):\n", - " \"\"\"Forward pass: y = xW + b\"\"\"\n", - " output = x.matmul(self.weight)\n", - " # Add bias (broadcast across batch and sequence dimensions)\n", - " return Tensor(output.data + self.bias.data)\n", - "\n", - " def parameters(self):\n", - " \"\"\"Return list of parameters for this layer.\"\"\"\n", - " return [self.weight, self.bias]" + "from tinytorch.core.layers import Linear" ] }, { "cell_type": "markdown", - "id": "b8ca28ff", + "id": "81ea33fc", "metadata": { "cell_marker": "\"\"\"" }, @@ -209,7 +137,7 @@ }, { "cell_type": "markdown", - "id": "a85b79df", + "id": "9330210a", "metadata": { "cell_marker": "\"\"\"" }, @@ -228,10 +156,10 @@ "\n", "Keys: \"What information is available at each 
position?\"\n", "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ K₁: [0.2, 0.7, 0.1, 0.4] β”‚ ← Key 1 (description of position 1)\n", - "β”‚ Kβ‚‚: [0.1, 0.9, 0.2, 0.1] β”‚ ← Key 2 (description of position 2)\n", - "β”‚ K₃: [0.3, 0.1, 0.8, 0.3] β”‚ ← Key 3 (description of position 3)\n", - "β”‚ Kβ‚„: [0.4, 0.2, 0.1, 0.9] β”‚ ← Key 4 (description of position 4)\n", + "β”‚ K₁: [0.2, 0.7, 0.1, 0.4] β”‚ ← Key 1 (description of position 1)\n", + "β”‚ Kβ‚‚: [0.1, 0.9, 0.2, 0.1] β”‚ ← Key 2 (description of position 2)\n", + "β”‚ K₃: [0.3, 0.1, 0.8, 0.3] β”‚ ← Key 3 (description of position 3)\n", + "β”‚ Kβ‚„: [0.4, 0.2, 0.1, 0.9] β”‚ ← Key 4 (description of position 4)\n", "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "\n", "Values: \"What actual content can I retrieve?\"\n", @@ -301,7 +229,7 @@ }, { "cell_type": "markdown", - "id": "396fac34", + "id": "394e7884", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -347,18 +275,18 @@ { "cell_type": "code", "execution_count": null, - "id": "08019321", + "id": "7eada95c", "metadata": { "lines_to_next_cell": 1, "nbgrader": { "grade": false, "grade_id": "attention-function", - "locked": false, "solution": true } }, "outputs": [], "source": [ + "#| export\n", "def scaled_dot_product_attention(Q: Tensor, K: Tensor, V: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:\n", " \"\"\"\n", " Compute scaled dot-product attention.\n", @@ -464,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63ffcc32", + "id": "9e006e03", "metadata": { "nbgrader": { "grade": true, @@ -508,12 +436,14 @@ "\n", " print(\"βœ… scaled_dot_product_attention works correctly!\")\n", "\n", - "test_unit_scaled_dot_product_attention()" + "# Run test immediately when developing this module\n", + "if __name__ == \"__main__\":\n", + " 
test_unit_scaled_dot_product_attention()" ] }, { "cell_type": "markdown", - "id": "ef857d5e", + "id": "712ce2a0", "metadata": { "cell_marker": "\"\"\"" }, @@ -534,7 +464,7 @@ }, { "cell_type": "markdown", - "id": "5d7802cf", + "id": "0ae42b8d", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -547,22 +477,46 @@ "### Understanding Multi-Head Architecture\n", "\n", "```\n", - "Single-Head vs Multi-Head Attention:\n", - "\n", - "SINGLE HEAD (Limited):\n", - "Input β†’ [Linear] β†’ Q,K,V β†’ [Attention] β†’ Output\n", - " 512Γ—512 512Γ—512 512\n", - "\n", - "MULTI-HEAD (Rich):\n", - "Input β†’ [Linear] β†’ Q₁,K₁,V₁ β†’ [Attention₁] β†’ Head₁ (64 dims)\n", - " β†’ [Linear] β†’ Qβ‚‚,Kβ‚‚,Vβ‚‚ β†’ [Attentionβ‚‚] β†’ Headβ‚‚ (64 dims)\n", - " β†’ [Linear] β†’ Q₃,K₃,V₃ β†’ [Attention₃] β†’ Head₃ (64 dims)\n", - " ...\n", - " β†’ [Linear] β†’ Qβ‚ˆ,Kβ‚ˆ,Vβ‚ˆ β†’ [Attentionβ‚ˆ] β†’ Headβ‚ˆ (64 dims)\n", - " ↓\n", - " [Concatenate]\n", - " ↓\n", - " [Linear Mix] β†’ Output (512)\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ SINGLE-HEAD vs MULTI-HEAD ATTENTION ARCHITECTURE β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ SINGLE HEAD ATTENTION (Limited Representation): β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Input (512) β†’ [Linear] β†’ Q,K,V (512) β†’ [Attention] β†’ Output (512) β”‚ β”‚\n", + "β”‚ β”‚ ↑ ↑ ↑ ↑ β”‚ β”‚\n", + "β”‚ β”‚ Single proj Full 
dimensions One head Limited focus β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ MULTI-HEAD ATTENTION (Rich Parallel Processing): β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Input (512) β”‚ β”‚\n", + "β”‚ β”‚ ↓ β”‚ β”‚\n", + "β”‚ β”‚ [Q/K/V Projections] β†’ 512 dimensions each β”‚ β”‚\n", + "β”‚ β”‚ ↓ β”‚ β”‚\n", + "β”‚ β”‚ [Split into 8 heads] β†’ 8 Γ— 64 dimensions per head β”‚ β”‚\n", + "β”‚ β”‚ ↓ β”‚ β”‚\n", + "β”‚ β”‚ Head₁: Q₁(64) βŠ— K₁(64) β†’ Attention₁ β†’ Output₁(64) β”‚ Syntax focus β”‚ β”‚\n", + "β”‚ β”‚ Headβ‚‚: Qβ‚‚(64) βŠ— Kβ‚‚(64) β†’ Attentionβ‚‚ β†’ Outputβ‚‚(64) β”‚ Semantic β”‚ β”‚\n", + "β”‚ β”‚ Head₃: Q₃(64) βŠ— K₃(64) β†’ Attention₃ β†’ Output₃(64) β”‚ Position β”‚ β”‚\n", + "β”‚ β”‚ Headβ‚„: Qβ‚„(64) βŠ— Kβ‚„(64) β†’ Attentionβ‚„ β†’ Outputβ‚„(64) β”‚ Long-range β”‚ β”‚\n", + "β”‚ β”‚ Headβ‚…: Qβ‚…(64) βŠ— Kβ‚…(64) β†’ Attentionβ‚… β†’ Outputβ‚…(64) β”‚ Local deps β”‚ β”‚\n", + "β”‚ β”‚ Head₆: Q₆(64) βŠ— K₆(64) β†’ Attention₆ β†’ Output₆(64) β”‚ Coreference β”‚ β”‚\n", + "β”‚ β”‚ Head₇: Q₇(64) βŠ— K₇(64) β†’ Attention₇ β†’ Output₇(64) β”‚ Composition β”‚ β”‚\n", + "β”‚ β”‚ Headβ‚ˆ: Qβ‚ˆ(64) βŠ— Kβ‚ˆ(64) β†’ Attentionβ‚ˆ β†’ Outputβ‚ˆ(64) β”‚ Global view β”‚ β”‚\n", + "β”‚ β”‚ ↓ β”‚ β”‚\n", + "β”‚ β”‚ [Concatenate] β†’ 8 Γ— 64 = 512 dimensions β”‚ β”‚\n", + "β”‚ β”‚ ↓ β”‚ β”‚\n", + "β”‚ β”‚ [Output Linear] β†’ Final representation (512) β”‚ β”‚\n", + "β”‚ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ Key Benefits of Multi-Head: β”‚\n", + "β”‚ β€’ Parallel specialization across different relationship types β”‚\n", + "β”‚ β€’ Same total parameters, distributed across multiple focused heads β”‚\n", + "β”‚ β€’ Each head can learn distinct attention patterns β”‚\n", + "β”‚ β€’ Enables rich, multifaceted understanding of sequences β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "```\n", "\n", "### The Multi-Head Process Detailed\n", @@ -600,18 +554,18 @@ { "cell_type": "code", "execution_count": null, - "id": "c02f9af2", + "id": "f540c1d4", "metadata": { "lines_to_next_cell": 1, "nbgrader": { "grade": false, "grade_id": "multihead-attention", - "locked": false, "solution": true } }, "outputs": [], "source": [ + "#| export\n", "class MultiHeadAttention:\n", " \"\"\"\n", " Multi-head attention mechanism.\n", @@ -772,7 +726,7 @@ { "cell_type": "code", "execution_count": null, - "id": "38708375", + "id": "636a3fed", "metadata": { "nbgrader": { "grade": true, @@ -822,12 +776,14 @@ "\n", " print(\"βœ… MultiHeadAttention works correctly!\")\n", "\n", - "test_unit_multihead_attention()" + "# Run test immediately when developing this module\n", + "if __name__ == \"__main__\":\n", + " test_unit_multihead_attention()" ] }, { "cell_type": "markdown", - "id": "3cd02d15", + "id": "da0586c2", "metadata": { "cell_marker": "\"\"\"" }, @@ -847,7 +803,7 @@ }, { "cell_type": "markdown", - "id": "58152928", + "id": "bd666af7", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -889,13 +845,12 @@ { "cell_type": "code", 
"execution_count": null, - "id": "6e672761", + "id": "a722af5d", "metadata": { "lines_to_next_cell": 1, "nbgrader": { "grade": false, "grade_id": "attention-complexity", - "locked": false, "solution": true } }, @@ -932,12 +887,11 @@ { "cell_type": "code", "execution_count": null, - "id": "86c3011a", + "id": "692eb505", "metadata": { "nbgrader": { "grade": false, "grade_id": "attention-timing", - "locked": false, "solution": true } }, @@ -987,7 +941,7 @@ }, { "cell_type": "markdown", - "id": "a9ee02ed", + "id": "5012f8f3", "metadata": { "cell_marker": "\"\"\"" }, @@ -1032,7 +986,7 @@ }, { "cell_type": "markdown", - "id": "9be100f1", + "id": "f0cfd879", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1052,8 +1006,8 @@ "1. Local Syntax Attention:\n", " \"The quick brown fox\"\n", " The β†’ quick (determiner-adjective)\n", - " quick β†’ brown (adjective-adjective)\n", - " brown β†’ fox (adjective-noun)\n", + " quick β†’ brown (adjective-adjective)\n", + " brown β†’ fox (adjective-noun)\n", "\n", "2. Long-Range Coreference:\n", " \"John went to the store. He bought milk.\"\n", @@ -1075,12 +1029,11 @@ { "cell_type": "code", "execution_count": null, - "id": "13905da5", + "id": "f8433bd9", "metadata": { "nbgrader": { "grade": false, "grade_id": "attention-scenarios", - "locked": false, "solution": true } }, @@ -1167,12 +1120,14 @@ "\n", " print(\"\\nβœ… All attention scenarios work correctly!\")\n", "\n", - "test_attention_scenarios()" + "# Run test immediately when developing this module\n", + "if __name__ == \"__main__\":\n", + " test_attention_scenarios()" ] }, { "cell_type": "markdown", - "id": "f55d7fc7", + "id": "76625dbe", "metadata": { "cell_marker": "\"\"\"" }, @@ -1206,13 +1161,13 @@ }, { "cell_type": "markdown", - "id": "e845f69f", + "id": "66c41cfa", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 }, "source": [ - "## πŸ§ͺ Module Integration Test\n", + "## 6. 
Module Integration Test\n", "\n", "Final validation that everything works together correctly." ] @@ -1220,7 +1175,7 @@ { "cell_type": "code", "execution_count": null, - "id": "43b4ca05", + "id": "c5c381db", "metadata": { "nbgrader": { "grade": true, @@ -1258,14 +1213,15 @@ " print(\"πŸŽ‰ ALL TESTS PASSED! Module ready for export.\")\n", " print(\"Run: tito module complete 12\")\n", "\n", - "# Call before module summary\n", - "test_module()" + "# Run comprehensive module test when executed directly\n", + "if __name__ == \"__main__\":\n", + " test_module()" ] }, { "cell_type": "code", "execution_count": null, - "id": "1b285af6", + "id": "10ced70a", "metadata": {}, "outputs": [], "source": [ @@ -1277,7 +1233,7 @@ }, { "cell_type": "markdown", - "id": "4afd6eb3", + "id": "f42b351d", "metadata": { "cell_marker": "\"\"\"" }, @@ -1317,7 +1273,7 @@ }, { "cell_type": "markdown", - "id": "30c9254b", + "id": "51aafac3", "metadata": { "cell_marker": "\"\"\"" }, @@ -1345,7 +1301,7 @@ "Your attention implementation is the core mechanism that enables modern language models!\n", "Export with: `tito module complete 12`\n", "\n", - "**Next**: Module 13 will combine attention with feed-forward layers to build complete transformer blocks, leading to GPT-style language models!\n", + "**Next**: Module 13 will combine attention with feed-forward layers to build complete transformer blocks!\n", "\n", "### What You Just Built Powers\n", "- **GPT models**: Your attention mechanism is the exact pattern used in ChatGPT and GPT-4\n", diff --git a/modules/source/13_transformers/transformers_dev.ipynb b/modules/source/13_transformers/transformers_dev.ipynb index cacf8001..dc3f4a72 100644 --- a/modules/source/13_transformers/transformers_dev.ipynb +++ b/modules/source/13_transformers/transformers_dev.ipynb @@ -2,24 +2,24 @@ "cells": [ { "cell_type": "markdown", - "id": "0fa7ad93", + "id": "763d8283", "metadata": { "cell_marker": "\"\"\"" }, "source": [ "# Module 13: Transformers - Complete 
Transformer Architecture\n", "\n", - "Welcome to Module 13! You're about to build the complete transformer architecture that powers modern language models like GPT.\n", + "Welcome to Module 13! You're about to build the complete transformer architecture that powers modern language models like GPT, Claude, and ChatGPT.\n", "\n", "## πŸ”— Prerequisites & Progress\n", - "**You've Built**: Tensors, activations, layers, attention mechanisms, embeddings, and all foundational components\n", + "**You've Built**: Tokenization, embeddings, attention mechanisms, and all foundational components\n", "**You'll Build**: TransformerBlock, complete GPT architecture, and autoregressive generation\n", "**You'll Enable**: Full language model training and text generation capabilities\n", "\n", "**Connection Map**:\n", "```\n", - "Attention + Layers + Embeddings β†’ Transformers β†’ GPT Architecture\n", - "(sequence processing) (building blocks) (complete model) (language generation)\n", + "Tokenization + Embeddings + Attention β†’ Transformers β†’ Language Generation\n", + "(textβ†’numbers) (learnable vectors) (sequence modeling) (complete models)\n", "```\n", "\n", "## Learning Objectives\n", @@ -30,16 +30,49 @@ "4. Understand parameter scaling in large language models\n", "5. Test transformer components and generation pipeline\n", "\n", - "Let's get started!\n", - "\n", + "Let's get started!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0857efbe", + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp models.transformer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b58c4de", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import numpy as np\n", + "from tinytorch.core.tensor import Tensor\n", + "from tinytorch.core.layers import Linear\n", + "from tinytorch.core.attention import MultiHeadAttention\n", + "from tinytorch.core.activations import GELU" + ] + }, + { + "cell_type": "markdown", + "id": "b35ba8b8", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ "## πŸ“¦ Where This Code Lives in the Final Package\n", "\n", - "**Learning Side:** You work in `modules/13_transformers/transformers_dev.py` \n", + "**Learning Side:** You work in `modules/13_transformers/transformers_dev.py`\n", "**Building Side:** Code exports to `tinytorch.models.transformer`\n", "\n", "```python\n", "# How to use this module:\n", - "from tinytorch.models.transformer import TransformerBlock, GPT\n", + "from tinytorch.models.transformer import TransformerBlock, GPT, LayerNorm, MLP\n", "```\n", "\n", "**Why this matters:**\n", @@ -52,158 +85,166 @@ { "cell_type": "code", "execution_count": null, - "id": "c5e5dae4", + "id": "e36e4f2c", "metadata": { - "lines_to_next_cell": 1, - "nbgrader": { - "grade": false, - "grade_id": "imports", - "solution": true - } + "lines_to_next_cell": 1 }, "outputs": [], "source": [ - "#| default_exp models.transformer\n", - "#| export\n", - "\n", "import numpy as np\n", "import math\n", "from typing import Optional, List\n", "\n", - "# Minimal implementations for development - in practice these import from previous modules\n", - "class Tensor:\n", - " \"\"\"Minimal Tensor class for transformer development - imports from Module 01 in practice.\"\"\"\n", - " def __init__(self, data, requires_grad=False):\n", - " self.data = np.array(data)\n", - " self.shape = 
self.data.shape\n", - " self.size = self.data.size\n", - " self.requires_grad = requires_grad\n", - " self.grad = None\n", + "# Import from previous modules - following proper dependency chain\n", + "# Note: Actual imports happen in try/except blocks below with fallback implementations\n", + "from tinytorch.core.tensor import Tensor\n", + "from tinytorch.core.layers import Linear\n", + "# MultiHeadAttention import happens in try/except below\n", "\n", - " def __add__(self, other):\n", - " if isinstance(other, Tensor):\n", - " return Tensor(self.data + other.data)\n", - " return Tensor(self.data + other)\n", + "# For development, we'll use minimal implementations if imports fail\n", + "try:\n", + " from tinytorch.core.tensor import Tensor\n", + "except ImportError:\n", + " print(\"Warning: Using minimal Tensor implementation for development\")\n", + " class Tensor:\n", + " \"\"\"Minimal Tensor class for transformer development.\"\"\"\n", + " def __init__(self, data, requires_grad=False):\n", + " self.data = np.array(data)\n", + " self.shape = self.data.shape\n", + " self.size = self.data.size\n", + " self.requires_grad = requires_grad\n", + " self.grad = None\n", "\n", - " def __mul__(self, other):\n", - " if isinstance(other, Tensor):\n", - " return Tensor(self.data * other.data)\n", - " return Tensor(self.data * other)\n", + " def __add__(self, other):\n", + " if isinstance(other, Tensor):\n", + " return Tensor(self.data + other.data)\n", + " return Tensor(self.data + other)\n", "\n", - " def matmul(self, other):\n", - " return Tensor(np.dot(self.data, other.data))\n", + " def __mul__(self, other):\n", + " if isinstance(other, Tensor):\n", + " return Tensor(self.data * other.data)\n", + " return Tensor(self.data * other)\n", "\n", - " def sum(self, axis=None, keepdims=False):\n", - " return Tensor(self.data.sum(axis=axis, keepdims=keepdims))\n", + " def matmul(self, other):\n", + " return Tensor(np.dot(self.data, other.data))\n", "\n", - " def mean(self, 
axis=None, keepdims=False):\n", - " return Tensor(self.data.mean(axis=axis, keepdims=keepdims))\n", + " def sum(self, axis=None, keepdims=False):\n", + " return Tensor(self.data.sum(axis=axis, keepdims=keepdims))\n", "\n", - " def reshape(self, *shape):\n", - " return Tensor(self.data.reshape(shape))\n", + " def mean(self, axis=None, keepdims=False):\n", + " return Tensor(self.data.mean(axis=axis, keepdims=keepdims))\n", "\n", - " def __repr__(self):\n", - " return f\"Tensor(data={self.data}, shape={self.shape})\"\n", + " def reshape(self, *shape):\n", + " return Tensor(self.data.reshape(shape))\n", "\n", - "class Linear:\n", - " \"\"\"Minimal Linear layer - imports from Module 03 in practice.\"\"\"\n", - " def __init__(self, in_features, out_features, bias=True):\n", - " # Xavier/Glorot initialization\n", - " std = math.sqrt(2.0 / (in_features + out_features))\n", - " self.weight = Tensor(np.random.normal(0, std, (in_features, out_features)))\n", - " self.bias = Tensor(np.zeros(out_features)) if bias else None\n", + " def __repr__(self):\n", + " return f\"Tensor(data={self.data}, shape={self.shape})\"\n", "\n", - " def forward(self, x):\n", - " output = x.matmul(self.weight)\n", - " if self.bias is not None:\n", - " output = output + self.bias\n", - " return output\n", + "try:\n", + " from tinytorch.core.layers import Linear\n", + "except ImportError:\n", + " class Linear:\n", + " \"\"\"Minimal Linear layer for development.\"\"\"\n", + " def __init__(self, in_features, out_features, bias=True):\n", + " std = math.sqrt(2.0 / (in_features + out_features))\n", + " self.weight = Tensor(np.random.normal(0, std, (in_features, out_features)))\n", + " self.bias = Tensor(np.zeros(out_features)) if bias else None\n", "\n", - " def parameters(self):\n", - " params = [self.weight]\n", - " if self.bias is not None:\n", - " params.append(self.bias)\n", - " return params\n", + " def forward(self, x):\n", + " output = x.matmul(self.weight)\n", + " if self.bias is not None:\n", + 
" output = output + self.bias\n", + " return output\n", "\n", - "class MultiHeadAttention:\n", - " \"\"\"Minimal MultiHeadAttention - imports from Module 12 in practice.\"\"\"\n", - " def __init__(self, embed_dim, num_heads):\n", - " assert embed_dim % num_heads == 0\n", - " self.embed_dim = embed_dim\n", - " self.num_heads = num_heads\n", - " self.head_dim = embed_dim // num_heads\n", + " def parameters(self):\n", + " params = [self.weight]\n", + " if self.bias is not None:\n", + " params.append(self.bias)\n", + " return params\n", "\n", - " self.q_proj = Linear(embed_dim, embed_dim)\n", - " self.k_proj = Linear(embed_dim, embed_dim)\n", - " self.v_proj = Linear(embed_dim, embed_dim)\n", - " self.out_proj = Linear(embed_dim, embed_dim)\n", + "try:\n", + " from tinytorch.core.attention import MultiHeadAttention\n", + "except ImportError:\n", + " class MultiHeadAttention:\n", + " \"\"\"Minimal MultiHeadAttention for development.\"\"\"\n", + " def __init__(self, embed_dim, num_heads):\n", + " assert embed_dim % num_heads == 0\n", + " self.embed_dim = embed_dim\n", + " self.num_heads = num_heads\n", + " self.head_dim = embed_dim // num_heads\n", "\n", - " def forward(self, x, mask=None):\n", - " batch_size, seq_len, embed_dim = x.shape\n", + " self.q_proj = Linear(embed_dim, embed_dim)\n", + " self.k_proj = Linear(embed_dim, embed_dim)\n", + " self.v_proj = Linear(embed_dim, embed_dim)\n", + " self.out_proj = Linear(embed_dim, embed_dim)\n", "\n", - " # Linear projections\n", - " Q = self.q_proj.forward(x)\n", - " K = self.k_proj.forward(x)\n", - " V = self.v_proj.forward(x)\n", + " def forward(self, query, key, value, mask=None):\n", + " batch_size, seq_len, embed_dim = query.shape\n", "\n", - " # Reshape for multi-head attention\n", - " Q = Q.reshape(batch_size, seq_len, self.num_heads, self.head_dim)\n", - " K = K.reshape(batch_size, seq_len, self.num_heads, self.head_dim)\n", - " V = V.reshape(batch_size, seq_len, self.num_heads, self.head_dim)\n", + " # Linear 
projections\n", + " Q = self.q_proj.forward(query)\n", + " K = self.k_proj.forward(key)\n", + " V = self.v_proj.forward(value)\n", "\n", - " # Transpose to (batch_size, num_heads, seq_len, head_dim)\n", - " Q = Tensor(np.transpose(Q.data, (0, 2, 1, 3)))\n", - " K = Tensor(np.transpose(K.data, (0, 2, 1, 3)))\n", - " V = Tensor(np.transpose(V.data, (0, 2, 1, 3)))\n", + " # Reshape for multi-head attention\n", + " Q = Q.reshape(batch_size, seq_len, self.num_heads, self.head_dim)\n", + " K = K.reshape(batch_size, seq_len, self.num_heads, self.head_dim)\n", + " V = V.reshape(batch_size, seq_len, self.num_heads, self.head_dim)\n", "\n", - " # Scaled dot-product attention\n", - " scores = Tensor(np.matmul(Q.data, np.transpose(K.data, (0, 1, 3, 2))))\n", - " scores = scores * (1.0 / math.sqrt(self.head_dim))\n", + " # Transpose to (batch_size, num_heads, seq_len, head_dim)\n", + " Q = Tensor(np.transpose(Q.data, (0, 2, 1, 3)))\n", + " K = Tensor(np.transpose(K.data, (0, 2, 1, 3)))\n", + " V = Tensor(np.transpose(V.data, (0, 2, 1, 3)))\n", "\n", - " # Apply causal mask for autoregressive generation\n", - " if mask is not None:\n", - " scores = Tensor(scores.data + mask.data)\n", + " # Scaled dot-product attention\n", + " scores = Tensor(np.matmul(Q.data, np.transpose(K.data, (0, 1, 3, 2))))\n", + " scores = scores * (1.0 / math.sqrt(self.head_dim))\n", "\n", - " # Softmax\n", - " attention_weights = self._softmax(scores)\n", + " # Apply causal mask for autoregressive generation\n", + " if mask is not None:\n", + " scores = Tensor(scores.data + mask.data)\n", "\n", - " # Apply attention to values\n", - " out = Tensor(np.matmul(attention_weights.data, V.data))\n", + " # Softmax\n", + " attention_weights = self._softmax(scores)\n", "\n", - " # Transpose back and reshape\n", - " out = Tensor(np.transpose(out.data, (0, 2, 1, 3)))\n", - " out = out.reshape(batch_size, seq_len, embed_dim)\n", + " # Apply attention to values\n", + " out = Tensor(np.matmul(attention_weights.data, 
V.data))\n", "\n", - " # Final linear projection\n", - " return self.out_proj.forward(out)\n", + " # Transpose back and reshape\n", + " out = Tensor(np.transpose(out.data, (0, 2, 1, 3)))\n", + " out = out.reshape(batch_size, seq_len, embed_dim)\n", "\n", - " def _softmax(self, x):\n", - " \"\"\"Numerically stable softmax.\"\"\"\n", - " exp_x = Tensor(np.exp(x.data - np.max(x.data, axis=-1, keepdims=True)))\n", - " return Tensor(exp_x.data / np.sum(exp_x.data, axis=-1, keepdims=True))\n", + " # Final linear projection\n", + " return self.out_proj.forward(out)\n", "\n", - " def parameters(self):\n", - " params = []\n", - " params.extend(self.q_proj.parameters())\n", - " params.extend(self.k_proj.parameters())\n", - " params.extend(self.v_proj.parameters())\n", - " params.extend(self.out_proj.parameters())\n", - " return params\n", + " def _softmax(self, x):\n", + " \"\"\"Numerically stable softmax.\"\"\"\n", + " exp_x = Tensor(np.exp(x.data - np.max(x.data, axis=-1, keepdims=True)))\n", + " return Tensor(exp_x.data / np.sum(exp_x.data, axis=-1, keepdims=True))\n", "\n", - "class Embedding:\n", - " \"\"\"Minimal Embedding layer - imports from Module 11 in practice.\"\"\"\n", - " def __init__(self, vocab_size, embed_dim):\n", - " self.vocab_size = vocab_size\n", - " self.embed_dim = embed_dim\n", - " # Initialize with small random values\n", - " self.weight = Tensor(np.random.normal(0, 0.02, (vocab_size, embed_dim)))\n", + " def parameters(self):\n", + " params = []\n", + " params.extend(self.q_proj.parameters())\n", + " params.extend(self.k_proj.parameters())\n", + " params.extend(self.v_proj.parameters())\n", + " params.extend(self.out_proj.parameters())\n", + " return params\n", "\n", - " def forward(self, indices):\n", - " # Simple embedding lookup\n", - " return Tensor(self.weight.data[indices.data])\n", + "try:\n", + " from tinytorch.core.embeddings import Embedding\n", + "except ImportError:\n", + " class Embedding:\n", + " \"\"\"Minimal Embedding layer for 
development.\"\"\"\n", + " def __init__(self, vocab_size, embed_dim):\n", + " self.vocab_size = vocab_size\n", + " self.embed_dim = embed_dim\n", + " self.weight = Tensor(np.random.normal(0, 0.02, (vocab_size, embed_dim)))\n", "\n", - " def parameters(self):\n", - " return [self.weight]\n", + " def forward(self, indices):\n", + " return Tensor(self.weight.data[indices.data.astype(int)])\n", + "\n", + " def parameters(self):\n", + " return [self.weight]\n", "\n", "def gelu(x):\n", " \"\"\"GELU activation function.\"\"\"\n", @@ -212,7 +253,7 @@ }, { "cell_type": "markdown", - "id": "946c33e2", + "id": "77ba5604", "metadata": { "cell_marker": "\"\"\"" }, @@ -228,41 +269,54 @@ "### Complete GPT Architecture Overview\n", "\n", "```\n", - "Input: \"Hello world\" β†’ [Token IDs: 15496, 1917]\n", - " ↓\n", - " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " β”‚ EMBEDDING LAYER β”‚\n", - " β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", - " β”‚ β”‚Token Embed β”‚ + β”‚ Positional Embed β”‚ β”‚\n", - " β”‚ β”‚[15496β†’vec] β”‚ β”‚[pos_0β†’vec, pos_1β†’vec]β”‚ β”‚\n", - " β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", - " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " ↓\n", - " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " β”‚ TRANSFORMER BLOCK 1 β”‚\n", - " β”‚ β”‚\n", - " β”‚ Input β†’ LayerNorm β†’ MultiHeadAttention β”‚\n", - " β”‚ ↓ ↓ β”‚\n", - " β”‚ └────── Residual Add β†β”€β”€β”€β”€β”˜ β”‚\n", - " β”‚ ↓ β”‚\n", - " β”‚ Result β†’ LayerNorm β†’ MLP (Feed Forward) β”‚\n", - " β”‚ ↓ ↓ β”‚\n", - " β”‚ 
└──── Residual Add β†β”€β”€β”˜ β”‚\n", - " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " ↓\n", - " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " β”‚ TRANSFORMER BLOCK 2 β”‚\n", - " β”‚ ... (same structure) β”‚\n", - " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " ↓\n", - " ... (more blocks)\n", - " ↓\n", - " β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - " β”‚ OUTPUT HEAD β”‚\n", - " β”‚ Final LayerNorm β†’ Linear β†’ Vocabulary Logitsβ”‚\n", - " β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " ↓\n", - "Output: [Prob(\"Hello\"), Prob(\"world\"), Prob(\"!\"), ...]\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ COMPLETE GPT ARCHITECTURE: From Text to Generation β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ INPUT: \"Hello world\" β†’ Token IDs: [15496, 1917] β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ EMBEDDING LAYER β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ β”‚\n", + "β”‚ β”‚ β”‚Token Embed β”‚ + β”‚ Positional Embedding β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚15496β†’[0.1, β”‚ β”‚ pos_0β†’[0.05, -0.02, ...] β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ 0.3,..]β”‚ β”‚ pos_1β†’[0.12, 0.08, ...] β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚1917β†’[0.2, β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ -0.1,..]β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ TRANSFORMER BLOCK 1 β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ x β†’ LayerNorm β†’ MultiHeadAttention β†’ + x β†’ result β”‚ β”‚\n", + "β”‚ β”‚ β”‚ ↑ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ residual connection β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ result β†’ LayerNorm β†’ MLP (Feed Forward) β†’ + result β”‚ β”‚\n", + "β”‚ β”‚ β”‚ ↑ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ residual connection β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ TRANSFORMER BLOCK 2 (same pattern) β”‚\n", + "β”‚ ↓ 
β”‚\n", + "β”‚ ... (more blocks) ... β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ OUTPUT HEAD β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ final_hidden β†’ LayerNorm β†’ Linear(embed_dim, vocab_size) β”‚ β”‚\n", + "β”‚ β”‚ ↓ β”‚ β”‚\n", + "β”‚ β”‚ Vocabulary Logits: [0.1, 0.05, 0.8, ...] β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ OUTPUT: Next Token Probabilities β”‚\n", + "β”‚ \"Hello\" β†’ 10%, \"world\" β†’ 5%, \"!\" β†’ 80%, ... β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "```\n", "\n", "### Why Transformers Dominate\n", @@ -285,7 +339,7 @@ }, { "cell_type": "markdown", - "id": "f8388844", + "id": "b4f69559", "metadata": { "cell_marker": "\"\"\"" }, @@ -317,20 +371,38 @@ "Residual connections are the secret to training deep networks. 
They create \"gradient highways\" that allow information to flow directly through the network.\n", "\n", "```\n", - "Residual Pattern in Transformers:\n", - "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ Pre-Norm Architecture (Modern Standard): β”‚\n", - "β”‚ β”‚\n", - "β”‚ x β†’ LayerNorm β†’ MultiHeadAttention β†’ + x β”‚\n", - "β”‚ β”‚ ↑ β”‚\n", - "β”‚ β”‚ residual connection β”‚ β”‚\n", - "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", - "β”‚ β”‚ β”‚\n", - "β”‚ x β†’ LayerNorm β†’ MLP β†’ + x β”‚\n", - "β”‚ β”‚ ↑ ↑ β”‚\n", - "β”‚ β”‚ residual connection β”‚ β”‚\n", - "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", - "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ RESIDUAL CONNECTIONS: The Gradient Highway System β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ PRE-NORM ARCHITECTURE (Modern Standard): β”‚\n", + "β”‚ β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ ATTENTION SUB-LAYER β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Input (x) ────┬─→ LayerNorm ─→ MultiHeadAttention ─┐ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β–Ό β”‚ β”‚\n", + "β”‚ β”‚ └────→ ADD ─→ Output to next sub-layer β”‚ β”‚\n", + "β”‚ β”‚ (x + attention_output) β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ MLP SUB-LAYER β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Input (x) ────┬─→ LayerNorm ─→ MLP (Feed Forward) ─┐ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β–Ό β”‚ β”‚\n", + "β”‚ β”‚ └────→ ADD ─→ Final Output β”‚ β”‚\n", + "β”‚ β”‚ (x + mlp_output) β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ KEY INSIGHT: Each sub-layer ADDS to the residual stream β”‚\n", + "β”‚ rather than replacing it, preserving information flow! 
β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "```\n", "\n", "**Gradient Flow Visualization:**\n", @@ -402,7 +474,7 @@ }, { "cell_type": "markdown", - "id": "aa924c73", + "id": "9a837896", "metadata": { "cell_marker": "\"\"\"" }, @@ -420,7 +492,7 @@ }, { "cell_type": "markdown", - "id": "3dc23c53", + "id": "76f36a18", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -437,31 +509,46 @@ "#### Layer Norm Visualization\n", "\n", "```\n", - "Input Tensor: (batch=2, seq=3, features=4)\n", - "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ Sample 1: [[1.0, 2.0, 3.0, 4.0], β”‚\n", - "β”‚ [5.0, 6.0, 7.0, 8.0], β”‚\n", - "β”‚ [9.0, 10., 11., 12.]] β”‚\n", - "β”‚ β”‚\n", - "β”‚ Sample 2: [[13., 14., 15., 16.], β”‚\n", - "β”‚ [17., 18., 19., 20.], β”‚\n", - "β”‚ [21., 22., 23., 24.]] β”‚\n", - "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " ↓ Layer Norm (across features for each position)\n", - "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ Each position normalized to mean=0, std=1β”‚\n", - "β”‚ Sample 1: [[-1.34, -0.45, 0.45, 1.34], β”‚\n", - "β”‚ [-1.34, -0.45, 0.45, 1.34], β”‚\n", - "β”‚ [-1.34, -0.45, 0.45, 1.34]] β”‚\n", - "β”‚ β”‚\n", - "β”‚ Sample 2: [[-1.34, -0.45, 0.45, 1.34], β”‚\n", - "β”‚ [-1.34, -0.45, 0.45, 1.34], β”‚\n", - "β”‚ [-1.34, -0.45, 0.45, 1.34]] β”‚\n", - "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", - " ↓ Apply learnable scale (Ξ³) and shift (Ξ²)\n", - 
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ Final Output: Ξ³ * normalized + Ξ² β”‚\n", - "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ LAYER NORMALIZATION: Stabilizing Deep Networks β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ INPUT TENSOR: (batch=2, seq=3, features=4) β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Sample 1: [[1.0, 2.0, 3.0, 4.0], ← Position 0 β”‚ β”‚\n", + "β”‚ β”‚ [5.0, 6.0, 7.0, 8.0], ← Position 1 β”‚ β”‚\n", + "β”‚ β”‚ [9.0, 10.0, 11.0, 12.0]] ← Position 2 β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Sample 2: [[13., 14., 15., 16.], ← Position 0 β”‚ β”‚\n", + "β”‚ β”‚ [17., 18., 19., 20.], ← Position 1 β”‚ β”‚\n", + "β”‚ β”‚ [21., 22., 23., 24.]] ← Position 2 β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ NORMALIZE ACROSS FEATURES (per position) β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ AFTER NORMALIZATION: 
Each position β†’ mean=0, std=1 β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Sample 1: [[-1.34, -0.45, 0.45, 1.34], β”‚ β”‚\n", + "β”‚ β”‚ [-1.34, -0.45, 0.45, 1.34], β”‚ β”‚\n", + "β”‚ β”‚ [-1.34, -0.45, 0.45, 1.34]] β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ Sample 2: [[-1.34, -0.45, 0.45, 1.34], β”‚ β”‚\n", + "β”‚ β”‚ [-1.34, -0.45, 0.45, 1.34], β”‚ β”‚\n", + "β”‚ β”‚ [-1.34, -0.45, 0.45, 1.34]] β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ APPLY LEARNABLE PARAMETERS: Ξ³ * norm + Ξ² β”‚\n", + "β”‚ ↓ β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ FINAL OUTPUT: Model can learn any desired distribution β”‚ β”‚\n", + "β”‚ β”‚ Ξ³ (scale) and Ξ² (shift) are learned during training β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ KEY INSIGHT: Unlike batch norm, each sample normalized β”‚\n", + "β”‚ independently - perfect for variable-length sequences! 
β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "```\n", "\n", "#### Key Properties\n", @@ -473,7 +560,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4c26bf73", + "id": "6878edf0", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -484,6 +571,7 @@ }, "outputs": [], "source": [ + "#| export\n", "class LayerNorm:\n", " \"\"\"\n", " Layer Normalization for transformer blocks.\n", @@ -564,7 +652,7 @@ }, { "cell_type": "markdown", - "id": "33272f95", + "id": "b57594b0", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -580,7 +668,7 @@ { "cell_type": "code", "execution_count": null, - "id": "42c87208", + "id": "f187ea71", "metadata": { "nbgrader": { "grade": true, @@ -620,12 +708,14 @@ "\n", " print(\"βœ… LayerNorm works correctly!\")\n", "\n", - "test_unit_layer_norm()" + "# Run test immediately when developing this module\n", + "if __name__ == \"__main__\":\n", + " test_unit_layer_norm()" ] }, { "cell_type": "markdown", - "id": "4eb1e55a", + "id": "20fa9a45", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -710,7 +800,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c5acb8f3", + "id": "36edc347", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -721,6 +811,7 @@ }, "outputs": [], "source": [ + "#| export\n", "class MLP:\n", " \"\"\"\n", " Multi-Layer Perceptron (Feed-Forward Network) for transformer blocks.\n", @@ -799,7 +890,7 @@ }, { "cell_type": "markdown", - "id": "054236fd", + "id": "51e920ba", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -815,7 +906,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8849696d", + "id": "daa33cf0", "metadata": { "nbgrader": { "grade": true, @@ -856,12 +947,14 @@ "\n", " print(\"βœ… MLP works correctly!\")\n", "\n", - "test_unit_mlp()" + "# Run 
test immediately when developing this module\n", + "if __name__ == \"__main__\":\n", + " test_unit_mlp()" ] }, { "cell_type": "markdown", - "id": "dac755a4", + "id": "0f7a5449", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -963,7 +1056,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ad0f601", + "id": "3b54f39c", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -974,6 +1067,7 @@ }, "outputs": [], "source": [ + "#| export\n", "class TransformerBlock:\n", " \"\"\"\n", " Complete Transformer Block with self-attention, MLP, and residual connections.\n", @@ -1046,7 +1140,8 @@ " # First sub-layer: Multi-head self-attention with residual connection\n", " # Pre-norm: LayerNorm before attention\n", " normed1 = self.ln1.forward(x)\n", - " attention_out = self.attention.forward(normed1, mask)\n", + " # Self-attention: query, key, value are all the same (normed1)\n", + " attention_out = self.attention.forward(normed1, normed1, normed1, mask)\n", "\n", " # Residual connection\n", " x = x + attention_out\n", @@ -1074,7 +1169,7 @@ }, { "cell_type": "markdown", - "id": "736d101d", + "id": "78bc4bf0", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1090,7 +1185,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65540a0f", + "id": "2f8fa7e8", "metadata": { "nbgrader": { "grade": true, @@ -1134,12 +1229,14 @@ "\n", " print(\"βœ… TransformerBlock works correctly!\")\n", "\n", - "test_unit_transformer_block()" + "# Run test immediately when developing this module\n", + "if __name__ == \"__main__\":\n", + " test_unit_transformer_block()" ] }, { "cell_type": "markdown", - "id": "17ad8926", + "id": "d30f17d2", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1209,23 +1306,36 @@ "During training, GPT sees the entire sequence but must not \"cheat\" by looking at future tokens:\n", "\n", "```\n", - "Causal Attention Mask:\n", - "\n", - "Sequence: [\"The\", \"cat\", \"sat\", \"on\"]\n", - "Positions: 0 1 
2 3\n", - "\n", - "Attention Matrix (what each position can see):\n", - " 0 1 2 3\n", - " 0 [ βœ“ βœ— βœ— βœ— ] # \"The\" only sees itself\n", - " 1 [ βœ“ βœ“ βœ— βœ— ] # \"cat\" sees \"The\" and itself\n", - " 2 [ βœ“ βœ“ βœ“ βœ— ] # \"sat\" sees \"The\", \"cat\", itself\n", - " 3 [ βœ“ βœ“ βœ“ βœ“ ] # \"on\" sees all previous tokens\n", - "\n", - "Implementation: Upper triangular matrix with -∞\n", - "[[ 0, -∞, -∞, -∞],\n", - " [ 0, 0, -∞, -∞],\n", - " [ 0, 0, 0, -∞],\n", - " [ 0, 0, 0, 0]]\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ CAUSAL MASKING: Preventing Future Information Leakage β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ SEQUENCE: [\"The\", \"cat\", \"sat\", \"on\"] β”‚\n", + "β”‚ POSITIONS: 0 1 2 3 β”‚\n", + "β”‚ β”‚\n", + "β”‚ ATTENTION MATRIX (what each position can see): β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ Pos: 0 1 2 3 β”‚ β”‚\n", + "β”‚ β”‚ Pos 0: [ βœ“ βœ— βœ— βœ— ] ← \"The\" only sees itself β”‚ β”‚\n", + "β”‚ β”‚ Pos 1: [ βœ“ βœ“ βœ— βœ— ] ← \"cat\" sees \"The\" + self β”‚ β”‚\n", + "β”‚ β”‚ Pos 2: [ βœ“ βœ“ βœ“ βœ— ] ← \"sat\" sees all previous β”‚ β”‚\n", + "β”‚ β”‚ Pos 3: [ βœ“ βœ“ βœ“ βœ“ ] ← \"on\" sees everything β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ IMPLEMENTATION: Upper triangular matrix with -∞ β”‚\n", + "β”‚ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ [[ 0, -∞, -∞, -∞], β”‚ β”‚\n", + "β”‚ β”‚ [ 0, 0, -∞, -∞], β”‚ β”‚\n", + "β”‚ β”‚ [ 0, 0, 0, -∞], β”‚ β”‚\n", + "β”‚ β”‚ [ 0, 0, 0, 0]] β”‚ β”‚\n", + "β”‚ β”‚ β”‚ β”‚\n", + "β”‚ β”‚ After softmax: -∞ becomes 0 probability β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ WHY THIS WORKS: During training, model sees entire sequence β”‚\n", + "β”‚ but mask ensures position i only attends to positions ≀ i β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "```\n", "\n", "#### Generation Temperature Control\n", @@ -1280,7 +1390,7 @@ { "cell_type": "code", "execution_count": null, - "id": "586f2e46", + "id": "1d86de25", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1291,6 +1401,7 @@ }, "outputs": [], "source": [ + "#| export\n", "class GPT:\n", " \"\"\"\n", " Complete GPT (Generative Pre-trained Transformer) model.\n", @@ -1477,7 +1588,7 @@ }, { "cell_type": "markdown", - "id": "c9a7758f", + "id": "6994ec05", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1493,7 +1604,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e4ba240a", + "id": "377dc692", "metadata": { "nbgrader": { "grade": true, @@ -1544,12 +1655,14 @@ "\n", " print(\"βœ… GPT model works correctly!\")\n", "\n", - "test_unit_gpt()" + "# Run test immediately when developing this module\n", + "if __name__ == \"__main__\":\n", + " test_unit_gpt()" ] }, { "cell_type": "markdown", - "id": "1ecc1961", + "id": "66fa0b98", 
"metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1595,7 +1708,7 @@ { "cell_type": "code", "execution_count": null, - "id": "04f8fd5c", + "id": "6381a082", "metadata": { "nbgrader": { "grade": false, @@ -1667,7 +1780,7 @@ }, { "cell_type": "markdown", - "id": "0c53c926", + "id": "540a7b4d", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1719,28 +1832,40 @@ "### The Attention Memory Wall\n", "\n", "```\n", - "Attention Memory Wall Visualization:\n", - "\n", - "Sequence Length vs Memory Usage:\n", - "\n", - "1K tokens: [β–“] 16 MB # Manageable\n", - "2K tokens: [β–“β–“β–“β–“] 64 MB # 4Γ— memory (quadratic!)\n", - "4K tokens: [β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“] 256 MB # 16Γ— memory\n", - "8K tokens: [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ] 1 GB # 64Γ— memory\n", - "16K tokens: β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ 4 GB\n", - "32K tokens: β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ 16 GB\n", - "\n", - "This is why:\n", - "- GPT-3 context: 2K tokens\n", - "- GPT-4 context: 8K tokens (32K in turbo)\n", - "- Claude-3: 200K tokens (requires special techniques!)\n", + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ ATTENTION MEMORY WALL: Why Long Context is Expensive β”‚\n", + 
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ β”‚\n", + "β”‚ MEMORY USAGE BY SEQUENCE LENGTH (Quadratic Growth): β”‚\n", + "β”‚ β”‚\n", + "β”‚ 1K tokens: [β–“] 16 MB ← Manageable β”‚\n", + "β”‚ 2K tokens: [β–“β–“β–“β–“] 64 MB ← 4Γ— memory (quadratic!) β”‚\n", + "β”‚ 4K tokens: [β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“β–“] 256 MB ← 16Γ— memory β”‚\n", + "β”‚ 8K tokens: [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ] 1 GB β”‚\n", + "β”‚ 16K tokens: β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ 4 GB β”‚\n", + "β”‚ 32K tokens: β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ 16 GB β”‚\n", + "β”‚ β”‚\n", + "β”‚ REAL-WORLD CONTEXT LIMITS: β”‚\n", + "β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚\n", + "β”‚ β”‚ GPT-3: 2K tokens (limited by memory) β”‚ β”‚\n", + "β”‚ β”‚ GPT-4: 8K tokens (32K with optimizations) β”‚ β”‚\n", + "β”‚ β”‚ Claude-3: 200K tokens (special techniques required!) 
β”‚ β”‚\n", + "β”‚ β”‚ GPT-4o: 128K tokens (efficient attention) β”‚ β”‚\n", + "β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚\n", + "β”‚ β”‚\n", + "β”‚ MATHEMATICAL SCALING: β”‚\n", + "β”‚ Memory = batch_size Γ— num_heads Γ— seq_lenΒ² Γ— 4 bytes β”‚\n", + "β”‚ ↑ β”‚\n", + "β”‚ This is the killer! β”‚\n", + "β”‚ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜\n", "```" ] }, { "cell_type": "code", "execution_count": null, - "id": "039199a8", + "id": "0849dfd0", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1797,7 +1922,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a249a5a0", + "id": "3d83a8fb", "metadata": { "nbgrader": { "grade": false, @@ -1842,7 +1967,7 @@ }, { "cell_type": "markdown", - "id": "253b8e90", + "id": "61c047e3", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1856,11 +1981,11 @@ { "cell_type": "code", "execution_count": null, - "id": "d9431f80", + "id": "1f23223b", "metadata": { "nbgrader": { "grade": true, - "grade_id": "module-integration", + "grade_id": "test-module", "locked": true, "points": 25 } @@ -1927,15 +2052,16 @@ "\n", " print(\"\\n\" + \"=\" * 50)\n", " print(\"πŸŽ‰ ALL TESTS PASSED! 
Module ready for export.\")\n", - " print(\"Run: tito module complete 13_transformers\")\n", + " print(\"Run: tito module complete 13\")\n", "\n", + "# Call the comprehensive test\n", "test_module()" ] }, { "cell_type": "code", "execution_count": null, - "id": "28f5c8ca", + "id": "d9c5a7f9", "metadata": {}, "outputs": [], "source": [ @@ -1947,201 +2073,68 @@ }, { "cell_type": "markdown", - "id": "440ab431", + "id": "203f8df1", "metadata": { "cell_marker": "\"\"\"" }, "source": [ - "## πŸ€” ML Systems Thinking: Transformer Architecture\n", + "## πŸ€” ML Systems Thinking: Transformer Architecture Foundations\n", "\n", - "Now that you've built a complete transformer model, let's reflect on the systems implications and design decisions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f6986b9", - "metadata": { - "lines_to_next_cell": 0, - "nbgrader": { - "grade": false, - "grade_id": "systems-q1", - "solution": true - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "a465d45c", - "metadata": { - "cell_marker": "\"\"\"" - }, - "source": [ - "### Question 1: Attention Complexity Analysis\n", + "### Question 1: Attention Memory Complexity\n", "You implemented multi-head attention that computes attention matrices of size (batch, heads, seq_len, seq_len).\n", "\n", - "**a) Memory Scaling**: For GPT-4 scale (context length 8192, batch size 16, 96 attention heads):\n", - "- Attention matrix elements: _____ (calculate: 16 Γ— 96 Γ— 8192 Γ— 8192)\n", - "- Memory in GB (4 bytes/float): _____ GB per layer\n", - "- For 96 layers: _____ GB total just for attention matrices\n", + "For a model with seq_len=1024, batch_size=4, num_heads=8:\n", + "- How many elements in the attention matrix? _____\n", + "- If each element is 4 bytes (float32), how much memory per layer? _____ MB\n", + "- Why does doubling sequence length quadruple attention memory? 
_____\n", "\n", - "**b) Why Quadratic Matters**: If processing costs $0.01 per GB, what's the cost difference between:\n", - "- 1K context: $_____\n", - "- 8K context: $_____\n", - "- 32K context: $_____\n", + "### Question 2: Residual Connection Benefits\n", + "Your TransformerBlock uses residual connections (x + attention_output, x + mlp_output).\n", "\n", - "*Think about: Why long-context models are expensive, and why FlashAttention matters*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb3a7788", - "metadata": { - "lines_to_next_cell": 0, - "nbgrader": { - "grade": false, - "grade_id": "systems-q2", - "solution": true - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "a2f18da0", - "metadata": { - "cell_marker": "\"\"\"" - }, - "source": [ - "### Question 2: Parameter Distribution Analysis\n", - "Your GPT model has parameters in embeddings, transformer blocks, and the language head.\n", + "- What happens to gradients during backpropagation without residual connections? _____\n", + "- How do residual connections help train deeper networks? _____\n", + "- Why is pre-norm (LayerNorm before operations) preferred over post-norm? 
_____\n", "\n", - "**a) Parameter Breakdown**: For a model with vocab_size=50K, embed_dim=1024, num_layers=24:\n", - "- Token embedding: _____ parameters (vocab_size Γ— embed_dim)\n", - "- Each transformer block: approximately _____ parameters\n", - "- Language head: _____ parameters\n", - "- Total model: approximately _____ parameters\n", + "### Question 3: Parameter Scaling Analysis\n", + "Your GPT model combines embeddings, transformer blocks, and output projection.\n", "\n", - "**b) Memory During Training**: Training requires storing:\n", - "- Parameters (model weights)\n", - "- Gradients (same size as parameters)\n", - "- Optimizer states (2-3Γ— parameters for Adam)\n", - "- Activations (depends on batch size and sequence length)\n", + "For embed_dim=512, vocab_size=10000, num_layers=6:\n", + "- Token embedding parameters: _____ (vocab_size Γ— embed_dim)\n", + "- Approximate parameters per transformer block: _____ (hint: ~4 Γ— embed_dimΒ²)\n", + "- Total model parameters: approximately _____ million\n", "\n", - "For your calculated model size, estimate total training memory: _____ GB\n", + "### Question 4: Autoregressive Generation Efficiency\n", + "Your generate() method processes the full sequence for each new token.\n", "\n", - "*Consider: Why training large models requires hundreds of GPUs*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "259119cb", - "metadata": { - "lines_to_next_cell": 0, - "nbgrader": { - "grade": false, - "grade_id": "systems-q3", - "solution": true - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "680a951e", - "metadata": { - "cell_marker": "\"\"\"" - }, - "source": [ - "### Question 3: Autoregressive Generation Bottlenecks\n", - "Your generate() method runs the full model forward pass for each new token.\n", - "\n", - "**a) Generation Inefficiency**: To generate 100 tokens with a 24-layer model:\n", - "- Token 1: _____ layer computations (24 layers Γ— 1 position)\n", - "- 
Token 2: _____ layer computations (24 layers Γ— 2 positions)\n", - "- Token 100: _____ layer computations (24 layers Γ— 100 positions)\n", - "- Total: _____ layer computations\n", - "\n", - "**b) KV-Cache Optimization**: With KV-caching, each new token only needs:\n", - "- _____ layer computations (just the new position)\n", - "- This reduces computation by approximately _____Γ— for 100 tokens\n", - "\n", - "*Think about: Why inference optimization matters for production deployment*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e99d5ae3", - "metadata": { - "lines_to_next_cell": 0, - "nbgrader": { - "grade": false, - "grade_id": "systems-q4", - "solution": true - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "678beea4", - "metadata": { - "cell_marker": "\"\"\"" - }, - "source": [ - "### Question 4: Pre-norm vs Post-norm Architecture\n", - "You implemented pre-norm (LayerNorm before attention/MLP) rather than post-norm (LayerNorm after).\n", - "\n", - "**a) Training Stability**: Pre-norm helps with gradient flow because:\n", - "- Residual connections pass _____ gradients directly through the network\n", - "- LayerNorm before operations provides _____ input distributions\n", - "- This enables training _____ networks compared to post-norm\n", - "\n", - "**b) Performance Trade-offs**:\n", - "- Pre-norm: Better training stability, but slightly _____ final performance\n", - "- Post-norm: Better performance when it trains, but requires _____ learning rates\n", - "- Most modern large models use _____ because scale requires stability\n", - "\n", - "*Consider: Why architectural choices become more important at scale*" + "- Why is this inefficient for long sequences? _____\n", + "- What optimization caches key-value pairs to avoid recomputation? _____\n", + "- How would this change the computational complexity from O(nΒ²) to O(n)? 
_____" ] }, { "cell_type": "markdown", - "id": "4e8cc6dc", + "id": "13761f1f", "metadata": { "cell_marker": "\"\"\"" }, "source": [ "## 🎯 MODULE SUMMARY: Transformers\n", "\n", - "Congratulations! You've built the complete transformer architecture that powers modern language models!\n", + "Congratulations! You've built the complete transformer architecture that powers modern language models like GPT, Claude, and ChatGPT!\n", "\n", "### Key Accomplishments\n", - "- Built LayerNorm for stable training across deep networks\n", - "- Implemented MLP (feed-forward) networks with GELU activation\n", + "- Built LayerNorm for stable training across deep transformer networks\n", + "- Implemented MLP (feed-forward) networks with GELU activation and 4x expansion\n", "- Created complete TransformerBlock with self-attention, residual connections, and pre-norm architecture\n", "- Built full GPT model with embeddings, positional encoding, and autoregressive generation\n", - "- Analyzed parameter scaling and attention memory complexity\n", + "- Discovered attention memory scaling and parameter distribution patterns\n", "- All tests pass βœ… (validated by `test_module()`)\n", "\n", "### Ready for Next Steps\n", - "Your transformer implementation is the foundation for modern language models! This architecture enables:\n", - "- **Training**: Learn patterns from massive text datasets\n", - "- **Generation**: Produce coherent, contextual text\n", - "- **Transfer Learning**: Fine-tune for specific tasks\n", - "- **Scaling**: Grow to billions of parameters for emergent capabilities\n", + "Your transformer implementation is the capstone of the language modeling pipeline.\n", + "Export with: `tito module complete 13`\n", "\n", - "Export with: `tito module complete 13_transformers`\n", - "\n", - "**Next**: Module 14 will add KV-caching for efficient generation, optimizing the autoregressive inference you just implemented!" 
+ "**Next**: Module 14 will add profiling and optimization techniques to make your transformers production-ready!" ] } ], diff --git a/modules/source/13_transformers/transformers_dev.py b/modules/source/13_transformers/transformers_dev.py index c4ab25da..3028e009 100644 --- a/modules/source/13_transformers/transformers_dev.py +++ b/modules/source/13_transformers/transformers_dev.py @@ -40,8 +40,16 @@ By the end of this module, you will: Let's get started! """ +# %% #| default_exp models.transformer + +# %% #| export +import numpy as np +from tinytorch.core.tensor import Tensor +from tinytorch.core.layers import Linear +from tinytorch.core.attention import MultiHeadAttention +from tinytorch.core.activations import GELU # %% [markdown] """ diff --git a/tinytorch/_modidx.py b/tinytorch/_modidx.py index cbf6acd7..1d4c6a2f 100644 --- a/tinytorch/_modidx.py +++ b/tinytorch/_modidx.py @@ -61,6 +61,16 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/core/activations.py'), 'tinytorch.core.activations.Tanh.forward': ( '02_activations/activations_dev.html#tanh.forward', 'tinytorch/core/activations.py')}, + 'tinytorch.core.attention': { 'tinytorch.core.attention.MultiHeadAttention': ( '12_attention/attention_dev.html#multiheadattention', + 'tinytorch/core/attention.py'), + 'tinytorch.core.attention.MultiHeadAttention.__init__': ( '12_attention/attention_dev.html#multiheadattention.__init__', + 'tinytorch/core/attention.py'), + 'tinytorch.core.attention.MultiHeadAttention.forward': ( '12_attention/attention_dev.html#multiheadattention.forward', + 'tinytorch/core/attention.py'), + 'tinytorch.core.attention.MultiHeadAttention.parameters': ( '12_attention/attention_dev.html#multiheadattention.parameters', + 'tinytorch/core/attention.py'), + 'tinytorch.core.attention.scaled_dot_product_attention': ( '12_attention/attention_dev.html#scaled_dot_product_attention', + 'tinytorch/core/attention.py')}, 'tinytorch.core.autograd': {}, 'tinytorch.core.layers': { 
'tinytorch.core.layers.Dropout': ('03_layers/layers_dev.html#dropout', 'tinytorch/core/layers.py'), 'tinytorch.core.layers.Dropout.__call__': ( '03_layers/layers_dev.html#dropout.__call__', @@ -270,6 +280,72 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/data/loader.py'), 'tinytorch.data.loader.TensorDataset.__len__': ( '08_dataloader/dataloader_dev.html#tensordataset.__len__', 'tinytorch/data/loader.py')}, + 'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.GPT.forward': ( '13_transformers/transformers_dev.html#gpt.forward', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.GPT.generate': ( '13_transformers/transformers_dev.html#gpt.generate', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.GPT.parameters': ( '13_transformers/transformers_dev.html#gpt.parameters', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.LayerNorm.parameters': ( '13_transformers/transformers_dev.html#layernorm.parameters', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp', + 
'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.MLP.parameters': ( '13_transformers/transformers_dev.html#mlp.parameters', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward', + 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters', + 'tinytorch/models/transformer.py')}, + 'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.Embedding.forward': ( '11_embeddings/embeddings_dev.html#embedding.forward', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.Embedding.parameters': ( '11_embeddings/embeddings_dev.html#embedding.parameters', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer', + 'tinytorch/text/embeddings.py'), + 
'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.EmbeddingLayer.forward': ( '11_embeddings/embeddings_dev.html#embeddinglayer.forward', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.EmbeddingLayer.parameters': ( '11_embeddings/embeddings_dev.html#embeddinglayer.parameters', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.PositionalEncoding': ( '11_embeddings/embeddings_dev.html#positionalencoding', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.PositionalEncoding.forward': ( '11_embeddings/embeddings_dev.html#positionalencoding.forward', + 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.PositionalEncoding.parameters': ( '11_embeddings/embeddings_dev.html#positionalencoding.parameters', + 'tinytorch/text/embeddings.py')}, 'tinytorch.text.tokenization': { 'tinytorch.text.tokenization.BPETokenizer': ( '10_tokenization/tokenization_dev.html#bpetokenizer', 'tinytorch/text/tokenization.py'), 'tinytorch.text.tokenization.BPETokenizer.__init__': ( '10_tokenization/tokenization_dev.html#bpetokenizer.__init__', diff --git a/tinytorch/core/attention.py b/tinytorch/core/attention.py new file mode 100644 index 00000000..0f981a44 --- /dev/null +++ b/tinytorch/core/attention.py @@ -0,0 +1,291 @@ +# ╔═══════════════════════════════════════════════════════════════════════════════╗ +# β•‘ 🚨 CRITICAL WARNING 🚨 β•‘ +# β•‘ 
AUTOGENERATED! DO NOT EDIT! β•‘ +# β•‘ β•‘ +# β•‘ This file is AUTOMATICALLY GENERATED from source modules. β•‘ +# β•‘ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! β•‘ +# β•‘ β•‘ +# β•‘ βœ… TO EDIT: modules/source/07_attention/attention_dev.py β•‘ +# β•‘ βœ… TO EXPORT: Run 'tito module complete ' β•‘ +# β•‘ β•‘ +# β•‘ πŸ›‘οΈ STUDENT PROTECTION: This file contains optimized implementations. β•‘ +# β•‘ Editing it directly may break module functionality and training. β•‘ +# β•‘ β•‘ +# β•‘ πŸŽ“ LEARNING TIP: Work in modules/source/ - that's where real development β•‘ +# β•‘ happens! The tinytorch/ directory is just the compiled output. β•‘ +# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β• +# %% auto 0 +__all__ = ['scaled_dot_product_attention', 'MultiHeadAttention'] + +# %% ../../modules/source/12_attention/attention_dev.ipynb 0 +#| default_exp core.attention +#| export + +# %% ../../modules/source/12_attention/attention_dev.ipynb 2 +import numpy as np +import math +import time +from typing import Optional, Tuple, List + +# Import dependencies from previous modules - following TinyTorch dependency chain +from .tensor import Tensor +from .layers import Linear + +# %% ../../modules/source/12_attention/attention_dev.ipynb 6 +def scaled_dot_product_attention(Q: Tensor, K: Tensor, V: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: + """ + Compute scaled dot-product attention. + + This is the fundamental attention operation that powers all transformer models. + We'll implement it with explicit loops first to show the O(nΒ²) complexity. + + TODO: Implement scaled dot-product attention step by step + + APPROACH: + 1. Extract dimensions and validate inputs + 2. Compute attention scores with explicit nested loops (show O(nΒ²) complexity) + 3. 
Scale by 1/√d_k for numerical stability + 4. Apply causal mask if provided (set masked positions to -inf) + 5. Apply softmax to get attention weights + 6. Apply values with attention weights (another O(nΒ²) operation) + 7. Return output and attention weights + + Args: + Q: Query tensor of shape (batch_size, seq_len, d_model) + K: Key tensor of shape (batch_size, seq_len, d_model) + V: Value tensor of shape (batch_size, seq_len, d_model) + mask: Optional causal mask, True=allow, False=mask (batch_size, seq_len, seq_len) + + Returns: + output: Attended values (batch_size, seq_len, d_model) + attention_weights: Attention matrix (batch_size, seq_len, seq_len) + + EXAMPLE: + >>> Q = Tensor(np.random.randn(2, 4, 64)) # batch=2, seq=4, dim=64 + >>> K = Tensor(np.random.randn(2, 4, 64)) + >>> V = Tensor(np.random.randn(2, 4, 64)) + >>> output, weights = scaled_dot_product_attention(Q, K, V) + >>> print(output.shape) # (2, 4, 64) + >>> print(weights.shape) # (2, 4, 4) + >>> print(weights.data[0].sum(axis=1)) # Each row sums to ~1.0 + + HINTS: + - Use explicit nested loops to compute Q[i] @ K[j] for educational purposes + - Scale factor is 1/√d_k where d_k is the last dimension of Q + - Masked positions should be set to -1e9 before softmax + - Remember that softmax normalizes along the last dimension + """ + ### BEGIN SOLUTION + # Step 1: Extract dimensions and validate + batch_size, seq_len, d_model = Q.shape + assert K.shape == (batch_size, seq_len, d_model), f"K shape {K.shape} doesn't match Q shape {Q.shape}" + assert V.shape == (batch_size, seq_len, d_model), f"V shape {V.shape} doesn't match Q shape {Q.shape}" + + # Step 2: Compute attention scores with explicit loops (educational O(nΒ²) demonstration) + scores = np.zeros((batch_size, seq_len, seq_len)) + + # Show the quadratic complexity explicitly + for b in range(batch_size): # For each batch + for i in range(seq_len): # For each query position + for j in range(seq_len): # Attend to each key position + # Compute dot 
product between query i and key j + score = 0.0 + for d in range(d_model): # Dot product across embedding dimension + score += Q.data[b, i, d] * K.data[b, j, d] + scores[b, i, j] = score + + # Step 3: Scale by 1/√d_k for numerical stability + scale_factor = 1.0 / math.sqrt(d_model) + scores = scores * scale_factor + + # Step 4: Apply causal mask if provided + if mask is not None: + # mask[i,j] = False means position j should not attend to position i + mask_value = -1e9 # Large negative value becomes 0 after softmax + for b in range(batch_size): + for i in range(seq_len): + for j in range(seq_len): + if not mask.data[b, i, j]: # If mask is False, block attention + scores[b, i, j] = mask_value + + # Step 5: Apply softmax to get attention weights (probability distribution) + attention_weights = np.zeros_like(scores) + for b in range(batch_size): + for i in range(seq_len): + # Softmax over the j dimension (what this query attends to) + row = scores[b, i, :] + max_val = np.max(row) # Numerical stability + exp_row = np.exp(row - max_val) + sum_exp = np.sum(exp_row) + attention_weights[b, i, :] = exp_row / sum_exp + + # Step 6: Apply attention weights to values (another O(nΒ²) operation) + output = np.zeros((batch_size, seq_len, d_model)) + + # Again, show the quadratic complexity + for b in range(batch_size): # For each batch + for i in range(seq_len): # For each output position + for j in range(seq_len): # Weighted sum over all value positions + weight = attention_weights[b, i, j] + for d in range(d_model): # Accumulate across embedding dimension + output[b, i, d] += weight * V.data[b, j, d] + + return Tensor(output), Tensor(attention_weights) + ### END SOLUTION + +# %% ../../modules/source/12_attention/attention_dev.ipynb 10 +class MultiHeadAttention: + """ + Multi-head attention mechanism. + + Runs multiple attention heads in parallel, each learning different relationships. + This is the core component of transformer architectures. 
+ """ + + def __init__(self, embed_dim: int, num_heads: int): + """ + Initialize multi-head attention. + + TODO: Set up linear projections and validate configuration + + APPROACH: + 1. Validate that embed_dim is divisible by num_heads + 2. Calculate head_dim (embed_dim // num_heads) + 3. Create linear layers for Q, K, V projections + 4. Create output projection layer + 5. Store configuration parameters + + Args: + embed_dim: Embedding dimension (d_model) + num_heads: Number of parallel attention heads + + EXAMPLE: + >>> mha = MultiHeadAttention(embed_dim=512, num_heads=8) + >>> mha.head_dim # 64 (512 / 8) + >>> len(mha.parameters()) # 4 linear layers * 2 params each = 8 tensors + + HINTS: + - head_dim = embed_dim // num_heads must be integer + - Need 4 Linear layers: q_proj, k_proj, v_proj, out_proj + - Each projection maps embed_dim β†’ embed_dim + """ + ### BEGIN SOLUTION + assert embed_dim % num_heads == 0, f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})" + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + # Linear projections for queries, keys, values + self.q_proj = Linear(embed_dim, embed_dim) + self.k_proj = Linear(embed_dim, embed_dim) + self.v_proj = Linear(embed_dim, embed_dim) + + # Output projection to mix information across heads + self.out_proj = Linear(embed_dim, embed_dim) + ### END SOLUTION + + def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor: + """ + Forward pass through multi-head attention. + + TODO: Implement the complete multi-head attention forward pass + + APPROACH: + 1. Extract input dimensions (batch_size, seq_len, embed_dim) + 2. Project input to Q, K, V using linear layers + 3. Reshape projections to separate heads: (batch, seq, heads, head_dim) + 4. Transpose to (batch, heads, seq, head_dim) for parallel processing + 5. Apply scaled dot-product attention to each head + 6. Transpose back and reshape to merge heads + 7. 
Apply output projection + + Args: + x: Input tensor (batch_size, seq_len, embed_dim) + mask: Optional attention mask (batch_size, seq_len, seq_len) + + Returns: + output: Attended representation (batch_size, seq_len, embed_dim) + + EXAMPLE: + >>> mha = MultiHeadAttention(embed_dim=64, num_heads=8) + >>> x = Tensor(np.random.randn(2, 10, 64)) # batch=2, seq=10, dim=64 + >>> output = mha.forward(x) + >>> print(output.shape) # (2, 10, 64) - same as input + + HINTS: + - Reshape: (batch, seq, embed_dim) β†’ (batch, seq, heads, head_dim) + - Transpose: (batch, seq, heads, head_dim) β†’ (batch, heads, seq, head_dim) + - After attention: reverse the process to merge heads + - Use scaled_dot_product_attention for each head + """ + ### BEGIN SOLUTION + # Step 1: Extract dimensions + batch_size, seq_len, embed_dim = x.shape + assert embed_dim == self.embed_dim, f"Input dim {embed_dim} doesn't match expected {self.embed_dim}" + + # Step 2: Project to Q, K, V + Q = self.q_proj.forward(x) # (batch, seq, embed_dim) + K = self.k_proj.forward(x) + V = self.v_proj.forward(x) + + # Step 3: Reshape to separate heads + # From (batch, seq, embed_dim) to (batch, seq, num_heads, head_dim) + Q_heads = Q.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim) + K_heads = K.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim) + V_heads = V.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim) + + # Step 4: Transpose to (batch, num_heads, seq, head_dim) for parallel processing + Q_heads = np.transpose(Q_heads, (0, 2, 1, 3)) + K_heads = np.transpose(K_heads, (0, 2, 1, 3)) + V_heads = np.transpose(V_heads, (0, 2, 1, 3)) + + # Step 5: Apply attention to each head + head_outputs = [] + for h in range(self.num_heads): + # Extract this head's Q, K, V + Q_h = Tensor(Q_heads[:, h, :, :]) # (batch, seq, head_dim) + K_h = Tensor(K_heads[:, h, :, :]) + V_h = Tensor(V_heads[:, h, :, :]) + + # Apply attention for this head + head_out, _ = scaled_dot_product_attention(Q_h, 
K_h, V_h, mask) + head_outputs.append(head_out.data) + + # Step 6: Concatenate heads back together + # Stack: list of (batch, seq, head_dim) β†’ (batch, num_heads, seq, head_dim) + concat_heads = np.stack(head_outputs, axis=1) + + # Transpose back: (batch, num_heads, seq, head_dim) β†’ (batch, seq, num_heads, head_dim) + concat_heads = np.transpose(concat_heads, (0, 2, 1, 3)) + + # Reshape: (batch, seq, num_heads, head_dim) β†’ (batch, seq, embed_dim) + concat_output = concat_heads.reshape(batch_size, seq_len, self.embed_dim) + + # Step 7: Apply output projection + output = self.out_proj.forward(Tensor(concat_output)) + + return output + ### END SOLUTION + + def parameters(self) -> List[Tensor]: + """ + Return all trainable parameters. + + TODO: Collect parameters from all linear layers + + APPROACH: + 1. Get parameters from q_proj, k_proj, v_proj, out_proj + 2. Combine into single list + + Returns: + List of all parameter tensors + """ + ### BEGIN SOLUTION + params = [] + params.extend(self.q_proj.parameters()) + params.extend(self.k_proj.parameters()) + params.extend(self.v_proj.parameters()) + params.extend(self.out_proj.parameters()) + return params + ### END SOLUTION diff --git a/tinytorch/models/transformer.py b/tinytorch/models/transformer.py new file mode 100644 index 00000000..e96fdb14 --- /dev/null +++ b/tinytorch/models/transformer.py @@ -0,0 +1,462 @@ +# ╔═══════════════════════════════════════════════════════════════════════════════╗ +# β•‘ 🚨 CRITICAL WARNING 🚨 β•‘ +# β•‘ AUTOGENERATED! DO NOT EDIT! β•‘ +# β•‘ β•‘ +# β•‘ This file is AUTOMATICALLY GENERATED from source modules. β•‘ +# β•‘ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! β•‘ +# β•‘ β•‘ +# β•‘ βœ… TO EDIT: modules/source/XX_transformer/transformer_dev.py β•‘ +# β•‘ βœ… TO EXPORT: Run 'tito module complete ' β•‘ +# β•‘ β•‘ +# β•‘ πŸ›‘οΈ STUDENT PROTECTION: This file contains optimized implementations. 
class LayerNorm:
    """
    Layer Normalization for transformer blocks.

    Normalizes each sample across its feature (last) axis, then applies a
    learnable scale (gamma) and shift (beta). Unlike batch normalization,
    statistics never mix information across the batch dimension.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        """
        Create the learnable affine parameters.

        Args:
            normalized_shape: Feature shape to normalize over (usually embed_dim).
            eps: Small constant added to the variance for numerical stability.
        """
        ### BEGIN SOLUTION
        self.normalized_shape = normalized_shape
        self.eps = eps

        # gamma starts as the identity scale, beta as a zero shift, so the
        # layer is initially a pure normalization.
        self.gamma = Tensor(np.ones(normalized_shape))
        self.beta = Tensor(np.zeros(normalized_shape))
        ### END SOLUTION

    def forward(self, x):
        """
        Apply y = (x - mean) / sqrt(var + eps) * gamma + beta over the last axis.

        Args:
            x: Tensor whose last dimension matches `normalized_shape`.

        Returns:
            Tensor of the same shape as `x`.
        """
        ### BEGIN SOLUTION
        data = x.data

        # Per-sample statistics over the feature axis; keepdims so the
        # subtraction/division broadcast back over that axis.
        mu = data.mean(axis=-1, keepdims=True)
        var = ((data - mu) ** 2).mean(axis=-1, keepdims=True)

        normalized = Tensor((data - mu) / np.sqrt(var + self.eps))

        # Learnable affine transform (Tensor ops so gamma/beta broadcast).
        return normalized * self.gamma + self.beta
        ### END SOLUTION

    def parameters(self):
        """Return learnable parameters."""
        return [self.gamma, self.beta]
class MLP:
    """
    Multi-Layer Perceptron (feed-forward network) for transformer blocks.

    Standard pattern: Linear -> GELU -> Linear with a 4:1 expansion ratio.
    This provides the per-position non-linear transformation in each
    transformer block.
    """

    def __init__(self, embed_dim, hidden_dim=None, dropout_prob=0.1):
        """
        Initialize the two linear layers and the activation.

        Args:
            embed_dim: Input/output dimension.
            hidden_dim: Inner dimension; defaults to 4 * embed_dim (the
                standard transformer expansion).
            dropout_prob: Accepted for API compatibility; dropout is not
                applied anywhere in this implementation.
        """
        ### BEGIN SOLUTION
        if hidden_dim is None:
            hidden_dim = 4 * embed_dim  # Standard 4x expansion

        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim

        # Two-layer feed-forward network: expand, activate, project back.
        self.linear1 = Linear(embed_dim, hidden_dim)
        self.linear2 = Linear(hidden_dim, embed_dim)

        # BUG FIX: forward() previously called an undefined lowercase
        # `gelu(...)` function — only the GELU *class* is imported from
        # ..core.activations — so every forward pass raised NameError.
        # Instantiate the activation once here instead.
        # NOTE(review): assumes GELU follows the module convention of
        # exposing .forward() like Linear does — confirm against
        # core.activations.
        self.activation = GELU()
        ### END SOLUTION

    def forward(self, x):
        """
        Compute linear1 -> GELU -> linear2.

        Args:
            x: Tensor of shape (..., embed_dim).

        Returns:
            Tensor of shape (..., embed_dim).
        """
        ### BEGIN SOLUTION
        # Expand to the hidden dimension.
        hidden = self.linear1.forward(x)

        # Smooth non-linearity (preferred over ReLU in transformers).
        hidden = self.activation.forward(hidden)

        # Project back to the model dimension.
        return self.linear2.forward(hidden)
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of both linear layers."""
        params = []
        params.extend(self.linear1.parameters())
        params.extend(self.linear2.parameters())
        return params
class TransformerBlock:
    """
    Complete transformer block: pre-norm self-attention and MLP sub-layers,
    each wrapped in a residual connection.

    Architecture (pre-norm):
        x -> LayerNorm -> MultiHeadAttention -> (+ x)
          -> LayerNorm -> MLP                -> (+ previous sum) -> output

    The residual connections let gradients flow directly through the block,
    which is what makes deep stacks trainable.
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4, dropout_prob=0.1):
        """
        Set up attention, normalization and MLP components.

        Args:
            embed_dim: Model dimension.
            num_heads: Attention heads (must divide embed_dim).
            mlp_ratio: Expansion factor for the MLP hidden layer.
            dropout_prob: Accepted for API compatibility; no dropout is applied.
        """
        ### BEGIN SOLUTION
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # Sequence mixing: multi-head self-attention.
        self.attention = MultiHeadAttention(embed_dim, num_heads)

        # Pre-norm architecture: normalize *before* each sub-layer.
        self.ln1 = LayerNorm(embed_dim)  # Before attention
        self.ln2 = LayerNorm(embed_dim)  # Before MLP

        # Per-position feed-forward network.
        hidden_dim = int(embed_dim * mlp_ratio)
        self.mlp = MLP(embed_dim, hidden_dim)
        ### END SOLUTION

    def forward(self, x, mask=None):
        """
        Run the block on a (batch, seq, embed) input.

        Args:
            x: Input tensor (batch_size, seq_len, embed_dim).
            mask: Optional attention mask, forwarded to the attention layer.

        Returns:
            Tensor of the same shape as `x`.
        """
        ### BEGIN SOLUTION
        # Sub-layer 1: pre-norm self-attention + residual connection.
        normed1 = self.ln1.forward(x)

        # BUG FIX: MultiHeadAttention.forward has signature (x, mask) and
        # derives Q, K, V internally from its single input. The previous call
        # passed (normed1, normed1, normed1, mask) — four arguments — which
        # raised TypeError on every forward pass.
        attention_out = self.attention.forward(normed1, mask)
        x = x + attention_out

        # Sub-layer 2: pre-norm MLP + residual connection.
        normed2 = self.ln2.forward(x)
        mlp_out = self.mlp.forward(normed2)
        output = x + mlp_out

        return output
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of attention, norms and MLP."""
        params = []
        params.extend(self.attention.parameters())
        params.extend(self.ln1.parameters())
        params.extend(self.ln2.parameters())
        params.extend(self.mlp.parameters())
        return params
class GPT:
    """
    Complete GPT (Generative Pre-trained Transformer) model.

    Token + learned positional embeddings, a stack of pre-norm transformer
    blocks, a final LayerNorm, and a linear language-modeling head that
    projects to vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, max_seq_len=1024):
        """
        Build all model components.

        Args:
            vocab_size: Number of distinct tokens.
            embed_dim: Embedding / model dimension.
            num_layers: Number of stacked TransformerBlocks.
            num_heads: Attention heads per block.
            max_seq_len: Maximum supported sequence length (size of the
                learned positional-embedding table).
        """
        ### BEGIN SOLUTION
        # BUG FIX: `Embedding` is used below but is absent from this module's
        # imports (only Tensor, Linear, MultiHeadAttention, GELU are imported),
        # so construction raised NameError. Import it locally here.
        from ..text.embeddings import Embedding

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # Token and (learned, not sinusoidal) positional embeddings.
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.position_embedding = Embedding(max_seq_len, embed_dim)

        # The main computation: a stack of transformer blocks.
        self.blocks = [TransformerBlock(embed_dim, num_heads) for _ in range(num_layers)]

        # Final layer norm stabilizes the representation before the head.
        self.ln_f = LayerNorm(embed_dim)

        # Language modeling head (no bias, projects to vocabulary logits).
        self.lm_head = Linear(embed_dim, vocab_size, bias=False)
        ### END SOLUTION

    def forward(self, tokens):
        """
        Compute next-token logits for a batch of token sequences.

        Args:
            tokens: Integer Tensor of shape (batch_size, seq_len).

        Returns:
            Logits Tensor of shape (batch_size, seq_len, vocab_size).
        """
        ### BEGIN SOLUTION
        batch_size, seq_len = tokens.shape

        # Token embeddings: (batch, seq, embed).
        token_emb = self.token_embedding.forward(tokens)

        # Positional embeddings for positions 0..seq_len-1; leading axis of 1
        # broadcasts across the batch in the addition below.
        positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
        pos_emb = self.position_embedding.forward(positions)

        x = token_emb + pos_emb

        # BUG FIX: scaled_dot_product_attention expects a *boolean* mask of
        # shape (batch, seq, seq) with True = "may attend" and indexes it as
        # mask.data[b, i, j]. The old helper returned an additive 2-D float
        # mask with -inf entries, which both broke the 3-axis indexing and
        # inverted the allow/block semantics (`not -inf` is False, `not 0.0`
        # is True). Build the boolean mask and tile it over the batch.
        base = self._create_causal_mask(seq_len)
        mask = Tensor(np.broadcast_to(base.data, (batch_size, seq_len, seq_len)).copy())

        # Transformer stack.
        for block in self.blocks:
            x = block.forward(x, mask)

        # Final norm, then project to vocabulary logits.
        x = self.ln_f.forward(x)
        logits = self.lm_head.forward(x)

        return logits
        ### END SOLUTION

    def _create_causal_mask(self, seq_len):
        """Boolean (seq_len, seq_len) mask: True where j <= i (past/self only)."""
        ### BEGIN SOLUTION
        # Lower-triangular True = each position may attend to itself and to
        # earlier positions, never to the future.
        return Tensor(np.tril(np.ones((seq_len, seq_len), dtype=bool)))
        ### END SOLUTION

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """
        Generate tokens autoregressively.

        Args:
            prompt_tokens: Integer Tensor of shape (batch_size, seq_len).
            max_new_tokens: Number of tokens to append.
            temperature: Softmax temperature (>1 flattens, <1 sharpens).

        Returns:
            Tensor of shape (batch_size, seq_len + max_new_tokens).

        NOTE: sampling reads probs[0] only, so generation assumes
        batch_size == 1 (same as the original implementation).
        """
        ### BEGIN SOLUTION
        current_tokens = Tensor(prompt_tokens.data.copy())

        for _ in range(max_new_tokens):
            # ROBUSTNESS FIX: crop the context to the positional-table
            # capacity, otherwise long generations index positional
            # embeddings past max_seq_len and crash.
            ctx = current_tokens.data
            if ctx.shape[1] > self.max_seq_len:
                ctx = ctx[:, -self.max_seq_len:]

            # Logits for the last position = next-token prediction.
            logits = self.forward(Tensor(ctx))
            last_logits = logits.data[:, -1, :]  # (batch_size, vocab_size)

            # Temperature-scaled softmax (max-subtraction for stability).
            scaled = last_logits / temperature
            exp_logits = np.exp(scaled - np.max(scaled, axis=-1, keepdims=True))
            probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

            # Sample the next token (batch row 0 only, see docstring note).
            next_token = np.array([[np.random.choice(self.vocab_size, p=probs[0])]])

            # Append to the running sequence.
            current_tokens = Tensor(np.concatenate([current_tokens.data, next_token], axis=1))

        return current_tokens
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of the full model."""
        params = []
        params.extend(self.token_embedding.parameters())
        params.extend(self.position_embedding.parameters())

        for block in self.blocks:
            params.extend(block.parameters())

        params.extend(self.ln_f.parameters())
        params.extend(self.lm_head.parameters())

        return params
β•‘ +# β•‘ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! β•‘ +# β•‘ β•‘ +# β•‘ βœ… TO EDIT: modules/source/XX_embeddings/embeddings_dev.py β•‘ +# β•‘ βœ… TO EXPORT: Run 'tito module complete ' β•‘ +# β•‘ β•‘ +# β•‘ πŸ›‘οΈ STUDENT PROTECTION: This file contains optimized implementations. β•‘ +# β•‘ Editing it directly may break module functionality and training. β•‘ +# β•‘ β•‘ +# β•‘ πŸŽ“ LEARNING TIP: Work in modules/source/ - that's where real development β•‘ +# β•‘ happens! The tinytorch/ directory is just the compiled output. β•‘ +# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β• +# %% auto 0 +__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer'] + +# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 2 +import numpy as np +import math +from typing import List, Optional, Tuple + +# Import from previous modules - following dependency chain +from ..core.tensor import Tensor + +# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 6 +class Embedding: + """ + Learnable embedding layer that maps token indices to dense vectors. + + This is the fundamental building block for converting discrete tokens + into continuous representations that neural networks can process. + + TODO: Implement the Embedding class + + APPROACH: + 1. Initialize embedding matrix with random weights (vocab_size, embed_dim) + 2. Implement forward pass as matrix lookup using numpy indexing + 3. Handle batch dimensions correctly + 4. 
class Embedding:
    """
    Learnable embedding table mapping token indices to dense vectors.

    The fundamental building block for converting discrete tokens into the
    continuous representations a neural network can process. Lookup is plain
    numpy advanced indexing — equivalent to one-hot matrix multiplication but
    far cheaper.
    """

    ### BEGIN SOLUTION
    def __init__(self, vocab_size: int, embed_dim: int):
        """
        Allocate the (vocab_size, embed_dim) weight table.

        Args:
            vocab_size: Number of unique tokens.
            embed_dim: Dimension of each embedding vector.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Xavier/Glorot uniform initialization for stable gradient flow.
        bound = math.sqrt(6.0 / (vocab_size + embed_dim))
        self.weight = Tensor(
            np.random.uniform(-bound, bound, (vocab_size, embed_dim)),
            requires_grad=True,
        )

    def forward(self, indices: Tensor) -> Tensor:
        """
        Look up embeddings for the given token indices.

        Args:
            indices: Token indices of shape (batch_size, seq_len) or (seq_len,).

        Returns:
            Tensor of shape (*indices.shape, embed_dim).

        Raises:
            ValueError: if any index falls outside [0, vocab_size).
        """
        idx = indices.data
        if np.any(idx >= self.vocab_size) or np.any(idx < 0):
            raise ValueError(
                f"Index out of range. Expected 0 <= indices < {self.vocab_size}, "
                f"got min={np.min(idx)}, max={np.max(idx)}"
            )

        # Advanced indexing performs the whole lookup in one C-level pass.
        return Tensor(self.weight.data[idx.astype(int)])

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.weight]

    def __repr__(self):
        return f"Embedding(vocab_size={self.vocab_size}, embed_dim={self.embed_dim})"
    ### END SOLUTION
+ + Args: + max_seq_len: Maximum sequence length to support + embed_dim: Embedding dimension (must match token embeddings) + """ + self.max_seq_len = max_seq_len + self.embed_dim = embed_dim + + # Initialize position embedding matrix + # Smaller initialization than token embeddings since these are additive + limit = math.sqrt(2.0 / embed_dim) + self.position_embeddings = Tensor( + np.random.uniform(-limit, limit, (max_seq_len, embed_dim)), + requires_grad=True + ) + + def forward(self, x: Tensor) -> Tensor: + """ + Add positional encodings to input embeddings. + + Args: + x: Input embeddings of shape (batch_size, seq_len, embed_dim) + + Returns: + Position-encoded embeddings of same shape + """ + if len(x.shape) != 3: + raise ValueError(f"Expected 3D input (batch, seq, embed), got shape {x.shape}") + + batch_size, seq_len, embed_dim = x.shape + + if seq_len > self.max_seq_len: + raise ValueError( + f"Sequence length {seq_len} exceeds maximum {self.max_seq_len}" + ) + + if embed_dim != self.embed_dim: + raise ValueError( + f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}" + ) + + # Get position embeddings for this sequence length + pos_embeddings = self.position_embeddings.data[:seq_len] # (seq_len, embed_dim) + + # Broadcast to match batch dimension: (1, seq_len, embed_dim) + pos_embeddings = pos_embeddings[np.newaxis, :, :] + + # Add positional information to input embeddings + result = x.data + pos_embeddings + + return Tensor(result) + + def parameters(self) -> List[Tensor]: + """Return trainable parameters.""" + return [self.position_embeddings] + + def __repr__(self): + return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})" + ### END SOLUTION + +# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 18 +class EmbeddingLayer: + """ + Complete embedding system combining token and positional embeddings. 
class EmbeddingLayer:
    """
    Complete embedding system: token embeddings plus positional encoding.

    Wraps the full embedding pipeline used by transformers: token lookup,
    optional sqrt(embed_dim) scaling, and one of three positional schemes
    ('learned', 'sinusoidal', or None). Accepts 1-D (seq,) or 2-D
    (batch, seq) token inputs.
    """

    ### BEGIN SOLUTION
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        max_seq_len: int = 512,
        pos_encoding: str = 'learned',
        scale_embeddings: bool = False
    ):
        """
        Build the token embedding and the requested positional encoding.

        Args:
            vocab_size: Size of vocabulary.
            embed_dim: Embedding dimension.
            max_seq_len: Maximum sequence length for positional encoding.
            pos_encoding: 'learned', 'sinusoidal', or None.
            scale_embeddings: Scale token embeddings by sqrt(embed_dim)
                (original Transformer convention).

        Raises:
            ValueError: for an unrecognized pos_encoding value.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.pos_encoding_type = pos_encoding
        self.scale_embeddings = scale_embeddings

        self.token_embedding = Embedding(vocab_size, embed_dim)

        # Positional scheme: trainable table, fixed sinusoids, or nothing.
        if pos_encoding == 'learned':
            self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
        elif pos_encoding == 'sinusoidal':
            # Fixed sinusoidal table (no trainable parameters).
            self.pos_encoding = create_sinusoidal_embeddings(max_seq_len, embed_dim)
        elif pos_encoding is None:
            self.pos_encoding = None
        else:
            raise ValueError(f"Unknown pos_encoding: {pos_encoding}. Use 'learned', 'sinusoidal', or None")

    def forward(self, tokens: Tensor) -> Tensor:
        """
        Embed tokens and add positional information.

        Args:
            tokens: Token indices of shape (batch_size, seq_len) or (seq_len,).

        Returns:
            Position-aware embeddings; a 1-D input comes back as
            (seq_len, embed_dim), a 2-D input as (batch, seq_len, embed_dim).
        """
        # Promote a 1-D sequence to a singleton batch, remembering to undo it.
        added_batch_axis = len(tokens.shape) == 1
        if added_batch_axis:
            tokens = Tensor(tokens.data[np.newaxis, :])

        embedded = self.token_embedding.forward(tokens)  # (batch, seq, embed)

        # Optional Transformer-convention scaling.
        if self.scale_embeddings:
            embedded = Tensor(embedded.data * math.sqrt(self.embed_dim))

        # Inject positional information according to the configured scheme.
        if self.pos_encoding_type == 'learned':
            result = self.pos_encoding.forward(embedded)
        elif self.pos_encoding_type == 'sinusoidal':
            n_seq = embedded.shape[1]
            # Slice the fixed table and broadcast it over the batch axis.
            table = self.pos_encoding.data[:n_seq][np.newaxis, :, :]
            result = Tensor(embedded.data + table)
        else:
            result = embedded

        if added_batch_axis:
            result = Tensor(result.data[0])  # back to (seq_len, embed_dim)

        return result

    def parameters(self) -> List[Tensor]:
        """Return all trainable parameters (token table, plus positional table if learned)."""
        params = self.token_embedding.parameters()

        if self.pos_encoding_type == 'learned':
            params.extend(self.pos_encoding.parameters())

        return params

    def __repr__(self):
        return (f"EmbeddingLayer(vocab_size={self.vocab_size}, "
                f"embed_dim={self.embed_dim}, "
                f"pos_encoding='{self.pos_encoding_type}')")
    ### END SOLUTION