Files
TinyTorch/tinytorch/models/transformer.py
Vijay Janapa Reddi 8025c66a4b fix(module-13): Rewrite LayerNorm to use Tensor operations
- Change from .data extraction to Tensor arithmetic (x - mean, diff * diff, x / std)
- Preserve computation graph through normalization
- std tensor now preserves requires_grad correctly

LayerNorm is used before and after attention in transformer blocks
2025-10-27 20:30:21 -04:00

467 lines
17 KiB
Python
Generated
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 2
import numpy as np
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.attention import MultiHeadAttention
from ..core.activations import GELU
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
class LayerNorm:
    """
    Layer normalization over the last (feature) axis.

    Every position is normalized with the mean and variance of its own
    feature vector (unlike batch normalization, which uses statistics taken
    across the batch), then rescaled by ``gamma`` and shifted by ``beta``.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        """
        Create the scale and shift parameters.

        Args:
            normalized_shape: Shape handed to ``np.ones`` / ``np.zeros`` to
                build ``gamma`` and ``beta`` (usually the embedding size).
            eps: Constant added to the variance before taking the square
                root, so a zero variance never causes a division by zero.

        EXAMPLE:
        >>> ln = LayerNorm(512)                       # 512-dimensional features
        >>> x = Tensor(np.random.randn(2, 10, 512))   # (batch, seq, features)
        >>> normalized = ln.forward(x)
        """
        self.normalized_shape = normalized_shape
        self.eps = eps

        # gamma = 1 and beta = 0 make the affine step an identity at start.
        self.gamma = Tensor(np.ones(normalized_shape))   # scale
        self.beta = Tensor(np.zeros(normalized_shape))   # shift

    def forward(self, x):
        """
        Normalize ``x`` along its last axis and apply the affine transform.

        Formula:
            y = (x - mean) / sqrt(var + eps) * gamma + beta

        Args:
            x: Tensor supporting ``mean(axis=..., keepdims=...)`` and
                element-wise arithmetic; the last axis holds the features.

        Returns:
            Tensor produced by the arithmetic above (same shape as ``x`` under
            normal broadcasting).
        """
        # keepdims=True keeps a trailing axis of size 1 so the statistics
        # broadcast against x.
        mean = x.mean(axis=-1, keepdims=True)

        # The centered values are needed for both the variance and the
        # normalized output, so compute them once with Tensor arithmetic.
        centered = x - mean
        variance = (centered * centered).mean(axis=-1, keepdims=True)

        # NOTE(review): std is rebuilt from the raw ``variance.data`` array,
        # so this new Tensor only inherits the requires_grad flag; it
        # presumably has no backward link to ``variance``. Confirm against
        # the autograd implementation before relying on exact gradients.
        std = Tensor(
            np.sqrt(variance.data + self.eps),
            requires_grad=variance.requires_grad,
        )
        normalized = centered / std

        # Learnable affine transform.
        return normalized * self.gamma + self.beta

    def parameters(self):
        """Return the learnable parameters as ``[gamma, beta]``."""
        return [self.gamma, self.beta]
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 13
class MLP:
    """
    Feed-forward sub-network of a transformer block.

    Pipeline: Linear (expand) -> GELU -> Linear (project back). When no hidden
    size is given, the hidden layer is four times the embedding size.
    """

    def __init__(self, embed_dim, hidden_dim=None, dropout_prob=0.1):
        """
        Build the two linear layers and the activation.

        Args:
            embed_dim: Input and output feature size.
            hidden_dim: Width of the inner layer; ``None`` selects
                ``4 * embed_dim``.
            dropout_prob: Accepted as part of the signature; this
                implementation does not use it.

        EXAMPLE:
        >>> mlp = MLP(512)                            # 512 -> 2048 -> 512
        >>> x = Tensor(np.random.randn(2, 10, 512))
        >>> output = mlp.forward(x)
        """
        self.embed_dim = embed_dim
        # Fall back to the conventional 4x expansion.
        self.hidden_dim = 4 * embed_dim if hidden_dim is None else hidden_dim

        self.linear1 = Linear(self.embed_dim, self.hidden_dim)   # expand
        self.gelu = GELU()
        self.linear2 = Linear(self.hidden_dim, self.embed_dim)   # project back

    def forward(self, x):
        """
        Run ``x`` through Linear -> GELU -> Linear.

        Args:
            x: Input accepted by ``linear1.forward``.

        Returns:
            The result of ``linear2.forward`` applied to the activated
            hidden representation.
        """
        expanded = self.linear1.forward(x)
        activated = self.gelu.forward(expanded)
        return self.linear2.forward(activated)

    def parameters(self):
        """Return the parameters of ``linear1`` followed by those of ``linear2``."""
        return [*self.linear1.parameters(), *self.linear2.parameters()]
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 17
class TransformerBlock:
    """
    One pre-norm transformer block.

    Two residual sub-layers are applied in sequence:
        x = x + attention(ln1(x), mask)
        y = x + mlp(ln2(x))
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4, dropout_prob=0.1):
        """
        Build the attention layer, both layer norms and the MLP.

        Args:
            embed_dim: Feature size used by every sub-layer.
            num_heads: Number of heads passed to ``MultiHeadAttention``.
            mlp_ratio: The MLP hidden size is ``int(embed_dim * mlp_ratio)``.
            dropout_prob: Accepted as part of the signature; this
                implementation does not use it.

        EXAMPLE:
        >>> block = TransformerBlock(embed_dim=512, num_heads=8)
        >>> x = Tensor(np.random.randn(2, 10, 512))   # (batch, seq, embed)
        >>> output = block.forward(x)
        """
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        self.attention = MultiHeadAttention(embed_dim, num_heads)

        # Pre-norm layout: each sub-layer sees a normalized input.
        self.ln1 = LayerNorm(embed_dim)   # feeds attention
        self.ln2 = LayerNorm(embed_dim)   # feeds the MLP

        self.mlp = MLP(embed_dim, int(embed_dim * mlp_ratio))

    def forward(self, x, mask=None):
        """
        Apply the attention and MLP sub-layers, each with a residual add.

        Args:
            x: Input sequence tensor.
            mask: Optional attention mask, forwarded unchanged to
                ``attention.forward``.

        Returns:
            The output of the second residual addition.
        """
        # Sub-layer 1: normalize, attend, then add the untouched input back.
        attended = self.attention.forward(self.ln1.forward(x), mask)
        after_attention = x + attended

        # Sub-layer 2: normalize, run the MLP, add the residual again.
        transformed = self.mlp.forward(self.ln2.forward(after_attention))
        return after_attention + transformed

    def parameters(self):
        """Return parameters in the order attention, ln1, ln2, mlp."""
        collected = []
        for module in (self.attention, self.ln1, self.ln2, self.mlp):
            collected.extend(module.parameters())
        return collected
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 21
class GPT:
    """
    Decoder-only transformer language model.

    Pipeline:
        tokens -> token embedding + position embedding
               -> transformer blocks (with causal mask)
               -> final LayerNorm -> lm_head -> logits

    The language-modeling head is an independent ``Linear`` layer without
    bias; this class does not tie it to the token embedding.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, max_seq_len=1024):
        """
        Build embeddings, the block stack, the final norm and the LM head.

        Args:
            vocab_size: Number of distinct tokens (embedding rows and logit
                columns).
            embed_dim: Feature size used throughout the model.
            num_layers: Number of ``TransformerBlock`` instances to stack.
            num_heads: Attention heads per block.
            max_seq_len: Number of entries in the learned position embedding.

        EXAMPLE:
        >>> model = GPT(vocab_size=1000, embed_dim=256, num_layers=6, num_heads=8)
        >>> tokens = Tensor(np.random.randint(0, 1000, (2, 10)))   # (batch, seq)
        >>> logits = model.forward(tokens)                         # (batch, seq, vocab)
        """
        # NOTE(review): ``Embedding`` is not imported at module level in this
        # file, so the original constructor raised NameError. The module path
        # below is assumed from the package layout -- confirm it.
        from ..text.embeddings import Embedding

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # Learned token and position embeddings.
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.position_embedding = Embedding(max_seq_len, embed_dim)

        # Stack of transformer blocks.
        self.blocks = [TransformerBlock(embed_dim, num_heads) for _ in range(num_layers)]

        # Final normalization and projection to vocabulary logits.
        self.ln_f = LayerNorm(embed_dim)
        self.lm_head = Linear(embed_dim, vocab_size, bias=False)

    def forward(self, tokens):
        """
        Compute next-token logits for every position.

        Args:
            tokens: Tensor whose ``shape`` unpacks to ``(batch, seq_len)``.

        Returns:
            The output of ``lm_head.forward`` (logits over the vocabulary).
        """
        _, seq_len = tokens.shape

        token_emb = self.token_embedding.forward(tokens)

        # Positions 0..seq_len-1 with a leading axis of size 1, so the
        # position embedding can broadcast across the batch.
        positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
        pos_emb = self.position_embedding.forward(positions)

        x = token_emb + pos_emb

        # The same causal mask is handed to every block.
        mask = self._create_causal_mask(seq_len)
        for block in self.blocks:
            x = block.forward(x, mask)

        x = self.ln_f.forward(x)
        return self.lm_head.forward(x)

    def _create_causal_mask(self, seq_len):
        """
        Build a ``(seq_len, seq_len)`` causal mask.

        Entries strictly above the diagonal are ``-inf``; entries on and below
        the diagonal are ``0``.
        """
        mask = np.triu(np.ones((seq_len, seq_len)) * -np.inf, k=1)
        return Tensor(mask)

    def _sample_next_tokens(self, last_logits, temperature):
        """
        Pick one next token per batch row from last-position logits.

        Args:
            last_logits: NumPy array of shape ``(batch, vocab_size)``.
            temperature: Divisor applied to the logits before the softmax.
                ``0`` selects the arg-max token for each row (greedy) instead
                of dividing by zero.

        Returns:
            Integer NumPy array of shape ``(batch, 1)``.
        """
        if temperature == 0:
            return np.argmax(last_logits, axis=-1).reshape(-1, 1)

        scaled = last_logits / temperature
        # Subtract the row maximum before exponentiating for stability.
        weights = np.exp(scaled - np.max(scaled, axis=-1, keepdims=True))
        probs = weights / np.sum(weights, axis=-1, keepdims=True)

        # Sample each row from its own distribution so the result has one
        # token per sequence in the batch.
        return np.array([[np.random.choice(self.vocab_size, p=row)] for row in probs])

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """
        Extend ``prompt_tokens`` autoregressively.

        At each step the model is run on the current sequence, the logits of
        the last position are turned into a token per batch row, and those
        tokens are appended to the sequence.

        Args:
            prompt_tokens: Tensor whose ``.data`` is a ``(batch, seq)`` array
                of token ids. It is copied, not modified.
            max_new_tokens: Number of tokens to append.
            temperature: Softmax temperature; ``0`` means greedy decoding.

        Returns:
            Tensor holding the prompt followed by ``max_new_tokens`` new
            tokens per row.

        EXAMPLE:
        >>> model = GPT(vocab_size=100, embed_dim=64, num_layers=2, num_heads=4)
        >>> prompt = Tensor([[1, 2, 3]])
        >>> generated = model.generate(prompt, max_new_tokens=5)
        >>> assert generated.shape[1] == 3 + 5
        """
        current_tokens = Tensor(prompt_tokens.data.copy())

        for _ in range(max_new_tokens):
            # The position embedding has only max_seq_len entries, so feed
            # the model at most the most recent max_seq_len tokens.
            context = current_tokens
            if current_tokens.data.shape[1] > self.max_seq_len:
                context = Tensor(current_tokens.data[:, -self.max_seq_len:])

            logits = self.forward(context)

            # Only the last position predicts the next token.
            last_logits = logits.data[:, -1, :]   # (batch, vocab_size)
            next_tokens = self._sample_next_tokens(last_logits, temperature)

            current_tokens = Tensor(
                np.concatenate([current_tokens.data, next_tokens], axis=1)
            )

        return current_tokens

    def parameters(self):
        """
        Return all learnable parameters.

        Order: token embedding, position embedding, each block in stack
        order, final LayerNorm, LM head.
        """
        params = []
        params.extend(self.token_embedding.parameters())
        params.extend(self.position_embedding.parameters())
        for block in self.blocks:
            params.extend(block.parameters())
        params.extend(self.ln_f.parameters())
        params.extend(self.lm_head.parameters())
        return params