mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 23:53:33 -05:00
Re-exported all modules after restructuring: - Updated _modidx.py with new module locations - Removed outdated autogeneration headers - Updated all core modules (tensor, autograd, layers, etc.) - Updated optimization modules (quantization, compression, etc.) - Updated TITO commands for new structure Changes include: - 24 tinytorch/ module files - 24 tito/ command and core files - Updated references from modules/source/ to modules/ All modules re-exported via nbdev from their new locations.
465 lines
16 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.

# %% auto 0
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']

# %% ../../modules/source/13_transformers/transformers_dev.ipynb 2
import numpy as np

from ..core.activations import GELU
from ..core.attention import MultiHeadAttention
# NOTE(review): GPT.__init__ references `Embedding`, which was previously
# undefined in this module — confirm the module path below is correct.
from ..core.embeddings import Embedding
from ..core.layers import Linear
from ..core.tensor import Tensor
|
||
|
||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
|
||
class LayerNorm:
    """
    Layer Normalization for transformer blocks.

    Normalizes across the feature dimension (last axis) for each sample
    independently, unlike batch normalization which normalizes across the
    batch dimension. A learnable scale (gamma) and shift (beta) are
    applied after normalization.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        """
        Initialize LayerNorm with learnable parameters.

        Args:
            normalized_shape: Size of the trailing feature dimension to
                normalize over (usually embed_dim).
            eps: Small constant added to the variance for numerical
                stability (prevents division by zero).

        Example:
            >>> ln = LayerNorm(512)  # for 512-dimensional embeddings
            >>> x = Tensor(np.random.randn(2, 10, 512))  # (batch, seq, features)
            >>> normalized = ln.forward(x)
        """
        ### BEGIN SOLUTION
        self.normalized_shape = normalized_shape
        self.eps = eps

        # Learnable affine parameters; requires_grad=True so an optimizer
        # can train them. gamma starts at 1.0 (identity scaling) and beta
        # at 0.0 (no shift), so the layer is initially pure normalization.
        self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True)  # scale
        self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True)  # shift
        ### END SOLUTION

    def forward(self, x):
        """
        Apply layer normalization.

        Computes y = (x - mean) / sqrt(var + eps) * gamma + beta, with
        mean and variance taken over the last axis (keepdims=True so the
        statistics broadcast back against x).
        """
        ### BEGIN SOLUTION
        # Mean over the feature (last) dimension, kept for broadcasting.
        mean = x.mean(axis=-1, keepdims=True)

        # Variance as E[(x - mean)^2], built from Tensor ops so the
        # numerator stays on the autograd graph.
        diff = x - mean
        variance = (diff * diff).mean(axis=-1, keepdims=True)

        # NOTE(review): `variance.data` detaches the denominator from the
        # autograd graph — gradients flow through the numerator (diff)
        # only, not through the std. Confirm this approximation of the
        # exact LayerNorm gradient is intended.
        std_data = np.sqrt(variance.data + self.eps)
        normalized = diff * Tensor(1.0 / std_data)

        # Learnable affine transform.
        output = normalized * self.gamma + self.beta
        return output
        ### END SOLUTION

    def __call__(self, x):
        """Allows the layer to be called like a function."""
        return self.forward(x)

    def parameters(self):
        """Return learnable parameters (gamma, beta)."""
        return [self.gamma, self.beta]
|
||
|
||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 13
|
||
class MLP:
    """
    Position-wise feed-forward network for transformer blocks.

    Implements the standard Linear -> GELU -> Linear pattern, expanding
    to a wider hidden layer (4x by default) and projecting back to the
    embedding size. This supplies the non-linear transformation in each
    transformer block.
    """

    def __init__(self, embed_dim, hidden_dim=None, dropout_prob=0.1):
        """
        Build the two linear layers and the GELU activation.

        Args:
            embed_dim: Input and output feature size.
            hidden_dim: Inner layer width; defaults to 4 * embed_dim,
                the standard transformer expansion ratio.
            dropout_prob: Accepted for API compatibility; not used by
                this implementation.
        """
        ### BEGIN SOLUTION
        if hidden_dim is None:
            hidden_dim = 4 * embed_dim  # standard 4x expansion

        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim

        # embed_dim -> hidden_dim -> embed_dim, with GELU in between
        self.linear1 = Linear(embed_dim, hidden_dim)
        self.gelu = GELU()
        self.linear2 = Linear(hidden_dim, embed_dim)
        ### END SOLUTION

    def forward(self, x):
        """
        Transform x through Linear -> GELU -> Linear.

        Args:
            x: Input of shape (..., embed_dim).

        Returns:
            Output with the same shape as x.
        """
        ### BEGIN SOLUTION
        expanded = self.linear1.forward(x)       # embed_dim -> hidden_dim
        activated = self.gelu.forward(expanded)  # smooth non-linearity
        return self.linear2.forward(activated)   # hidden_dim -> embed_dim
        ### END SOLUTION

    def __call__(self, x):
        """Allows the MLP to be called like a function."""
        return self.forward(x)

    def parameters(self):
        """Return all learnable parameters of both linear layers."""
        return [*self.linear1.parameters(), *self.linear2.parameters()]
|
||
|
||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 17
|
||
class TransformerBlock:
    """
    Pre-norm transformer block: self-attention and an MLP, each wrapped
    with layer normalization and a residual connection.

    This is the repeated building block of GPT and other transformer
    models; each block transforms the sequence and feeds the next block.
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4, dropout_prob=0.1):
        """
        Assemble the block's sub-layers.

        Args:
            embed_dim: Model/embedding dimension.
            num_heads: Number of attention heads.
            mlp_ratio: Expansion ratio for the MLP hidden layer.
            dropout_prob: Accepted for API compatibility; not used here.
        """
        ### BEGIN SOLUTION
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # Pre-norm architecture: one LayerNorm in front of each sub-layer
        self.ln1 = LayerNorm(embed_dim)  # before attention
        self.ln2 = LayerNorm(embed_dim)  # before MLP

        # Sub-layers: multi-head self-attention and feed-forward network
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        self.mlp = MLP(embed_dim, int(embed_dim * mlp_ratio))
        ### END SOLUTION

    def forward(self, x, mask=None):
        """
        Apply the block: x -> ln1 -> attention -> +x -> ln2 -> mlp -> +.

        Args:
            x: Input of shape (batch, seq, embed_dim).
            mask: Optional attention mask (e.g. a causal mask).

        Returns:
            Tensor with the same shape as x. The residual additions let
            gradients flow directly through deep stacks of blocks.
        """
        ### BEGIN SOLUTION
        # Attention sub-layer with pre-norm and residual connection.
        # Query, key, and value are all the normalized input.
        x = x + self.attention.forward(self.ln1.forward(x), mask)

        # MLP sub-layer with pre-norm and residual connection.
        return x + self.mlp.forward(self.ln2.forward(x))
        ### END SOLUTION

    def __call__(self, x, mask=None):
        """Allows the transformer block to be called like a function."""
        return self.forward(x, mask)

    def parameters(self):
        """Return all learnable parameters, in a stable order."""
        return [
            *self.attention.parameters(),
            *self.ln1.parameters(),
            *self.ln2.parameters(),
            *self.mlp.parameters(),
        ]
|
||
|
||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 21
|
||
class GPT:
    """
    Complete GPT (Generative Pre-trained Transformer) model.

    Combines token and learned positional embeddings, a stack of
    transformer blocks, a final layer norm, and a language modeling head
    that projects hidden states back to vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, max_seq_len=1024):
        """
        Initialize the GPT model.

        Args:
            vocab_size: Number of distinct tokens.
            embed_dim: Embedding/model dimension.
            num_layers: Number of stacked TransformerBlocks.
            num_heads: Attention heads per block.
            max_seq_len: Maximum supported context length (size of the
                learned positional embedding table).

        Example:
            >>> model = GPT(vocab_size=1000, embed_dim=256, num_layers=6, num_heads=8)
            >>> tokens = Tensor(np.random.randint(0, 1000, (2, 10)))
            >>> logits = model.forward(tokens)  # (2, 10, 1000)
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # Token and learned (not sinusoidal) positional embeddings.
        # NOTE(review): `Embedding` must be imported at module level.
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.position_embedding = Embedding(max_seq_len, embed_dim)

        # Stack of transformer blocks — the main computation.
        self.blocks = [TransformerBlock(embed_dim, num_heads) for _ in range(num_layers)]

        # Final layer normalization stabilizes the pre-logit activations.
        self.ln_f = LayerNorm(embed_dim)

        # Language modeling head (projects to vocabulary, no bias).
        self.lm_head = Linear(embed_dim, vocab_size, bias=False)
        ### END SOLUTION

    def forward(self, tokens):
        """
        Compute next-token logits for a batch of token sequences.

        Args:
            tokens: Tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of logits, shape (batch, seq_len, vocab_size).

        Raises:
            ValueError: If seq_len exceeds max_seq_len — there is no
                positional embedding for those positions.
        """
        ### BEGIN SOLUTION
        batch_size, seq_len = tokens.shape

        # Fail fast with a clear error instead of indexing past the
        # positional embedding table.
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}"
            )

        # Token embeddings plus positional embeddings; the (1, seq_len)
        # position row broadcasts across the batch dimension.
        token_emb = self.token_embedding.forward(tokens)
        positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
        pos_emb = self.position_embedding.forward(positions)
        x = token_emb + pos_emb

        # Causal mask keeps attention autoregressive (no peeking ahead).
        mask = self._create_causal_mask(seq_len)

        # Pass through the transformer stack.
        for block in self.blocks:
            x = block.forward(x, mask)

        # Final norm, then project to vocabulary logits.
        x = self.ln_f.forward(x)
        logits = self.lm_head.forward(x)
        return logits
        ### END SOLUTION

    def _create_causal_mask(self, seq_len):
        """Create a causal mask: -inf strictly above the diagonal, 0 elsewhere."""
        ### BEGIN SOLUTION
        # Upper-triangular -inf entries block attention to future positions.
        mask = np.triu(np.ones((seq_len, seq_len)) * -np.inf, k=1)
        return Tensor(mask)
        ### END SOLUTION

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """
        Generate tokens autoregressively from a prompt.

        At each step the model predicts a distribution over the next
        token given all previous tokens, samples from it, and appends
        the sample to the sequence.

        Args:
            prompt_tokens: Tensor of token ids, shape (batch, prompt_len).
            max_new_tokens: Number of tokens to append.
            temperature: Softmax temperature (>0); lower is greedier.

        Returns:
            Tensor of shape (batch, prompt_len + max_new_tokens).
        """
        ### BEGIN SOLUTION
        current = prompt_tokens.data.copy()

        for _ in range(max_new_tokens):
            # Crop the context to the model's window so generation can
            # continue past max_seq_len (previously this crashed once
            # the sequence outgrew the positional table).
            context = current[:, -self.max_seq_len:]
            logits = self.forward(Tensor(context))

            # Logits at the last position predict the next token.
            last_logits = logits.data[:, -1, :]  # (batch, vocab_size)

            # Temperature scaling, then a numerically stable softmax.
            scaled = last_logits / temperature
            exp = np.exp(scaled - np.max(scaled, axis=-1, keepdims=True))
            probs = exp / np.sum(exp, axis=-1, keepdims=True)

            # Sample one token per batch row (previously only row 0's
            # distribution was sampled, breaking batch_size > 1).
            next_tokens = np.array(
                [[np.random.choice(self.vocab_size, p=row)] for row in probs]
            )
            current = np.concatenate([current, next_tokens], axis=1)

        return Tensor(current)
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters, in a stable order."""
        params = [
            *self.token_embedding.parameters(),
            *self.position_embedding.parameters(),
        ]
        for block in self.blocks:
            params.extend(block.parameters())
        params.extend(self.ln_f.parameters())
        params.extend(self.lm_head.parameters())
        return params
|