🤖 Fix transformer module exports and milestone 05 imports

Module export fixes:
- Add #|default_exp models.transformer directive to transformers module
- Add imports (MultiHeadAttention, GELU, etc.) to export block
- Export dataloader module (08_dataloader)
- All modules now properly exported to tinytorch package

Milestone 05 fixes:
- Correct import paths (text.embeddings, data.loader, models.transformer)
- Fix Linear.weight vs Linear.weights typo
- Fix indentation in training loop
- Call .forward() explicitly on transformer components

Status: Architecture test mode works, model builds successfully
TODO: Fix TransformerBlock/MultiHeadAttention signature mismatch in module 13
This commit is contained in:
Vijay Janapa Reddi
2025-10-27 16:17:55 -04:00
parent 170dde319a
commit 757e3bf7e1
10 changed files with 2575 additions and 1125 deletions

76
tinytorch/_modidx.py generated
View File

@@ -61,6 +61,16 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Tanh.forward': ( '02_activations/activations_dev.html#tanh.forward',
'tinytorch/core/activations.py')},
'tinytorch.core.attention': { 'tinytorch.core.attention.MultiHeadAttention': ( '12_attention/attention_dev.html#multiheadattention',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.MultiHeadAttention.__init__': ( '12_attention/attention_dev.html#multiheadattention.__init__',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.MultiHeadAttention.forward': ( '12_attention/attention_dev.html#multiheadattention.forward',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.MultiHeadAttention.parameters': ( '12_attention/attention_dev.html#multiheadattention.parameters',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.scaled_dot_product_attention': ( '12_attention/attention_dev.html#scaled_dot_product_attention',
'tinytorch/core/attention.py')},
'tinytorch.core.autograd': {},
'tinytorch.core.layers': { 'tinytorch.core.layers.Dropout': ('03_layers/layers_dev.html#dropout', 'tinytorch/core/layers.py'),
'tinytorch.core.layers.Dropout.__call__': ( '03_layers/layers_dev.html#dropout.__call__',
@@ -270,6 +280,72 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.TensorDataset.__len__': ( '08_dataloader/dataloader_dev.html#tensordataset.__len__',
'tinytorch/data/loader.py')},
'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.forward': ( '13_transformers/transformers_dev.html#gpt.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.generate': ( '13_transformers/transformers_dev.html#gpt.generate',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.parameters': ( '13_transformers/transformers_dev.html#gpt.parameters',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.parameters': ( '13_transformers/transformers_dev.html#layernorm.parameters',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.parameters': ( '13_transformers/transformers_dev.html#mlp.parameters',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters',
'tinytorch/models/transformer.py')},
'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.forward': ( '11_embeddings/embeddings_dev.html#embedding.forward',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.parameters': ( '11_embeddings/embeddings_dev.html#embedding.parameters',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.forward': ( '11_embeddings/embeddings_dev.html#embeddinglayer.forward',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.parameters': ( '11_embeddings/embeddings_dev.html#embeddinglayer.parameters',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding': ( '11_embeddings/embeddings_dev.html#positionalencoding',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.forward': ( '11_embeddings/embeddings_dev.html#positionalencoding.forward',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.parameters': ( '11_embeddings/embeddings_dev.html#positionalencoding.parameters',
'tinytorch/text/embeddings.py')},
'tinytorch.text.tokenization': { 'tinytorch.text.tokenization.BPETokenizer': ( '10_tokenization/tokenization_dev.html#bpetokenizer',
'tinytorch/text/tokenization.py'),
'tinytorch.text.tokenization.BPETokenizer.__init__': ( '10_tokenization/tokenization_dev.html#bpetokenizer.__init__',

291
tinytorch/core/attention.py generated Normal file
View File

@@ -0,0 +1,291 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/07_attention/attention_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['scaled_dot_product_attention', 'MultiHeadAttention']
# %% ../../modules/source/12_attention/attention_dev.ipynb 0
#| default_exp core.attention
#| export
# %% ../../modules/source/12_attention/attention_dev.ipynb 2
import numpy as np
import math
import time
from typing import Optional, Tuple, List
# Import dependencies from previous modules - following TinyTorch dependency chain
from .tensor import Tensor
from .layers import Linear
# %% ../../modules/source/12_attention/attention_dev.ipynb 6
def scaled_dot_product_attention(Q: Tensor, K: Tensor, V: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
    """Scaled dot-product attention, written with explicit loops.

    Implements softmax(Q @ K^T / sqrt(d_k)) @ V one scalar at a time so the
    O(n^2 * d) cost of attention is visible directly in the code.

    Args:
        Q: Queries, shape (batch_size, seq_len, d_model).
        K: Keys, same shape as Q.
        V: Values, same shape as Q.
        mask: Optional mask of shape (batch_size, seq_len, seq_len); truthy
            entries allow attention, falsy entries block it.

    Returns:
        Tuple of (output, attention_weights): output has shape
        (batch_size, seq_len, d_model); attention_weights has shape
        (batch_size, seq_len, seq_len) with each row summing to ~1.0.

    Raises:
        AssertionError: If K or V does not match Q's shape.
    """
    ### BEGIN SOLUTION
    # Validate that all three inputs agree on shape.
    batch_size, seq_len, d_model = Q.shape
    assert K.shape == (batch_size, seq_len, d_model), f"K shape {K.shape} doesn't match Q shape {Q.shape}"
    assert V.shape == (batch_size, seq_len, d_model), f"V shape {V.shape} doesn't match Q shape {Q.shape}"

    # Raw scores: dot product of every query with every key. Scalar loops are
    # deliberate here -- they expose the quadratic pairwise structure.
    raw_scores = np.zeros((batch_size, seq_len, seq_len))
    for bi in range(batch_size):
        for qi in range(seq_len):
            for kj in range(seq_len):
                acc = 0.0
                for dim in range(d_model):
                    acc += Q.data[bi, qi, dim] * K.data[bi, kj, dim]
                raw_scores[bi, qi, kj] = acc

    # Scale by 1/sqrt(d_k) so the softmax stays in a well-conditioned range.
    raw_scores = raw_scores * (1.0 / math.sqrt(d_model))

    # Blocked positions get a large negative score, which softmax maps to ~0.
    if mask is not None:
        raw_scores = np.where(mask.data, raw_scores, -1e9)

    # Row-wise softmax over the key axis, with max-subtraction for stability.
    attention_weights = np.zeros_like(raw_scores)
    for bi in range(batch_size):
        for qi in range(seq_len):
            logits = raw_scores[bi, qi, :]
            shifted = np.exp(logits - np.max(logits))
            attention_weights[bi, qi, :] = shifted / np.sum(shifted)

    # Weighted sum of value vectors -- the second O(n^2) pass.
    output = np.zeros((batch_size, seq_len, d_model))
    for bi in range(batch_size):
        for qi in range(seq_len):
            for kj in range(seq_len):
                output[bi, qi, :] += attention_weights[bi, qi, kj] * V.data[bi, kj, :]
    return Tensor(output), Tensor(attention_weights)
    ### END SOLUTION
# %% ../../modules/source/12_attention/attention_dev.ipynb 10
class MultiHeadAttention:
    """
    Multi-head self-attention.

    Projects the input into num_heads independent query/key/value subspaces,
    runs scaled dot-product attention in each, and mixes the concatenated
    head outputs through a final linear projection.
    """

    def __init__(self, embed_dim: int, num_heads: int):
        """
        Configure the four linear projections used by the attention layer.

        Args:
            embed_dim: Model embedding dimension (d_model).
            num_heads: Number of parallel heads; must divide embed_dim evenly.

        Raises:
            AssertionError: If embed_dim is not divisible by num_heads.
        """
        ### BEGIN SOLUTION
        assert embed_dim % num_heads == 0, f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Each head operates on a slice of size embed_dim / num_heads.
        self.head_dim = embed_dim // num_heads
        # Separate projections for queries, keys and values ...
        self.q_proj = Linear(embed_dim, embed_dim)
        self.k_proj = Linear(embed_dim, embed_dim)
        self.v_proj = Linear(embed_dim, embed_dim)
        # ... plus one output projection that mixes information across heads.
        self.out_proj = Linear(embed_dim, embed_dim)
        ### END SOLUTION

    def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
        """
        Attend over the sequence with all heads, in head-by-head passes.

        Args:
            x: Input of shape (batch_size, seq_len, embed_dim).
            mask: Optional attention mask (batch_size, seq_len, seq_len),
                forwarded unchanged to every head.

        Returns:
            Tensor with the same shape as x.

        Raises:
            AssertionError: If x's last dimension differs from embed_dim.
        """
        ### BEGIN SOLUTION
        batch_size, seq_len, embed_dim = x.shape
        assert embed_dim == self.embed_dim, f"Input dim {embed_dim} doesn't match expected {self.embed_dim}"

        # Project, then split the feature axis into per-head slices:
        # (batch, seq, embed) -> (batch, seq, heads, head_dim)
        #                     -> (batch, heads, seq, head_dim)
        def split_heads(projected: Tensor) -> np.ndarray:
            cube = projected.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
            return cube.transpose(0, 2, 1, 3)

        q_split = split_heads(self.q_proj.forward(x))
        k_split = split_heads(self.k_proj.forward(x))
        v_split = split_heads(self.v_proj.forward(x))

        # Run scaled dot-product attention independently for each head.
        per_head = []
        for head in range(self.num_heads):
            attended, _ = scaled_dot_product_attention(
                Tensor(q_split[:, head, :, :]),
                Tensor(k_split[:, head, :, :]),
                Tensor(v_split[:, head, :, :]),
                mask,
            )
            per_head.append(attended.data)

        # Merge heads back: stack -> (batch, heads, seq, head_dim),
        # transpose -> (batch, seq, heads, head_dim), reshape -> (batch, seq, embed).
        merged = np.stack(per_head, axis=1).transpose(0, 2, 1, 3)
        merged = merged.reshape(batch_size, seq_len, self.embed_dim)

        # Final projection mixes information across the heads.
        return self.out_proj.forward(Tensor(merged))
        ### END SOLUTION

    def parameters(self) -> List[Tensor]:
        """
        Return all trainable parameters.

        Returns:
            Parameters of the q, k, v and output projections, in that order.
        """
        ### BEGIN SOLUTION
        params: List[Tensor] = []
        for proj in (self.q_proj, self.k_proj, self.v_proj, self.out_proj):
            params.extend(proj.parameters())
        return params
        ### END SOLUTION

462
tinytorch/models/transformer.py generated Normal file
View File

@@ -0,0 +1,462 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 2
import numpy as np
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.attention import MultiHeadAttention
from ..core.activations import GELU
# Embedding is used by GPT.__init__ but was missing from the imports.
from ..text.embeddings import Embedding
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
class LayerNorm:
    """
    Layer normalization over the last (feature) axis.

    Each sample is normalized independently of the rest of the batch, then
    rescaled by a learnable gain (gamma) and shifted by a learnable bias
    (beta) -- unlike batch norm, which normalizes across the batch.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        """
        Create the learnable scale/shift parameters.

        Args:
            normalized_shape: Size of the trailing feature dimension(s),
                typically the embedding dimension.
            eps: Small constant added to the variance so the division is
                numerically safe.
        """
        ### BEGIN SOLUTION
        self.normalized_shape = normalized_shape
        self.eps = eps
        # gamma starts as the identity scale, beta as a zero shift.
        self.gamma = Tensor(np.ones(normalized_shape))
        self.beta = Tensor(np.zeros(normalized_shape))
        ### END SOLUTION

    def forward(self, x):
        """
        Normalize x across its last dimension.

        Computes gamma * (x - mu) / sqrt(var + eps) + beta, where mu and var
        are taken per sample over the feature axis.
        """
        ### BEGIN SOLUTION
        # Per-sample statistics over the feature axis (keepdims for broadcast).
        mu = x.mean(axis=-1, keepdims=True)
        centered = x.data - mu.data
        var = (centered ** 2).mean(axis=-1, keepdims=True)
        # Standardize, then apply the learnable affine transform.
        standardized = Tensor(centered / np.sqrt(var + self.eps))
        return standardized * self.gamma + self.beta
        ### END SOLUTION

    def parameters(self):
        """Return the learnable scale (gamma) and shift (beta) tensors."""
        return [self.gamma, self.beta]
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 13
class MLP:
    """
    Feed-forward network for transformer blocks: Linear -> GELU -> Linear.

    Expands from embed_dim to hidden_dim (4x by default) and projects back,
    providing the per-position non-linear transformation of each block.
    """

    def __init__(self, embed_dim, hidden_dim=None, dropout_prob=0.1):
        """
        Set up the two linear layers and the activation.

        Args:
            embed_dim: Input/output feature size.
            hidden_dim: Inner expansion size; defaults to 4 * embed_dim.
            dropout_prob: Accepted for interface compatibility; dropout is
                not applied in this implementation.
        """
        ### BEGIN SOLUTION
        if hidden_dim is None:
            hidden_dim = 4 * embed_dim  # Standard 4x expansion
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        # Two-layer feed-forward network
        self.linear1 = Linear(embed_dim, hidden_dim)
        self.linear2 = Linear(hidden_dim, embed_dim)
        # BUG FIX: this module imports only the GELU class (no bare `gelu`
        # function exists here), so keep one instance for forward() to call.
        self.activation = GELU()
        ### END SOLUTION

    def forward(self, x):
        """
        Apply Linear -> GELU -> Linear to x.

        Args:
            x: Tensor of shape (..., embed_dim).

        Returns:
            Tensor of the same shape as x.
        """
        ### BEGIN SOLUTION
        # First linear layer with expansion
        hidden = self.linear1.forward(x)
        # BUG FIX: the original called an undefined `gelu(...)` (NameError at
        # runtime); use the imported GELU class instance instead.
        hidden = self.activation.forward(hidden)
        # Second linear layer back to original size
        output = self.linear2.forward(hidden)
        return output
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of both linear layers."""
        params = []
        params.extend(self.linear1.parameters())
        params.extend(self.linear2.parameters())
        return params
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 17
class TransformerBlock:
    """
    One pre-norm transformer block: self-attention + MLP with residuals.

    Data flow:
        x -> LayerNorm -> MultiHeadAttention -> + x ->
             LayerNorm -> MLP -> + -> output
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4, dropout_prob=0.1):
        """
        Build the attention, normalization, and feed-forward sub-layers.

        Args:
            embed_dim: Embedding dimension of the sequence.
            num_heads: Number of attention heads.
            mlp_ratio: Expansion ratio for the MLP hidden layer.
            dropout_prob: Accepted for interface compatibility; not applied.
        """
        ### BEGIN SOLUTION
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Multi-head self-attention
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        # Layer normalizations (pre-norm architecture)
        self.ln1 = LayerNorm(embed_dim)  # Before attention
        self.ln2 = LayerNorm(embed_dim)  # Before MLP
        # Feed-forward network
        hidden_dim = int(embed_dim * mlp_ratio)
        self.mlp = MLP(embed_dim, hidden_dim)
        ### END SOLUTION

    def forward(self, x, mask=None):
        """
        Apply the block to a sequence.

        Residual connections around both sub-layers let gradients flow
        directly through deep stacks of blocks.

        Args:
            x: Tensor of shape (batch, seq, embed_dim).
            mask: Optional attention mask, passed through to self-attention.

        Returns:
            Tensor of shape (batch, seq, embed_dim).
        """
        ### BEGIN SOLUTION
        # First sub-layer: pre-norm self-attention with a residual connection.
        normed1 = self.ln1.forward(x)
        # BUG FIX: MultiHeadAttention.forward takes (x, mask) and derives
        # Q, K, V from x internally; the original passed normed1 three times
        # plus the mask, which does not match that signature (TypeError).
        attention_out = self.attention.forward(normed1, mask)
        x = x + attention_out
        # Second sub-layer: pre-norm MLP with a residual connection.
        normed2 = self.ln2.forward(x)
        mlp_out = self.mlp.forward(normed2)
        output = x + mlp_out
        return output
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of the block's sub-layers."""
        params = []
        params.extend(self.attention.parameters())
        params.extend(self.ln1.parameters())
        params.extend(self.ln2.parameters())
        params.extend(self.mlp.parameters())
        return params
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 21
class GPT:
    """
    Minimal GPT (decoder-only transformer) language model.

    Pipeline:
        tokens -> token embedding + positional embedding ->
        N transformer blocks (with causal masking) ->
        final LayerNorm -> linear head -> vocabulary logits
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, max_seq_len=1024):
        """
        Build embeddings, transformer blocks, and the language-model head.

        Args:
            vocab_size: Number of tokens in the vocabulary.
            embed_dim: Embedding dimension used throughout the model.
            num_layers: Number of stacked TransformerBlocks.
            num_heads: Attention heads per block.
            max_seq_len: Maximum sequence length for positional embeddings.
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        # Token and learned positional embeddings.
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.position_embedding = Embedding(max_seq_len, embed_dim)
        # Stack of transformer blocks
        self.blocks = []
        for _ in range(num_layers):
            block = TransformerBlock(embed_dim, num_heads)
            self.blocks.append(block)
        # Final layer normalization stabilizes the pre-head activations.
        self.ln_f = LayerNorm(embed_dim)
        # Language modeling head (projects to vocabulary)
        self.lm_head = Linear(embed_dim, vocab_size, bias=False)
        ### END SOLUTION

    def forward(self, tokens):
        """
        Compute next-token logits for a batch of token sequences.

        Args:
            tokens: Integer Tensor of shape (batch_size, seq_len).

        Returns:
            Logits Tensor of shape (batch_size, seq_len, vocab_size).
        """
        ### BEGIN SOLUTION
        batch_size, seq_len = tokens.shape
        # Token embeddings
        token_emb = self.token_embedding.forward(tokens)
        # Positional embeddings (shape (1, seq, embed); broadcast over batch).
        positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
        pos_emb = self.position_embedding.forward(positions)
        # Combine embeddings
        x = token_emb + pos_emb
        # Causal mask so position i can only attend to positions <= i.
        mask = self._create_causal_mask(seq_len, batch_size)
        # Pass through transformer blocks
        for block in self.blocks:
            x = block.forward(x, mask)
        # Final layer normalization
        x = self.ln_f.forward(x)
        # Language modeling head
        logits = self.lm_head.forward(x)
        return logits
        ### END SOLUTION

    def _create_causal_mask(self, seq_len, batch_size=1):
        """
        Build a boolean causal mask of shape (batch_size, seq_len, seq_len).

        True means "may attend", matching scaled_dot_product_attention, which
        blocks exactly the positions where the mask is falsy and indexes the
        mask as mask.data[b, i, j].

        BUG FIX: the original returned a 2-D additive (-inf) mask; the
        attention kernel expects a 3-D truthy-allow mask, so the old mask had
        both the wrong rank and inverted semantics.
        """
        ### BEGIN SOLUTION
        # Lower-triangular True: row i may attend to columns 0..i.
        allow = np.tril(np.ones((seq_len, seq_len), dtype=bool))
        tiled = np.broadcast_to(allow, (batch_size, seq_len, seq_len)).copy()
        return Tensor(tiled)
        ### END SOLUTION

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """
        Autoregressively extend a prompt by sampling one token at a time.

        Args:
            prompt_tokens: Tensor of shape (1, prompt_len) with token ids.
            max_new_tokens: Number of tokens to sample.
            temperature: Softmax temperature; higher values flatten the
                sampling distribution.

        Returns:
            Tensor of shape (1, prompt_len + max_new_tokens).
        """
        ### BEGIN SOLUTION
        current_tokens = Tensor(prompt_tokens.data.copy())
        for _ in range(max_new_tokens):
            # Get logits for current sequence
            logits = self.forward(current_tokens)
            # Only the last position predicts the next token.
            last_logits = logits.data[:, -1, :]  # (batch_size, vocab_size)
            # Apply temperature scaling
            scaled_logits = last_logits / temperature
            # Softmax with max-subtraction for numerical stability.
            exp_logits = np.exp(scaled_logits - np.max(scaled_logits, axis=-1, keepdims=True))
            probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
            # Sample the next token and append it to the running sequence.
            next_token = np.array([[np.random.choice(self.vocab_size, p=probs[0])]])
            current_tokens = Tensor(np.concatenate([current_tokens.data, next_token], axis=1))
        return current_tokens
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of the full model."""
        params = []
        params.extend(self.token_embedding.parameters())
        params.extend(self.position_embedding.parameters())
        for block in self.blocks:
            params.extend(block.parameters())
        params.extend(self.ln_f.parameters())
        params.extend(self.lm_head.parameters())
        return params

333
tinytorch/text/embeddings.py generated Normal file
View File

@@ -0,0 +1,333 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_embeddings/embeddings_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 2
import numpy as np
import math
from typing import List, Optional, Tuple
# Import from previous modules - following dependency chain
from ..core.tensor import Tensor
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 6
class Embedding:
    """
    Learnable embedding layer that maps token indices to dense vectors.

    This is the fundamental building block for converting discrete tokens
    into continuous representations that neural networks can process.

    EXAMPLE:
    >>> embed = Embedding(vocab_size=100, embed_dim=64)
    >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])  # batch_size=2, seq_len=3
    >>> output = embed.forward(tokens)
    >>> print(output.shape)
    (2, 3, 64)
    """
    ### BEGIN SOLUTION
    def __init__(self, vocab_size: int, embed_dim: int):
        """
        Build the (vocab_size, embed_dim) lookup table.

        Args:
            vocab_size: Number of unique tokens in the vocabulary.
            embed_dim: Dimension of each embedding vector.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Xavier/Glorot uniform keeps initial activation scales well-behaved.
        bound = math.sqrt(6.0 / (vocab_size + embed_dim))
        self.weight = Tensor(
            np.random.uniform(-bound, bound, (vocab_size, embed_dim)),
            requires_grad=True,
        )

    def forward(self, indices: Tensor) -> Tensor:
        """
        Look up embedding rows for the given token indices.

        Args:
            indices: Token indices of shape (batch_size, seq_len) or (seq_len,).

        Returns:
            Embedded vectors of shape (*indices.shape, embed_dim).

        Raises:
            ValueError: If any index falls outside [0, vocab_size).
        """
        ids = indices.data
        if np.any(ids < 0) or np.any(ids >= self.vocab_size):
            raise ValueError(
                f"Index out of range. Expected 0 <= indices < {self.vocab_size}, "
                f"got min={np.min(indices.data)}, max={np.max(indices.data)}"
            )
        # Advanced indexing is the efficient equivalent of one-hot @ weight.
        return Tensor(self.weight.data[ids.astype(int)])

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.weight]

    def __repr__(self):
        return f"Embedding(vocab_size={self.vocab_size}, embed_dim={self.embed_dim})"
    ### END SOLUTION
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 10
class PositionalEncoding:
    """
    Learnable positional encoding layer.

    Adds trainable position-specific vectors to token embeddings,
    allowing the model to learn positional patterns specific to the task.

    EXAMPLE:
    >>> pos_enc = PositionalEncoding(max_seq_len=512, embed_dim=64)
    >>> embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, embed)
    >>> output = pos_enc.forward(embeddings)
    >>> print(output.shape)
    (2, 10, 64)  # Same shape, but now position-aware
    """
    ### BEGIN SOLUTION
    def __init__(self, max_seq_len: int, embed_dim: int):
        """
        Create the learnable position table.

        Args:
            max_seq_len: Maximum sequence length to support.
            embed_dim: Embedding dimension (must match token embeddings).
        """
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim
        # Smaller init than token embeddings since these vectors are additive.
        bound = math.sqrt(2.0 / embed_dim)
        self.position_embeddings = Tensor(
            np.random.uniform(-bound, bound, (max_seq_len, embed_dim)),
            requires_grad=True,
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Add positional encodings to input embeddings.

        Args:
            x: Input embeddings of shape (batch_size, seq_len, embed_dim).

        Returns:
            Position-encoded embeddings of the same shape.

        Raises:
            ValueError: On non-3D input, overlong sequences, or a dim mismatch.
        """
        if len(x.shape) != 3:
            raise ValueError(f"Expected 3D input (batch, seq, embed), got shape {x.shape}")
        _, seq_len, embed_dim = x.shape
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds maximum {self.max_seq_len}"
            )
        if embed_dim != self.embed_dim:
            raise ValueError(
                f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
            )
        # Slice the table to this length, then broadcast over the batch axis.
        offsets = self.position_embeddings.data[np.newaxis, :seq_len, :]
        return Tensor(x.data + offsets)

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.position_embeddings]

    def __repr__(self):
        return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
    ### END SOLUTION
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 18
class EmbeddingLayer:
    """
    Complete embedding system combining token and positional embeddings.

    This is the production-ready component that handles the full embedding
    pipeline used in transformers and other sequence models.

    EXAMPLE:
    >>> embed_layer = EmbeddingLayer(
    ...     vocab_size=50000,
    ...     embed_dim=512,
    ...     max_seq_len=2048,
    ...     pos_encoding='learned'
    ... )
    >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])
    >>> output = embed_layer.forward(tokens)
    >>> print(output.shape)
    (2, 3, 512)
    """
    ### BEGIN SOLUTION
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        max_seq_len: int = 512,
        pos_encoding: str = 'learned',
        scale_embeddings: bool = False
    ):
        """
        Initialize the complete embedding system.

        Args:
            vocab_size: Size of vocabulary.
            embed_dim: Embedding dimension.
            max_seq_len: Maximum sequence length for positional encoding.
            pos_encoding: 'learned', 'sinusoidal', or None.
            scale_embeddings: Whether to scale embeddings by sqrt(embed_dim).

        Raises:
            ValueError: If pos_encoding is not one of the supported options.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.pos_encoding_type = pos_encoding
        self.scale_embeddings = scale_embeddings

        self.token_embedding = Embedding(vocab_size, embed_dim)

        if pos_encoding == 'learned':
            # Trainable position table.
            self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
        elif pos_encoding == 'sinusoidal':
            # Fixed sinusoidal table — contributes no parameters.
            self.pos_encoding = create_sinusoidal_embeddings(max_seq_len, embed_dim)
        elif pos_encoding is None:
            self.pos_encoding = None
        else:
            raise ValueError(f"Unknown pos_encoding: {pos_encoding}. Use 'learned', 'sinusoidal', or None")

    def forward(self, tokens: Tensor) -> Tensor:
        """
        Embed token indices and mix in positional information.

        Args:
            tokens: Token indices of shape (batch_size, seq_len) or (seq_len,).

        Returns:
            Embedded tokens with positional information; a 1D input yields a
            2D (seq_len, embed_dim) result.
        """
        # Promote 1D (seq,) input to a batch of one so the pipeline is uniform.
        added_batch = len(tokens.shape) == 1
        if added_batch:
            tokens = Tensor(tokens.data[np.newaxis, :])

        embeds = self.token_embedding.forward(tokens)  # (batch, seq, embed)

        # Optional sqrt(d) scaling (transformer convention).
        if self.scale_embeddings:
            embeds = Tensor(embeds.data * math.sqrt(self.embed_dim))

        if self.pos_encoding_type == 'learned':
            output = self.pos_encoding.forward(embeds)
        elif self.pos_encoding_type == 'sinusoidal':
            seq_len = embeds.shape[1]
            # Slice the fixed table to this length and broadcast over the batch.
            table = self.pos_encoding.data[np.newaxis, :seq_len, :]
            output = Tensor(embeds.data + table)
        else:
            # No positional information requested.
            output = embeds

        # Undo the temporary batch axis for 1D callers.
        return Tensor(output.data[0]) if added_batch else output

    def parameters(self) -> List[Tensor]:
        """Return all trainable parameters."""
        params = self.token_embedding.parameters()
        if self.pos_encoding_type == 'learned':
            params.extend(self.pos_encoding.parameters())
        return params

    def __repr__(self):
        return (f"EmbeddingLayer(vocab_size={self.vocab_size}, "
                f"embed_dim={self.embed_dim}, "
                f"pos_encoding='{self.pos_encoding_type}')")
    ### END SOLUTION