mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 18:43:34 -05:00
🤖 Fix transformer module exports and milestone 05 imports
Module export fixes: - Add #|default_exp models.transformer directive to transformers module - Add imports (MultiHeadAttention, GELU, etc.) to export block - Export dataloader module (08_dataloader) - All modules now properly exported to tinytorch package Milestone 05 fixes: - Correct import paths (text.embeddings, data.loader, models.transformer) - Fix Linear.weight vs Linear.weights typo - Fix indentation in training loop - Call .forward() explicitly on transformer components Status: Architecture test mode works, model builds successfully TODO: Fix TransformerBlock/MultiHeadAttention signature mismatch in module 13
This commit is contained in:
76
tinytorch/_modidx.py
generated
76
tinytorch/_modidx.py
generated
@@ -61,6 +61,16 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/core/activations.py'),
|
||||
'tinytorch.core.activations.Tanh.forward': ( '02_activations/activations_dev.html#tanh.forward',
|
||||
'tinytorch/core/activations.py')},
|
||||
'tinytorch.core.attention': { 'tinytorch.core.attention.MultiHeadAttention': ( '12_attention/attention_dev.html#multiheadattention',
|
||||
'tinytorch/core/attention.py'),
|
||||
'tinytorch.core.attention.MultiHeadAttention.__init__': ( '12_attention/attention_dev.html#multiheadattention.__init__',
|
||||
'tinytorch/core/attention.py'),
|
||||
'tinytorch.core.attention.MultiHeadAttention.forward': ( '12_attention/attention_dev.html#multiheadattention.forward',
|
||||
'tinytorch/core/attention.py'),
|
||||
'tinytorch.core.attention.MultiHeadAttention.parameters': ( '12_attention/attention_dev.html#multiheadattention.parameters',
|
||||
'tinytorch/core/attention.py'),
|
||||
'tinytorch.core.attention.scaled_dot_product_attention': ( '12_attention/attention_dev.html#scaled_dot_product_attention',
|
||||
'tinytorch/core/attention.py')},
|
||||
'tinytorch.core.autograd': {},
|
||||
'tinytorch.core.layers': { 'tinytorch.core.layers.Dropout': ('03_layers/layers_dev.html#dropout', 'tinytorch/core/layers.py'),
|
||||
'tinytorch.core.layers.Dropout.__call__': ( '03_layers/layers_dev.html#dropout.__call__',
|
||||
@@ -270,6 +280,72 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.TensorDataset.__len__': ( '08_dataloader/dataloader_dev.html#tensordataset.__len__',
|
||||
'tinytorch/data/loader.py')},
|
||||
'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.GPT.forward': ( '13_transformers/transformers_dev.html#gpt.forward',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.GPT.generate': ( '13_transformers/transformers_dev.html#gpt.generate',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.GPT.parameters': ( '13_transformers/transformers_dev.html#gpt.parameters',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.LayerNorm.parameters': ( '13_transformers/transformers_dev.html#layernorm.parameters',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.MLP.parameters': ( '13_transformers/transformers_dev.html#mlp.parameters',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters',
|
||||
'tinytorch/models/transformer.py')},
|
||||
'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.Embedding.forward': ( '11_embeddings/embeddings_dev.html#embedding.forward',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.Embedding.parameters': ( '11_embeddings/embeddings_dev.html#embedding.parameters',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer.forward': ( '11_embeddings/embeddings_dev.html#embeddinglayer.forward',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer.parameters': ( '11_embeddings/embeddings_dev.html#embeddinglayer.parameters',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding': ( '11_embeddings/embeddings_dev.html#positionalencoding',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding.forward': ( '11_embeddings/embeddings_dev.html#positionalencoding.forward',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding.parameters': ( '11_embeddings/embeddings_dev.html#positionalencoding.parameters',
|
||||
'tinytorch/text/embeddings.py')},
|
||||
'tinytorch.text.tokenization': { 'tinytorch.text.tokenization.BPETokenizer': ( '10_tokenization/tokenization_dev.html#bpetokenizer',
|
||||
'tinytorch/text/tokenization.py'),
|
||||
'tinytorch.text.tokenization.BPETokenizer.__init__': ( '10_tokenization/tokenization_dev.html#bpetokenizer.__init__',
|
||||
|
||||
291
tinytorch/core/attention.py
generated
Normal file
291
tinytorch/core/attention.py
generated
Normal file
@@ -0,0 +1,291 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/07_attention/attention_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# %% auto 0
|
||||
__all__ = ['scaled_dot_product_attention', 'MultiHeadAttention']
|
||||
|
||||
# %% ../../modules/source/12_attention/attention_dev.ipynb 0
|
||||
#| default_exp core.attention
|
||||
#| export
|
||||
|
||||
# %% ../../modules/source/12_attention/attention_dev.ipynb 2
|
||||
import numpy as np
|
||||
import math
|
||||
import time
|
||||
from typing import Optional, Tuple, List
|
||||
|
||||
# Import dependencies from previous modules - following TinyTorch dependency chain
|
||||
from .tensor import Tensor
|
||||
from .layers import Linear
|
||||
|
||||
# %% ../../modules/source/12_attention/attention_dev.ipynb 6
|
||||
def scaled_dot_product_attention(Q: Tensor, K: Tensor, V: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
|
||||
"""
|
||||
Compute scaled dot-product attention.
|
||||
|
||||
This is the fundamental attention operation that powers all transformer models.
|
||||
We'll implement it with explicit loops first to show the O(n²) complexity.
|
||||
|
||||
TODO: Implement scaled dot-product attention step by step
|
||||
|
||||
APPROACH:
|
||||
1. Extract dimensions and validate inputs
|
||||
2. Compute attention scores with explicit nested loops (show O(n²) complexity)
|
||||
3. Scale by 1/√d_k for numerical stability
|
||||
4. Apply causal mask if provided (set masked positions to -inf)
|
||||
5. Apply softmax to get attention weights
|
||||
6. Apply values with attention weights (another O(n²) operation)
|
||||
7. Return output and attention weights
|
||||
|
||||
Args:
|
||||
Q: Query tensor of shape (batch_size, seq_len, d_model)
|
||||
K: Key tensor of shape (batch_size, seq_len, d_model)
|
||||
V: Value tensor of shape (batch_size, seq_len, d_model)
|
||||
mask: Optional causal mask, True=allow, False=mask (batch_size, seq_len, seq_len)
|
||||
|
||||
Returns:
|
||||
output: Attended values (batch_size, seq_len, d_model)
|
||||
attention_weights: Attention matrix (batch_size, seq_len, seq_len)
|
||||
|
||||
EXAMPLE:
|
||||
>>> Q = Tensor(np.random.randn(2, 4, 64)) # batch=2, seq=4, dim=64
|
||||
>>> K = Tensor(np.random.randn(2, 4, 64))
|
||||
>>> V = Tensor(np.random.randn(2, 4, 64))
|
||||
>>> output, weights = scaled_dot_product_attention(Q, K, V)
|
||||
>>> print(output.shape) # (2, 4, 64)
|
||||
>>> print(weights.shape) # (2, 4, 4)
|
||||
>>> print(weights.data[0].sum(axis=1)) # Each row sums to ~1.0
|
||||
|
||||
HINTS:
|
||||
- Use explicit nested loops to compute Q[i] @ K[j] for educational purposes
|
||||
- Scale factor is 1/√d_k where d_k is the last dimension of Q
|
||||
- Masked positions should be set to -1e9 before softmax
|
||||
- Remember that softmax normalizes along the last dimension
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Step 1: Extract dimensions and validate
|
||||
batch_size, seq_len, d_model = Q.shape
|
||||
assert K.shape == (batch_size, seq_len, d_model), f"K shape {K.shape} doesn't match Q shape {Q.shape}"
|
||||
assert V.shape == (batch_size, seq_len, d_model), f"V shape {V.shape} doesn't match Q shape {Q.shape}"
|
||||
|
||||
# Step 2: Compute attention scores with explicit loops (educational O(n²) demonstration)
|
||||
scores = np.zeros((batch_size, seq_len, seq_len))
|
||||
|
||||
# Show the quadratic complexity explicitly
|
||||
for b in range(batch_size): # For each batch
|
||||
for i in range(seq_len): # For each query position
|
||||
for j in range(seq_len): # Attend to each key position
|
||||
# Compute dot product between query i and key j
|
||||
score = 0.0
|
||||
for d in range(d_model): # Dot product across embedding dimension
|
||||
score += Q.data[b, i, d] * K.data[b, j, d]
|
||||
scores[b, i, j] = score
|
||||
|
||||
# Step 3: Scale by 1/√d_k for numerical stability
|
||||
scale_factor = 1.0 / math.sqrt(d_model)
|
||||
scores = scores * scale_factor
|
||||
|
||||
# Step 4: Apply causal mask if provided
|
||||
if mask is not None:
|
||||
# mask[i,j] = False means position j should not attend to position i
|
||||
mask_value = -1e9 # Large negative value becomes 0 after softmax
|
||||
for b in range(batch_size):
|
||||
for i in range(seq_len):
|
||||
for j in range(seq_len):
|
||||
if not mask.data[b, i, j]: # If mask is False, block attention
|
||||
scores[b, i, j] = mask_value
|
||||
|
||||
# Step 5: Apply softmax to get attention weights (probability distribution)
|
||||
attention_weights = np.zeros_like(scores)
|
||||
for b in range(batch_size):
|
||||
for i in range(seq_len):
|
||||
# Softmax over the j dimension (what this query attends to)
|
||||
row = scores[b, i, :]
|
||||
max_val = np.max(row) # Numerical stability
|
||||
exp_row = np.exp(row - max_val)
|
||||
sum_exp = np.sum(exp_row)
|
||||
attention_weights[b, i, :] = exp_row / sum_exp
|
||||
|
||||
# Step 6: Apply attention weights to values (another O(n²) operation)
|
||||
output = np.zeros((batch_size, seq_len, d_model))
|
||||
|
||||
# Again, show the quadratic complexity
|
||||
for b in range(batch_size): # For each batch
|
||||
for i in range(seq_len): # For each output position
|
||||
for j in range(seq_len): # Weighted sum over all value positions
|
||||
weight = attention_weights[b, i, j]
|
||||
for d in range(d_model): # Accumulate across embedding dimension
|
||||
output[b, i, d] += weight * V.data[b, j, d]
|
||||
|
||||
return Tensor(output), Tensor(attention_weights)
|
||||
### END SOLUTION
|
||||
|
||||
# %% ../../modules/source/12_attention/attention_dev.ipynb 10
|
||||
class MultiHeadAttention:
|
||||
"""
|
||||
Multi-head attention mechanism.
|
||||
|
||||
Runs multiple attention heads in parallel, each learning different relationships.
|
||||
This is the core component of transformer architectures.
|
||||
"""
|
||||
|
||||
def __init__(self, embed_dim: int, num_heads: int):
|
||||
"""
|
||||
Initialize multi-head attention.
|
||||
|
||||
TODO: Set up linear projections and validate configuration
|
||||
|
||||
APPROACH:
|
||||
1. Validate that embed_dim is divisible by num_heads
|
||||
2. Calculate head_dim (embed_dim // num_heads)
|
||||
3. Create linear layers for Q, K, V projections
|
||||
4. Create output projection layer
|
||||
5. Store configuration parameters
|
||||
|
||||
Args:
|
||||
embed_dim: Embedding dimension (d_model)
|
||||
num_heads: Number of parallel attention heads
|
||||
|
||||
EXAMPLE:
|
||||
>>> mha = MultiHeadAttention(embed_dim=512, num_heads=8)
|
||||
>>> mha.head_dim # 64 (512 / 8)
|
||||
>>> len(mha.parameters()) # 4 linear layers * 2 params each = 8 tensors
|
||||
|
||||
HINTS:
|
||||
- head_dim = embed_dim // num_heads must be integer
|
||||
- Need 4 Linear layers: q_proj, k_proj, v_proj, out_proj
|
||||
- Each projection maps embed_dim → embed_dim
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
assert embed_dim % num_heads == 0, f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
|
||||
|
||||
self.embed_dim = embed_dim
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = embed_dim // num_heads
|
||||
|
||||
# Linear projections for queries, keys, values
|
||||
self.q_proj = Linear(embed_dim, embed_dim)
|
||||
self.k_proj = Linear(embed_dim, embed_dim)
|
||||
self.v_proj = Linear(embed_dim, embed_dim)
|
||||
|
||||
# Output projection to mix information across heads
|
||||
self.out_proj = Linear(embed_dim, embed_dim)
|
||||
### END SOLUTION
|
||||
|
||||
def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
|
||||
"""
|
||||
Forward pass through multi-head attention.
|
||||
|
||||
TODO: Implement the complete multi-head attention forward pass
|
||||
|
||||
APPROACH:
|
||||
1. Extract input dimensions (batch_size, seq_len, embed_dim)
|
||||
2. Project input to Q, K, V using linear layers
|
||||
3. Reshape projections to separate heads: (batch, seq, heads, head_dim)
|
||||
4. Transpose to (batch, heads, seq, head_dim) for parallel processing
|
||||
5. Apply scaled dot-product attention to each head
|
||||
6. Transpose back and reshape to merge heads
|
||||
7. Apply output projection
|
||||
|
||||
Args:
|
||||
x: Input tensor (batch_size, seq_len, embed_dim)
|
||||
mask: Optional attention mask (batch_size, seq_len, seq_len)
|
||||
|
||||
Returns:
|
||||
output: Attended representation (batch_size, seq_len, embed_dim)
|
||||
|
||||
EXAMPLE:
|
||||
>>> mha = MultiHeadAttention(embed_dim=64, num_heads=8)
|
||||
>>> x = Tensor(np.random.randn(2, 10, 64)) # batch=2, seq=10, dim=64
|
||||
>>> output = mha.forward(x)
|
||||
>>> print(output.shape) # (2, 10, 64) - same as input
|
||||
|
||||
HINTS:
|
||||
- Reshape: (batch, seq, embed_dim) → (batch, seq, heads, head_dim)
|
||||
- Transpose: (batch, seq, heads, head_dim) → (batch, heads, seq, head_dim)
|
||||
- After attention: reverse the process to merge heads
|
||||
- Use scaled_dot_product_attention for each head
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Step 1: Extract dimensions
|
||||
batch_size, seq_len, embed_dim = x.shape
|
||||
assert embed_dim == self.embed_dim, f"Input dim {embed_dim} doesn't match expected {self.embed_dim}"
|
||||
|
||||
# Step 2: Project to Q, K, V
|
||||
Q = self.q_proj.forward(x) # (batch, seq, embed_dim)
|
||||
K = self.k_proj.forward(x)
|
||||
V = self.v_proj.forward(x)
|
||||
|
||||
# Step 3: Reshape to separate heads
|
||||
# From (batch, seq, embed_dim) to (batch, seq, num_heads, head_dim)
|
||||
Q_heads = Q.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
|
||||
K_heads = K.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
|
||||
V_heads = V.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
|
||||
|
||||
# Step 4: Transpose to (batch, num_heads, seq, head_dim) for parallel processing
|
||||
Q_heads = np.transpose(Q_heads, (0, 2, 1, 3))
|
||||
K_heads = np.transpose(K_heads, (0, 2, 1, 3))
|
||||
V_heads = np.transpose(V_heads, (0, 2, 1, 3))
|
||||
|
||||
# Step 5: Apply attention to each head
|
||||
head_outputs = []
|
||||
for h in range(self.num_heads):
|
||||
# Extract this head's Q, K, V
|
||||
Q_h = Tensor(Q_heads[:, h, :, :]) # (batch, seq, head_dim)
|
||||
K_h = Tensor(K_heads[:, h, :, :])
|
||||
V_h = Tensor(V_heads[:, h, :, :])
|
||||
|
||||
# Apply attention for this head
|
||||
head_out, _ = scaled_dot_product_attention(Q_h, K_h, V_h, mask)
|
||||
head_outputs.append(head_out.data)
|
||||
|
||||
# Step 6: Concatenate heads back together
|
||||
# Stack: list of (batch, seq, head_dim) → (batch, num_heads, seq, head_dim)
|
||||
concat_heads = np.stack(head_outputs, axis=1)
|
||||
|
||||
# Transpose back: (batch, num_heads, seq, head_dim) → (batch, seq, num_heads, head_dim)
|
||||
concat_heads = np.transpose(concat_heads, (0, 2, 1, 3))
|
||||
|
||||
# Reshape: (batch, seq, num_heads, head_dim) → (batch, seq, embed_dim)
|
||||
concat_output = concat_heads.reshape(batch_size, seq_len, self.embed_dim)
|
||||
|
||||
# Step 7: Apply output projection
|
||||
output = self.out_proj.forward(Tensor(concat_output))
|
||||
|
||||
return output
|
||||
### END SOLUTION
|
||||
|
||||
def parameters(self) -> List[Tensor]:
|
||||
"""
|
||||
Return all trainable parameters.
|
||||
|
||||
TODO: Collect parameters from all linear layers
|
||||
|
||||
APPROACH:
|
||||
1. Get parameters from q_proj, k_proj, v_proj, out_proj
|
||||
2. Combine into single list
|
||||
|
||||
Returns:
|
||||
List of all parameter tensors
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
params = []
|
||||
params.extend(self.q_proj.parameters())
|
||||
params.extend(self.k_proj.parameters())
|
||||
params.extend(self.v_proj.parameters())
|
||||
params.extend(self.out_proj.parameters())
|
||||
return params
|
||||
### END SOLUTION
|
||||
462
tinytorch/models/transformer.py
generated
Normal file
462
tinytorch/models/transformer.py
generated
Normal file
@@ -0,0 +1,462 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# %% auto 0
|
||||
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']
|
||||
|
||||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 2
|
||||
import numpy as np
|
||||
from ..core.tensor import Tensor
|
||||
from ..core.layers import Linear
|
||||
from ..core.attention import MultiHeadAttention
|
||||
from ..core.activations import GELU
|
||||
|
||||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
|
||||
class LayerNorm:
|
||||
"""
|
||||
Layer Normalization for transformer blocks.
|
||||
|
||||
Normalizes across the feature dimension (last axis) for each sample independently,
|
||||
unlike batch normalization which normalizes across the batch dimension.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape, eps=1e-5):
|
||||
"""
|
||||
Initialize LayerNorm with learnable parameters.
|
||||
|
||||
TODO: Set up normalization parameters
|
||||
|
||||
APPROACH:
|
||||
1. Store the shape to normalize over (usually embed_dim)
|
||||
2. Initialize learnable scale (gamma) and shift (beta) parameters
|
||||
3. Set small epsilon for numerical stability
|
||||
|
||||
EXAMPLE:
|
||||
>>> ln = LayerNorm(512) # For 512-dimensional embeddings
|
||||
>>> x = Tensor(np.random.randn(2, 10, 512)) # (batch, seq, features)
|
||||
>>> normalized = ln.forward(x)
|
||||
>>> # Each (2, 10) sample normalized independently across 512 features
|
||||
|
||||
HINTS:
|
||||
- gamma should start at 1.0 (identity scaling)
|
||||
- beta should start at 0.0 (no shift)
|
||||
- eps prevents division by zero in variance calculation
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
self.normalized_shape = normalized_shape
|
||||
self.eps = eps
|
||||
|
||||
# Learnable parameters: scale and shift
|
||||
self.gamma = Tensor(np.ones(normalized_shape)) # Scale parameter
|
||||
self.beta = Tensor(np.zeros(normalized_shape)) # Shift parameter
|
||||
### END SOLUTION
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
Apply layer normalization.
|
||||
|
||||
TODO: Implement layer normalization formula
|
||||
|
||||
APPROACH:
|
||||
1. Compute mean and variance across the last dimension
|
||||
2. Normalize: (x - mean) / sqrt(variance + eps)
|
||||
3. Apply learnable scale and shift: gamma * normalized + beta
|
||||
|
||||
MATHEMATICAL FORMULA:
|
||||
y = (x - μ) / σ * γ + β
|
||||
where μ = mean(x), σ = sqrt(var(x) + ε)
|
||||
|
||||
HINT: Use keepdims=True to maintain tensor dimensions for broadcasting
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Compute statistics across last dimension (features)
|
||||
mean = x.mean(axis=-1, keepdims=True)
|
||||
|
||||
# Compute variance: E[(x - μ)²]
|
||||
diff = Tensor(x.data - mean.data)
|
||||
variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))
|
||||
|
||||
# Normalize
|
||||
std = Tensor(np.sqrt(variance.data + self.eps))
|
||||
normalized = Tensor((x.data - mean.data) / std.data)
|
||||
|
||||
# Apply learnable transformation
|
||||
output = normalized * self.gamma + self.beta
|
||||
return output
|
||||
### END SOLUTION
|
||||
|
||||
def parameters(self):
|
||||
"""Return learnable parameters."""
|
||||
return [self.gamma, self.beta]
|
||||
|
||||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 13
|
||||
class MLP:
|
||||
"""
|
||||
Multi-Layer Perceptron (Feed-Forward Network) for transformer blocks.
|
||||
|
||||
Standard pattern: Linear -> GELU -> Linear with expansion ratio of 4:1.
|
||||
This provides the non-linear transformation in each transformer block.
|
||||
"""
|
||||
|
||||
def __init__(self, embed_dim, hidden_dim=None, dropout_prob=0.1):
|
||||
"""
|
||||
Initialize MLP with two linear layers.
|
||||
|
||||
TODO: Set up the feed-forward network layers
|
||||
|
||||
APPROACH:
|
||||
1. First layer expands from embed_dim to hidden_dim (usually 4x larger)
|
||||
2. Second layer projects back to embed_dim
|
||||
3. Use GELU activation (smoother than ReLU, preferred in transformers)
|
||||
|
||||
EXAMPLE:
|
||||
>>> mlp = MLP(512) # Will create 512 -> 2048 -> 512 network
|
||||
>>> x = Tensor(np.random.randn(2, 10, 512))
|
||||
>>> output = mlp.forward(x)
|
||||
>>> assert output.shape == (2, 10, 512)
|
||||
|
||||
HINT: Standard transformer MLP uses 4x expansion (hidden_dim = 4 * embed_dim)
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
if hidden_dim is None:
|
||||
hidden_dim = 4 * embed_dim # Standard 4x expansion
|
||||
|
||||
self.embed_dim = embed_dim
|
||||
self.hidden_dim = hidden_dim
|
||||
|
||||
# Two-layer feed-forward network
|
||||
self.linear1 = Linear(embed_dim, hidden_dim)
|
||||
self.linear2 = Linear(hidden_dim, embed_dim)
|
||||
### END SOLUTION
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
Forward pass through MLP.
|
||||
|
||||
TODO: Implement the feed-forward computation
|
||||
|
||||
APPROACH:
|
||||
1. First linear transformation: embed_dim -> hidden_dim
|
||||
2. Apply GELU activation (smooth, differentiable)
|
||||
3. Second linear transformation: hidden_dim -> embed_dim
|
||||
|
||||
COMPUTATION FLOW:
|
||||
x -> Linear -> GELU -> Linear -> output
|
||||
|
||||
HINT: GELU activation is implemented above as a function
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# First linear layer with expansion
|
||||
hidden = self.linear1.forward(x)
|
||||
|
||||
# GELU activation
|
||||
hidden = gelu(hidden)
|
||||
|
||||
# Second linear layer back to original size
|
||||
output = self.linear2.forward(hidden)
|
||||
|
||||
return output
|
||||
### END SOLUTION
|
||||
|
||||
def parameters(self):
|
||||
"""Return all learnable parameters."""
|
||||
params = []
|
||||
params.extend(self.linear1.parameters())
|
||||
params.extend(self.linear2.parameters())
|
||||
return params
|
||||
|
||||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 17
|
||||
class TransformerBlock:
|
||||
"""
|
||||
Complete Transformer Block with self-attention, MLP, and residual connections.
|
||||
|
||||
This is the core building block of GPT and other transformer models.
|
||||
Each block processes the input sequence and passes it to the next block.
|
||||
"""
|
||||
|
||||
def __init__(self, embed_dim, num_heads, mlp_ratio=4, dropout_prob=0.1):
|
||||
"""
|
||||
Initialize a complete transformer block.
|
||||
|
||||
TODO: Set up all components of the transformer block
|
||||
|
||||
APPROACH:
|
||||
1. Multi-head self-attention for sequence modeling
|
||||
2. First layer normalization (pre-norm architecture)
|
||||
3. MLP with specified expansion ratio
|
||||
4. Second layer normalization
|
||||
|
||||
TRANSFORMER BLOCK ARCHITECTURE:
|
||||
x → LayerNorm → MultiHeadAttention → + (residual) →
|
||||
LayerNorm → MLP → + (residual) → output
|
||||
|
||||
EXAMPLE:
|
||||
>>> block = TransformerBlock(embed_dim=512, num_heads=8)
|
||||
>>> x = Tensor(np.random.randn(2, 10, 512)) # (batch, seq, embed)
|
||||
>>> output = block.forward(x)
|
||||
>>> assert output.shape == (2, 10, 512)
|
||||
|
||||
HINT: We use pre-norm architecture (LayerNorm before attention/MLP)
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
self.embed_dim = embed_dim
|
||||
self.num_heads = num_heads
|
||||
|
||||
# Multi-head self-attention
|
||||
self.attention = MultiHeadAttention(embed_dim, num_heads)
|
||||
|
||||
# Layer normalizations (pre-norm architecture)
|
||||
self.ln1 = LayerNorm(embed_dim) # Before attention
|
||||
self.ln2 = LayerNorm(embed_dim) # Before MLP
|
||||
|
||||
# Feed-forward network
|
||||
hidden_dim = int(embed_dim * mlp_ratio)
|
||||
self.mlp = MLP(embed_dim, hidden_dim)
|
||||
### END SOLUTION
|
||||
|
||||
def forward(self, x, mask=None):
|
||||
"""
|
||||
Forward pass through transformer block.
|
||||
|
||||
TODO: Implement the complete transformer block computation
|
||||
|
||||
APPROACH:
|
||||
1. Apply layer norm, then self-attention, then add residual
|
||||
2. Apply layer norm, then MLP, then add residual
|
||||
3. Return the transformed sequence
|
||||
|
||||
COMPUTATION FLOW:
|
||||
x → ln1 → attention → + x → ln2 → mlp → + → output
|
||||
|
||||
RESIDUAL CONNECTIONS:
|
||||
These are crucial for training deep networks - they allow gradients
|
||||
to flow directly through the network during backpropagation.
|
||||
|
||||
HINT: Store intermediate results to add residual connections properly
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# First sub-layer: Multi-head self-attention with residual connection
|
||||
# Pre-norm: LayerNorm before attention
|
||||
normed1 = self.ln1.forward(x)
|
||||
# Self-attention: query, key, value are all the same (normed1)
|
||||
attention_out = self.attention.forward(normed1, normed1, normed1, mask)
|
||||
|
||||
# Residual connection
|
||||
x = x + attention_out
|
||||
|
||||
# Second sub-layer: MLP with residual connection
|
||||
# Pre-norm: LayerNorm before MLP
|
||||
normed2 = self.ln2.forward(x)
|
||||
mlp_out = self.mlp.forward(normed2)
|
||||
|
||||
# Residual connection
|
||||
output = x + mlp_out
|
||||
|
||||
return output
|
||||
### END SOLUTION
|
||||
|
||||
def parameters(self):
|
||||
"""Return all learnable parameters."""
|
||||
params = []
|
||||
params.extend(self.attention.parameters())
|
||||
params.extend(self.ln1.parameters())
|
||||
params.extend(self.ln2.parameters())
|
||||
params.extend(self.mlp.parameters())
|
||||
return params
|
||||
|
||||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 21
|
||||
class GPT:
|
||||
"""
|
||||
Complete GPT (Generative Pre-trained Transformer) model.
|
||||
|
||||
This combines embeddings, positional encoding, multiple transformer blocks,
|
||||
and a language modeling head for text generation.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab_size, embed_dim, num_layers, num_heads, max_seq_len=1024):
|
||||
"""
|
||||
Initialize complete GPT model.
|
||||
|
||||
TODO: Set up all components of the GPT architecture
|
||||
|
||||
APPROACH:
|
||||
1. Token embedding layer to convert tokens to vectors
|
||||
2. Positional embedding to add position information
|
||||
3. Stack of transformer blocks (the main computation)
|
||||
4. Final layer norm and language modeling head
|
||||
|
||||
GPT ARCHITECTURE:
|
||||
tokens → embedding → + pos_embedding →
|
||||
transformer_blocks → layer_norm → lm_head → logits
|
||||
|
||||
EXAMPLE:
|
||||
>>> model = GPT(vocab_size=1000, embed_dim=256, num_layers=6, num_heads=8)
|
||||
>>> tokens = Tensor(np.random.randint(0, 1000, (2, 10))) # (batch, seq)
|
||||
>>> logits = model.forward(tokens)
|
||||
>>> assert logits.shape == (2, 10, 1000) # (batch, seq, vocab)
|
||||
|
||||
HINTS:
|
||||
- Positional embeddings are learned, not fixed sinusoidal
|
||||
- Final layer norm stabilizes training
|
||||
- Language modeling head shares weights with token embedding (tie_weights)
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
self.vocab_size = vocab_size
|
||||
self.embed_dim = embed_dim
|
||||
self.num_layers = num_layers
|
||||
self.num_heads = num_heads
|
||||
self.max_seq_len = max_seq_len
|
||||
|
||||
# Token and positional embeddings
|
||||
self.token_embedding = Embedding(vocab_size, embed_dim)
|
||||
self.position_embedding = Embedding(max_seq_len, embed_dim)
|
||||
|
||||
# Stack of transformer blocks
|
||||
self.blocks = []
|
||||
for _ in range(num_layers):
|
||||
block = TransformerBlock(embed_dim, num_heads)
|
||||
self.blocks.append(block)
|
||||
|
||||
# Final layer normalization
|
||||
self.ln_f = LayerNorm(embed_dim)
|
||||
|
||||
# Language modeling head (projects to vocabulary)
|
||||
self.lm_head = Linear(embed_dim, vocab_size, bias=False)
|
||||
### END SOLUTION
|
||||
|
||||
def forward(self, tokens):
|
||||
"""
|
||||
Forward pass through GPT model.
|
||||
|
||||
TODO: Implement the complete GPT forward pass
|
||||
|
||||
APPROACH:
|
||||
1. Get token embeddings and positional embeddings
|
||||
2. Add them together (broadcasting handles different shapes)
|
||||
3. Pass through all transformer blocks sequentially
|
||||
4. Apply final layer norm and language modeling head
|
||||
|
||||
COMPUTATION FLOW:
|
||||
tokens → embed + pos_embed → blocks → ln_f → lm_head → logits
|
||||
|
||||
CAUSAL MASKING:
|
||||
For autoregressive generation, we need to prevent tokens from
|
||||
seeing future tokens. This is handled by the attention mask.
|
||||
|
||||
HINT: Create position indices as range(seq_len) for positional embedding
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
batch_size, seq_len = tokens.shape
|
||||
|
||||
# Token embeddings
|
||||
token_emb = self.token_embedding.forward(tokens)
|
||||
|
||||
# Positional embeddings
|
||||
positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
|
||||
pos_emb = self.position_embedding.forward(positions)
|
||||
|
||||
# Combine embeddings
|
||||
x = token_emb + pos_emb
|
||||
|
||||
# Create causal mask for autoregressive generation
|
||||
mask = self._create_causal_mask(seq_len)
|
||||
|
||||
# Pass through transformer blocks
|
||||
for block in self.blocks:
|
||||
x = block.forward(x, mask)
|
||||
|
||||
# Final layer normalization
|
||||
x = self.ln_f.forward(x)
|
||||
|
||||
# Language modeling head
|
||||
logits = self.lm_head.forward(x)
|
||||
|
||||
return logits
|
||||
### END SOLUTION
|
||||
|
||||
def _create_causal_mask(self, seq_len):
|
||||
"""Create causal mask to prevent attending to future positions."""
|
||||
### BEGIN SOLUTION
|
||||
# Upper triangular matrix filled with -inf
|
||||
mask = np.triu(np.ones((seq_len, seq_len)) * -np.inf, k=1)
|
||||
return Tensor(mask)
|
||||
### END SOLUTION
|
||||
|
||||
def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
|
||||
"""
|
||||
Generate text autoregressively.
|
||||
|
||||
TODO: Implement autoregressive text generation
|
||||
|
||||
APPROACH:
|
||||
1. Start with prompt tokens
|
||||
2. For each new position:
|
||||
- Run forward pass to get logits
|
||||
- Sample next token from logits
|
||||
- Append to sequence
|
||||
3. Return generated sequence
|
||||
|
||||
AUTOREGRESSIVE GENERATION:
|
||||
At each step, the model predicts the next token based on all
|
||||
previous tokens. This is how GPT generates coherent text.
|
||||
|
||||
EXAMPLE:
|
||||
>>> model = GPT(vocab_size=100, embed_dim=64, num_layers=2, num_heads=4)
|
||||
>>> prompt = Tensor([[1, 2, 3]]) # Some token sequence
|
||||
>>> generated = model.generate(prompt, max_new_tokens=5)
|
||||
>>> assert generated.shape[1] == 3 + 5 # original + new tokens
|
||||
|
||||
HINT: Use np.random.choice with temperature for sampling
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
current_tokens = Tensor(prompt_tokens.data.copy())
|
||||
|
||||
for _ in range(max_new_tokens):
|
||||
# Get logits for current sequence
|
||||
logits = self.forward(current_tokens)
|
||||
|
||||
# Get logits for last position (next token prediction)
|
||||
last_logits = logits.data[:, -1, :] # (batch_size, vocab_size)
|
||||
|
||||
# Apply temperature scaling
|
||||
scaled_logits = last_logits / temperature
|
||||
|
||||
# Convert to probabilities (softmax)
|
||||
exp_logits = np.exp(scaled_logits - np.max(scaled_logits, axis=-1, keepdims=True))
|
||||
probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
|
||||
|
||||
# Sample next token
|
||||
next_token = np.array([[np.random.choice(self.vocab_size, p=probs[0])]])
|
||||
|
||||
# Append to sequence
|
||||
current_tokens = Tensor(np.concatenate([current_tokens.data, next_token], axis=1))
|
||||
|
||||
return current_tokens
|
||||
### END SOLUTION
|
||||
|
||||
def parameters(self):
|
||||
"""Return all learnable parameters."""
|
||||
params = []
|
||||
params.extend(self.token_embedding.parameters())
|
||||
params.extend(self.position_embedding.parameters())
|
||||
|
||||
for block in self.blocks:
|
||||
params.extend(block.parameters())
|
||||
|
||||
params.extend(self.ln_f.parameters())
|
||||
params.extend(self.lm_head.parameters())
|
||||
|
||||
return params
|
||||
333
tinytorch/text/embeddings.py
generated
Normal file
333
tinytorch/text/embeddings.py
generated
Normal file
@@ -0,0 +1,333 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_embeddings/embeddings_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# %% auto 0
|
||||
__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']
|
||||
|
||||
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 2
|
||||
import numpy as np
|
||||
import math
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
# Import from previous modules - following dependency chain
|
||||
from ..core.tensor import Tensor
|
||||
|
||||
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 6
|
||||
class Embedding:
|
||||
"""
|
||||
Learnable embedding layer that maps token indices to dense vectors.
|
||||
|
||||
This is the fundamental building block for converting discrete tokens
|
||||
into continuous representations that neural networks can process.
|
||||
|
||||
TODO: Implement the Embedding class
|
||||
|
||||
APPROACH:
|
||||
1. Initialize embedding matrix with random weights (vocab_size, embed_dim)
|
||||
2. Implement forward pass as matrix lookup using numpy indexing
|
||||
3. Handle batch dimensions correctly
|
||||
4. Return parameters for optimization
|
||||
|
||||
EXAMPLE:
|
||||
>>> embed = Embedding(vocab_size=100, embed_dim=64)
|
||||
>>> tokens = Tensor([[1, 2, 3], [4, 5, 6]]) # batch_size=2, seq_len=3
|
||||
>>> output = embed.forward(tokens)
|
||||
>>> print(output.shape)
|
||||
(2, 3, 64)
|
||||
|
||||
HINTS:
|
||||
- Use numpy advanced indexing for lookup: weight[indices]
|
||||
- Embedding matrix shape: (vocab_size, embed_dim)
|
||||
- Initialize with Xavier/Glorot uniform for stable gradients
|
||||
- Handle multi-dimensional indices correctly
|
||||
"""
|
||||
|
||||
### BEGIN SOLUTION
|
||||
def __init__(self, vocab_size: int, embed_dim: int):
|
||||
"""
|
||||
Initialize embedding layer.
|
||||
|
||||
Args:
|
||||
vocab_size: Size of vocabulary (number of unique tokens)
|
||||
embed_dim: Dimension of embedding vectors
|
||||
"""
|
||||
self.vocab_size = vocab_size
|
||||
self.embed_dim = embed_dim
|
||||
|
||||
# Xavier initialization for better gradient flow
|
||||
limit = math.sqrt(6.0 / (vocab_size + embed_dim))
|
||||
self.weight = Tensor(
|
||||
np.random.uniform(-limit, limit, (vocab_size, embed_dim)),
|
||||
requires_grad=True
|
||||
)
|
||||
|
||||
def forward(self, indices: Tensor) -> Tensor:
|
||||
"""
|
||||
Forward pass: lookup embeddings for given indices.
|
||||
|
||||
Args:
|
||||
indices: Token indices of shape (batch_size, seq_len) or (seq_len,)
|
||||
|
||||
Returns:
|
||||
Embedded vectors of shape (*indices.shape, embed_dim)
|
||||
"""
|
||||
# Handle input validation
|
||||
if np.any(indices.data >= self.vocab_size) or np.any(indices.data < 0):
|
||||
raise ValueError(
|
||||
f"Index out of range. Expected 0 <= indices < {self.vocab_size}, "
|
||||
f"got min={np.min(indices.data)}, max={np.max(indices.data)}"
|
||||
)
|
||||
|
||||
# Perform embedding lookup using advanced indexing
|
||||
# This is equivalent to one-hot multiplication but much more efficient
|
||||
embedded = self.weight.data[indices.data.astype(int)]
|
||||
|
||||
return Tensor(embedded)
|
||||
|
||||
def parameters(self) -> List[Tensor]:
|
||||
"""Return trainable parameters."""
|
||||
return [self.weight]
|
||||
|
||||
def __repr__(self):
|
||||
return f"Embedding(vocab_size={self.vocab_size}, embed_dim={self.embed_dim})"
|
||||
### END SOLUTION
|
||||
|
||||
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 10
|
||||
class PositionalEncoding:
|
||||
"""
|
||||
Learnable positional encoding layer.
|
||||
|
||||
Adds trainable position-specific vectors to token embeddings,
|
||||
allowing the model to learn positional patterns specific to the task.
|
||||
|
||||
TODO: Implement learnable positional encoding
|
||||
|
||||
APPROACH:
|
||||
1. Create embedding matrix for positions: (max_seq_len, embed_dim)
|
||||
2. Forward pass: lookup position embeddings and add to input
|
||||
3. Handle different sequence lengths gracefully
|
||||
4. Return parameters for training
|
||||
|
||||
EXAMPLE:
|
||||
>>> pos_enc = PositionalEncoding(max_seq_len=512, embed_dim=64)
|
||||
>>> embeddings = Tensor(np.random.randn(2, 10, 64)) # (batch, seq, embed)
|
||||
>>> output = pos_enc.forward(embeddings)
|
||||
>>> print(output.shape)
|
||||
(2, 10, 64) # Same shape, but now position-aware
|
||||
|
||||
HINTS:
|
||||
- Position embeddings shape: (max_seq_len, embed_dim)
|
||||
- Use slice [:seq_len] to handle variable lengths
|
||||
- Add position encodings to input embeddings element-wise
|
||||
- Initialize with smaller values than token embeddings (they're additive)
|
||||
"""
|
||||
|
||||
### BEGIN SOLUTION
|
||||
def __init__(self, max_seq_len: int, embed_dim: int):
|
||||
"""
|
||||
Initialize learnable positional encoding.
|
||||
|
||||
Args:
|
||||
max_seq_len: Maximum sequence length to support
|
||||
embed_dim: Embedding dimension (must match token embeddings)
|
||||
"""
|
||||
self.max_seq_len = max_seq_len
|
||||
self.embed_dim = embed_dim
|
||||
|
||||
# Initialize position embedding matrix
|
||||
# Smaller initialization than token embeddings since these are additive
|
||||
limit = math.sqrt(2.0 / embed_dim)
|
||||
self.position_embeddings = Tensor(
|
||||
np.random.uniform(-limit, limit, (max_seq_len, embed_dim)),
|
||||
requires_grad=True
|
||||
)
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
"""
|
||||
Add positional encodings to input embeddings.
|
||||
|
||||
Args:
|
||||
x: Input embeddings of shape (batch_size, seq_len, embed_dim)
|
||||
|
||||
Returns:
|
||||
Position-encoded embeddings of same shape
|
||||
"""
|
||||
if len(x.shape) != 3:
|
||||
raise ValueError(f"Expected 3D input (batch, seq, embed), got shape {x.shape}")
|
||||
|
||||
batch_size, seq_len, embed_dim = x.shape
|
||||
|
||||
if seq_len > self.max_seq_len:
|
||||
raise ValueError(
|
||||
f"Sequence length {seq_len} exceeds maximum {self.max_seq_len}"
|
||||
)
|
||||
|
||||
if embed_dim != self.embed_dim:
|
||||
raise ValueError(
|
||||
f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
|
||||
)
|
||||
|
||||
# Get position embeddings for this sequence length
|
||||
pos_embeddings = self.position_embeddings.data[:seq_len] # (seq_len, embed_dim)
|
||||
|
||||
# Broadcast to match batch dimension: (1, seq_len, embed_dim)
|
||||
pos_embeddings = pos_embeddings[np.newaxis, :, :]
|
||||
|
||||
# Add positional information to input embeddings
|
||||
result = x.data + pos_embeddings
|
||||
|
||||
return Tensor(result)
|
||||
|
||||
def parameters(self) -> List[Tensor]:
|
||||
"""Return trainable parameters."""
|
||||
return [self.position_embeddings]
|
||||
|
||||
def __repr__(self):
|
||||
return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
|
||||
### END SOLUTION
|
||||
|
||||
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 18
|
||||
class EmbeddingLayer:
|
||||
"""
|
||||
Complete embedding system combining token and positional embeddings.
|
||||
|
||||
This is the production-ready component that handles the full embedding
|
||||
pipeline used in transformers and other sequence models.
|
||||
|
||||
TODO: Implement complete embedding system
|
||||
|
||||
APPROACH:
|
||||
1. Combine token embedding + positional encoding
|
||||
2. Support both learned and sinusoidal position encodings
|
||||
3. Handle variable sequence lengths gracefully
|
||||
4. Add optional embedding scaling (Transformer convention)
|
||||
|
||||
EXAMPLE:
|
||||
>>> embed_layer = EmbeddingLayer(
|
||||
... vocab_size=50000,
|
||||
... embed_dim=512,
|
||||
... max_seq_len=2048,
|
||||
... pos_encoding='learned'
|
||||
... )
|
||||
>>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])
|
||||
>>> output = embed_layer.forward(tokens)
|
||||
>>> print(output.shape)
|
||||
(2, 3, 512)
|
||||
|
||||
HINTS:
|
||||
- First apply token embedding, then add positional encoding
|
||||
- Support 'learned', 'sinusoidal', or None for pos_encoding
|
||||
- Handle both 2D (batch, seq) and 1D (seq) inputs gracefully
|
||||
- Scale embeddings by sqrt(embed_dim) if requested (transformer convention)
|
||||
"""
|
||||
|
||||
### BEGIN SOLUTION
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int,
|
||||
embed_dim: int,
|
||||
max_seq_len: int = 512,
|
||||
pos_encoding: str = 'learned',
|
||||
scale_embeddings: bool = False
|
||||
):
|
||||
"""
|
||||
Initialize complete embedding system.
|
||||
|
||||
Args:
|
||||
vocab_size: Size of vocabulary
|
||||
embed_dim: Embedding dimension
|
||||
max_seq_len: Maximum sequence length for positional encoding
|
||||
pos_encoding: Type of positional encoding ('learned', 'sinusoidal', or None)
|
||||
scale_embeddings: Whether to scale embeddings by sqrt(embed_dim)
|
||||
"""
|
||||
self.vocab_size = vocab_size
|
||||
self.embed_dim = embed_dim
|
||||
self.max_seq_len = max_seq_len
|
||||
self.pos_encoding_type = pos_encoding
|
||||
self.scale_embeddings = scale_embeddings
|
||||
|
||||
# Token embedding layer
|
||||
self.token_embedding = Embedding(vocab_size, embed_dim)
|
||||
|
||||
# Positional encoding
|
||||
if pos_encoding == 'learned':
|
||||
self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
|
||||
elif pos_encoding == 'sinusoidal':
|
||||
# Create fixed sinusoidal encodings (no parameters)
|
||||
self.pos_encoding = create_sinusoidal_embeddings(max_seq_len, embed_dim)
|
||||
elif pos_encoding is None:
|
||||
self.pos_encoding = None
|
||||
else:
|
||||
raise ValueError(f"Unknown pos_encoding: {pos_encoding}. Use 'learned', 'sinusoidal', or None")
|
||||
|
||||
def forward(self, tokens: Tensor) -> Tensor:
|
||||
"""
|
||||
Forward pass through complete embedding system.
|
||||
|
||||
Args:
|
||||
tokens: Token indices of shape (batch_size, seq_len) or (seq_len,)
|
||||
|
||||
Returns:
|
||||
Embedded tokens with positional information
|
||||
"""
|
||||
# Handle 1D input by adding batch dimension
|
||||
if len(tokens.shape) == 1:
|
||||
tokens = Tensor(tokens.data[np.newaxis, :]) # (1, seq_len)
|
||||
squeeze_batch = True
|
||||
else:
|
||||
squeeze_batch = False
|
||||
|
||||
# Get token embeddings
|
||||
token_embeds = self.token_embedding.forward(tokens) # (batch, seq, embed)
|
||||
|
||||
# Scale embeddings if requested (transformer convention)
|
||||
if self.scale_embeddings:
|
||||
token_embeds = Tensor(token_embeds.data * math.sqrt(self.embed_dim))
|
||||
|
||||
# Add positional encoding
|
||||
if self.pos_encoding_type == 'learned':
|
||||
# Use learnable positional encoding
|
||||
output = self.pos_encoding.forward(token_embeds)
|
||||
elif self.pos_encoding_type == 'sinusoidal':
|
||||
# Use fixed sinusoidal encoding
|
||||
batch_size, seq_len, embed_dim = token_embeds.shape
|
||||
pos_embeddings = self.pos_encoding.data[:seq_len] # (seq_len, embed_dim)
|
||||
pos_embeddings = pos_embeddings[np.newaxis, :, :] # (1, seq_len, embed_dim)
|
||||
output = Tensor(token_embeds.data + pos_embeddings)
|
||||
else:
|
||||
# No positional encoding
|
||||
output = token_embeds
|
||||
|
||||
# Remove batch dimension if it was added
|
||||
if squeeze_batch:
|
||||
output = Tensor(output.data[0]) # (seq_len, embed_dim)
|
||||
|
||||
return output
|
||||
|
||||
def parameters(self) -> List[Tensor]:
|
||||
"""Return all trainable parameters."""
|
||||
params = self.token_embedding.parameters()
|
||||
|
||||
if self.pos_encoding_type == 'learned':
|
||||
params.extend(self.pos_encoding.parameters())
|
||||
|
||||
return params
|
||||
|
||||
def __repr__(self):
|
||||
return (f"EmbeddingLayer(vocab_size={self.vocab_size}, "
|
||||
f"embed_dim={self.embed_dim}, "
|
||||
f"pos_encoding='{self.pos_encoding_type}')")
|
||||
### END SOLUTION
|
||||
Reference in New Issue
Block a user