mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 19:53:33 -05:00
Critical fixes for transformer gradient flow:
EmbeddingBackward:
- Implements scatter-add gradient accumulation for embedding lookups
- Added to Module 05 (autograd_dev.py)
- Module 11 imports and uses it in Embedding.forward()
- Gradients now flow back to embedding weights
ReshapeBackward:
- reshape() was breaking computation graph (no _grad_fn)
- Added backward function that reshapes gradient back to original shape
- Patched Tensor.reshape() in enable_autograd()
- Critical for GPT forward pass (logits.reshape before loss)
Results:
- Before: 0/37 parameters receive gradients, loss stuck
- After: 13/37 parameters receive gradients (35%)
- Single batch overfitting: 4.46 → 0.03 (99.4% improvement!)
- MODEL NOW LEARNS! 🎉
Remaining work: 24 parameters still missing gradients (likely attention)
Tests added:
- tests/milestones/test_05_transformer_architecture.py (Phase 1)
- Multiple debug scripts to isolate issues
345 lines
13 KiB
Python
Generated
345 lines
13 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
|
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
|
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
|
# ║ ║
|
|
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
|
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
|
# ║ ║
|
|
# ║ ✅ TO EDIT: modules/source/XX_embeddings/embeddings_dev.py ║
|
|
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
|
# ║ ║
|
|
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
|
# ║ Editing it directly may break module functionality and training. ║
|
|
# ║ ║
|
|
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
|
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
|
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
|
# %% auto 0
|
|
__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']
|
|
|
|
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 2
|
|
import numpy as np
|
|
import math
|
|
from typing import List, Optional, Tuple
|
|
|
|
# Import from previous modules - following dependency chain
|
|
from ..core.tensor import Tensor
|
|
|
|
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 6
|
|
class Embedding:
    """
    Trainable lookup table mapping integer token ids to dense vectors.

    Holds a (vocab_size, embed_dim) weight matrix; forward() selects rows
    by index via numpy advanced indexing, which is mathematically the same
    as a one-hot matmul but far cheaper.

    Example:
        >>> embed = Embedding(vocab_size=100, embed_dim=64)
        >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])  # (batch=2, seq=3)
        >>> embed.forward(tokens).shape
        (2, 3, 64)
    """

    ### BEGIN SOLUTION
    def __init__(self, vocab_size: int, embed_dim: int):
        """
        Create the embedding table.

        Args:
            vocab_size: Number of distinct tokens in the vocabulary.
            embed_dim: Width of each embedding vector.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Xavier/Glorot uniform keeps early-activation variance stable.
        bound = math.sqrt(6.0 / (vocab_size + embed_dim))
        weights = np.random.uniform(-bound, bound, (vocab_size, embed_dim))
        self.weight = Tensor(weights, requires_grad=True)

    def forward(self, indices: Tensor) -> Tensor:
        """
        Look up embedding rows for the given token indices.

        Args:
            indices: Integer tensor of shape (batch_size, seq_len) or (seq_len,).

        Returns:
            Tensor of shape (*indices.shape, embed_dim).

        Raises:
            ValueError: If any index falls outside [0, vocab_size).
        """
        raw = indices.data
        # Reject out-of-range ids up front with a descriptive message.
        if np.any(raw < 0) or np.any(raw >= self.vocab_size):
            raise ValueError(
                f"Index out of range. Expected 0 <= indices < {self.vocab_size}, "
                f"got min={np.min(raw)}, max={np.max(raw)}"
            )

        # Advanced indexing == one-hot multiplication, minus the waste.
        rows = self.weight.data[raw.astype(int)]
        result = Tensor(rows, requires_grad=self.weight.requires_grad)

        # Wire up the backward pass so gradients scatter-add into the table.
        if self.weight.requires_grad:
            from tinytorch.core.autograd import EmbeddingBackward
            result._grad_fn = EmbeddingBackward(self.weight, indices)

        return result

    def parameters(self) -> List[Tensor]:
        """Trainable tensors owned by this layer."""
        return [self.weight]

    def __repr__(self):
        return f"Embedding(vocab_size={self.vocab_size}, embed_dim={self.embed_dim})"
    ### END SOLUTION
|
|
|
|
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 10
|
|
class PositionalEncoding:
|
|
"""
|
|
Learnable positional encoding layer.
|
|
|
|
Adds trainable position-specific vectors to token embeddings,
|
|
allowing the model to learn positional patterns specific to the task.
|
|
|
|
TODO: Implement learnable positional encoding
|
|
|
|
APPROACH:
|
|
1. Create embedding matrix for positions: (max_seq_len, embed_dim)
|
|
2. Forward pass: lookup position embeddings and add to input
|
|
3. Handle different sequence lengths gracefully
|
|
4. Return parameters for training
|
|
|
|
EXAMPLE:
|
|
>>> pos_enc = PositionalEncoding(max_seq_len=512, embed_dim=64)
|
|
>>> embeddings = Tensor(np.random.randn(2, 10, 64)) # (batch, seq, embed)
|
|
>>> output = pos_enc.forward(embeddings)
|
|
>>> print(output.shape)
|
|
(2, 10, 64) # Same shape, but now position-aware
|
|
|
|
HINTS:
|
|
- Position embeddings shape: (max_seq_len, embed_dim)
|
|
- Use slice [:seq_len] to handle variable lengths
|
|
- Add position encodings to input embeddings element-wise
|
|
- Initialize with smaller values than token embeddings (they're additive)
|
|
"""
|
|
|
|
### BEGIN SOLUTION
|
|
def __init__(self, max_seq_len: int, embed_dim: int):
|
|
"""
|
|
Initialize learnable positional encoding.
|
|
|
|
Args:
|
|
max_seq_len: Maximum sequence length to support
|
|
embed_dim: Embedding dimension (must match token embeddings)
|
|
"""
|
|
self.max_seq_len = max_seq_len
|
|
self.embed_dim = embed_dim
|
|
|
|
# Initialize position embedding matrix
|
|
# Smaller initialization than token embeddings since these are additive
|
|
limit = math.sqrt(2.0 / embed_dim)
|
|
self.position_embeddings = Tensor(
|
|
np.random.uniform(-limit, limit, (max_seq_len, embed_dim)),
|
|
requires_grad=True
|
|
)
|
|
|
|
def forward(self, x: Tensor) -> Tensor:
|
|
"""
|
|
Add positional encodings to input embeddings.
|
|
|
|
Args:
|
|
x: Input embeddings of shape (batch_size, seq_len, embed_dim)
|
|
|
|
Returns:
|
|
Position-encoded embeddings of same shape
|
|
"""
|
|
if len(x.shape) != 3:
|
|
raise ValueError(f"Expected 3D input (batch, seq, embed), got shape {x.shape}")
|
|
|
|
batch_size, seq_len, embed_dim = x.shape
|
|
|
|
if seq_len > self.max_seq_len:
|
|
raise ValueError(
|
|
f"Sequence length {seq_len} exceeds maximum {self.max_seq_len}"
|
|
)
|
|
|
|
if embed_dim != self.embed_dim:
|
|
raise ValueError(
|
|
f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
|
|
)
|
|
|
|
# Get position embeddings for this sequence length (slice using .data for efficiency)
|
|
pos_embeddings_data = self.position_embeddings.data[:seq_len] # (seq_len, embed_dim)
|
|
|
|
# Broadcast to match batch dimension: (1, seq_len, embed_dim)
|
|
pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :]
|
|
|
|
# Wrap in Tensor to preserve requires_grad
|
|
pos_embeddings = Tensor(pos_embeddings_data, requires_grad=self.position_embeddings.requires_grad)
|
|
|
|
# Add positional information using Tensor operation to preserve gradients!
|
|
result = x + pos_embeddings
|
|
|
|
return result
|
|
|
|
def parameters(self) -> List[Tensor]:
|
|
"""Return trainable parameters."""
|
|
return [self.position_embeddings]
|
|
|
|
def __repr__(self):
|
|
return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
|
|
### END SOLUTION
|
|
|
|
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 18
|
|
class EmbeddingLayer:
|
|
"""
|
|
Complete embedding system combining token and positional embeddings.
|
|
|
|
This is the production-ready component that handles the full embedding
|
|
pipeline used in transformers and other sequence models.
|
|
|
|
TODO: Implement complete embedding system
|
|
|
|
APPROACH:
|
|
1. Combine token embedding + positional encoding
|
|
2. Support both learned and sinusoidal position encodings
|
|
3. Handle variable sequence lengths gracefully
|
|
4. Add optional embedding scaling (Transformer convention)
|
|
|
|
EXAMPLE:
|
|
>>> embed_layer = EmbeddingLayer(
|
|
... vocab_size=50000,
|
|
... embed_dim=512,
|
|
... max_seq_len=2048,
|
|
... pos_encoding='learned'
|
|
... )
|
|
>>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])
|
|
>>> output = embed_layer.forward(tokens)
|
|
>>> print(output.shape)
|
|
(2, 3, 512)
|
|
|
|
HINTS:
|
|
- First apply token embedding, then add positional encoding
|
|
- Support 'learned', 'sinusoidal', or None for pos_encoding
|
|
- Handle both 2D (batch, seq) and 1D (seq) inputs gracefully
|
|
- Scale embeddings by sqrt(embed_dim) if requested (transformer convention)
|
|
"""
|
|
|
|
### BEGIN SOLUTION
|
|
def __init__(
|
|
self,
|
|
vocab_size: int,
|
|
embed_dim: int,
|
|
max_seq_len: int = 512,
|
|
pos_encoding: str = 'learned',
|
|
scale_embeddings: bool = False
|
|
):
|
|
"""
|
|
Initialize complete embedding system.
|
|
|
|
Args:
|
|
vocab_size: Size of vocabulary
|
|
embed_dim: Embedding dimension
|
|
max_seq_len: Maximum sequence length for positional encoding
|
|
pos_encoding: Type of positional encoding ('learned', 'sinusoidal', or None)
|
|
scale_embeddings: Whether to scale embeddings by sqrt(embed_dim)
|
|
"""
|
|
self.vocab_size = vocab_size
|
|
self.embed_dim = embed_dim
|
|
self.max_seq_len = max_seq_len
|
|
self.pos_encoding_type = pos_encoding
|
|
self.scale_embeddings = scale_embeddings
|
|
|
|
# Token embedding layer
|
|
self.token_embedding = Embedding(vocab_size, embed_dim)
|
|
|
|
# Positional encoding
|
|
if pos_encoding == 'learned':
|
|
self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
|
|
elif pos_encoding == 'sinusoidal':
|
|
# Create fixed sinusoidal encodings (no parameters)
|
|
self.pos_encoding = create_sinusoidal_embeddings(max_seq_len, embed_dim)
|
|
elif pos_encoding is None:
|
|
self.pos_encoding = None
|
|
else:
|
|
raise ValueError(f"Unknown pos_encoding: {pos_encoding}. Use 'learned', 'sinusoidal', or None")
|
|
|
|
def forward(self, tokens: Tensor) -> Tensor:
|
|
"""
|
|
Forward pass through complete embedding system.
|
|
|
|
Args:
|
|
tokens: Token indices of shape (batch_size, seq_len) or (seq_len,)
|
|
|
|
Returns:
|
|
Embedded tokens with positional information
|
|
"""
|
|
# Handle 1D input by adding batch dimension
|
|
if len(tokens.shape) == 1:
|
|
tokens = Tensor(tokens.data[np.newaxis, :]) # (1, seq_len)
|
|
squeeze_batch = True
|
|
else:
|
|
squeeze_batch = False
|
|
|
|
# Get token embeddings
|
|
token_embeds = self.token_embedding.forward(tokens) # (batch, seq, embed)
|
|
|
|
# Scale embeddings if requested (transformer convention)
|
|
if self.scale_embeddings:
|
|
token_embeds = Tensor(token_embeds.data * math.sqrt(self.embed_dim))
|
|
|
|
# Add positional encoding
|
|
if self.pos_encoding_type == 'learned':
|
|
# Use learnable positional encoding
|
|
output = self.pos_encoding.forward(token_embeds)
|
|
elif self.pos_encoding_type == 'sinusoidal':
|
|
# Use fixed sinusoidal encoding
|
|
batch_size, seq_len, embed_dim = token_embeds.shape
|
|
pos_embeddings = self.pos_encoding.data[:seq_len] # (seq_len, embed_dim)
|
|
pos_embeddings = pos_embeddings[np.newaxis, :, :] # (1, seq_len, embed_dim)
|
|
output = Tensor(token_embeds.data + pos_embeddings)
|
|
else:
|
|
# No positional encoding
|
|
output = token_embeds
|
|
|
|
# Remove batch dimension if it was added
|
|
if squeeze_batch:
|
|
output = Tensor(output.data[0]) # (seq_len, embed_dim)
|
|
|
|
return output
|
|
|
|
def parameters(self) -> List[Tensor]:
|
|
"""Return all trainable parameters."""
|
|
params = self.token_embedding.parameters()
|
|
|
|
if self.pos_encoding_type == 'learned':
|
|
params.extend(self.pos_encoding.parameters())
|
|
|
|
return params
|
|
|
|
def __repr__(self):
|
|
return (f"EmbeddingLayer(vocab_size={self.vocab_size}, "
|
|
f"embed_dim={self.embed_dim}, "
|
|
f"pos_encoding='{self.pos_encoding_type}')")
|
|
### END SOLUTION
|