Mirror of https://github.com/MLSysBook/TinyTorch.git (synced 2026-05-10 16:38:39 -05:00)
📦 Module File Organization:
- Renamed networks_dev.py → dense_dev.py in 05_dense module
- Renamed cnn_dev.py → spatial_dev.py in 06_spatial module
- Added new 07_attention module with attention_dev.py
- Updated module.yaml files to reference correct filenames
- Updated #| default_exp directives for proper package exports

🔄 Core Package Updates:
- Added tinytorch.core.dense (Sequential, MLP architectures)
- Added tinytorch.core.spatial (Conv2D, pooling operations)
- Added tinytorch.core.attention (self-attention mechanisms)
- Updated all core modules with latest implementations
- Fixed tensor assignment issues in compression module

🧪 Test Integration Fixes:
- Updated integration tests to use correct module imports
- Fixed tensor activation tests for new module structure
- Ensured compatibility with renamed components
- Maintained 100% individual module test success rate

Result: Complete 14-module TinyTorch framework with proper organization, working integrations, and comprehensive test coverage, ready for production use.
335 lines
12 KiB
Python
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/07_attention/attention_dev.ipynb.

# %% auto 0
__all__ = ['scaled_dot_product_attention', 'SelfAttention', 'create_causal_mask', 'create_padding_mask',
           'create_bidirectional_mask']

# %% ../../modules/source/07_attention/attention_dev.ipynb 1
import numpy as np
import math
import sys
import os
from typing import List, Union, Optional, Tuple
import matplotlib.pyplot as plt

# Import our building blocks - try the package first, then local modules
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
    from tensor_dev import Tensor

# %% ../../modules/source/07_attention/attention_dev.ipynb 2
def _should_show_plots():
    """Check if we should show plots (disabled during testing)."""
    # Check multiple conditions that indicate we're in test mode
    is_pytest = (
        'pytest' in sys.modules or
        'test' in sys.argv or
        os.environ.get('PYTEST_CURRENT_TEST') is not None or
        any('test' in arg for arg in sys.argv) or
        any('pytest' in arg for arg in sys.argv)
    )

    # Show plots in development mode (when not in test mode)
    return not is_pytest

# %% ../../modules/source/07_attention/attention_dev.ipynb 7
def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
                                 mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Scaled Dot-Product Attention - the foundation of transformer models.

    This is the same mechanism used in GPT, BERT, and other modern language models.

    TODO: Implement the core attention mechanism.

    STEP-BY-STEP IMPLEMENTATION:
    1. Get d_k (dimension of keys) from Q.shape[-1]
    2. Compute attention scores: Q @ K^T (matrix multiplication)
    3. Scale by √d_k: scores / sqrt(d_k)
    4. Apply mask if provided: set masked positions to -1e9
    5. Apply softmax to get attention weights (probabilities)
    6. Apply attention weights to values: weights @ V
    7. Return (output, attention_weights)

    MATHEMATICAL OPERATION:
        Attention(Q, K, V) = softmax(QK^T / √d_k) V

    IMPLEMENTATION HINTS:
    - Use np.matmul() for matrix multiplication
    - Use np.swapaxes(K, -2, -1) to transpose the last two dimensions
    - Use math.sqrt() for the square root
    - Use np.where() for masking: np.where(mask == 0, -1e9, scores)
    - Implement softmax manually: exp(x) / sum(exp(x))
    - Use keepdims=True for broadcasting

    LEARNING CONNECTIONS:
    - This same computation powers GPT-style and BERT-style models
    - The scaling keeps the softmax from saturating, which would cause vanishing gradients
    - Masking enables causal (GPT) and bidirectional (BERT) models
    - Attention weights are interpretable - you can visualize them!

    Args:
        Q: Query matrix of shape (..., seq_len_q, d_k)
        K: Key matrix of shape (..., seq_len_k, d_k)
        V: Value matrix of shape (..., seq_len_v, d_v)
        mask: Optional mask of shape (..., seq_len_q, seq_len_k)

    Returns:
        output: Attention output (..., seq_len_q, d_v)
        attention_weights: Attention probabilities (..., seq_len_q, seq_len_k)
    """
    ### BEGIN SOLUTION
    # Get the key dimension used for scaling
    d_k = Q.shape[-1]

    # Step 1: Compute attention scores (QK^T)
    # This measures similarity between each query and each key
    scores = np.matmul(Q, np.swapaxes(K, -2, -1))  # (..., seq_len_q, seq_len_k)

    # Step 2: Scale by √d_k so large dot products do not saturate the softmax
    scores = scores / math.sqrt(d_k)

    # Step 3: Apply mask if provided (for padding or causality)
    if mask is not None:
        # Replace masked positions with a large negative value
        # so softmax assigns them ~0 probability
        scores = np.where(mask == 0, -1e9, scores)

    # Step 4: Apply softmax to get attention probabilities
    # Each row sums to 1, representing where to focus attention
    # (numerically stable: subtract the row-wise max before exponentiating)
    scores_max = np.max(scores, axis=-1, keepdims=True)
    scores_exp = np.exp(scores - scores_max)
    attention_weights = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)

    # Step 5: Apply attention weights to values
    # This gives the weighted combination of value vectors
    output = np.matmul(attention_weights, V)  # (..., seq_len_q, d_v)

    return output, attention_weights
    ### END SOLUTION

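# Illustrative usage sketch (not part of the exported API), assuming the solution
# above is filled in: run scaled_dot_product_attention on small random matrices and
# check the output shapes and that each row of attention weights sums to 1.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    seq_len, d_k, d_v = 4, 8, 8
    Q_demo = rng.standard_normal((seq_len, d_k))
    K_demo = rng.standard_normal((seq_len, d_k))
    V_demo = rng.standard_normal((seq_len, d_v))

    out_demo, attn_demo = scaled_dot_product_attention(Q_demo, K_demo, V_demo)
    assert out_demo.shape == (seq_len, d_v)
    assert attn_demo.shape == (seq_len, seq_len)
    # Each query's attention weights form a probability distribution
    assert np.allclose(attn_demo.sum(axis=-1), 1.0)
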
# %% ../../modules/source/07_attention/attention_dev.ipynb 11
class SelfAttention:
    """
    Self-Attention wrapper - a convenience class for self-attention, where Q = K = V.

    This is the most common use case in transformer models: each position
    attends to all positions in the same sequence.
    """

    def __init__(self, d_model: int):
        """
        Initialize Self-Attention.

        TODO: Store the model dimension for this self-attention layer.

        STEP-BY-STEP IMPLEMENTATION:
        1. Store d_model as an instance variable (self.d_model)
        2. Print an initialization message for debugging

        EXAMPLE USAGE:
        ```python
        self_attn = SelfAttention(d_model=64)
        output, weights = self_attn(input_sequence)
        ```

        IMPLEMENTATION HINTS:
        - Simply store the d_model parameter: self.d_model = d_model
        - Print a message: print(f"🔧 SelfAttention: d_model={d_model}")

        LEARNING CONNECTIONS:
        - This is like nn.MultiheadAttention in PyTorch (but simpler)
        - Used in every transformer layer for self-attention
        - Foundation for understanding GPT and BERT architectures

        Args:
            d_model: Model dimension
        """
        ### BEGIN SOLUTION
        self.d_model = d_model
        print(f"🔧 SelfAttention: d_model={d_model}")
        ### END SOLUTION

    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
        """
        Forward pass of self-attention.

        TODO: Apply self-attention where Q = K = V = x.

        STEP-BY-STEP IMPLEMENTATION:
        1. Call scaled_dot_product_attention with Q = K = V = x
        2. Pass the mask parameter through
        3. Return the output and attention weights

        EXAMPLE USAGE:
        ```python
        x = np.random.randn(seq_len, d_model)  # Input sequence
        output, weights = self_attn.forward(x)
        # weights[i, j] = how much position i attends to position j
        ```

        IMPLEMENTATION HINTS:
        - Use the function you implemented above
        - Self-attention means: Q = K = V = x
        - Return: scaled_dot_product_attention(x, x, x, mask)

        LEARNING CONNECTIONS:
        - This is how transformers process sequences
        - Each position can attend to any other position
        - Enables modeling of long-range dependencies

        Args:
            x: Input tensor (..., seq_len, d_model)
            mask: Optional attention mask

        Returns:
            output: Self-attention output (..., seq_len, d_model)
            attention_weights: Attention weights
        """
        ### BEGIN SOLUTION
        # Self-attention: Q = K = V = x
        return scaled_dot_product_attention(x, x, x, mask)
        ### END SOLUTION

    def __call__(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
        """Make the class callable."""
        return self.forward(x, mask)

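# Illustrative usage sketch (not part of the exported API), assuming the SelfAttention
# solution above is complete: run self-attention over a short random sequence and check
# that the attention weights form one probability distribution per position.
if __name__ == "__main__":
    rng = np.random.default_rng(1)
    x_demo = rng.standard_normal((5, 16))            # (seq_len, d_model)
    self_attn_demo = SelfAttention(d_model=16)
    out_demo, weights_demo = self_attn_demo(x_demo)  # __call__ dispatches to forward()
    assert out_demo.shape == x_demo.shape
    assert np.allclose(weights_demo.sum(axis=-1), 1.0)
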
# %% ../../modules/source/07_attention/attention_dev.ipynb 15
def create_causal_mask(seq_len: int) -> np.ndarray:
    """
    Create a causal (lower triangular) mask for autoregressive models.

    Used in models like GPT, where each position can only attend to
    previous positions, not future ones.

    TODO: Create a lower triangular matrix of ones.

    STEP-BY-STEP IMPLEMENTATION:
    1. Create a matrix of ones with shape (seq_len, seq_len)
    2. Use np.tril() to keep only the lower triangular part
    3. Return the result

    EXAMPLE USAGE:
    ```python
    mask = create_causal_mask(4)
    # mask = [[1, 0, 0, 0],
    #         [1, 1, 0, 0],
    #         [1, 1, 1, 0],
    #         [1, 1, 1, 1]]
    ```

    IMPLEMENTATION HINTS:
    - Use np.ones((seq_len, seq_len)) to create a matrix of ones
    - Use np.tril() to get the lower triangular part
    - Or combine them: np.tril(np.ones((seq_len, seq_len)))

    LEARNING CONNECTIONS:
    - Used in GPT for autoregressive generation
    - Prevents looking at future tokens during training
    - Essential for language modeling tasks

    Args:
        seq_len: Sequence length

    Returns:
        mask: Causal mask (seq_len, seq_len) with 1s for allowed positions, 0s for blocked
    """
    ### BEGIN SOLUTION
    return np.tril(np.ones((seq_len, seq_len)))
    ### END SOLUTION

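# Illustrative usage sketch (not part of the exported API): combine the causal mask with
# scaled_dot_product_attention and confirm that no position attends to the future, i.e.
# the attention-weight matrix is lower triangular.
if __name__ == "__main__":
    rng = np.random.default_rng(2)
    x_causal = rng.standard_normal((4, 8))
    causal_mask_demo = create_causal_mask(4)
    _, causal_weights = scaled_dot_product_attention(x_causal, x_causal, x_causal,
                                                     mask=causal_mask_demo)
    # Future (upper triangular) positions receive ~0 attention weight
    assert np.allclose(causal_weights, np.tril(causal_weights))
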
#| export
def create_padding_mask(lengths: List[int], max_length: int) -> np.ndarray:
    """
    Create a padding mask for variable-length sequences.

    TODO: Create a mask that ignores padding tokens.

    STEP-BY-STEP IMPLEMENTATION:
    1. Initialize a zero array with shape (batch_size, max_length, max_length)
    2. For each sequence in the batch, set the valid positions to 1
    3. Valid positions are [:length, :length] for each sequence
    4. Return the mask array

    EXAMPLE USAGE:
    ```python
    lengths = [3, 2, 4]  # Actual sequence lengths
    mask = create_padding_mask(lengths, max_length=4)
    # For sequence 0 (length=3): positions [0, 1, 2] can attend to [0, 1, 2]
    # For sequence 1 (length=2): positions [0, 1] can attend to [0, 1]
    ```

    IMPLEMENTATION HINTS:
    - batch_size = len(lengths)
    - Use np.zeros((batch_size, max_length, max_length))
    - Loop over lengths: for i, length in enumerate(lengths)
    - Set the valid region: mask[i, :length, :length] = 1

    LEARNING CONNECTIONS:
    - Used when sequences in a batch have different lengths
    - Prevents attention to padding tokens
    - Essential for efficient batch processing

    Args:
        lengths: List of actual sequence lengths
        max_length: Maximum sequence length (padded length)

    Returns:
        mask: Padding mask (batch_size, max_length, max_length)
    """
    ### BEGIN SOLUTION
    batch_size = len(lengths)
    mask = np.zeros((batch_size, max_length, max_length))

    for i, length in enumerate(lengths):
        mask[i, :length, :length] = 1

    return mask
    ### END SOLUTION

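# Illustrative usage sketch (not part of the exported API): build a padding mask for a
# batch of unequal-length sequences and confirm that padded key positions receive no
# attention from valid query positions.
if __name__ == "__main__":
    rng = np.random.default_rng(3)
    lengths_demo, max_len_demo, d_model_demo = [3, 2], 4, 8
    batch_demo = rng.standard_normal((len(lengths_demo), max_len_demo, d_model_demo))
    pad_mask_demo = create_padding_mask(lengths_demo, max_len_demo)
    _, pad_weights = scaled_dot_product_attention(batch_demo, batch_demo, batch_demo,
                                                  mask=pad_mask_demo)
    # For the first sequence (length 3), the padded key at index 3 gets ~0 weight
    # from every valid query position.
    assert np.allclose(pad_weights[0, :3, 3], 0.0)
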
#| export
def create_bidirectional_mask(seq_len: int) -> np.ndarray:
    """
    Create a bidirectional mask in which every position can attend to every position.

    Used in models like BERT for bidirectional context understanding.

    TODO: Create a matrix of all ones.

    STEP-BY-STEP IMPLEMENTATION:
    1. Use np.ones() to create a matrix of all ones
    2. The shape should be (seq_len, seq_len)
    3. Return the matrix

    EXAMPLE USAGE:
    ```python
    mask = create_bidirectional_mask(3)
    # mask = [[1, 1, 1],
    #         [1, 1, 1],
    #         [1, 1, 1]]
    ```

    IMPLEMENTATION HINTS:
    - Very simple: np.ones((seq_len, seq_len))
    - All positions can attend to all positions

    LEARNING CONNECTIONS:
    - Used in BERT for bidirectional understanding
    - Allows attending to both past and future context
    - Suited to understanding tasks, not autoregressive generation

    Args:
        seq_len: Sequence length

    Returns:
        mask: All-ones mask (seq_len, seq_len)
    """
    ### BEGIN SOLUTION
    return np.ones((seq_len, seq_len))
    ### END SOLUTION
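
# Illustrative usage sketch (not part of the exported API): with a bidirectional mask,
# attention behaves exactly as with no mask at all, since every position is allowed.
if __name__ == "__main__":
    rng = np.random.default_rng(4)
    x_bidir = rng.standard_normal((4, 8))
    bidir_mask_demo = create_bidirectional_mask(4)
    out_masked, w_masked = scaled_dot_product_attention(x_bidir, x_bidir, x_bidir,
                                                        mask=bidir_mask_demo)
    out_plain, w_plain = scaled_dot_product_attention(x_bidir, x_bidir, x_bidir)
    assert np.allclose(out_masked, out_plain) and np.allclose(w_masked, w_plain)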