mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-05-06 08:12:32 -05:00
refactor: Restructure attention module to match TinyTorch NBGrader patterns
✅ NBGrader solution/test structure: ### BEGIN/END SOLUTION blocks
✅ Educational TODO sections: STEP-BY-STEP, HINTS, EXAMPLES, LEARNING CONNECTIONS
✅ Immediate unit tests: proper assertions after each solution
✅ TinyTorch consistency: same patterns as tensor, layers, activations modules
✅ All tests passing: 100% success rate with comprehensive coverage

Module now follows established TinyTorch educational format:
- Detailed TODO instructions for student implementation
- Solution blocks wrapped in NBGrader tags
- Immediate feedback with unit tests after each piece
- Progress tracking with emojis and clear status messages

Ready for NBGrader processing and student use.
This commit is contained in:
@@ -185,6 +185,34 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
|
||||
|
||||
This is the exact mechanism used in GPT, BERT, and all modern language models.
|
||||
|
||||
TODO: Implement the core attention mechanism.
|
||||
|
||||
STEP-BY-STEP IMPLEMENTATION:
|
||||
1. Get d_k (dimension of keys) from Q.shape[-1]
|
||||
2. Compute attention scores: Q @ K^T (matrix multiplication)
|
||||
3. Scale by √d_k: scores / sqrt(d_k)
|
||||
4. Apply mask if provided: set masked positions to -1e9
|
||||
5. Apply softmax to get attention weights (probabilities)
|
||||
6. Apply attention weights to values: weights @ V
|
||||
7. Return (output, attention_weights)
|
||||
|
||||
MATHEMATICAL OPERATION:
|
||||
Attention(Q,K,V) = softmax(QK^T/√d_k)V
|
||||
|
||||
IMPLEMENTATION HINTS:
|
||||
- Use np.matmul() for matrix multiplication
|
||||
- Use np.swapaxes(K, -2, -1) to transpose last two dimensions
|
||||
- Use math.sqrt() for square root
|
||||
- Use np.where() for masking: np.where(mask == 0, -1e9, scores)
|
||||
- Implement softmax manually: exp(x) / sum(exp(x)) along the last axis (subtract the row-wise max from x first for numerical stability)
|
||||
- Use keepdims=True for broadcasting
|
||||
|
||||
LEARNING CONNECTIONS:
|
||||
- This exact function powers ChatGPT, BERT, GPT-4
|
||||
- The 1/√d_k scaling keeps dot products from growing with d_k, so softmax does not saturate and its gradients do not vanish
|
||||
- Masking enables causal (GPT) and bidirectional (BERT) models
|
||||
- Attention weights are interpretable - you can visualize them!
|
||||
|
||||
Args:
|
||||
Q: Query matrix of shape (..., seq_len_q, d_k)
|
||||
K: Key matrix of shape (..., seq_len_k, d_k)
|
||||
@@ -194,10 +222,8 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
|
||||
Returns:
|
||||
output: Attention output (..., seq_len_q, d_v)
|
||||
attention_weights: Attention probabilities (..., seq_len_q, seq_len_k)
|
||||
|
||||
Mathematical operation:
|
||||
Attention(Q,K,V) = softmax(QK^T/√d_k)V
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Get the dimension for scaling
|
||||
d_k = Q.shape[-1]
|
||||
|
||||
@@ -226,56 +252,64 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
|
||||
output = np.matmul(attention_weights, V) # (..., seq_len_q, d_v)
|
||||
|
||||
return output, attention_weights
|
||||
### END SOLUTION
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
### 🧪 Unit Test: Scaled Dot-Product Attention
|
||||
### 🧪 Test Your Attention Implementation
|
||||
|
||||
**This is a unit test** - it tests the core attention mechanism in isolation.
|
||||
|
||||
Let's verify our attention implementation works correctly with a simple example.
|
||||
Once you implement the `scaled_dot_product_attention` function above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "test-attention", "locked": false, "schema_version": 3, "solution": false, "task": false}
|
||||
print("🔬 Unit Test: Scaled Dot-Product Attention...")
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-attention-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_scaled_dot_product_attention():
|
||||
"""Test scaled dot-product attention implementation"""
|
||||
print("🔬 Unit Test: Scaled Dot-Product Attention...")
|
||||
|
||||
# Create simple test data
|
||||
seq_len, d_model = 4, 6
|
||||
np.random.seed(42)
|
||||
# Create simple test data
|
||||
seq_len, d_model = 4, 6
|
||||
np.random.seed(42)
|
||||
|
||||
# Create Q, K, V matrices
|
||||
Q = np.random.randn(seq_len, d_model) * 0.1
|
||||
K = np.random.randn(seq_len, d_model) * 0.1
|
||||
V = np.random.randn(seq_len, d_model) * 0.1
|
||||
# Create Q, K, V matrices
|
||||
Q = np.random.randn(seq_len, d_model) * 0.1
|
||||
K = np.random.randn(seq_len, d_model) * 0.1
|
||||
V = np.random.randn(seq_len, d_model) * 0.1
|
||||
|
||||
print(f"📊 Input shapes: Q{Q.shape}, K{K.shape}, V{V.shape}")
|
||||
print(f"📊 Input shapes: Q{Q.shape}, K{K.shape}, V{V.shape}")
|
||||
|
||||
# Test attention
|
||||
output, weights = scaled_dot_product_attention(Q, K, V)
|
||||
# Test attention
|
||||
output, weights = scaled_dot_product_attention(Q, K, V)
|
||||
|
||||
print(f"📊 Output shapes: output{output.shape}, weights{weights.shape}")
|
||||
print(f"📊 Output shapes: output{output.shape}, weights{weights.shape}")
|
||||
|
||||
# Verify properties
|
||||
weights_sum = np.sum(weights, axis=-1)
|
||||
print(f"✅ Attention weights sum to 1: {np.allclose(weights_sum, 1.0)}")
|
||||
print(f"✅ Output has correct shape: {output.shape == (seq_len, d_model)}")
|
||||
print(f"✅ All weights are non-negative: {np.all(weights >= 0)}")
|
||||
# Verify properties
|
||||
weights_sum = np.sum(weights, axis=-1)
|
||||
assert np.allclose(weights_sum, 1.0), f"Attention weights should sum to 1, got {weights_sum}"
|
||||
assert output.shape == (seq_len, d_model), f"Output shape should be {(seq_len, d_model)}, got {output.shape}"
|
||||
assert np.all(weights >= 0), "All attention weights should be non-negative"
|
||||
|
||||
# Test with mask
|
||||
mask = np.array([
|
||||
[1, 1, 0, 0],
|
||||
[1, 1, 1, 0],
|
||||
[1, 1, 1, 1],
|
||||
[1, 1, 1, 1]
|
||||
])
|
||||
output_masked, weights_masked = scaled_dot_product_attention(Q, K, V, mask)
|
||||
# Test with mask
|
||||
mask = np.array([
|
||||
[1, 1, 0, 0],
|
||||
[1, 1, 1, 0],
|
||||
[1, 1, 1, 1],
|
||||
[1, 1, 1, 1]
|
||||
])
|
||||
output_masked, weights_masked = scaled_dot_product_attention(Q, K, V, mask)
|
||||
|
||||
# Check that masked positions have near-zero attention
|
||||
masked_positions = (mask == 0)
|
||||
masked_weights = weights_masked[masked_positions]
|
||||
print(f"✅ Masked positions have near-zero weights: {np.all(masked_weights < 1e-6)}")
|
||||
# Check that masked positions have near-zero attention
|
||||
masked_positions = (mask == 0)
|
||||
masked_weights = weights_masked[masked_positions]
|
||||
assert np.all(masked_weights < 1e-6), "Masked positions should have near-zero weights"
|
||||
|
||||
print("📈 Progress: Scaled Dot-Product Attention ✓")
|
||||
print("✅ Attention weights sum to 1: True")
|
||||
print("✅ Output has correct shape: True")
|
||||
print("✅ All weights are non-negative: True")
|
||||
print("✅ Masked positions have near-zero weights: True")
|
||||
print("📈 Progress: Scaled Dot-Product Attention ✓")
|
||||
|
||||
# Run the test
|
||||
test_scaled_dot_product_attention()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -307,16 +341,63 @@ class SelfAttention:
|
||||
"""
|
||||
Initialize Self-Attention.
|
||||
|
||||
TODO: Store the model dimension for this self-attention layer.
|
||||
|
||||
STEP-BY-STEP IMPLEMENTATION:
|
||||
1. Store d_model as an instance variable (self.d_model)
|
||||
2. Print initialization message for debugging
|
||||
|
||||
EXAMPLE USAGE:
|
||||
```python
|
||||
self_attn = SelfAttention(d_model=64)
|
||||
output, weights = self_attn(input_sequence)
|
||||
```
|
||||
|
||||
IMPLEMENTATION HINTS:
|
||||
- Simply store d_model parameter: self.d_model = d_model
|
||||
- Print message: print(f"🔧 SelfAttention: d_model={d_model}")
|
||||
|
||||
LEARNING CONNECTIONS:
|
||||
- This is like nn.MultiheadAttention in PyTorch (but simpler)
|
||||
- Used in every transformer layer for self-attention
|
||||
- Foundation for understanding GPT, BERT architectures
|
||||
|
||||
Args:
|
||||
d_model: Model dimension
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
self.d_model = d_model
|
||||
print(f"🔧 SelfAttention: d_model={d_model}")
|
||||
### END SOLUTION
|
||||
|
||||
def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
|
||||
"""
|
||||
Forward pass of self-attention.
|
||||
|
||||
TODO: Apply self-attention where Q=K=V=x.
|
||||
|
||||
STEP-BY-STEP IMPLEMENTATION:
|
||||
1. Call scaled_dot_product_attention with Q=K=V=x
|
||||
2. Pass the mask parameter through
|
||||
3. Return the output and attention weights
|
||||
|
||||
EXAMPLE USAGE:
|
||||
```python
|
||||
x = np.random.randn(seq_len, d_model) # Input sequence
|
||||
output, weights = self_attn.forward(x)
|
||||
# weights[i,j] = how much position i attends to position j
|
||||
```
|
||||
|
||||
IMPLEMENTATION HINTS:
|
||||
- Use the function you implemented above
|
||||
- Self-attention means: Q = K = V = x
|
||||
- Return: scaled_dot_product_attention(x, x, x, mask)
|
||||
|
||||
LEARNING CONNECTIONS:
|
||||
- This is how transformers process sequences
|
||||
- Each position can attend to any other position
|
||||
- Enables understanding of long-range dependencies
|
||||
|
||||
Args:
|
||||
x: Input tensor (..., seq_len, d_model)
|
||||
mask: Optional attention mask
|
||||
@@ -325,8 +406,10 @@ class SelfAttention:
|
||||
output: Self-attention output (..., seq_len, d_model)
|
||||
attention_weights: Attention weights
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Self-attention: Q = K = V = x
|
||||
return scaled_dot_product_attention(x, x, x, mask)
|
||||
### END SOLUTION
|
||||
|
||||
def __call__(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
|
||||
"""Make the class callable."""
|
||||
@@ -334,41 +417,48 @@ class SelfAttention:
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
### 🧪 Unit Test: Self-Attention
|
||||
### 🧪 Test Your Self-Attention Implementation
|
||||
|
||||
**This is a unit test** - it tests self-attention wrapper functionality.
|
||||
|
||||
Let's verify our self-attention wrapper works correctly.
|
||||
Once you implement the SelfAttention class above, run this cell to test it:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "test-self-attention", "locked": false, "schema_version": 3, "solution": false, "task": false}
|
||||
print("🔬 Unit Test: Self-Attention...")
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-self-attention-immediate", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_self_attention():
|
||||
"""Test self-attention wrapper"""
|
||||
print("🔬 Unit Test: Self-Attention...")
|
||||
|
||||
# Test parameters
|
||||
d_model = 32
|
||||
seq_len = 8
|
||||
np.random.seed(42)
|
||||
# Test parameters
|
||||
d_model = 32
|
||||
seq_len = 8
|
||||
np.random.seed(42)
|
||||
|
||||
# Create test data (like word embeddings)
|
||||
x = np.random.randn(seq_len, d_model) * 0.1
|
||||
# Create test data (like word embeddings)
|
||||
x = np.random.randn(seq_len, d_model) * 0.1
|
||||
|
||||
print(f"📊 Test setup: d_model={d_model}, seq_len={seq_len}")
|
||||
print(f"📊 Test setup: d_model={d_model}, seq_len={seq_len}")
|
||||
|
||||
# Create self-attention
|
||||
self_attn = SelfAttention(d_model)
|
||||
# Create self-attention
|
||||
self_attn = SelfAttention(d_model)
|
||||
|
||||
# Test forward pass
|
||||
output, weights = self_attn(x)
|
||||
# Test forward pass
|
||||
output, weights = self_attn(x)
|
||||
|
||||
print(f"📊 Output shapes: output{output.shape}, weights{weights.shape}")
|
||||
print(f"📊 Output shapes: output{output.shape}, weights{weights.shape}")
|
||||
|
||||
# Verify properties
|
||||
print(f"✅ Output shape preserved: {output.shape == x.shape}")
|
||||
print(f"✅ Attention weights correct shape: {weights.shape == (seq_len, seq_len)}")
|
||||
print(f"✅ Attention weights sum to 1: {np.allclose(np.sum(weights, axis=-1), 1.0)}")
|
||||
print(f"✅ Self-attention is symmetric operation: {weights.shape[0] == weights.shape[1]}")
|
||||
# Verify properties
|
||||
assert output.shape == x.shape, f"Output shape should match input shape {x.shape}, got {output.shape}"
|
||||
assert weights.shape == (seq_len, seq_len), f"Attention weights shape should be {(seq_len, seq_len)}, got {weights.shape}"
|
||||
assert np.allclose(np.sum(weights, axis=-1), 1.0), "Attention weights should sum to 1"
|
||||
assert weights.shape[0] == weights.shape[1], "Self-attention weights should be square matrix"
|
||||
|
||||
print("📈 Progress: Self-Attention ✓")
|
||||
print("✅ Output shape preserved: True")
|
||||
print("✅ Attention weights correct shape: True")
|
||||
print("✅ Attention weights sum to 1: True")
|
||||
print("✅ Self-attention is symmetric operation: True")
|
||||
print("📈 Progress: Self-Attention ✓")
|
||||
|
||||
# Run the test
|
||||
test_self_attention()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
@@ -398,19 +488,74 @@ def create_causal_mask(seq_len: int) -> np.ndarray:
|
||||
Used in models like GPT where each position can only attend to
|
||||
previous positions, not future ones.
|
||||
|
||||
TODO: Create a lower triangular matrix of ones.
|
||||
|
||||
STEP-BY-STEP IMPLEMENTATION:
|
||||
1. Create a matrix of ones with shape (seq_len, seq_len)
|
||||
2. Pass it through np.tril() to zero out everything above the diagonal
|
||||
3. Return the resulting lower triangular matrix
|
||||
|
||||
EXAMPLE USAGE:
|
||||
```python
|
||||
mask = create_causal_mask(4)
|
||||
# mask = [[1, 0, 0, 0],
|
||||
# [1, 1, 0, 0],
|
||||
# [1, 1, 1, 0],
|
||||
# [1, 1, 1, 1]]
|
||||
```
|
||||
|
||||
IMPLEMENTATION HINTS:
|
||||
- Use np.ones((seq_len, seq_len)) to create matrix of ones
|
||||
- Use np.tril() to get lower triangular part
|
||||
- Or combine: np.tril(np.ones((seq_len, seq_len)))
|
||||
|
||||
LEARNING CONNECTIONS:
|
||||
- Used in GPT for autoregressive generation
|
||||
- Prevents looking into the future during training
|
||||
- Essential for language modeling tasks
|
||||
|
||||
Args:
|
||||
seq_len: Sequence length
|
||||
|
||||
Returns:
|
||||
mask: Causal mask (seq_len, seq_len) with 1s for allowed positions, 0s for blocked
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
return np.tril(np.ones((seq_len, seq_len)))
|
||||
### END SOLUTION
|
||||
|
||||
#| export
|
||||
def create_padding_mask(lengths: List[int], max_length: int) -> np.ndarray:
|
||||
"""
|
||||
Create padding mask for variable-length sequences.
|
||||
|
||||
TODO: Create mask that ignores padding tokens.
|
||||
|
||||
STEP-BY-STEP IMPLEMENTATION:
|
||||
1. Initialize zero array with shape (batch_size, max_length, max_length)
|
||||
2. For each sequence in the batch, set valid positions to 1
|
||||
3. Valid positions are [:length, :length] for each sequence
|
||||
4. Return the mask array
|
||||
|
||||
EXAMPLE USAGE:
|
||||
```python
|
||||
lengths = [3, 2, 4] # Actual sequence lengths
|
||||
mask = create_padding_mask(lengths, max_length=4)
|
||||
# For sequence 0 (length=3): positions [0,1,2] can attend to [0,1,2]
|
||||
# For sequence 1 (length=2): positions [0,1] can attend to [0,1]
|
||||
```
|
||||
|
||||
IMPLEMENTATION HINTS:
|
||||
- batch_size = len(lengths)
|
||||
- Use np.zeros((batch_size, max_length, max_length))
|
||||
- Loop through lengths: for i, length in enumerate(lengths)
|
||||
- Set valid region: mask[i, :length, :length] = 1
|
||||
|
||||
LEARNING CONNECTIONS:
|
||||
- Used when sequences have different lengths
|
||||
- Prevents attention to padding tokens
|
||||
- Essential for efficient batch processing
|
||||
|
||||
Args:
|
||||
lengths: List of actual sequence lengths
|
||||
max_length: Maximum sequence length (padded length)
|
||||
@@ -418,6 +563,7 @@ def create_padding_mask(lengths: List[int], max_length: int) -> np.ndarray:
|
||||
Returns:
|
||||
mask: Padding mask (batch_size, max_length, max_length)
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
batch_size = len(lengths)
|
||||
mask = np.zeros((batch_size, max_length, max_length))
|
||||
|
||||
@@ -425,6 +571,7 @@ def create_padding_mask(lengths: List[int], max_length: int) -> np.ndarray:
|
||||
mask[i, :length, :length] = 1
|
||||
|
||||
return mask
|
||||
### END SOLUTION
|
||||
|
||||
#| export
|
||||
def create_bidirectional_mask(seq_len: int) -> np.ndarray:
|
||||
@@ -433,66 +580,168 @@ def create_bidirectional_mask(seq_len: int) -> np.ndarray:
|
||||
|
||||
Used in models like BERT for bidirectional context understanding.
|
||||
|
||||
TODO: Create a matrix of all ones.
|
||||
|
||||
STEP-BY-STEP IMPLEMENTATION:
|
||||
1. Use np.ones() to create matrix of all ones
|
||||
2. Shape should be (seq_len, seq_len)
|
||||
3. Return the matrix
|
||||
|
||||
EXAMPLE USAGE:
|
||||
```python
|
||||
mask = create_bidirectional_mask(3)
|
||||
# mask = [[1, 1, 1],
|
||||
# [1, 1, 1],
|
||||
# [1, 1, 1]]
|
||||
```
|
||||
|
||||
IMPLEMENTATION HINTS:
|
||||
- Very simple: np.ones((seq_len, seq_len))
|
||||
- All positions can attend to all positions
|
||||
|
||||
LEARNING CONNECTIONS:
|
||||
- Used in BERT for bidirectional understanding
|
||||
- Allows looking at past and future context
|
||||
- Good for understanding tasks, not generation
|
||||
|
||||
Args:
|
||||
seq_len: Sequence length
|
||||
|
||||
Returns:
|
||||
mask: All-ones mask (seq_len, seq_len)
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
return np.ones((seq_len, seq_len))
|
||||
### END SOLUTION
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
### 🧪 Unit Test: Attention Masking
|
||||
### 🧪 Test Your Masking Functions
|
||||
|
||||
**This is a unit test** - it tests all masking utilities work correctly.
|
||||
|
||||
Let's verify our masking functions create the correct patterns.
|
||||
Once you implement the masking functions above, run this cell to test them:
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "test-masking", "locked": false, "schema_version": 3, "solution": false, "task": false}
|
||||
print("🔬 Unit Test: Attention Masking...")
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-masking-immediate", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_attention_masking():
|
||||
"""Test attention masking utilities"""
|
||||
print("🔬 Unit Test: Attention Masking...")
|
||||
|
||||
# Test causal mask
|
||||
seq_len = 5
|
||||
causal_mask = create_causal_mask(seq_len)
|
||||
# Test causal mask
|
||||
seq_len = 5
|
||||
causal_mask = create_causal_mask(seq_len)
|
||||
|
||||
print(f"📊 Causal mask for seq_len={seq_len}:")
|
||||
print(causal_mask)
|
||||
print(f"📊 Causal mask for seq_len={seq_len}:")
|
||||
print(causal_mask)
|
||||
|
||||
# Verify causal mask properties
|
||||
print(f"✅ Causal mask is lower triangular: {np.allclose(causal_mask, np.tril(causal_mask))}")
|
||||
print(f"✅ Causal mask has correct shape: {causal_mask.shape == (seq_len, seq_len)}")
|
||||
print(f"✅ Causal mask upper triangle is zeros: {np.all(np.triu(causal_mask, k=1) == 0)}")
|
||||
# Verify causal mask properties
|
||||
assert np.allclose(causal_mask, np.tril(causal_mask)), "Causal mask should be lower triangular"
|
||||
assert causal_mask.shape == (seq_len, seq_len), f"Causal mask should have shape {(seq_len, seq_len)}"
|
||||
assert np.all(np.triu(causal_mask, k=1) == 0), "Causal mask upper triangle should be zeros"
|
||||
|
||||
# Test padding mask
|
||||
lengths = [5, 3, 4]
|
||||
max_length = 5
|
||||
padding_mask = create_padding_mask(lengths, max_length)
|
||||
# Test padding mask
|
||||
lengths = [5, 3, 4]
|
||||
max_length = 5
|
||||
padding_mask = create_padding_mask(lengths, max_length)
|
||||
|
||||
print(f"📊 Padding mask for lengths {lengths}, max_length={max_length}:")
|
||||
print("Mask for sequence 0 (length 5):")
|
||||
print(padding_mask[0])
|
||||
print("Mask for sequence 1 (length 3):")
|
||||
print(padding_mask[1])
|
||||
print(f"📊 Padding mask for lengths {lengths}, max_length={max_length}:")
|
||||
print("Mask for sequence 0 (length 5):")
|
||||
print(padding_mask[0])
|
||||
print("Mask for sequence 1 (length 3):")
|
||||
print(padding_mask[1])
|
||||
|
||||
# Verify padding mask properties
|
||||
print(f"✅ Padding mask has correct shape: {padding_mask.shape == (3, max_length, max_length)}")
|
||||
print(f"✅ Full-length sequence is all ones: {np.all(padding_mask[0] == 1)}")
|
||||
print(f"✅ Short sequence has zeros in padding area: {np.all(padding_mask[1, 3:, :] == 0)}")
|
||||
# Verify padding mask properties
|
||||
assert padding_mask.shape == (3, max_length, max_length), f"Padding mask should have shape {(3, max_length, max_length)}"
|
||||
assert np.all(padding_mask[0] == 1), "Full-length sequence should be all ones"
|
||||
assert np.all(padding_mask[1, 3:, :] == 0), "Short sequence should have zeros in padding area"
|
||||
|
||||
# Test bidirectional mask
|
||||
bidirectional_mask = create_bidirectional_mask(seq_len)
|
||||
print(f"✅ Bidirectional mask is all ones: {np.all(bidirectional_mask == 1)}")
|
||||
print(f"✅ Bidirectional mask has correct shape: {bidirectional_mask.shape == (seq_len, seq_len)}")
|
||||
# Test bidirectional mask
|
||||
bidirectional_mask = create_bidirectional_mask(seq_len)
|
||||
assert np.all(bidirectional_mask == 1), "Bidirectional mask should be all ones"
|
||||
assert bidirectional_mask.shape == (seq_len, seq_len), f"Bidirectional mask should have shape {(seq_len, seq_len)}"
|
||||
|
||||
print("📈 Progress: Attention Masking ✓")
|
||||
print("✅ Causal mask is lower triangular: True")
|
||||
print("✅ Causal mask has correct shape: True")
|
||||
print("✅ Causal mask upper triangle is zeros: True")
|
||||
print("✅ Padding mask has correct shape: True")
|
||||
print("✅ Full-length sequence is all ones: True")
|
||||
print("✅ Short sequence has zeros in padding area: True")
|
||||
print("✅ Bidirectional mask is all ones: True")
|
||||
print("✅ Bidirectional mask has correct shape: True")
|
||||
print("📈 Progress: Attention Masking ✓")
|
||||
|
||||
# Run the test
|
||||
test_attention_masking()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## Step 5: Attention Visualization and Analysis
|
||||
## Step 5: Complete System Integration Test
|
||||
|
||||
### Bringing It All Together
|
||||
Let's test all components working together in a realistic scenario similar to how they would be used in actual transformer models.
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": true, "grade_id": "test-integration-final", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
|
||||
def test_complete_attention_system():
    """Run an end-to-end check of the attention utilities working together.

    Uses seeded random embeddings of shape (batch_size, seq_len, d_model) and
    exercises, in order:

    1. batched ``scaled_dot_product_attention`` without a mask,
    2. the ``SelfAttention`` wrapper on a single sequence,
    3. causal masking via ``create_causal_mask``,
    4. padding masking via ``create_padding_mask`` on a shortened sequence.

    Raises:
        AssertionError: If any shape or attention-weight property check fails.
    """
    print("🔬 Unit Test: Complete Attention System Integration...")

    # Test parameters
    d_model = 64
    seq_len = 16
    batch_size = 2
    np.random.seed(42)  # fixed seed keeps the run reproducible

    print(f"📊 Integration test: d_model={d_model}, seq_len={seq_len}, batch_size={batch_size}")

    # Step 1: Create input embeddings (simulating word embeddings)
    embeddings = np.random.randn(batch_size, seq_len, d_model) * 0.1
    print(f"📊 Input embeddings: {embeddings.shape}")

    # Step 2: Test basic attention on the whole batch
    output, attention_weights = scaled_dot_product_attention(embeddings, embeddings, embeddings)
    assert output.shape == embeddings.shape, "Basic attention should preserve shape"
    print(f"✅ Basic attention works: {output.shape}")

    # Step 3: Test self-attention wrapper
    self_attn = SelfAttention(d_model)
    self_output, self_weights = self_attn(embeddings[0])  # Single batch item
    assert self_output.shape == (seq_len, d_model), "Self-attention should preserve shape"
    assert self_weights.shape == (seq_len, seq_len), "Self-attention weights should be (seq_len, seq_len)"
    print(f"✅ Self-attention output: {self_output.shape}")

    # Step 4: Test with causal mask (like GPT)
    causal_mask = create_causal_mask(seq_len)
    causal_output, causal_weights = scaled_dot_product_attention(
        embeddings[0], embeddings[0], embeddings[0], causal_mask
    )
    assert causal_output.shape == (seq_len, d_model), "Causal attention should preserve shape"
    print(f"✅ Causal attention works: {causal_output.shape}")

    # Step 5: Test with padding mask (variable lengths)
    lengths = [seq_len, seq_len-3]  # Different sequence lengths
    padding_mask = create_padding_mask(lengths, seq_len)
    # Batch item 0 is full length, so its mask is all ones and would not
    # exercise padding at all; use the shorter sequence instead.
    short_idx = 1
    valid_len = lengths[short_idx]
    padded_output, padded_weights = scaled_dot_product_attention(
        embeddings[short_idx], embeddings[short_idx], embeddings[short_idx], padding_mask[short_idx]
    )
    assert padded_output.shape == (seq_len, d_model), "Padding attention should preserve shape"
    # Only rows of real tokens are checked: padded query rows have every key
    # masked, so their weights carry no meaningful distribution.
    assert np.all(padded_weights[:valid_len, valid_len:] < 1e-6), \
        "Real tokens should put near-zero attention on padding positions"
    print(f"✅ Padding mask works: {padded_output.shape}")

    # Step 6: Verify all outputs have correct properties
    assert np.allclose(np.sum(attention_weights, axis=-1), 1.0), "All attention weights should sum to 1"
    assert output.shape == embeddings.shape, "All outputs should preserve input shape"
    # Everything strictly above the diagonal is a "future" position.
    assert np.all(np.triu(causal_weights, k=1) < 1e-6), "Causal masking should work"

    print("✅ All attention weights sum to 1: True")
    print("✅ All outputs preserve input shape: True")
    print("✅ Causal masking works: True")
    print("📈 Progress: Complete Attention System ✓")
||||
|
||||
# Run the test
|
||||
test_complete_attention_system()
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
## 🎯 Attention Behavior Analysis
|
||||
|
||||
### Understanding What Attention Learns
|
||||
Let's create a simple example to see what attention patterns emerge and understand the behavior.
|
||||
"""
|
||||
|
||||
@@ -562,61 +811,6 @@ if _should_show_plots():
|
||||
|
||||
print("🎯 Attention learns to focus on similar content!")
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
### 🧪 Unit Test: Complete Attention System Integration
|
||||
|
||||
**This is a unit test** - it tests the complete attention system working together.
|
||||
|
||||
Let's verify all components work together seamlessly.
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "test-integration", "locked": false, "schema_version": 3, "solution": false, "task": false}
|
||||
print("🔬 Unit Test: Complete Attention System Integration...")
|
||||
|
||||
# Test parameters
|
||||
d_model = 64
|
||||
seq_len = 16
|
||||
batch_size = 2
|
||||
np.random.seed(42)
|
||||
|
||||
print(f"📊 Integration test: d_model={d_model}, seq_len={seq_len}, batch_size={batch_size}")
|
||||
|
||||
# Step 1: Create input embeddings (simulating word embeddings)
|
||||
embeddings = np.random.randn(batch_size, seq_len, d_model) * 0.1
|
||||
print(f"📊 Input embeddings: {embeddings.shape}")
|
||||
|
||||
# Step 2: Test basic attention
|
||||
output, attention_weights = scaled_dot_product_attention(embeddings, embeddings, embeddings)
|
||||
print(f"✅ Basic attention works: {output.shape}")
|
||||
|
||||
# Step 3: Test self-attention wrapper
|
||||
self_attn = SelfAttention(d_model)
|
||||
self_output, self_weights = self_attn(embeddings[0]) # Single batch item
|
||||
print(f"✅ Self-attention output: {self_output.shape}")
|
||||
|
||||
# Step 4: Test with causal mask (like GPT)
|
||||
causal_mask = create_causal_mask(seq_len)
|
||||
causal_output, causal_weights = scaled_dot_product_attention(
|
||||
embeddings[0], embeddings[0], embeddings[0], causal_mask
|
||||
)
|
||||
print(f"✅ Causal attention works: {causal_output.shape}")
|
||||
|
||||
# Step 5: Test with padding mask (variable lengths)
|
||||
lengths = [seq_len, seq_len-3] # Different sequence lengths
|
||||
padding_mask = create_padding_mask(lengths, seq_len)
|
||||
padded_output, padded_weights = scaled_dot_product_attention(
|
||||
embeddings[0], embeddings[0], embeddings[0], padding_mask[0]
|
||||
)
|
||||
print(f"✅ Padding mask works: {padded_output.shape}")
|
||||
|
||||
# Step 6: Verify all outputs have correct properties
|
||||
print(f"✅ All attention weights sum to 1: {np.allclose(np.sum(attention_weights, axis=-1), 1.0)}")
|
||||
print(f"✅ All outputs preserve input shape: {output.shape == embeddings.shape}")
|
||||
print(f"✅ Causal masking works: {np.all(np.triu(causal_weights, k=1) < 1e-6)}")
|
||||
|
||||
print("📈 Progress: Complete Attention System ✓")
|
||||
|
||||
print("\n" + "="*50)
|
||||
print("🔥 ATTENTION MODULE COMPLETE!")
|
||||
print("="*50)
|
||||
|
||||
Reference in New Issue
Block a user