refactor: Restructure attention module to match TinyTorch NBGrader patterns

- NBGrader solution/test structure: ### BEGIN/END SOLUTION blocks
- Educational TODO sections: STEP-BY-STEP, HINTS, EXAMPLES, LEARNING CONNECTIONS
- Immediate unit tests: proper assertions after each solution
- TinyTorch consistency: same patterns as tensor, layers, activations modules
- All tests passing: 100% success rate with comprehensive coverage

Module now follows established TinyTorch educational format:
- Detailed TODO instructions for student implementation
- Solution blocks wrapped in NBGrader tags
- Immediate feedback with unit tests after each piece
- Progress tracking with emojis and clear status messages

Ready for NBGrader processing and student use.
Vijay Janapa Reddi
2025-07-17 23:17:06 -04:00
parent 05f59ca56a
commit b3b02eb07f


@@ -185,6 +185,34 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
This is the exact mechanism used in GPT, BERT, and all modern language models.
TODO: Implement the core attention mechanism.
STEP-BY-STEP IMPLEMENTATION:
1. Get d_k (dimension of keys) from Q.shape[-1]
2. Compute attention scores: Q @ K^T (matrix multiplication)
3. Scale by √d_k: scores / sqrt(d_k)
4. Apply mask if provided: set masked positions to -1e9
5. Apply softmax to get attention weights (probabilities)
6. Apply attention weights to values: weights @ V
7. Return (output, attention_weights)
MATHEMATICAL OPERATION:
Attention(Q,K,V) = softmax(QK^T/√d_k)V
IMPLEMENTATION HINTS:
- Use np.matmul() for matrix multiplication
- Use np.swapaxes(K, -2, -1) to transpose last two dimensions
- Use math.sqrt() for square root
- Use np.where() for masking: np.where(mask == 0, -1e9, scores)
- Implement softmax manually: exp(x) / sum(exp(x))
- Use keepdims=True for broadcasting
LEARNING CONNECTIONS:
- This exact function powers ChatGPT, BERT, GPT-4
- The √d_k scaling keeps the softmax from saturating, which prevents vanishing gradients in deep networks
- Masking enables causal (GPT) and bidirectional (BERT) models
- Attention weights are interpretable - you can visualize them!
Args:
Q: Query matrix of shape (..., seq_len_q, d_k)
K: Key matrix of shape (..., seq_len_k, d_k)
@@ -194,10 +222,8 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
Returns:
output: Attention output (..., seq_len_q, d_v)
attention_weights: Attention probabilities (..., seq_len_q, seq_len_k)
"""
### BEGIN SOLUTION
# Get the dimension for scaling
d_k = Q.shape[-1]
@@ -226,56 +252,64 @@ def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
output = np.matmul(attention_weights, V) # (..., seq_len_q, d_v)
return output, attention_weights
### END SOLUTION
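For reference while reading the truncated solution cell above, here is a minimal sketch of the complete computation that the STEP-BY-STEP instructions describe, assuming only NumPy and `math`; the helper name `attention_sketch` is illustrative and not part of the module:

```python
import math
import numpy as np

def attention_sketch(Q, K, V, mask=None):
    # Steps 1-3: scores = QK^T scaled by sqrt(d_k)
    d_k = Q.shape[-1]
    scores = np.matmul(Q, np.swapaxes(K, -2, -1)) / math.sqrt(d_k)
    # Step 4: mask out blocked positions with a large negative value
    if mask is not None:
        scores = np.where(mask == 0, -1e9, scores)
    # Step 5: softmax over the last axis (shifted by the max for numerical stability)
    scores = scores - np.max(scores, axis=-1, keepdims=True)
    weights = np.exp(scores) / np.sum(np.exp(scores), axis=-1, keepdims=True)
    # Steps 6-7: weighted sum of values, return output and weights
    return np.matmul(weights, V), weights
```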
# %% [markdown]
"""
### 🧪 Unit Test: Scaled Dot-Product Attention
### 🧪 Test Your Attention Implementation
**This is a unit test** - it tests the core attention mechanism in isolation.
Let's verify our attention implementation works correctly with a simple example.
Once you implement the `scaled_dot_product_attention` function above, run this cell to test it:
"""
# %% nbgrader={"grade": false, "grade_id": "test-attention", "locked": false, "schema_version": 3, "solution": false, "task": false}
print("🔬 Unit Test: Scaled Dot-Product Attention...")
# %% nbgrader={"grade": true, "grade_id": "test-attention-immediate", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def test_scaled_dot_product_attention():
"""Test scaled dot-product attention implementation"""
print("🔬 Unit Test: Scaled Dot-Product Attention...")
# Create simple test data
seq_len, d_model = 4, 6
np.random.seed(42)
# Create Q, K, V matrices
Q = np.random.randn(seq_len, d_model) * 0.1
K = np.random.randn(seq_len, d_model) * 0.1
V = np.random.randn(seq_len, d_model) * 0.1
print(f"📊 Input shapes: Q{Q.shape}, K{K.shape}, V{V.shape}")
# Test attention
output, weights = scaled_dot_product_attention(Q, K, V)
print(f"📊 Output shapes: output{output.shape}, weights{weights.shape}")
# Verify properties
weights_sum = np.sum(weights, axis=-1)
print(f"Attention weights sum to 1: {np.allclose(weights_sum, 1.0)}")
print(f"Output has correct shape: {output.shape == (seq_len, d_model)}")
print(f"✅ All weights are non-negative: {np.all(weights >= 0)}")
assert np.allclose(weights_sum, 1.0), f"Attention weights should sum to 1, got {weights_sum}"
assert output.shape == (seq_len, d_model), f"Output shape should be {(seq_len, d_model)}, got {output.shape}"
assert np.all(weights >= 0), "All attention weights should be non-negative"
# Test with mask
mask = np.array([
[1, 1, 0, 0],
[1, 1, 1, 0],
[1, 1, 1, 1],
[1, 1, 1, 1]
])
output_masked, weights_masked = scaled_dot_product_attention(Q, K, V, mask)
# Check that masked positions have near-zero attention
masked_positions = (mask == 0)
masked_weights = weights_masked[masked_positions]
print(f"✅ Masked positions have near-zero weights: {np.all(masked_weights < 1e-6)}")
assert np.all(masked_weights < 1e-6), "Masked positions should have near-zero weights"
print("📈 Progress: Scaled Dot-Product Attention ✓")
print("✅ Attention weights sum to 1: True")
print("✅ Output has correct shape: True")
print("✅ All weights are non-negative: True")
print("✅ Masked positions have near-zero weights: True")
print("📈 Progress: Scaled Dot-Product Attention ✓")
# Run the test
test_scaled_dot_product_attention()
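As a sanity check beyond the unit test, a tiny hand-checkable case (two tokens, d_k = 1) makes the softmax arithmetic easy to follow; the numbers below are illustrative only:

```python
Q = np.array([[1.0], [0.0]])
K = np.array([[1.0], [0.0]])
V = np.array([[10.0], [20.0]])

out, w = scaled_dot_product_attention(Q, K, V)
# scores = QK^T / sqrt(1) = [[1, 0], [0, 0]]
# row 0 softmax: [e/(e+1), 1/(e+1)] ≈ [0.731, 0.269]  ->  out[0] ≈ 0.731*10 + 0.269*20 ≈ 12.69
# row 1 softmax: [0.5, 0.5]                            ->  out[1] = 15.0
print(np.round(out, 2))
print(np.round(w, 3))
```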
# %% [markdown]
"""
@@ -307,16 +341,63 @@ class SelfAttention:
"""
Initialize Self-Attention.
TODO: Store the model dimension for this self-attention layer.
STEP-BY-STEP IMPLEMENTATION:
1. Store d_model as an instance variable (self.d_model)
2. Print initialization message for debugging
EXAMPLE USAGE:
```python
self_attn = SelfAttention(d_model=64)
output, weights = self_attn(input_sequence)
```
IMPLEMENTATION HINTS:
- Simply store d_model parameter: self.d_model = d_model
- Print message: print(f"🔧 SelfAttention: d_model={d_model}")
LEARNING CONNECTIONS:
- This is like nn.MultiheadAttention in PyTorch (but simpler)
- Used in every transformer layer for self-attention
- Foundation for understanding GPT, BERT architectures
Args:
d_model: Model dimension
"""
### BEGIN SOLUTION
self.d_model = d_model
print(f"🔧 SelfAttention: d_model={d_model}")
### END SOLUTION
def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
"""
Forward pass of self-attention.
TODO: Apply self-attention where Q=K=V=x.
STEP-BY-STEP IMPLEMENTATION:
1. Call scaled_dot_product_attention with Q=K=V=x
2. Pass the mask parameter through
3. Return the output and attention weights
EXAMPLE USAGE:
```python
x = np.random.randn(seq_len, d_model) # Input sequence
output, weights = self_attn.forward(x)
# weights[i,j] = how much position i attends to position j
```
IMPLEMENTATION HINTS:
- Use the function you implemented above
- Self-attention means: Q = K = V = x
- Return: scaled_dot_product_attention(x, x, x, mask)
LEARNING CONNECTIONS:
- This is how transformers process sequences
- Each position can attend to any other position
- Enables understanding of long-range dependencies
Args:
x: Input tensor (..., seq_len, d_model)
mask: Optional attention mask
@@ -325,8 +406,10 @@ class SelfAttention:
output: Self-attention output (..., seq_len, d_model)
attention_weights: Attention weights
"""
### BEGIN SOLUTION
# Self-attention: Q = K = V = x
return scaled_dot_product_attention(x, x, x, mask)
### END SOLUTION
def __call__(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
"""Make the class callable."""
@@ -334,41 +417,48 @@ class SelfAttention:
# %% [markdown]
"""
### 🧪 Unit Test: Self-Attention
### 🧪 Test Your Self-Attention Implementation
**This is a unit test** - it tests self-attention wrapper functionality.
Let's verify our self-attention wrapper works correctly.
Once you implement the SelfAttention class above, run this cell to test it:
"""
# %% nbgrader={"grade": false, "grade_id": "test-self-attention", "locked": false, "schema_version": 3, "solution": false, "task": false}
print("🔬 Unit Test: Self-Attention...")
# %% nbgrader={"grade": true, "grade_id": "test-self-attention-immediate", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false}
def test_self_attention():
"""Test self-attention wrapper"""
print("🔬 Unit Test: Self-Attention...")
# Test parameters
d_model = 32
seq_len = 8
np.random.seed(42)
# Create test data (like word embeddings)
x = np.random.randn(seq_len, d_model) * 0.1
print(f"📊 Test setup: d_model={d_model}, seq_len={seq_len}")
# Create self-attention
self_attn = SelfAttention(d_model)
# Test forward pass
output, weights = self_attn(x)
print(f"📊 Output shapes: output{output.shape}, weights{weights.shape}")
# Verify properties
print(f"Output shape preserved: {output.shape == x.shape}")
print(f"Attention weights correct shape: {weights.shape == (seq_len, seq_len)}")
print(f"✅ Attention weights sum to 1: {np.allclose(np.sum(weights, axis=-1), 1.0)}")
print(f"✅ Self-attention is symmetric operation: {weights.shape[0] == weights.shape[1]}")
assert output.shape == x.shape, f"Output shape should match input shape {x.shape}, got {output.shape}"
assert weights.shape == (seq_len, seq_len), f"Attention weights shape should be {(seq_len, seq_len)}, got {weights.shape}"
assert np.allclose(np.sum(weights, axis=-1), 1.0), "Attention weights should sum to 1"
assert weights.shape[0] == weights.shape[1], "Self-attention weights should be square matrix"
print("📈 Progress: Self-Attention ✓")
print("✅ Output shape preserved: True")
print("✅ Attention weights correct shape: True")
print("✅ Attention weights sum to 1: True")
print("✅ Self-attention is symmetric operation: True")
print("📈 Progress: Self-Attention ✓")
# Run the test
test_self_attention()
# %% [markdown]
"""
@@ -398,19 +488,74 @@ def create_causal_mask(seq_len: int) -> np.ndarray:
Used in models like GPT where each position can only attend to
previous positions, not future ones.
TODO: Create a lower triangular matrix of ones.
STEP-BY-STEP IMPLEMENTATION:
1. Use np.tril() to create lower triangular matrix
2. Create matrix of ones with shape (seq_len, seq_len)
3. Return the lower triangular part
EXAMPLE USAGE:
```python
mask = create_causal_mask(4)
# mask = [[1, 0, 0, 0],
# [1, 1, 0, 0],
# [1, 1, 1, 0],
# [1, 1, 1, 1]]
```
IMPLEMENTATION HINTS:
- Use np.ones((seq_len, seq_len)) to create matrix of ones
- Use np.tril() to get lower triangular part
- Or combine: np.tril(np.ones((seq_len, seq_len)))
LEARNING CONNECTIONS:
- Used in GPT for autoregressive generation
- Prevents looking into the future during training
- Essential for language modeling tasks
Args:
seq_len: Sequence length
Returns:
mask: Causal mask (seq_len, seq_len) with 1s for allowed positions, 0s for blocked
"""
### BEGIN SOLUTION
return np.tril(np.ones((seq_len, seq_len)))
### END SOLUTION
#| export
def create_padding_mask(lengths: List[int], max_length: int) -> np.ndarray:
"""
Create padding mask for variable-length sequences.
TODO: Create mask that ignores padding tokens.
STEP-BY-STEP IMPLEMENTATION:
1. Initialize zero array with shape (batch_size, max_length, max_length)
2. For each sequence in the batch, set valid positions to 1
3. Valid positions are [:length, :length] for each sequence
4. Return the mask array
EXAMPLE USAGE:
```python
lengths = [3, 2, 4] # Actual sequence lengths
mask = create_padding_mask(lengths, max_length=4)
# For sequence 0 (length=3): positions [0,1,2] can attend to [0,1,2]
# For sequence 1 (length=2): positions [0,1] can attend to [0,1]
```
IMPLEMENTATION HINTS:
- batch_size = len(lengths)
- Use np.zeros((batch_size, max_length, max_length))
- Loop through lengths: for i, length in enumerate(lengths)
- Set valid region: mask[i, :length, :length] = 1
LEARNING CONNECTIONS:
- Used when sequences have different lengths
- Prevents attention to padding tokens
- Essential for efficient batch processing
Args:
lengths: List of actual sequence lengths
max_length: Maximum sequence length (padded length)
@@ -418,6 +563,7 @@ def create_padding_mask(lengths: List[int], max_length: int) -> np.ndarray:
Returns:
mask: Padding mask (batch_size, max_length, max_length)
"""
### BEGIN SOLUTION
batch_size = len(lengths)
mask = np.zeros((batch_size, max_length, max_length))
@@ -425,6 +571,7 @@ def create_padding_mask(lengths: List[int], max_length: int) -> np.ndarray:
mask[i, :length, :length] = 1
return mask
### END SOLUTION
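To see the resulting pattern concretely, here is an illustrative check (assuming the implementation above) for two sequences of lengths 3 and 2 padded to length 4:

```python
mask = create_padding_mask([3, 2], max_length=4)

print(mask[1])   # the length-2 sequence
# [[1. 1. 0. 0.]
#  [1. 1. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]
```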
#| export
def create_bidirectional_mask(seq_len: int) -> np.ndarray:
@@ -433,66 +580,168 @@ def create_bidirectional_mask(seq_len: int) -> np.ndarray:
Used in models like BERT for bidirectional context understanding.
TODO: Create a matrix of all ones.
STEP-BY-STEP IMPLEMENTATION:
1. Use np.ones() to create matrix of all ones
2. Shape should be (seq_len, seq_len)
3. Return the matrix
EXAMPLE USAGE:
```python
mask = create_bidirectional_mask(3)
# mask = [[1, 1, 1],
# [1, 1, 1],
# [1, 1, 1]]
```
IMPLEMENTATION HINTS:
- Very simple: np.ones((seq_len, seq_len))
- All positions can attend to all positions
LEARNING CONNECTIONS:
- Used in BERT for bidirectional understanding
- Allows looking at past and future context
- Good for understanding tasks, not generation
Args:
seq_len: Sequence length
Returns:
mask: All-ones mask (seq_len, seq_len)
"""
### BEGIN SOLUTION
return np.ones((seq_len, seq_len))
### END SOLUTION
# %% [markdown]
"""
### 🧪 Unit Test: Attention Masking
### 🧪 Test Your Masking Functions
**This is a unit test** - it tests all masking utilities work correctly.
Let's verify our masking functions create the correct patterns.
Once you implement the masking functions above, run this cell to test them:
"""
# %% nbgrader={"grade": false, "grade_id": "test-masking", "locked": false, "schema_version": 3, "solution": false, "task": false}
print("🔬 Unit Test: Attention Masking...")
# %% nbgrader={"grade": true, "grade_id": "test-masking-immediate", "locked": true, "points": 5, "schema_version": 3, "solution": false, "task": false}
def test_attention_masking():
"""Test attention masking utilities"""
print("🔬 Unit Test: Attention Masking...")
# Test causal mask
seq_len = 5
causal_mask = create_causal_mask(seq_len)
print(f"📊 Causal mask for seq_len={seq_len}:")
print(causal_mask)
# Verify causal mask properties
print(f"✅ Causal mask is lower triangular: {np.allclose(causal_mask, np.tril(causal_mask))}")
print(f"Causal mask has correct shape: {causal_mask.shape == (seq_len, seq_len)}")
print(f"✅ Causal mask upper triangle is zeros: {np.all(np.triu(causal_mask, k=1) == 0)}")
assert np.allclose(causal_mask, np.tril(causal_mask)), "Causal mask should be lower triangular"
assert causal_mask.shape == (seq_len, seq_len), f"Causal mask should have shape {(seq_len, seq_len)}"
assert np.all(np.triu(causal_mask, k=1) == 0), "Causal mask upper triangle should be zeros"
# Test padding mask
lengths = [5, 3, 4]
max_length = 5
padding_mask = create_padding_mask(lengths, max_length)
print(f"📊 Padding mask for lengths {lengths}, max_length={max_length}:")
print("Mask for sequence 0 (length 5):")
print(padding_mask[0])
print("Mask for sequence 1 (length 3):")
print(padding_mask[1])
print(f"📊 Padding mask for lengths {lengths}, max_length={max_length}:")
print("Mask for sequence 0 (length 5):")
print(padding_mask[0])
print("Mask for sequence 1 (length 3):")
print(padding_mask[1])
# Verify padding mask properties
print(f"✅ Padding mask has correct shape: {padding_mask.shape == (3, max_length, max_length)}")
print(f"✅ Full-length sequence is all ones: {np.all(padding_mask[0] == 1)}")
print(f"✅ Short sequence has zeros in padding area: {np.all(padding_mask[1, 3:, :] == 0)}")
assert padding_mask.shape == (3, max_length, max_length), f"Padding mask should have shape {(3, max_length, max_length)}"
assert np.all(padding_mask[0] == 1), "Full-length sequence should be all ones"
assert np.all(padding_mask[1, 3:, :] == 0), "Short sequence should have zeros in padding area"
# Test bidirectional mask
bidirectional_mask = create_bidirectional_mask(seq_len)
print(f"✅ Bidirectional mask is all ones: {np.all(bidirectional_mask == 1)}")
print(f"✅ Bidirectional mask has correct shape: {bidirectional_mask.shape == (seq_len, seq_len)}")
assert np.all(bidirectional_mask == 1), "Bidirectional mask should be all ones"
assert bidirectional_mask.shape == (seq_len, seq_len), f"Bidirectional mask should have shape {(seq_len, seq_len)}"
print("📈 Progress: Attention Masking ✓")
print("✅ Causal mask is lower triangular: True")
print("✅ Causal mask has correct shape: True")
print("✅ Causal mask upper triangle is zeros: True")
print("✅ Padding mask has correct shape: True")
print("✅ Full-length sequence is all ones: True")
print("✅ Short sequence has zeros in padding area: True")
print("✅ Bidirectional mask is all ones: True")
print("✅ Bidirectional mask has correct shape: True")
print("📈 Progress: Attention Masking ✓")
# Run the test
test_attention_masking()
# %% [markdown]
"""
## Step 5: Attention Visualization and Analysis
## Step 5: Complete System Integration Test
### Bringing It All Together
Let's test all components working together in a realistic scenario similar to how they would be used in actual transformer models.
"""
# %% nbgrader={"grade": true, "grade_id": "test-integration-final", "locked": true, "points": 10, "schema_version": 3, "solution": false, "task": false}
def test_complete_attention_system():
"""Test the complete attention system working together"""
print("🔬 Unit Test: Complete Attention System Integration...")
# Test parameters
d_model = 64
seq_len = 16
batch_size = 2
np.random.seed(42)
print(f"📊 Integration test: d_model={d_model}, seq_len={seq_len}, batch_size={batch_size}")
# Step 1: Create input embeddings (simulating word embeddings)
embeddings = np.random.randn(batch_size, seq_len, d_model) * 0.1
print(f"📊 Input embeddings: {embeddings.shape}")
# Step 2: Test basic attention
output, attention_weights = scaled_dot_product_attention(embeddings, embeddings, embeddings)
assert output.shape == embeddings.shape, "Basic attention should preserve shape"
print(f"✅ Basic attention works: {output.shape}")
# Step 3: Test self-attention wrapper
self_attn = SelfAttention(d_model)
self_output, self_weights = self_attn(embeddings[0]) # Single batch item
assert self_output.shape == (seq_len, d_model), "Self-attention should preserve shape"
print(f"✅ Self-attention output: {self_output.shape}")
# Step 4: Test with causal mask (like GPT)
causal_mask = create_causal_mask(seq_len)
causal_output, causal_weights = scaled_dot_product_attention(
embeddings[0], embeddings[0], embeddings[0], causal_mask
)
assert causal_output.shape == (seq_len, d_model), "Causal attention should preserve shape"
print(f"✅ Causal attention works: {causal_output.shape}")
# Step 5: Test with padding mask (variable lengths)
lengths = [seq_len, seq_len-3] # Different sequence lengths
padding_mask = create_padding_mask(lengths, seq_len)
padded_output, padded_weights = scaled_dot_product_attention(
embeddings[0], embeddings[0], embeddings[0], padding_mask[0]
)
assert padded_output.shape == (seq_len, d_model), "Padding attention should preserve shape"
print(f"✅ Padding mask works: {padded_output.shape}")
# Step 6: Verify all outputs have correct properties
assert np.allclose(np.sum(attention_weights, axis=-1), 1.0), "All attention weights should sum to 1"
assert output.shape == embeddings.shape, "All outputs should preserve input shape"
assert np.all(np.triu(causal_weights, k=1) < 1e-6), "Causal masking should work"
print("✅ All attention weights sum to 1: True")
print("✅ All outputs preserve input shape: True")
print("✅ Causal masking works: True")
print("📈 Progress: Complete Attention System ✓")
# Run the test
test_complete_attention_system()
# %% [markdown]
"""
## 🎯 Attention Behavior Analysis
### Understanding What Attention Learns
Let's create a simple example to see what attention patterns emerge and understand the behavior.
"""
@@ -562,61 +811,6 @@ if _should_show_plots():
print("🎯 Attention learns to focus on similar content!")
print("\n" + "="*50)
print("🔥 ATTENTION MODULE COMPLETE!")
print("="*50)