TinyTorch/tinytorch/core/attention.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/07_attention/attention_dev.ipynb.

# %% auto 0
__all__ = ['scaled_dot_product_attention', 'SelfAttention', 'create_causal_mask', 'create_padding_mask',
           'create_bidirectional_mask']

# %% ../../modules/source/07_attention/attention_dev.ipynb 1
import numpy as np
import math
import sys
import os
from typing import List, Union, Optional, Tuple
import matplotlib.pyplot as plt

# Import our building blocks - try package first, then local modules
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
    from tensor_dev import Tensor

# %% ../../modules/source/07_attention/attention_dev.ipynb 2
def _should_show_plots():
    """Check if we should show plots (disable during testing)"""
    # Check multiple conditions that indicate we're in test mode
    is_pytest = (
        'pytest' in sys.modules or
        'test' in sys.argv or
        os.environ.get('PYTEST_CURRENT_TEST') is not None or
        any('test' in arg for arg in sys.argv) or
        any('pytest' in arg for arg in sys.argv)
    )

    # Show plots in development mode (when not in test mode)
    return not is_pytest

# %% ../../modules/source/07_attention/attention_dev.ipynb 7
def scaled_dot_product_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray,
                                mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Scaled Dot-Product Attention - The foundation of all transformer models.

    This is the exact mechanism used in GPT, BERT, and all modern language models.

    TODO: Implement the core attention mechanism.

    STEP-BY-STEP IMPLEMENTATION:
    1. Get d_k (dimension of keys) from Q.shape[-1]
    2. Compute attention scores: Q @ K^T (matrix multiplication)
    3. Scale by √d_k: scores / sqrt(d_k)
    4. Apply mask if provided: set masked positions to -1e9
    5. Apply softmax to get attention weights (probabilities)
    6. Apply attention weights to values: weights @ V
    7. Return (output, attention_weights)

    MATHEMATICAL OPERATION:
        Attention(Q,K,V) = softmax(QK^T/√d_k)V

    IMPLEMENTATION HINTS:
    - Use np.matmul() for matrix multiplication
    - Use np.swapaxes(K, -2, -1) to transpose last two dimensions
    - Use math.sqrt() for square root
    - Use np.where() for masking: np.where(mask == 0, -1e9, scores)
    - Implement softmax manually: exp(x) / sum(exp(x))
    - Use keepdims=True for broadcasting

    LEARNING CONNECTIONS:
    - This exact function powers ChatGPT, BERT, GPT-4
    - The scaling prevents gradient vanishing in deep networks
    - Masking enables causal (GPT) and bidirectional (BERT) models
    - Attention weights are interpretable - you can visualize them!

    Args:
        Q: Query matrix of shape (..., seq_len_q, d_k)
        K: Key matrix of shape (..., seq_len_k, d_k)
        V: Value matrix of shape (..., seq_len_v, d_v)
        mask: Optional mask of shape (..., seq_len_q, seq_len_k)

    Returns:
        output: Attention output (..., seq_len_q, d_v)
        attention_weights: Attention probabilities (..., seq_len_q, seq_len_k)
    """
    ### BEGIN SOLUTION
    # Get the dimension for scaling
    d_k = Q.shape[-1]

    # Step 1: Compute attention scores (QK^T)
    # This measures similarity between each query and each key
    scores = np.matmul(Q, np.swapaxes(K, -2, -1))  # (..., seq_len_q, seq_len_k)

    # Step 2: Scale by √d_k to prevent exploding gradients
    scores = scores / math.sqrt(d_k)

    # Step 3: Apply mask if provided (for padding or causality)
    if mask is not None:
        # Replace masked positions with large negative values
        # This makes softmax output ~0 for these positions
        scores = np.where(mask == 0, -1e9, scores)

    # Step 4: Apply softmax to get attention probabilities
    # Each row sums to 1, representing where to focus attention
    # Using numerically stable softmax
    scores_max = np.max(scores, axis=-1, keepdims=True)
    scores_exp = np.exp(scores - scores_max)
    attention_weights = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)

    # Step 5: Apply attention weights to values
    # This gives us the weighted combination of values
    output = np.matmul(attention_weights, V)  # (..., seq_len_q, d_v)

    return output, attention_weights
    ### END SOLUTION

# %% ../../modules/source/07_attention/attention_dev.ipynb 11
class SelfAttention:
    """
    Self-Attention wrapper - Convenience class for self-attention where Q=K=V.

    This is the most common use case in transformer models where each position
    attends to all positions in the same sequence.
    """

    def __init__(self, d_model: int):
        """
        Initialize Self-Attention.

        TODO: Store the model dimension for this self-attention layer.

        STEP-BY-STEP IMPLEMENTATION:
        1. Store d_model as an instance variable (self.d_model)
        2. Print initialization message for debugging

        EXAMPLE USAGE:
        ```python
        self_attn = SelfAttention(d_model=64)
        output, weights = self_attn(input_sequence)
        ```

        IMPLEMENTATION HINTS:
        - Simply store d_model parameter: self.d_model = d_model
        - Print message: print(f"🔧 SelfAttention: d_model={d_model}")

        LEARNING CONNECTIONS:
        - This is like nn.MultiheadAttention in PyTorch (but simpler)
        - Used in every transformer layer for self-attention
        - Foundation for understanding GPT, BERT architectures

        Args:
            d_model: Model dimension
        """
        ### BEGIN SOLUTION
        self.d_model = d_model
        print(f"🔧 SelfAttention: d_model={d_model}")
        ### END SOLUTION

    def forward(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
        """
        Forward pass of self-attention.

        TODO: Apply self-attention where Q=K=V=x.

        STEP-BY-STEP IMPLEMENTATION:
        1. Call scaled_dot_product_attention with Q=K=V=x
        2. Pass the mask parameter through
        3. Return the output and attention weights

        EXAMPLE USAGE:
        ```python
        x = np.random.randn(seq_len, d_model)  # Input sequence
        output, weights = self_attn.forward(x)
        # weights[i,j] = how much position i attends to position j
        ```

        IMPLEMENTATION HINTS:
        - Use the function you implemented above
        - Self-attention means: Q = K = V = x
        - Return: scaled_dot_product_attention(x, x, x, mask)

        LEARNING CONNECTIONS:
        - This is how transformers process sequences
        - Each position can attend to any other position
        - Enables understanding of long-range dependencies

        Args:
            x: Input tensor (..., seq_len, d_model)
            mask: Optional attention mask

        Returns:
            output: Self-attention output (..., seq_len, d_model)
            attention_weights: Attention weights
        """
        ### BEGIN SOLUTION
        # Self-attention: Q = K = V = x
        return scaled_dot_product_attention(x, x, x, mask)
        ### END SOLUTION

    def __call__(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
        """Make the class callable."""
        return self.forward(x, mask)

# %% ../../modules/source/07_attention/attention_dev.ipynb 15
def create_causal_mask(seq_len: int) -> np.ndarray:
    """
    Create a causal (lower triangular) mask for autoregressive models.

    Used in models like GPT where each position can only attend to
    previous positions, not future ones.

    TODO: Create a lower triangular matrix of ones.

    STEP-BY-STEP IMPLEMENTATION:
    1. Use np.tril() to create lower triangular matrix
    2. Create matrix of ones with shape (seq_len, seq_len)
    3. Return the lower triangular part

    EXAMPLE USAGE:
    ```python
    mask = create_causal_mask(4)
    # mask = [[1, 0, 0, 0],
    #         [1, 1, 0, 0],
    #         [1, 1, 1, 0],
    #         [1, 1, 1, 1]]
    ```

    IMPLEMENTATION HINTS:
    - Use np.ones((seq_len, seq_len)) to create matrix of ones
    - Use np.tril() to get lower triangular part
    - Or combine: np.tril(np.ones((seq_len, seq_len)))

    LEARNING CONNECTIONS:
    - Used in GPT for autoregressive generation
    - Prevents looking into the future during training
    - Essential for language modeling tasks

    Args:
        seq_len: Sequence length

    Returns:
        mask: Causal mask (seq_len, seq_len) with 1s for allowed positions, 0s for blocked
    """
    ### BEGIN SOLUTION
    return np.tril(np.ones((seq_len, seq_len)))
    ### END SOLUTION

#| export
def create_padding_mask(lengths: List[int], max_length: int) -> np.ndarray:
    """
    Create padding mask for variable-length sequences.

    TODO: Create mask that ignores padding tokens.

    STEP-BY-STEP IMPLEMENTATION:
    1. Initialize zero array with shape (batch_size, max_length, max_length)
    2. For each sequence in the batch, set valid positions to 1
    3. Valid positions are [:length, :length] for each sequence
    4. Return the mask array

    EXAMPLE USAGE:
    ```python
    lengths = [3, 2, 4]  # Actual sequence lengths
    mask = create_padding_mask(lengths, max_length=4)
    # For sequence 0 (length=3): positions [0,1,2] can attend to [0,1,2]
    # For sequence 1 (length=2): positions [0,1] can attend to [0,1]
    ```

    IMPLEMENTATION HINTS:
    - batch_size = len(lengths)
    - Use np.zeros((batch_size, max_length, max_length))
    - Loop through lengths: for i, length in enumerate(lengths)
    - Set valid region: mask[i, :length, :length] = 1

    LEARNING CONNECTIONS:
    - Used when sequences have different lengths
    - Prevents attention to padding tokens
    - Essential for efficient batch processing

    Args:
        lengths: List of actual sequence lengths
        max_length: Maximum sequence length (padded length)

    Returns:
        mask: Padding mask (batch_size, max_length, max_length)
    """
    ### BEGIN SOLUTION
    batch_size = len(lengths)
    mask = np.zeros((batch_size, max_length, max_length))

    for i, length in enumerate(lengths):
        mask[i, :length, :length] = 1

    return mask
    ### END SOLUTION

#| export
def create_bidirectional_mask(seq_len: int) -> np.ndarray:
    """
    Create a bidirectional mask where all positions can attend to all positions.

    Used in models like BERT for bidirectional context understanding.

    TODO: Create a matrix of all ones.

    STEP-BY-STEP IMPLEMENTATION:
    1. Use np.ones() to create matrix of all ones
    2. Shape should be (seq_len, seq_len)
    3. Return the matrix

    EXAMPLE USAGE:
    ```python
    mask = create_bidirectional_mask(3)
    # mask = [[1, 1, 1],
    #         [1, 1, 1],
    #         [1, 1, 1]]
    ```

    IMPLEMENTATION HINTS:
    - Very simple: np.ones((seq_len, seq_len))
    - All positions can attend to all positions

    LEARNING CONNECTIONS:
    - Used in BERT for bidirectional understanding
    - Allows looking at past and future context
    - Good for understanding tasks, not generation

    Args:
        seq_len: Sequence length

    Returns:
        mask: All-ones mask (seq_len, seq_len)
    """
    ### BEGIN SOLUTION
    return np.ones((seq_len, seq_len))
    ### END SOLUTION