TinyTorch/tinytorch/models/transformer.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.

# %% auto 0
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']

# %% ../../modules/source/13_transformers/transformers_dev.ipynb 2
import numpy as np
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.attention import MultiHeadAttention
from ..core.activations import GELU
from ..text.embeddings import Embedding
from ..core.autograd import SqrtBackward, MeanBackward

# Monkey-patch sqrt method onto Tensor for LayerNorm
def _tensor_sqrt(self):
    """
    Compute element-wise square root with gradient tracking.

    Used in normalization layers (LayerNorm, BatchNorm).
    """
    result_data = np.sqrt(self.data)
    result = Tensor(result_data, requires_grad=self.requires_grad)

    if self.requires_grad:
        result._grad_fn = SqrtBackward()
        result._grad_fn.saved_tensors = (self,)
        result._grad_fn.saved_output = result

    return result

Tensor.sqrt = _tensor_sqrt

# Monkey-patch mean method onto Tensor for LayerNorm
def _tensor_mean(self, axis=None, keepdims=False):
    """
    Compute mean with gradient tracking.

    Used in normalization layers (LayerNorm, BatchNorm) and loss functions.
    """
    result_data = np.mean(self.data, axis=axis, keepdims=keepdims)
    result = Tensor(result_data, requires_grad=self.requires_grad)

    if self.requires_grad:
        result._grad_fn = MeanBackward()
        result._grad_fn.saved_tensors = (self,)
        result._grad_fn.axis = axis
        result._grad_fn.keepdims = keepdims

    return result

Tensor.mean = _tensor_mean

# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
class LayerNorm:
    """
    Layer Normalization for transformer blocks.

    Normalizes across the feature dimension (last axis) for each sample independently,
    unlike batch normalization which normalizes across the batch dimension.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        """
        Initialize LayerNorm with learnable parameters.

        TODO: Set up normalization parameters

        APPROACH:
        1. Store the shape to normalize over (usually embed_dim)
        2. Initialize learnable scale (gamma) and shift (beta) parameters
        3. Set small epsilon for numerical stability

        EXAMPLE:
        >>> ln = LayerNorm(512)  # For 512-dimensional embeddings
        >>> x = Tensor(np.random.randn(2, 10, 512))  # (batch, seq, features)
        >>> normalized = ln.forward(x)
        >>> # Each (2, 10) sample normalized independently across 512 features

        HINTS:
        - gamma should start at 1.0 (identity scaling)
        - beta should start at 0.0 (no shift)
        - eps prevents division by zero in variance calculation
        """
        ### BEGIN SOLUTION
        self.normalized_shape = normalized_shape
        self.eps = eps

        # Learnable parameters: scale and shift
        # CRITICAL: requires_grad=True so optimizer can train these!
        self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True)  # Scale parameter
        self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True)  # Shift parameter
        ### END SOLUTION

    def forward(self, x):
        """
        Apply layer normalization.

        TODO: Implement layer normalization formula

        APPROACH:
        1. Compute mean and variance across the last dimension
        2. Normalize: (x - mean) / sqrt(variance + eps)
        3. Apply learnable scale and shift: gamma * normalized + beta

        MATHEMATICAL FORMULA:
        y = (x - μ) / σ * γ + β
        where μ = mean(x), σ = sqrt(var(x) + ε)

        HINT: Use keepdims=True to maintain tensor dimensions for broadcasting
        """
        ### BEGIN SOLUTION
        # CRITICAL: Use Tensor operations (not .data) to maintain gradient flow!
        # Compute statistics across last dimension (features)
        mean = x.mean(axis=-1, keepdims=True)

        # Compute variance: E[(x - μ)²]
        diff = x - mean  # Tensor subtraction maintains gradient
        variance = (diff * diff).mean(axis=-1, keepdims=True)  # Tensor ops maintain gradient

        # Normalize: (x - mean) / sqrt(variance + eps)
        # Note: Use Tensor.sqrt() to preserve gradient flow
        std = (variance + self.eps).sqrt()  # sqrt maintains gradient flow
        normalized = diff / std  # Division maintains gradient flow

        # Apply learnable transformation
        output = normalized * self.gamma + self.beta
        return output
        ### END SOLUTION

    def parameters(self):
        """Return learnable parameters."""
        return [self.gamma, self.beta]

# %% ../../modules/source/13_transformers/transformers_dev.ipynb 13
class MLP:
    """
    Multi-Layer Perceptron (Feed-Forward Network) for transformer blocks.

    Standard pattern: Linear -> GELU -> Linear with expansion ratio of 4:1.
    This provides the non-linear transformation in each transformer block.
    """

    def __init__(self, embed_dim, hidden_dim=None, dropout_prob=0.1):
        """
        Initialize MLP with two linear layers.

        TODO: Set up the feed-forward network layers

        APPROACH:
        1. First layer expands from embed_dim to hidden_dim (usually 4x larger)
        2. Second layer projects back to embed_dim
        3. Use GELU activation (smoother than ReLU, preferred in transformers)

        EXAMPLE:
        >>> mlp = MLP(512)  # Will create 512 -> 2048 -> 512 network
        >>> x = Tensor(np.random.randn(2, 10, 512))
        >>> output = mlp.forward(x)
        >>> assert output.shape == (2, 10, 512)

        HINT: Standard transformer MLP uses 4x expansion (hidden_dim = 4 * embed_dim)
        """
        ### BEGIN SOLUTION
        if hidden_dim is None:
            hidden_dim = 4 * embed_dim  # Standard 4x expansion

        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim

        # Two-layer feed-forward network
        self.linear1 = Linear(embed_dim, hidden_dim)
        self.linear2 = Linear(hidden_dim, embed_dim)

        # GELU activation
        self.gelu = GELU()
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through MLP.

        TODO: Implement the feed-forward computation

        APPROACH:
        1. First linear transformation: embed_dim -> hidden_dim
        2. Apply GELU activation (smooth, differentiable)
        3. Second linear transformation: hidden_dim -> embed_dim

        COMPUTATION FLOW:
        x -> Linear -> GELU -> Linear -> output

        HINT: GELU activation is implemented above as a function
        """
        ### BEGIN SOLUTION
        # First linear layer with expansion
        hidden = self.linear1.forward(x)

        # GELU activation (callable pattern - activations have __call__)
        hidden = self.gelu(hidden)

        # Second linear layer back to original size
        output = self.linear2.forward(hidden)

        return output
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters."""
        params = []
        params.extend(self.linear1.parameters())
        params.extend(self.linear2.parameters())
        return params

# %% ../../modules/source/13_transformers/transformers_dev.ipynb 17
class TransformerBlock:
    """
    Complete Transformer Block with self-attention, MLP, and residual connections.

    This is the core building block of GPT and other transformer models.
    Each block processes the input sequence and passes it to the next block.
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4, dropout_prob=0.1):
        """
        Initialize a complete transformer block.

        TODO: Set up all components of the transformer block

        APPROACH:
        1. Multi-head self-attention for sequence modeling
        2. First layer normalization (pre-norm architecture)
        3. MLP with specified expansion ratio
        4. Second layer normalization

        TRANSFORMER BLOCK ARCHITECTURE:
        x → LayerNorm → MultiHeadAttention → + (residual) →
            LayerNorm → MLP → + (residual) → output

        EXAMPLE:
        >>> block = TransformerBlock(embed_dim=512, num_heads=8)
        >>> x = Tensor(np.random.randn(2, 10, 512))  # (batch, seq, embed)
        >>> output = block.forward(x)
        >>> assert output.shape == (2, 10, 512)

        HINT: We use pre-norm architecture (LayerNorm before attention/MLP)
        """
        ### BEGIN SOLUTION
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # Multi-head self-attention
        self.attention = MultiHeadAttention(embed_dim, num_heads)

        # Layer normalizations (pre-norm architecture)
        self.ln1 = LayerNorm(embed_dim)  # Before attention
        self.ln2 = LayerNorm(embed_dim)  # Before MLP

        # Feed-forward network
        hidden_dim = int(embed_dim * mlp_ratio)
        self.mlp = MLP(embed_dim, hidden_dim)
        ### END SOLUTION

    def forward(self, x, mask=None):
        """
        Forward pass through transformer block.

        TODO: Implement the complete transformer block computation

        APPROACH:
        1. Apply layer norm, then self-attention, then add residual
        2. Apply layer norm, then MLP, then add residual
        3. Return the transformed sequence

        COMPUTATION FLOW:
        x → ln1 → attention → + x → ln2 → mlp → + → output

        RESIDUAL CONNECTIONS:
        These are crucial for training deep networks - they allow gradients
        to flow directly through the network during backpropagation.

        HINT: Store intermediate results to add residual connections properly
        """
        ### BEGIN SOLUTION
        # First sub-layer: Multi-head self-attention with residual connection
        # Pre-norm: LayerNorm before attention
        normed1 = self.ln1.forward(x)
        # Self-attention: MultiHeadAttention internally creates Q, K, V from input
        attention_out = self.attention.forward(normed1, mask)

        # Residual connection
        x = x + attention_out

        # Second sub-layer: MLP with residual connection
        # Pre-norm: LayerNorm before MLP
        normed2 = self.ln2.forward(x)
        mlp_out = self.mlp.forward(normed2)

        # Residual connection
        output = x + mlp_out

        return output
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters."""
        params = []
        params.extend(self.attention.parameters())
        params.extend(self.ln1.parameters())
        params.extend(self.ln2.parameters())
        params.extend(self.mlp.parameters())
        return params

# %% ../../modules/source/13_transformers/transformers_dev.ipynb 21
class GPT:
    """
    Complete GPT (Generative Pre-trained Transformer) model.

    This combines embeddings, positional encoding, multiple transformer blocks,
    and a language modeling head for text generation.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, max_seq_len=1024):
        """
        Initialize complete GPT model.

        TODO: Set up all components of the GPT architecture

        APPROACH:
        1. Token embedding layer to convert tokens to vectors
        2. Positional embedding to add position information
        3. Stack of transformer blocks (the main computation)
        4. Final layer norm and language modeling head

        GPT ARCHITECTURE:
        tokens → embedding → + pos_embedding →
                transformer_blocks → layer_norm → lm_head → logits

        EXAMPLE:
        >>> model = GPT(vocab_size=1000, embed_dim=256, num_layers=6, num_heads=8)
        >>> tokens = Tensor(np.random.randint(0, 1000, (2, 10)))  # (batch, seq)
        >>> logits = model.forward(tokens)
        >>> assert logits.shape == (2, 10, 1000)  # (batch, seq, vocab)

        HINTS:
        - Positional embeddings are learned, not fixed sinusoidal
        - Final layer norm stabilizes training
        - Language modeling head shares weights with token embedding (tie_weights)
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len

        # Token and positional embeddings
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.position_embedding = Embedding(max_seq_len, embed_dim)

        # Stack of transformer blocks
        self.blocks = []
        for _ in range(num_layers):
            block = TransformerBlock(embed_dim, num_heads)
            self.blocks.append(block)

        # Final layer normalization
        self.ln_f = LayerNorm(embed_dim)

        # Language modeling head (projects to vocabulary)
        self.lm_head = Linear(embed_dim, vocab_size, bias=False)
        ### END SOLUTION

    def forward(self, tokens):
        """
        Forward pass through GPT model.

        TODO: Implement the complete GPT forward pass

        APPROACH:
        1. Get token embeddings and positional embeddings
        2. Add them together (broadcasting handles different shapes)
        3. Pass through all transformer blocks sequentially
        4. Apply final layer norm and language modeling head

        COMPUTATION FLOW:
        tokens → embed + pos_embed → blocks → ln_f → lm_head → logits

        CAUSAL MASKING:
        For autoregressive generation, we need to prevent tokens from
        seeing future tokens. This is handled by the attention mask.

        HINT: Create position indices as range(seq_len) for positional embedding
        """
        ### BEGIN SOLUTION
        batch_size, seq_len = tokens.shape

        # Token embeddings
        token_emb = self.token_embedding.forward(tokens)

        # Positional embeddings
        positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
        pos_emb = self.position_embedding.forward(positions)

        # Combine embeddings
        x = token_emb + pos_emb

        # Create causal mask for autoregressive generation
        mask = self._create_causal_mask(seq_len)

        # Pass through transformer blocks
        for block in self.blocks:
            x = block.forward(x, mask)

        # Final layer normalization
        x = self.ln_f.forward(x)

        # Language modeling head
        logits = self.lm_head.forward(x)

        return logits
        ### END SOLUTION

    def _create_causal_mask(self, seq_len):
        """Create causal mask to prevent attending to future positions."""
        ### BEGIN SOLUTION
        # Upper triangular matrix filled with -inf
        mask = np.triu(np.ones((seq_len, seq_len)) * -np.inf, k=1)
        return Tensor(mask)
        ### END SOLUTION

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """
        Generate text autoregressively.

        TODO: Implement autoregressive text generation

        APPROACH:
        1. Start with prompt tokens
        2. For each new position:
           - Run forward pass to get logits
           - Sample next token from logits
           - Append to sequence
        3. Return generated sequence

        AUTOREGRESSIVE GENERATION:
        At each step, the model predicts the next token based on all
        previous tokens. This is how GPT generates coherent text.

        EXAMPLE:
        >>> model = GPT(vocab_size=100, embed_dim=64, num_layers=2, num_heads=4)
        >>> prompt = Tensor([[1, 2, 3]])  # Some token sequence
        >>> generated = model.generate(prompt, max_new_tokens=5)
        >>> assert generated.shape[1] == 3 + 5  # original + new tokens

        HINT: Use np.random.choice with temperature for sampling
        """
        ### BEGIN SOLUTION
        current_tokens = Tensor(prompt_tokens.data.copy())

        for _ in range(max_new_tokens):
            # Get logits for current sequence
            logits = self.forward(current_tokens)

            # Get logits for last position (next token prediction)
            last_logits = logits.data[:, -1, :]  # (batch_size, vocab_size)

            # Apply temperature scaling
            scaled_logits = last_logits / temperature

            # Convert to probabilities (softmax)
            exp_logits = np.exp(scaled_logits - np.max(scaled_logits, axis=-1, keepdims=True))
            probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

            # Sample next token
            next_token = np.array([[np.random.choice(self.vocab_size, p=probs[0])]])

            # Append to sequence
            current_tokens = Tensor(np.concatenate([current_tokens.data, next_token], axis=1))

        return current_tokens
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters."""
        params = []
        params.extend(self.token_embedding.parameters())
        params.extend(self.position_embedding.parameters())

        for block in self.blocks:
            params.extend(block.parameters())

        params.extend(self.ln_f.parameters())
        params.extend(self.lm_head.parameters())

        return params