TinyTorch/tinytorch/generation/kv_cache.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/15_memoization/memoization_dev.ipynb.

# %% auto 0
__all__ = ['KVCache', 'enable_kv_cache', 'disable_kv_cache']

# %% ../../modules/source/15_memoization/memoization_dev.ipynb 1
import numpy as np
import time
from typing import Tuple, Optional, Dict, List

# Import TinyTorch components from previous modules
from ..core.tensor import Tensor

# %% ../../modules/source/15_memoization/memoization_dev.ipynb 7
class KVCache:
    """
    Efficient key-value cache for autoregressive generation.

    Stores K,V matrices for each transformer layer to avoid recomputation
    during sequential token generation. This is THE critical optimization
    that makes production language model serving economically viable.

    ⚠️  IMPORTANT: INFERENCE-ONLY (No Gradient Tracking)
    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    KV caching is designed ONLY for inference (generation), NOT training.
    - During generation: No gradients computed (model.eval() mode)
    - Cache operations use .data (no gradient tracking)
    - This is correct and intentional for maximum speed
    - DO NOT use caching during training (use standard forward pass)

    Architecture:
    - Pre-allocates cache tensors with maximum sequence length
    - Tracks current sequence position for efficient O(1) updates
    - Provides update() method to append new K,V pairs without copying
    - Provides get() method to retrieve cached values for attention
    - Handles multiple layers and attention heads properly

    Memory Layout:
    ```
    Layer 0: [Key_cache, Value_cache]  # Shape: (batch, num_heads, max_seq, head_dim)
    Layer 1: [Key_cache, Value_cache]
    ...
    Layer N: [Key_cache, Value_cache]
    ```

    Performance:
    - Update: O(1) - just index assignment
    - Get: O(1) - just slicing (no data copy)
    - Memory: O(num_layers × batch × heads × max_seq × head_dim)
    """

    def __init__(self, batch_size: int, max_seq_len: int, num_layers: int,
                 num_heads: int, head_dim: int):
        """
        Initialize KV cache for efficient generation.

        TODO: Set up pre-allocated cache storage for all transformer layers

        APPROACH:
        1. Store configuration parameters (batch_size, max_seq_len, etc.)
        2. Initialize sequence position counter to 0
        3. Create empty list for cache storage
        4. For each layer, pre-allocate zero-filled key and value caches
        5. Store each layer's (key_cache, value_cache) tuple in the list

        Args:
            batch_size: Number of sequences to generate simultaneously
            max_seq_len: Maximum sequence length to support
            num_layers: Number of transformer layers
            num_heads: Number of attention heads per layer
            head_dim: Dimension of each attention head

        EXAMPLE:
        >>> cache = KVCache(batch_size=2, max_seq_len=128, num_layers=4,
        ...                 num_heads=8, head_dim=64)
        >>> cache.seq_pos  # 0 (no tokens cached yet)
        >>> len(cache.caches)  # 4 (one per layer)
        >>> cache.caches[0][0].shape  # (2, 8, 128, 64) - key cache for layer 0

        HINTS:
        - Cache shape: (batch_size, num_heads, max_seq_len, head_dim)
        - Use Tensor(np.zeros(...)) to create cache tensors
        - Store caches as list of tuples: [(key_0, val_0), (key_1, val_1), ...]
        - Pre-allocation avoids dynamic resizing overhead during generation
        """
        ### BEGIN SOLUTION
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.head_dim = head_dim

        # Current sequence position (how many tokens are cached)
        self.seq_pos = 0

        # Cache storage: list of (key_cache, value_cache) tuples per layer
        self.caches = []

        for layer_idx in range(num_layers):
            # Pre-allocate cache tensors with maximum size
            # Shape: (batch_size, num_heads, max_seq_len, head_dim)
            key_cache = Tensor(np.zeros((batch_size, num_heads, max_seq_len, head_dim)))
            value_cache = Tensor(np.zeros((batch_size, num_heads, max_seq_len, head_dim)))

            self.caches.append((key_cache, value_cache))
        ### END SOLUTION

    def update(self, layer_idx: int, key: Tensor, value: Tensor) -> None:
        """
        Update cache with new key-value pairs for given layer.

        TODO: Efficiently append new K,V to cache without data copying

        APPROACH:
        1. Validate layer_idx is in range [0, num_layers-1]
        2. Validate seq_pos hasn't exceeded max_seq_len
        3. Retrieve the (key_cache, value_cache) tuple for this layer
        4. Write new key to position seq_pos in key_cache using indexed assignment
        5. Write new value to position seq_pos in value_cache using indexed assignment
        6. Note: seq_pos is advanced externally via advance() after all layers

        This is the core caching operation - efficiently append new K,V
        to the cache without recomputation. This operation is O(1) because
        it's just an indexed assignment.

        IMPORTANT: KV caching is designed for INFERENCE (generation) only,
        not training. During generation, gradients are not computed. If you
        need gradients, don't use caching (use standard forward pass instead).

        Args:
            layer_idx: Which transformer layer (0 to num_layers-1)
            key: New key tensor, shape (batch_size, num_heads, 1, head_dim)
            value: New value tensor, shape (batch_size, num_heads, 1, head_dim)

        EXAMPLE:
        >>> cache = KVCache(batch_size=1, max_seq_len=10, num_layers=2,
        ...                 num_heads=4, head_dim=64)
        >>> new_k = Tensor(np.random.randn(1, 4, 1, 64))
        >>> new_v = Tensor(np.random.randn(1, 4, 1, 64))
        >>> cache.update(layer_idx=0, key=new_k, value=new_v)
        >>> cache.seq_pos  # Still 0 (update doesn't advance position)
        >>> cache.advance()
        >>> cache.seq_pos  # Now 1

        HINTS:
        - Use slicing: cache[:, :, seq_pos:seq_pos+1, :] to write to position
        - Use .data for direct NumPy access (no gradient tracking needed)
        - Raise ValueError with helpful messages for invalid inputs
        - This is an in-place operation (modifies cache, returns None)

        Raises:
            ValueError: If layer_idx is out of range or sequence is full
        """
        ### BEGIN SOLUTION
        if layer_idx >= self.num_layers:
            raise ValueError(f"Layer index {layer_idx} >= num_layers {self.num_layers}")

        if self.seq_pos >= self.max_seq_len:
            raise ValueError(f"Sequence position {self.seq_pos} >= max_seq_len {self.max_seq_len}")

        # Get cache for this layer
        key_cache, value_cache = self.caches[layer_idx]

        # Update cache at current position (efficient O(1) write)
        # Note: We use .data here because caching is inference-only (no gradients needed)
        # This avoids gradient tracking overhead during generation
        key_cache.data[:, :, self.seq_pos:self.seq_pos+1, :] = key.data
        value_cache.data[:, :, self.seq_pos:self.seq_pos+1, :] = value.data

        # Note: seq_pos is advanced externally via advance() after all layers process
        ### END SOLUTION

    def get(self, layer_idx: int) -> Tuple[Tensor, Tensor]:
        """
        Retrieve cached key-value pairs for attention computation.

        TODO: Return only the valid cached portion for this layer

        APPROACH:
        1. Validate layer_idx is in range
        2. Retrieve the (key_cache, value_cache) tuple for this layer
        3. Calculate valid_len = seq_pos (number of tokens currently cached)
        4. Slice key_cache to get [:, :, :valid_len, :] (only filled portion)
        5. Slice value_cache to get [:, :, :valid_len, :] (only filled portion)
        6. Wrap sliced data in new Tensor objects and return

        Returns only the valid portion of the cache (up to current seq_pos).
        This is O(1) because we're just slicing NumPy arrays (view, not copy).

        IMPORTANT: Returns Tensors without gradient tracking since caching
        is inference-only. The returned tensors can be used in attention
        computation but won't propagate gradients backward.

        Args:
            layer_idx: Which transformer layer to get cache for

        Returns:
            (cached_keys, cached_values): Tensors shaped for attention
            Keys: (batch_size, num_heads, seq_pos, head_dim)
            Values: (batch_size, num_heads, seq_pos, head_dim)

        EXAMPLE:
        >>> cache = KVCache(batch_size=1, max_seq_len=100, num_layers=2,
        ...                 num_heads=4, head_dim=64)
        >>> # After processing 3 tokens
        >>> cache.seq_pos = 3
        >>> cached_k, cached_v = cache.get(layer_idx=0)
        >>> cached_k.shape  # (1, 4, 3, 64) - only first 3 positions
        >>> cached_v.shape  # (1, 4, 3, 64)

        HINTS:
        - valid_len = self.seq_pos (how many tokens have been cached so far)
        - Use slicing: cache.data[:, :, :valid_len, :] to get valid portion
        - Wrap result in Tensor() for consistency with TinyTorch API
        - If seq_pos=0, returns empty cache (shape with 0 in sequence dimension)

        Raises:
            ValueError: If layer_idx is out of range
        """
        ### BEGIN SOLUTION
        if layer_idx >= self.num_layers:
            raise ValueError(f"Layer index {layer_idx} >= num_layers {self.num_layers}")

        # Get cache for this layer
        key_cache, value_cache = self.caches[layer_idx]

        # Return only the valid portion (up to current sequence position)
        # seq_pos tracks where to write next, so we have seq_pos valid tokens
        valid_len = self.seq_pos

        # Note: Creating new Tensors from .data (no gradient tracking)
        # This is correct for inference-only caching
        cached_keys = Tensor(key_cache.data[:, :, :valid_len, :])
        cached_values = Tensor(value_cache.data[:, :, :valid_len, :])

        return cached_keys, cached_values
        ### END SOLUTION

    def advance(self) -> None:
        """
        Advance sequence position after processing current token.

        Call this after all layers have processed the current token and
        updated their caches. This moves the write pointer forward.
        """
        self.seq_pos += 1

    def reset(self) -> None:
        """
        Reset cache for new generation sequence.

        Call this when starting a new generation (new prompt).
        Resets the sequence position counter and optionally zeros cache data.
        """
        self.seq_pos = 0

        # Zero out caches for clean state (helps with debugging)
        for layer_idx in range(self.num_layers):
            key_cache, value_cache = self.caches[layer_idx]
            key_cache.data.fill(0.0)
            value_cache.data.fill(0.0)

    def get_memory_usage(self) -> Dict[str, float]:
        """
        Calculate memory usage of the cache system.

        Returns:
            Dictionary with memory statistics in MB
        """
        # Calculate size of one cache tensor
        cache_size = self.batch_size * self.num_heads * self.max_seq_len * self.head_dim
        bytes_per_float = 4  # float32

        # Each layer has key_cache + value_cache
        total_cache_tensors = self.num_layers * 2
        total_elements = cache_size * total_cache_tensors
        total_bytes = total_elements * bytes_per_float
        total_mb = total_bytes / (1024 * 1024)

        return {
            'total_mb': total_mb,
            'per_layer_mb': total_mb / self.num_layers,
            'cache_tensors': total_cache_tensors,
            'total_elements': total_elements
        }

# %% ../../modules/source/15_memoization/memoization_dev.ipynb 11
def enable_kv_cache(batch_size: int, max_seq_len: int, num_layers: int,
                    num_heads: int, head_dim: int) -> KVCache:
    """
    Create and return a KVCache instance for model generation.

    This function creates a properly sized cache for the model architecture.
    Call this before starting generation, then pass the cache to your
    generation loop.

    Args:
        batch_size: Number of sequences to generate simultaneously
        max_seq_len: Maximum sequence length to support
        num_layers: Number of transformer layers in model
        num_heads: Number of attention heads per layer
        head_dim: Dimension per attention head (usually embed_dim // num_heads)

    Returns:
        KVCache instance ready for use

    Example:
        ```python
        # Enable caching for generation
        cache = enable_kv_cache(
            batch_size=1,
            max_seq_len=100,
            num_layers=4,
            num_heads=4,
            head_dim=32
        )

        # Use in generation loop (pseudocode)
        for step in range(max_new_tokens):
            # Only process new token with cache
            logits = model.forward_cached(new_token, cache)
            next_token = sample(logits)
        ```
    """
    cache = KVCache(batch_size, max_seq_len, num_layers, num_heads, head_dim)

    print(f"⚡ KV Cache enabled:")
    print(f"   Batch size: {batch_size}")
    print(f"   Max sequence: {max_seq_len}")
    print(f"   Layers: {num_layers}")
    print(f"   Heads: {num_heads}")
    print(f"   Head dim: {head_dim}")

    mem_info = cache.get_memory_usage()
    print(f"   Memory: {mem_info['total_mb']:.2f} MB")
    print()

    return cache

# %% ../../modules/source/15_memoization/memoization_dev.ipynb 16
def enable_kv_cache(model):
    """
    Enable KV caching for a transformer model WITHOUT modifying Module 12/13 code.

    TODO: Create cache and non-invasively patch attention layers

    APPROACH:
    1. Validate model has required attributes (embed_dim, num_layers, num_heads, max_seq_len, blocks)
    2. Calculate head_dim from embed_dim and num_heads
    3. Create KVCache instance sized for this model's architecture
    4. Store cache on model as model._kv_cache and set model._cache_enabled flag
    5. For each transformer block, wrap its attention forward method with caching logic
    6. Print confirmation message with cache statistics
    7. Return the cache object

    This function demonstrates **non-invasive optimization** - adding capabilities
    to existing systems without breaking them. Similar to how Module 05 (Autograd)
    uses enable_autograd() to add gradient tracking to Tensors.

    Args:
        model: A GPT-style transformer model with:
               - model.embed_dim (int)
               - model.num_layers (int)
               - model.num_heads (int)
               - model.max_seq_len (int)
               - model.blocks (list of TransformerBlock objects)

    Returns:
        cache: KVCache object for this model

    EXAMPLE:
    >>> from tinytorch.models.transformer import GPT
    >>> model = GPT(vocab_size=100, embed_dim=128, num_layers=4, num_heads=4)
    >>> cache = enable_kv_cache(model)
    >>> hasattr(model, '_kv_cache')  # True
    >>> model._cache_enabled  # True
    >>> cache.num_layers  # 4 (matches model)

    HINTS:
    - Use hasattr() to validate model attributes exist
    - head_dim = model.embed_dim // model.num_heads
    - Store cache on model with model._kv_cache = cache
    - Set flag with model._cache_enabled = True
    - Save original forward with block._original_attention_forward
    - Use a factory function to create patched forwards (closure captures layer_idx)

    Pedagogical Note:
        This teaches students that optimizations can be LAYERED on top of
        working systems. Module 14 doesn't break Modules 12-13; it enhances them!
    """
    ### BEGIN SOLUTION
    import types

    # Validate model has required attributes
    required_attrs = ['embed_dim', 'num_layers', 'num_heads', 'max_seq_len', 'blocks']
    for attr in required_attrs:
        if not hasattr(model, attr):
            raise AttributeError(
                f"Model missing '{attr}' - enable_kv_cache() requires a GPT-style model "
                f"with {', '.join(required_attrs)}"
            )

    # Calculate head dimension
    head_dim = model.embed_dim // model.num_heads
    if model.embed_dim % model.num_heads != 0:
        raise ValueError(
            f"embed_dim ({model.embed_dim}) must be divisible by num_heads ({model.num_heads})"
        )

    # Create cache for this model
    cache = KVCache(
        batch_size=1,  # Default to single sequence; can be reset for batch inference
        max_seq_len=model.max_seq_len,
        num_layers=model.num_layers,
        num_heads=model.num_heads,
        head_dim=head_dim
    )

    # Store cache on model for easy access
    model._kv_cache = cache
    model._cache_enabled = True

    # Patch each transformer block's attention
    for layer_idx, block in enumerate(model.blocks):
        # Store original attention forward method
        if not hasattr(block, '_original_attention_forward'):
            block._original_attention_forward = block.attention.forward

        # Create cached version
        def make_cached_forward(layer_idx, original_forward, cache_obj):
            """Factory to create cached forward with correct layer_idx closure"""
            def cached_forward(x, mask=None):
                """
                Cached attention forward pass with REAL speedup!

                PATH SELECTION STRATEGY (Key to Understanding KV Caching):
                ──────────────────────────────────────────────────────────

                We have THREE possible paths through attention:

                1️⃣ TRAINING PATH (seq_len > 1):
                   - Input: Full sequence of tokens (e.g., 64 tokens)
                   - Action: Use ORIGINAL attention (no caching)
                   - Why: Need full gradient flow for backpropagation
                   - Complexity: O(n²) but that's fine for training
                   - Example: x.shape = (batch=1, seq=64, embed=128)

                2️⃣ FIRST TOKEN PATH (seq_len == 1 AND cache empty):
                   - Input: Single token (the first one in generation)
                   - Action: Use ORIGINAL attention (initialize cache)
                   - Why: Cache is empty, nothing to retrieve yet
                   - Complexity: O(1) - only one token
                   - Example: x.shape = (batch=1, seq=1, embed=128), cache.seq_pos=0

                3️⃣ CACHED GENERATION PATH (seq_len == 1 AND cache populated):
                   - Input: Single NEW token (during generation)
                   - Action: Compute K,V for new token ONLY, retrieve history from cache
                   - Why: This is where the speedup happens! O(n²) → O(n)
                   - Complexity: O(n) - only compute for new token, reuse cache
                   - Example: x.shape = (batch=1, seq=1, embed=128), cache.seq_pos=5


                WHY .data INSTEAD OF TENSOR OPERATIONS?
                ────────────────────────────────────────

                In the cached path, we use numpy via .data for three reasons:

                1. **Explicit Intent**: Makes it crystal clear this is inference-only
                   - Training: Uses Tensor operations → gradients tracked
                   - Inference: Uses .data → no gradient overhead

                2. **Performance**: Avoids any autograd bookkeeping
                   - Even if small, every bit counts in generation
                   - Production LLMs (vLLM, llama.cpp) use similar patterns

                3. **Educational Clarity**: Shows students the distinction
                   - "When do I need gradients?" (training)
                   - "When can I skip them?" (inference)

                We COULD use Tensor operations with requires_grad=False, but .data
                is more explicit and is the industry-standard pattern.


                THE O(n²) → O(n) TRANSFORMATION:
                ─────────────────────────────────

                WITHOUT Cache (Standard Attention):
                  Step 1: Process token 1  → Compute attention for 1 token  (1²  = 1 op)
                  Step 2: Process tokens 1-2 → Compute attention for 2 tokens (2²  = 4 ops)
                  Step 3: Process tokens 1-3 → Compute attention for 3 tokens (3²  = 9 ops)
                  ...
                  Step N: Process tokens 1-N → Compute attention for N tokens (N² ops)

                  Total: 1 + 4 + 9 + ... + N² = O(N³) across all steps!

                WITH Cache (Our Implementation):
                  Step 1: Process token 1 → Compute K,V for token 1, cache it      (1 op)
                  Step 2: Process token 2 → Compute K,V for token 2, retrieve 1    (2 ops)
                  Step 3: Process token 3 → Compute K,V for token 3, retrieve 1-2  (3 ops)
                  ...
                  Step N: Process token N → Compute K,V for token N, retrieve 1-(N-1) (N ops)

                  Total: 1 + 2 + 3 + ... + N = O(N²) across all steps!

                That's why we see 5-7x speedup on short sequences, and 10-15x on longer ones!
                """
                from tinytorch.core.tensor import Tensor
                import numpy as np

                seq_len = x.shape[1]

                # ═══════════════════════════════════════════════════════════════
                # PATH SELECTION: Choose between training, first token, or cached
                # ═══════════════════════════════════════════════════════════════

                # PATH 1: TRAINING (seq_len > 1)
                # ───────────────────────────────────
                # Input is a full sequence (e.g., 64 tokens during training)
                # We MUST use original attention to preserve gradient flow
                # No caching during training - we need backprop through everything
                if seq_len > 1:
                    return original_forward(x, mask)  # O(n²) but preserves gradients

                # PATH 2: FIRST TOKEN (seq_len == 1, cache empty)
                # ────────────────────────────────────────────────
                # This is the very first token in generation (cache.seq_pos == 0)
                # Cache is empty, so there's nothing to retrieve yet
                # Use original attention to process this token, which will populate cache
                if cache_obj.seq_pos == 0:
                    return original_forward(x, mask)  # O(1) - just one token

                # PATH 3: CACHED GENERATION (seq_len == 1, cache populated)
                # ──────────────────────────────────────────────────────────
                # This is a NEW token during generation (cache has history)
                # We can now use the cache for massive speedup!
                # Compute K,V for ONLY this new token, retrieve cached history

                # Get attention layer (assumes block.attention has the attention object)
                attention = block.attention

                # Step 1: Compute Q, K, V for NEW token only
                # Access the linear projection layers
                Q_new = attention.q_proj.forward(x)  # (batch, 1, embed_dim)
                K_new = attention.k_proj.forward(x)  # (batch, 1, embed_dim)
                V_new = attention.v_proj.forward(x)  # (batch, 1, embed_dim)

                # Step 2: Reshape to multi-head format
                batch_size = x.shape[0]
                num_heads = attention.num_heads
                head_dim = attention.head_dim

                # Reshape: (batch, 1, embed_dim) → (batch, num_heads, 1, head_dim)
                Q_heads = Q_new.reshape(batch_size, 1, num_heads, head_dim)
                Q_heads = Tensor(np.transpose(Q_heads.data, (0, 2, 1, 3)))  # (batch, num_heads, 1, head_dim)

                K_heads = K_new.reshape(batch_size, 1, num_heads, head_dim)
                K_heads = Tensor(np.transpose(K_heads.data, (0, 2, 1, 3)))

                V_heads = V_new.reshape(batch_size, 1, num_heads, head_dim)
                V_heads = Tensor(np.transpose(V_heads.data, (0, 2, 1, 3)))

                # Step 3: Update cache with new K, V (using .data for performance)
                cache_obj.update(layer_idx, K_heads, V_heads)

                # Step 4: Retrieve ALL cached K, V (includes history + new token)
                K_all, V_all = cache_obj.get(layer_idx)

                # Step 5: Compute attention using new Q with ALL cached K, V
                # ─────────────────────────────────────────────────────────
                # Scaled dot-product attention: softmax(Q @ K^T / sqrt(d_k)) @ V
                #
                # NOTE: We use .data (numpy arrays) here instead of Tensor operations
                # Why? This is INFERENCE-ONLY code (no gradients needed):
                # - Explicit: Makes it clear this is inference, not training
                # - Fast: Avoids autograd overhead (even if small)
                # - Standard: Production LLMs (vLLM, llama.cpp) do the same
                #
                # If this were training, we'd use Tensor operations for gradient flow.
                # But in generation (inference), .data is the right choice.

                # Q @ K^T: (batch, num_heads, 1, head_dim) @ (batch, num_heads, head_dim, seq_len)
                #        → (batch, num_heads, 1, seq_len)
                K_transposed = np.transpose(K_all.data, (0, 1, 3, 2))  # .data = numpy array
                scores = np.matmul(Q_heads.data, K_transposed)  # Pure numpy matmul

                # Scale by sqrt(head_dim)
                scores = scores / np.sqrt(head_dim)

                # Apply mask if provided (causal mask for generation)
                if mask is not None:
                    # Mask should be (1, 1, 1, seq_len) for this token
                    # In generation, we can attend to all previous tokens
                    pass  # No masking needed in generation (we see all history)

                # Softmax over key dimension
                scores_max = np.max(scores, axis=-1, keepdims=True)
                exp_scores = np.exp(scores - scores_max)
                attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

                # Apply attention weights to values
                # (batch, num_heads, 1, seq_len) @ (batch, num_heads, seq_len, head_dim)
                # → (batch, num_heads, 1, head_dim)
                attention_output = np.matmul(attention_weights, V_all.data)

                # Step 6: Reshape back and apply output projection
                # (batch, num_heads, 1, head_dim) → (batch, 1, num_heads, head_dim)
                attention_output_transposed = np.transpose(attention_output, (0, 2, 1, 3))

                # Concatenate heads: (batch, 1, num_heads * head_dim)
                concat_data = attention_output_transposed.reshape(batch_size, 1, num_heads * head_dim)
                concat_output = Tensor(concat_data)

                # Output projection
                output = attention.out_proj.forward(concat_output)

                return output

            return cached_forward

        # Patch this block's attention
        block.attention.forward = make_cached_forward(layer_idx, block._original_attention_forward, cache)

    print(f"⚡ KV Cache enabled for model!")
    print(f"   Architecture: {model.num_layers} layers × {model.num_heads} heads × {head_dim}D")
    print(f"   Memory: {cache.get_memory_usage()['total_mb']:.2f} MB")
    print(f"   Cache stored in: model._kv_cache")
    print()
    print(f"💡 To disable: call disable_kv_cache(model)")
    print()

    return cache
    ### END SOLUTION


#| export
def disable_kv_cache(model):
    """
    Disable KV caching and restore original attention behavior.

    Args:
        model: Model with caching enabled

    Example:
        ```python
        cache = enable_kv_cache(model)
        # ... do cached generation ...
        disable_kv_cache(model)  # Back to normal
        ```
    """
    if not hasattr(model, '_cache_enabled') or not model._cache_enabled:
        print("⚠️  KV cache not enabled on this model")
        return

    # Restore original attention forwards
    for block in model.blocks:
        if hasattr(block, '_original_attention_forward'):
            block.attention.forward = block._original_attention_forward

    # Clean up
    model._cache_enabled = False
    if hasattr(model, '_kv_cache'):
        delattr(model, '_kv_cache')

    print("✓ KV cache disabled, original attention restored")