Add jupytext to requirements and export Module 14

requirements.txt updates:
- Added jupytext>=1.16.0 (required for tito export)
- Added nbformat>=5.10.0 (jupytext dependency)
- New section: Development Tools (Required for tito export)

Module 14 export:
- Successfully exported kvcaching_dev.py to tinytorch/generation/kv_cache.py
- Generated kvcaching_dev.ipynb (21 cells: 9 code, 12 markdown)
- KVCache class, enable_kv_cache(), disable_kv_cache() now in package

Auto-generated updates:
- Added DO NOT EDIT warnings to 8 exported files
- Updated _modidx.py with Module 14 exports
- Protected core files from manual editing

Export now works with: tito export 14_kvcaching
Students can import: from tinytorch.generation.kv_cache import enable_kv_cache
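
For quick reference, a minimal usage sketch of the exported API, assembled from the docstring examples in the diff below. The GPT constructor arguments (and its default max_seq_len) are illustrative, and the patched attention currently passes through to the original forward, as the educational notes in the code explain:

```python
# Minimal sketch of the Module 14 exports (values mirror the docstring examples).
import numpy as np
from tinytorch.core.tensor import Tensor
from tinytorch.generation.kv_cache import KVCache, enable_kv_cache, disable_kv_cache
from tinytorch.models.transformer import GPT

# Direct KVCache use: one pre-allocated (batch, heads, max_seq, head_dim)
# key/value pair per layer; update() writes at seq_pos, advance() moves it.
cache = KVCache(batch_size=1, max_seq_len=10, num_layers=2, num_heads=4, head_dim=64)
k = Tensor(np.random.randn(1, 4, 1, 64))
v = Tensor(np.random.randn(1, 4, 1, 64))
for layer in range(cache.num_layers):
    cache.update(layer_idx=layer, key=k, value=v)
cache.advance()                        # one token now cached in every layer
keys, values = cache.get(layer_idx=0)  # shapes: (1, 4, 1, 64)

# Model-level toggle: enable_kv_cache(model) patches each block's attention
# non-invasively and attaches the cache as model._kv_cache.
model = GPT(vocab_size=100, embed_dim=128, num_layers=4, num_heads=4)
model_cache = enable_kv_cache(model)   # prints architecture + memory stats
print(model_cache.get_memory_usage()['total_mb'])  # pre-allocated size in MB
disable_kv_cache(model)                # restores the original attention forwards
```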
Author: Vijay Janapa Reddi
Date: 2025-11-05 19:10:52 -05:00
Parent: 824ac691b2
Commit: 4b861d982f
10 changed files with 1277 additions and 1122 deletions

File diff suppressed because it is too large.

requirements.txt

@@ -21,6 +21,14 @@ PyYAML>=6.0
 pytest>=8.0.0
 pytest-cov>=4.0.0
+
+# ============================================================================
+# Development Tools (Required for tito export)
+# ============================================================================
+# Jupytext - Convert .py files to .ipynb for nbdev
+jupytext>=1.16.0
+nbformat>=5.10.0
+
 # ============================================================================
 # Optional Dependencies (Uncomment if needed)
 # ============================================================================

tinytorch/_modidx.py (generated)

@@ -1,3 +1,19 @@
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║ 🚨 CRITICAL WARNING 🚨 ║
+# ║ AUTOGENERATED! DO NOT EDIT! ║
+# ║ ║
+# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
+# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
+# ║ ║
+# ║ ✅ TO EDIT: modules/source/[unknown]/[unknown]_dev.py ║
+# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
+# ║ ║
+# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
+# ║ Editing it directly may break module functionality and training. ║
+# ║ ║
+# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
+# ║ happens! The tinytorch/ directory is just the compiled output. ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
 # Autogenerated by nbdev
 d = { 'settings': { 'branch': 'main',
@@ -268,6 +284,24 @@ d = { 'settings': { 'branch': 'main',
                                                             'tinytorch/data/loader.py'),
             'tinytorch.data.loader.TensorDataset.__len__': ( '08_dataloader/dataloader_dev.html#tensordataset.__len__',
                                                              'tinytorch/data/loader.py')},
+            'tinytorch.generation.kv_cache': { 'tinytorch.generation.kv_cache.KVCache': ( '14_kvcaching/kvcaching_dev.html#kvcache',
+                                                                                          'tinytorch/generation/kv_cache.py'),
+            'tinytorch.generation.kv_cache.KVCache.__init__': ( '14_kvcaching/kvcaching_dev.html#kvcache.__init__',
+                                                                'tinytorch/generation/kv_cache.py'),
+            'tinytorch.generation.kv_cache.KVCache.advance': ( '14_kvcaching/kvcaching_dev.html#kvcache.advance',
+                                                               'tinytorch/generation/kv_cache.py'),
+            'tinytorch.generation.kv_cache.KVCache.get': ( '14_kvcaching/kvcaching_dev.html#kvcache.get',
+                                                           'tinytorch/generation/kv_cache.py'),
+            'tinytorch.generation.kv_cache.KVCache.get_memory_usage': ( '14_kvcaching/kvcaching_dev.html#kvcache.get_memory_usage',
+                                                                        'tinytorch/generation/kv_cache.py'),
+            'tinytorch.generation.kv_cache.KVCache.reset': ( '14_kvcaching/kvcaching_dev.html#kvcache.reset',
+                                                             'tinytorch/generation/kv_cache.py'),
+            'tinytorch.generation.kv_cache.KVCache.update': ( '14_kvcaching/kvcaching_dev.html#kvcache.update',
+                                                              'tinytorch/generation/kv_cache.py'),
+            'tinytorch.generation.kv_cache.disable_kv_cache': ( '14_kvcaching/kvcaching_dev.html#disable_kv_cache',
+                                                                'tinytorch/generation/kv_cache.py'),
+            'tinytorch.generation.kv_cache.enable_kv_cache': ( '14_kvcaching/kvcaching_dev.html#enable_kv_cache',
+                                                               'tinytorch/generation/kv_cache.py')},
 'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt',
                                                                         'tinytorch/models/transformer.py'),
     'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__',
                                                    'tinytorch/models/transformer.py'),

@@ -1,5 +1,19 @@
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/12_attention/attention_dev.ipynb.
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║ 🚨 CRITICAL WARNING 🚨 ║
+# ║ AUTOGENERATED! DO NOT EDIT! ║
+# ║ ║
+# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
+# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
+# ║ ║
+# ║ ✅ TO EDIT: modules/source/07_attention/attention_dev.py ║
+# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
+# ║ ║
+# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
+# ║ Editing it directly may break module functionality and training. ║
+# ║ ║
+# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
+# ║ happens! The tinytorch/ directory is just the compiled output. ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
 __all__ = ['scaled_dot_product_attention', 'MultiHeadAttention']

@@ -1,5 +1,19 @@
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/05_autograd/autograd_dev.ipynb.
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║ 🚨 CRITICAL WARNING 🚨 ║
+# ║ AUTOGENERATED! DO NOT EDIT! ║
+# ║ ║
+# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
+# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
+# ║ ║
+# ║ ✅ TO EDIT: modules/source/09_autograd/autograd_dev.py ║
+# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
+# ║ ║
+# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
+# ║ Editing it directly may break module functionality and training. ║
+# ║ ║
+# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
+# ║ happens! The tinytorch/ directory is just the compiled output. ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
 __all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'SumBackward',
            'ReshapeBackward', 'EmbeddingBackward', 'SqrtBackward', 'MeanBackward', 'ReLUBackward', 'GELUBackward',

@@ -1,5 +1,19 @@
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/01_tensor/tensor_dev.ipynb.
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║ 🚨 CRITICAL WARNING 🚨 ║
+# ║ AUTOGENERATED! DO NOT EDIT! ║
+# ║ ║
+# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
+# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
+# ║ ║
+# ║ ✅ TO EDIT: modules/source/02_tensor/tensor_dev.py ║
+# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
+# ║ ║
+# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
+# ║ Editing it directly may break module functionality and training. ║
+# ║ ║
+# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
+# ║ happens! The tinytorch/ directory is just the compiled output. ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
 __all__ = ['Tensor']

tinytorch/generation/kv_cache.py

@@ -1,16 +1,31 @@
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/14_kvcaching/kvcaching_dev.py (unless otherwise specified).
-__all__ = ['KVCache', 'enable_kv_cache']
-# Cell
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║ 🚨 CRITICAL WARNING 🚨 ║
+# ║ AUTOGENERATED! DO NOT EDIT! ║
+# ║ ║
+# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
+# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
+# ║ ║
+# ║ ✅ TO EDIT: modules/source/XX_kv_cache/kv_cache_dev.py ║
+# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
+# ║ ║
+# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
+# ║ Editing it directly may break module functionality and training. ║
+# ║ ║
+# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
+# ║ happens! The tinytorch/ directory is just the compiled output. ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
+# %% auto 0
+__all__ = ['KVCache', 'enable_kv_cache', 'disable_kv_cache']
+
+# %% ../../modules/source/14_kvcaching/kvcaching_dev.ipynb 1
 import numpy as np
 import time
 from typing import Tuple, Optional, Dict, List

 # Import TinyTorch components from previous modules
-from tinytorch.core.tensor import Tensor
+from ..core.tensor import Tensor

-# Cell
+# %% ../../modules/source/14_kvcaching/kvcaching_dev.ipynb 5
 class KVCache:
     """
     Efficient key-value cache for autoregressive generation.
@@ -48,113 +63,192 @@ class KVCache:
     - Memory: O(num_layers × batch × heads × max_seq × head_dim)
     """

     def __init__(self, batch_size: int, max_seq_len: int, num_layers: int,
                  num_heads: int, head_dim: int):
         """
         Initialize KV cache for efficient generation.
+
+        TODO: Set up pre-allocated cache storage for all transformer layers
+
+        APPROACH:
+        1. Store configuration parameters (batch_size, max_seq_len, etc.)
+        2. Initialize sequence position counter to 0
+        3. Create empty list for cache storage
+        4. For each layer, pre-allocate zero-filled key and value caches
+        5. Store each layer's (key_cache, value_cache) tuple in the list

         Args:
             batch_size: Number of sequences to generate simultaneously
             max_seq_len: Maximum sequence length to support
             num_layers: Number of transformer layers
             num_heads: Number of attention heads per layer
             head_dim: Dimension of each attention head
+
+        EXAMPLE:
+        >>> cache = KVCache(batch_size=2, max_seq_len=128, num_layers=4,
+        ...                 num_heads=8, head_dim=64)
+        >>> cache.seq_pos             # 0 (no tokens cached yet)
+        >>> len(cache.caches)         # 4 (one per layer)
+        >>> cache.caches[0][0].shape  # (2, 8, 128, 64) - key cache for layer 0
+
+        HINTS:
+        - Cache shape: (batch_size, num_heads, max_seq_len, head_dim)
+        - Use Tensor(np.zeros(...)) to create cache tensors
+        - Store caches as list of tuples: [(key_0, val_0), (key_1, val_1), ...]
+        - Pre-allocation avoids dynamic resizing overhead during generation
         """
+        ### BEGIN SOLUTION
         self.batch_size = batch_size
         self.max_seq_len = max_seq_len
         self.num_layers = num_layers
         self.num_heads = num_heads
         self.head_dim = head_dim

         # Current sequence position (how many tokens are cached)
         self.seq_pos = 0

         # Cache storage: list of (key_cache, value_cache) tuples per layer
         self.caches = []
         for layer_idx in range(num_layers):
             # Pre-allocate cache tensors with maximum size
             # Shape: (batch_size, num_heads, max_seq_len, head_dim)
             key_cache = Tensor(np.zeros((batch_size, num_heads, max_seq_len, head_dim)))
             value_cache = Tensor(np.zeros((batch_size, num_heads, max_seq_len, head_dim)))
             self.caches.append((key_cache, value_cache))
+        ### END SOLUTION

     def update(self, layer_idx: int, key: Tensor, value: Tensor) -> None:
         """
         Update cache with new key-value pairs for given layer.
+
+        TODO: Efficiently append new K,V to cache without data copying
+
+        APPROACH:
+        1. Validate layer_idx is in range [0, num_layers-1]
+        2. Validate seq_pos hasn't exceeded max_seq_len
+        3. Retrieve the (key_cache, value_cache) tuple for this layer
+        4. Write new key to position seq_pos in key_cache using indexed assignment
+        5. Write new value to position seq_pos in value_cache using indexed assignment
+        6. Note: seq_pos is advanced externally via advance() after all layers

         This is the core caching operation - efficiently append new K,V
         to the cache without recomputation. This operation is O(1) because
         it's just an indexed assignment.

         IMPORTANT: KV caching is designed for INFERENCE (generation) only,
         not training. During generation, gradients are not computed. If you
         need gradients, don't use caching (use standard forward pass instead).

         Args:
             layer_idx: Which transformer layer (0 to num_layers-1)
             key: New key tensor, shape (batch_size, num_heads, 1, head_dim)
             value: New value tensor, shape (batch_size, num_heads, 1, head_dim)
+
+        EXAMPLE:
+        >>> cache = KVCache(batch_size=1, max_seq_len=10, num_layers=2,
+        ...                 num_heads=4, head_dim=64)
+        >>> new_k = Tensor(np.random.randn(1, 4, 1, 64))
+        >>> new_v = Tensor(np.random.randn(1, 4, 1, 64))
+        >>> cache.update(layer_idx=0, key=new_k, value=new_v)
+        >>> cache.seq_pos  # Still 0 (update doesn't advance position)
+        >>> cache.advance()
+        >>> cache.seq_pos  # Now 1
+
+        HINTS:
+        - Use slicing: cache[:, :, seq_pos:seq_pos+1, :] to write to position
+        - Use .data for direct NumPy access (no gradient tracking needed)
+        - Raise ValueError with helpful messages for invalid inputs
+        - This is an in-place operation (modifies cache, returns None)

         Raises:
             ValueError: If layer_idx is out of range or sequence is full
         """
+        ### BEGIN SOLUTION
         if layer_idx >= self.num_layers:
             raise ValueError(f"Layer index {layer_idx} >= num_layers {self.num_layers}")
         if self.seq_pos >= self.max_seq_len:
             raise ValueError(f"Sequence position {self.seq_pos} >= max_seq_len {self.max_seq_len}")

         # Get cache for this layer
         key_cache, value_cache = self.caches[layer_idx]

         # Update cache at current position (efficient O(1) write)
         # Note: We use .data here because caching is inference-only (no gradients needed)
         # This avoids gradient tracking overhead during generation
         key_cache.data[:, :, self.seq_pos:self.seq_pos+1, :] = key.data
         value_cache.data[:, :, self.seq_pos:self.seq_pos+1, :] = value.data

         # Note: seq_pos is advanced externally via advance() after all layers process
+        ### END SOLUTION

     def get(self, layer_idx: int) -> Tuple[Tensor, Tensor]:
         """
         Retrieve cached key-value pairs for attention computation.
+
+        TODO: Return only the valid cached portion for this layer
+
+        APPROACH:
+        1. Validate layer_idx is in range
+        2. Retrieve the (key_cache, value_cache) tuple for this layer
+        3. Calculate valid_len = seq_pos (number of tokens currently cached)
+        4. Slice key_cache to get [:, :, :valid_len, :] (only filled portion)
+        5. Slice value_cache to get [:, :, :valid_len, :] (only filled portion)
+        6. Wrap sliced data in new Tensor objects and return

         Returns only the valid portion of the cache (up to current seq_pos).
         This is O(1) because we're just slicing NumPy arrays (view, not copy).

         IMPORTANT: Returns Tensors without gradient tracking since caching
         is inference-only. The returned tensors can be used in attention
         computation but won't propagate gradients backward.

         Args:
             layer_idx: Which transformer layer to get cache for

         Returns:
             (cached_keys, cached_values): Tensors shaped for attention
                 Keys: (batch_size, num_heads, seq_pos, head_dim)
                 Values: (batch_size, num_heads, seq_pos, head_dim)
+
+        EXAMPLE:
+        >>> cache = KVCache(batch_size=1, max_seq_len=100, num_layers=2,
+        ...                 num_heads=4, head_dim=64)
+        >>> # After processing 3 tokens
+        >>> cache.seq_pos = 3
+        >>> cached_k, cached_v = cache.get(layer_idx=0)
+        >>> cached_k.shape  # (1, 4, 3, 64) - only first 3 positions
+        >>> cached_v.shape  # (1, 4, 3, 64)
+
+        HINTS:
+        - valid_len = self.seq_pos (how many tokens have been cached so far)
+        - Use slicing: cache.data[:, :, :valid_len, :] to get valid portion
+        - Wrap result in Tensor() for consistency with TinyTorch API
+        - If seq_pos=0, returns empty cache (shape with 0 in sequence dimension)

         Raises:
             ValueError: If layer_idx is out of range
         """
+        ### BEGIN SOLUTION
         if layer_idx >= self.num_layers:
             raise ValueError(f"Layer index {layer_idx} >= num_layers {self.num_layers}")

         # Get cache for this layer
         key_cache, value_cache = self.caches[layer_idx]

         # Return only the valid portion (up to current sequence position)
         # seq_pos tracks where to write next, so we have seq_pos valid tokens
         valid_len = self.seq_pos

         # Note: Creating new Tensors from .data (no gradient tracking)
         # This is correct for inference-only caching
         cached_keys = Tensor(key_cache.data[:, :, :valid_len, :])
         cached_values = Tensor(value_cache.data[:, :, :valid_len, :])

         return cached_keys, cached_values
+        ### END SOLUTION

     def advance(self) -> None:
         """
@@ -204,7 +298,7 @@ class KVCache:
             'total_elements': total_elements
         }

-# Cell
+# %% ../../modules/source/14_kvcaching/kvcaching_dev.ipynb 9
 def enable_kv_cache(batch_size: int, max_seq_len: int, num_layers: int,
                     num_heads: int, head_dim: int) -> KVCache:
     """
@@ -257,3 +351,159 @@ def enable_kv_cache(batch_size: int, max_seq_len: int, num_layers: int,
     return cache
+
+# %% ../../modules/source/14_kvcaching/kvcaching_dev.ipynb 14
+def enable_kv_cache(model):
+    """
+    Enable KV caching for a transformer model WITHOUT modifying Module 12/13 code.
+
+    TODO: Create cache and non-invasively patch attention layers
+
+    APPROACH:
+    1. Validate model has required attributes (embed_dim, num_layers, num_heads, max_seq_len, blocks)
+    2. Calculate head_dim from embed_dim and num_heads
+    3. Create KVCache instance sized for this model's architecture
+    4. Store cache on model as model._kv_cache and set model._cache_enabled flag
+    5. For each transformer block, wrap its attention forward method with caching logic
+    6. Print confirmation message with cache statistics
+    7. Return the cache object
+
+    This function demonstrates **non-invasive optimization** - adding capabilities
+    to existing systems without breaking them. Similar to how Module 05 (Autograd)
+    uses enable_autograd() to add gradient tracking to Tensors.
+
+    Args:
+        model: A GPT-style transformer model with:
+            - model.embed_dim (int)
+            - model.num_layers (int)
+            - model.num_heads (int)
+            - model.max_seq_len (int)
+            - model.blocks (list of TransformerBlock objects)
+
+    Returns:
+        cache: KVCache object for this model
+
+    EXAMPLE:
+    >>> from tinytorch.models.transformer import GPT
+    >>> model = GPT(vocab_size=100, embed_dim=128, num_layers=4, num_heads=4)
+    >>> cache = enable_kv_cache(model)
+    >>> hasattr(model, '_kv_cache')  # True
+    >>> model._cache_enabled         # True
+    >>> cache.num_layers             # 4 (matches model)
+
+    HINTS:
+    - Use hasattr() to validate model attributes exist
+    - head_dim = model.embed_dim // model.num_heads
+    - Store cache on model with model._kv_cache = cache
+    - Set flag with model._cache_enabled = True
+    - Save original forward with block._original_attention_forward
+    - Use a factory function to create patched forwards (closure captures layer_idx)
+
+    Pedagogical Note:
+        This teaches students that optimizations can be LAYERED on top of
+        working systems. Module 14 doesn't break Modules 12-13; it enhances them!
+    """
+    ### BEGIN SOLUTION
+    import types
+
+    # Validate model has required attributes
+    required_attrs = ['embed_dim', 'num_layers', 'num_heads', 'max_seq_len', 'blocks']
+    for attr in required_attrs:
+        if not hasattr(model, attr):
+            raise AttributeError(
+                f"Model missing '{attr}' - enable_kv_cache() requires a GPT-style model "
+                f"with {', '.join(required_attrs)}"
+            )
+
+    # Calculate head dimension
+    head_dim = model.embed_dim // model.num_heads
+    if model.embed_dim % model.num_heads != 0:
+        raise ValueError(
+            f"embed_dim ({model.embed_dim}) must be divisible by num_heads ({model.num_heads})"
+        )
+
+    # Create cache for this model
+    cache = KVCache(
+        batch_size=1,  # Default to single sequence; can be reset for batch inference
+        max_seq_len=model.max_seq_len,
+        num_layers=model.num_layers,
+        num_heads=model.num_heads,
+        head_dim=head_dim
+    )
+
+    # Store cache on model for easy access
+    model._kv_cache = cache
+    model._cache_enabled = True
+
+    # Patch each transformer block's attention
+    for layer_idx, block in enumerate(model.blocks):
+        # Store original attention forward method
+        if not hasattr(block, '_original_attention_forward'):
+            block._original_attention_forward = block.attention.forward
+
+        # Create cached version
+        def make_cached_forward(layer_idx, original_forward):
+            """Factory to create cached forward with correct layer_idx closure"""
+            def cached_forward(x):
+                """
+                Cached attention forward pass.
+
+                EDUCATIONAL NOTE: In a production implementation, this would:
+                1. Check if we're generating (single new token) vs training (full sequence)
+                2. For generation: only compute K,V for new token, retrieve history from cache
+                3. For training: use original uncached path
+
+                For TinyTorch simplicity, we demonstrate the concept without full implementation.
+                The cache is created and tracked, showing students the architecture pattern.
+                """
+                # In training: use original path (no caching during backprop!)
+                # In generation: this is where we'd use cache
+                # For now, pass through to original to maintain correctness
+                return original_forward(x)
+            return cached_forward
+
+        # Patch this block's attention
+        block.attention.forward = make_cached_forward(layer_idx, block._original_attention_forward)
+
+    print(f"⚡ KV Cache enabled for model!")
+    print(f"   Architecture: {model.num_layers} layers × {model.num_heads} heads × {head_dim}D")
+    print(f"   Memory: {cache.get_memory_usage()['total_mb']:.2f} MB")
+    print(f"   Cache stored in: model._kv_cache")
+    print()
+    print(f"💡 To disable: call disable_kv_cache(model)")
+    print()
+
+    return cache
+    ### END SOLUTION
+
+#| export
+def disable_kv_cache(model):
+    """
+    Disable KV caching and restore original attention behavior.
+
+    Args:
+        model: Model with caching enabled
+
+    Example:
+        ```python
+        cache = enable_kv_cache(model)
+        # ... do cached generation ...
+        disable_kv_cache(model)  # Back to normal
+        ```
+    """
+    if not hasattr(model, '_cache_enabled') or not model._cache_enabled:
+        print("⚠️ KV cache not enabled on this model")
+        return
+
+    # Restore original attention forwards
+    for block in model.blocks:
+        if hasattr(block, '_original_attention_forward'):
+            block.attention.forward = block._original_attention_forward
+
+    # Clean up
+    model._cache_enabled = False
+    if hasattr(model, '_kv_cache'):
+        delattr(model, '_kv_cache')
+
+    print("✓ KV cache disabled, original attention restored")

tinytorch/models/transformer.py

@@ -1,5 +1,19 @@
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║ 🚨 CRITICAL WARNING 🚨 ║
+# ║ AUTOGENERATED! DO NOT EDIT! ║
+# ║ ║
+# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
+# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
+# ║ ║
+# ║ ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py ║
+# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
+# ║ ║
+# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
+# ║ Editing it directly may break module functionality and training. ║
+# ║ ║
+# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
+# ║ happens! The tinytorch/ directory is just the compiled output. ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
 __all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']

@@ -1,5 +1,19 @@
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/11_embeddings/embeddings_dev.ipynb.
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║ 🚨 CRITICAL WARNING 🚨 ║
+# ║ AUTOGENERATED! DO NOT EDIT! ║
+# ║ ║
+# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
+# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
+# ║ ║
+# ║ ✅ TO EDIT: modules/source/XX_embeddings/embeddings_dev.py ║
+# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
+# ║ ║
+# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
+# ║ Editing it directly may break module functionality and training. ║
+# ║ ║
+# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
+# ║ happens! The tinytorch/ directory is just the compiled output. ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
 __all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']

@@ -1,5 +1,19 @@
-# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/10_tokenization/tokenization_dev.ipynb.
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║ 🚨 CRITICAL WARNING 🚨 ║
+# ║ AUTOGENERATED! DO NOT EDIT! ║
+# ║ ║
+# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
+# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
+# ║ ║
+# ║ ✅ TO EDIT: modules/source/XX_tokenization/tokenization_dev.py ║
+# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
+# ║ ║
+# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
+# ║ Editing it directly may break module functionality and training. ║
+# ║ ║
+# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
+# ║ happens! The tinytorch/ directory is just the compiled output. ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
 __all__ = ['Tokenizer', 'CharTokenizer', 'BPETokenizer']