Files
TinyTorch/tinytorch/generation/kv_cache.py
Vijay Janapa Reddi 96880b3133 Update tinytorch and tito with module exports
Re-exported all modules after restructuring:
- Updated _modidx.py with new module locations
- Removed outdated autogeneration headers
- Updated all core modules (tensor, autograd, layers, etc.)
- Updated optimization modules (quantization, compression, etc.)
- Updated TITO commands for new structure

Changes include:
- 24 tinytorch/ module files
- 24 tito/ command and core files
- Updated references from modules/source/ to modules/

All modules re-exported via nbdev from their new locations.
2025-11-10 19:42:03 -05:00

665 lines
30 KiB
Python
Generated
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/15_memoization/memoization_dev.ipynb.
# %% auto 0
__all__ = ['KVCache', 'enable_kv_cache', 'disable_kv_cache']
# %% ../../modules/source/15_memoization/memoization_dev.ipynb 1
import numpy as np
import time
from typing import Tuple, Optional, Dict, List
# Import TinyTorch components from previous modules
from ..core.tensor import Tensor
# %% ../../modules/source/15_memoization/memoization_dev.ipynb 7
class KVCache:
"""
Efficient key-value cache for autoregressive generation.
Stores K,V matrices for each transformer layer to avoid recomputation
during sequential token generation. This is THE critical optimization
that makes production language model serving economically viable.
⚠️ IMPORTANT: INFERENCE-ONLY (No Gradient Tracking)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
KV caching is designed ONLY for inference (generation), NOT training.
- During generation: No gradients computed (model.eval() mode)
- Cache operations use .data (no gradient tracking)
- This is correct and intentional for maximum speed
- DO NOT use caching during training (use standard forward pass)
Architecture:
- Pre-allocates cache tensors with maximum sequence length
- Tracks current sequence position for efficient O(1) updates
- Provides update() method to append new K,V pairs without copying
- Provides get() method to retrieve cached values for attention
- Handles multiple layers and attention heads properly
Memory Layout:
```
Layer 0: [Key_cache, Value_cache] # Shape: (batch, num_heads, max_seq, head_dim)
Layer 1: [Key_cache, Value_cache]
...
Layer N: [Key_cache, Value_cache]
```
Performance:
- Update: O(1) - just index assignment
- Get: O(1) - just slicing (no data copy)
- Memory: O(num_layers × batch × heads × max_seq × head_dim)
"""
def __init__(self, batch_size: int, max_seq_len: int, num_layers: int,
num_heads: int, head_dim: int):
"""
Initialize KV cache for efficient generation.
TODO: Set up pre-allocated cache storage for all transformer layers
APPROACH:
1. Store configuration parameters (batch_size, max_seq_len, etc.)
2. Initialize sequence position counter to 0
3. Create empty list for cache storage
4. For each layer, pre-allocate zero-filled key and value caches
5. Store each layer's (key_cache, value_cache) tuple in the list
Args:
batch_size: Number of sequences to generate simultaneously
max_seq_len: Maximum sequence length to support
num_layers: Number of transformer layers
num_heads: Number of attention heads per layer
head_dim: Dimension of each attention head
EXAMPLE:
>>> cache = KVCache(batch_size=2, max_seq_len=128, num_layers=4,
... num_heads=8, head_dim=64)
>>> cache.seq_pos # 0 (no tokens cached yet)
>>> len(cache.caches) # 4 (one per layer)
>>> cache.caches[0][0].shape # (2, 8, 128, 64) - key cache for layer 0
HINTS:
- Cache shape: (batch_size, num_heads, max_seq_len, head_dim)
- Use Tensor(np.zeros(...)) to create cache tensors
- Store caches as list of tuples: [(key_0, val_0), (key_1, val_1), ...]
- Pre-allocation avoids dynamic resizing overhead during generation
"""
### BEGIN SOLUTION
self.batch_size = batch_size
self.max_seq_len = max_seq_len
self.num_layers = num_layers
self.num_heads = num_heads
self.head_dim = head_dim
# Current sequence position (how many tokens are cached)
self.seq_pos = 0
# Cache storage: list of (key_cache, value_cache) tuples per layer
self.caches = []
for layer_idx in range(num_layers):
# Pre-allocate cache tensors with maximum size
# Shape: (batch_size, num_heads, max_seq_len, head_dim)
key_cache = Tensor(np.zeros((batch_size, num_heads, max_seq_len, head_dim)))
value_cache = Tensor(np.zeros((batch_size, num_heads, max_seq_len, head_dim)))
self.caches.append((key_cache, value_cache))
### END SOLUTION
def update(self, layer_idx: int, key: Tensor, value: Tensor) -> None:
"""
Update cache with new key-value pairs for given layer.
TODO: Efficiently append new K,V to cache without data copying
APPROACH:
1. Validate layer_idx is in range [0, num_layers-1]
2. Validate seq_pos hasn't exceeded max_seq_len
3. Retrieve the (key_cache, value_cache) tuple for this layer
4. Write new key to position seq_pos in key_cache using indexed assignment
5. Write new value to position seq_pos in value_cache using indexed assignment
6. Note: seq_pos is advanced externally via advance() after all layers
This is the core caching operation - efficiently append new K,V
to the cache without recomputation. This operation is O(1) because
it's just an indexed assignment.
IMPORTANT: KV caching is designed for INFERENCE (generation) only,
not training. During generation, gradients are not computed. If you
need gradients, don't use caching (use standard forward pass instead).
Args:
layer_idx: Which transformer layer (0 to num_layers-1)
key: New key tensor, shape (batch_size, num_heads, 1, head_dim)
value: New value tensor, shape (batch_size, num_heads, 1, head_dim)
EXAMPLE:
>>> cache = KVCache(batch_size=1, max_seq_len=10, num_layers=2,
... num_heads=4, head_dim=64)
>>> new_k = Tensor(np.random.randn(1, 4, 1, 64))
>>> new_v = Tensor(np.random.randn(1, 4, 1, 64))
>>> cache.update(layer_idx=0, key=new_k, value=new_v)
>>> cache.seq_pos # Still 0 (update doesn't advance position)
>>> cache.advance()
>>> cache.seq_pos # Now 1
HINTS:
- Use slicing: cache[:, :, seq_pos:seq_pos+1, :] to write to position
- Use .data for direct NumPy access (no gradient tracking needed)
- Raise ValueError with helpful messages for invalid inputs
- This is an in-place operation (modifies cache, returns None)
Raises:
ValueError: If layer_idx is out of range or sequence is full
"""
### BEGIN SOLUTION
if layer_idx >= self.num_layers:
raise ValueError(f"Layer index {layer_idx} >= num_layers {self.num_layers}")
if self.seq_pos >= self.max_seq_len:
raise ValueError(f"Sequence position {self.seq_pos} >= max_seq_len {self.max_seq_len}")
# Get cache for this layer
key_cache, value_cache = self.caches[layer_idx]
# Update cache at current position (efficient O(1) write)
# Note: We use .data here because caching is inference-only (no gradients needed)
# This avoids gradient tracking overhead during generation
key_cache.data[:, :, self.seq_pos:self.seq_pos+1, :] = key.data
value_cache.data[:, :, self.seq_pos:self.seq_pos+1, :] = value.data
# Note: seq_pos is advanced externally via advance() after all layers process
### END SOLUTION
def get(self, layer_idx: int) -> Tuple[Tensor, Tensor]:
"""
Retrieve cached key-value pairs for attention computation.
TODO: Return only the valid cached portion for this layer
APPROACH:
1. Validate layer_idx is in range
2. Retrieve the (key_cache, value_cache) tuple for this layer
3. Calculate valid_len = seq_pos (number of tokens currently cached)
4. Slice key_cache to get [:, :, :valid_len, :] (only filled portion)
5. Slice value_cache to get [:, :, :valid_len, :] (only filled portion)
6. Wrap sliced data in new Tensor objects and return
Returns only the valid portion of the cache (up to current seq_pos).
This is O(1) because we're just slicing NumPy arrays (view, not copy).
IMPORTANT: Returns Tensors without gradient tracking since caching
is inference-only. The returned tensors can be used in attention
computation but won't propagate gradients backward.
Args:
layer_idx: Which transformer layer to get cache for
Returns:
(cached_keys, cached_values): Tensors shaped for attention
Keys: (batch_size, num_heads, seq_pos, head_dim)
Values: (batch_size, num_heads, seq_pos, head_dim)
EXAMPLE:
>>> cache = KVCache(batch_size=1, max_seq_len=100, num_layers=2,
... num_heads=4, head_dim=64)
>>> # After processing 3 tokens
>>> cache.seq_pos = 3
>>> cached_k, cached_v = cache.get(layer_idx=0)
>>> cached_k.shape # (1, 4, 3, 64) - only first 3 positions
>>> cached_v.shape # (1, 4, 3, 64)
HINTS:
- valid_len = self.seq_pos (how many tokens have been cached so far)
- Use slicing: cache.data[:, :, :valid_len, :] to get valid portion
- Wrap result in Tensor() for consistency with TinyTorch API
- If seq_pos=0, returns empty cache (shape with 0 in sequence dimension)
Raises:
ValueError: If layer_idx is out of range
"""
### BEGIN SOLUTION
if layer_idx >= self.num_layers:
raise ValueError(f"Layer index {layer_idx} >= num_layers {self.num_layers}")
# Get cache for this layer
key_cache, value_cache = self.caches[layer_idx]
# Return only the valid portion (up to current sequence position)
# seq_pos tracks where to write next, so we have seq_pos valid tokens
valid_len = self.seq_pos
# Note: Creating new Tensors from .data (no gradient tracking)
# This is correct for inference-only caching
cached_keys = Tensor(key_cache.data[:, :, :valid_len, :])
cached_values = Tensor(value_cache.data[:, :, :valid_len, :])
return cached_keys, cached_values
### END SOLUTION
def advance(self) -> None:
"""
Advance sequence position after processing current token.
Call this after all layers have processed the current token and
updated their caches. This moves the write pointer forward.
"""
self.seq_pos += 1
def reset(self) -> None:
"""
Reset cache for new generation sequence.
Call this when starting a new generation (new prompt).
Resets the sequence position counter and optionally zeros cache data.
"""
self.seq_pos = 0
# Zero out caches for clean state (helps with debugging)
for layer_idx in range(self.num_layers):
key_cache, value_cache = self.caches[layer_idx]
key_cache.data.fill(0.0)
value_cache.data.fill(0.0)
def get_memory_usage(self) -> Dict[str, float]:
"""
Calculate memory usage of the cache system.
Returns:
Dictionary with memory statistics in MB
"""
# Calculate size of one cache tensor
cache_size = self.batch_size * self.num_heads * self.max_seq_len * self.head_dim
bytes_per_float = 4 # float32
# Each layer has key_cache + value_cache
total_cache_tensors = self.num_layers * 2
total_elements = cache_size * total_cache_tensors
total_bytes = total_elements * bytes_per_float
total_mb = total_bytes / (1024 * 1024)
return {
'total_mb': total_mb,
'per_layer_mb': total_mb / self.num_layers,
'cache_tensors': total_cache_tensors,
'total_elements': total_elements
}
# %% ../../modules/source/15_memoization/memoization_dev.ipynb 11
def enable_kv_cache(batch_size: int, max_seq_len: int, num_layers: int,
num_heads: int, head_dim: int) -> KVCache:
"""
Create and return a KVCache instance for model generation.
This function creates a properly sized cache for the model architecture.
Call this before starting generation, then pass the cache to your
generation loop.
Args:
batch_size: Number of sequences to generate simultaneously
max_seq_len: Maximum sequence length to support
num_layers: Number of transformer layers in model
num_heads: Number of attention heads per layer
head_dim: Dimension per attention head (usually embed_dim // num_heads)
Returns:
KVCache instance ready for use
Example:
```python
# Enable caching for generation
cache = enable_kv_cache(
batch_size=1,
max_seq_len=100,
num_layers=4,
num_heads=4,
head_dim=32
)
# Use in generation loop (pseudocode)
for step in range(max_new_tokens):
# Only process new token with cache
logits = model.forward_cached(new_token, cache)
next_token = sample(logits)
```
"""
cache = KVCache(batch_size, max_seq_len, num_layers, num_heads, head_dim)
print(f"⚡ KV Cache enabled:")
print(f" Batch size: {batch_size}")
print(f" Max sequence: {max_seq_len}")
print(f" Layers: {num_layers}")
print(f" Heads: {num_heads}")
print(f" Head dim: {head_dim}")
mem_info = cache.get_memory_usage()
print(f" Memory: {mem_info['total_mb']:.2f} MB")
print()
return cache
# %% ../../modules/source/15_memoization/memoization_dev.ipynb 16
def enable_kv_cache(model):
"""
Enable KV caching for a transformer model WITHOUT modifying Module 12/13 code.
TODO: Create cache and non-invasively patch attention layers
APPROACH:
1. Validate model has required attributes (embed_dim, num_layers, num_heads, max_seq_len, blocks)
2. Calculate head_dim from embed_dim and num_heads
3. Create KVCache instance sized for this model's architecture
4. Store cache on model as model._kv_cache and set model._cache_enabled flag
5. For each transformer block, wrap its attention forward method with caching logic
6. Print confirmation message with cache statistics
7. Return the cache object
This function demonstrates **non-invasive optimization** - adding capabilities
to existing systems without breaking them. Similar to how Module 05 (Autograd)
uses enable_autograd() to add gradient tracking to Tensors.
Args:
model: A GPT-style transformer model with:
- model.embed_dim (int)
- model.num_layers (int)
- model.num_heads (int)
- model.max_seq_len (int)
- model.blocks (list of TransformerBlock objects)
Returns:
cache: KVCache object for this model
EXAMPLE:
>>> from tinytorch.models.transformer import GPT
>>> model = GPT(vocab_size=100, embed_dim=128, num_layers=4, num_heads=4)
>>> cache = enable_kv_cache(model)
>>> hasattr(model, '_kv_cache') # True
>>> model._cache_enabled # True
>>> cache.num_layers # 4 (matches model)
HINTS:
- Use hasattr() to validate model attributes exist
- head_dim = model.embed_dim // model.num_heads
- Store cache on model with model._kv_cache = cache
- Set flag with model._cache_enabled = True
- Save original forward with block._original_attention_forward
- Use a factory function to create patched forwards (closure captures layer_idx)
Pedagogical Note:
This teaches students that optimizations can be LAYERED on top of
working systems. Module 14 doesn't break Modules 12-13; it enhances them!
"""
### BEGIN SOLUTION
import types
# Validate model has required attributes
required_attrs = ['embed_dim', 'num_layers', 'num_heads', 'max_seq_len', 'blocks']
for attr in required_attrs:
if not hasattr(model, attr):
raise AttributeError(
f"Model missing '{attr}' - enable_kv_cache() requires a GPT-style model "
f"with {', '.join(required_attrs)}"
)
# Calculate head dimension
head_dim = model.embed_dim // model.num_heads
if model.embed_dim % model.num_heads != 0:
raise ValueError(
f"embed_dim ({model.embed_dim}) must be divisible by num_heads ({model.num_heads})"
)
# Create cache for this model
cache = KVCache(
batch_size=1, # Default to single sequence; can be reset for batch inference
max_seq_len=model.max_seq_len,
num_layers=model.num_layers,
num_heads=model.num_heads,
head_dim=head_dim
)
# Store cache on model for easy access
model._kv_cache = cache
model._cache_enabled = True
# Patch each transformer block's attention
for layer_idx, block in enumerate(model.blocks):
# Store original attention forward method
if not hasattr(block, '_original_attention_forward'):
block._original_attention_forward = block.attention.forward
# Create cached version
def make_cached_forward(layer_idx, original_forward, cache_obj):
"""Factory to create cached forward with correct layer_idx closure"""
def cached_forward(x, mask=None):
"""
Cached attention forward pass with REAL speedup!
PATH SELECTION STRATEGY (Key to Understanding KV Caching):
──────────────────────────────────────────────────────────
We have THREE possible paths through attention:
1⃣ TRAINING PATH (seq_len > 1):
- Input: Full sequence of tokens (e.g., 64 tokens)
- Action: Use ORIGINAL attention (no caching)
- Why: Need full gradient flow for backpropagation
- Complexity: O(n²) but that's fine for training
- Example: x.shape = (batch=1, seq=64, embed=128)
2⃣ FIRST TOKEN PATH (seq_len == 1 AND cache empty):
- Input: Single token (the first one in generation)
- Action: Use ORIGINAL attention (initialize cache)
- Why: Cache is empty, nothing to retrieve yet
- Complexity: O(1) - only one token
- Example: x.shape = (batch=1, seq=1, embed=128), cache.seq_pos=0
3⃣ CACHED GENERATION PATH (seq_len == 1 AND cache populated):
- Input: Single NEW token (during generation)
- Action: Compute K,V for new token ONLY, retrieve history from cache
- Why: This is where the speedup happens! O(n²) → O(n)
- Complexity: O(n) - only compute for new token, reuse cache
- Example: x.shape = (batch=1, seq=1, embed=128), cache.seq_pos=5
WHY .data INSTEAD OF TENSOR OPERATIONS?
────────────────────────────────────────
In the cached path, we use numpy via .data for three reasons:
1. **Explicit Intent**: Makes it crystal clear this is inference-only
- Training: Uses Tensor operations → gradients tracked
- Inference: Uses .data → no gradient overhead
2. **Performance**: Avoids any autograd bookkeeping
- Even if small, every bit counts in generation
- Production LLMs (vLLM, llama.cpp) use similar patterns
3. **Educational Clarity**: Shows students the distinction
- "When do I need gradients?" (training)
- "When can I skip them?" (inference)
We COULD use Tensor operations with requires_grad=False, but .data
is more explicit and is the industry-standard pattern.
THE O(n²) → O(n) TRANSFORMATION:
─────────────────────────────────
WITHOUT Cache (Standard Attention):
Step 1: Process token 1 → Compute attention for 1 token (1² = 1 op)
Step 2: Process tokens 1-2 → Compute attention for 2 tokens (2² = 4 ops)
Step 3: Process tokens 1-3 → Compute attention for 3 tokens (3² = 9 ops)
...
Step N: Process tokens 1-N → Compute attention for N tokens (N² ops)
Total: 1 + 4 + 9 + ... + N² = O(N³) across all steps!
WITH Cache (Our Implementation):
Step 1: Process token 1 → Compute K,V for token 1, cache it (1 op)
Step 2: Process token 2 → Compute K,V for token 2, retrieve 1 (2 ops)
Step 3: Process token 3 → Compute K,V for token 3, retrieve 1-2 (3 ops)
...
Step N: Process token N → Compute K,V for token N, retrieve 1-(N-1) (N ops)
Total: 1 + 2 + 3 + ... + N = O(N²) across all steps!
That's why we see 5-7x speedup on short sequences, and 10-15x on longer ones!
"""
from tinytorch.core.tensor import Tensor
import numpy as np
seq_len = x.shape[1]
# ═══════════════════════════════════════════════════════════════
# PATH SELECTION: Choose between training, first token, or cached
# ═══════════════════════════════════════════════════════════════
# PATH 1: TRAINING (seq_len > 1)
# ───────────────────────────────────
# Input is a full sequence (e.g., 64 tokens during training)
# We MUST use original attention to preserve gradient flow
# No caching during training - we need backprop through everything
if seq_len > 1:
return original_forward(x, mask) # O(n²) but preserves gradients
# PATH 2: FIRST TOKEN (seq_len == 1, cache empty)
# ────────────────────────────────────────────────
# This is the very first token in generation (cache.seq_pos == 0)
# Cache is empty, so there's nothing to retrieve yet
# Use original attention to process this token, which will populate cache
if cache_obj.seq_pos == 0:
return original_forward(x, mask) # O(1) - just one token
# PATH 3: CACHED GENERATION (seq_len == 1, cache populated)
# ──────────────────────────────────────────────────────────
# This is a NEW token during generation (cache has history)
# We can now use the cache for massive speedup!
# Compute K,V for ONLY this new token, retrieve cached history
# Get attention layer (assumes block.attention has the attention object)
attention = block.attention
# Step 1: Compute Q, K, V for NEW token only
# Access the linear projection layers
Q_new = attention.q_proj.forward(x) # (batch, 1, embed_dim)
K_new = attention.k_proj.forward(x) # (batch, 1, embed_dim)
V_new = attention.v_proj.forward(x) # (batch, 1, embed_dim)
# Step 2: Reshape to multi-head format
batch_size = x.shape[0]
num_heads = attention.num_heads
head_dim = attention.head_dim
# Reshape: (batch, 1, embed_dim) → (batch, num_heads, 1, head_dim)
Q_heads = Q_new.reshape(batch_size, 1, num_heads, head_dim)
Q_heads = Tensor(np.transpose(Q_heads.data, (0, 2, 1, 3))) # (batch, num_heads, 1, head_dim)
K_heads = K_new.reshape(batch_size, 1, num_heads, head_dim)
K_heads = Tensor(np.transpose(K_heads.data, (0, 2, 1, 3)))
V_heads = V_new.reshape(batch_size, 1, num_heads, head_dim)
V_heads = Tensor(np.transpose(V_heads.data, (0, 2, 1, 3)))
# Step 3: Update cache with new K, V (using .data for performance)
cache_obj.update(layer_idx, K_heads, V_heads)
# Step 4: Retrieve ALL cached K, V (includes history + new token)
K_all, V_all = cache_obj.get(layer_idx)
# Step 5: Compute attention using new Q with ALL cached K, V
# ─────────────────────────────────────────────────────────
# Scaled dot-product attention: softmax(Q @ K^T / sqrt(d_k)) @ V
#
# NOTE: We use .data (numpy arrays) here instead of Tensor operations
# Why? This is INFERENCE-ONLY code (no gradients needed):
# - Explicit: Makes it clear this is inference, not training
# - Fast: Avoids autograd overhead (even if small)
# - Standard: Production LLMs (vLLM, llama.cpp) do the same
#
# If this were training, we'd use Tensor operations for gradient flow.
# But in generation (inference), .data is the right choice.
# Q @ K^T: (batch, num_heads, 1, head_dim) @ (batch, num_heads, head_dim, seq_len)
# → (batch, num_heads, 1, seq_len)
K_transposed = np.transpose(K_all.data, (0, 1, 3, 2)) # .data = numpy array
scores = np.matmul(Q_heads.data, K_transposed) # Pure numpy matmul
# Scale by sqrt(head_dim)
scores = scores / np.sqrt(head_dim)
# Apply mask if provided (causal mask for generation)
if mask is not None:
# Mask should be (1, 1, 1, seq_len) for this token
# In generation, we can attend to all previous tokens
pass # No masking needed in generation (we see all history)
# Softmax over key dimension
scores_max = np.max(scores, axis=-1, keepdims=True)
exp_scores = np.exp(scores - scores_max)
attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
# Apply attention weights to values
# (batch, num_heads, 1, seq_len) @ (batch, num_heads, seq_len, head_dim)
# → (batch, num_heads, 1, head_dim)
attention_output = np.matmul(attention_weights, V_all.data)
# Step 6: Reshape back and apply output projection
# (batch, num_heads, 1, head_dim) → (batch, 1, num_heads, head_dim)
attention_output_transposed = np.transpose(attention_output, (0, 2, 1, 3))
# Concatenate heads: (batch, 1, num_heads * head_dim)
concat_data = attention_output_transposed.reshape(batch_size, 1, num_heads * head_dim)
concat_output = Tensor(concat_data)
# Output projection
output = attention.out_proj.forward(concat_output)
return output
return cached_forward
# Patch this block's attention
block.attention.forward = make_cached_forward(layer_idx, block._original_attention_forward, cache)
print(f"⚡ KV Cache enabled for model!")
print(f" Architecture: {model.num_layers} layers × {model.num_heads} heads × {head_dim}D")
print(f" Memory: {cache.get_memory_usage()['total_mb']:.2f} MB")
print(f" Cache stored in: model._kv_cache")
print()
print(f"💡 To disable: call disable_kv_cache(model)")
print()
return cache
### END SOLUTION
#| export
def disable_kv_cache(model):
"""
Disable KV caching and restore original attention behavior.
Args:
model: Model with caching enabled
Example:
```python
cache = enable_kv_cache(model)
# ... do cached generation ...
disable_kv_cache(model) # Back to normal
```
"""
if not hasattr(model, '_cache_enabled') or not model._cache_enabled:
print("⚠️ KV cache not enabled on this model")
return
# Restore original attention forwards
for block in model.blocks:
if hasattr(block, '_original_attention_forward'):
block.attention.forward = block._original_attention_forward
# Clean up
model._cache_enabled = False
if hasattr(model, '_kv_cache'):
delattr(model, '_kv_cache')
print("✓ KV cache disabled, original attention restored")