mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-05-24 13:20:47 -05:00
1718 lines
99 KiB
HTML
1718 lines
99 KiB
HTML
|
||
<!DOCTYPE html>
|
||
|
||
|
||
<html lang="en" data-content_root="../" >
|
||
|
||
<head>
|
||
<meta charset="utf-8" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
|
||
<title>17. Memoization - Computational Reuse for Inference — Tiny🔥Torch</title>
|
||
|
||
|
||
|
||
<script data-cfasync="false">
|
||
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
|
||
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
|
||
</script>
|
||
|
||
<!-- Loaded before other Sphinx assets -->
|
||
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
|
||
|
||
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
|
||
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
|
||
|
||
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/custom.css?v=009d37f4" />
|
||
|
||
<!-- Pre-loaded scripts that we'll load fully later -->
|
||
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
|
||
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
|
||
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
|
||
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
|
||
<script src="../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
|
||
<script src="../_static/copybutton.js?v=f281be69"></script>
|
||
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
|
||
<script>let toggleHintShow = 'Click to show';</script>
|
||
<script>let toggleHintHide = 'Click to hide';</script>
|
||
<script>let toggleOpenOnPrint = 'true';</script>
|
||
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
|
||
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
|
||
<script src="../_static/design-tabs.js?v=f930bc37"></script>
|
||
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
|
||
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
|
||
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
|
||
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
|
||
<script type="module" src="https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs"></script>
|
||
<script type="module" src="https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.2.0/dist/mermaid-layout-elk.esm.min.mjs"></script>
|
||
<script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";import elkLayouts from "https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.2.0/dist/mermaid-layout-elk.esm.min.mjs";mermaid.registerLayoutLoaders(elkLayouts);mermaid.initialize({startOnLoad:false});</script>
|
||
<script src="https://cdn.jsdelivr.net/npm/d3@7.9.0/dist/d3.min.js"></script>
|
||
<script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";
|
||
|
||
const defaultStyle = document.createElement('style');
|
||
defaultStyle.textContent = `pre.mermaid {
|
||
/* Same as .mermaid-container > pre */
|
||
display: block;
|
||
width: 100%;
|
||
}
|
||
|
||
pre.mermaid > svg {
|
||
/* Same as .mermaid-container > pre > svg */
|
||
height: 500px;
|
||
width: 100%;
|
||
max-width: 100% !important;
|
||
}
|
||
`;
|
||
document.head.appendChild(defaultStyle);
|
||
|
||
const fullscreenStyle = document.createElement('style');
|
||
fullscreenStyle.textContent = `.mermaid-container {
|
||
display: flex;
|
||
flex-direction: row;
|
||
width: 100%;
|
||
}
|
||
|
||
.mermaid-container > pre {
|
||
display: block;
|
||
width: 100%;
|
||
}
|
||
|
||
.mermaid-container > pre > svg {
|
||
height: 500px;
|
||
width: 100%;
|
||
max-width: 100% !important;
|
||
}
|
||
|
||
.mermaid-fullscreen-btn {
|
||
width: 28px;
|
||
height: 28px;
|
||
background: rgba(255, 255, 255, 0.95);
|
||
border: 1px solid rgba(0, 0, 0, 0.3);
|
||
border-radius: 4px;
|
||
cursor: pointer;
|
||
display: flex;
|
||
align-items: center;
|
||
justify-content: center;
|
||
transition: all 0.2s;
|
||
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.2);
|
||
font-size: 14px;
|
||
line-height: 1;
|
||
padding: 0;
|
||
color: #333;
|
||
}
|
||
|
||
.mermaid-fullscreen-btn:hover {
|
||
opacity: 100% !important;
|
||
background: rgba(255, 255, 255, 1);
|
||
box-shadow: 0 3px 10px rgba(0, 0, 0, 0.3);
|
||
transform: scale(1.1);
|
||
}
|
||
|
||
.mermaid-fullscreen-btn.dark-theme {
|
||
background: rgba(50, 50, 50, 0.95);
|
||
border: 1px solid rgba(255, 255, 255, 0.3);
|
||
color: #e0e0e0;
|
||
}
|
||
|
||
.mermaid-fullscreen-btn.dark-theme:hover {
|
||
background: rgba(60, 60, 60, 1);
|
||
box-shadow: 0 3px 10px rgba(255, 255, 255, 0.2);
|
||
}
|
||
|
||
.mermaid-fullscreen-modal {
|
||
display: none;
|
||
position: fixed !important;
|
||
top: 0 !important;
|
||
left: 0 !important;
|
||
width: 95vw;
|
||
height: 100vh;
|
||
background: rgba(255, 255, 255, 0.98);
|
||
z-index: 9999;
|
||
padding: 20px;
|
||
overflow: auto;
|
||
}
|
||
|
||
.mermaid-fullscreen-modal.dark-theme {
|
||
background: rgba(0, 0, 0, 0.98);
|
||
}
|
||
|
||
.mermaid-fullscreen-modal.active {
|
||
display: flex;
|
||
align-items: center;
|
||
justify-content: center;
|
||
}
|
||
|
||
.mermaid-container-fullscreen {
|
||
position: relative;
|
||
width: 95vw;
|
||
height: 90vh;
|
||
max-width: 95vw;
|
||
max-height: 90vh;
|
||
background: white;
|
||
border-radius: 8px;
|
||
padding: 20px;
|
||
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
|
||
overflow: auto;
|
||
display: flex;
|
||
align-items: center;
|
||
justify-content: center;
|
||
}
|
||
|
||
.mermaid-container-fullscreen.dark-theme {
|
||
background: #1a1a1a;
|
||
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.8);
|
||
}
|
||
|
||
.mermaid-container-fullscreen pre.mermaid {
|
||
width: 100%;
|
||
height: 100%;
|
||
display: flex;
|
||
align-items: center;
|
||
justify-content: center;
|
||
}
|
||
|
||
.mermaid-container-fullscreen .mermaid svg {
|
||
height: 100% !important;
|
||
width: 100% !important;
|
||
cursor: grab;
|
||
}
|
||
|
||
.mermaid-fullscreen-close {
|
||
position: fixed !important;
|
||
top: 20px !important;
|
||
right: 20px !important;
|
||
width: 40px;
|
||
height: 40px;
|
||
background: rgba(255, 255, 255, 0.95);
|
||
border: 1px solid rgba(0, 0, 0, 0.2);
|
||
border-radius: 50%;
|
||
cursor: pointer;
|
||
z-index: 10000;
|
||
display: flex;
|
||
align-items: center;
|
||
justify-content: center;
|
||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
|
||
transition: all 0.2s;
|
||
font-size: 24px;
|
||
line-height: 1;
|
||
color: #333;
|
||
}
|
||
|
||
.mermaid-fullscreen-close:hover {
|
||
background: white;
|
||
box-shadow: 0 6px 16px rgba(0, 0, 0, 0.4);
|
||
transform: scale(1.1);
|
||
}
|
||
|
||
.mermaid-fullscreen-close.dark-theme {
|
||
background: rgba(50, 50, 50, 0.95);
|
||
border: 1px solid rgba(255, 255, 255, 0.2);
|
||
color: #e0e0e0;
|
||
}
|
||
|
||
.mermaid-fullscreen-close.dark-theme:hover {
|
||
background: rgba(60, 60, 60, 1);
|
||
box-shadow: 0 6px 16px rgba(255, 255, 255, 0.2);
|
||
}
|
||
|
||
.mermaid-fullscreen-modal .mermaid-fullscreen-btn {
|
||
display: none !important;
|
||
}`;
|
||
document.head.appendChild(fullscreenStyle);
|
||
|
||
// Detect if page has dark background
|
||
const isDarkTheme = () => {
|
||
const bgColor = window.getComputedStyle(document.body).backgroundColor;
|
||
const match = bgColor.match(/rgb\((\d+),\s*(\d+),\s*(\d+)/);
|
||
if (match) {
|
||
const r = parseInt(match[1]);
|
||
const g = parseInt(match[2]);
|
||
const b = parseInt(match[3]);
|
||
const brightness = (r * 299 + g * 587 + b * 114) / 1000;
|
||
return brightness < 128;
|
||
}
|
||
return false;
|
||
};
|
||
|
||
const load = async () => {
|
||
await mermaid.run();
|
||
|
||
const all_mermaids = document.querySelectorAll(".mermaid");
|
||
const mermaids_processed = document.querySelectorAll(".mermaid[data-processed='true']");
|
||
|
||
if ("False" === "True") {
|
||
const mermaids_to_add_zoom = -1 === -1 ? all_mermaids.length : -1;
|
||
if(mermaids_to_add_zoom > 0) {
|
||
var svgs = d3.selectAll("");
|
||
if(all_mermaids.length !== mermaids_processed.length) {
|
||
setTimeout(load, 200);
|
||
return;
|
||
} else if(svgs.size() !== mermaids_to_add_zoom) {
|
||
setTimeout(load, 200);
|
||
return;
|
||
} else {
|
||
svgs.each(function() {
|
||
var svg = d3.select(this);
|
||
svg.html("<g class='wrapper'>" + svg.html() + "</g>");
|
||
var inner = svg.select("g");
|
||
var zoom = d3.zoom().on("zoom", function(event) {
|
||
inner.attr("transform", event.transform);
|
||
});
|
||
svg.call(zoom);
|
||
});
|
||
}
|
||
}
|
||
} else if(all_mermaids.length !== mermaids_processed.length) {
|
||
// Wait for mermaid to process all diagrams
|
||
setTimeout(load, 200);
|
||
return;
|
||
}
|
||
|
||
const darkTheme = isDarkTheme();
|
||
|
||
// Stop here if not adding fullscreen capability
|
||
if ("True" !== "True") return;
|
||
|
||
const modal = document.createElement('div');
|
||
modal.className = 'mermaid-fullscreen-modal' + (darkTheme ? ' dark-theme' : '');
|
||
modal.setAttribute('role', 'dialog');
|
||
modal.setAttribute('aria-modal', 'true');
|
||
modal.setAttribute('aria-label', 'Fullscreen diagram viewer');
|
||
modal.innerHTML = `
|
||
<button class="mermaid-fullscreen-close${darkTheme ? ' dark-theme' : ''}" aria-label="Close fullscreen">✕</button>
|
||
<div class="mermaid-container-fullscreen${darkTheme ? ' dark-theme' : ''}"></div>
|
||
`;
|
||
document.body.appendChild(modal);
|
||
|
||
const modalContent = modal.querySelector('.mermaid-container-fullscreen');
|
||
const closeBtn = modal.querySelector('.mermaid-fullscreen-close');
|
||
|
||
let previousScrollOffset = [window.scrollX, window.scrollY];
|
||
|
||
const closeModal = () => {
|
||
modal.classList.remove('active');
|
||
modalContent.innerHTML = '';
|
||
document.body.style.overflow = ''
|
||
window.scrollTo({left: previousScrollOffset[0], top: previousScrollOffset[1], behavior: 'instant'});
|
||
};
|
||
|
||
closeBtn.addEventListener('click', closeModal);
|
||
modal.addEventListener('click', (e) => {
|
||
if (e.target === modal) closeModal();
|
||
});
|
||
document.addEventListener('keydown', (e) => {
|
||
if (e.key === 'Escape' && modal.classList.contains('active')) {
|
||
closeModal();
|
||
}
|
||
});
|
||
|
||
const allButtons = [];
|
||
|
||
document.querySelectorAll('.mermaid').forEach((mermaidDiv) => {
|
||
if (mermaidDiv.parentNode.classList.contains('mermaid-container') ||
|
||
mermaidDiv.closest('.mermaid-fullscreen-modal')) {
|
||
return;
|
||
}
|
||
|
||
const container = document.createElement('div');
|
||
container.className = 'mermaid-container';
|
||
mermaidDiv.parentNode.insertBefore(container, mermaidDiv);
|
||
container.appendChild(mermaidDiv);
|
||
|
||
const fullscreenBtn = document.createElement('button');
|
||
fullscreenBtn.className = 'mermaid-fullscreen-btn' + (darkTheme ? ' dark-theme' : '');
|
||
fullscreenBtn.setAttribute('aria-label', 'View diagram in fullscreen');
|
||
fullscreenBtn.textContent = '⛶';
|
||
fullscreenBtn.style.opacity = '50%';
|
||
|
||
// Calculate dynamic position based on diagram's margin and padding
|
||
const diagramStyle = window.getComputedStyle(mermaidDiv);
|
||
const marginTop = parseFloat(diagramStyle.marginTop) || 0;
|
||
const marginRight = parseFloat(diagramStyle.marginRight) || 0;
|
||
const paddingTop = parseFloat(diagramStyle.paddingTop) || 0;
|
||
const paddingRight = parseFloat(diagramStyle.paddingRight) || 0;
|
||
fullscreenBtn.style.top = `${marginTop + paddingTop + 4}px`;
|
||
fullscreenBtn.style.right = `${marginRight + paddingRight + 4}px`;
|
||
|
||
fullscreenBtn.addEventListener('click', () => {
|
||
previousScrollOffset = [window.scroll, window.scrollY];
|
||
const clone = mermaidDiv.cloneNode(true);
|
||
modalContent.innerHTML = '';
|
||
modalContent.appendChild(clone);
|
||
|
||
const svg = clone.querySelector('svg');
|
||
if (svg) {
|
||
svg.removeAttribute('width');
|
||
svg.removeAttribute('height');
|
||
svg.style.width = '100%';
|
||
svg.style.height = 'auto';
|
||
svg.style.maxWidth = '100%';
|
||
svg.style.sdisplay = 'block';
|
||
|
||
if ("False" === "True") {
|
||
setTimeout(() => {
|
||
const g = svg.querySelector('g');
|
||
if (g) {
|
||
var svgD3 = d3.select(svg);
|
||
svgD3.html("<g class='wrapper'>" + svgD3.html() + "</g>");
|
||
var inner = svgD3.select("g");
|
||
var zoom = d3.zoom().on("zoom", function(event) {
|
||
inner.attr("transform", event.transform);
|
||
});
|
||
svgD3.call(zoom);
|
||
}
|
||
}, 100);
|
||
}
|
||
}
|
||
|
||
modal.classList.add('active');
|
||
document.body.style.overflow = 'hidden';
|
||
});
|
||
|
||
container.appendChild(fullscreenBtn);
|
||
allButtons.push(fullscreenBtn);
|
||
});
|
||
|
||
// Update theme classes when theme changes
|
||
const updateTheme = () => {
|
||
const dark = isDarkTheme();
|
||
allButtons.forEach(btn => {
|
||
if (dark) {
|
||
btn.classList.add('dark-theme');
|
||
} else {
|
||
btn.classList.remove('dark-theme');
|
||
}
|
||
});
|
||
if (dark) {
|
||
modal.classList.add('dark-theme');
|
||
modalContent.classList.add('dark-theme');
|
||
closeBtn.classList.add('dark-theme');
|
||
} else {
|
||
modal.classList.remove('dark-theme');
|
||
modalContent.classList.remove('dark-theme');
|
||
closeBtn.classList.remove('dark-theme');
|
||
}
|
||
};
|
||
|
||
// Watch for theme changes
|
||
const observer = new MutationObserver(updateTheme);
|
||
observer.observe(document.documentElement, {
|
||
attributes: true,
|
||
attributeFilter: ['class', 'style', 'data-theme']
|
||
});
|
||
observer.observe(document.body, {
|
||
attributes: true,
|
||
attributeFilter: ['class', 'style']
|
||
});
|
||
};
|
||
|
||
window.addEventListener("load", load);
|
||
</script>
|
||
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/17_memoization_ABOUT';</script>
|
||
<script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
|
||
<script src="../_static/wip-banner.js?v=04a7e74d"></script>
|
||
<script src="../_static/marimo-badges.js?v=e6289128"></script>
|
||
<script src="../_static/sidebar-link.js?v=404b701b"></script>
|
||
<script src="../_static/hero-carousel.js?v=10341d2a"></script>
|
||
<script src="../_static/subscribe-modal.js?v=42919b64"></script>
|
||
<link rel="icon" href="../_static/favicon.svg"/>
|
||
<link rel="index" title="Index" href="../genindex.html" />
|
||
<link rel="search" title="Search" href="../search.html" />
|
||
<link rel="next" title="18. Acceleration - CPU Vectorization & Cache Optimization" href="18_acceleration_ABOUT.html" />
|
||
<link rel="prev" title="16. Compression - Pruning and Model Compression" href="16_compression_ABOUT.html" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||
<meta name="docsearch:language" content="en"/>
|
||
</head>
|
||
|
||
|
||
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
|
||
|
||
|
||
|
||
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
|
||
|
||
<div id="pst-scroll-pixel-helper"></div>
|
||
|
||
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
|
||
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
|
||
|
||
|
||
<input type="checkbox"
|
||
class="sidebar-toggle"
|
||
id="pst-primary-sidebar-checkbox"/>
|
||
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
|
||
|
||
<input type="checkbox"
|
||
class="sidebar-toggle"
|
||
id="pst-secondary-sidebar-checkbox"/>
|
||
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
|
||
|
||
<div class="search-button__wrapper">
|
||
<div class="search-button__overlay"></div>
|
||
<div class="search-button__search-container">
|
||
<form class="bd-search d-flex align-items-center"
|
||
action="../search.html"
|
||
method="get">
|
||
<i class="fa-solid fa-magnifying-glass"></i>
|
||
<input type="search"
|
||
class="form-control"
|
||
name="q"
|
||
id="search-input"
|
||
placeholder="Search..."
|
||
aria-label="Search..."
|
||
autocomplete="off"
|
||
autocorrect="off"
|
||
autocapitalize="off"
|
||
spellcheck="false"/>
|
||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
|
||
</form></div>
|
||
</div>
|
||
|
||
<div class="pst-async-banner-revealer d-none">
|
||
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
|
||
</div>
|
||
|
||
|
||
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
|
||
</header>
|
||
|
||
|
||
<div class="bd-container">
|
||
<div class="bd-container__inner bd-page-width">
|
||
|
||
|
||
|
||
<div class="bd-sidebar-primary bd-sidebar">
|
||
|
||
|
||
|
||
<div class="sidebar-header-items sidebar-primary__section">
|
||
|
||
|
||
|
||
|
||
</div>
|
||
|
||
<div class="sidebar-primary-items__start sidebar-primary__section">
|
||
<div class="sidebar-primary-item">
|
||
|
||
|
||
|
||
|
||
|
||
<a class="navbar-brand logo" href="../intro.html">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
|
||
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
|
||
|
||
|
||
</a></div>
|
||
<div class="sidebar-primary-item">
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="fa-solid fa-magnifying-glass"></i>
|
||
<span class="search-button__default-text">Search</span>
|
||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
|
||
</button>
|
||
`);
|
||
</script></div>
|
||
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
|
||
<div class="bd-toc-item navbar-nav active">
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
|
||
<ul class="current nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
|
||
<li class="toctree-l1 current active"><a class="current reference internal" href="#">17. Memoization</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites & Resources</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress & Data</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
|
||
</ul>
|
||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
|
||
<ul class="nav bd-sidenav">
|
||
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits & Acknowledgments</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</nav></div>
|
||
</div>
|
||
|
||
|
||
<div class="sidebar-primary-items__end sidebar-primary__section">
|
||
</div>
|
||
|
||
<div id="rtd-footer-container"></div>
|
||
|
||
|
||
</div>
|
||
|
||
<main id="main-content" class="bd-main" role="main">
|
||
|
||
|
||
|
||
<div class="sbt-scroll-pixel-helper"></div>
|
||
|
||
<div class="bd-content">
|
||
<div class="bd-article-container">
|
||
|
||
<div class="bd-header-article d-print-none">
|
||
<div class="header-article-items header-article__inner">
|
||
|
||
<div class="header-article-items__start">
|
||
|
||
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<span class="fa-solid fa-bars"></span>
|
||
</button></div>
|
||
|
||
</div>
|
||
|
||
|
||
<div class="header-article-items__end">
|
||
|
||
<div class="header-article-item">
|
||
|
||
<div class="article-header-buttons">
|
||
|
||
|
||
|
||
|
||
|
||
<div class="dropdown dropdown-download-buttons">
|
||
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
|
||
<i class="fas fa-download"></i>
|
||
</button>
|
||
<ul class="dropdown-menu">
|
||
|
||
|
||
|
||
<li><a href="../_sources/modules/17_memoization_ABOUT.md" target="_blank"
|
||
class="btn btn-sm btn-download-source-button dropdown-item"
|
||
title="Download source file"
|
||
data-bs-placement="left" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-file"></i>
|
||
</span>
|
||
<span class="btn__text-container">.md</span>
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
<li>
|
||
<button onclick="window.print()"
|
||
class="btn btn-sm btn-download-pdf-button dropdown-item"
|
||
title="Print to PDF"
|
||
data-bs-placement="left" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-file-pdf"></i>
|
||
</span>
|
||
<span class="btn__text-container">.pdf</span>
|
||
</button>
|
||
</li>
|
||
|
||
</ul>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
<button onclick="toggleFullScreen()"
|
||
class="btn btn-sm btn-fullscreen-button"
|
||
title="Fullscreen mode"
|
||
data-bs-placement="bottom" data-bs-toggle="tooltip"
|
||
>
|
||
|
||
|
||
<span class="btn__icon-container">
|
||
<i class="fas fa-expand"></i>
|
||
</span>
|
||
|
||
</button>
|
||
|
||
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
|
||
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
|
||
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
|
||
</button>
|
||
`);
|
||
</script>
|
||
|
||
|
||
<script>
|
||
document.write(`
|
||
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
|
||
</button>
|
||
`);
|
||
</script>
|
||
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||
<span class="fa-solid fa-list"></span>
|
||
</button>
|
||
</div></div>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div id="jb-print-docs-body" class="onlyprint">
|
||
<h1>17. Memoization - Computational Reuse for Inference</h1>
|
||
<!-- Table of contents -->
|
||
<div id="print-main-content">
|
||
<div id="jb-print-toc">
|
||
|
||
<div>
|
||
<h2> Contents </h2>
|
||
</div>
|
||
<nav aria-label="Page">
|
||
<ul class="visible nav section-nav flex-column">
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters">Why This Matters</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kv-cache-optimization-flow">KV Cache Optimization Flow</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-autoregressive-generation-problem">The Autoregressive Generation Problem</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-caching-solution">The Caching Solution</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-impact">Production Impact</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-speed-trade-off">Memory-Speed Trade-off</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-components">Core Components</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#kvcache-data-structure">1. KVCache Data Structure</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#non-invasive-cache-integration">2. Non-Invasive Cache Integration</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#cached-attention-logic">3. Cached Attention Logic</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-steps">Implementation Steps</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-1-design-kvcache-structure">Step 1: Design KVCache Structure</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-2-implement-cache-updates">Step 2: Implement Cache Updates</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-3-enable-non-invasive-integration">Step 3: Enable Non-Invasive Integration</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-4-implement-cached-attention-forward">Step 4: Implement Cached Attention Forward</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-5-validate-correctness">Step 5: Validate Correctness</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#why-data-instead-of-tensor-operations">Why .data Instead of Tensor Operations?</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-profiling">Inline Testing & Profiling</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-production-challenges">Real-World Production Challenges</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-optimization-patterns">Production Optimization Patterns</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mathematical-foundations">Mathematical Foundations</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#huggingface-cache-patterns-comparison">HuggingFace Cache Patterns Comparison</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#expected-speedup-by-sequence-length">Expected Speedup by Sequence Length</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-usage-by-model-size">Memory Usage by Model Size</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#throughput-impact">Throughput Impact</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#where-this-code-lives-in-the-final-package">Where This Code Lives in the Final Package</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#common-challenges-and-solutions">Common Challenges and Solutions</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-1-cache-synchronization-across-layers">Challenge 1: Cache Synchronization Across Layers</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-2-memory-overhead-for-large-models">Challenge 2: Memory Overhead for Large Models</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-3-correctness-validation">Challenge 3: Correctness Validation</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-4-integration-without-breaking-existing-code">Challenge 4: Integration Without Breaking Existing Code</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div id="searchbox"></div>
|
||
<article class="bd-article">
|
||
|
||
<section id="memoization-computational-reuse-for-inference">
|
||
<h1>17. Memoization - Computational Reuse for Inference<a class="headerlink" href="#memoization-computational-reuse-for-inference" title="Link to this heading">#</a></h1>
|
||
<p><strong>OPTIMIZATION TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 4-5 hours</p>
|
||
<section id="overview">
|
||
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
|
||
<p>Memoization is a fundamental optimization pattern: cache computational results to avoid redundant work. You’ll apply this pattern to transformers through KV (Key-Value) caching, transforming O(n²) autoregressive generation into O(n) complexity and achieving 10-15x speedup. This optimization makes production language model serving economically viable.</p>
|
||
<p>This is inference-only optimization - you’ll implement caching patterns used in every production LLM from ChatGPT to Claude to GitHub Copilot.</p>
|
||
</section>
|
||
<section id="learning-objectives">
|
||
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
|
||
<p>By the end of this module, you will be able to:</p>
|
||
<ul class="simple">
|
||
<li><p><strong>Understand Memoization Pattern</strong>: Recognize when computational reuse through caching applies to ML problems and understand the memory-speed trade-off</p></li>
|
||
<li><p><strong>Implement KVCache Structure</strong>: Build efficient cache data structures with O(1) updates, proper memory management, and multi-layer support</p></li>
|
||
<li><p><strong>Apply Caching to Transformers</strong>: Integrate KV caching into attention layers without modifying existing transformer code (non-invasive enhancement)</p></li>
|
||
<li><p><strong>Measure Performance Gains</strong>: Profile latency improvements, measure O(n²) → O(n) complexity reduction, and understand speedup characteristics</p></li>
|
||
<li><p><strong>Analyze Production Trade-offs</strong>: Calculate cache memory costs, understand cache invalidation policies, and recognize when caching justifies its overhead</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="build-use-optimize">
|
||
<h2>Build → Use → Optimize<a class="headerlink" href="#build-use-optimize" title="Link to this heading">#</a></h2>
|
||
<p>This module follows TinyTorch’s <strong>Build → Use → Optimize</strong> framework:</p>
|
||
<ol class="arabic simple">
|
||
<li><p><strong>Build</strong>: Implement KVCache data structure with efficient updates, cached attention integration, and multi-layer cache management</p></li>
|
||
<li><p><strong>Use</strong>: Apply caching to GPT text generation, measure 10-15x speedup over naive generation, and validate output correctness</p></li>
|
||
<li><p><strong>Optimize</strong>: Profile memory bandwidth bottlenecks, measure cache hit rates, and understand when memory cost exceeds latency benefit</p></li>
|
||
</ol>
|
||
</section>
|
||
<section id="why-this-matters">
|
||
<h2>Why This Matters<a class="headerlink" href="#why-this-matters" title="Link to this heading">#</a></h2>
|
||
<section id="kv-cache-optimization-flow">
|
||
<h3>KV Cache Optimization Flow<a class="headerlink" href="#kv-cache-optimization-flow" title="Link to this heading">#</a></h3>
|
||
<p>Caching stores computed keys and values, avoiding recomputation for each new token:</p>
|
||
<pre class="mermaid">
|
||
graph LR
|
||
A[Token i<br/>Compute K_i, V_i] --> B[Cache<br/>Store K_i, V_i]
|
||
B --> C[Token i+1<br/>New computation]
|
||
C --> D[Reuse<br/>K_i, V_i from cache]
|
||
D --> E[Only compute<br/>K_{i+1}, V_{i+1}]
|
||
E --> F[10-15× speedup]
|
||
|
||
style A fill:#e3f2fd
|
||
style C fill:#e3f2fd
|
||
style B fill:#f3e5f5
|
||
style D fill:#fff3e0
|
||
style E fill:#ffe0b2
|
||
style F fill:#f0fdf4
|
||
</pre><p><strong>Optimization</strong>: Compute K,V once → Cache → Reuse for all future tokens → O(n²) → O(n) complexity</p>
|
||
</section>
|
||
<section id="the-autoregressive-generation-problem">
|
||
<h3>The Autoregressive Generation Problem<a class="headerlink" href="#the-autoregressive-generation-problem" title="Link to this heading">#</a></h3>
|
||
<p>Without caching, transformer generation has quadratic complexity:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Naive Generation (O(n²) complexity):
|
||
Step 1: Generate token 1 → Compute attention for [t₀] (1 computation)
|
||
Step 2: Generate token 2 → Compute attention for [t₀, t₁] (2 computations, t₀ RECOMPUTED!)
|
||
Step 3: Generate token 3 → Compute attention for [t₀, t₁, t₂] (3 computations, t₀,t₁ RECOMPUTED!)
|
||
...
|
||
Step n: Generate token n → Compute attention for [t₀, ..., tₙ] (n computations, ALL RECOMPUTED!)
|
||
|
||
Total: 1 + 2 + 3 + ... + n = n(n+1)/2 = O(n²) complexity!
|
||
For 100 tokens: ~5,050 redundant K,V computations
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>The Key Insight</strong>: K and V matrices for previous tokens NEVER change, yet we recompute them every step!</p>
|
||
</section>
|
||
<section id="the-caching-solution">
|
||
<h3>The Caching Solution<a class="headerlink" href="#the-caching-solution" title="Link to this heading">#</a></h3>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Cached Generation (O(n) complexity):
|
||
Step 1: Compute K₁, V₁ → Cache them → Attention with cached[K₁, V₁]
|
||
Step 2: Compute K₂, V₂ → Cache them → Attention with cached[K₁, K₂, V₁, V₂] (reuse K₁, V₁!)
|
||
Step 3: Compute K₃, V₃ → Cache them → Attention with cached[K₁, K₂, K₃, V₁, V₂, V₃] (reuse all!)
|
||
|
||
Total: 1 + 1 + 1 + ... + 1 = n computations (50x reduction for n=100!)
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="production-impact">
|
||
<h3>Production Impact<a class="headerlink" href="#production-impact" title="Link to this heading">#</a></h3>
|
||
<p>KV caching is mandatory for all production LLM serving:</p>
|
||
<ul class="simple">
|
||
<li><p><strong>ChatGPT/GPT-4</strong>: Would be 50-100x slower without caching, making conversational AI economically infeasible</p></li>
|
||
<li><p><strong>Claude</strong>: Caches up to 100K tokens of context, enabling long document processing</p></li>
|
||
<li><p><strong>GitHub Copilot</strong>: Real-time code completion requires sub-100ms latency - impossible without caching</p></li>
|
||
<li><p><strong>Google Gemini</strong>: Multi-level caching (KV + intermediate layers) serves billions of requests daily</p></li>
|
||
</ul>
|
||
<p>Without KV caching, the computational cost would make these services prohibitively expensive.</p>
|
||
</section>
|
||
<section id="memory-speed-trade-off">
|
||
<h3>Memory-Speed Trade-off<a class="headerlink" href="#memory-speed-trade-off" title="Link to this heading">#</a></h3>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Traditional Approach (No Cache):
|
||
Memory: O(1) Cost: Negligible
|
||
Compute: O(n²) Cost: Prohibitive for long sequences
|
||
|
||
Cached Approach (KV Cache):
|
||
Memory: O(n × d_k) Cost: ~18MB per batch for GPT-2
|
||
Compute: O(n) Cost: 10-15x faster than naive
|
||
|
||
Trade-off Winner: Memory is cheap, compute is expensive!
|
||
Use O(n) memory to save O(n²) compute.
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="implementation-guide">
|
||
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
|
||
<section id="core-components">
|
||
<h3>Core Components<a class="headerlink" href="#core-components" title="Link to this heading">#</a></h3>
|
||
<p>You’ll implement three main components:</p>
|
||
<section id="kvcache-data-structure">
|
||
<h4>1. KVCache Data Structure<a class="headerlink" href="#kvcache-data-structure" title="Link to this heading">#</a></h4>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">KVCache</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Efficient key-value cache for autoregressive generation.</span>
|
||
|
||
<span class="sd"> Memory Layout:</span>
|
||
<span class="sd"> keys: (num_layers, batch, num_heads, seq_len, d_k)</span>
|
||
<span class="sd"> values: (num_layers, batch, num_heads, seq_len, d_v)</span>
|
||
|
||
<span class="sd"> For GPT-2 (12 layers, 12 heads, 1024 seq, 64 dims):</span>
|
||
<span class="sd"> 12 layers × 12 heads × 1024 seq × 64 dims = ~9M values</span>
|
||
<span class="sd"> At FP32 (4 bytes): ~36MB per batch item</span>
|
||
<span class="sd"> At FP16 (2 bytes): ~18MB per batch item</span>
|
||
|
||
<span class="sd"> Operations:</span>
|
||
<span class="sd"> update(layer_idx, key, value) -> None # O(1) append</span>
|
||
<span class="sd"> get(layer_idx) -> (cached_k, cached_v) # O(1) retrieval</span>
|
||
<span class="sd"> advance() -> None # Increment position</span>
|
||
<span class="sd"> reset() -> None # Clear for new sequence</span>
|
||
<span class="sd"> """</span>
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>Key Design Decisions</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Pre-allocate cache tensors to avoid dynamic resizing overhead</p></li>
|
||
<li><p>Use position counter for O(1) indexed updates (no copying)</p></li>
|
||
<li><p>Store per-layer caches to support multi-layer transformers</p></li>
|
||
<li><p>Track sequence position externally for clean separation</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="non-invasive-cache-integration">
|
||
<h4>2. Non-Invasive Cache Integration<a class="headerlink" href="#non-invasive-cache-integration" title="Link to this heading">#</a></h4>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Enable KV caching WITHOUT modifying Module 12/13 code.</span>
|
||
|
||
<span class="sd"> This demonstrates non-invasive optimization - adding capabilities</span>
|
||
<span class="sd"> to existing systems without breaking them. Similar to how Module 05</span>
|
||
<span class="sd"> uses enable_autograd() to add gradient tracking to Tensors.</span>
|
||
|
||
<span class="sd"> Approach:</span>
|
||
<span class="sd"> 1. Create KVCache sized for model architecture</span>
|
||
<span class="sd"> 2. Store cache on model as model._kv_cache</span>
|
||
<span class="sd"> 3. Wrap each attention layer's forward method with caching logic</span>
|
||
<span class="sd"> 4. Intercept attention calls to manage cache automatically</span>
|
||
|
||
<span class="sd"> This is composition + monkey-patching - a critical ML systems pattern!</span>
|
||
<span class="sd"> """</span>
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>Why Non-Invasive?</strong></p>
|
||
<ul class="simple">
|
||
<li><p>Modules 12-13 (Attention, Transformers) work unchanged</p></li>
|
||
<li><p>Module 17 ADDS optimization, doesn’t BREAK old code</p></li>
|
||
<li><p>Teaches “forward-only” systems engineering: never modify earlier modules</p></li>
|
||
<li><p>Matches how production systems layer optimizations (vLLM, HuggingFace)</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="cached-attention-logic">
|
||
<h4>3. Cached Attention Logic<a class="headerlink" href="#cached-attention-logic" title="Link to this heading">#</a></h4>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">cached_forward</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Cache-aware attention with three paths:</span>
|
||
|
||
<span class="sd"> PATH 1: Training (seq_len > 1)</span>
|
||
<span class="sd"> → Use original attention (preserve gradients)</span>
|
||
<span class="sd"> → O(n²) but needed for backpropagation</span>
|
||
|
||
<span class="sd"> PATH 2: First Token (seq_len == 1, cache empty)</span>
|
||
<span class="sd"> → Use original attention (initialize cache)</span>
|
||
<span class="sd"> → O(1) - just one token</span>
|
||
|
||
<span class="sd"> PATH 3: Cached Generation (seq_len == 1, cache populated)</span>
|
||
<span class="sd"> → Compute K,V for NEW token only</span>
|
||
<span class="sd"> → Retrieve ALL cached K,V (includes history)</span>
|
||
<span class="sd"> → Attention with cached context</span>
|
||
<span class="sd"> → O(n) - only compute new, reuse cache</span>
|
||
<span class="sd"> → THIS IS WHERE THE SPEEDUP HAPPENS!</span>
|
||
<span class="sd"> """</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="implementation-steps">
|
||
<h3>Implementation Steps<a class="headerlink" href="#implementation-steps" title="Link to this heading">#</a></h3>
|
||
<section id="step-1-design-kvcache-structure">
|
||
<h4>Step 1: Design KVCache Structure<a class="headerlink" href="#step-1-design-kvcache-structure" title="Link to this heading">#</a></h4>
|
||
<ol class="arabic simple">
|
||
<li><p>Initialize cache storage for all layers</p></li>
|
||
<li><p>Pre-allocate tensors with maximum sequence length</p></li>
|
||
<li><p>Track current sequence position (write pointer)</p></li>
|
||
<li><p>Implement update() for O(1) append operations</p></li>
|
||
<li><p>Implement get() for O(1) retrieval of valid cache portion</p></li>
|
||
</ol>
|
||
</section>
|
||
<section id="step-2-implement-cache-updates">
|
||
<h4>Step 2: Implement Cache Updates<a class="headerlink" href="#step-2-implement-cache-updates" title="Link to this heading">#</a></h4>
|
||
<ol class="arabic simple">
|
||
<li><p>Validate layer index and sequence position</p></li>
|
||
<li><p>Write new K,V to current position (indexed assignment)</p></li>
|
||
<li><p>Advance position counter after all layers processed</p></li>
|
||
<li><p>Handle batch dimension and multi-head structure</p></li>
|
||
</ol>
|
||
</section>
|
||
<section id="step-3-enable-non-invasive-integration">
|
||
<h4>Step 3: Enable Non-Invasive Integration<a class="headerlink" href="#step-3-enable-non-invasive-integration" title="Link to this heading">#</a></h4>
|
||
<ol class="arabic simple">
|
||
<li><p>Validate model has required attributes (embed_dim, num_layers, etc.)</p></li>
|
||
<li><p>Calculate head_dim from embed_dim and num_heads</p></li>
|
||
<li><p>Create KVCache instance sized for model</p></li>
|
||
<li><p>Store cache on model with model._kv_cache flag</p></li>
|
||
<li><p>Wrap each block’s attention.forward with caching logic</p></li>
|
||
</ol>
|
||
</section>
|
||
<section id="step-4-implement-cached-attention-forward">
|
||
<h4>Step 4: Implement Cached Attention Forward<a class="headerlink" href="#step-4-implement-cached-attention-forward" title="Link to this heading">#</a></h4>
|
||
<ol class="arabic simple">
|
||
<li><p>Detect path: training (seq_len > 1), first token (cache empty), or cached generation</p></li>
|
||
<li><p>For cached path: Compute Q,K,V projections for new token only</p></li>
|
||
<li><p>Reshape to multi-head format (batch, num_heads, 1, head_dim)</p></li>
|
||
<li><p>Update cache with new K,V pairs</p></li>
|
||
<li><p>Retrieve ALL cached K,V (history + new)</p></li>
|
||
<li><p>Compute attention: softmax(Q @ K^T / √d_k) @ V using NumPy (.data)</p></li>
|
||
<li><p>Apply output projection and return</p></li>
|
||
</ol>
|
||
</section>
|
||
<section id="step-5-validate-correctness">
|
||
<h4>Step 5: Validate Correctness<a class="headerlink" href="#step-5-validate-correctness" title="Link to this heading">#</a></h4>
|
||
<ol class="arabic simple">
|
||
<li><p>Test cache initialization and memory calculation</p></li>
|
||
<li><p>Verify single-token and multi-token updates</p></li>
|
||
<li><p>Validate multi-layer cache synchronization</p></li>
|
||
<li><p>Test reset functionality</p></li>
|
||
<li><p>Measure speedup vs non-cached generation</p></li>
|
||
</ol>
|
||
</section>
|
||
</section>
|
||
<section id="why-data-instead-of-tensor-operations">
|
||
<h3>Why .data Instead of Tensor Operations?<a class="headerlink" href="#why-data-instead-of-tensor-operations" title="Link to this heading">#</a></h3>
|
||
<p>In cached attention, we use NumPy via <code class="docutils literal notranslate"><span class="pre">.data</span></code> for three reasons:</p>
|
||
<ol class="arabic simple">
|
||
<li><p><strong>Explicit Intent</strong>: Makes it crystal clear this is inference-only</p>
|
||
<ul class="simple">
|
||
<li><p>Training: Uses Tensor operations → gradients tracked</p></li>
|
||
<li><p>Inference: Uses .data → no gradient overhead</p></li>
|
||
</ul>
|
||
</li>
|
||
<li><p><strong>Performance</strong>: Avoids any autograd bookkeeping</p>
|
||
<ul class="simple">
|
||
<li><p>Even small overhead matters in generation hotpath</p></li>
|
||
<li><p>Production LLMs (vLLM, llama.cpp) use similar patterns</p></li>
|
||
</ul>
|
||
</li>
|
||
<li><p><strong>Educational Clarity</strong>: Shows students the distinction</p>
|
||
<ul class="simple">
|
||
<li><p>“When do I need gradients?” (training)</p></li>
|
||
<li><p>“When can I skip them?” (inference)</p></li>
|
||
</ul>
|
||
</li>
|
||
</ol>
|
||
<p>We COULD use Tensor operations with requires_grad=False, but .data is more explicit and follows industry patterns.</p>
|
||
</section>
|
||
</section>
|
||
<section id="getting-started">
|
||
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
|
||
<section id="prerequisites">
|
||
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
|
||
<p>Ensure you understand transformers and profiling:</p>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
|
||
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch
|
||
|
||
<span class="c1"># Verify prerequisite modules</span>
|
||
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>transformers
|
||
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>profiling
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>Required Understanding</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Multi-head attention mechanism (Module 12)</p></li>
|
||
<li><p>Transformer architecture (Module 13)</p></li>
|
||
<li><p>Latency profiling techniques (Module 14)</p></li>
|
||
<li><p>O(n²) complexity of attention computation</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="development-workflow">
|
||
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
|
||
<ol class="arabic simple">
|
||
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/17_memoization/memoization_dev.ipynb</span></code></p></li>
|
||
<li><p><strong>Profile naive generation</strong>: Measure O(n²) growth in latency as sequence lengthens</p></li>
|
||
<li><p><strong>Implement KVCache class</strong>: Build data structure with update(), get(), advance(), reset()</p></li>
|
||
<li><p><strong>Test cache operations</strong>: Verify single-token, multi-token, and multi-layer caching</p></li>
|
||
<li><p><strong>Implement enable_kv_cache()</strong>: Non-invasively patch model attention layers</p></li>
|
||
<li><p><strong>Build cached attention forward</strong>: Three-path logic (training, first token, cached generation)</p></li>
|
||
<li><p><strong>Measure speedup</strong>: Profile cached vs non-cached generation, validate O(n) complexity</p></li>
|
||
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">17</span> <span class="pre">&&</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">memoization</span></code></p></li>
|
||
</ol>
|
||
</section>
|
||
</section>
|
||
<section id="testing">
|
||
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
|
||
<section id="comprehensive-test-suite">
|
||
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
|
||
<p>Run the full test suite to verify memoization functionality:</p>
|
||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
|
||
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>memoization
|
||
|
||
<span class="c1"># Direct pytest execution</span>
|
||
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>memoization<span class="w"> </span>-v
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="test-coverage-areas">
|
||
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
|
||
<ul class="simple">
|
||
<li><p>✅ <strong>KVCache Initialization</strong>: Validate cache creation, memory calculation, and initial state</p></li>
|
||
<li><p>✅ <strong>Cache Updates</strong>: Test single-token append, multi-token sequences, and O(1) update performance</p></li>
|
||
<li><p>✅ <strong>Multi-Layer Synchronization</strong>: Verify independent per-layer caches with correct indexing</p></li>
|
||
<li><p>✅ <strong>Cache Retrieval</strong>: Test get() returns only valid cached portion (up to seq_pos)</p></li>
|
||
<li><p>✅ <strong>Non-Invasive Integration</strong>: Validate enable_kv_cache() works without breaking model</p></li>
|
||
<li><p>✅ <strong>Correctness Validation</strong>: Compare cached vs non-cached outputs (should be identical)</p></li>
|
||
<li><p>✅ <strong>Performance Measurement</strong>: Measure speedup at different sequence lengths</p></li>
|
||
<li><p>✅ <strong>Memory Tracking</strong>: Calculate cache size and validate memory usage</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="inline-testing-profiling">
|
||
<h3>Inline Testing & Profiling<a class="headerlink" href="#inline-testing-profiling" title="Link to this heading">#</a></h3>
|
||
<p>The module includes comprehensive validation with performance measurement:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Unit Test: KVCache Implementation</span>
|
||
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">KVCache</span> <span class="n">Implementation</span><span class="o">...</span>
|
||
<span class="n">Cache</span> <span class="n">initialized</span><span class="p">:</span> <span class="mf">0.59</span> <span class="n">MB</span>
|
||
<span class="err">✅</span> <span class="n">Cache</span> <span class="n">initialization</span> <span class="n">successful</span>
|
||
<span class="err">✅</span> <span class="n">Append</span> <span class="ow">and</span> <span class="n">retrieval</span> <span class="n">work</span> <span class="n">correctly</span>
|
||
<span class="err">✅</span> <span class="n">Multi</span><span class="o">-</span><span class="n">layer</span> <span class="n">caching</span> <span class="n">validated</span>
|
||
<span class="err">✅</span> <span class="n">Reset</span> <span class="n">functionality</span> <span class="n">verified</span>
|
||
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">KVCache</span> <span class="err">✓</span>
|
||
|
||
<span class="c1"># Integration Test: Performance Measurement</span>
|
||
<span class="err">🔬</span> <span class="n">Profiling</span> <span class="n">Transformer</span> <span class="n">Generation</span> <span class="p">(</span><span class="n">Without</span> <span class="n">Caching</span><span class="p">):</span>
|
||
<span class="n">Seq</span> <span class="n">Len</span> <span class="o">|</span> <span class="n">Latency</span> <span class="p">(</span><span class="n">ms</span><span class="p">)</span> <span class="o">|</span> <span class="n">Growth</span>
|
||
<span class="o">---------|----------------|----------</span>
|
||
<span class="mi">10</span> <span class="o">|</span> <span class="mf">2.34</span> <span class="o">|</span> <span class="n">baseline</span>
|
||
<span class="mi">20</span> <span class="o">|</span> <span class="mf">4.89</span> <span class="o">|</span> <span class="mf">2.09</span><span class="err">×</span>
|
||
<span class="mi">40</span> <span class="o">|</span> <span class="mf">10.12</span> <span class="o">|</span> <span class="mf">2.07</span><span class="err">×</span>
|
||
<span class="mi">80</span> <span class="o">|</span> <span class="mf">21.45</span> <span class="o">|</span> <span class="mf">2.12</span><span class="err">×</span>
|
||
<span class="mi">160</span> <span class="o">|</span> <span class="mf">45.67</span> <span class="o">|</span> <span class="mf">2.13</span><span class="err">×</span>
|
||
|
||
<span class="err">💡</span> <span class="n">Key</span> <span class="n">Observations</span><span class="p">:</span>
|
||
<span class="err">•</span> <span class="n">Latency</span> <span class="n">grows</span> <span class="n">QUADRATICALLY</span> <span class="k">with</span> <span class="n">sequence</span> <span class="n">length</span>
|
||
<span class="err">•</span> <span class="n">Each</span> <span class="n">new</span> <span class="n">token</span> <span class="n">forces</span> <span class="n">recomputation</span> <span class="n">of</span> <span class="n">ALL</span> <span class="n">previous</span> <span class="n">K</span><span class="p">,</span><span class="n">V</span> <span class="n">pairs</span>
|
||
<span class="err">•</span> <span class="n">For</span> <span class="mi">160</span> <span class="n">tokens</span><span class="p">:</span> <span class="o">~</span><span class="mi">4</span><span class="err">×</span> <span class="n">time</span> <span class="n">vs</span> <span class="mi">80</span> <span class="n">tokens</span> <span class="p">(</span><span class="mi">2</span><span class="err">²</span> <span class="n">growth</span><span class="p">)</span>
|
||
|
||
<span class="err">🎯</span> <span class="n">The</span> <span class="n">Solution</span><span class="p">:</span> <span class="n">CACHE</span> <span class="n">the</span> <span class="n">K</span><span class="p">,</span><span class="n">V</span> <span class="n">values</span><span class="err">!</span> <span class="p">(</span><span class="n">That</span><span class="s1">'s memoization)</span>
|
||
<span class="err">✅</span> <span class="n">Speedup</span><span class="p">:</span> <span class="mi">10</span><span class="o">-</span><span class="mi">15</span><span class="err">×</span> <span class="k">for</span> <span class="n">typical</span> <span class="n">generation</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="manual-testing-examples">
|
||
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.perf.memoization</span><span class="w"> </span><span class="kn">import</span> <span class="n">KVCache</span><span class="p">,</span> <span class="n">enable_kv_cache</span>
|
||
|
||
<span class="c1"># Test cache with small transformer</span>
|
||
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span>
|
||
<span class="n">batch_size</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="n">max_seq_len</span><span class="o">=</span><span class="mi">128</span><span class="p">,</span>
|
||
<span class="n">num_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
|
||
<span class="n">num_heads</span><span class="o">=</span><span class="mi">8</span><span class="p">,</span>
|
||
<span class="n">head_dim</span><span class="o">=</span><span class="mi">64</span>
|
||
<span class="p">)</span>
|
||
|
||
<span class="c1"># Simulate generation loop</span>
|
||
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.tensor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>
|
||
|
||
<span class="k">for</span> <span class="n">step</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">):</span>
|
||
<span class="k">for</span> <span class="n">layer_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">4</span><span class="p">):</span>
|
||
<span class="c1"># New key-value pairs for this step</span>
|
||
<span class="n">new_k</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">64</span><span class="p">))</span>
|
||
<span class="n">new_v</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">64</span><span class="p">))</span>
|
||
|
||
<span class="c1"># Update cache (O(1) operation)</span>
|
||
<span class="n">cache</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">layer_idx</span><span class="p">,</span> <span class="n">new_k</span><span class="p">,</span> <span class="n">new_v</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Advance position after all layers</span>
|
||
<span class="n">cache</span><span class="o">.</span><span class="n">advance</span><span class="p">()</span>
|
||
|
||
<span class="c1"># Retrieve cached values</span>
|
||
<span class="n">cached_k</span><span class="p">,</span> <span class="n">cached_v</span> <span class="o">=</span> <span class="n">cache</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">layer_idx</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Cached 10 tokens: </span><span class="si">{</span><span class="n">cached_k</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> <span class="c1"># (1, 8, 10, 64)</span>
|
||
|
||
<span class="c1"># Calculate memory usage</span>
|
||
<span class="n">mem_info</span> <span class="o">=</span> <span class="n">cache</span><span class="o">.</span><span class="n">get_memory_usage</span><span class="p">()</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Cache memory: </span><span class="si">{</span><span class="n">mem_info</span><span class="p">[</span><span class="s1">'total_mb'</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> MB"</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="systems-thinking-questions">
|
||
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
|
||
<section id="real-world-production-challenges">
|
||
<h3>Real-World Production Challenges<a class="headerlink" href="#real-world-production-challenges" title="Link to this heading">#</a></h3>
|
||
<p><strong>Memory-Speed Trade-off Analysis</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>KV cache uses ~18MB per batch for GPT-2 (FP16). For batch=32, that’s 576MB.</p></li>
|
||
<li><p>On an 8GB GPU, how many concurrent users can you serve?</p></li>
|
||
<li><p>What’s the trade-off between batch size and cache size?</p></li>
|
||
<li><p>When does memory bandwidth (cache access) become the bottleneck instead of compute?</p></li>
|
||
</ul>
|
||
<p><strong>Cache Invalidation Policies</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>In multi-turn chat, when should you clear the cache?</p></li>
|
||
<li><p>What happens when context exceeds max_seq_len?</p></li>
|
||
<li><p>How do production systems like ChatGPT handle context window limits?</p></li>
|
||
<li><p>Compare eviction policies: LRU, FIFO, sliding window, importance-based</p></li>
|
||
</ul>
|
||
<p><strong>Distributed Caching for Large Models</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>For models too large for one GPU, you need tensor parallelism</p></li>
|
||
<li><p>How do you partition the KV cache across GPUs?</p></li>
|
||
<li><p>Which dimension should you shard: layers, heads, or sequence?</p></li>
|
||
<li><p>What’s the communication overhead for cache synchronization?</p></li>
|
||
</ul>
|
||
<p><strong>Quantized Caching</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Storing cache in INT8 instead of FP16 saves 50% memory</p></li>
|
||
<li><p>What’s the accuracy impact of quantized KV cache?</p></li>
|
||
<li><p>When is this trade-off worth it?</p></li>
|
||
<li><p>How does quantization error accumulate over long sequences?</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="production-optimization-patterns">
|
||
<h3>Production Optimization Patterns<a class="headerlink" href="#production-optimization-patterns" title="Link to this heading">#</a></h3>
|
||
<p><strong>Multi-Level Caching</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>What if you cache not just K,V but intermediate layer activations?</p></li>
|
||
<li><p>How does HuggingFace’s <code class="docutils literal notranslate"><span class="pre">DynamicCache</span></code> differ from static pre-allocation?</p></li>
|
||
<li><p>When should you use persistent caching (save to disk) for very long conversations?</p></li>
|
||
</ul>
|
||
<p><strong>Speculation and Prefetching</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>What if you predict the next query and pre-compute KV cache?</p></li>
|
||
<li><p>How would speculative caching improve throughput?</p></li>
|
||
<li><p>What’s the risk if speculation is wrong?</p></li>
|
||
<li><p>When does prefetching justify its overhead?</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="mathematical-foundations">
|
||
<h3>Mathematical Foundations<a class="headerlink" href="#mathematical-foundations" title="Link to this heading">#</a></h3>
|
||
<p><strong>Complexity Reduction</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Why does KV caching transform O(n²) into O(n)?</p></li>
|
||
<li><p>Calculate total operations for naive vs cached generation (n=100)</p></li>
|
||
<li><p>What’s the crossover point where caching overhead exceeds savings?</p></li>
|
||
</ul>
|
||
<p><strong>Memory Layout Optimization</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Why pre-allocate cache instead of dynamic appending?</p></li>
|
||
<li><p>How does cache contiguity affect memory bandwidth?</p></li>
|
||
<li><p>Compare row-major vs column-major cache layouts for performance</p></li>
|
||
</ul>
|
||
<p><strong>Attention Computation Analysis</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Why can we cache K,V but not Q (query)?</p></li>
|
||
<li><p>What property of autoregressive generation makes caching valid?</p></li>
|
||
<li><p>How would bidirectional attention (BERT) change caching strategy?</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="huggingface-cache-patterns-comparison">
|
||
<h3>HuggingFace Cache Patterns Comparison<a class="headerlink" href="#huggingface-cache-patterns-comparison" title="Link to this heading">#</a></h3>
|
||
<p><strong>Static vs Dynamic Cache</strong>:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch (Module 17): Static pre-allocation</span>
|
||
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="n">max_seq_len</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span> <span class="c1"># Fixed size, O(1) updates</span>
|
||
|
||
<span class="c1"># HuggingFace: Dynamic cache (DynamicCache class)</span>
|
||
<span class="n">cache</span> <span class="o">=</span> <span class="n">DynamicCache</span><span class="p">()</span> <span class="c1"># Grows as needed, more flexible but slower</span>
|
||
</pre></div>
|
||
</div>
|
||
<p><strong>When to Use Each</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p><strong>Static (TinyTorch)</strong>: Known max length, maximum performance, inference serving</p></li>
|
||
<li><p><strong>Dynamic (HuggingFace)</strong>: Variable lengths, exploration, research</p></li>
|
||
</ul>
|
||
<p><strong>Production Systems (vLLM, TGI)</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Use PagedAttention for virtual memory management of KV cache</p></li>
|
||
<li><p>Enables efficient memory sharing across requests</p></li>
|
||
<li><p>Reduces memory fragmentation for variable-length sequences</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
<section id="performance-characteristics">
|
||
<h2>Performance Characteristics<a class="headerlink" href="#performance-characteristics" title="Link to this heading">#</a></h2>
|
||
<section id="expected-speedup-by-sequence-length">
|
||
<h3>Expected Speedup by Sequence Length<a class="headerlink" href="#expected-speedup-by-sequence-length" title="Link to this heading">#</a></h3>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Speedup Characteristics (GPT-2 on CPU):
|
||
┌─────────────┬──────────────┬──────────────┬──────────┐
|
||
│ Seq Length │ No Cache │ With Cache │ Speedup │
|
||
├─────────────┼──────────────┼──────────────┼──────────┤
|
||
│ 10 tokens │ ~80 tok/s │ ~600 tok/s │ 7.5x │
|
||
│ 25 tokens │ ~40 tok/s │ ~500 tok/s │ 12.5x │
|
||
│ 50 tokens │ ~25 tok/s │ ~400 tok/s │ 16.0x │
|
||
│ 100 tokens │ ~12 tok/s │ ~200 tok/s │ 16.7x │
|
||
└─────────────┴──────────────┴──────────────┴──────────┘
|
||
|
||
Key Insight: Speedup increases with sequence length!
|
||
Why? Longer sequences = more redundant computation without cache.
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="memory-usage-by-model-size">
|
||
<h3>Memory Usage by Model Size<a class="headerlink" href="#memory-usage-by-model-size" title="Link to this heading">#</a></h3>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Cache Memory Requirements (FP16, batch_size=1):
|
||
┌──────────────┬────────┬────────┬─────────┬──────────────┐
|
||
│ Model │ Layers │ Heads │ Seq Len │ Cache Memory │
|
||
├──────────────┼────────┼────────┼─────────┼──────────────┤
|
||
│ TinyGPT │ 4 │ 4 │ 128 │ 0.5 MB │
|
||
│ GPT-2 (124M) │ 12 │ 12 │ 1024 │ 18.0 MB │
|
||
│ GPT-3 (175B) │ 96 │ 96 │ 2048 │ 4.7 GB │
|
||
└──────────────┴────────┴────────┴─────────┴──────────────┘
|
||
|
||
Formula: memory = num_layers × num_heads × max_seq_len × head_dim × 2 × 2 bytes
|
||
(2× for K and V, 2 bytes for FP16)
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="throughput-impact">
|
||
<h3>Throughput Impact<a class="headerlink" href="#throughput-impact" title="Link to this heading">#</a></h3>
|
||
<p><strong>Single Sequence Generation</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Without cache: Throughput decreases as sequence grows (O(n²) bottleneck)</p></li>
|
||
<li><p>With cache: Throughput stays relatively constant (O(n) scales well)</p></li>
|
||
</ul>
|
||
<p><strong>Batch Inference</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Cache memory scales linearly with batch size</p></li>
|
||
<li><p>Throughput increases with batching (amortize model loading)</p></li>
|
||
<li><p>Memory becomes limiting factor before compute</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
<section id="where-this-code-lives-in-the-final-package">
|
||
<h2>Where This Code Lives in the Final Package<a class="headerlink" href="#where-this-code-lives-in-the-final-package" title="Link to this heading">#</a></h2>
|
||
<p><strong>Package Export</strong>: Code exports to <code class="docutils literal notranslate"><span class="pre">tinytorch.generation.kv_cache</span></code></p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># When students install tinytorch, they import your work like this:</span>
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.perf.memoization</span><span class="w"> </span><span class="kn">import</span> <span class="n">KVCache</span><span class="p">,</span> <span class="n">enable_kv_cache</span><span class="p">,</span> <span class="n">disable_kv_cache</span>
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.nn</span><span class="w"> </span><span class="kn">import</span> <span class="n">MultiHeadAttention</span> <span class="c1"># Base class from Module 12</span>
|
||
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.transformer</span><span class="w"> </span><span class="kn">import</span> <span class="n">GPT</span> <span class="c1"># Architecture from Module 13</span>
|
||
|
||
<span class="c1"># Usage in generation:</span>
|
||
<span class="n">model</span> <span class="o">=</span> <span class="n">GPT</span><span class="p">(</span><span class="n">vocab_size</span><span class="o">=</span><span class="mi">1000</span><span class="p">,</span> <span class="n">embed_dim</span><span class="o">=</span><span class="mi">128</span><span class="p">,</span> <span class="n">num_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_heads</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
|
||
<span class="n">cache</span> <span class="o">=</span> <span class="n">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span> <span class="c1"># Non-invasively add caching</span>
|
||
|
||
<span class="c1"># Generate with caching enabled (10-15x faster!)</span>
|
||
<span class="n">output</span> <span class="o">=</span> <span class="n">generate_text</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="o">=</span><span class="s2">"Hello"</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">100</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Disable caching if needed</span>
|
||
<span class="n">disable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Your KV caching implementation becomes the foundation for efficient inference in the TinyTorch package, used by subsequent modules for text generation, chat applications, and deployment scenarios.</p>
|
||
</section>
|
||
<section id="common-challenges-and-solutions">
|
||
<h2>Common Challenges and Solutions<a class="headerlink" href="#common-challenges-and-solutions" title="Link to this heading">#</a></h2>
|
||
<section id="challenge-1-cache-synchronization-across-layers">
|
||
<h3>Challenge 1: Cache Synchronization Across Layers<a class="headerlink" href="#challenge-1-cache-synchronization-across-layers" title="Link to this heading">#</a></h3>
|
||
<p><strong>Problem</strong>: Keeping cache consistent when different layers process at different speeds or batch items have variable lengths.</p>
|
||
<p><strong>Solution</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Use layer indexing to maintain independent per-layer caches</p></li>
|
||
<li><p>Advance sequence position only after ALL layers have processed current token</p></li>
|
||
<li><p>Handle variable sequence lengths with padding and attention masks</p></li>
|
||
</ul>
|
||
<p><strong>Code Pattern</strong>:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Process all layers before advancing</span>
|
||
<span class="k">for</span> <span class="n">layer_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_layers</span><span class="p">):</span>
|
||
<span class="n">cache</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">layer_idx</span><span class="p">,</span> <span class="n">new_k</span><span class="p">,</span> <span class="n">new_v</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Now advance position (all layers synchronized)</span>
|
||
<span class="n">cache</span><span class="o">.</span><span class="n">advance</span><span class="p">()</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="challenge-2-memory-overhead-for-large-models">
|
||
<h3>Challenge 2: Memory Overhead for Large Models<a class="headerlink" href="#challenge-2-memory-overhead-for-large-models" title="Link to this heading">#</a></h3>
|
||
<p><strong>Problem</strong>: Cache memory grows with sequence length and batch size, potentially exceeding GPU memory.</p>
|
||
<p><strong>Solution</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Implement cache size limits with eviction policies (LRU, FIFO)</p></li>
|
||
<li><p>Use FP16 or INT8 quantization for cache storage (50% memory reduction)</p></li>
|
||
<li><p>Consider PagedAttention for virtual memory management</p></li>
|
||
<li><p>Tune max_seq_len to expected generation length</p></li>
|
||
</ul>
|
||
<p><strong>Memory Optimization</strong>:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># FP16 caching (2 bytes per element)</span>
|
||
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="o">...</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float16</span><span class="p">)</span> <span class="c1"># 50% memory savings</span>
|
||
|
||
<span class="c1"># INT8 caching (1 byte per element)</span>
|
||
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="o">...</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">int8</span><span class="p">)</span> <span class="c1"># 75% memory savings, accuracy trade-off</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="challenge-3-correctness-validation">
|
||
<h3>Challenge 3: Correctness Validation<a class="headerlink" href="#challenge-3-correctness-validation" title="Link to this heading">#</a></h3>
|
||
<p><strong>Problem</strong>: Cached generation must produce identical outputs to non-cached generation.</p>
|
||
<p><strong>Solution</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Compare cached vs non-cached outputs token-by-token</p></li>
|
||
<li><p>Use deterministic sampling (temperature=0) for testing</p></li>
|
||
<li><p>Validate cache retrieval returns correct sequence positions</p></li>
|
||
<li><p>Test edge cases: first token, cache full, reset</p></li>
|
||
</ul>
|
||
<p><strong>Validation Pattern</strong>:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Generate without cache (ground truth)</span>
|
||
<span class="n">output_nocache</span> <span class="o">=</span> <span class="n">generate</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">50</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Generate with cache (optimized)</span>
|
||
<span class="n">cache</span> <span class="o">=</span> <span class="n">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
|
||
<span class="n">output_cached</span> <span class="o">=</span> <span class="n">generate</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">50</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Validate identical outputs</span>
|
||
<span class="k">assert</span> <span class="n">np</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">output_nocache</span><span class="p">,</span> <span class="n">output_cached</span><span class="p">),</span> <span class="s2">"Cached output must match!"</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="challenge-4-integration-without-breaking-existing-code">
|
||
<h3>Challenge 4: Integration Without Breaking Existing Code<a class="headerlink" href="#challenge-4-integration-without-breaking-existing-code" title="Link to this heading">#</a></h3>
|
||
<p><strong>Problem</strong>: Adding caching shouldn’t require modifying Modules 12-13 (attention, transformer).</p>
|
||
<p><strong>Solution</strong>:</p>
|
||
<ul class="simple">
|
||
<li><p>Use composition + monkey-patching (wrap, don’t modify)</p></li>
|
||
<li><p>Store original forward methods before patching</p></li>
|
||
<li><p>Provide disable_kv_cache() to restore original behavior</p></li>
|
||
<li><p>Use feature flags (model._cache_enabled) for path selection</p></li>
|
||
</ul>
|
||
<p><strong>Non-Invasive Pattern</strong>:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Save original before patching</span>
|
||
<span class="n">block</span><span class="o">.</span><span class="n">_original_attention_forward</span> <span class="o">=</span> <span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span>
|
||
|
||
<span class="c1"># Patch with cached version</span>
|
||
<span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span> <span class="o">=</span> <span class="n">cached_forward</span>
|
||
|
||
<span class="c1"># Restore later if needed</span>
|
||
<span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span> <span class="o">=</span> <span class="n">block</span><span class="o">.</span><span class="n">_original_attention_forward</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
<section id="ready-to-build">
|
||
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
|
||
<p>You’re about to implement the optimization that makes production language models economically viable! KV caching is THE technique that transformed LLMs from research toys into products used by millions daily.</p>
|
||
<p>This is where theory meets practice in ML systems engineering. You’ll see firsthand how a simple idea - “don’t recompute what never changes” - can deliver 10-15x speedup and make the impossible possible.</p>
|
||
<p><strong>What makes this module special</strong>: Unlike many optimizations that require deep algorithmic changes, KV caching is conceptually simple but profoundly impactful. You’ll implement it from scratch, measure the dramatic speedup, and understand the memory-speed trade-offs that guide production deployments.</p>
|
||
<p>Understanding this optimization from first principles - implementing it yourself, profiling the speedup, analyzing the trade-offs - will give you deep insight into how production ML systems work. This is the optimization that makes ChatGPT, Claude, and GitHub Copilot possible.</p>
|
||
<p>Take your time, measure thoroughly, and enjoy building production-ready ML systems!</p>
|
||
<p>Choose your preferred way to engage with this module:</p>
|
||
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
|
||
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
|
||
<div class="sd-col sd-d-flex-row docutils">
|
||
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
|
||
<div class="sd-card-body docutils">
|
||
<div class="sd-card-title sd-font-weight-bold docutils">
|
||
🚀 Launch Binder</div>
|
||
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
|
||
</div>
|
||
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/17_memoization/memoization_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/17_memoization/memoization_dev.ipynb</span></a></div>
|
||
</div>
|
||
<div class="sd-col sd-d-flex-row docutils">
|
||
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
|
||
<div class="sd-card-body docutils">
|
||
<div class="sd-card-title sd-font-weight-bold docutils">
|
||
⚡ Open in Colab</div>
|
||
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
|
||
</div>
|
||
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb</span></a></div>
|
||
</div>
|
||
<div class="sd-col sd-d-flex-row docutils">
|
||
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
|
||
<div class="sd-card-body docutils">
|
||
<div class="sd-card-title sd-font-weight-bold docutils">
|
||
📖 View Source</div>
|
||
<p class="sd-card-text">Browse the Jupyter notebook source and understand the implementation.</p>
|
||
</div>
|
||
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb</span></a></div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
<div class="tip admonition">
|
||
<p class="admonition-title">💾 Save Your Progress</p>
|
||
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
|
||
</div>
|
||
<hr class="docutils" />
|
||
<div class="prev-next-area">
|
||
<a class="left-prev" href="../16_compression/ABOUT.html" title="previous page">← Previous Module</a>
|
||
<a class="right-next" href="../18_acceleration/ABOUT.html" title="next page">Next Module →</a>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
|
||
<script type="text/x-thebe-config">
|
||
{
|
||
requestKernel: true,
|
||
binderOptions: {
|
||
repo: "binder-examples/jupyter-stacks-datascience",
|
||
ref: "master",
|
||
},
|
||
codeMirrorConfig: {
|
||
theme: "abcdef",
|
||
mode: "python"
|
||
},
|
||
kernelOptions: {
|
||
name: "python3",
|
||
path: "./modules"
|
||
},
|
||
predefinedOutput: true
|
||
}
|
||
</script>
|
||
<script>kernelName = 'python3'</script>
|
||
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<footer class="prev-next-footer d-print-none">
|
||
|
||
<div class="prev-next-area">
|
||
<a class="left-prev"
|
||
href="16_compression_ABOUT.html"
|
||
title="previous page">
|
||
<i class="fa-solid fa-angle-left"></i>
|
||
<div class="prev-next-info">
|
||
<p class="prev-next-subtitle">previous</p>
|
||
<p class="prev-next-title">16. Compression - Pruning and Model Compression</p>
|
||
</div>
|
||
</a>
|
||
<a class="right-next"
|
||
href="18_acceleration_ABOUT.html"
|
||
title="next page">
|
||
<div class="prev-next-info">
|
||
<p class="prev-next-subtitle">next</p>
|
||
<p class="prev-next-title">18. Acceleration - CPU Vectorization & Cache Optimization</p>
|
||
</div>
|
||
<i class="fa-solid fa-angle-right"></i>
|
||
</a>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
|
||
|
||
|
||
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
|
||
|
||
|
||
<div class="sidebar-secondary-item">
|
||
<div class="page-toc tocsection onthispage">
|
||
<i class="fa-solid fa-list"></i> Contents
|
||
</div>
|
||
<nav class="bd-toc-nav page-toc">
|
||
<ul class="visible nav section-nav flex-column">
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters">Why This Matters</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kv-cache-optimization-flow">KV Cache Optimization Flow</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-autoregressive-generation-problem">The Autoregressive Generation Problem</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-caching-solution">The Caching Solution</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-impact">Production Impact</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-speed-trade-off">Memory-Speed Trade-off</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-components">Core Components</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#kvcache-data-structure">1. KVCache Data Structure</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#non-invasive-cache-integration">2. Non-Invasive Cache Integration</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#cached-attention-logic">3. Cached Attention Logic</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-steps">Implementation Steps</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-1-design-kvcache-structure">Step 1: Design KVCache Structure</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-2-implement-cache-updates">Step 2: Implement Cache Updates</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-3-enable-non-invasive-integration">Step 3: Enable Non-Invasive Integration</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-4-implement-cached-attention-forward">Step 4: Implement Cached Attention Forward</a></li>
|
||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-5-validate-correctness">Step 5: Validate Correctness</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#why-data-instead-of-tensor-operations">Why .data Instead of Tensor Operations?</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-profiling">Inline Testing & Profiling</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-production-challenges">Real-World Production Challenges</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-optimization-patterns">Production Optimization Patterns</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mathematical-foundations">Mathematical Foundations</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#huggingface-cache-patterns-comparison">HuggingFace Cache Patterns Comparison</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#expected-speedup-by-sequence-length">Expected Speedup by Sequence Length</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-usage-by-model-size">Memory Usage by Model Size</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#throughput-impact">Throughput Impact</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#where-this-code-lives-in-the-final-package">Where This Code Lives in the Final Package</a></li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#common-challenges-and-solutions">Common Challenges and Solutions</a><ul class="nav section-nav flex-column">
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-1-cache-synchronization-across-layers">Challenge 1: Cache Synchronization Across Layers</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-2-memory-overhead-for-large-models">Challenge 2: Memory Overhead for Large Models</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-3-correctness-validation">Challenge 3: Correctness Validation</a></li>
|
||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-4-integration-without-breaking-existing-code">Challenge 4: Integration Without Breaking Existing Code</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
|
||
</ul>
|
||
</nav></div>
|
||
|
||
</div></div>
|
||
|
||
|
||
</div>
|
||
<footer class="bd-footer-content">
|
||
|
||
<div class="bd-footer-content__inner container">
|
||
|
||
<div class="footer-item">
|
||
|
||
<p class="component-author">
|
||
By Prof. Vijay Janapa Reddi (Harvard University)
|
||
</p>
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
|
||
<p class="copyright">
|
||
|
||
© Copyright 2025.
|
||
<br/>
|
||
|
||
</p>
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
</div>
|
||
|
||
<div class="footer-item">
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</footer>
|
||
|
||
|
||
</main>
|
||
</div>
|
||
</div>
|
||
|
||
<!-- Scripts loaded after <body> so the DOM is not blocked -->
|
||
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
|
||
|
||
<footer class="bd-footer">
|
||
</footer>
|
||
</body>
|
||
</html> |