Files
TinyTorch/modules/17_memoization_ABOUT.html
2025-12-05 00:52:38 +00:00

1718 lines
99 KiB
HTML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>17. Memoization - Computational Reuse for Inference &#8212; Tiny🔥Torch</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
<link rel="stylesheet" type="text/css" href="../_static/custom.css?v=009d37f4" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=f281be69"></script>
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="../_static/design-tabs.js?v=f930bc37"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script type="module" src="https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs"></script>
<script type="module" src="https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.2.0/dist/mermaid-layout-elk.esm.min.mjs"></script>
<script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";import elkLayouts from "https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.2.0/dist/mermaid-layout-elk.esm.min.mjs";mermaid.registerLayoutLoaders(elkLayouts);mermaid.initialize({startOnLoad:false});</script>
<script src="https://cdn.jsdelivr.net/npm/d3@7.9.0/dist/d3.min.js"></script>
<script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";
const defaultStyle = document.createElement('style');
defaultStyle.textContent = `pre.mermaid {
/* Same as .mermaid-container > pre */
display: block;
width: 100%;
}
pre.mermaid > svg {
/* Same as .mermaid-container > pre > svg */
height: 500px;
width: 100%;
max-width: 100% !important;
}
`;
document.head.appendChild(defaultStyle);
const fullscreenStyle = document.createElement('style');
fullscreenStyle.textContent = `.mermaid-container {
display: flex;
flex-direction: row;
width: 100%;
}
.mermaid-container > pre {
display: block;
width: 100%;
}
.mermaid-container > pre > svg {
height: 500px;
width: 100%;
max-width: 100% !important;
}
.mermaid-fullscreen-btn {
width: 28px;
height: 28px;
background: rgba(255, 255, 255, 0.95);
border: 1px solid rgba(0, 0, 0, 0.3);
border-radius: 4px;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
transition: all 0.2s;
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.2);
font-size: 14px;
line-height: 1;
padding: 0;
color: #333;
}
.mermaid-fullscreen-btn:hover {
opacity: 100% !important;
background: rgba(255, 255, 255, 1);
box-shadow: 0 3px 10px rgba(0, 0, 0, 0.3);
transform: scale(1.1);
}
.mermaid-fullscreen-btn.dark-theme {
background: rgba(50, 50, 50, 0.95);
border: 1px solid rgba(255, 255, 255, 0.3);
color: #e0e0e0;
}
.mermaid-fullscreen-btn.dark-theme:hover {
background: rgba(60, 60, 60, 1);
box-shadow: 0 3px 10px rgba(255, 255, 255, 0.2);
}
.mermaid-fullscreen-modal {
display: none;
position: fixed !important;
top: 0 !important;
left: 0 !important;
width: 95vw;
height: 100vh;
background: rgba(255, 255, 255, 0.98);
z-index: 9999;
padding: 20px;
overflow: auto;
}
.mermaid-fullscreen-modal.dark-theme {
background: rgba(0, 0, 0, 0.98);
}
.mermaid-fullscreen-modal.active {
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen {
position: relative;
width: 95vw;
height: 90vh;
max-width: 95vw;
max-height: 90vh;
background: white;
border-radius: 8px;
padding: 20px;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
overflow: auto;
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen.dark-theme {
background: #1a1a1a;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.8);
}
.mermaid-container-fullscreen pre.mermaid {
width: 100%;
height: 100%;
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen .mermaid svg {
height: 100% !important;
width: 100% !important;
cursor: grab;
}
.mermaid-fullscreen-close {
position: fixed !important;
top: 20px !important;
right: 20px !important;
width: 40px;
height: 40px;
background: rgba(255, 255, 255, 0.95);
border: 1px solid rgba(0, 0, 0, 0.2);
border-radius: 50%;
cursor: pointer;
z-index: 10000;
display: flex;
align-items: center;
justify-content: center;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
transition: all 0.2s;
font-size: 24px;
line-height: 1;
color: #333;
}
.mermaid-fullscreen-close:hover {
background: white;
box-shadow: 0 6px 16px rgba(0, 0, 0, 0.4);
transform: scale(1.1);
}
.mermaid-fullscreen-close.dark-theme {
background: rgba(50, 50, 50, 0.95);
border: 1px solid rgba(255, 255, 255, 0.2);
color: #e0e0e0;
}
.mermaid-fullscreen-close.dark-theme:hover {
background: rgba(60, 60, 60, 1);
box-shadow: 0 6px 16px rgba(255, 255, 255, 0.2);
}
.mermaid-fullscreen-modal .mermaid-fullscreen-btn {
display: none !important;
}`;
document.head.appendChild(fullscreenStyle);
// Detect if page has dark background
const isDarkTheme = () => {
const bgColor = window.getComputedStyle(document.body).backgroundColor;
const match = bgColor.match(/rgb\((\d+),\s*(\d+),\s*(\d+)/);
if (match) {
const r = parseInt(match[1]);
const g = parseInt(match[2]);
const b = parseInt(match[3]);
const brightness = (r * 299 + g * 587 + b * 114) / 1000;
return brightness < 128;
}
return false;
};
const load = async () => {
await mermaid.run();
const all_mermaids = document.querySelectorAll(".mermaid");
const mermaids_processed = document.querySelectorAll(".mermaid[data-processed='true']");
if ("False" === "True") {
const mermaids_to_add_zoom = -1 === -1 ? all_mermaids.length : -1;
if(mermaids_to_add_zoom > 0) {
var svgs = d3.selectAll("");
if(all_mermaids.length !== mermaids_processed.length) {
setTimeout(load, 200);
return;
} else if(svgs.size() !== mermaids_to_add_zoom) {
setTimeout(load, 200);
return;
} else {
svgs.each(function() {
var svg = d3.select(this);
svg.html("<g class='wrapper'>" + svg.html() + "</g>");
var inner = svg.select("g");
var zoom = d3.zoom().on("zoom", function(event) {
inner.attr("transform", event.transform);
});
svg.call(zoom);
});
}
}
} else if(all_mermaids.length !== mermaids_processed.length) {
// Wait for mermaid to process all diagrams
setTimeout(load, 200);
return;
}
const darkTheme = isDarkTheme();
// Stop here if not adding fullscreen capability
if ("True" !== "True") return;
const modal = document.createElement('div');
modal.className = 'mermaid-fullscreen-modal' + (darkTheme ? ' dark-theme' : '');
modal.setAttribute('role', 'dialog');
modal.setAttribute('aria-modal', 'true');
modal.setAttribute('aria-label', 'Fullscreen diagram viewer');
modal.innerHTML = `
<button class="mermaid-fullscreen-close${darkTheme ? ' dark-theme' : ''}" aria-label="Close fullscreen">✕</button>
<div class="mermaid-container-fullscreen${darkTheme ? ' dark-theme' : ''}"></div>
`;
document.body.appendChild(modal);
const modalContent = modal.querySelector('.mermaid-container-fullscreen');
const closeBtn = modal.querySelector('.mermaid-fullscreen-close');
let previousScrollOffset = [window.scrollX, window.scrollY];
const closeModal = () => {
modal.classList.remove('active');
modalContent.innerHTML = '';
document.body.style.overflow = ''
window.scrollTo({left: previousScrollOffset[0], top: previousScrollOffset[1], behavior: 'instant'});
};
closeBtn.addEventListener('click', closeModal);
modal.addEventListener('click', (e) => {
if (e.target === modal) closeModal();
});
document.addEventListener('keydown', (e) => {
if (e.key === 'Escape' && modal.classList.contains('active')) {
closeModal();
}
});
const allButtons = [];
document.querySelectorAll('.mermaid').forEach((mermaidDiv) => {
if (mermaidDiv.parentNode.classList.contains('mermaid-container') ||
mermaidDiv.closest('.mermaid-fullscreen-modal')) {
return;
}
const container = document.createElement('div');
container.className = 'mermaid-container';
mermaidDiv.parentNode.insertBefore(container, mermaidDiv);
container.appendChild(mermaidDiv);
const fullscreenBtn = document.createElement('button');
fullscreenBtn.className = 'mermaid-fullscreen-btn' + (darkTheme ? ' dark-theme' : '');
fullscreenBtn.setAttribute('aria-label', 'View diagram in fullscreen');
fullscreenBtn.textContent = '⛶';
fullscreenBtn.style.opacity = '50%';
// Calculate dynamic position based on diagram's margin and padding
const diagramStyle = window.getComputedStyle(mermaidDiv);
const marginTop = parseFloat(diagramStyle.marginTop) || 0;
const marginRight = parseFloat(diagramStyle.marginRight) || 0;
const paddingTop = parseFloat(diagramStyle.paddingTop) || 0;
const paddingRight = parseFloat(diagramStyle.paddingRight) || 0;
fullscreenBtn.style.top = `${marginTop + paddingTop + 4}px`;
fullscreenBtn.style.right = `${marginRight + paddingRight + 4}px`;
fullscreenBtn.addEventListener('click', () => {
previousScrollOffset = [window.scroll, window.scrollY];
const clone = mermaidDiv.cloneNode(true);
modalContent.innerHTML = '';
modalContent.appendChild(clone);
const svg = clone.querySelector('svg');
if (svg) {
svg.removeAttribute('width');
svg.removeAttribute('height');
svg.style.width = '100%';
svg.style.height = 'auto';
svg.style.maxWidth = '100%';
svg.style.sdisplay = 'block';
if ("False" === "True") {
setTimeout(() => {
const g = svg.querySelector('g');
if (g) {
var svgD3 = d3.select(svg);
svgD3.html("<g class='wrapper'>" + svgD3.html() + "</g>");
var inner = svgD3.select("g");
var zoom = d3.zoom().on("zoom", function(event) {
inner.attr("transform", event.transform);
});
svgD3.call(zoom);
}
}, 100);
}
}
modal.classList.add('active');
document.body.style.overflow = 'hidden';
});
container.appendChild(fullscreenBtn);
allButtons.push(fullscreenBtn);
});
// Update theme classes when theme changes
const updateTheme = () => {
const dark = isDarkTheme();
allButtons.forEach(btn => {
if (dark) {
btn.classList.add('dark-theme');
} else {
btn.classList.remove('dark-theme');
}
});
if (dark) {
modal.classList.add('dark-theme');
modalContent.classList.add('dark-theme');
closeBtn.classList.add('dark-theme');
} else {
modal.classList.remove('dark-theme');
modalContent.classList.remove('dark-theme');
closeBtn.classList.remove('dark-theme');
}
};
// Watch for theme changes
const observer = new MutationObserver(updateTheme);
observer.observe(document.documentElement, {
attributes: true,
attributeFilter: ['class', 'style', 'data-theme']
});
observer.observe(document.body, {
attributes: true,
attributeFilter: ['class', 'style']
});
};
window.addEventListener("load", load);
</script>
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/17_memoization_ABOUT';</script>
<script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
<script src="../_static/wip-banner.js?v=04a7e74d"></script>
<script src="../_static/marimo-badges.js?v=e6289128"></script>
<script src="../_static/sidebar-link.js?v=404b701b"></script>
<script src="../_static/hero-carousel.js?v=10341d2a"></script>
<script src="../_static/subscribe-modal.js?v=42919b64"></script>
<link rel="icon" href="../_static/favicon.svg"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="18. Acceleration - CPU Vectorization &amp; Cache Optimization" href="18_acceleration_ABOUT.html" />
<link rel="prev" title="16. Compression - Pruning and Model Compression" href="16_compression_ABOUT.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search..."
aria-label="Search..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../intro.html">
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../_sources/modules/17_memoization_ABOUT.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>17. Memoization - Computational Reuse for Inference</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters">Why This Matters</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kv-cache-optimization-flow">KV Cache Optimization Flow</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-autoregressive-generation-problem">The Autoregressive Generation Problem</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-caching-solution">The Caching Solution</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-impact">Production Impact</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-speed-trade-off">Memory-Speed Trade-off</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-components">Core Components</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#kvcache-data-structure">1. KVCache Data Structure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#non-invasive-cache-integration">2. Non-Invasive Cache Integration</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#cached-attention-logic">3. Cached Attention Logic</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-steps">Implementation Steps</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-1-design-kvcache-structure">Step 1: Design KVCache Structure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-2-implement-cache-updates">Step 2: Implement Cache Updates</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-3-enable-non-invasive-integration">Step 3: Enable Non-Invasive Integration</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-4-implement-cached-attention-forward">Step 4: Implement Cached Attention Forward</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-5-validate-correctness">Step 5: Validate Correctness</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#why-data-instead-of-tensor-operations">Why .data Instead of Tensor Operations?</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-profiling">Inline Testing &amp; Profiling</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-production-challenges">Real-World Production Challenges</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-optimization-patterns">Production Optimization Patterns</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mathematical-foundations">Mathematical Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#huggingface-cache-patterns-comparison">HuggingFace Cache Patterns Comparison</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#expected-speedup-by-sequence-length">Expected Speedup by Sequence Length</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-usage-by-model-size">Memory Usage by Model Size</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#throughput-impact">Throughput Impact</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#where-this-code-lives-in-the-final-package">Where This Code Lives in the Final Package</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#common-challenges-and-solutions">Common Challenges and Solutions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-1-cache-synchronization-across-layers">Challenge 1: Cache Synchronization Across Layers</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-2-memory-overhead-for-large-models">Challenge 2: Memory Overhead for Large Models</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-3-correctness-validation">Challenge 3: Correctness Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-4-integration-without-breaking-existing-code">Challenge 4: Integration Without Breaking Existing Code</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="memoization-computational-reuse-for-inference">
<h1>17. Memoization - Computational Reuse for Inference<a class="headerlink" href="#memoization-computational-reuse-for-inference" title="Link to this heading">#</a></h1>
<p><strong>OPTIMIZATION TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 4-5 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>Memoization is a fundamental optimization pattern: cache computational results to avoid redundant work. Youll apply this pattern to transformers through KV (Key-Value) caching, transforming O(n²) autoregressive generation into O(n) complexity and achieving 10-15x speedup. This optimization makes production language model serving economically viable.</p>
<p>This is inference-only optimization - youll implement caching patterns used in every production LLM from ChatGPT to Claude to GitHub Copilot.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Understand Memoization Pattern</strong>: Recognize when computational reuse through caching applies to ML problems and understand the memory-speed trade-off</p></li>
<li><p><strong>Implement KVCache Structure</strong>: Build efficient cache data structures with O(1) updates, proper memory management, and multi-layer support</p></li>
<li><p><strong>Apply Caching to Transformers</strong>: Integrate KV caching into attention layers without modifying existing transformer code (non-invasive enhancement)</p></li>
<li><p><strong>Measure Performance Gains</strong>: Profile latency improvements, measure O(n²) → O(n) complexity reduction, and understand speedup characteristics</p></li>
<li><p><strong>Analyze Production Trade-offs</strong>: Calculate cache memory costs, understand cache invalidation policies, and recognize when caching justifies its overhead</p></li>
</ul>
</section>
<section id="build-use-optimize">
<h2>Build → Use → Optimize<a class="headerlink" href="#build-use-optimize" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorchs <strong>Build → Use → Optimize</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement KVCache data structure with efficient updates, cached attention integration, and multi-layer cache management</p></li>
<li><p><strong>Use</strong>: Apply caching to GPT text generation, measure 10-15x speedup over naive generation, and validate output correctness</p></li>
<li><p><strong>Optimize</strong>: Profile memory bandwidth bottlenecks, measure cache hit rates, and understand when memory cost exceeds latency benefit</p></li>
</ol>
</section>
<section id="why-this-matters">
<h2>Why This Matters<a class="headerlink" href="#why-this-matters" title="Link to this heading">#</a></h2>
<section id="kv-cache-optimization-flow">
<h3>KV Cache Optimization Flow<a class="headerlink" href="#kv-cache-optimization-flow" title="Link to this heading">#</a></h3>
<p>Caching stores computed keys and values, avoiding recomputation for each new token:</p>
<pre class="mermaid">
graph LR
A[Token i&lt;br/&gt;Compute K_i, V_i] --&gt; B[Cache&lt;br/&gt;Store K_i, V_i]
B --&gt; C[Token i+1&lt;br/&gt;New computation]
C --&gt; D[Reuse&lt;br/&gt;K_i, V_i from cache]
D --&gt; E[Only compute&lt;br/&gt;K_{i+1}, V_{i+1}]
E --&gt; F[10-15× speedup]
style A fill:#e3f2fd
style C fill:#e3f2fd
style B fill:#f3e5f5
style D fill:#fff3e0
style E fill:#ffe0b2
style F fill:#f0fdf4
</pre><p><strong>Optimization</strong>: Compute K,V once → Cache → Reuse for all future tokens → O(n²) → O(n) complexity</p>
</section>
<section id="the-autoregressive-generation-problem">
<h3>The Autoregressive Generation Problem<a class="headerlink" href="#the-autoregressive-generation-problem" title="Link to this heading">#</a></h3>
<p>Without caching, transformer generation has quadratic complexity:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Naive Generation (O(n²) complexity):
Step 1: Generate token 1 → Compute attention for [t₀] (1 computation)
Step 2: Generate token 2 → Compute attention for [t₀, t₁] (2 computations, t₀ RECOMPUTED!)
Step 3: Generate token 3 → Compute attention for [t₀, t₁, t₂] (3 computations, t₀,t₁ RECOMPUTED!)
...
Step n: Generate token n → Compute attention for [t₀, ..., tₙ] (n computations, ALL RECOMPUTED!)
Total: 1 + 2 + 3 + ... + n = n(n+1)/2 = O(n²) complexity!
For 100 tokens: ~5,050 redundant K,V computations
</pre></div>
</div>
<p><strong>The Key Insight</strong>: K and V matrices for previous tokens NEVER change, yet we recompute them every step!</p>
</section>
<section id="the-caching-solution">
<h3>The Caching Solution<a class="headerlink" href="#the-caching-solution" title="Link to this heading">#</a></h3>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Cached Generation (O(n) complexity):
Step 1: Compute K₁, V₁ → Cache them → Attention with cached[K₁, V₁]
Step 2: Compute K₂, V₂ → Cache them → Attention with cached[K₁, K₂, V₁, V₂] (reuse K₁, V₁!)
Step 3: Compute K₃, V₃ → Cache them → Attention with cached[K₁, K₂, K₃, V₁, V₂, V₃] (reuse all!)
Total: 1 + 1 + 1 + ... + 1 = n computations (50x reduction for n=100!)
</pre></div>
</div>
</section>
<section id="production-impact">
<h3>Production Impact<a class="headerlink" href="#production-impact" title="Link to this heading">#</a></h3>
<p>KV caching is mandatory for all production LLM serving:</p>
<ul class="simple">
<li><p><strong>ChatGPT/GPT-4</strong>: Would be 50-100x slower without caching, making conversational AI economically infeasible</p></li>
<li><p><strong>Claude</strong>: Caches up to 100K tokens of context, enabling long document processing</p></li>
<li><p><strong>GitHub Copilot</strong>: Real-time code completion requires sub-100ms latency - impossible without caching</p></li>
<li><p><strong>Google Gemini</strong>: Multi-level caching (KV + intermediate layers) serves billions of requests daily</p></li>
</ul>
<p>Without KV caching, the computational cost would make these services prohibitively expensive.</p>
</section>
<section id="memory-speed-trade-off">
<h3>Memory-Speed Trade-off<a class="headerlink" href="#memory-speed-trade-off" title="Link to this heading">#</a></h3>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Traditional Approach (No Cache):
Memory: O(1) Cost: Negligible
Compute: O(n²) Cost: Prohibitive for long sequences
Cached Approach (KV Cache):
Memory: O(n × d_k) Cost: ~18MB per batch for GPT-2
Compute: O(n) Cost: 10-15x faster than naive
Trade-off Winner: Memory is cheap, compute is expensive!
Use O(n) memory to save O(n²) compute.
</pre></div>
</div>
</section>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="core-components">
<h3>Core Components<a class="headerlink" href="#core-components" title="Link to this heading">#</a></h3>
<p>Youll implement three main components:</p>
<section id="kvcache-data-structure">
<h4>1. KVCache Data Structure<a class="headerlink" href="#kvcache-data-structure" title="Link to this heading">#</a></h4>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">KVCache</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Efficient key-value cache for autoregressive generation.</span>
<span class="sd"> Memory Layout:</span>
<span class="sd"> keys: (num_layers, batch, num_heads, seq_len, d_k)</span>
<span class="sd"> values: (num_layers, batch, num_heads, seq_len, d_v)</span>
<span class="sd"> For GPT-2 (12 layers, 12 heads, 1024 seq, 64 dims):</span>
<span class="sd"> 12 layers × 12 heads × 1024 seq × 64 dims = ~9M values</span>
<span class="sd"> At FP32 (4 bytes): ~36MB per batch item</span>
<span class="sd"> At FP16 (2 bytes): ~18MB per batch item</span>
<span class="sd"> Operations:</span>
<span class="sd"> update(layer_idx, key, value) -&gt; None # O(1) append</span>
<span class="sd"> get(layer_idx) -&gt; (cached_k, cached_v) # O(1) retrieval</span>
<span class="sd"> advance() -&gt; None # Increment position</span>
<span class="sd"> reset() -&gt; None # Clear for new sequence</span>
<span class="sd"> &quot;&quot;&quot;</span>
</pre></div>
</div>
<p><strong>Key Design Decisions</strong>:</p>
<ul class="simple">
<li><p>Pre-allocate cache tensors to avoid dynamic resizing overhead</p></li>
<li><p>Use position counter for O(1) indexed updates (no copying)</p></li>
<li><p>Store per-layer caches to support multi-layer transformers</p></li>
<li><p>Track sequence position externally for clean separation</p></li>
</ul>
</section>
<section id="non-invasive-cache-integration">
<h4>2. Non-Invasive Cache Integration<a class="headerlink" href="#non-invasive-cache-integration" title="Link to this heading">#</a></h4>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Enable KV caching WITHOUT modifying Module 12/13 code.</span>
<span class="sd"> This demonstrates non-invasive optimization - adding capabilities</span>
<span class="sd"> to existing systems without breaking them. Similar to how Module 05</span>
<span class="sd"> uses enable_autograd() to add gradient tracking to Tensors.</span>
<span class="sd"> Approach:</span>
<span class="sd"> 1. Create KVCache sized for model architecture</span>
<span class="sd"> 2. Store cache on model as model._kv_cache</span>
<span class="sd"> 3. Wrap each attention layer&#39;s forward method with caching logic</span>
<span class="sd"> 4. Intercept attention calls to manage cache automatically</span>
<span class="sd"> This is composition + monkey-patching - a critical ML systems pattern!</span>
<span class="sd"> &quot;&quot;&quot;</span>
</pre></div>
</div>
<p><strong>Why Non-Invasive?</strong></p>
<ul class="simple">
<li><p>Modules 12-13 (Attention, Transformers) work unchanged</p></li>
<li><p>Module 17 ADDS optimization, doesnt BREAK old code</p></li>
<li><p>Teaches “forward-only” systems engineering: never modify earlier modules</p></li>
<li><p>Matches how production systems layer optimizations (vLLM, HuggingFace)</p></li>
</ul>
</section>
<section id="cached-attention-logic">
<h4>3. Cached Attention Logic<a class="headerlink" href="#cached-attention-logic" title="Link to this heading">#</a></h4>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">cached_forward</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cache-aware attention with three paths:</span>
<span class="sd"> PATH 1: Training (seq_len &gt; 1)</span>
<span class="sd"> → Use original attention (preserve gradients)</span>
<span class="sd"> → O(n²) but needed for backpropagation</span>
<span class="sd"> PATH 2: First Token (seq_len == 1, cache empty)</span>
<span class="sd"> → Use original attention (initialize cache)</span>
<span class="sd"> → O(1) - just one token</span>
<span class="sd"> PATH 3: Cached Generation (seq_len == 1, cache populated)</span>
<span class="sd"> → Compute K,V for NEW token only</span>
<span class="sd"> → Retrieve ALL cached K,V (includes history)</span>
<span class="sd"> → Attention with cached context</span>
<span class="sd"> → O(n) - only compute new, reuse cache</span>
<span class="sd"> → THIS IS WHERE THE SPEEDUP HAPPENS!</span>
<span class="sd"> &quot;&quot;&quot;</span>
</pre></div>
</div>
</section>
</section>
<section id="implementation-steps">
<h3>Implementation Steps<a class="headerlink" href="#implementation-steps" title="Link to this heading">#</a></h3>
<section id="step-1-design-kvcache-structure">
<h4>Step 1: Design KVCache Structure<a class="headerlink" href="#step-1-design-kvcache-structure" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Initialize cache storage for all layers</p></li>
<li><p>Pre-allocate tensors with maximum sequence length</p></li>
<li><p>Track current sequence position (write pointer)</p></li>
<li><p>Implement update() for O(1) append operations</p></li>
<li><p>Implement get() for O(1) retrieval of valid cache portion</p></li>
</ol>
</section>
<section id="step-2-implement-cache-updates">
<h4>Step 2: Implement Cache Updates<a class="headerlink" href="#step-2-implement-cache-updates" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Validate layer index and sequence position</p></li>
<li><p>Write new K,V to current position (indexed assignment)</p></li>
<li><p>Advance position counter after all layers processed</p></li>
<li><p>Handle batch dimension and multi-head structure</p></li>
</ol>
</section>
<section id="step-3-enable-non-invasive-integration">
<h4>Step 3: Enable Non-Invasive Integration<a class="headerlink" href="#step-3-enable-non-invasive-integration" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Validate model has required attributes (embed_dim, num_layers, etc.)</p></li>
<li><p>Calculate head_dim from embed_dim and num_heads</p></li>
<li><p>Create KVCache instance sized for model</p></li>
<li><p>Store cache on model with model._kv_cache flag</p></li>
<li><p>Wrap each blocks attention.forward with caching logic</p></li>
</ol>
</section>
<section id="step-4-implement-cached-attention-forward">
<h4>Step 4: Implement Cached Attention Forward<a class="headerlink" href="#step-4-implement-cached-attention-forward" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Detect path: training (seq_len &gt; 1), first token (cache empty), or cached generation</p></li>
<li><p>For cached path: Compute Q,K,V projections for new token only</p></li>
<li><p>Reshape to multi-head format (batch, num_heads, 1, head_dim)</p></li>
<li><p>Update cache with new K,V pairs</p></li>
<li><p>Retrieve ALL cached K,V (history + new)</p></li>
<li><p>Compute attention: softmax(Q &#64; K^T / √d_k) &#64; V using NumPy (.data)</p></li>
<li><p>Apply output projection and return</p></li>
</ol>
</section>
<section id="step-5-validate-correctness">
<h4>Step 5: Validate Correctness<a class="headerlink" href="#step-5-validate-correctness" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Test cache initialization and memory calculation</p></li>
<li><p>Verify single-token and multi-token updates</p></li>
<li><p>Validate multi-layer cache synchronization</p></li>
<li><p>Test reset functionality</p></li>
<li><p>Measure speedup vs non-cached generation</p></li>
</ol>
</section>
</section>
<section id="why-data-instead-of-tensor-operations">
<h3>Why .data Instead of Tensor Operations?<a class="headerlink" href="#why-data-instead-of-tensor-operations" title="Link to this heading">#</a></h3>
<p>In cached attention, we use NumPy via <code class="docutils literal notranslate"><span class="pre">.data</span></code> for three reasons:</p>
<ol class="arabic simple">
<li><p><strong>Explicit Intent</strong>: Makes it crystal clear this is inference-only</p>
<ul class="simple">
<li><p>Training: Uses Tensor operations → gradients tracked</p></li>
<li><p>Inference: Uses .data → no gradient overhead</p></li>
</ul>
</li>
<li><p><strong>Performance</strong>: Avoids any autograd bookkeeping</p>
<ul class="simple">
<li><p>Even small overhead matters in generation hotpath</p></li>
<li><p>Production LLMs (vLLM, llama.cpp) use similar patterns</p></li>
</ul>
</li>
<li><p><strong>Educational Clarity</strong>: Shows students the distinction</p>
<ul class="simple">
<li><p>“When do I need gradients?” (training)</p></li>
<li><p>“When can I skip them?” (inference)</p></li>
</ul>
</li>
</ol>
<p>We COULD use Tensor operations with requires_grad=False, but .data is more explicit and follows industry patterns.</p>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure you understand transformers and profiling:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch
<span class="c1"># Verify prerequisite modules</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>transformers
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>profiling
</pre></div>
</div>
<p><strong>Required Understanding</strong>:</p>
<ul class="simple">
<li><p>Multi-head attention mechanism (Module 12)</p></li>
<li><p>Transformer architecture (Module 13)</p></li>
<li><p>Latency profiling techniques (Module 14)</p></li>
<li><p>O(n²) complexity of attention computation</p></li>
</ul>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/17_memoization/memoization_dev.ipynb</span></code></p></li>
<li><p><strong>Profile naive generation</strong>: Measure O(n²) growth in latency as sequence lengthens</p></li>
<li><p><strong>Implement KVCache class</strong>: Build data structure with update(), get(), advance(), reset()</p></li>
<li><p><strong>Test cache operations</strong>: Verify single-token, multi-token, and multi-layer caching</p></li>
<li><p><strong>Implement enable_kv_cache()</strong>: Non-invasively patch model attention layers</p></li>
<li><p><strong>Build cached attention forward</strong>: Three-path logic (training, first token, cached generation)</p></li>
<li><p><strong>Measure speedup</strong>: Profile cached vs non-cached generation, validate O(n) complexity</p></li>
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">17</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">memoization</span></code></p></li>
</ol>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify memoization functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>memoization
<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>memoization<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>KVCache Initialization</strong>: Validate cache creation, memory calculation, and initial state</p></li>
<li><p><strong>Cache Updates</strong>: Test single-token append, multi-token sequences, and O(1) update performance</p></li>
<li><p><strong>Multi-Layer Synchronization</strong>: Verify independent per-layer caches with correct indexing</p></li>
<li><p><strong>Cache Retrieval</strong>: Test get() returns only valid cached portion (up to seq_pos)</p></li>
<li><p><strong>Non-Invasive Integration</strong>: Validate enable_kv_cache() works without breaking model</p></li>
<li><p><strong>Correctness Validation</strong>: Compare cached vs non-cached outputs (should be identical)</p></li>
<li><p><strong>Performance Measurement</strong>: Measure speedup at different sequence lengths</p></li>
<li><p><strong>Memory Tracking</strong>: Calculate cache size and validate memory usage</p></li>
</ul>
</section>
<section id="inline-testing-profiling">
<h3>Inline Testing &amp; Profiling<a class="headerlink" href="#inline-testing-profiling" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive validation with performance measurement:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Unit Test: KVCache Implementation</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">KVCache</span> <span class="n">Implementation</span><span class="o">...</span>
<span class="n">Cache</span> <span class="n">initialized</span><span class="p">:</span> <span class="mf">0.59</span> <span class="n">MB</span>
<span class="err"></span> <span class="n">Cache</span> <span class="n">initialization</span> <span class="n">successful</span>
<span class="err"></span> <span class="n">Append</span> <span class="ow">and</span> <span class="n">retrieval</span> <span class="n">work</span> <span class="n">correctly</span>
<span class="err"></span> <span class="n">Multi</span><span class="o">-</span><span class="n">layer</span> <span class="n">caching</span> <span class="n">validated</span>
<span class="err"></span> <span class="n">Reset</span> <span class="n">functionality</span> <span class="n">verified</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">KVCache</span> <span class="err"></span>
<span class="c1"># Integration Test: Performance Measurement</span>
<span class="err">🔬</span> <span class="n">Profiling</span> <span class="n">Transformer</span> <span class="n">Generation</span> <span class="p">(</span><span class="n">Without</span> <span class="n">Caching</span><span class="p">):</span>
<span class="n">Seq</span> <span class="n">Len</span> <span class="o">|</span> <span class="n">Latency</span> <span class="p">(</span><span class="n">ms</span><span class="p">)</span> <span class="o">|</span> <span class="n">Growth</span>
<span class="o">---------|----------------|----------</span>
<span class="mi">10</span> <span class="o">|</span> <span class="mf">2.34</span> <span class="o">|</span> <span class="n">baseline</span>
<span class="mi">20</span> <span class="o">|</span> <span class="mf">4.89</span> <span class="o">|</span> <span class="mf">2.09</span><span class="err">×</span>
<span class="mi">40</span> <span class="o">|</span> <span class="mf">10.12</span> <span class="o">|</span> <span class="mf">2.07</span><span class="err">×</span>
<span class="mi">80</span> <span class="o">|</span> <span class="mf">21.45</span> <span class="o">|</span> <span class="mf">2.12</span><span class="err">×</span>
<span class="mi">160</span> <span class="o">|</span> <span class="mf">45.67</span> <span class="o">|</span> <span class="mf">2.13</span><span class="err">×</span>
<span class="err">💡</span> <span class="n">Key</span> <span class="n">Observations</span><span class="p">:</span>
<span class="err"></span> <span class="n">Latency</span> <span class="n">grows</span> <span class="n">QUADRATICALLY</span> <span class="k">with</span> <span class="n">sequence</span> <span class="n">length</span>
<span class="err"></span> <span class="n">Each</span> <span class="n">new</span> <span class="n">token</span> <span class="n">forces</span> <span class="n">recomputation</span> <span class="n">of</span> <span class="n">ALL</span> <span class="n">previous</span> <span class="n">K</span><span class="p">,</span><span class="n">V</span> <span class="n">pairs</span>
<span class="err"></span> <span class="n">For</span> <span class="mi">160</span> <span class="n">tokens</span><span class="p">:</span> <span class="o">~</span><span class="mi">4</span><span class="err">×</span> <span class="n">time</span> <span class="n">vs</span> <span class="mi">80</span> <span class="n">tokens</span> <span class="p">(</span><span class="mi">2</span><span class="err">²</span> <span class="n">growth</span><span class="p">)</span>
<span class="err">🎯</span> <span class="n">The</span> <span class="n">Solution</span><span class="p">:</span> <span class="n">CACHE</span> <span class="n">the</span> <span class="n">K</span><span class="p">,</span><span class="n">V</span> <span class="n">values</span><span class="err">!</span> <span class="p">(</span><span class="n">That</span><span class="s1">&#39;s memoization)</span>
<span class="err"></span> <span class="n">Speedup</span><span class="p">:</span> <span class="mi">10</span><span class="o">-</span><span class="mi">15</span><span class="err">×</span> <span class="k">for</span> <span class="n">typical</span> <span class="n">generation</span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.perf.memoization</span><span class="w"> </span><span class="kn">import</span> <span class="n">KVCache</span><span class="p">,</span> <span class="n">enable_kv_cache</span>
<span class="c1"># Test cache with small transformer</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span>
<span class="n">batch_size</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">max_seq_len</span><span class="o">=</span><span class="mi">128</span><span class="p">,</span>
<span class="n">num_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
<span class="n">num_heads</span><span class="o">=</span><span class="mi">8</span><span class="p">,</span>
<span class="n">head_dim</span><span class="o">=</span><span class="mi">64</span>
<span class="p">)</span>
<span class="c1"># Simulate generation loop</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.tensor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>
<span class="k">for</span> <span class="n">step</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">):</span>
<span class="k">for</span> <span class="n">layer_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">4</span><span class="p">):</span>
<span class="c1"># New key-value pairs for this step</span>
<span class="n">new_k</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">64</span><span class="p">))</span>
<span class="n">new_v</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">64</span><span class="p">))</span>
<span class="c1"># Update cache (O(1) operation)</span>
<span class="n">cache</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">layer_idx</span><span class="p">,</span> <span class="n">new_k</span><span class="p">,</span> <span class="n">new_v</span><span class="p">)</span>
<span class="c1"># Advance position after all layers</span>
<span class="n">cache</span><span class="o">.</span><span class="n">advance</span><span class="p">()</span>
<span class="c1"># Retrieve cached values</span>
<span class="n">cached_k</span><span class="p">,</span> <span class="n">cached_v</span> <span class="o">=</span> <span class="n">cache</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">layer_idx</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Cached 10 tokens: </span><span class="si">{</span><span class="n">cached_k</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span> <span class="c1"># (1, 8, 10, 64)</span>
<span class="c1"># Calculate memory usage</span>
<span class="n">mem_info</span> <span class="o">=</span> <span class="n">cache</span><span class="o">.</span><span class="n">get_memory_usage</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Cache memory: </span><span class="si">{</span><span class="n">mem_info</span><span class="p">[</span><span class="s1">&#39;total_mb&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> MB&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-production-challenges">
<h3>Real-World Production Challenges<a class="headerlink" href="#real-world-production-challenges" title="Link to this heading">#</a></h3>
<p><strong>Memory-Speed Trade-off Analysis</strong>:</p>
<ul class="simple">
<li><p>KV cache uses ~18MB per batch for GPT-2 (FP16). For batch=32, thats 576MB.</p></li>
<li><p>On an 8GB GPU, how many concurrent users can you serve?</p></li>
<li><p>Whats the trade-off between batch size and cache size?</p></li>
<li><p>When does memory bandwidth (cache access) become the bottleneck instead of compute?</p></li>
</ul>
<p><strong>Cache Invalidation Policies</strong>:</p>
<ul class="simple">
<li><p>In multi-turn chat, when should you clear the cache?</p></li>
<li><p>What happens when context exceeds max_seq_len?</p></li>
<li><p>How do production systems like ChatGPT handle context window limits?</p></li>
<li><p>Compare eviction policies: LRU, FIFO, sliding window, importance-based</p></li>
</ul>
<p><strong>Distributed Caching for Large Models</strong>:</p>
<ul class="simple">
<li><p>For models too large for one GPU, you need tensor parallelism</p></li>
<li><p>How do you partition the KV cache across GPUs?</p></li>
<li><p>Which dimension should you shard: layers, heads, or sequence?</p></li>
<li><p>Whats the communication overhead for cache synchronization?</p></li>
</ul>
<p><strong>Quantized Caching</strong>:</p>
<ul class="simple">
<li><p>Storing cache in INT8 instead of FP16 saves 50% memory</p></li>
<li><p>Whats the accuracy impact of quantized KV cache?</p></li>
<li><p>When is this trade-off worth it?</p></li>
<li><p>How does quantization error accumulate over long sequences?</p></li>
</ul>
</section>
<section id="production-optimization-patterns">
<h3>Production Optimization Patterns<a class="headerlink" href="#production-optimization-patterns" title="Link to this heading">#</a></h3>
<p><strong>Multi-Level Caching</strong>:</p>
<ul class="simple">
<li><p>What if you cache not just K,V but intermediate layer activations?</p></li>
<li><p>How does HuggingFaces <code class="docutils literal notranslate"><span class="pre">DynamicCache</span></code> differ from static pre-allocation?</p></li>
<li><p>When should you use persistent caching (save to disk) for very long conversations?</p></li>
</ul>
<p><strong>Speculation and Prefetching</strong>:</p>
<ul class="simple">
<li><p>What if you predict the next query and pre-compute KV cache?</p></li>
<li><p>How would speculative caching improve throughput?</p></li>
<li><p>Whats the risk if speculation is wrong?</p></li>
<li><p>When does prefetching justify its overhead?</p></li>
</ul>
</section>
<section id="mathematical-foundations">
<h3>Mathematical Foundations<a class="headerlink" href="#mathematical-foundations" title="Link to this heading">#</a></h3>
<p><strong>Complexity Reduction</strong>:</p>
<ul class="simple">
<li><p>Why does KV caching transform O(n²) into O(n)?</p></li>
<li><p>Calculate total operations for naive vs cached generation (n=100)</p></li>
<li><p>Whats the crossover point where caching overhead exceeds savings?</p></li>
</ul>
<p><strong>Memory Layout Optimization</strong>:</p>
<ul class="simple">
<li><p>Why pre-allocate cache instead of dynamic appending?</p></li>
<li><p>How does cache contiguity affect memory bandwidth?</p></li>
<li><p>Compare row-major vs column-major cache layouts for performance</p></li>
</ul>
<p><strong>Attention Computation Analysis</strong>:</p>
<ul class="simple">
<li><p>Why can we cache K,V but not Q (query)?</p></li>
<li><p>What property of autoregressive generation makes caching valid?</p></li>
<li><p>How would bidirectional attention (BERT) change caching strategy?</p></li>
</ul>
</section>
<section id="huggingface-cache-patterns-comparison">
<h3>HuggingFace Cache Patterns Comparison<a class="headerlink" href="#huggingface-cache-patterns-comparison" title="Link to this heading">#</a></h3>
<p><strong>Static vs Dynamic Cache</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch (Module 17): Static pre-allocation</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="n">max_seq_len</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span> <span class="c1"># Fixed size, O(1) updates</span>
<span class="c1"># HuggingFace: Dynamic cache (DynamicCache class)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">DynamicCache</span><span class="p">()</span> <span class="c1"># Grows as needed, more flexible but slower</span>
</pre></div>
</div>
<p><strong>When to Use Each</strong>:</p>
<ul class="simple">
<li><p><strong>Static (TinyTorch)</strong>: Known max length, maximum performance, inference serving</p></li>
<li><p><strong>Dynamic (HuggingFace)</strong>: Variable lengths, exploration, research</p></li>
</ul>
<p><strong>Production Systems (vLLM, TGI)</strong>:</p>
<ul class="simple">
<li><p>Use PagedAttention for virtual memory management of KV cache</p></li>
<li><p>Enables efficient memory sharing across requests</p></li>
<li><p>Reduces memory fragmentation for variable-length sequences</p></li>
</ul>
</section>
</section>
<section id="performance-characteristics">
<h2>Performance Characteristics<a class="headerlink" href="#performance-characteristics" title="Link to this heading">#</a></h2>
<section id="expected-speedup-by-sequence-length">
<h3>Expected Speedup by Sequence Length<a class="headerlink" href="#expected-speedup-by-sequence-length" title="Link to this heading">#</a></h3>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Speedup Characteristics (GPT-2 on CPU):
┌─────────────┬──────────────┬──────────────┬──────────┐
│ Seq Length │ No Cache │ With Cache │ Speedup │
├─────────────┼──────────────┼──────────────┼──────────┤
│ 10 tokens │ ~80 tok/s │ ~600 tok/s │ 7.5x │
│ 25 tokens │ ~40 tok/s │ ~500 tok/s │ 12.5x │
│ 50 tokens │ ~25 tok/s │ ~400 tok/s │ 16.0x │
│ 100 tokens │ ~12 tok/s │ ~200 tok/s │ 16.7x │
└─────────────┴──────────────┴──────────────┴──────────┘
Key Insight: Speedup increases with sequence length!
Why? Longer sequences = more redundant computation without cache.
</pre></div>
</div>
</section>
<section id="memory-usage-by-model-size">
<h3>Memory Usage by Model Size<a class="headerlink" href="#memory-usage-by-model-size" title="Link to this heading">#</a></h3>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Cache Memory Requirements (FP16, batch_size=1):
┌──────────────┬────────┬────────┬─────────┬──────────────┐
│ Model │ Layers │ Heads │ Seq Len │ Cache Memory │
├──────────────┼────────┼────────┼─────────┼──────────────┤
│ TinyGPT │ 4 │ 4 │ 128 │ 0.5 MB │
│ GPT-2 (124M) │ 12 │ 12 │ 1024 │ 18.0 MB │
│ GPT-3 (175B) │ 96 │ 96 │ 2048 │ 4.7 GB │
└──────────────┴────────┴────────┴─────────┴──────────────┘
Formula: memory = num_layers × num_heads × max_seq_len × head_dim × 2 × 2 bytes
(2× for K and V, 2 bytes for FP16)
</pre></div>
</div>
</section>
<section id="throughput-impact">
<h3>Throughput Impact<a class="headerlink" href="#throughput-impact" title="Link to this heading">#</a></h3>
<p><strong>Single Sequence Generation</strong>:</p>
<ul class="simple">
<li><p>Without cache: Throughput decreases as sequence grows (O(n²) bottleneck)</p></li>
<li><p>With cache: Throughput stays relatively constant (O(n) scales well)</p></li>
</ul>
<p><strong>Batch Inference</strong>:</p>
<ul class="simple">
<li><p>Cache memory scales linearly with batch size</p></li>
<li><p>Throughput increases with batching (amortize model loading)</p></li>
<li><p>Memory becomes limiting factor before compute</p></li>
</ul>
</section>
</section>
<section id="where-this-code-lives-in-the-final-package">
<h2>Where This Code Lives in the Final Package<a class="headerlink" href="#where-this-code-lives-in-the-final-package" title="Link to this heading">#</a></h2>
<p><strong>Package Export</strong>: Code exports to <code class="docutils literal notranslate"><span class="pre">tinytorch.generation.kv_cache</span></code></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># When students install tinytorch, they import your work like this:</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.perf.memoization</span><span class="w"> </span><span class="kn">import</span> <span class="n">KVCache</span><span class="p">,</span> <span class="n">enable_kv_cache</span><span class="p">,</span> <span class="n">disable_kv_cache</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.nn</span><span class="w"> </span><span class="kn">import</span> <span class="n">MultiHeadAttention</span> <span class="c1"># Base class from Module 12</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.transformer</span><span class="w"> </span><span class="kn">import</span> <span class="n">GPT</span> <span class="c1"># Architecture from Module 13</span>
<span class="c1"># Usage in generation:</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">GPT</span><span class="p">(</span><span class="n">vocab_size</span><span class="o">=</span><span class="mi">1000</span><span class="p">,</span> <span class="n">embed_dim</span><span class="o">=</span><span class="mi">128</span><span class="p">,</span> <span class="n">num_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_heads</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span> <span class="c1"># Non-invasively add caching</span>
<span class="c1"># Generate with caching enabled (10-15x faster!)</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">generate_text</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="o">=</span><span class="s2">&quot;Hello&quot;</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">100</span><span class="p">)</span>
<span class="c1"># Disable caching if needed</span>
<span class="n">disable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
</pre></div>
</div>
<p>Your KV caching implementation becomes the foundation for efficient inference in the TinyTorch package, used by subsequent modules for text generation, chat applications, and deployment scenarios.</p>
</section>
<section id="common-challenges-and-solutions">
<h2>Common Challenges and Solutions<a class="headerlink" href="#common-challenges-and-solutions" title="Link to this heading">#</a></h2>
<section id="challenge-1-cache-synchronization-across-layers">
<h3>Challenge 1: Cache Synchronization Across Layers<a class="headerlink" href="#challenge-1-cache-synchronization-across-layers" title="Link to this heading">#</a></h3>
<p><strong>Problem</strong>: Keeping cache consistent when different layers process at different speeds or batch items have variable lengths.</p>
<p><strong>Solution</strong>:</p>
<ul class="simple">
<li><p>Use layer indexing to maintain independent per-layer caches</p></li>
<li><p>Advance sequence position only after ALL layers have processed current token</p></li>
<li><p>Handle variable sequence lengths with padding and attention masks</p></li>
</ul>
<p><strong>Code Pattern</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Process all layers before advancing</span>
<span class="k">for</span> <span class="n">layer_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_layers</span><span class="p">):</span>
<span class="n">cache</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">layer_idx</span><span class="p">,</span> <span class="n">new_k</span><span class="p">,</span> <span class="n">new_v</span><span class="p">)</span>
<span class="c1"># Now advance position (all layers synchronized)</span>
<span class="n">cache</span><span class="o">.</span><span class="n">advance</span><span class="p">()</span>
</pre></div>
</div>
</section>
<section id="challenge-2-memory-overhead-for-large-models">
<h3>Challenge 2: Memory Overhead for Large Models<a class="headerlink" href="#challenge-2-memory-overhead-for-large-models" title="Link to this heading">#</a></h3>
<p><strong>Problem</strong>: Cache memory grows with sequence length and batch size, potentially exceeding GPU memory.</p>
<p><strong>Solution</strong>:</p>
<ul class="simple">
<li><p>Implement cache size limits with eviction policies (LRU, FIFO)</p></li>
<li><p>Use FP16 or INT8 quantization for cache storage (50% memory reduction)</p></li>
<li><p>Consider PagedAttention for virtual memory management</p></li>
<li><p>Tune max_seq_len to expected generation length</p></li>
</ul>
<p><strong>Memory Optimization</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># FP16 caching (2 bytes per element)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="o">...</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float16</span><span class="p">)</span> <span class="c1"># 50% memory savings</span>
<span class="c1"># INT8 caching (1 byte per element)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="o">...</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">int8</span><span class="p">)</span> <span class="c1"># 75% memory savings, accuracy trade-off</span>
</pre></div>
</div>
</section>
<section id="challenge-3-correctness-validation">
<h3>Challenge 3: Correctness Validation<a class="headerlink" href="#challenge-3-correctness-validation" title="Link to this heading">#</a></h3>
<p><strong>Problem</strong>: Cached generation must produce identical outputs to non-cached generation.</p>
<p><strong>Solution</strong>:</p>
<ul class="simple">
<li><p>Compare cached vs non-cached outputs token-by-token</p></li>
<li><p>Use deterministic sampling (temperature=0) for testing</p></li>
<li><p>Validate cache retrieval returns correct sequence positions</p></li>
<li><p>Test edge cases: first token, cache full, reset</p></li>
</ul>
<p><strong>Validation Pattern</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Generate without cache (ground truth)</span>
<span class="n">output_nocache</span> <span class="o">=</span> <span class="n">generate</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">50</span><span class="p">)</span>
<span class="c1"># Generate with cache (optimized)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="n">output_cached</span> <span class="o">=</span> <span class="n">generate</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">50</span><span class="p">)</span>
<span class="c1"># Validate identical outputs</span>
<span class="k">assert</span> <span class="n">np</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">output_nocache</span><span class="p">,</span> <span class="n">output_cached</span><span class="p">),</span> <span class="s2">&quot;Cached output must match!&quot;</span>
</pre></div>
</div>
</section>
<section id="challenge-4-integration-without-breaking-existing-code">
<h3>Challenge 4: Integration Without Breaking Existing Code<a class="headerlink" href="#challenge-4-integration-without-breaking-existing-code" title="Link to this heading">#</a></h3>
<p><strong>Problem</strong>: Adding caching shouldnt require modifying Modules 12-13 (attention, transformer).</p>
<p><strong>Solution</strong>:</p>
<ul class="simple">
<li><p>Use composition + monkey-patching (wrap, dont modify)</p></li>
<li><p>Store original forward methods before patching</p></li>
<li><p>Provide disable_kv_cache() to restore original behavior</p></li>
<li><p>Use feature flags (model._cache_enabled) for path selection</p></li>
</ul>
<p><strong>Non-Invasive Pattern</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Save original before patching</span>
<span class="n">block</span><span class="o">.</span><span class="n">_original_attention_forward</span> <span class="o">=</span> <span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span>
<span class="c1"># Patch with cached version</span>
<span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span> <span class="o">=</span> <span class="n">cached_forward</span>
<span class="c1"># Restore later if needed</span>
<span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span> <span class="o">=</span> <span class="n">block</span><span class="o">.</span><span class="n">_original_attention_forward</span>
</pre></div>
</div>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>Youre about to implement the optimization that makes production language models economically viable! KV caching is THE technique that transformed LLMs from research toys into products used by millions daily.</p>
<p>This is where theory meets practice in ML systems engineering. Youll see firsthand how a simple idea - “dont recompute what never changes” - can deliver 10-15x speedup and make the impossible possible.</p>
<p><strong>What makes this module special</strong>: Unlike many optimizations that require deep algorithmic changes, KV caching is conceptually simple but profoundly impactful. Youll implement it from scratch, measure the dramatic speedup, and understand the memory-speed trade-offs that guide production deployments.</p>
<p>Understanding this optimization from first principles - implementing it yourself, profiling the speedup, analyzing the trade-offs - will give you deep insight into how production ML systems work. This is the optimization that makes ChatGPT, Claude, and GitHub Copilot possible.</p>
<p>Take your time, measure thoroughly, and enjoy building production-ready ML systems!</p>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
🚀 Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/17_memoization/memoization_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/17_memoization/memoization_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
⚡ Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
📖 View Source</div>
<p class="sd-card-text">Browse the Jupyter notebook source and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">💾 Save Your Progress</p>
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="../16_compression/ABOUT.html" title="previous page">← Previous Module</a>
<a class="right-next" href="../18_acceleration/ABOUT.html" title="next page">Next Module →</a>
</div>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./modules"
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="16_compression_ABOUT.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">16. Compression - Pruning and Model Compression</p>
</div>
</a>
<a class="right-next"
href="18_acceleration_ABOUT.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">18. Acceleration - CPU Vectorization &amp; Cache Optimization</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters">Why This Matters</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kv-cache-optimization-flow">KV Cache Optimization Flow</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-autoregressive-generation-problem">The Autoregressive Generation Problem</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-caching-solution">The Caching Solution</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-impact">Production Impact</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-speed-trade-off">Memory-Speed Trade-off</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-components">Core Components</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#kvcache-data-structure">1. KVCache Data Structure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#non-invasive-cache-integration">2. Non-Invasive Cache Integration</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#cached-attention-logic">3. Cached Attention Logic</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-steps">Implementation Steps</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-1-design-kvcache-structure">Step 1: Design KVCache Structure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-2-implement-cache-updates">Step 2: Implement Cache Updates</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-3-enable-non-invasive-integration">Step 3: Enable Non-Invasive Integration</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-4-implement-cached-attention-forward">Step 4: Implement Cached Attention Forward</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-5-validate-correctness">Step 5: Validate Correctness</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#why-data-instead-of-tensor-operations">Why .data Instead of Tensor Operations?</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-profiling">Inline Testing &amp; Profiling</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-production-challenges">Real-World Production Challenges</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-optimization-patterns">Production Optimization Patterns</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mathematical-foundations">Mathematical Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#huggingface-cache-patterns-comparison">HuggingFace Cache Patterns Comparison</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#expected-speedup-by-sequence-length">Expected Speedup by Sequence Length</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-usage-by-model-size">Memory Usage by Model Size</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#throughput-impact">Throughput Impact</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#where-this-code-lives-in-the-final-package">Where This Code Lives in the Final Package</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#common-challenges-and-solutions">Common Challenges and Solutions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-1-cache-synchronization-across-layers">Challenge 1: Cache Synchronization Across Layers</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-2-memory-overhead-for-large-models">Challenge 2: Memory Overhead for Large Models</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-3-correctness-validation">Challenge 3: Correctness Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-4-integration-without-breaking-existing-code">Challenge 4: Integration Without Breaking Existing Code</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2025.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>