TinyTorch/modules/17_memoization_ABOUT.html


<!DOCTYPE html>


<html lang="en" data-content_root="../" >

  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

    <title>17. Memoization - Computational Reuse for Inference &#8212; Tiny🔥Torch</title>


  <script data-cfasync="false">
    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
  </script>

  <!-- Loaded before other Sphinx assets -->
  <link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />


  <link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
  <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />

    <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
    <link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
    <link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
    <link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
    <link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
    <link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
    <link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
    <link rel="stylesheet" type="text/css" href="../_static/custom.css?v=009d37f4" />

  <!-- Pre-loaded scripts that we'll load fully later -->
  <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
  <script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>

    <script src="../_static/documentation_options.js?v=9eb32ce0"></script>
    <script src="../_static/doctools.js?v=9a2dae69"></script>
    <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../_static/clipboard.min.js?v=a7894cd8"></script>
    <script src="../_static/copybutton.js?v=f281be69"></script>
    <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
    <script>let toggleHintShow = 'Click to show';</script>
    <script>let toggleHintHide = 'Click to hide';</script>
    <script>let toggleOpenOnPrint = 'true';</script>
    <script src="../_static/togglebutton.js?v=4a39c7ea"></script>
    <script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
    <script src="../_static/design-tabs.js?v=f930bc37"></script>
    <script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
    <script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
    <script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
    <script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
    <script type="module" src="https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs"></script>
    <script type="module" src="https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.2.0/dist/mermaid-layout-elk.esm.min.mjs"></script>
    <script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";import elkLayouts from "https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.2.0/dist/mermaid-layout-elk.esm.min.mjs";mermaid.registerLayoutLoaders(elkLayouts);mermaid.initialize({startOnLoad:false});</script>
    <script src="https://cdn.jsdelivr.net/npm/d3@7.9.0/dist/d3.min.js"></script>
    <script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";

const defaultStyle = document.createElement('style');
defaultStyle.textContent = `pre.mermaid {
    /* Same as .mermaid-container > pre */
    display: block;
    width: 100%;
}

pre.mermaid > svg {
    /* Same as .mermaid-container > pre > svg */
    height: 500px;
    width: 100%;
    max-width: 100% !important;
}
`;
document.head.appendChild(defaultStyle);

const fullscreenStyle = document.createElement('style');
fullscreenStyle.textContent = `.mermaid-container {
    display: flex;
    flex-direction: row;
    width: 100%;
}

.mermaid-container > pre {
    display: block;
    width: 100%;
}

.mermaid-container > pre > svg {
    height: 500px;
    width: 100%;
    max-width: 100% !important;
}

.mermaid-fullscreen-btn {
    width: 28px;
    height: 28px;
    background: rgba(255, 255, 255, 0.95);
    border: 1px solid rgba(0, 0, 0, 0.3);
    border-radius: 4px;
    cursor: pointer;
    display: flex;
    align-items: center;
    justify-content: center;
    transition: all 0.2s;
    box-shadow: 0 2px 6px rgba(0, 0, 0, 0.2);
    font-size: 14px;
    line-height: 1;
    padding: 0;
    color: #333;
}

.mermaid-fullscreen-btn:hover {
    opacity: 100% !important;
    background: rgba(255, 255, 255, 1);
    box-shadow: 0 3px 10px rgba(0, 0, 0, 0.3);
    transform: scale(1.1);
}

.mermaid-fullscreen-btn.dark-theme {
    background: rgba(50, 50, 50, 0.95);
    border: 1px solid rgba(255, 255, 255, 0.3);
    color: #e0e0e0;
}

.mermaid-fullscreen-btn.dark-theme:hover {
    background: rgba(60, 60, 60, 1);
    box-shadow: 0 3px 10px rgba(255, 255, 255, 0.2);
}

.mermaid-fullscreen-modal {
    display: none;
    position: fixed !important;
    top: 0 !important;
    left: 0 !important;
    width: 95vw;
    height: 100vh;
    background: rgba(255, 255, 255, 0.98);
    z-index: 9999;
    padding: 20px;
    overflow: auto;
}

.mermaid-fullscreen-modal.dark-theme {
    background: rgba(0, 0, 0, 0.98);
}

.mermaid-fullscreen-modal.active {
    display: flex;
    align-items: center;
    justify-content: center;
}

.mermaid-container-fullscreen {
    position: relative;
    width: 95vw;
    height: 90vh;
    max-width: 95vw;
    max-height: 90vh;
    background: white;
    border-radius: 8px;
    padding: 20px;
    box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
    overflow: auto;
    display: flex;
    align-items: center;
    justify-content: center;
}

.mermaid-container-fullscreen.dark-theme {
    background: #1a1a1a;
    box-shadow: 0 10px 40px rgba(0, 0, 0, 0.8);
}

.mermaid-container-fullscreen pre.mermaid {
    width: 100%;
    height: 100%;
    display: flex;
    align-items: center;
    justify-content: center;
}

.mermaid-container-fullscreen .mermaid svg {
    height: 100% !important;
    width: 100% !important;
    cursor: grab;
}

.mermaid-fullscreen-close {
    position: fixed !important;
    top: 20px !important;
    right: 20px !important;
    width: 40px;
    height: 40px;
    background: rgba(255, 255, 255, 0.95);
    border: 1px solid rgba(0, 0, 0, 0.2);
    border-radius: 50%;
    cursor: pointer;
    z-index: 10000;
    display: flex;
    align-items: center;
    justify-content: center;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
    transition: all 0.2s;
    font-size: 24px;
    line-height: 1;
    color: #333;
}

.mermaid-fullscreen-close:hover {
    background: white;
    box-shadow: 0 6px 16px rgba(0, 0, 0, 0.4);
    transform: scale(1.1);
}

.mermaid-fullscreen-close.dark-theme {
    background: rgba(50, 50, 50, 0.95);
    border: 1px solid rgba(255, 255, 255, 0.2);
    color: #e0e0e0;
}

.mermaid-fullscreen-close.dark-theme:hover {
    background: rgba(60, 60, 60, 1);
    box-shadow: 0 6px 16px rgba(255, 255, 255, 0.2);
}

.mermaid-fullscreen-modal .mermaid-fullscreen-btn {
    display: none !important;
}`;
document.head.appendChild(fullscreenStyle);

// Detect if page has dark background
const isDarkTheme = () => {
    const bgColor = window.getComputedStyle(document.body).backgroundColor;
    const match = bgColor.match(/rgb\((\d+),\s*(\d+),\s*(\d+)/);
    if (match) {
        const r = parseInt(match[1]);
        const g = parseInt(match[2]);
        const b = parseInt(match[3]);
        const brightness = (r * 299 + g * 587 + b * 114) / 1000;
        return brightness < 128;
    }
    return false;
};

const load = async () => {
    await mermaid.run();

    const all_mermaids = document.querySelectorAll(".mermaid");
    const mermaids_processed = document.querySelectorAll(".mermaid[data-processed='true']");

    if ("False" === "True") {
        const mermaids_to_add_zoom = -1 === -1 ? all_mermaids.length : -1;
        if(mermaids_to_add_zoom > 0) {
            var svgs = d3.selectAll("");
            if(all_mermaids.length !== mermaids_processed.length) {
                setTimeout(load, 200);
                return;
            } else if(svgs.size() !== mermaids_to_add_zoom) {
                setTimeout(load, 200);
                return;
            } else {
                svgs.each(function() {
                    var svg = d3.select(this);
                    svg.html("<g class='wrapper'>" + svg.html() + "</g>");
                    var inner = svg.select("g");
                    var zoom = d3.zoom().on("zoom", function(event) {
                        inner.attr("transform", event.transform);
                    });
                    svg.call(zoom);
                });
            }
        }
    } else if(all_mermaids.length !== mermaids_processed.length) {
        // Wait for mermaid to process all diagrams
        setTimeout(load, 200);
        return;
    }

    const darkTheme = isDarkTheme();

    // Stop here if not adding fullscreen capability
    if ("True" !== "True") return;

    const modal = document.createElement('div');
    modal.className = 'mermaid-fullscreen-modal' + (darkTheme ? ' dark-theme' : '');
    modal.setAttribute('role', 'dialog');
    modal.setAttribute('aria-modal', 'true');
    modal.setAttribute('aria-label', 'Fullscreen diagram viewer');
    modal.innerHTML = `
        <button class="mermaid-fullscreen-close${darkTheme ? ' dark-theme' : ''}" aria-label="Close fullscreen">✕</button>
        <div class="mermaid-container-fullscreen${darkTheme ? ' dark-theme' : ''}"></div>
    `;
    document.body.appendChild(modal);

    const modalContent = modal.querySelector('.mermaid-container-fullscreen');
    const closeBtn = modal.querySelector('.mermaid-fullscreen-close');

    let previousScrollOffset = [window.scrollX, window.scrollY];

    const closeModal = () => {
        modal.classList.remove('active');
        modalContent.innerHTML = '';
        document.body.style.overflow = ''
        window.scrollTo({left: previousScrollOffset[0], top: previousScrollOffset[1], behavior: 'instant'});
    };

    closeBtn.addEventListener('click', closeModal);
    modal.addEventListener('click', (e) => {
        if (e.target === modal) closeModal();
    });
    document.addEventListener('keydown', (e) => {
        if (e.key === 'Escape' && modal.classList.contains('active')) {
            closeModal();
        }
    });

    const allButtons = [];

    document.querySelectorAll('.mermaid').forEach((mermaidDiv) => {
        if (mermaidDiv.parentNode.classList.contains('mermaid-container') ||
            mermaidDiv.closest('.mermaid-fullscreen-modal')) {
            return;
        }

        const container = document.createElement('div');
        container.className = 'mermaid-container';
        mermaidDiv.parentNode.insertBefore(container, mermaidDiv);
        container.appendChild(mermaidDiv);

        const fullscreenBtn = document.createElement('button');
        fullscreenBtn.className = 'mermaid-fullscreen-btn' + (darkTheme ? ' dark-theme' : '');
        fullscreenBtn.setAttribute('aria-label', 'View diagram in fullscreen');
        fullscreenBtn.textContent = '⛶';
        fullscreenBtn.style.opacity = '50%';

        // Calculate dynamic position based on diagram's margin and padding
        const diagramStyle = window.getComputedStyle(mermaidDiv);
        const marginTop = parseFloat(diagramStyle.marginTop) || 0;
        const marginRight = parseFloat(diagramStyle.marginRight) || 0;
        const paddingTop = parseFloat(diagramStyle.paddingTop) || 0;
        const paddingRight = parseFloat(diagramStyle.paddingRight) || 0;
        fullscreenBtn.style.top = `${marginTop + paddingTop + 4}px`;
        fullscreenBtn.style.right = `${marginRight + paddingRight + 4}px`;

        fullscreenBtn.addEventListener('click', () => {
            previousScrollOffset = [window.scroll, window.scrollY];
            const clone = mermaidDiv.cloneNode(true);
            modalContent.innerHTML = '';
            modalContent.appendChild(clone);

            const svg = clone.querySelector('svg');
            if (svg) {
                svg.removeAttribute('width');
                svg.removeAttribute('height');
                svg.style.width = '100%';
                svg.style.height = 'auto';
                svg.style.maxWidth = '100%';
                svg.style.sdisplay = 'block';

                if ("False" === "True") {
                    setTimeout(() => {
                        const g = svg.querySelector('g');
                        if (g) {
                            var svgD3 = d3.select(svg);
                            svgD3.html("<g class='wrapper'>" + svgD3.html() + "</g>");
                            var inner = svgD3.select("g");
                            var zoom = d3.zoom().on("zoom", function(event) {
                                inner.attr("transform", event.transform);
                            });
                            svgD3.call(zoom);
                        }
                    }, 100);
                }
            }

            modal.classList.add('active');
            document.body.style.overflow = 'hidden';
        });

        container.appendChild(fullscreenBtn);
        allButtons.push(fullscreenBtn);
    });

    // Update theme classes when theme changes
    const updateTheme = () => {
        const dark = isDarkTheme();
        allButtons.forEach(btn => {
            if (dark) {
                btn.classList.add('dark-theme');
            } else {
                btn.classList.remove('dark-theme');
            }
        });
        if (dark) {
            modal.classList.add('dark-theme');
            modalContent.classList.add('dark-theme');
            closeBtn.classList.add('dark-theme');
        } else {
            modal.classList.remove('dark-theme');
            modalContent.classList.remove('dark-theme');
            closeBtn.classList.remove('dark-theme');
        }
    };

    // Watch for theme changes
    const observer = new MutationObserver(updateTheme);
    observer.observe(document.documentElement, {
        attributes: true,
        attributeFilter: ['class', 'style', 'data-theme']
    });
    observer.observe(document.body, {
        attributes: true,
        attributeFilter: ['class', 'style']
    });
};

window.addEventListener("load", load);
</script>
    <script>DOCUMENTATION_OPTIONS.pagename = 'modules/17_memoization_ABOUT';</script>
    <script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
    <script src="../_static/wip-banner.js?v=04a7e74d"></script>
    <script src="../_static/marimo-badges.js?v=e6289128"></script>
    <script src="../_static/sidebar-link.js?v=404b701b"></script>
    <script src="../_static/hero-carousel.js?v=10341d2a"></script>
    <script src="../_static/subscribe-modal.js?v=42919b64"></script>
    <link rel="icon" href="../_static/favicon.svg"/>
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" />
    <link rel="next" title="18. Acceleration - CPU Vectorization &amp; Cache Optimization" href="18_acceleration_ABOUT.html" />
    <link rel="prev" title="16. Compression - Pruning and Model Compression" href="16_compression_ABOUT.html" />
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <meta name="docsearch:language" content="en"/>
  </head>


  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">


  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>

  <div id="pst-scroll-pixel-helper"></div>

  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
    <i class="fa-solid fa-arrow-up"></i>Back to top</button>


  <input type="checkbox"
          class="sidebar-toggle"
          id="pst-primary-sidebar-checkbox"/>
  <label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>

  <input type="checkbox"
          class="sidebar-toggle"
          id="pst-secondary-sidebar-checkbox"/>
  <label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>

  <div class="search-button__wrapper">
    <div class="search-button__overlay"></div>
    <div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
      action="../search.html"
      method="get">
  <i class="fa-solid fa-magnifying-glass"></i>
  <input type="search"
         class="form-control"
         name="q"
         id="search-input"
         placeholder="Search..."
         aria-label="Search..."
         autocomplete="off"
         autocorrect="off"
         autocapitalize="off"
         spellcheck="false"/>
  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
  </div>

  <div class="pst-async-banner-revealer d-none">
  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>


    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
    </header>


  <div class="bd-container">
    <div class="bd-container__inner bd-page-width">


      <div class="bd-sidebar-primary bd-sidebar">


  <div class="sidebar-header-items sidebar-primary__section">


  </div>

    <div class="sidebar-primary-items__start sidebar-primary__section">
        <div class="sidebar-primary-item">


<a class="navbar-brand logo" href="../intro.html">


    <img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
    <script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>


</a></div>
        <div class="sidebar-primary-item">

 <script>
 document.write(`
   <button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass"></i>
    <span class="search-button__default-text">Search</span>
    <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
   </button>
 `);
 </script></div>
        <div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
    <div class="bd-toc-item navbar-nav active">
        <p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>

    </div>
</nav></div>
    </div>


  <div class="sidebar-primary-items__end sidebar-primary__section">
  </div>

  <div id="rtd-footer-container"></div>


      </div>

      <main id="main-content" class="bd-main" role="main">


<div class="sbt-scroll-pixel-helper"></div>

          <div class="bd-content">
            <div class="bd-article-container">

              <div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">

    <div class="header-article-items__start">

        <div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
  <span class="fa-solid fa-bars"></span>
</button></div>

    </div>


    <div class="header-article-items__end">

        <div class="header-article-item">

<div class="article-header-buttons">


<div class="dropdown dropdown-download-buttons">
  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
    <i class="fas fa-download"></i>
  </button>
  <ul class="dropdown-menu">


      <li><a href="../_sources/modules/17_memoization_ABOUT.md" target="_blank"
   class="btn btn-sm btn-download-source-button dropdown-item"
   title="Download source file"
   data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-file"></i>
  </span>
<span class="btn__text-container">.md</span>
</a>
</li>


      <li>
<button onclick="window.print()"
  class="btn btn-sm btn-download-pdf-button dropdown-item"
  title="Print to PDF"
  data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-file-pdf"></i>
  </span>
<span class="btn__text-container">.pdf</span>
</button>
</li>

  </ul>
</div>


<button onclick="toggleFullScreen()"
  class="btn btn-sm btn-fullscreen-button"
  title="Fullscreen mode"
  data-bs-placement="bottom" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-expand"></i>
  </span>

</button>


<script>
document.write(`
  <button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
    <i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
    <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
  </button>
`);
</script>


<script>
document.write(`
  <button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass fa-lg"></i>
  </button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <span class="fa-solid fa-list"></span>
</button>
</div></div>

    </div>

</div>
</div>


<div id="jb-print-docs-body" class="onlyprint">
    <h1>17. Memoization - Computational Reuse for Inference</h1>
    <!-- Table of contents -->
    <div id="print-main-content">
        <div id="jb-print-toc">

            <div>
                <h2> Contents </h2>
            </div>
            <nav aria-label="Page">
                <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters">Why This Matters</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kv-cache-optimization-flow">KV Cache Optimization Flow</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-autoregressive-generation-problem">The Autoregressive Generation Problem</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-caching-solution">The Caching Solution</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-impact">Production Impact</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-speed-trade-off">Memory-Speed Trade-off</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-components">Core Components</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#kvcache-data-structure">1. KVCache Data Structure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#non-invasive-cache-integration">2. Non-Invasive Cache Integration</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#cached-attention-logic">3. Cached Attention Logic</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-steps">Implementation Steps</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-1-design-kvcache-structure">Step 1: Design KVCache Structure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-2-implement-cache-updates">Step 2: Implement Cache Updates</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-3-enable-non-invasive-integration">Step 3: Enable Non-Invasive Integration</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-4-implement-cached-attention-forward">Step 4: Implement Cached Attention Forward</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-5-validate-correctness">Step 5: Validate Correctness</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#why-data-instead-of-tensor-operations">Why .data Instead of Tensor Operations?</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-profiling">Inline Testing &amp; Profiling</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-production-challenges">Real-World Production Challenges</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-optimization-patterns">Production Optimization Patterns</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mathematical-foundations">Mathematical Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#huggingface-cache-patterns-comparison">HuggingFace Cache Patterns Comparison</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#expected-speedup-by-sequence-length">Expected Speedup by Sequence Length</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-usage-by-model-size">Memory Usage by Model Size</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#throughput-impact">Throughput Impact</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#where-this-code-lives-in-the-final-package">Where This Code Lives in the Final Package</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#common-challenges-and-solutions">Common Challenges and Solutions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-1-cache-synchronization-across-layers">Challenge 1: Cache Synchronization Across Layers</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-2-memory-overhead-for-large-models">Challenge 2: Memory Overhead for Large Models</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-3-correctness-validation">Challenge 3: Correctness Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-4-integration-without-breaking-existing-code">Challenge 4: Integration Without Breaking Existing Code</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
            </nav>
        </div>
    </div>
</div>


<div id="searchbox"></div>
                <article class="bd-article">

  <section id="memoization-computational-reuse-for-inference">
<h1>17. Memoization - Computational Reuse for Inference<a class="headerlink" href="#memoization-computational-reuse-for-inference" title="Link to this heading">#</a></h1>
<p><strong>OPTIMIZATION TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 4-5 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>Memoization is a fundamental optimization pattern: cache computational results to avoid redundant work. You’ll apply this pattern to transformers through KV (Key-Value) caching, transforming O(n²) autoregressive generation into O(n) complexity and achieving 10-15x speedup. This optimization makes production language model serving economically viable.</p>
<p>This is inference-only optimization - you’ll implement caching patterns used in every production LLM from ChatGPT to Claude to GitHub Copilot.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Understand Memoization Pattern</strong>: Recognize when computational reuse through caching applies to ML problems and understand the memory-speed trade-off</p></li>
<li><p><strong>Implement KVCache Structure</strong>: Build efficient cache data structures with O(1) updates, proper memory management, and multi-layer support</p></li>
<li><p><strong>Apply Caching to Transformers</strong>: Integrate KV caching into attention layers without modifying existing transformer code (non-invasive enhancement)</p></li>
<li><p><strong>Measure Performance Gains</strong>: Profile latency improvements, measure O(n²) → O(n) complexity reduction, and understand speedup characteristics</p></li>
<li><p><strong>Analyze Production Trade-offs</strong>: Calculate cache memory costs, understand cache invalidation policies, and recognize when caching justifies its overhead</p></li>
</ul>
</section>
<section id="build-use-optimize">
<h2>Build → Use → Optimize<a class="headerlink" href="#build-use-optimize" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorch’s <strong>Build → Use → Optimize</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement KVCache data structure with efficient updates, cached attention integration, and multi-layer cache management</p></li>
<li><p><strong>Use</strong>: Apply caching to GPT text generation, measure 10-15x speedup over naive generation, and validate output correctness</p></li>
<li><p><strong>Optimize</strong>: Profile memory bandwidth bottlenecks, measure cache hit rates, and understand when memory cost exceeds latency benefit</p></li>
</ol>
</section>
<section id="why-this-matters">
<h2>Why This Matters<a class="headerlink" href="#why-this-matters" title="Link to this heading">#</a></h2>
<section id="kv-cache-optimization-flow">
<h3>KV Cache Optimization Flow<a class="headerlink" href="#kv-cache-optimization-flow" title="Link to this heading">#</a></h3>
<p>Caching stores computed keys and values, avoiding recomputation for each new token:</p>
<pre  class="mermaid">
        graph LR
    A[Token i&lt;br/&gt;Compute K_i, V_i] --&gt; B[Cache&lt;br/&gt;Store K_i, V_i]
    B --&gt; C[Token i+1&lt;br/&gt;New computation]
    C --&gt; D[Reuse&lt;br/&gt;K_i, V_i from cache]
    D --&gt; E[Only compute&lt;br/&gt;K_{i+1}, V_{i+1}]
    E --&gt; F[10-15× speedup]

    style A fill:#e3f2fd
    style C fill:#e3f2fd
    style B fill:#f3e5f5
    style D fill:#fff3e0
    style E fill:#ffe0b2
    style F fill:#f0fdf4
    </pre><p><strong>Optimization</strong>: Compute K,V once → Cache → Reuse for all future tokens → O(n²) → O(n) complexity</p>
</section>
<section id="the-autoregressive-generation-problem">
<h3>The Autoregressive Generation Problem<a class="headerlink" href="#the-autoregressive-generation-problem" title="Link to this heading">#</a></h3>
<p>Without caching, transformer generation has quadratic complexity:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Naive Generation (O(n²) complexity):
Step 1: Generate token 1  → Compute attention for [t₀]                (1 computation)
Step 2: Generate token 2  → Compute attention for [t₀, t₁]            (2 computations, t₀ RECOMPUTED!)
Step 3: Generate token 3  → Compute attention for [t₀, t₁, t₂]        (3 computations, t₀,t₁ RECOMPUTED!)
...
Step n: Generate token n  → Compute attention for [t₀, ..., tₙ]       (n computations, ALL RECOMPUTED!)

Total: 1 + 2 + 3 + ... + n = n(n+1)/2 = O(n²) complexity!
For 100 tokens: ~5,050 redundant K,V computations
</pre></div>
</div>
<p><strong>The Key Insight</strong>: K and V matrices for previous tokens NEVER change, yet we recompute them every step!</p>
</section>
<section id="the-caching-solution">
<h3>The Caching Solution<a class="headerlink" href="#the-caching-solution" title="Link to this heading">#</a></h3>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Cached Generation (O(n) complexity):
Step 1: Compute K₁, V₁ → Cache them → Attention with cached[K₁, V₁]
Step 2: Compute K₂, V₂ → Cache them → Attention with cached[K₁, K₂, V₁, V₂]  (reuse K₁, V₁!)
Step 3: Compute K₃, V₃ → Cache them → Attention with cached[K₁, K₂, K₃, V₁, V₂, V₃]  (reuse all!)

Total: 1 + 1 + 1 + ... + 1 = n computations (50x reduction for n=100!)
</pre></div>
</div>
</section>
<section id="production-impact">
<h3>Production Impact<a class="headerlink" href="#production-impact" title="Link to this heading">#</a></h3>
<p>KV caching is mandatory for all production LLM serving:</p>
<ul class="simple">
<li><p><strong>ChatGPT/GPT-4</strong>: Would be 50-100x slower without caching, making conversational AI economically infeasible</p></li>
<li><p><strong>Claude</strong>: Caches up to 100K tokens of context, enabling long document processing</p></li>
<li><p><strong>GitHub Copilot</strong>: Real-time code completion requires sub-100ms latency - impossible without caching</p></li>
<li><p><strong>Google Gemini</strong>: Multi-level caching (KV + intermediate layers) serves billions of requests daily</p></li>
</ul>
<p>Without KV caching, the computational cost would make these services prohibitively expensive.</p>
</section>
<section id="memory-speed-trade-off">
<h3>Memory-Speed Trade-off<a class="headerlink" href="#memory-speed-trade-off" title="Link to this heading">#</a></h3>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Traditional Approach (No Cache):
Memory:  O(1)          Cost: Negligible
Compute: O(n²)         Cost: Prohibitive for long sequences

Cached Approach (KV Cache):
Memory:  O(n × d_k)    Cost: ~18MB per batch for GPT-2
Compute: O(n)          Cost: 10-15x faster than naive

Trade-off Winner: Memory is cheap, compute is expensive!
Use O(n) memory to save O(n²) compute.
</pre></div>
</div>
</section>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="core-components">
<h3>Core Components<a class="headerlink" href="#core-components" title="Link to this heading">#</a></h3>
<p>You’ll implement three main components:</p>
<section id="kvcache-data-structure">
<h4>1. KVCache Data Structure<a class="headerlink" href="#kvcache-data-structure" title="Link to this heading">#</a></h4>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">KVCache</span><span class="p">:</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Efficient key-value cache for autoregressive generation.</span>

<span class="sd">    Memory Layout:</span>
<span class="sd">        keys:   (num_layers, batch, num_heads, seq_len, d_k)</span>
<span class="sd">        values: (num_layers, batch, num_heads, seq_len, d_v)</span>

<span class="sd">    For GPT-2 (12 layers, 12 heads, 1024 seq, 64 dims):</span>
<span class="sd">        12 layers × 12 heads × 1024 seq × 64 dims = ~9M values</span>
<span class="sd">        At FP32 (4 bytes): ~36MB per batch item</span>
<span class="sd">        At FP16 (2 bytes): ~18MB per batch item</span>

<span class="sd">    Operations:</span>
<span class="sd">        update(layer_idx, key, value) -&gt; None  # O(1) append</span>
<span class="sd">        get(layer_idx) -&gt; (cached_k, cached_v) # O(1) retrieval</span>
<span class="sd">        advance() -&gt; None                       # Increment position</span>
<span class="sd">        reset() -&gt; None                         # Clear for new sequence</span>
<span class="sd">    &quot;&quot;&quot;</span>
</pre></div>
</div>
<p><strong>Key Design Decisions</strong>:</p>
<ul class="simple">
<li><p>Pre-allocate cache tensors to avoid dynamic resizing overhead</p></li>
<li><p>Use position counter for O(1) indexed updates (no copying)</p></li>
<li><p>Store per-layer caches to support multi-layer transformers</p></li>
<li><p>Track sequence position externally for clean separation</p></li>
</ul>
</section>
<section id="non-invasive-cache-integration">
<h4>2. Non-Invasive Cache Integration<a class="headerlink" href="#non-invasive-cache-integration" title="Link to this heading">#</a></h4>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Enable KV caching WITHOUT modifying Module 12/13 code.</span>

<span class="sd">    This demonstrates non-invasive optimization - adding capabilities</span>
<span class="sd">    to existing systems without breaking them. Similar to how Module 05</span>
<span class="sd">    uses enable_autograd() to add gradient tracking to Tensors.</span>

<span class="sd">    Approach:</span>
<span class="sd">    1. Create KVCache sized for model architecture</span>
<span class="sd">    2. Store cache on model as model._kv_cache</span>
<span class="sd">    3. Wrap each attention layer&#39;s forward method with caching logic</span>
<span class="sd">    4. Intercept attention calls to manage cache automatically</span>

<span class="sd">    This is composition + monkey-patching - a critical ML systems pattern!</span>
<span class="sd">    &quot;&quot;&quot;</span>
</pre></div>
</div>
<p><strong>Why Non-Invasive?</strong></p>
<ul class="simple">
<li><p>Modules 12-13 (Attention, Transformers) work unchanged</p></li>
<li><p>Module 17 ADDS optimization, doesn’t BREAK old code</p></li>
<li><p>Teaches “forward-only” systems engineering: never modify earlier modules</p></li>
<li><p>Matches how production systems layer optimizations (vLLM, HuggingFace)</p></li>
</ul>
</section>
<section id="cached-attention-logic">
<h4>3. Cached Attention Logic<a class="headerlink" href="#cached-attention-logic" title="Link to this heading">#</a></h4>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">cached_forward</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Cache-aware attention with three paths:</span>

<span class="sd">    PATH 1: Training (seq_len &gt; 1)</span>
<span class="sd">        → Use original attention (preserve gradients)</span>
<span class="sd">        → O(n²) but needed for backpropagation</span>

<span class="sd">    PATH 2: First Token (seq_len == 1, cache empty)</span>
<span class="sd">        → Use original attention (initialize cache)</span>
<span class="sd">        → O(1) - just one token</span>

<span class="sd">    PATH 3: Cached Generation (seq_len == 1, cache populated)</span>
<span class="sd">        → Compute K,V for NEW token only</span>
<span class="sd">        → Retrieve ALL cached K,V (includes history)</span>
<span class="sd">        → Attention with cached context</span>
<span class="sd">        → O(n) - only compute new, reuse cache</span>
<span class="sd">        → THIS IS WHERE THE SPEEDUP HAPPENS!</span>
<span class="sd">    &quot;&quot;&quot;</span>
</pre></div>
</div>
</section>
</section>
<section id="implementation-steps">
<h3>Implementation Steps<a class="headerlink" href="#implementation-steps" title="Link to this heading">#</a></h3>
<section id="step-1-design-kvcache-structure">
<h4>Step 1: Design KVCache Structure<a class="headerlink" href="#step-1-design-kvcache-structure" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Initialize cache storage for all layers</p></li>
<li><p>Pre-allocate tensors with maximum sequence length</p></li>
<li><p>Track current sequence position (write pointer)</p></li>
<li><p>Implement update() for O(1) append operations</p></li>
<li><p>Implement get() for O(1) retrieval of valid cache portion</p></li>
</ol>
</section>
<section id="step-2-implement-cache-updates">
<h4>Step 2: Implement Cache Updates<a class="headerlink" href="#step-2-implement-cache-updates" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Validate layer index and sequence position</p></li>
<li><p>Write new K,V to current position (indexed assignment)</p></li>
<li><p>Advance position counter after all layers processed</p></li>
<li><p>Handle batch dimension and multi-head structure</p></li>
</ol>
</section>
<section id="step-3-enable-non-invasive-integration">
<h4>Step 3: Enable Non-Invasive Integration<a class="headerlink" href="#step-3-enable-non-invasive-integration" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Validate model has required attributes (embed_dim, num_layers, etc.)</p></li>
<li><p>Calculate head_dim from embed_dim and num_heads</p></li>
<li><p>Create KVCache instance sized for model</p></li>
<li><p>Store cache on model with model._kv_cache flag</p></li>
<li><p>Wrap each block’s attention.forward with caching logic</p></li>
</ol>
</section>
<section id="step-4-implement-cached-attention-forward">
<h4>Step 4: Implement Cached Attention Forward<a class="headerlink" href="#step-4-implement-cached-attention-forward" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Detect path: training (seq_len &gt; 1), first token (cache empty), or cached generation</p></li>
<li><p>For cached path: Compute Q,K,V projections for new token only</p></li>
<li><p>Reshape to multi-head format (batch, num_heads, 1, head_dim)</p></li>
<li><p>Update cache with new K,V pairs</p></li>
<li><p>Retrieve ALL cached K,V (history + new)</p></li>
<li><p>Compute attention: softmax(Q &#64; K^T / √d_k) &#64; V using NumPy (.data)</p></li>
<li><p>Apply output projection and return</p></li>
</ol>
</section>
<section id="step-5-validate-correctness">
<h4>Step 5: Validate Correctness<a class="headerlink" href="#step-5-validate-correctness" title="Link to this heading">#</a></h4>
<ol class="arabic simple">
<li><p>Test cache initialization and memory calculation</p></li>
<li><p>Verify single-token and multi-token updates</p></li>
<li><p>Validate multi-layer cache synchronization</p></li>
<li><p>Test reset functionality</p></li>
<li><p>Measure speedup vs non-cached generation</p></li>
</ol>
</section>
</section>
<section id="why-data-instead-of-tensor-operations">
<h3>Why .data Instead of Tensor Operations?<a class="headerlink" href="#why-data-instead-of-tensor-operations" title="Link to this heading">#</a></h3>
<p>In cached attention, we use NumPy via <code class="docutils literal notranslate"><span class="pre">.data</span></code> for three reasons:</p>
<ol class="arabic simple">
<li><p><strong>Explicit Intent</strong>: Makes it crystal clear this is inference-only</p>
<ul class="simple">
<li><p>Training: Uses Tensor operations → gradients tracked</p></li>
<li><p>Inference: Uses .data → no gradient overhead</p></li>
</ul>
</li>
<li><p><strong>Performance</strong>: Avoids any autograd bookkeeping</p>
<ul class="simple">
<li><p>Even small overhead matters in generation hotpath</p></li>
<li><p>Production LLMs (vLLM, llama.cpp) use similar patterns</p></li>
</ul>
</li>
<li><p><strong>Educational Clarity</strong>: Shows students the distinction</p>
<ul class="simple">
<li><p>“When do I need gradients?” (training)</p></li>
<li><p>“When can I skip them?” (inference)</p></li>
</ul>
</li>
</ol>
<p>We COULD use Tensor operations with requires_grad=False, but .data is more explicit and follows industry patterns.</p>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure you understand transformers and profiling:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch

<span class="c1"># Verify prerequisite modules</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>transformers
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>profiling
</pre></div>
</div>
<p><strong>Required Understanding</strong>:</p>
<ul class="simple">
<li><p>Multi-head attention mechanism (Module 12)</p></li>
<li><p>Transformer architecture (Module 13)</p></li>
<li><p>Latency profiling techniques (Module 14)</p></li>
<li><p>O(n²) complexity of attention computation</p></li>
</ul>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/17_memoization/memoization_dev.ipynb</span></code></p></li>
<li><p><strong>Profile naive generation</strong>: Measure O(n²) growth in latency as sequence lengthens</p></li>
<li><p><strong>Implement KVCache class</strong>: Build data structure with update(), get(), advance(), reset()</p></li>
<li><p><strong>Test cache operations</strong>: Verify single-token, multi-token, and multi-layer caching</p></li>
<li><p><strong>Implement enable_kv_cache()</strong>: Non-invasively patch model attention layers</p></li>
<li><p><strong>Build cached attention forward</strong>: Three-path logic (training, first token, cached generation)</p></li>
<li><p><strong>Measure speedup</strong>: Profile cached vs non-cached generation, validate O(n) complexity</p></li>
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">17</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">memoization</span></code></p></li>
</ol>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify memoization functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>memoization

<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>memoization<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p>✅ <strong>KVCache Initialization</strong>: Validate cache creation, memory calculation, and initial state</p></li>
<li><p>✅ <strong>Cache Updates</strong>: Test single-token append, multi-token sequences, and O(1) update performance</p></li>
<li><p>✅ <strong>Multi-Layer Synchronization</strong>: Verify independent per-layer caches with correct indexing</p></li>
<li><p>✅ <strong>Cache Retrieval</strong>: Test get() returns only valid cached portion (up to seq_pos)</p></li>
<li><p>✅ <strong>Non-Invasive Integration</strong>: Validate enable_kv_cache() works without breaking model</p></li>
<li><p>✅ <strong>Correctness Validation</strong>: Compare cached vs non-cached outputs (should be identical)</p></li>
<li><p>✅ <strong>Performance Measurement</strong>: Measure speedup at different sequence lengths</p></li>
<li><p>✅ <strong>Memory Tracking</strong>: Calculate cache size and validate memory usage</p></li>
</ul>
</section>
<section id="inline-testing-profiling">
<h3>Inline Testing &amp; Profiling<a class="headerlink" href="#inline-testing-profiling" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive validation with performance measurement:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Unit Test: KVCache Implementation</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">KVCache</span> <span class="n">Implementation</span><span class="o">...</span>
   <span class="n">Cache</span> <span class="n">initialized</span><span class="p">:</span> <span class="mf">0.59</span> <span class="n">MB</span>
<span class="err">✅</span> <span class="n">Cache</span> <span class="n">initialization</span> <span class="n">successful</span>
<span class="err">✅</span> <span class="n">Append</span> <span class="ow">and</span> <span class="n">retrieval</span> <span class="n">work</span> <span class="n">correctly</span>
<span class="err">✅</span> <span class="n">Multi</span><span class="o">-</span><span class="n">layer</span> <span class="n">caching</span> <span class="n">validated</span>
<span class="err">✅</span> <span class="n">Reset</span> <span class="n">functionality</span> <span class="n">verified</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">KVCache</span> <span class="err">✓</span>

<span class="c1"># Integration Test: Performance Measurement</span>
<span class="err">🔬</span> <span class="n">Profiling</span> <span class="n">Transformer</span> <span class="n">Generation</span> <span class="p">(</span><span class="n">Without</span> <span class="n">Caching</span><span class="p">):</span>
   <span class="n">Seq</span> <span class="n">Len</span>  <span class="o">|</span>  <span class="n">Latency</span> <span class="p">(</span><span class="n">ms</span><span class="p">)</span>  <span class="o">|</span>  <span class="n">Growth</span>
   <span class="o">---------|----------------|----------</span>
    <span class="mi">10</span>      <span class="o">|</span>    <span class="mf">2.34</span>        <span class="o">|</span>  <span class="n">baseline</span>
    <span class="mi">20</span>      <span class="o">|</span>    <span class="mf">4.89</span>        <span class="o">|</span>  <span class="mf">2.09</span><span class="err">×</span>
    <span class="mi">40</span>      <span class="o">|</span>   <span class="mf">10.12</span>        <span class="o">|</span>  <span class="mf">2.07</span><span class="err">×</span>
    <span class="mi">80</span>      <span class="o">|</span>   <span class="mf">21.45</span>        <span class="o">|</span>  <span class="mf">2.12</span><span class="err">×</span>
   <span class="mi">160</span>      <span class="o">|</span>   <span class="mf">45.67</span>        <span class="o">|</span>  <span class="mf">2.13</span><span class="err">×</span>

<span class="err">💡</span> <span class="n">Key</span> <span class="n">Observations</span><span class="p">:</span>
   <span class="err">•</span> <span class="n">Latency</span> <span class="n">grows</span> <span class="n">QUADRATICALLY</span> <span class="k">with</span> <span class="n">sequence</span> <span class="n">length</span>
   <span class="err">•</span> <span class="n">Each</span> <span class="n">new</span> <span class="n">token</span> <span class="n">forces</span> <span class="n">recomputation</span> <span class="n">of</span> <span class="n">ALL</span> <span class="n">previous</span> <span class="n">K</span><span class="p">,</span><span class="n">V</span> <span class="n">pairs</span>
   <span class="err">•</span> <span class="n">For</span> <span class="mi">160</span> <span class="n">tokens</span><span class="p">:</span> <span class="o">~</span><span class="mi">4</span><span class="err">×</span> <span class="n">time</span> <span class="n">vs</span> <span class="mi">80</span> <span class="n">tokens</span> <span class="p">(</span><span class="mi">2</span><span class="err">²</span> <span class="n">growth</span><span class="p">)</span>

<span class="err">🎯</span> <span class="n">The</span> <span class="n">Solution</span><span class="p">:</span> <span class="n">CACHE</span> <span class="n">the</span> <span class="n">K</span><span class="p">,</span><span class="n">V</span> <span class="n">values</span><span class="err">!</span> <span class="p">(</span><span class="n">That</span><span class="s1">&#39;s memoization)</span>
<span class="err">✅</span> <span class="n">Speedup</span><span class="p">:</span> <span class="mi">10</span><span class="o">-</span><span class="mi">15</span><span class="err">×</span> <span class="k">for</span> <span class="n">typical</span> <span class="n">generation</span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.perf.memoization</span><span class="w"> </span><span class="kn">import</span> <span class="n">KVCache</span><span class="p">,</span> <span class="n">enable_kv_cache</span>

<span class="c1"># Test cache with small transformer</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span>
    <span class="n">batch_size</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
    <span class="n">max_seq_len</span><span class="o">=</span><span class="mi">128</span><span class="p">,</span>
    <span class="n">num_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
    <span class="n">num_heads</span><span class="o">=</span><span class="mi">8</span><span class="p">,</span>
    <span class="n">head_dim</span><span class="o">=</span><span class="mi">64</span>
<span class="p">)</span>

<span class="c1"># Simulate generation loop</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.tensor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>

<span class="k">for</span> <span class="n">step</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">):</span>
    <span class="k">for</span> <span class="n">layer_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">4</span><span class="p">):</span>
        <span class="c1"># New key-value pairs for this step</span>
        <span class="n">new_k</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">64</span><span class="p">))</span>
        <span class="n">new_v</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">64</span><span class="p">))</span>

        <span class="c1"># Update cache (O(1) operation)</span>
        <span class="n">cache</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">layer_idx</span><span class="p">,</span> <span class="n">new_k</span><span class="p">,</span> <span class="n">new_v</span><span class="p">)</span>

    <span class="c1"># Advance position after all layers</span>
    <span class="n">cache</span><span class="o">.</span><span class="n">advance</span><span class="p">()</span>

<span class="c1"># Retrieve cached values</span>
<span class="n">cached_k</span><span class="p">,</span> <span class="n">cached_v</span> <span class="o">=</span> <span class="n">cache</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">layer_idx</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Cached 10 tokens: </span><span class="si">{</span><span class="n">cached_k</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>  <span class="c1"># (1, 8, 10, 64)</span>

<span class="c1"># Calculate memory usage</span>
<span class="n">mem_info</span> <span class="o">=</span> <span class="n">cache</span><span class="o">.</span><span class="n">get_memory_usage</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Cache memory: </span><span class="si">{</span><span class="n">mem_info</span><span class="p">[</span><span class="s1">&#39;total_mb&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> MB&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-production-challenges">
<h3>Real-World Production Challenges<a class="headerlink" href="#real-world-production-challenges" title="Link to this heading">#</a></h3>
<p><strong>Memory-Speed Trade-off Analysis</strong>:</p>
<ul class="simple">
<li><p>KV cache uses ~18MB per batch for GPT-2 (FP16). For batch=32, that’s 576MB.</p></li>
<li><p>On an 8GB GPU, how many concurrent users can you serve?</p></li>
<li><p>What’s the trade-off between batch size and cache size?</p></li>
<li><p>When does memory bandwidth (cache access) become the bottleneck instead of compute?</p></li>
</ul>
<p><strong>Cache Invalidation Policies</strong>:</p>
<ul class="simple">
<li><p>In multi-turn chat, when should you clear the cache?</p></li>
<li><p>What happens when context exceeds max_seq_len?</p></li>
<li><p>How do production systems like ChatGPT handle context window limits?</p></li>
<li><p>Compare eviction policies: LRU, FIFO, sliding window, importance-based</p></li>
</ul>
<p><strong>Distributed Caching for Large Models</strong>:</p>
<ul class="simple">
<li><p>For models too large for one GPU, you need tensor parallelism</p></li>
<li><p>How do you partition the KV cache across GPUs?</p></li>
<li><p>Which dimension should you shard: layers, heads, or sequence?</p></li>
<li><p>What’s the communication overhead for cache synchronization?</p></li>
</ul>
<p><strong>Quantized Caching</strong>:</p>
<ul class="simple">
<li><p>Storing cache in INT8 instead of FP16 saves 50% memory</p></li>
<li><p>What’s the accuracy impact of quantized KV cache?</p></li>
<li><p>When is this trade-off worth it?</p></li>
<li><p>How does quantization error accumulate over long sequences?</p></li>
</ul>
</section>
<section id="production-optimization-patterns">
<h3>Production Optimization Patterns<a class="headerlink" href="#production-optimization-patterns" title="Link to this heading">#</a></h3>
<p><strong>Multi-Level Caching</strong>:</p>
<ul class="simple">
<li><p>What if you cache not just K,V but intermediate layer activations?</p></li>
<li><p>How does HuggingFace’s <code class="docutils literal notranslate"><span class="pre">DynamicCache</span></code> differ from static pre-allocation?</p></li>
<li><p>When should you use persistent caching (save to disk) for very long conversations?</p></li>
</ul>
<p><strong>Speculation and Prefetching</strong>:</p>
<ul class="simple">
<li><p>What if you predict the next query and pre-compute KV cache?</p></li>
<li><p>How would speculative caching improve throughput?</p></li>
<li><p>What’s the risk if speculation is wrong?</p></li>
<li><p>When does prefetching justify its overhead?</p></li>
</ul>
</section>
<section id="mathematical-foundations">
<h3>Mathematical Foundations<a class="headerlink" href="#mathematical-foundations" title="Link to this heading">#</a></h3>
<p><strong>Complexity Reduction</strong>:</p>
<ul class="simple">
<li><p>Why does KV caching transform O(n²) into O(n)?</p></li>
<li><p>Calculate total operations for naive vs cached generation (n=100)</p></li>
<li><p>What’s the crossover point where caching overhead exceeds savings?</p></li>
</ul>
<p><strong>Memory Layout Optimization</strong>:</p>
<ul class="simple">
<li><p>Why pre-allocate cache instead of dynamic appending?</p></li>
<li><p>How does cache contiguity affect memory bandwidth?</p></li>
<li><p>Compare row-major vs column-major cache layouts for performance</p></li>
</ul>
<p><strong>Attention Computation Analysis</strong>:</p>
<ul class="simple">
<li><p>Why can we cache K,V but not Q (query)?</p></li>
<li><p>What property of autoregressive generation makes caching valid?</p></li>
<li><p>How would bidirectional attention (BERT) change caching strategy?</p></li>
</ul>
</section>
<section id="huggingface-cache-patterns-comparison">
<h3>HuggingFace Cache Patterns Comparison<a class="headerlink" href="#huggingface-cache-patterns-comparison" title="Link to this heading">#</a></h3>
<p><strong>Static vs Dynamic Cache</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch (Module 17): Static pre-allocation</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="n">max_seq_len</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>  <span class="c1"># Fixed size, O(1) updates</span>

<span class="c1"># HuggingFace: Dynamic cache (DynamicCache class)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">DynamicCache</span><span class="p">()</span>  <span class="c1"># Grows as needed, more flexible but slower</span>
</pre></div>
</div>
<p><strong>When to Use Each</strong>:</p>
<ul class="simple">
<li><p><strong>Static (TinyTorch)</strong>: Known max length, maximum performance, inference serving</p></li>
<li><p><strong>Dynamic (HuggingFace)</strong>: Variable lengths, exploration, research</p></li>
</ul>
<p><strong>Production Systems (vLLM, TGI)</strong>:</p>
<ul class="simple">
<li><p>Use PagedAttention for virtual memory management of KV cache</p></li>
<li><p>Enables efficient memory sharing across requests</p></li>
<li><p>Reduces memory fragmentation for variable-length sequences</p></li>
</ul>
</section>
</section>
<section id="performance-characteristics">
<h2>Performance Characteristics<a class="headerlink" href="#performance-characteristics" title="Link to this heading">#</a></h2>
<section id="expected-speedup-by-sequence-length">
<h3>Expected Speedup by Sequence Length<a class="headerlink" href="#expected-speedup-by-sequence-length" title="Link to this heading">#</a></h3>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Speedup Characteristics (GPT-2 on CPU):
┌─────────────┬──────────────┬──────────────┬──────────┐
│ Seq Length  │ No Cache     │ With Cache   │ Speedup  │
├─────────────┼──────────────┼──────────────┼──────────┤
│  10 tokens  │  ~80 tok/s   │  ~600 tok/s  │   7.5x   │
│  25 tokens  │  ~40 tok/s   │  ~500 tok/s  │  12.5x   │
│  50 tokens  │  ~25 tok/s   │  ~400 tok/s  │  16.0x   │
│ 100 tokens  │  ~12 tok/s   │  ~200 tok/s  │  16.7x   │
└─────────────┴──────────────┴──────────────┴──────────┘

Key Insight: Speedup increases with sequence length!
Why? Longer sequences = more redundant computation without cache.
</pre></div>
</div>
</section>
<section id="memory-usage-by-model-size">
<h3>Memory Usage by Model Size<a class="headerlink" href="#memory-usage-by-model-size" title="Link to this heading">#</a></h3>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Cache Memory Requirements (FP16, batch_size=1):
┌──────────────┬────────┬────────┬─────────┬──────────────┐
│ Model        │ Layers │ Heads  │ Seq Len │ Cache Memory │
├──────────────┼────────┼────────┼─────────┼──────────────┤
│ TinyGPT      │   4    │   4    │   128   │   0.5 MB     │
│ GPT-2 (124M) │  12    │  12    │  1024   │  18.0 MB     │
│ GPT-3 (175B) │  96    │  96    │  2048   │   4.7 GB     │
└──────────────┴────────┴────────┴─────────┴──────────────┘

Formula: memory = num_layers × num_heads × max_seq_len × head_dim × 2 × 2 bytes
(2× for K and V, 2 bytes for FP16)
</pre></div>
</div>
</section>
<section id="throughput-impact">
<h3>Throughput Impact<a class="headerlink" href="#throughput-impact" title="Link to this heading">#</a></h3>
<p><strong>Single Sequence Generation</strong>:</p>
<ul class="simple">
<li><p>Without cache: Throughput decreases as sequence grows (O(n²) bottleneck)</p></li>
<li><p>With cache: Throughput stays relatively constant (O(n) scales well)</p></li>
</ul>
<p><strong>Batch Inference</strong>:</p>
<ul class="simple">
<li><p>Cache memory scales linearly with batch size</p></li>
<li><p>Throughput increases with batching (amortize model loading)</p></li>
<li><p>Memory becomes limiting factor before compute</p></li>
</ul>
</section>
</section>
<section id="where-this-code-lives-in-the-final-package">
<h2>Where This Code Lives in the Final Package<a class="headerlink" href="#where-this-code-lives-in-the-final-package" title="Link to this heading">#</a></h2>
<p><strong>Package Export</strong>: Code exports to <code class="docutils literal notranslate"><span class="pre">tinytorch.generation.kv_cache</span></code></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># When students install tinytorch, they import your work like this:</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.perf.memoization</span><span class="w"> </span><span class="kn">import</span> <span class="n">KVCache</span><span class="p">,</span> <span class="n">enable_kv_cache</span><span class="p">,</span> <span class="n">disable_kv_cache</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.nn</span><span class="w"> </span><span class="kn">import</span> <span class="n">MultiHeadAttention</span>  <span class="c1"># Base class from Module 12</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.transformer</span><span class="w"> </span><span class="kn">import</span> <span class="n">GPT</span>  <span class="c1"># Architecture from Module 13</span>

<span class="c1"># Usage in generation:</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">GPT</span><span class="p">(</span><span class="n">vocab_size</span><span class="o">=</span><span class="mi">1000</span><span class="p">,</span> <span class="n">embed_dim</span><span class="o">=</span><span class="mi">128</span><span class="p">,</span> <span class="n">num_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_heads</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>  <span class="c1"># Non-invasively add caching</span>

<span class="c1"># Generate with caching enabled (10-15x faster!)</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">generate_text</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="o">=</span><span class="s2">&quot;Hello&quot;</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">100</span><span class="p">)</span>

<span class="c1"># Disable caching if needed</span>
<span class="n">disable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
</pre></div>
</div>
<p>Your KV caching implementation becomes the foundation for efficient inference in the TinyTorch package, used by subsequent modules for text generation, chat applications, and deployment scenarios.</p>
</section>
<section id="common-challenges-and-solutions">
<h2>Common Challenges and Solutions<a class="headerlink" href="#common-challenges-and-solutions" title="Link to this heading">#</a></h2>
<section id="challenge-1-cache-synchronization-across-layers">
<h3>Challenge 1: Cache Synchronization Across Layers<a class="headerlink" href="#challenge-1-cache-synchronization-across-layers" title="Link to this heading">#</a></h3>
<p><strong>Problem</strong>: Keeping cache consistent when different layers process at different speeds or batch items have variable lengths.</p>
<p><strong>Solution</strong>:</p>
<ul class="simple">
<li><p>Use layer indexing to maintain independent per-layer caches</p></li>
<li><p>Advance sequence position only after ALL layers have processed current token</p></li>
<li><p>Handle variable sequence lengths with padding and attention masks</p></li>
</ul>
<p><strong>Code Pattern</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Process all layers before advancing</span>
<span class="k">for</span> <span class="n">layer_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_layers</span><span class="p">):</span>
    <span class="n">cache</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">layer_idx</span><span class="p">,</span> <span class="n">new_k</span><span class="p">,</span> <span class="n">new_v</span><span class="p">)</span>

<span class="c1"># Now advance position (all layers synchronized)</span>
<span class="n">cache</span><span class="o">.</span><span class="n">advance</span><span class="p">()</span>
</pre></div>
</div>
</section>
<section id="challenge-2-memory-overhead-for-large-models">
<h3>Challenge 2: Memory Overhead for Large Models<a class="headerlink" href="#challenge-2-memory-overhead-for-large-models" title="Link to this heading">#</a></h3>
<p><strong>Problem</strong>: Cache memory grows with sequence length and batch size, potentially exceeding GPU memory.</p>
<p><strong>Solution</strong>:</p>
<ul class="simple">
<li><p>Implement cache size limits with eviction policies (LRU, FIFO)</p></li>
<li><p>Use FP16 or INT8 quantization for cache storage (50% memory reduction)</p></li>
<li><p>Consider PagedAttention for virtual memory management</p></li>
<li><p>Tune max_seq_len to expected generation length</p></li>
</ul>
<p><strong>Memory Optimization</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># FP16 caching (2 bytes per element)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="o">...</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float16</span><span class="p">)</span>  <span class="c1"># 50% memory savings</span>

<span class="c1"># INT8 caching (1 byte per element)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">KVCache</span><span class="p">(</span><span class="o">...</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">int8</span><span class="p">)</span>  <span class="c1"># 75% memory savings, accuracy trade-off</span>
</pre></div>
</div>
</section>
<section id="challenge-3-correctness-validation">
<h3>Challenge 3: Correctness Validation<a class="headerlink" href="#challenge-3-correctness-validation" title="Link to this heading">#</a></h3>
<p><strong>Problem</strong>: Cached generation must produce identical outputs to non-cached generation.</p>
<p><strong>Solution</strong>:</p>
<ul class="simple">
<li><p>Compare cached vs non-cached outputs token-by-token</p></li>
<li><p>Use deterministic sampling (temperature=0) for testing</p></li>
<li><p>Validate cache retrieval returns correct sequence positions</p></li>
<li><p>Test edge cases: first token, cache full, reset</p></li>
</ul>
<p><strong>Validation Pattern</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Generate without cache (ground truth)</span>
<span class="n">output_nocache</span> <span class="o">=</span> <span class="n">generate</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">50</span><span class="p">)</span>

<span class="c1"># Generate with cache (optimized)</span>
<span class="n">cache</span> <span class="o">=</span> <span class="n">enable_kv_cache</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="n">output_cached</span> <span class="o">=</span> <span class="n">generate</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">prompt</span><span class="p">,</span> <span class="n">max_new_tokens</span><span class="o">=</span><span class="mi">50</span><span class="p">)</span>

<span class="c1"># Validate identical outputs</span>
<span class="k">assert</span> <span class="n">np</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">output_nocache</span><span class="p">,</span> <span class="n">output_cached</span><span class="p">),</span> <span class="s2">&quot;Cached output must match!&quot;</span>
</pre></div>
</div>
</section>
<section id="challenge-4-integration-without-breaking-existing-code">
<h3>Challenge 4: Integration Without Breaking Existing Code<a class="headerlink" href="#challenge-4-integration-without-breaking-existing-code" title="Link to this heading">#</a></h3>
<p><strong>Problem</strong>: Adding caching shouldn’t require modifying Modules 12-13 (attention, transformer).</p>
<p><strong>Solution</strong>:</p>
<ul class="simple">
<li><p>Use composition + monkey-patching (wrap, don’t modify)</p></li>
<li><p>Store original forward methods before patching</p></li>
<li><p>Provide disable_kv_cache() to restore original behavior</p></li>
<li><p>Use feature flags (model._cache_enabled) for path selection</p></li>
</ul>
<p><strong>Non-Invasive Pattern</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Save original before patching</span>
<span class="n">block</span><span class="o">.</span><span class="n">_original_attention_forward</span> <span class="o">=</span> <span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span>

<span class="c1"># Patch with cached version</span>
<span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span> <span class="o">=</span> <span class="n">cached_forward</span>

<span class="c1"># Restore later if needed</span>
<span class="n">block</span><span class="o">.</span><span class="n">attention</span><span class="o">.</span><span class="n">forward</span> <span class="o">=</span> <span class="n">block</span><span class="o">.</span><span class="n">_original_attention_forward</span>
</pre></div>
</div>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>You’re about to implement the optimization that makes production language models economically viable! KV caching is THE technique that transformed LLMs from research toys into products used by millions daily.</p>
<p>This is where theory meets practice in ML systems engineering. You’ll see firsthand how a simple idea - “don’t recompute what never changes” - can deliver 10-15x speedup and make the impossible possible.</p>
<p><strong>What makes this module special</strong>: Unlike many optimizations that require deep algorithmic changes, KV caching is conceptually simple but profoundly impactful. You’ll implement it from scratch, measure the dramatic speedup, and understand the memory-speed trade-offs that guide production deployments.</p>
<p>Understanding this optimization from first principles - implementing it yourself, profiling the speedup, analyzing the trade-offs - will give you deep insight into how production ML systems work. This is the optimization that makes ChatGPT, Claude, and GitHub Copilot possible.</p>
<p>Take your time, measure thoroughly, and enjoy building production-ready ML systems!</p>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
🚀 Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/17_memoization/memoization_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/17_memoization/memoization_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
⚡ Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
📖 View Source</div>
<p class="sd-card-text">Browse the Jupyter notebook source and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/17_memoization/memoization_dev.ipynb</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">💾 Save Your Progress</p>
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="../16_compression/ABOUT.html" title="previous page">← Previous Module</a>
<a class="right-next" href="../18_acceleration/ABOUT.html" title="next page">Next Module →</a>
</div>
</section>
</section>

    <script type="text/x-thebe-config">
    {
        requestKernel: true,
        binderOptions: {
            repo: "binder-examples/jupyter-stacks-datascience",
            ref: "master",
        },
        codeMirrorConfig: {
            theme: "abcdef",
            mode: "python"
        },
        kernelOptions: {
            name: "python3",
            path: "./modules"
        },
        predefinedOutput: true
    }
    </script>
    <script>kernelName = 'python3'</script>

                </article>


                <footer class="prev-next-footer d-print-none">

<div class="prev-next-area">
    <a class="left-prev"
       href="16_compression_ABOUT.html"
       title="previous page">
      <i class="fa-solid fa-angle-left"></i>
      <div class="prev-next-info">
        <p class="prev-next-subtitle">previous</p>
        <p class="prev-next-title">16. Compression - Pruning and Model Compression</p>
      </div>
    </a>
    <a class="right-next"
       href="18_acceleration_ABOUT.html"
       title="next page">
      <div class="prev-next-info">
        <p class="prev-next-subtitle">next</p>
        <p class="prev-next-title">18. Acceleration - CPU Vectorization &amp; Cache Optimization</p>
      </div>
      <i class="fa-solid fa-angle-right"></i>
    </a>
</div>
                </footer>

            </div>


                <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">


  <div class="sidebar-secondary-item">
  <div class="page-toc tocsection onthispage">
    <i class="fa-solid fa-list"></i> Contents
  </div>
  <nav class="bd-toc-nav page-toc">
    <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters">Why This Matters</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kv-cache-optimization-flow">KV Cache Optimization Flow</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-autoregressive-generation-problem">The Autoregressive Generation Problem</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-caching-solution">The Caching Solution</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-impact">Production Impact</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-speed-trade-off">Memory-Speed Trade-off</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-components">Core Components</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#kvcache-data-structure">1. KVCache Data Structure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#non-invasive-cache-integration">2. Non-Invasive Cache Integration</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#cached-attention-logic">3. Cached Attention Logic</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-steps">Implementation Steps</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-1-design-kvcache-structure">Step 1: Design KVCache Structure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-2-implement-cache-updates">Step 2: Implement Cache Updates</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-3-enable-non-invasive-integration">Step 3: Enable Non-Invasive Integration</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-4-implement-cached-attention-forward">Step 4: Implement Cached Attention Forward</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#step-5-validate-correctness">Step 5: Validate Correctness</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#why-data-instead-of-tensor-operations">Why .data Instead of Tensor Operations?</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-profiling">Inline Testing &amp; Profiling</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-production-challenges">Real-World Production Challenges</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-optimization-patterns">Production Optimization Patterns</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mathematical-foundations">Mathematical Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#huggingface-cache-patterns-comparison">HuggingFace Cache Patterns Comparison</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#expected-speedup-by-sequence-length">Expected Speedup by Sequence Length</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-usage-by-model-size">Memory Usage by Model Size</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#throughput-impact">Throughput Impact</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#where-this-code-lives-in-the-final-package">Where This Code Lives in the Final Package</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#common-challenges-and-solutions">Common Challenges and Solutions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-1-cache-synchronization-across-layers">Challenge 1: Cache Synchronization Across Layers</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-2-memory-overhead-for-large-models">Challenge 2: Memory Overhead for Large Models</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-3-correctness-validation">Challenge 3: Correctness Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#challenge-4-integration-without-breaking-existing-code">Challenge 4: Integration Without Breaking Existing Code</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
  </nav></div>

</div></div>


          </div>
          <footer class="bd-footer-content">

<div class="bd-footer-content__inner container">

  <div class="footer-item">

<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>

  </div>

  <div class="footer-item">


  <p class="copyright">

      © Copyright 2025.
      <br/>

  </p>

  </div>

  <div class="footer-item">

  </div>

  <div class="footer-item">

  </div>

</div>
          </footer>


      </main>
    </div>
  </div>

  <!-- Scripts loaded after <body> so the DOM is not blocked -->
  <script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>

  <footer class="bd-footer">
  </footer>
  </body>
</html>