TinyTorch/dev/modules/15_quantization_ABOUT.html


<!DOCTYPE html>


<html lang="en" data-content_root="../" >

  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />

    <title>15. Quantization - Reduced Precision for Efficiency &#8212; Tiny🔥Torch</title>


  <script data-cfasync="false">
    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
  </script>

  <!-- Loaded before other Sphinx assets -->
  <link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />


  <link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
  <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />

    <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
    <link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
    <link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
    <link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
    <link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
    <link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
    <link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
    <link rel="stylesheet" type="text/css" href="../_static/custom.css?v=afcf7c3c" />

  <!-- Pre-loaded scripts that we'll load fully later -->
  <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
  <script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>

    <script src="../_static/documentation_options.js?v=9eb32ce0"></script>
    <script src="../_static/doctools.js?v=9a2dae69"></script>
    <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
    <script src="../_static/clipboard.min.js?v=a7894cd8"></script>
    <script src="../_static/copybutton.js?v=f281be69"></script>
    <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
    <script>let toggleHintShow = 'Click to show';</script>
    <script>let toggleHintHide = 'Click to hide';</script>
    <script>let toggleOpenOnPrint = 'true';</script>
    <script src="../_static/togglebutton.js?v=4a39c7ea"></script>
    <script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
    <script src="../_static/design-tabs.js?v=f930bc37"></script>
    <!-- Thebe settings are declared exactly once: repeating this classic script re-declares the const bindings and throws a SyntaxError. -->
    <script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
    <script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
    <script type="module" src="https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs"></script>
    <script type="module" src="https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.1.4/dist/mermaid-layout-elk.esm.min.mjs"></script>
    <script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";import elkLayouts from "https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.1.4/dist/mermaid-layout-elk.esm.min.mjs";mermaid.registerLayoutLoaders(elkLayouts);mermaid.initialize({startOnLoad:false});</script>
    <script src="https://cdn.jsdelivr.net/npm/d3@7.9.0/dist/d3.min.js"></script>
    <script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";

// Baseline sizing rules for inline Mermaid diagrams, injected into <head> as a <style> element.
const defaultStyle = Object.assign(document.createElement('style'), {
    textContent: `pre.mermaid {
    /* Same as .mermaid-container > pre */
    display: block;
    width: 100%;
}

pre.mermaid > svg {
    /* Same as .mermaid-container > pre > svg */
    height: 500px;
    width: 100%;
    max-width: 100% !important;
}
`,
});
document.head.appendChild(defaultStyle);

// Styles for the per-diagram container, the fullscreen trigger button, and the
// fullscreen modal (including their `.dark-theme` variants), injected into <head>.
const fullscreenStyle = Object.assign(document.createElement('style'), {
    textContent: `.mermaid-container {
    display: flex;
    flex-direction: row;
    width: 100%;
}

.mermaid-container > pre {
    display: block;
    width: 100%;
}

.mermaid-container > pre > svg {
    height: 500px;
    width: 100%;
    max-width: 100% !important;
}

.mermaid-fullscreen-btn {
    width: 28px;
    height: 28px;
    background: rgba(255, 255, 255, 0.95);
    border: 1px solid rgba(0, 0, 0, 0.3);
    border-radius: 4px;
    cursor: pointer;
    display: flex;
    align-items: center;
    justify-content: center;
    transition: all 0.2s;
    box-shadow: 0 2px 6px rgba(0, 0, 0, 0.2);
    font-size: 14px;
    line-height: 1;
    padding: 0;
    color: #333;
}

.mermaid-fullscreen-btn:hover {
    opacity: 100% !important;
    background: rgba(255, 255, 255, 1);
    box-shadow: 0 3px 10px rgba(0, 0, 0, 0.3);
    transform: scale(1.1);
}

.mermaid-fullscreen-btn.dark-theme {
    background: rgba(50, 50, 50, 0.95);
    border: 1px solid rgba(255, 255, 255, 0.3);
    color: #e0e0e0;
}

.mermaid-fullscreen-btn.dark-theme:hover {
    background: rgba(60, 60, 60, 1);
    box-shadow: 0 3px 10px rgba(255, 255, 255, 0.2);
}

.mermaid-fullscreen-modal {
    display: none;
    position: fixed !important;
    top: 0 !important;
    left: 0 !important;
    width: 95vw;
    height: 100vh;
    background: rgba(255, 255, 255, 0.98);
    z-index: 9999;
    padding: 20px;
    overflow: auto;
}

.mermaid-fullscreen-modal.dark-theme {
    background: rgba(0, 0, 0, 0.98);
}

.mermaid-fullscreen-modal.active {
    display: flex;
    align-items: center;
    justify-content: center;
}

.mermaid-container-fullscreen {
    position: relative;
    width: 95vw;
    height: 90vh;
    max-width: 95vw;
    max-height: 90vh;
    background: white;
    border-radius: 8px;
    padding: 20px;
    box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
    overflow: auto;
    display: flex;
    align-items: center;
    justify-content: center;
}

.mermaid-container-fullscreen.dark-theme {
    background: #1a1a1a;
    box-shadow: 0 10px 40px rgba(0, 0, 0, 0.8);
}

.mermaid-container-fullscreen pre.mermaid {
    width: 100%;
    height: 100%;
    display: flex;
    align-items: center;
    justify-content: center;
}

.mermaid-container-fullscreen .mermaid svg {
    height: 100% !important;
    width: 100% !important;
    cursor: grab;
}

.mermaid-fullscreen-close {
    position: fixed !important;
    top: 20px !important;
    right: 20px !important;
    width: 40px;
    height: 40px;
    background: rgba(255, 255, 255, 0.95);
    border: 1px solid rgba(0, 0, 0, 0.2);
    border-radius: 50%;
    cursor: pointer;
    z-index: 10000;
    display: flex;
    align-items: center;
    justify-content: center;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
    transition: all 0.2s;
    font-size: 24px;
    line-height: 1;
    color: #333;
}

.mermaid-fullscreen-close:hover {
    background: white;
    box-shadow: 0 6px 16px rgba(0, 0, 0, 0.4);
    transform: scale(1.1);
}

.mermaid-fullscreen-close.dark-theme {
    background: rgba(50, 50, 50, 0.95);
    border: 1px solid rgba(255, 255, 255, 0.2);
    color: #e0e0e0;
}

.mermaid-fullscreen-close.dark-theme:hover {
    background: rgba(60, 60, 60, 1);
    box-shadow: 0 6px 16px rgba(255, 255, 255, 0.2);
}

.mermaid-fullscreen-modal .mermaid-fullscreen-btn {
    display: none !important;
}`,
});
document.head.appendChild(fullscreenStyle);

// Detect if page has dark background
// Report whether <body> currently renders on a dark background.
// The computed background colour is parsed as `rgb(r, g, b`; a weighted
// brightness below 128 counts as dark. Any colour string that does not
// match that form (for example `rgba(...)`) is reported as not dark.
const isDarkTheme = () => {
    const { backgroundColor } = window.getComputedStyle(document.body);
    const channels = backgroundColor.match(/rgb\((\d+),\s*(\d+),\s*(\d+)/);
    if (!channels) {
        return false;
    }
    // Explicit arrow so that map's index argument is never passed to parseInt as a radix.
    const [red, green, blue] = channels.slice(1, 4).map((value) => parseInt(value));
    const brightness = (red * 299 + green * 587 + blue * 114) / 1000;
    return brightness < 128;
};

/**
 * Render all Mermaid diagrams, then add a fullscreen viewer to each of them.
 *
 * Steps:
 *  1. Run Mermaid and, if not every `.mermaid` element is marked
 *     `data-processed='true'` yet, retry in 200 ms.
 *  2. Create one shared modal (dialog role, close button, content area).
 *  3. Wrap every diagram in a `.mermaid-container` and append a button that
 *     clones the diagram into the modal.
 *  4. Keep the `dark-theme` class on buttons and modal in sync with the page
 *     theme via a MutationObserver.
 *
 * The literal comparisons such as `"False" === "True"` look like configuration
 * flags rendered into the page at build time; on this page the zoom branches
 * are always skipped and the fullscreen branch is always taken.
 */
const load = async () => {
    await mermaid.run();

    const all_mermaids = document.querySelectorAll(".mermaid");
    const mermaids_processed = document.querySelectorAll(".mermaid[data-processed='true']");

    // D3 zoom wiring: this condition is always false here, so the branch never runs.
    if ("False" === "True") {
        const mermaids_to_add_zoom = -1 === -1 ? all_mermaids.length : -1;
        if(mermaids_to_add_zoom > 0) {
            var svgs = d3.selectAll("");
            if(all_mermaids.length !== mermaids_processed.length) {
                setTimeout(load, 200);
                return;
            } else if(svgs.size() !== mermaids_to_add_zoom) {
                setTimeout(load, 200);
                return;
            } else {
                svgs.each(function() {
                    var svg = d3.select(this);
                    svg.html("<g class='wrapper'>" + svg.html() + "</g>");
                    var inner = svg.select("g");
                    var zoom = d3.zoom().on("zoom", function(event) {
                        inner.attr("transform", event.transform);
                    });
                    svg.call(zoom);
                });
            }
        }
    } else if(all_mermaids.length !== mermaids_processed.length) {
        // Wait for mermaid to process all diagrams
        setTimeout(load, 200);
        return;
    }

    const darkTheme = isDarkTheme();

    // Stop here if not adding fullscreen capability
    if ("True" !== "True") return;

    // One modal is shared by all diagrams; its content is swapped on each open.
    const modal = document.createElement('div');
    modal.className = 'mermaid-fullscreen-modal' + (darkTheme ? ' dark-theme' : '');
    modal.setAttribute('role', 'dialog');
    modal.setAttribute('aria-modal', 'true');
    modal.setAttribute('aria-label', 'Fullscreen diagram viewer');
    modal.innerHTML = `
        <button class="mermaid-fullscreen-close${darkTheme ? ' dark-theme' : ''}" aria-label="Close fullscreen">✕</button>
        <div class="mermaid-container-fullscreen${darkTheme ? ' dark-theme' : ''}"></div>
    `;
    document.body.appendChild(modal);

    const modalContent = modal.querySelector('.mermaid-container-fullscreen');
    const closeBtn = modal.querySelector('.mermaid-fullscreen-close');

    // [x, y] scroll position captured when the modal opens, restored on close.
    let previousScrollOffset = [window.scrollX, window.scrollY];

    // Hide the modal, drop the cloned diagram, re-enable page scrolling and
    // jump back to where the reader was.
    const closeModal = () => {
        modal.classList.remove('active');
        modalContent.innerHTML = '';
        document.body.style.overflow = '';
        window.scrollTo({left: previousScrollOffset[0], top: previousScrollOffset[1], behavior: 'instant'});
    };

    closeBtn.addEventListener('click', closeModal);
    // A click on the backdrop itself (not on the diagram) closes the modal.
    modal.addEventListener('click', (e) => {
        if (e.target === modal) closeModal();
    });
    document.addEventListener('keydown', (e) => {
        if (e.key === 'Escape' && modal.classList.contains('active')) {
            closeModal();
        }
    });

    const allButtons = [];

    document.querySelectorAll('.mermaid').forEach((mermaidDiv) => {
        // Skip diagrams that are already wrapped, and clones living inside the modal.
        if (mermaidDiv.parentNode.classList.contains('mermaid-container') ||
            mermaidDiv.closest('.mermaid-fullscreen-modal')) {
            return;
        }

        const container = document.createElement('div');
        container.className = 'mermaid-container';
        mermaidDiv.parentNode.insertBefore(container, mermaidDiv);
        container.appendChild(mermaidDiv);

        const fullscreenBtn = document.createElement('button');
        fullscreenBtn.className = 'mermaid-fullscreen-btn' + (darkTheme ? ' dark-theme' : '');
        fullscreenBtn.setAttribute('aria-label', 'View diagram in fullscreen');
        fullscreenBtn.textContent = '⛶';
        fullscreenBtn.style.opacity = '50%';

        // Calculate dynamic position based on diagram's margin and padding
        const diagramStyle = window.getComputedStyle(mermaidDiv);
        const marginTop = parseFloat(diagramStyle.marginTop) || 0;
        const marginRight = parseFloat(diagramStyle.marginRight) || 0;
        const paddingTop = parseFloat(diagramStyle.paddingTop) || 0;
        const paddingRight = parseFloat(diagramStyle.paddingRight) || 0;
        fullscreenBtn.style.top = `${marginTop + paddingTop + 4}px`;
        fullscreenBtn.style.right = `${marginRight + paddingRight + 4}px`;

        fullscreenBtn.addEventListener('click', () => {
            // Fix: record the numeric horizontal offset (window.scrollX), not the
            // window.scroll function, so closeModal restores both axes.
            previousScrollOffset = [window.scrollX, window.scrollY];
            const clone = mermaidDiv.cloneNode(true);
            modalContent.innerHTML = '';
            modalContent.appendChild(clone);

            const svg = clone.querySelector('svg');
            if (svg) {
                // Drop fixed dimensions so the clone scales to the modal.
                svg.removeAttribute('width');
                svg.removeAttribute('height');
                svg.style.width = '100%';
                svg.style.height = 'auto';
                svg.style.maxWidth = '100%';
                // Fix: the property is `display`; `sdisplay` was a typo and had no effect.
                svg.style.display = 'block';

                // D3 zoom inside the modal: always false here, so never executed.
                if ("False" === "True") {
                    setTimeout(() => {
                        const g = svg.querySelector('g');
                        if (g) {
                            var svgD3 = d3.select(svg);
                            svgD3.html("<g class='wrapper'>" + svgD3.html() + "</g>");
                            var inner = svgD3.select("g");
                            var zoom = d3.zoom().on("zoom", function(event) {
                                inner.attr("transform", event.transform);
                            });
                            svgD3.call(zoom);
                        }
                    }, 100);
                }
            }

            modal.classList.add('active');
            // Prevent the page behind the modal from scrolling.
            document.body.style.overflow = 'hidden';
        });

        container.appendChild(fullscreenBtn);
        allButtons.push(fullscreenBtn);
    });

    // Update theme classes when theme changes
    const updateTheme = () => {
        const dark = isDarkTheme();
        [...allButtons, modal, modalContent, closeBtn].forEach((element) => {
            element.classList.toggle('dark-theme', dark);
        });
    };

    // Watch for theme changes
    const observer = new MutationObserver(updateTheme);
    observer.observe(document.documentElement, {
        attributes: true,
        attributeFilter: ['class', 'style', 'data-theme']
    });
    observer.observe(document.body, {
        attributes: true,
        attributeFilter: ['class', 'style']
    });
};

window.addEventListener("load", load);
</script>
    <script>DOCUMENTATION_OPTIONS.pagename = 'modules/15_quantization_ABOUT';</script>
    <script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
    <script src="../_static/wip-banner.js?v=5357532b"></script>
    <script src="../_static/marimo-badges.js?v=1e5d2842"></script>
    <script src="../_static/sidebar-link.js?v=404b701b"></script>
    <script src="../_static/hero-carousel.js?v=10341d2a"></script>
    <link rel="icon" href="../_static/favicon.svg"/>
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" />
    <link rel="next" title="16. Compression - Pruning and Model Compression" href="16_compression_ABOUT.html" />
    <link rel="prev" title="14. Profiling - Performance Measurement for ML Systems" href="14_profiling_ABOUT.html" />
  <meta name="viewport" content="width=device-width, initial-scale=1"/>
  <meta name="docsearch:language" content="en"/>
  </head>


  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">


  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>

  <div id="pst-scroll-pixel-helper"></div>

  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
    <i class="fa-solid fa-arrow-up"></i>Back to top</button>


  <input type="checkbox"
          class="sidebar-toggle"
          id="pst-primary-sidebar-checkbox"/>
  <label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>

  <input type="checkbox"
          class="sidebar-toggle"
          id="pst-secondary-sidebar-checkbox"/>
  <label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>

  <div class="search-button__wrapper">
    <div class="search-button__overlay"></div>
    <div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
      action="../search.html"
      method="get">
  <i class="fa-solid fa-magnifying-glass"></i>
  <input type="search"
         class="form-control"
         name="q"
         id="search-input"
         placeholder="Search this book..."
         aria-label="Search this book..."
         autocomplete="off"
         autocorrect="off"
         autocapitalize="off"
         spellcheck="false"/>
  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
  </div>

  <div class="pst-async-banner-revealer d-none">
  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>


    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
    </header>


  <div class="bd-container">
    <div class="bd-container__inner bd-page-width">


      <div class="bd-sidebar-primary bd-sidebar">


  <div class="sidebar-header-items sidebar-primary__section">


  </div>

    <div class="sidebar-primary-items__start sidebar-primary__section">
        <div class="sidebar-primary-item">


<a class="navbar-brand logo" href="../intro.html">


    <img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
    <script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>


</a></div>
        <div class="sidebar-primary-item">

 <script>
 document.write(`
   <button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass"></i>
    <span class="search-button__default-text">Search</span>
    <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
   </button>
 `);
 </script></div>
        <div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
    <div class="bd-toc-item navbar-nav active">

        <ul class="nav bd-sidenav bd-sidenav__home-link">
            <li class="toctree-l1">
                <a class="reference internal" href="../intro.html">
                    Getting Started
                </a>
            </li>
        </ul>
        <p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../quickstart-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../student-workflow.html">Student Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../usage-paths/classroom-use.html">For Instructors</a></li>
<li class="toctree-l1"><a class="reference internal" href="../instructor-guide.html">Instructor Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../usage-paths/ta-guide.html">TA Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../usage-paths/team-onboarding.html">Team Onboarding</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>

    </div>
</nav></div>
    </div>


  <div class="sidebar-primary-items__end sidebar-primary__section">
  </div>

  <div id="rtd-footer-container"></div>


      </div>

      <main id="main-content" class="bd-main" role="main">


<div class="sbt-scroll-pixel-helper"></div>

          <div class="bd-content">
            <div class="bd-article-container">

              <div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">

    <div class="header-article-items__start">

        <div class="header-article-item"><button type="button" class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" aria-label="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
  <span class="fa-solid fa-bars"></span>
</button></div>

    </div>


    <div class="header-article-items__end">

        <div class="header-article-item">

<div class="article-header-buttons">


<div class="dropdown dropdown-source-buttons">
  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Source repositories">
    <i class="fab fa-github"></i>
  </button>
  <ul class="dropdown-menu">


      <li><a href="https://github.com/mlsysbook/TinyTorch" target="_blank"
   class="btn btn-sm btn-source-repository-button dropdown-item"
   title="Source repository"
   data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fab fa-github"></i>
  </span>
<span class="btn__text-container">Repository</span>
</a>
</li>


      <li><a href="https://github.com/mlsysbook/TinyTorch/edit/main/site/modules/15_quantization_ABOUT.md" target="_blank"
   class="btn btn-sm btn-source-edit-button dropdown-item"
   title="Suggest edit"
   data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-pencil-alt"></i>
  </span>
<span class="btn__text-container">Suggest edit</span>
</a>
</li>


      <!-- The query-string ampersand is escaped as &amp; so the attribute is valid HTML; rel="noopener" keeps the new tab from reaching window.opener. -->
      <li><a href="https://github.com/mlsysbook/TinyTorch/issues/new?title=Issue%20on%20page%20%2Fmodules/15_quantization_ABOUT.html&amp;body=Your%20issue%20content%20here." target="_blank" rel="noopener"
   class="btn btn-sm btn-source-issues-button dropdown-item"
   title="Open an issue"
   data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-lightbulb"></i>
  </span>
<span class="btn__text-container">Open issue</span>
</a>
</li>

  </ul>
</div>


<div class="dropdown dropdown-download-buttons">
  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
    <i class="fas fa-download"></i>
  </button>
  <ul class="dropdown-menu">


      <li><a href="../_sources/modules/15_quantization_ABOUT.md" target="_blank"
   class="btn btn-sm btn-download-source-button dropdown-item"
   title="Download source file"
   data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-file"></i>
  </span>
<span class="btn__text-container">.md</span>
</a>
</li>


      <li>
<button onclick="window.print()"
  class="btn btn-sm btn-download-pdf-button dropdown-item"
  title="Print to PDF"
  data-bs-placement="left" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-file-pdf"></i>
  </span>
<span class="btn__text-container">.pdf</span>
</button>
</li>

  </ul>
</div>


<button onclick="toggleFullScreen()"
  type="button"
  class="btn btn-sm btn-fullscreen-button"
  title="Fullscreen mode"
  aria-label="Fullscreen mode"
  data-bs-placement="bottom" data-bs-toggle="tooltip"
>


<span class="btn__icon-container">
  <i class="fas fa-expand"></i>
  </span>

</button>


<script>
document.write(`
  <button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
    <i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
    <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
  </button>
`);
</script>


<script>
document.write(`
  <button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <i class="fa-solid fa-magnifying-glass fa-lg"></i>
  </button>
`);
</script>
<button type="button" class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" aria-label="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
    <span class="fa-solid fa-list"></span>
</button>
</div></div>

    </div>

</div>
</div>


<div id="jb-print-docs-body" class="onlyprint">
    <h1>15. Quantization - Reduced Precision for Efficiency</h1>
    <!-- Table of contents -->
    <div id="print-main-content">
        <div id="jb-print-toc">

            <div>
                <h2> Contents </h2>
            </div>
            <nav aria-label="Page">
                <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantization-flow-fp32-int8">Quantization Flow: FP32 → INT8</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#what-youre-actually-building-educational-quantization">What You’re Actually Building (Educational Quantization)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-quantization-mathematics">Core Quantization Mathematics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#calibration-the-critical-step">Calibration - The Critical Step</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#per-tensor-vs-per-channel-quantization">Per-Tensor vs Per-Channel Quantization</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantizedlinear-quantized-neural-network-layer">QuantizedLinear - Quantized Neural Network Layer</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#model-level-quantization">Model-Level Quantization</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-quantization-analysis">Inline Testing &amp; Quantization Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantization-mathematics">Quantization Mathematics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-deployment-characteristics">Production Deployment Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
            </nav>
        </div>
    </div>
</div>


<div id="searchbox"></div>
                <article class="bd-article">

  <section id="quantization-reduced-precision-for-efficiency">
<h1>15. Quantization - Reduced Precision for Efficiency<a class="headerlink" href="#quantization-reduced-precision-for-efficiency" title="Link to this heading">#</a></h1>
<p><strong>OPTIMIZATION TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 5-6 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>This module implements quantization fundamentals: converting FP32 tensors to INT8 representation to reduce memory by 4×. You’ll build the mathematics of scale/zero-point quantization, implement quantized linear layers, and measure accuracy-efficiency trade-offs. CRITICAL HONESTY: You’re implementing quantization math in Python, NOT actual hardware INT8 operations. This teaches the principles that enable TensorFlow Lite/PyTorch Mobile deployment, but real speedups require specialized hardware (Edge TPU, Neural Engine) or compiled frameworks with INT8 kernels. Your implementation will be 4× more memory-efficient but not faster - understanding WHY teaches you what production quantization frameworks must optimize.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Quantization Mathematics</strong>: Implement symmetric and asymmetric INT8 quantization with scale/zero-point parameter calculation</p></li>
<li><p><strong>Calibration Strategies</strong>: Design percentile-based calibration to minimize accuracy loss when selecting quantization parameters</p></li>
<li><p><strong>Memory-Accuracy Trade-offs</strong>: Measure when 4× memory reduction justifies 0.5-2% accuracy degradation for deployment</p></li>
<li><p><strong>Production Reality</strong>: Distinguish between educational quantization (Python simulation) vs production INT8 (hardware acceleration, kernel fusion)</p></li>
<li><p><strong>When to Quantize</strong>: Recognize deployment scenarios where quantization is mandatory (mobile/edge) vs optional (cloud serving)</p></li>
</ul>
</section>
<section id="build-use-optimize">
<h2>Build → Use → Optimize<a class="headerlink" href="#build-use-optimize" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorch’s <strong>Build → Use → Optimize</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement INT8 quantization/dequantization, calibration logic, QuantizedLinear layers</p></li>
<li><p><strong>Use</strong>: Quantize trained models, measure accuracy degradation vs memory savings on MNIST/CIFAR</p></li>
<li><p><strong>Optimize</strong>: Analyze the accuracy-efficiency frontier - when does quantization enable deployment vs hurt accuracy unacceptably?</p></li>
</ol>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="quantization-flow-fp32-int8">
<h3>Quantization Flow: FP32 → INT8<a class="headerlink" href="#quantization-flow-fp32-int8" title="Link to this heading">#</a></h3>
<p>Quantization compresses weights by reducing precision, trading accuracy for memory efficiency:</p>
<pre  class="mermaid">
        graph LR
    A[FP32 Weight&lt;br/&gt;4 bytes&lt;br/&gt;-3.14159] --&gt; B[Quantize&lt;br/&gt;scale + zero_point]
    B --&gt; C[INT8 Weight&lt;br/&gt;1 byte&lt;br/&gt;-126]
    C --&gt; D[Dequantize&lt;br/&gt;Inference]
    D --&gt; E[FP32 Compute&lt;br/&gt;Result]

    style A fill:#e3f2fd
    style B fill:#fff3e0
    style C fill:#f3e5f5
    style D fill:#ffe0b2
    style E fill:#f0fdf4
    </pre><p><strong>Flow</strong>: Original FP32 → Calibrate scale → Store as INT8 (4× smaller) → Dequantize for computation → FP32 result</p>
</section>
<section id="what-youre-actually-building-educational-quantization">
<h3>What You’re Actually Building (Educational Quantization)<a class="headerlink" href="#what-youre-actually-building-educational-quantization" title="Link to this heading">#</a></h3>
<p><strong>Your Implementation:</strong></p>
<ul class="simple">
<li><p>Quantization math: FP32 → INT8 conversion with scale/zero-point</p></li>
<li><p>QuantizedLinear: Store weights as INT8, compute in simulated quantized arithmetic</p></li>
<li><p>Calibration: Find optimal scale parameters from representative data</p></li>
<li><p>Memory measurement: Verify 4× reduction (32 bits → 8 bits)</p></li>
</ul>
<p><strong>What You’re NOT Building:</strong></p>
<ul class="simple">
<li><p>Actual INT8 hardware operations (requires CPU VNNI, ARM NEON, GPU Tensor Cores)</p></li>
<li><p>Kernel fusion (eliminating quantize/dequantize overhead)</p></li>
<li><p>Mixed-precision execution graphs (FP32 for sensitive ops, INT8 for matmul)</p></li>
<li><p>Production deployment pipelines (TensorFlow Lite converter, ONNX Runtime optimization)</p></li>
</ul>
<p><strong>Why This Matters:</strong> Understanding quantization math is essential. But knowing that production speedups require hardware acceleration + compiler optimization prevents unrealistic expectations. Your 4× memory reduction is real; your lack of speedup teaches why TensorFlow Lite needs custom kernels.</p>
</section>
<section id="core-quantization-mathematics">
<h3>Core Quantization Mathematics<a class="headerlink" href="#core-quantization-mathematics" title="Link to this heading">#</a></h3>
<p><strong>Symmetric Quantization (Zero-Point = 0)</strong></p>
<p>Assumes data is centered around zero (common after BatchNorm):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Quantization: FP32 → INT8</span>
<span class="n">scale</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="nb">abs</span><span class="p">(</span><span class="n">tensor</span><span class="p">))</span> <span class="o">/</span> <span class="mf">127.0</span>  <span class="c1"># Scale factor</span>
<span class="n">quantized</span> <span class="o">=</span> <span class="nb">round</span><span class="p">(</span><span class="n">tensor</span> <span class="o">/</span> <span class="n">scale</span><span class="p">)</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="o">-</span><span class="mi">128</span><span class="p">,</span> <span class="mi">127</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">int8</span><span class="p">)</span>

<span class="c1"># Dequantization: INT8 → FP32</span>
<span class="n">dequantized</span> <span class="o">=</span> <span class="n">quantized</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">float32</span><span class="p">)</span> <span class="o">*</span> <span class="n">scale</span>
</pre></div>
</div>
<ul class="simple">
<li><p><strong>Range</strong>: INT8 is [-128, 127] (256 values)</p></li>
<li><p><strong>Scale</strong>: Maps the largest-magnitude FP32 value to ±127</p></li>
<li><p><strong>Zero-point</strong>: Always 0 (symmetric around origin)</p></li>
<li><p><strong>Use case</strong>: Weights after normalization, activations after BatchNorm</p></li>
</ul>
<p><strong>Asymmetric Quantization (With Zero-Point)</strong></p>
<p>Handles arbitrary data ranges (e.g., activations after ReLU: [0, max]):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Quantization: FP32 → INT8</span>
<span class="n">min_val</span><span class="p">,</span> <span class="n">max_val</span> <span class="o">=</span> <span class="n">tensor</span><span class="o">.</span><span class="n">min</span><span class="p">(),</span> <span class="n">tensor</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
<span class="n">scale</span> <span class="o">=</span> <span class="p">(</span><span class="n">max_val</span> <span class="o">-</span> <span class="n">min_val</span><span class="p">)</span> <span class="o">/</span> <span class="mf">255.0</span>
<span class="n">zero_point</span> <span class="o">=</span> <span class="nb">round</span><span class="p">(</span><span class="o">-</span><span class="mi">128</span> <span class="o">-</span> <span class="n">min_val</span> <span class="o">/</span> <span class="n">scale</span><span class="p">)</span>  <span class="c1"># min_val → -128, max_val → 127</span>
<span class="n">quantized</span> <span class="o">=</span> <span class="nb">round</span><span class="p">(</span><span class="n">tensor</span> <span class="o">/</span> <span class="n">scale</span> <span class="o">+</span> <span class="n">zero_point</span><span class="p">)</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="o">-</span><span class="mi">128</span><span class="p">,</span> <span class="mi">127</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">int8</span><span class="p">)</span>

<span class="c1"># Dequantization: INT8 → FP32</span>
<span class="n">dequantized</span> <span class="o">=</span> <span class="p">(</span><span class="n">quantized</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">float32</span><span class="p">)</span> <span class="o">-</span> <span class="n">zero_point</span><span class="p">)</span> <span class="o">*</span> <span class="n">scale</span>
</pre></div>
</div>
<ul class="simple">
<li><p><strong>Range</strong>: Uses full [-128, 127] even if data is [0, 5]</p></li>
<li><p><strong>Scale</strong>: Maps data range to INT8 range</p></li>
<li><p><strong>Zero-point</strong>: Offset ensuring FP32 zero maps to specific INT8 value</p></li>
<li><p><strong>Use case</strong>: ReLU activations, input images, any non-centered data</p></li>
</ul>
<p><strong>Trade-off:</strong> Symmetric is simpler (no zero-point storage/computation), asymmetric uses range more efficiently (better for skewed distributions).</p>
</section>
<section id="calibration-the-critical-step">
<h3>Calibration - The Critical Step<a class="headerlink" href="#calibration-the-critical-step" title="Link to this heading">#</a></h3>
<p>Quantization quality depends entirely on scale/zero-point selection. Poor choices destroy accuracy.</p>
<p><strong>Naive Approach (Don’t Do This):</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Use global min/max from training data</span>
<span class="n">scale</span> <span class="o">=</span> <span class="p">(</span><span class="n">tensor_max</span> <span class="o">-</span> <span class="n">tensor_min</span><span class="p">)</span> <span class="o">/</span> <span class="mi">255</span>
<span class="c1"># Problem: Single outlier wastes most INT8 range</span>
<span class="c1"># Example: data in [0, 5] but one outlier at 100 → scale = 100/255</span>
<span class="c1"># Result: all non-outlier data maps to only ~13 of the 256 INT8 values (5/100 * 255 ≈ 13)</span>
</pre></div>
</div>
<p><strong>Calibration Approach (Correct):</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Use percentile-based clipping</span>
<span class="n">max_val</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">percentile</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">calibration_data</span><span class="p">),</span> <span class="mf">99.9</span><span class="p">)</span>
<span class="n">scale</span> <span class="o">=</span> <span class="n">max_val</span> <span class="o">/</span> <span class="mi">127</span>
<span class="c1"># Clips 0.1% outliers, uses INT8 range efficiently</span>
<span class="c1"># 99.9th percentile ignores rare outliers, preserves typical range</span>
</pre></div>
</div>
<p><strong>Calibration Process:</strong></p>
<ol class="arabic simple">
<li><p>Collect 100-1000 samples of representative data (validation set)</p></li>
<li><p>For each layer, record activation statistics during forward passes</p></li>
<li><p>Compute percentile-based min/max (typically 99.9th percentile)</p></li>
<li><p>Calculate scale/zero-point from clipped statistics</p></li>
<li><p>Quantize weights/activations using calibrated parameters</p></li>
</ol>
<p><strong>Why It Works:</strong> Most activations follow normal-ish distributions. Outliers are rare but dominate min/max. Clipping 0.1% of outliers uses INT8 range 10-100× more efficiently with negligible accuracy loss.</p>
</section>
<section id="per-tensor-vs-per-channel-quantization">
<h3>Per-Tensor vs Per-Channel Quantization<a class="headerlink" href="#per-tensor-vs-per-channel-quantization" title="Link to this heading">#</a></h3>
<p><strong>Per-Tensor Quantization:</strong></p>
<ul class="simple">
<li><p>One scale/zero-point for entire weight tensor</p></li>
<li><p>Simple: store 2 parameters per layer</p></li>
<li><p>Example: Conv2D with 64×3×3×3 weights uses 1 scale, 1 zero-point</p></li>
</ul>
<p><strong>Per-Channel Quantization:</strong></p>
<ul class="simple">
<li><p>Separate scale/zero-point per output channel</p></li>
<li><p>Better accuracy: each channel uses its natural range</p></li>
<li><p>Example: Conv2D with 64 output channels uses 64 scales, 64 zero-points</p></li>
<li><p>Overhead: 128 extra parameters (64 scales + 64 zero-points)</p></li>
</ul>
<p><strong>When to Use Per-Channel:</strong></p>
<ul class="simple">
<li><p>Weight magnitudes vary significantly across channels (common in Conv layers)</p></li>
<li><p>Accuracy improvement (0.5-1.5%) justifies 0.1-0.5% memory overhead</p></li>
<li><p>Production frameworks (PyTorch, TensorFlow Lite) default to per-channel for Conv/Linear</p></li>
</ul>
<p><strong>Trade-off Table:</strong></p>
<div class="pst-scrollable-table-container"><table class="table">
<thead>
<tr class="row-odd"><th class="head"><p>Quantization Scheme</p></th>
<th class="head"><p>Parameters</p></th>
<th class="head"><p>Accuracy</p></th>
<th class="head"><p>Complexity</p></th>
<th class="head"><p>Use Case</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Per-Tensor</p></td>
<td><p>2 per layer</p></td>
<td><p>Baseline</p></td>
<td><p>Simple</p></td>
<td><p>Fast prototyping, small models</p></td>
</tr>
<tr class="row-odd"><td><p>Per-Channel (Conv)</p></td>
<td><p>2N (N=channels)</p></td>
<td><p>+0.5-1.5%</p></td>
<td><p>Medium</p></td>
<td><p>Production Conv layers</p></td>
</tr>
<tr class="row-even"><td><p>Per-Channel (Linear)</p></td>
<td><p>2N (N=out_features)</p></td>
<td><p>+0.3-0.8%</p></td>
<td><p>Medium</p></td>
<td><p>Production Linear layers</p></td>
</tr>
<tr class="row-odd"><td><p>Mixed (Conv per-channel, Linear per-tensor)</p></td>
<td><p>Hybrid</p></td>
<td><p>+0.4-1.2%</p></td>
<td><p>Medium</p></td>
<td><p>Balanced approach</p></td>
</tr>
</tbody>
</table>
</div>
</section>
<section id="quantizedlinear-quantized-neural-network-layer">
<h3>QuantizedLinear - Quantized Neural Network Layer<a class="headerlink" href="#quantizedlinear-quantized-neural-network-layer" title="Link to this heading">#</a></h3>
<p>Replaces regular Linear layer with quantized equivalent:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">QuantizedLinear</span><span class="p">:</span>
    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">linear_layer</span><span class="p">:</span> <span class="n">Linear</span><span class="p">):</span>
        <span class="c1"># Quantize weights at initialization</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">weights_int8</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight_scale</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight_zp</span> <span class="o">=</span> <span class="n">quantize_int8</span><span class="p">(</span><span class="n">linear_layer</span><span class="o">.</span><span class="n">weight</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">bias_int8</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias_scale</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias_zp</span> <span class="o">=</span> <span class="n">quantize_int8</span><span class="p">(</span><span class="n">linear_layer</span><span class="o">.</span><span class="n">bias</span><span class="p">)</span>

        <span class="c1"># Store original FP32 for accuracy comparison</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">original_weight</span> <span class="o">=</span> <span class="n">linear_layer</span><span class="o">.</span><span class="n">weight</span>

    <span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
        <span class="c1"># EDUCATIONAL VERSION: Dequantize → compute in FP32 → return FP32 result</span>
        <span class="c1"># (Simulates quantization math but doesn&#39;t speed up computation)</span>
        <span class="n">weight_fp32</span> <span class="o">=</span> <span class="n">dequantize_int8</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">weights_int8</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight_scale</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight_zp</span><span class="p">)</span>
        <span class="n">bias_fp32</span> <span class="o">=</span> <span class="n">dequantize_int8</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">bias_int8</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias_scale</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias_zp</span><span class="p">)</span>

        <span class="c1"># Compute in FP32 (not actually faster - just lower precision storage)</span>
        <span class="n">output</span> <span class="o">=</span> <span class="n">x</span> <span class="o">@</span> <span class="n">weight_fp32</span><span class="o">.</span><span class="n">T</span> <span class="o">+</span> <span class="n">bias_fp32</span>
        <span class="k">return</span> <span class="n">output</span>
</pre></div>
</div>
<p><strong>What Happens in Production (TensorFlow Lite, PyTorch Mobile):</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Production quantized matmul (conceptual - happens in C++/assembly)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">quantized_matmul_production</span><span class="p">(</span><span class="n">x_int8</span><span class="p">,</span> <span class="n">weight_int8</span><span class="p">,</span> <span class="n">x_scale</span><span class="p">,</span> <span class="n">weight_scale</span><span class="p">,</span> <span class="n">output_scale</span><span class="p">):</span>
    <span class="c1"># 1. INT8 x INT8 matmul using VNNI/NEON/Tensor Cores (FAST)</span>
    <span class="n">accum_int32</span> <span class="o">=</span> <span class="n">matmul_int8_hardware</span><span class="p">(</span><span class="n">x_int8</span><span class="p">,</span> <span class="n">weight_int8</span><span class="p">)</span>  <span class="c1"># Specialized instruction</span>

    <span class="c1"># 2. Requantize accumulated INT32 → INT8 output</span>
    <span class="n">combined_scale</span> <span class="o">=</span> <span class="p">(</span><span class="n">x_scale</span> <span class="o">*</span> <span class="n">weight_scale</span><span class="p">)</span> <span class="o">/</span> <span class="n">output_scale</span>
    <span class="n">output_int8</span> <span class="o">=</span> <span class="p">(</span><span class="n">accum_int32</span> <span class="o">*</span> <span class="n">combined_scale</span><span class="p">)</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="o">-</span><span class="mi">128</span><span class="p">,</span> <span class="mi">127</span><span class="p">)</span>

    <span class="c1"># 3. Stay in INT8 for next layer (no dequantization unless necessary)</span>
    <span class="k">return</span> <span class="n">output_int8</span>
</pre></div>
</div>
<p><strong>Key Differences:</strong></p>
<ul class="simple">
<li><p><strong>Your implementation</strong>: Dequantize → FP32 compute → FP32 output (educational, slow)</p></li>
<li><p><strong>Production</strong>: INT8 → INT8 throughout, specialized hardware (2-10× speedup)</p></li>
</ul>
<ul class="simple">
<li><p><strong>Memory Savings (Real):</strong> 4× reduction from storing INT8 instead of FP32</p></li>
<li><p><strong>Speed Improvement (Your Code):</strong> None (~1× at best; Python overhead dominates)</p></li>
<li><p><strong>Speed Improvement (Production):</strong> 2-10× (hardware acceleration, kernel fusion)</p></li>
</ul>
</section>
<section id="model-level-quantization">
<h3>Model-Level Quantization<a class="headerlink" href="#model-level-quantization" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">quantize_model</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">calibration_data</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Quantize all Linear layers in model.</span>

<span class="sd">    Args:</span>
<span class="sd">        model: Neural network with Linear layers</span>
<span class="sd">        calibration_data: Representative samples for activation calibration</span>

<span class="sd">    Returns:</span>
<span class="sd">        quantized_layers: List of layers with each Linear replaced by a</span>
<span class="sd">            QuantizedLinear (scale/zero-point are stored on each layer)</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="n">quantized_layers</span> <span class="o">=</span> <span class="p">[]</span>
    <span class="k">for</span> <span class="n">layer</span> <span class="ow">in</span> <span class="n">model</span><span class="o">.</span><span class="n">layers</span><span class="p">:</span>
        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span> <span class="n">Linear</span><span class="p">):</span>
            <span class="n">q_layer</span> <span class="o">=</span> <span class="n">QuantizedLinear</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span>
            <span class="k">if</span> <span class="n">calibration_data</span><span class="p">:</span>
                <span class="n">q_layer</span><span class="o">.</span><span class="n">calibrate</span><span class="p">(</span><span class="n">calibration_data</span><span class="p">)</span>  <span class="c1"># Find optimal scales</span>
            <span class="n">quantized_layers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">q_layer</span><span class="p">)</span>
        <span class="k">else</span><span class="p">:</span>
            <span class="n">quantized_layers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span>  <span class="c1"># Keep ReLU, Softmax in FP32</span>

    <span class="k">return</span> <span class="n">quantized_layers</span>
</pre></div>
</div>
<p><strong>Calibration in Practice:</strong></p>
<ol class="arabic simple">
<li><p>Run 100-1000 samples through original FP32 model</p></li>
<li><p>Record min/max activations for each layer</p></li>
<li><p>Compute percentile-clipped scales</p></li>
<li><p>Quantize weights with calibrated parameters</p></li>
<li><p>Test accuracy on validation set</p></li>
</ol>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure you’ve completed profiling fundamentals:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch

<span class="c1"># Verify prerequisite modules</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>--module<span class="w"> </span>profiling
</pre></div>
</div>
<p><strong>Required Understanding:</strong></p>
<ul class="simple">
<li><p>Memory profiling (Module 14): Measuring memory consumption</p></li>
<li><p>Tensor operations (Module 01): Understanding FP32 representation</p></li>
<li><p>Linear layers (Module 03): Matrix multiplication mechanics</p></li>
</ul>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/15_quantization/quantization_dev.py</span></code></p></li>
<li><p><strong>Implement quantize_int8()</strong>: FP32 → INT8 conversion with scale/zero-point calculation</p></li>
<li><p><strong>Implement dequantize_int8()</strong>: INT8 → FP32 restoration</p></li>
<li><p><strong>Build QuantizedLinear</strong>: Replace Linear layers with quantized versions</p></li>
<li><p><strong>Add calibration logic</strong>: Percentile-based scale selection</p></li>
<li><p><strong>Implement quantize_model()</strong>: Convert entire networks to quantized form</p></li>
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">15</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">--module</span> <span class="pre">quantization</span></code></p></li>
</ol>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify quantization functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>--module<span class="w"> </span>quantization

<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>quantization<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p>✅ <strong>Quantization Correctness</strong>: FP32 → INT8 → FP32 roundtrip error bounds (&lt; 0.5% mean error)</p></li>
<li><p>✅ <strong>Memory Reduction</strong>: Verify 4× reduction in model size (weights + biases)</p></li>
<li><p>✅ <strong>Symmetric vs Asymmetric</strong>: Both schemes produce valid INT8 in [-128, 127]</p></li>
<li><p>✅ <strong>Calibration Impact</strong>: Percentile clipping reduces quantization error vs naive min/max</p></li>
<li><p>✅ <strong>QuantizedLinear Equivalence</strong>: Output matches FP32 Linear within tolerance (&lt; 1% difference)</p></li>
<li><p>✅ <strong>Model-Level Quantization</strong>: Full network quantization preserves accuracy (&lt; 2% degradation)</p></li>
</ul>
</section>
<section id="inline-testing-quantization-analysis">
<h3>Inline Testing &amp; Quantization Analysis<a class="headerlink" href="#inline-testing-quantization-analysis" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive validation with real-time feedback:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Example inline test output</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">quantize_int8</span><span class="p">()</span><span class="o">...</span>
<span class="err">✅</span> <span class="n">Symmetric</span> <span class="n">quantization</span><span class="p">:</span> <span class="nb">range</span> <span class="p">[</span><span class="o">-</span><span class="mi">128</span><span class="p">,</span> <span class="mi">127</span><span class="p">]</span> <span class="err">✓</span>
<span class="err">✅</span> <span class="n">Scale</span> <span class="n">calculation</span><span class="p">:</span> <span class="n">max_val</span> <span class="o">/</span> <span class="mi">127</span> <span class="o">=</span> <span class="mf">0.0234</span> <span class="err">✓</span>
<span class="err">✅</span> <span class="n">Roundtrip</span> <span class="n">error</span><span class="p">:</span> <span class="mf">0.31</span><span class="o">%</span> <span class="n">mean</span> <span class="n">error</span> <span class="err">✓</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">quantize_int8</span><span class="p">()</span> <span class="err">✓</span>

<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">QuantizedLinear</span><span class="o">...</span>
<span class="err">✅</span> <span class="n">Memory</span> <span class="n">reduction</span><span class="p">:</span> <span class="mi">145</span><span class="n">KB</span> <span class="err">→</span> <span class="mi">36</span><span class="n">KB</span> <span class="p">(</span><span class="mf">4.0</span><span class="err">×</span><span class="p">)</span> <span class="err">✓</span>
<span class="err">✅</span> <span class="n">Output</span> <span class="n">equivalence</span><span class="p">:</span> <span class="mf">0.43</span><span class="o">%</span> <span class="nb">max</span> <span class="n">difference</span> <span class="n">vs</span> <span class="n">FP32</span> <span class="err">✓</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">QuantizedLinear</span> <span class="err">✓</span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
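<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">quantization_dev</span><span class="w"> </span><span class="kn">import</span> <span class="n">quantize_int8</span><span class="p">,</span> <span class="n">dequantize_int8</span><span class="p">,</span> <span class="n">QuantizedLinear</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.tensor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.nn</span><span class="w"> </span><span class="kn">import</span> <span class="n">Linear</span>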

<span class="c1"># Test quantization on random tensor</span>
<span class="n">tensor</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">100</span><span class="p">,</span> <span class="mi">100</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">))</span>
<span class="n">q_tensor</span><span class="p">,</span> <span class="n">scale</span><span class="p">,</span> <span class="n">zero_point</span> <span class="o">=</span> <span class="n">quantize_int8</span><span class="p">(</span><span class="n">tensor</span><span class="p">)</span>

<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Original range: [</span><span class="si">{</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">min</span><span class="p">()</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">max</span><span class="p">()</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">]&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Quantized range: [</span><span class="si">{</span><span class="n">q_tensor</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">min</span><span class="p">()</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="n">q_tensor</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">max</span><span class="p">()</span><span class="si">}</span><span class="s2">]&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Scale: </span><span class="si">{</span><span class="n">scale</span><span class="si">:</span><span class="s2">.6f</span><span class="si">}</span><span class="s2">, Zero-point: </span><span class="si">{</span><span class="n">zero_point</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>

<span class="c1"># Dequantize and measure error</span>
<span class="n">restored</span> <span class="o">=</span> <span class="n">dequantize_int8</span><span class="p">(</span><span class="n">q_tensor</span><span class="p">,</span> <span class="n">scale</span><span class="p">,</span> <span class="n">zero_point</span><span class="p">)</span>
<span class="n">error</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span> <span class="o">-</span> <span class="n">restored</span><span class="o">.</span><span class="n">data</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Roundtrip error: </span><span class="si">{</span><span class="n">error</span><span class="si">:</span><span class="s2">.4f</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">error</span><span class="o">/</span><span class="n">np</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span><span class="o">*</span><span class="mi">100</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">%)&quot;</span><span class="p">)</span>

<span class="c1"># Quantize a Linear layer</span>
<span class="n">linear</span> <span class="o">=</span> <span class="n">Linear</span><span class="p">(</span><span class="mi">128</span><span class="p">,</span> <span class="mi">64</span><span class="p">)</span>
<span class="n">q_linear</span> <span class="o">=</span> <span class="n">QuantizedLinear</span><span class="p">(</span><span class="n">linear</span><span class="p">)</span>

<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="se">\n</span><span class="s2">Original weights: </span><span class="si">{</span><span class="n">linear</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">nbytes</span><span class="si">}</span><span class="s2"> bytes&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Quantized weights: </span><span class="si">{</span><span class="n">q_linear</span><span class="o">.</span><span class="n">weights_int8</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">nbytes</span><span class="si">}</span><span class="s2"> bytes&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Reduction: </span><span class="si">{</span><span class="n">linear</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">nbytes</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="n">q_linear</span><span class="o">.</span><span class="n">weights_int8</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">nbytes</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2">×&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-applications">
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Mobile ML Deployment</strong>: TensorFlow Lite can convert models to INT8 for Android/iOS. Without quantization, models exceed app size limits (100-200MB) and drain battery 4× faster. Google Photos, Translate, Keyboard all run quantized models on-device.</p></li>
<li><p><strong>Edge AI Devices</strong>: Google Edge TPU (Coral), NVIDIA Jetson, Intel Neural Compute Stick require INT8 models. Hardware is designed exclusively for quantized operations - FP32 isn’t supported or is 10× slower.</p></li>
<li><p><strong>Cloud Inference Optimization</strong>: AWS Inferentia, Microsoft Azure Maia, Google Cloud TPU serve quantized models. INT8 reduces memory bandwidth (bottleneck for inference) and increases throughput by 2-4×. At scale (millions of requests/day), this saves millions in infrastructure costs.</p></li>
<li><p><strong>Large Language Models</strong>: LLaMA-65B is 130GB in FP16, which doesn’t fit on a single 80GB A100 GPU. INT8 quantization → 65GB, which fits and enables serving. GPTQ pushes to 4-bit (33GB) with &lt; 1% perplexity increase. Quantization is how enthusiasts run 70B models on consumer GPUs.</p></li>
</ul>
</section>
<section id="quantization-mathematics">
<h3>Quantization Mathematics<a class="headerlink" href="#quantization-mathematics" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Why INT8 vs INT4 or INT16?</strong> INT8 is the sweet spot: 4× memory reduction with &lt; 1% accuracy loss. INT4 gives 8× reduction but 2-5% accuracy loss (harder to deploy). INT16 gives only 2× reduction (not worth the complexity). Hardware acceleration (VNNI, NEON, Tensor Cores) standardized on INT8.</p></li>
<li><p><strong>Symmetric vs Asymmetric Trade-offs</strong>: Symmetric is simpler (no zero-point) but wastes range for skewed data. ReLU activations are [0, max] - symmetric centers around 0, wasting negative range. Asymmetric uses full INT8 range but costs extra zero-point storage and computation.</p></li>
<li><p><strong>Calibration Data Requirements</strong>: Theory: more data → better statistics. Practice: diminishing returns after 500-1000 samples. Percentile estimates stabilize quickly. Critical requirement: calibration data MUST match deployment distribution. If calibration is ImageNet but deployment is medical images, quantization fails catastrophically.</p></li>
<li><p><strong>Per-Channel Justification</strong>: Conv2D with 64 output channels: per-channel stores 64 scales + 64 zero-points = 512 bytes. Total weights: 3×3×64×64 FP32 = 147KB. Overhead: 0.35%. Accuracy improvement: 0.5-1.5%. Clear win - explains why production frameworks default to per-channel.</p></li>
</ul>
</section>
<section id="production-deployment-characteristics">
<h3>Production Deployment Characteristics<a class="headerlink" href="#production-deployment-characteristics" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Speed Reality Check</strong>: INT8 matmul is theoretically 4× faster (4× less memory bandwidth). Practice: 2-3× on CPU (quantize/dequantize overhead), 4-10× on specialized hardware (Edge TPU, Neural Engine designed for pure INT8 graphs). Your Python implementation shows no speedup and may even run slower (simulation overhead &gt; bandwidth savings).</p></li>
<li><p><strong>When Quantization is Mandatory</strong>: Mobile deployment (app size limits, battery constraints, Neural Engine acceleration), Edge devices (limited memory/compute), Cloud serving at scale (cost optimization). Not negotiable - models either quantize or don’t ship.</p></li>
<li><p><strong>When to Avoid Quantization</strong>: Accuracy-critical applications where 1% matters (medical diagnosis, autonomous vehicles), Early research iteration (quantization adds complexity), Models already tiny (&lt; 10MB - quantization overhead not worth it), Cloud serving with abundant resources (FP32 throughput sufficient).</p></li>
<li><p><strong>Quantization-Aware Training vs Post-Training</strong>: PTQ (Post-Training Quantization) is fast (minutes) but loses 1-2% accuracy. QAT (Quantization-Aware Training) requires retraining (days/weeks) but loses &lt; 0.5%. Choose PTQ for rapid iteration, QAT for production deployment. If using pretrained models you don’t own (BERT, ResNet), PTQ is the only option.</p></li>
</ul>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>You’re about to implement the precision reduction mathematics that make mobile ML deployment possible. Quantization is the difference between a model that exists in research and a model that ships in apps used by billions.</p>
<p>This module teaches honest quantization: you’ll implement the math correctly, achieve 4× memory reduction, and understand precisely why your Python code isn’t faster (hardware acceleration requires specialized silicon + compiled kernels). This clarity prepares you for production deployment where TensorFlow Lite, PyTorch Mobile, and ONNX Runtime apply your quantization mathematics with real INT8 hardware operations.</p>
<p>Understanding quantization from first principles - implementing the scale/zero-point calculations yourself, calibrating with real data, measuring accuracy-efficiency trade-offs - gives you deep insight into the constraints that define production ML systems.</p>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/15_quantization/quantization_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/15_quantization/quantization_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/15_quantization/quantization_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/15_quantization/quantization_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
View Source</div>
<p class="sd-card-text">Browse the Python source code and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/15_quantization/quantization_dev.py"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/15_quantization/quantization_dev.py</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">Save Your Progress</p>
<p>Binder sessions are temporary. Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="14_profiling_ABOUT.html" title="previous page">← Module 14: Profiling</a>
<a class="right-next" href="16_compression_ABOUT.html" title="next page">Module 16: Compression →</a>
</div>
</section>
</section>

    <script type="text/x-thebe-config">
    {
        requestKernel: true,
        binderOptions: {
            repo: "binder-examples/jupyter-stacks-datascience",
            ref: "master",
        },
        codeMirrorConfig: {
            theme: "abcdef",
            mode: "python"
        },
        kernelOptions: {
            name: "python3",
            path: "./modules"
        },
        predefinedOutput: true
    }
    </script>
    <script>kernelName = 'python3'</script>

                </article>


                <footer class="prev-next-footer d-print-none">

<div class="prev-next-area">
    <a class="left-prev"
       href="14_profiling_ABOUT.html"
       title="previous page">
      <i class="fa-solid fa-angle-left"></i>
      <div class="prev-next-info">
        <p class="prev-next-subtitle">previous</p>
        <p class="prev-next-title">14. Profiling - Performance Measurement for ML Systems</p>
      </div>
    </a>
    <a class="right-next"
       href="16_compression_ABOUT.html"
       title="next page">
      <div class="prev-next-info">
        <p class="prev-next-subtitle">next</p>
        <p class="prev-next-title">16. Compression - Pruning and Model Compression</p>
      </div>
      <i class="fa-solid fa-angle-right"></i>
    </a>
</div>
                </footer>

            </div>


                <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">


  <div class="sidebar-secondary-item">
  <div class="page-toc tocsection onthispage">
    <i class="fa-solid fa-list"></i> Contents
  </div>
  <nav class="bd-toc-nav page-toc">
    <ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantization-flow-fp32-int8">Quantization Flow: FP32 → INT8</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#what-youre-actually-building-educational-quantization">What You’re Actually Building (Educational Quantization)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-quantization-mathematics">Core Quantization Mathematics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#calibration-the-critical-step">Calibration - The Critical Step</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#per-tensor-vs-per-channel-quantization">Per-Tensor vs Per-Channel Quantization</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantizedlinear-quantized-neural-network-layer">QuantizedLinear - Quantized Neural Network Layer</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#model-level-quantization">Model-Level Quantization</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-quantization-analysis">Inline Testing &amp; Quantization Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantization-mathematics">Quantization Mathematics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-deployment-characteristics">Production Deployment Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
  </nav></div>

</div></div>


          </div>
          <footer class="bd-footer-content">

<div class="bd-footer-content__inner container">

  <div class="footer-item">

<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>

  </div>

  <div class="footer-item">


  <p class="copyright">

      © Copyright 2025.
      <br/>

  </p>

  </div>

  <div class="footer-item">

  </div>

  <div class="footer-item">

  </div>

</div>
          </footer>


      </main>
    </div>
  </div>

  <!-- Scripts loaded after <body> so the DOM is not blocked -->
  <script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>

  <footer class="bd-footer">
  </footer>
  </body>
</html>