Files
TinyTorch/dev/modules/15_quantization_ABOUT.html
2025-11-25 18:08:28 +00:00

1481 lines
87 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>15. Quantization - Reduced Precision for Efficiency &#8212; Tiny🔥Torch</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
<link rel="stylesheet" type="text/css" href="../_static/custom.css?v=afcf7c3c" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=f281be69"></script>
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="../_static/design-tabs.js?v=f930bc37"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script type="module" src="https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs"></script>
<script type="module" src="https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.1.4/dist/mermaid-layout-elk.esm.min.mjs"></script>
<script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";import elkLayouts from "https://cdn.jsdelivr.net/npm/@mermaid-js/layout-elk@0.1.4/dist/mermaid-layout-elk.esm.min.mjs";mermaid.registerLayoutLoaders(elkLayouts);mermaid.initialize({startOnLoad:false});</script>
<script src="https://cdn.jsdelivr.net/npm/d3@7.9.0/dist/d3.min.js"></script>
<script type="module">import mermaid from "https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs";
// Inject baseline sizing rules for `pre.mermaid` blocks and the SVG rendered
// inside them. The rule bodies mirror the `.mermaid-container > pre` rules that
// the fullscreen stylesheet defines for wrapped diagrams.
const defaultStyle = document.createElement('style');
defaultStyle.textContent = `pre.mermaid {
/* Same as .mermaid-container > pre */
display: block;
width: 100%;
}
pre.mermaid > svg {
/* Same as .mermaid-container > pre > svg */
height: 500px;
width: 100%;
max-width: 100% !important;
}
`;
document.head.appendChild(defaultStyle);
// Inject the stylesheet used by the fullscreen diagram viewer. It covers:
//   - .mermaid-container            flex wrapper placed around each diagram
//   - .mermaid-fullscreen-btn       per-diagram "open fullscreen" button
//   - .mermaid-fullscreen-modal     fixed overlay, shown when it has `.active`
//   - .mermaid-container-fullscreen content box inside the overlay
//   - .mermaid-fullscreen-close     fixed close button inside the overlay
// Each interactive piece also has a `.dark-theme` variant that the script
// toggles at runtime.
// NOTE(review): .mermaid-fullscreen-btn has no `position` rule in this sheet,
// yet the script later sets inline `top`/`right` on it; those offsets only take
// effect if another stylesheet positions the button — confirm.
const fullscreenStyle = document.createElement('style');
fullscreenStyle.textContent = `.mermaid-container {
display: flex;
flex-direction: row;
width: 100%;
}
.mermaid-container > pre {
display: block;
width: 100%;
}
.mermaid-container > pre > svg {
height: 500px;
width: 100%;
max-width: 100% !important;
}
.mermaid-fullscreen-btn {
width: 28px;
height: 28px;
background: rgba(255, 255, 255, 0.95);
border: 1px solid rgba(0, 0, 0, 0.3);
border-radius: 4px;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
transition: all 0.2s;
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.2);
font-size: 14px;
line-height: 1;
padding: 0;
color: #333;
}
.mermaid-fullscreen-btn:hover {
opacity: 100% !important;
background: rgba(255, 255, 255, 1);
box-shadow: 0 3px 10px rgba(0, 0, 0, 0.3);
transform: scale(1.1);
}
.mermaid-fullscreen-btn.dark-theme {
background: rgba(50, 50, 50, 0.95);
border: 1px solid rgba(255, 255, 255, 0.3);
color: #e0e0e0;
}
.mermaid-fullscreen-btn.dark-theme:hover {
background: rgba(60, 60, 60, 1);
box-shadow: 0 3px 10px rgba(255, 255, 255, 0.2);
}
.mermaid-fullscreen-modal {
display: none;
position: fixed !important;
top: 0 !important;
left: 0 !important;
width: 95vw;
height: 100vh;
background: rgba(255, 255, 255, 0.98);
z-index: 9999;
padding: 20px;
overflow: auto;
}
.mermaid-fullscreen-modal.dark-theme {
background: rgba(0, 0, 0, 0.98);
}
.mermaid-fullscreen-modal.active {
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen {
position: relative;
width: 95vw;
height: 90vh;
max-width: 95vw;
max-height: 90vh;
background: white;
border-radius: 8px;
padding: 20px;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
overflow: auto;
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen.dark-theme {
background: #1a1a1a;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.8);
}
.mermaid-container-fullscreen pre.mermaid {
width: 100%;
height: 100%;
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-container-fullscreen .mermaid svg {
height: 100% !important;
width: 100% !important;
cursor: grab;
}
.mermaid-fullscreen-close {
position: fixed !important;
top: 20px !important;
right: 20px !important;
width: 40px;
height: 40px;
background: rgba(255, 255, 255, 0.95);
border: 1px solid rgba(0, 0, 0, 0.2);
border-radius: 50%;
cursor: pointer;
z-index: 10000;
display: flex;
align-items: center;
justify-content: center;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
transition: all 0.2s;
font-size: 24px;
line-height: 1;
color: #333;
}
.mermaid-fullscreen-close:hover {
background: white;
box-shadow: 0 6px 16px rgba(0, 0, 0, 0.4);
transform: scale(1.1);
}
.mermaid-fullscreen-close.dark-theme {
background: rgba(50, 50, 50, 0.95);
border: 1px solid rgba(255, 255, 255, 0.2);
color: #e0e0e0;
}
.mermaid-fullscreen-close.dark-theme:hover {
background: rgba(60, 60, 60, 1);
box-shadow: 0 6px 16px rgba(255, 255, 255, 0.2);
}
.mermaid-fullscreen-modal .mermaid-fullscreen-btn {
display: none !important;
}`;
document.head.appendChild(fullscreenStyle);
// Report whether the page is currently drawn on a dark background.
// Reads the computed background colour of <body>; when it has the
// `rgb(r, g, b` form, the weighted brightness (299*R + 587*G + 114*B) / 1000
// is compared against 128. Any colour string the pattern does not match
// (for example `rgba(...)` or a keyword) is treated as "not dark".
const isDarkTheme = () => {
    const bodyBackground = window.getComputedStyle(document.body).backgroundColor;
    const channels = bodyBackground.match(/rgb\((\d+),\s*(\d+),\s*(\d+)/);
    if (!channels) {
        return false;
    }
    const [red, green, blue] = [channels[1], channels[2], channels[3]].map((text) => parseInt(text));
    const weighted = red * 299 + green * 587 + blue * 114;
    return weighted / 1000 < 128;
};
// Render all Mermaid diagrams and attach the fullscreen viewer to each one.
//
// Steps:
//   1. Run Mermaid, then compare the number of `.mermaid` elements with the
//      number carrying data-processed='true'. While they differ, re-schedule
//      this function after 200 ms and stop.
//   2. Create one modal overlay (role="dialog") that can be closed with its
//      close button, a click on the backdrop, or the Escape key.
//   3. Wrap every diagram in a `.mermaid-container` and append a button that
//      clones the diagram into the modal.
//   4. Keep the `dark-theme` class on buttons and modal in sync with the page
//      by observing attribute changes on <html> and <body>.
//
// The `"False" === "True"` / `"True" !== "True"` comparisons are constant
// expressions; they look like build-time rendered feature flags (d3 zoom off,
// fullscreen on) — confirm against the generator configuration.
const load = async () => {
    await mermaid.run();
    const all_mermaids = document.querySelectorAll(".mermaid");
    const mermaids_processed = document.querySelectorAll(".mermaid[data-processed='true']");
    if ("False" === "True") {
        // Zoom support: never executed on this page because the condition is constant-false.
        const mermaids_to_add_zoom = -1 === -1 ? all_mermaids.length : -1;
        if(mermaids_to_add_zoom > 0) {
            // NOTE(review): the selector is an empty string here; it would need a
            // real selector if this branch were ever enabled.
            var svgs = d3.selectAll("");
            if(all_mermaids.length !== mermaids_processed.length) {
                setTimeout(load, 200);
                return;
            } else if(svgs.size() !== mermaids_to_add_zoom) {
                setTimeout(load, 200);
                return;
            } else {
                svgs.each(function() {
                    var svg = d3.select(this);
                    svg.html("<g class='wrapper'>" + svg.html() + "</g>");
                    var inner = svg.select("g");
                    var zoom = d3.zoom().on("zoom", function(event) {
                        inner.attr("transform", event.transform);
                    });
                    svg.call(zoom);
                });
            }
        }
    } else if(all_mermaids.length !== mermaids_processed.length) {
        // Wait for mermaid to process all diagrams
        setTimeout(load, 200);
        return;
    }

    const darkTheme = isDarkTheme();

    // Stop here if not adding fullscreen capability
    if ("True" !== "True") return;

    // --- Modal overlay -------------------------------------------------------
    const modal = document.createElement('div');
    modal.className = 'mermaid-fullscreen-modal' + (darkTheme ? ' dark-theme' : '');
    modal.setAttribute('role', 'dialog');
    modal.setAttribute('aria-modal', 'true');
    modal.setAttribute('aria-label', 'Fullscreen diagram viewer');
    // The template literal content is kept flush-left so the injected markup
    // string is unchanged.
    modal.innerHTML = `
<button class="mermaid-fullscreen-close${darkTheme ? ' dark-theme' : ''}" aria-label="Close fullscreen">✕</button>
<div class="mermaid-container-fullscreen${darkTheme ? ' dark-theme' : ''}"></div>
`;
    document.body.appendChild(modal);

    const modalContent = modal.querySelector('.mermaid-container-fullscreen');
    const closeBtn = modal.querySelector('.mermaid-fullscreen-close');

    // [scrollX, scrollY] captured when the modal opens, restored when it closes.
    let previousScrollOffset = [window.scrollX, window.scrollY];

    const closeModal = () => {
        modal.classList.remove('active');
        modalContent.innerHTML = '';
        document.body.style.overflow = '';
        window.scrollTo({left: previousScrollOffset[0], top: previousScrollOffset[1], behavior: 'instant'});
    };

    closeBtn.addEventListener('click', closeModal);
    modal.addEventListener('click', (e) => {
        // Only a click on the backdrop itself closes the modal, not clicks inside it.
        if (e.target === modal) closeModal();
    });
    document.addEventListener('keydown', (e) => {
        if (e.key === 'Escape' && modal.classList.contains('active')) {
            closeModal();
        }
    });

    // --- Per-diagram fullscreen buttons --------------------------------------
    const allButtons = [];
    document.querySelectorAll('.mermaid').forEach((mermaidDiv) => {
        // Skip diagrams that are already wrapped or that live inside the modal.
        if (mermaidDiv.parentNode.classList.contains('mermaid-container') ||
            mermaidDiv.closest('.mermaid-fullscreen-modal')) {
            return;
        }

        const container = document.createElement('div');
        container.className = 'mermaid-container';
        mermaidDiv.parentNode.insertBefore(container, mermaidDiv);
        container.appendChild(mermaidDiv);

        const fullscreenBtn = document.createElement('button');
        fullscreenBtn.className = 'mermaid-fullscreen-btn' + (darkTheme ? ' dark-theme' : '');
        fullscreenBtn.setAttribute('aria-label', 'View diagram in fullscreen');
        fullscreenBtn.textContent = '⛶';
        fullscreenBtn.style.opacity = '50%';

        // Calculate dynamic position based on diagram's margin and padding
        const diagramStyle = window.getComputedStyle(mermaidDiv);
        const marginTop = parseFloat(diagramStyle.marginTop) || 0;
        const marginRight = parseFloat(diagramStyle.marginRight) || 0;
        const paddingTop = parseFloat(diagramStyle.paddingTop) || 0;
        const paddingRight = parseFloat(diagramStyle.paddingRight) || 0;
        fullscreenBtn.style.top = `${marginTop + paddingTop + 4}px`;
        fullscreenBtn.style.right = `${marginRight + paddingRight + 4}px`;

        fullscreenBtn.addEventListener('click', () => {
            // Fix: record the numeric horizontal offset (window.scrollX). The
            // previous code stored the window.scroll method here, so closeModal
            // could not restore the horizontal position.
            previousScrollOffset = [window.scrollX, window.scrollY];

            const clone = mermaidDiv.cloneNode(true);
            modalContent.innerHTML = '';
            modalContent.appendChild(clone);

            const svg = clone.querySelector('svg');
            if (svg) {
                // Drop fixed dimensions so the clone scales with the modal.
                svg.removeAttribute('width');
                svg.removeAttribute('height');
                svg.style.width = '100%';
                svg.style.height = 'auto';
                svg.style.maxWidth = '100%';
                // Fix: the property is `display`; `sdisplay` was a typo that
                // left the style unset.
                svg.style.display = 'block';
                if ("False" === "True") {
                    // Zoom inside the modal: constant-false, never executed on this page.
                    setTimeout(() => {
                        const g = svg.querySelector('g');
                        if (g) {
                            var svgD3 = d3.select(svg);
                            svgD3.html("<g class='wrapper'>" + svgD3.html() + "</g>");
                            var inner = svgD3.select("g");
                            var zoom = d3.zoom().on("zoom", function(event) {
                                inner.attr("transform", event.transform);
                            });
                            svgD3.call(zoom);
                        }
                    }, 100);
                }
            }

            modal.classList.add('active');
            // Prevent the page behind the modal from scrolling while it is open.
            document.body.style.overflow = 'hidden';
        });

        container.appendChild(fullscreenBtn);
        allButtons.push(fullscreenBtn);
    });

    // Update theme classes when theme changes
    const updateTheme = () => {
        const dark = isDarkTheme();
        allButtons.forEach(btn => {
            if (dark) {
                btn.classList.add('dark-theme');
            } else {
                btn.classList.remove('dark-theme');
            }
        });
        if (dark) {
            modal.classList.add('dark-theme');
            modalContent.classList.add('dark-theme');
            closeBtn.classList.add('dark-theme');
        } else {
            modal.classList.remove('dark-theme');
            modalContent.classList.remove('dark-theme');
            closeBtn.classList.remove('dark-theme');
        }
    };

    // Watch for theme changes
    const observer = new MutationObserver(updateTheme);
    observer.observe(document.documentElement, {
        attributes: true,
        attributeFilter: ['class', 'style', 'data-theme']
    });
    observer.observe(document.body, {
        attributes: true,
        attributeFilter: ['class', 'style']
    });
};
// Start diagram rendering and fullscreen setup when the window's load event fires.
window.addEventListener("load", load);
</script>
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/15_quantization_ABOUT';</script>
<script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
<script src="../_static/wip-banner.js?v=5357532b"></script>
<script src="../_static/marimo-badges.js?v=1e5d2842"></script>
<script src="../_static/sidebar-link.js?v=404b701b"></script>
<script src="../_static/hero-carousel.js?v=10341d2a"></script>
<link rel="icon" href="../_static/favicon.svg"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="16. Compression - Pruning and Model Compression" href="16_compression_ABOUT.html" />
<link rel="prev" title="14. Profiling - Performance Measurement for ML Systems" href="14_profiling_ABOUT.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search this book..."
aria-label="Search this book..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../intro.html">
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<ul class="nav bd-sidenav bd-sidenav__home-link">
<li class="toctree-l1">
<a class="reference internal" href="../intro.html">
Getting Started
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../quickstart-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../student-workflow.html">Student Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../usage-paths/classroom-use.html">For Instructors</a></li>
<li class="toctree-l1"><a class="reference internal" href="../instructor-guide.html">Instructor Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../usage-paths/ta-guide.html">TA Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../usage-paths/team-onboarding.html">Team Onboarding</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-source-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Source repositories">
<i class="fab fa-github"></i>
</button>
<ul class="dropdown-menu">
<li><a href="https://github.com/mlsysbook/TinyTorch" target="_blank"
class="btn btn-sm btn-source-repository-button dropdown-item"
title="Source repository"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fab fa-github"></i>
</span>
<span class="btn__text-container">Repository</span>
</a>
</li>
<li><a href="https://github.com/mlsysbook/TinyTorch/edit/main/site/modules/15_quantization_ABOUT.md" target="_blank"
class="btn btn-sm btn-source-edit-button dropdown-item"
title="Suggest edit"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-pencil-alt"></i>
</span>
<span class="btn__text-container">Suggest edit</span>
</a>
</li>
<li><a href="https://github.com/mlsysbook/TinyTorch/issues/new?title=Issue%20on%20page%20%2Fmodules/15_quantization_ABOUT.html&body=Your%20issue%20content%20here." target="_blank"
class="btn btn-sm btn-source-issues-button dropdown-item"
title="Open an issue"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-lightbulb"></i>
</span>
<span class="btn__text-container">Open issue</span>
</a>
</li>
</ul>
</div>
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../_sources/modules/15_quantization_ABOUT.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>15. Quantization - Reduced Precision for Efficiency</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantization-flow-fp32-int8">Quantization Flow: FP32 → INT8</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#what-youre-actually-building-educational-quantization">What You're Actually Building (Educational Quantization)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-quantization-mathematics">Core Quantization Mathematics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#calibration-the-critical-step">Calibration - The Critical Step</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#per-tensor-vs-per-channel-quantization">Per-Tensor vs Per-Channel Quantization</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantizedlinear-quantized-neural-network-layer">QuantizedLinear - Quantized Neural Network Layer</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#model-level-quantization">Model-Level Quantization</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-quantization-analysis">Inline Testing &amp; Quantization Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantization-mathematics">Quantization Mathematics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-deployment-characteristics">Production Deployment Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="quantization-reduced-precision-for-efficiency">
<h1>15. Quantization - Reduced Precision for Efficiency<a class="headerlink" href="#quantization-reduced-precision-for-efficiency" title="Link to this heading">#</a></h1>
<p><strong>OPTIMIZATION TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 5-6 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>This module implements quantization fundamentals: converting FP32 tensors to INT8 representation to reduce memory by 4×. You’ll build the mathematics of scale/zero-point quantization, implement quantized linear layers, and measure accuracy-efficiency trade-offs. CRITICAL HONESTY: You’re implementing quantization math in Python, NOT actual hardware INT8 operations. This teaches the principles that enable TensorFlow Lite/PyTorch Mobile deployment, but real speedups require specialized hardware (Edge TPU, Neural Engine) or compiled frameworks with INT8 kernels. Your implementation will be 4× more memory-efficient but not faster - understanding WHY teaches you what production quantization frameworks must optimize.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Quantization Mathematics</strong>: Implement symmetric and asymmetric INT8 quantization with scale/zero-point parameter calculation</p></li>
<li><p><strong>Calibration Strategies</strong>: Design percentile-based calibration to minimize accuracy loss when selecting quantization parameters</p></li>
<li><p><strong>Memory-Accuracy Trade-offs</strong>: Measure when 4× memory reduction justifies 0.5-2% accuracy degradation for deployment</p></li>
<li><p><strong>Production Reality</strong>: Distinguish between educational quantization (Python simulation) vs production INT8 (hardware acceleration, kernel fusion)</p></li>
<li><p><strong>When to Quantize</strong>: Recognize deployment scenarios where quantization is mandatory (mobile/edge) vs optional (cloud serving)</p></li>
</ul>
</section>
<section id="build-use-optimize">
<h2>Build → Use → Optimize<a class="headerlink" href="#build-use-optimize" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorch’s <strong>Build → Use → Optimize</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement INT8 quantization/dequantization, calibration logic, QuantizedLinear layers</p></li>
<li><p><strong>Use</strong>: Quantize trained models, measure accuracy degradation vs memory savings on MNIST/CIFAR</p></li>
<li><p><strong>Optimize</strong>: Analyze the accuracy-efficiency frontier - when does quantization enable deployment vs hurt accuracy unacceptably?</p></li>
</ol>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="quantization-flow-fp32-int8">
<h3>Quantization Flow: FP32 → INT8<a class="headerlink" href="#quantization-flow-fp32-int8" title="Link to this heading">#</a></h3>
<p>Quantization compresses weights by reducing precision, trading accuracy for memory efficiency:</p>
<pre class="mermaid">
graph LR
A[FP32 Weight&lt;br/&gt;4 bytes&lt;br/&gt;-3.14159] --&gt; B[Quantize&lt;br/&gt;scale + zero_point]
B --&gt; C[INT8 Weight&lt;br/&gt;1 byte&lt;br/&gt;-126]
C --&gt; D[Dequantize&lt;br/&gt;Inference]
D --&gt; E[FP32 Compute&lt;br/&gt;Result]
style A fill:#e3f2fd
style B fill:#fff3e0
style C fill:#f3e5f5
style D fill:#ffe0b2
style E fill:#f0fdf4
</pre><p><strong>Flow</strong>: Original FP32 → Calibrate scale → Store as INT8 (4× smaller) → Dequantize for computation → FP32 result</p>
</section>
<section id="what-youre-actually-building-educational-quantization">
<h3>What You’re Actually Building (Educational Quantization)<a class="headerlink" href="#what-youre-actually-building-educational-quantization" title="Link to this heading">#</a></h3>
<p><strong>Your Implementation:</strong></p>
<ul class="simple">
<li><p>Quantization math: FP32 → INT8 conversion with scale/zero-point</p></li>
<li><p>QuantizedLinear: Store weights as INT8, compute in simulated quantized arithmetic</p></li>
<li><p>Calibration: Find optimal scale parameters from representative data</p></li>
<li><p>Memory measurement: Verify 4× reduction (32 bits → 8 bits)</p></li>
</ul>
<p><strong>What You’re NOT Building:</strong></p>
<ul class="simple">
<li><p>Actual INT8 hardware operations (requires CPU VNNI, ARM NEON, GPU Tensor Cores)</p></li>
<li><p>Kernel fusion (eliminating quantize/dequantize overhead)</p></li>
<li><p>Mixed-precision execution graphs (FP32 for sensitive ops, INT8 for matmul)</p></li>
<li><p>Production deployment pipelines (TensorFlow Lite converter, ONNX Runtime optimization)</p></li>
</ul>
<p><strong>Why This Matters:</strong> Understanding quantization math is essential. But knowing that production speedups require hardware acceleration + compiler optimization prevents unrealistic expectations. Your 4× memory reduction is real; your lack of speedup teaches why TensorFlow Lite needs custom kernels.</p>
</section>
<section id="core-quantization-mathematics">
<h3>Core Quantization Mathematics<a class="headerlink" href="#core-quantization-mathematics" title="Link to this heading">#</a></h3>
<p><strong>Symmetric Quantization (Zero-Point = 0)</strong></p>
<p>Assumes data is centered around zero (common after BatchNorm):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Quantization: FP32 → INT8</span>
<span class="n">scale</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="nb">abs</span><span class="p">(</span><span class="n">tensor</span><span class="p">))</span> <span class="o">/</span> <span class="mf">127.0</span> <span class="c1"># Scale factor</span>
<span class="n">quantized</span> <span class="o">=</span> <span class="nb">round</span><span class="p">(</span><span class="n">tensor</span> <span class="o">/</span> <span class="n">scale</span><span class="p">)</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="o">-</span><span class="mi">128</span><span class="p">,</span> <span class="mi">127</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">int8</span><span class="p">)</span>
<span class="c1"># Dequantization: INT8 → FP32</span>
<span class="n">dequantized</span> <span class="o">=</span> <span class="n">quantized</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">float32</span><span class="p">)</span> <span class="o">*</span> <span class="n">scale</span>
</pre></div>
</div>
<ul class="simple">
<li><p><strong>Range</strong>: INT8 is [-128, 127] (256 values)</p></li>
<li><p><strong>Scale</strong>: Maps the largest absolute FP32 value to 127</p></li>
<li><p><strong>Zero-point</strong>: Always 0 (symmetric around origin)</p></li>
<li><p><strong>Use case</strong>: Weights after normalization, activations after BatchNorm</p></li>
</ul>
<p><strong>Asymmetric Quantization (With Zero-Point)</strong></p>
<p>Handles arbitrary data ranges (e.g., activations after ReLU: [0, max]):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Quantization: FP32 → INT8</span>
<span class="n">min_val</span><span class="p">,</span> <span class="n">max_val</span> <span class="o">=</span> <span class="n">tensor</span><span class="o">.</span><span class="n">min</span><span class="p">(),</span> <span class="n">tensor</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
<span class="n">scale</span> <span class="o">=</span> <span class="p">(</span><span class="n">max_val</span> <span class="o">-</span> <span class="n">min_val</span><span class="p">)</span> <span class="o">/</span> <span class="mf">255.0</span>
<span class="n">zero_point</span> <span class="o">=</span> <span class="nb">round</span><span class="p">(</span><span class="o">-</span><span class="mi">128</span> <span class="o">-</span> <span class="n">min_val</span> <span class="o">/</span> <span class="n">scale</span><span class="p">)</span>
<span class="n">quantized</span> <span class="o">=</span> <span class="nb">round</span><span class="p">(</span><span class="n">tensor</span> <span class="o">/</span> <span class="n">scale</span> <span class="o">+</span> <span class="n">zero_point</span><span class="p">)</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="o">-</span><span class="mi">128</span><span class="p">,</span> <span class="mi">127</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">int8</span><span class="p">)</span>
<span class="c1"># Dequantization: INT8 → FP32</span>
<span class="n">dequantized</span> <span class="o">=</span> <span class="p">(</span><span class="n">quantized</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">float32</span><span class="p">)</span> <span class="o">-</span> <span class="n">zero_point</span><span class="p">)</span> <span class="o">*</span> <span class="n">scale</span>
</pre></div>
</div>
<ul class="simple">
<li><p><strong>Range</strong>: Uses full [-128, 127] even if data is [0, 5]</p></li>
<li><p><strong>Scale</strong>: Maps data range to INT8 range</p></li>
<li><p><strong>Zero-point</strong>: Offset ensuring FP32 zero maps to specific INT8 value</p></li>
<li><p><strong>Use case</strong>: ReLU activations, input images, any non-centered data</p></li>
</ul>
<p><strong>Trade-off:</strong> Symmetric is simpler (no zero-point storage/computation), asymmetric uses range more efficiently (better for skewed distributions).</p>
</section>
<section id="calibration-the-critical-step">
<h3>Calibration - The Critical Step<a class="headerlink" href="#calibration-the-critical-step" title="Link to this heading">#</a></h3>
<p>Quantization quality depends entirely on scale/zero-point selection. Poor choices destroy accuracy.</p>
<p><strong>Naive Approach (Don’t Do This):</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Use global min/max from training data</span>
<span class="n">scale</span> <span class="o">=</span> <span class="p">(</span><span class="n">tensor_max</span> <span class="o">-</span> <span class="n">tensor_min</span><span class="p">)</span> <span class="o">/</span> <span class="mi">255</span>
<span class="c1"># Problem: Single outlier wastes most INT8 range</span>
<span class="c1"># Example: data in [0, 5] but one outlier at 100 → scale = 100/255</span>
<span class="c1"># Result: 95% of data maps to only 13 INT8 values (5/100 * 255 = 13)</span>
</pre></div>
</div>
<p><strong>Calibration Approach (Correct):</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Use percentile-based clipping</span>
<span class="n">max_val</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">percentile</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">calibration_data</span><span class="p">),</span> <span class="mf">99.9</span><span class="p">)</span>
<span class="n">scale</span> <span class="o">=</span> <span class="n">max_val</span> <span class="o">/</span> <span class="mi">127</span>
<span class="c1"># Clips 0.1% outliers, uses INT8 range efficiently</span>
<span class="c1"># 99.9th percentile ignores rare outliers, preserves typical range</span>
</pre></div>
</div>
<p><strong>Calibration Process:</strong></p>
<ol class="arabic simple">
<li><p>Collect 100-1000 samples of representative data (validation set)</p></li>
<li><p>For each layer, record activation statistics during forward passes</p></li>
<li><p>Compute percentile-based min/max (typically 99.9th percentile)</p></li>
<li><p>Calculate scale/zero-point from clipped statistics</p></li>
<li><p>Quantize weights/activations using calibrated parameters</p></li>
</ol>
<p><strong>Why It Works:</strong> Most activations follow normal-ish distributions. Outliers are rare but dominate min/max. Clipping 0.1% of outliers uses INT8 range 10-100× more efficiently with negligible accuracy loss.</p>
</section>
<section id="per-tensor-vs-per-channel-quantization">
<h3>Per-Tensor vs Per-Channel Quantization<a class="headerlink" href="#per-tensor-vs-per-channel-quantization" title="Link to this heading">#</a></h3>
<p><strong>Per-Tensor Quantization:</strong></p>
<ul class="simple">
<li><p>One scale/zero-point for entire weight tensor</p></li>
<li><p>Simple: store 2 parameters per layer</p></li>
<li><p>Example: Conv2D with 64×3×3×3 weights uses 1 scale, 1 zero-point</p></li>
</ul>
<p><strong>Per-Channel Quantization:</strong></p>
<ul class="simple">
<li><p>Separate scale/zero-point per output channel</p></li>
<li><p>Better accuracy: each channel uses its natural range</p></li>
<li><p>Example: Conv2D with 64 output channels uses 64 scales, 64 zero-points</p></li>
<li><p>Overhead: 128 extra parameters (64 scales + 64 zero-points)</p></li>
</ul>
<p><strong>When to Use Per-Channel:</strong></p>
<ul class="simple">
<li><p>Weight magnitudes vary significantly across channels (common in Conv layers)</p></li>
<li><p>Accuracy improvement (0.5-1.5%) justifies 0.1-0.5% memory overhead</p></li>
<li><p>Production frameworks (PyTorch, TensorFlow Lite) default to per-channel for Conv/Linear</p></li>
</ul>
<p><strong>Trade-off Table:</strong></p>
<div class="pst-scrollable-table-container"><table class="table">
<thead>
<tr class="row-odd"><th class="head"><p>Quantization Scheme</p></th>
<th class="head"><p>Parameters</p></th>
<th class="head"><p>Accuracy</p></th>
<th class="head"><p>Complexity</p></th>
<th class="head"><p>Use Case</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Per-Tensor</p></td>
<td><p>2 per layer</p></td>
<td><p>Baseline</p></td>
<td><p>Simple</p></td>
<td><p>Fast prototyping, small models</p></td>
</tr>
<tr class="row-odd"><td><p>Per-Channel (Conv)</p></td>
<td><p>2N (N=channels)</p></td>
<td><p>+0.5-1.5%</p></td>
<td><p>Medium</p></td>
<td><p>Production Conv layers</p></td>
</tr>
<tr class="row-even"><td><p>Per-Channel (Linear)</p></td>
<td><p>2N (N=out_features)</p></td>
<td><p>+0.3-0.8%</p></td>
<td><p>Medium</p></td>
<td><p>Production Linear layers</p></td>
</tr>
<tr class="row-odd"><td><p>Mixed (Conv per-channel, Linear per-tensor)</p></td>
<td><p>Hybrid</p></td>
<td><p>+0.4-1.2%</p></td>
<td><p>Medium</p></td>
<td><p>Balanced approach</p></td>
</tr>
</tbody>
</table>
</div>
</section>
<section id="quantizedlinear-quantized-neural-network-layer">
<h3>QuantizedLinear - Quantized Neural Network Layer<a class="headerlink" href="#quantizedlinear-quantized-neural-network-layer" title="Link to this heading">#</a></h3>
<p>Replaces regular Linear layer with quantized equivalent:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">QuantizedLinear</span><span class="p">:</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">linear_layer</span><span class="p">:</span> <span class="n">Linear</span><span class="p">):</span>
<span class="c1"># Quantize weights at initialization</span>
<span class="bp">self</span><span class="o">.</span><span class="n">weights_int8</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight_scale</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight_zp</span> <span class="o">=</span> <span class="n">quantize_int8</span><span class="p">(</span><span class="n">linear_layer</span><span class="o">.</span><span class="n">weight</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">bias_int8</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias_scale</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias_zp</span> <span class="o">=</span> <span class="n">quantize_int8</span><span class="p">(</span><span class="n">linear_layer</span><span class="o">.</span><span class="n">bias</span><span class="p">)</span>
<span class="c1"># Store original FP32 for accuracy comparison</span>
<span class="bp">self</span><span class="o">.</span><span class="n">original_weight</span> <span class="o">=</span> <span class="n">linear_layer</span><span class="o">.</span><span class="n">weight</span>
<span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
<span class="c1"># EDUCATIONAL VERSION: Dequantize → compute in FP32 → quantize result</span>
<span class="c1"># (Simulates quantization math but doesn&#39;t speed up computation)</span>
<span class="n">weight_fp32</span> <span class="o">=</span> <span class="n">dequantize_int8</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">weights_int8</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight_scale</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight_zp</span><span class="p">)</span>
<span class="n">bias_fp32</span> <span class="o">=</span> <span class="n">dequantize_int8</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">bias_int8</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias_scale</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias_zp</span><span class="p">)</span>
<span class="c1"># Compute in FP32 (not actually faster - just lower precision storage)</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">x</span> <span class="o">@</span> <span class="n">weight_fp32</span><span class="o">.</span><span class="n">T</span> <span class="o">+</span> <span class="n">bias_fp32</span>
<span class="k">return</span> <span class="n">output</span>
</pre></div>
</div>
<p><strong>What Happens in Production (TensorFlow Lite, PyTorch Mobile):</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Production quantized matmul (conceptual - happens in C++/assembly)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">quantized_matmul_production</span><span class="p">(</span><span class="n">x_int8</span><span class="p">,</span> <span class="n">weight_int8</span><span class="p">,</span> <span class="n">x_scale</span><span class="p">,</span> <span class="n">weight_scale</span><span class="p">,</span> <span class="n">output_scale</span><span class="p">):</span>
<span class="c1"># 1. INT8 x INT8 matmul using VNNI/NEON/Tensor Cores (FAST)</span>
<span class="n">accum_int32</span> <span class="o">=</span> <span class="n">matmul_int8_hardware</span><span class="p">(</span><span class="n">x_int8</span><span class="p">,</span> <span class="n">weight_int8</span><span class="p">)</span> <span class="c1"># Specialized instruction</span>
<span class="c1"># 2. Requantize accumulated INT32 → INT8 output</span>
<span class="n">combined_scale</span> <span class="o">=</span> <span class="p">(</span><span class="n">x_scale</span> <span class="o">*</span> <span class="n">weight_scale</span><span class="p">)</span> <span class="o">/</span> <span class="n">output_scale</span>
<span class="n">output_int8</span> <span class="o">=</span> <span class="p">(</span><span class="n">accum_int32</span> <span class="o">*</span> <span class="n">combined_scale</span><span class="p">)</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="o">-</span><span class="mi">128</span><span class="p">,</span> <span class="mi">127</span><span class="p">)</span>
<span class="c1"># 3. Stay in INT8 for next layer (no dequantization unless necessary)</span>
<span class="k">return</span> <span class="n">output_int8</span>
</pre></div>
</div>
<p><strong>Key Differences:</strong></p>
<ul class="simple">
<li><p><strong>Your implementation</strong>: Dequantize → FP32 compute → FP32 output (educational, no speedup)</p></li>
<li><p><strong>Production</strong>: INT8 → INT8 throughout, specialized hardware (2-10× speedup)</p></li>
</ul>
<p><strong>Memory Savings (Real):</strong> 4× reduction from storing INT8 instead of FP32
<strong>Speed Improvement (Your Code):</strong> ~1× (no speedup; Python overhead dominates)
<strong>Speed Improvement (Production):</strong> 2-10× (hardware acceleration, kernel fusion)</p>
</section>
<section id="model-level-quantization">
<h3>Model-Level Quantization<a class="headerlink" href="#model-level-quantization" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">quantize_model</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">calibration_data</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Quantize all Linear layers in model.</span>
<span class="sd"> Args:</span>
<span class="sd"> model: Neural network with Linear layers</span>
<span class="sd"> calibration_data: Representative samples for activation calibration</span>
<span class="sd"> Returns:</span>
<span class="sd"> quantized_model: Model with QuantizedLinear layers</span>
<span class="sd"> calibration_stats: Scale/zero-point parameters per layer</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">quantized_layers</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">layer</span> <span class="ow">in</span> <span class="n">model</span><span class="o">.</span><span class="n">layers</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span> <span class="n">Linear</span><span class="p">):</span>
<span class="n">q_layer</span> <span class="o">=</span> <span class="n">QuantizedLinear</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span>
<span class="k">if</span> <span class="n">calibration_data</span><span class="p">:</span>
<span class="n">q_layer</span><span class="o">.</span><span class="n">calibrate</span><span class="p">(</span><span class="n">calibration_data</span><span class="p">)</span> <span class="c1"># Find optimal scales</span>
<span class="n">quantized_layers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">q_layer</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">quantized_layers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span> <span class="c1"># Keep ReLU, Softmax in FP32</span>
<span class="k">return</span> <span class="n">quantized_layers</span>
</pre></div>
</div>
<p><strong>Calibration in Practice:</strong></p>
<ol class="arabic simple">
<li><p>Run 100-1000 samples through original FP32 model</p></li>
<li><p>Record min/max activations for each layer</p></li>
<li><p>Compute percentile-clipped scales</p></li>
<li><p>Quantize weights with calibrated parameters</p></li>
<li><p>Test accuracy on validation set</p></li>
</ol>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure you’ve completed profiling fundamentals:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch
<span class="c1"># Verify prerequisite modules</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>--module<span class="w"> </span>profiling
</pre></div>
</div>
<p><strong>Required Understanding:</strong></p>
<ul class="simple">
<li><p>Memory profiling (Module 14): Measuring memory consumption</p></li>
<li><p>Tensor operations (Module 01): Understanding FP32 representation</p></li>
<li><p>Linear layers (Module 03): Matrix multiplication mechanics</p></li>
</ul>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/15_quantization/quantization_dev.py</span></code></p></li>
<li><p><strong>Implement quantize_int8()</strong>: FP32 → INT8 conversion with scale/zero-point calculation</p></li>
<li><p><strong>Implement dequantize_int8()</strong>: INT8 → FP32 restoration</p></li>
<li><p><strong>Build QuantizedLinear</strong>: Replace Linear layers with quantized versions</p></li>
<li><p><strong>Add calibration logic</strong>: Percentile-based scale selection</p></li>
<li><p><strong>Implement quantize_model()</strong>: Convert entire networks to quantized form</p></li>
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">15</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">--module</span> <span class="pre">quantization</span></code></p></li>
</ol>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify quantization functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>--module<span class="w"> </span>quantization
<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>quantization<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Quantization Correctness</strong>: FP32 → INT8 → FP32 roundtrip error bounds (&lt; 0.5% mean error)</p></li>
<li><p><strong>Memory Reduction</strong>: Verify 4× reduction in model size (weights + biases)</p></li>
<li><p><strong>Symmetric vs Asymmetric</strong>: Both schemes produce valid INT8 in [-128, 127]</p></li>
<li><p><strong>Calibration Impact</strong>: Percentile clipping reduces quantization error vs naive min/max</p></li>
<li><p><strong>QuantizedLinear Equivalence</strong>: Output matches FP32 Linear within tolerance (&lt; 1% difference)</p></li>
<li><p><strong>Model-Level Quantization</strong>: Full network quantization preserves accuracy (&lt; 2% degradation)</p></li>
</ul>
</section>
<section id="inline-testing-quantization-analysis">
<h3>Inline Testing &amp; Quantization Analysis<a class="headerlink" href="#inline-testing-quantization-analysis" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive validation with real-time feedback:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Example inline test output</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">quantize_int8</span><span class="p">()</span><span class="o">...</span>
<span class="err"></span> <span class="n">Symmetric</span> <span class="n">quantization</span><span class="p">:</span> <span class="nb">range</span> <span class="p">[</span><span class="o">-</span><span class="mi">128</span><span class="p">,</span> <span class="mi">127</span><span class="p">]</span> <span class="err"></span>
<span class="err"></span> <span class="n">Scale</span> <span class="n">calculation</span><span class="p">:</span> <span class="n">max_val</span> <span class="o">/</span> <span class="mi">127</span> <span class="o">=</span> <span class="mf">0.0234</span> <span class="err"></span>
<span class="err"></span> <span class="n">Roundtrip</span> <span class="n">error</span><span class="p">:</span> <span class="mf">0.31</span><span class="o">%</span> <span class="n">mean</span> <span class="n">error</span> <span class="err"></span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">quantize_int8</span><span class="p">()</span> <span class="err"></span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">QuantizedLinear</span><span class="o">...</span>
<span class="err"></span> <span class="n">Memory</span> <span class="n">reduction</span><span class="p">:</span> <span class="mi">145</span><span class="n">KB</span> <span class="err"></span> <span class="mi">36</span><span class="n">KB</span> <span class="p">(</span><span class="mf">4.0</span><span class="err">×</span><span class="p">)</span> <span class="err"></span>
<span class="err"></span> <span class="n">Output</span> <span class="n">equivalence</span><span class="p">:</span> <span class="mf">0.43</span><span class="o">%</span> <span class="nb">max</span> <span class="n">difference</span> <span class="n">vs</span> <span class="n">FP32</span> <span class="err"></span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">QuantizedLinear</span> <span class="err"></span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">quantization_dev</span><span class="w"> </span><span class="kn">import</span> <span class="n">quantize_int8</span><span class="p">,</span> <span class="n">dequantize_int8</span><span class="p">,</span> <span class="n">QuantizedLinear</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.nn</span><span class="w"> </span><span class="kn">import</span> <span class="n">Linear</span>
<span class="c1"># Test quantization on random tensor</span>
<span class="n">tensor</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">100</span><span class="p">,</span> <span class="mi">100</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">))</span>
<span class="n">q_tensor</span><span class="p">,</span> <span class="n">scale</span><span class="p">,</span> <span class="n">zero_point</span> <span class="o">=</span> <span class="n">quantize_int8</span><span class="p">(</span><span class="n">tensor</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Original range: [</span><span class="si">{</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">min</span><span class="p">()</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">max</span><span class="p">()</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">]&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Quantized range: [</span><span class="si">{</span><span class="n">q_tensor</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">min</span><span class="p">()</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="n">q_tensor</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">max</span><span class="p">()</span><span class="si">}</span><span class="s2">]&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Scale: </span><span class="si">{</span><span class="n">scale</span><span class="si">:</span><span class="s2">.6f</span><span class="si">}</span><span class="s2">, Zero-point: </span><span class="si">{</span><span class="n">zero_point</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Dequantize and measure error</span>
<span class="n">restored</span> <span class="o">=</span> <span class="n">dequantize_int8</span><span class="p">(</span><span class="n">q_tensor</span><span class="p">,</span> <span class="n">scale</span><span class="p">,</span> <span class="n">zero_point</span><span class="p">)</span>
<span class="n">error</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span> <span class="o">-</span> <span class="n">restored</span><span class="o">.</span><span class="n">data</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Roundtrip error: </span><span class="si">{</span><span class="n">error</span><span class="si">:</span><span class="s2">.4f</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">error</span><span class="o">/</span><span class="n">np</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">tensor</span><span class="o">.</span><span class="n">data</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span><span class="o">*</span><span class="mi">100</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">%)&quot;</span><span class="p">)</span>
<span class="c1"># Quantize a Linear layer</span>
<span class="n">linear</span> <span class="o">=</span> <span class="n">Linear</span><span class="p">(</span><span class="mi">128</span><span class="p">,</span> <span class="mi">64</span><span class="p">)</span>
<span class="n">q_linear</span> <span class="o">=</span> <span class="n">QuantizedLinear</span><span class="p">(</span><span class="n">linear</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="se">\n</span><span class="s2">Original weights: </span><span class="si">{</span><span class="n">linear</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">nbytes</span><span class="si">}</span><span class="s2"> bytes&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Quantized weights: </span><span class="si">{</span><span class="n">q_linear</span><span class="o">.</span><span class="n">weights_int8</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">nbytes</span><span class="si">}</span><span class="s2"> bytes&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Reduction: </span><span class="si">{</span><span class="n">linear</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">nbytes</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="n">q_linear</span><span class="o">.</span><span class="n">weights_int8</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">nbytes</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2">×&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-applications">
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Mobile ML Deployment</strong>: TensorFlow Lite converts all models to INT8 for Android/iOS. Without quantization, models exceed app size limits (100-200MB) and drain battery 4× faster. Google Photos, Translate, Keyboard all run quantized models on-device.</p></li>
<li><p><strong>Edge AI Devices</strong>: Google Edge TPU (Coral), NVIDIA Jetson, Intel Neural Compute Stick require INT8 models. Hardware is designed exclusively for quantized operations - FP32 isn't supported or is 10× slower.</p></li>
<li><p><strong>Cloud Inference Optimization</strong>: AWS Inferentia, Azure Maia, Google Cloud TPU serve quantized models. INT8 reduces memory bandwidth (bottleneck for inference) and increases throughput by 2-4×. At scale (millions of requests/day), this saves millions in infrastructure costs.</p></li>
<li><p><strong>Large Language Models</strong>: LLaMA-65B is 130GB in FP16, doesn't fit on a single 80GB A100 GPU. INT8 quantization → 65GB, enables serving. GPTQ pushes to 4-bit (33GB) with &lt; 1% perplexity increase. Quantization is how enthusiasts run 70B models on consumer GPUs.</p></li>
</ul>
</section>
<section id="quantization-mathematics">
<h3>Quantization Mathematics<a class="headerlink" href="#quantization-mathematics" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Why INT8 vs INT4 or INT16?</strong> INT8 is the sweet spot: 4× memory reduction with &lt; 1% accuracy loss. INT4 gives 8× reduction but 2-5% accuracy loss (harder to deploy). INT16 only 2× reduction (not worth complexity). Hardware acceleration (VNNI, NEON, Tensor Cores) standardized on INT8.</p></li>
<li><p><strong>Symmetric vs Asymmetric Trade-offs</strong>: Symmetric is simpler (no zero-point) but wastes range for skewed data. ReLU activations are [0, max] - symmetric centers around 0, wasting negative range. Asymmetric uses full INT8 range but costs extra zero-point storage and computation.</p></li>
<li><p><strong>Calibration Data Requirements</strong>: Theory: more data → better statistics. Practice: diminishing returns after 500-1000 samples. Percentile estimates stabilize quickly. Critical requirement: calibration data MUST match deployment distribution. If calibration is ImageNet but deployment is medical images, quantization fails catastrophically.</p></li>
<li><p><strong>Per-Channel Justification</strong>: Conv2D with 64 output channels: per-channel stores 64 scales + 64 zero-points = 512 bytes. Total weights: 3×3×64×64 FP32 = 147KB. Overhead: 0.35%. Accuracy improvement: 0.5-1.5%. Clear win - explains why production frameworks default to per-channel.</p></li>
</ul>
</section>
<section id="production-deployment-characteristics">
<h3>Production Deployment Characteristics<a class="headerlink" href="#production-deployment-characteristics" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Speed Reality Check</strong>: INT8 matmul is theoretically 4× faster (4× less memory bandwidth). Practice: 2-3× on CPU (quantize/dequantize overhead), 4-10× on specialized hardware (Edge TPU, Neural Engine designed for pure INT8 graphs). Your Python implementation is no faster (simulation overhead &gt; bandwidth savings).</p></li>
<li><p><strong>When Quantization is Mandatory</strong>: Mobile deployment (app size limits, battery constraints, Neural Engine acceleration), Edge devices (limited memory/compute), Cloud serving at scale (cost optimization). Not negotiable - models either quantize or don't ship.</p></li>
<li><p><strong>When to Avoid Quantization</strong>: Accuracy-critical applications where 1% matters (medical diagnosis, autonomous vehicles), Early research iteration (quantization adds complexity), Models already tiny (&lt; 10MB - quantization overhead not worth it), Cloud serving with abundant resources (FP32 throughput sufficient).</p></li>
<li><p><strong>Quantization-Aware Training vs Post-Training</strong>: PTQ (Post-Training Quantization) is fast (minutes) but loses 1-2% accuracy. QAT (Quantization-Aware Training) requires retraining (days/weeks) but loses &lt; 0.5%. Choose PTQ for rapid iteration, QAT for production deployment. If using pretrained models you don't own (BERT, ResNet), PTQ is the only option.</p></li>
</ul>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>You're about to implement the precision reduction mathematics that make mobile ML deployment possible. Quantization is the difference between a model that exists in research and a model that ships in apps used by billions.</p>
<p>This module teaches honest quantization: you'll implement the math correctly, achieve 4× memory reduction, and understand precisely why your Python code isn't faster (hardware acceleration requires specialized silicon + compiled kernels). This clarity prepares you for production deployment where TensorFlow Lite, PyTorch Mobile, and ONNX Runtime apply your quantization mathematics with real INT8 hardware operations.</p>
<p>Understanding quantization from first principles - implementing the scale/zero-point calculations yourself, calibrating with real data, measuring accuracy-efficiency trade-offs - gives you deep insight into the constraints that define production ML systems.</p>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/15_quantization/quantization_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/15_quantization/quantization_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/15_quantization/quantization_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/15_quantization/quantization_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
View Source</div>
<p class="sd-card-text">Browse the Python source code and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/15_quantization/quantization_dev.py"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/15_quantization/quantization_dev.py</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">Save Your Progress</p>
<p>Binder sessions are temporary. Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="14_profiling_ABOUT.html" title="previous page">← Module 14: Profiling</a>
<a class="right-next" href="16_compression_ABOUT.html" title="next page">Module 16: Compression →</a>
</div>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./modules"
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="14_profiling_ABOUT.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">14. Profiling - Performance Measurement for ML Systems</p>
</div>
</a>
<a class="right-next"
href="16_compression_ABOUT.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">16. Compression - Pruning and Model Compression</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantization-flow-fp32-int8">Quantization Flow: FP32 → INT8</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#what-youre-actually-building-educational-quantization">What You're Actually Building (Educational Quantization)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-quantization-mathematics">Core Quantization Mathematics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#calibration-the-critical-step">Calibration - The Critical Step</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#per-tensor-vs-per-channel-quantization">Per-Tensor vs Per-Channel Quantization</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantizedlinear-quantized-neural-network-layer">QuantizedLinear - Quantized Neural Network Layer</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#model-level-quantization">Model-Level Quantization</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-quantization-analysis">Inline Testing &amp; Quantization Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#quantization-mathematics">Quantization Mathematics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-deployment-characteristics">Production Deployment Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2025.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>